totalngrams: add -l option to count <token>\t<lemma>\t<pos>
diff --git a/src/main/java/org/ids_mannheim/TotalNGram.java b/src/main/java/org/ids_mannheim/TotalNGram.java
index 0e4a703..ceec820 100644
--- a/src/main/java/org/ids_mannheim/TotalNGram.java
+++ b/src/main/java/org/ids_mannheim/TotalNGram.java
@@ -50,6 +50,10 @@
boolean force_overwrite = false;
@SuppressWarnings("CanBeFinal")
+ @CommandLine.Option(names = {"-l", "--with-lemma-pos"}, description = "Use also lemma and part-of-speech annotations (default: ${DEFAULT-VALUE})")
+ boolean with_lemma_and_pos = false; // handed to each Worker; when true, columns 1-3 of a line are tab-joined and counted instead of column 1 alone
+
+ @SuppressWarnings("CanBeFinal")
@CommandLine.Option(names = {"-P",
"--max-procs"}, description = "Run up to max-procs processes at a time (default: ${DEFAULT-VALUE})")
int max_threads = MAX_THREADS;
@@ -129,7 +133,7 @@
int threads = Math.min(max_threads, inputFiles.size());
logger.info("Processing fold " + fold + "/" + FOLDS);
logger.info("Using " + threads + " threads");
- IntStream.range(0, threads).forEach(unused -> es.execute(new Worker(queue, inputFiles, ngram_size, fold, FOLDS, map, workerNodePool, etaPrinter, logger)));
+ IntStream.range(0, threads).forEach(unused -> es.execute(new Worker(queue, inputFiles, ngram_size, fold, FOLDS, map, with_lemma_and_pos, workerNodePool, etaPrinter, logger)));
queue.addAll(IntStream.range(0, inputFiles.size()).boxed().collect(Collectors.toList()));
IntStream.range(0, threads).forEach(unused -> {
try {
diff --git a/src/main/java/org/ids_mannheim/Worker.java b/src/main/java/org/ids_mannheim/Worker.java
index 18f993c..6383591 100644
--- a/src/main/java/org/ids_mannheim/Worker.java
+++ b/src/main/java/org/ids_mannheim/Worker.java
@@ -26,10 +26,11 @@
private final int target_fold;
private final Logger logger;
private final WorkerNodePool pool;
+ private final boolean with_lemma_and_pos;
public Worker(BlockingQueue<Integer> queue, ArrayList<String> fnames, int ngram_size, int target_fold, int folds,
ConcurrentHashMap<String, AtomicInteger> map,
- WorkerNodePool pool,
+ boolean with_lemma_and_pos, WorkerNodePool pool,
Progressbar etaPrinter, Logger logger) {
this.queue = queue;
this.fnames = fnames;
@@ -37,6 +38,7 @@
this.ngram_size = ngram_size;
this.folds = folds;
this.target_fold = target_fold;
+ this.with_lemma_and_pos = with_lemma_and_pos;
this.pool = pool;
this.etaPrinter = etaPrinter;
this.logger = logger;
@@ -79,8 +81,12 @@
if (strings.length < 4) {
continue;
}
- //noinspection ConstantConditions
- slidingWindowQueue.add(strings[1]);
+ if (with_lemma_and_pos) {
+ //noinspection ConstantConditions
+ slidingWindowQueue.add(join("\t", strings[1], strings[2], strings[3]));
+ } else {
+ slidingWindowQueue.add(strings[1]);
+ }
}
}
pool.markFree(poolIndex);