Add --pad option to add padding symbols at text edges
1 ich ich PPER PPER _ _ _ _ 1
2 bin sein VAFIN VAFIN _ _ _ _ 1.000000
3 alex alex NE NE _ _ _ _ 0.565630
4 . . $. $. _ _ _ _ 1.000000
# text_id = TST_TST.00001
1 alex alex NE NE _ _ _ _ 0.565630
2 bin sein VAFIN VAFIN _ _ _ _ 1.000000
3 ich ich PPER PPER _ _ _ _ 1
4 . . $. $. _ _ _ _ 1.000000
# text_id = TST_TST.00002
1 ich ich PPER PPER _ _ _ _ 1
2 heiße heißen VAFIN VAFIN _ _ _ _ 1.000000
3 alex alex NE NE _ _ _ _ 0.565630
4 . . $. $. _ _ _ _ 1.000000
---->
. «END» «END» 3
«END» «END» «END» 3
«START» «START» «START» 3
«START» «START» ich 2
alex . «END» 2
«START» «START» alex 1
«START» alex bin 1
«START» ich bin 1
«START» ich heiße 1
alex bin ich 1
bin alex . 1
bin ich . 1
heiße alex . 1
ich . «END» 1
ich bin alex 1
ich heiße alex 1
Change-Id: Ib4826400da657787940805c616a0000ac089120d
diff --git a/src/main/java/org/ids_mannheim/TotalNGrams.java b/src/main/java/org/ids_mannheim/TotalNGrams.java
index 86932d4..ee1b665 100644
--- a/src/main/java/org/ids_mannheim/TotalNGrams.java
+++ b/src/main/java/org/ids_mannheim/TotalNGrams.java
@@ -92,6 +92,11 @@
"--numeric-secondary-sort"}, description = "Sort entries with same frequency numerically (default: ${DEFAULT-VALUE})")
boolean numericSecondarySort = false;
+ @SuppressWarnings("CanBeFinal")
+ @CommandLine.Option(names = {
+ "--pad"}, description = "Add padding " + PaddedSlidingWindowQueue.TEXT_START_SYMBOL + " and " + PaddedSlidingWindowQueue.TEXT_END_SYMBOL + " symbols at text edges (default: ${DEFAULT-VALUE})")
+ boolean addPadding = false;
+
private Progressbar etaPrinter;
public TotalNGrams() {
@@ -164,7 +169,7 @@
int threads = Math.min(max_threads, inputFiles.size());
logger.info("Processing fold " + fold + "/" + FOLDS);
logger.info("Using " + threads + " threads");
- IntStream.range(0, threads).forEach(unused -> es.execute(new Worker(queue, inputFiles, ngram_size, fold, FOLDS, map, with_lemma_and_pos, downcase_tokens, workerNodePool, etaPrinter, logger)));
+ IntStream.range(0, threads).forEach(unused -> es.execute(new Worker(queue, inputFiles, ngram_size, fold, FOLDS, map, with_lemma_and_pos, downcase_tokens, workerNodePool, etaPrinter, logger, addPadding)));
queue.addAll(IntStream.range(0, inputFiles.size()).boxed().collect(Collectors.toList()));
IntStream.range(0, threads).forEach(unused -> {
try {