Add --pad option to optionally add padding symbols at text edges

 # text_id = TST_TST.00000
 1       ich     ich     PPER    PPER    _       _       _       _       1
 2       bin     sein    VAFIN   VAFIN   _       _       _       _       1.000000
 3       alex    alex    NE      NE      _       _       _       _       0.565630
 4       .       .       $.      $.      _       _       _       _       1.000000

 # text_id = TST_TST.00001
 1       alex    alex    NE      NE      _       _       _       _       0.565630
 2       bin     sein    VAFIN   VAFIN   _       _       _       _       1.000000
 3       ich     ich     PPER    PPER    _       _       _       _       1
 4       .       .       $.      $.      _       _       _       _       1.000000

 # text_id = TST_TST.00002
 1       ich     ich     PPER    PPER    _       _       _       _       1
 2       heiße   heißen  VAFIN   VAFIN   _       _       _       _       1.000000
 3       alex    alex    NE      NE      _       _       _       _       0.565630
 4       .       .       $.      $.      _       _       _       _       1.000000

---->

.       «END»   «END»   3
«END»   «END»   «END»   3
«START» «START» «START» 3
«START» «START» ich     2
alex    .       «END»   2
«START» «START» alex    1
«START» alex    bin     1
«START» ich     bin     1
«START» ich     heiße   1
alex    bin     ich     1
bin     alex    .       1
bin     ich     .       1
heiße   alex    .       1
ich     .       «END»   1
ich     bin     alex    1
ich     heiße   alex    1

Change-Id: Ib4826400da657787940805c616a0000ac089120d
diff --git a/src/main/java/org/ids_mannheim/PaddedSlidingWindowQueue.java b/src/main/java/org/ids_mannheim/PaddedSlidingWindowQueue.java
new file mode 100644
index 0000000..7c141bd
--- /dev/null
+++ b/src/main/java/org/ids_mannheim/PaddedSlidingWindowQueue.java
@@ -0,0 +1,47 @@
+package org.ids_mannheim;
+
+
+import static org.ids_mannheim.TotalNGrams.logger;
+
+/** Sliding window queue that adds maxSize start entries on reset() and maxSize end entries on textBreak(). */
+public class PaddedSlidingWindowQueue extends SlidingWindowQueue {
+    public static final String TEXT_END_SYMBOL = "«END»";
+    public static final String TEXT_START_SYMBOL = "«START»";
+    public static final String TEXT_STARTEND_POS = "«STARTEND»";
+    // Per-instance padding entries: constructing one queue can no longer overwrite another queue's padding.
+    private final String textStartCombo, textEndCombo;
+    private boolean needsFlush = false; // true while added entries have not yet been followed by end padding
+
+    /** Builds the padding entries: symbol/symbol/POS triples if with_lemma_and_pos is set, bare symbols otherwise. */
+    public PaddedSlidingWindowQueue(int size, Increaser flush, boolean with_lemma_and_pos) {
+        super(size, flush, with_lemma_and_pos);
+        textStartCombo = with_lemma_and_pos ? TEXT_START_SYMBOL + '\t' + TEXT_START_SYMBOL + '\t' + TEXT_STARTEND_POS : TEXT_START_SYMBOL;
+        textEndCombo = with_lemma_and_pos ? TEXT_END_SYMBOL + '\t' + TEXT_END_SYMBOL + '\t' + TEXT_STARTEND_POS : TEXT_END_SYMBOL;
+        logger.info("Using padding.");
+    }
+
+    /** Clears the queue, adds maxSize start entries and stores the given fold. */
+    @Override
+    public void reset(int fold) {
+        this.clear();
+        for (int i = 0; i < maxSize; i++)
+            add(textStartCombo);
+        this.fold = fold;
+    }
+
+    /** Adds an entry via the superclass and records that end padding is now pending. */
+    @Override
+    public void add(String k) {
+        super.add(k);
+        needsFlush = true;
+    }
+
+    /** Adds maxSize end entries if padding is pending; further calls are no-ops until something is added again. */
+    @Override
+    public void textBreak() {
+        if (!needsFlush) return;
+        for (int i = 0; i < maxSize; i++)
+            add(textEndCombo);
+        needsFlush = false; // add() re-armed the flag for every end entry; clear it so the padding is emitted once
+    }
+}
diff --git a/src/main/java/org/ids_mannheim/SlidingWindowQueue.java b/src/main/java/org/ids_mannheim/SlidingWindowQueue.java
index 93acd58..32e0f60 100644
--- a/src/main/java/org/ids_mannheim/SlidingWindowQueue.java
+++ b/src/main/java/org/ids_mannheim/SlidingWindowQueue.java
@@ -3,17 +3,15 @@
 import java.util.LinkedList;
 
 public class SlidingWindowQueue extends LinkedList {
-    private final int maxSize;
-    private final Increaser flush;
+    final int maxSize;
+    final Increaser flush;
+    final boolean with_lemma_and_pos;
     public int fold;
 
-    interface Increaser {
-        void accept(String s);
-    }
-
-    public SlidingWindowQueue(int size, Increaser flush) {
+    public SlidingWindowQueue(int size, Increaser flush, boolean with_lemma_and_pos) {
         this.maxSize = size;
         this.flush = flush;
+        this.with_lemma_and_pos = with_lemma_and_pos;
     }
 
     public void add(String k) {
@@ -28,4 +26,11 @@
         this.clear();
         this.fold = fold;
     }
+
+    public void textBreak() {
+    }
+
+    interface Increaser {
+        void accept(String s);
+    }
 }
diff --git a/src/main/java/org/ids_mannheim/TotalNGrams.java b/src/main/java/org/ids_mannheim/TotalNGrams.java
index 86932d4..ee1b665 100644
--- a/src/main/java/org/ids_mannheim/TotalNGrams.java
+++ b/src/main/java/org/ids_mannheim/TotalNGrams.java
@@ -92,6 +92,11 @@
             "--numeric-secondary-sort"}, description = "Sort entries with same frequency numerically  (default: ${DEFAULT-VALUE})")
     boolean numericSecondarySort = false;
 
+    @SuppressWarnings("CanBeFinal")
+    @CommandLine.Option(names = {
+            "--pad"}, description = "Add padding " + PaddedSlidingWindowQueue.TEXT_START_SYMBOL + " and " + PaddedSlidingWindowQueue.TEXT_END_SYMBOL + " symbols at text edges  (default: ${DEFAULT-VALUE})")
+    boolean addPadding = false;
+
     private Progressbar etaPrinter;
 
     public TotalNGrams() {
@@ -164,7 +169,7 @@
         int threads = Math.min(max_threads, inputFiles.size());
         logger.info("Processing fold " + fold + "/" + FOLDS);
         logger.info("Using " + threads + " threads");
-        IntStream.range(0, threads).forEach(unused -> es.execute(new Worker(queue, inputFiles, ngram_size, fold, FOLDS, map, with_lemma_and_pos, downcase_tokens, workerNodePool, etaPrinter, logger)));
+        IntStream.range(0, threads).forEach(unused -> es.execute(new Worker(queue, inputFiles, ngram_size, fold, FOLDS, map, with_lemma_and_pos, downcase_tokens, workerNodePool, etaPrinter, logger, addPadding)));
         queue.addAll(IntStream.range(0, inputFiles.size()).boxed().collect(Collectors.toList()));
         IntStream.range(0, threads).forEach(unused -> {
             try {
diff --git a/src/main/java/org/ids_mannheim/Worker.java b/src/main/java/org/ids_mannheim/Worker.java
index ca7f27c..2f6b32b 100644
--- a/src/main/java/org/ids_mannheim/Worker.java
+++ b/src/main/java/org/ids_mannheim/Worker.java
@@ -29,13 +29,14 @@
     private final WorkerNodePool pool;
     private final boolean with_lemma_and_pos;
     private final boolean downcase_tokens;
+    private final boolean addPadding;
 
     private final DeterministicRandomProvider deterministicRandomProvider;
 
     public Worker(BlockingQueue<Integer> queue, ArrayList<String> fnames, int ngram_size, int target_fold, int folds,
                   ConcurrentHashMap<String, AtomicInteger> map,
                   boolean with_lemma_and_pos, boolean downcase_tokens, WorkerNodePool pool,
-                  Progressbar etaPrinter, Logger logger) {
+                  Progressbar etaPrinter, Logger logger, boolean addPadding) {
         this.queue = queue;
         this.fnames = fnames;
         this.map = map;
@@ -47,6 +48,7 @@
         this.logger = logger;
         this.deterministicRandomProvider = new DeterministicRandomProvider(folds);
         this.downcase_tokens = downcase_tokens;
+        this.addPadding = addPadding;
     }
 
     @Override
@@ -55,7 +57,8 @@
             int index = queue.take();
             int retries = MAX_RETRIES;
             int texts = 0;
-            SlidingWindowQueue slidingWindowQueue = new SlidingWindowQueue(ngram_size, s -> FoldedEntry.incr(map, s));
+            SlidingWindowQueue slidingWindowQueue = (addPadding ? new PaddedSlidingWindowQueue(ngram_size, s -> FoldedEntry.incr(map, s), with_lemma_and_pos) :
+                    new SlidingWindowQueue(ngram_size, s -> FoldedEntry.incr(map, s), with_lemma_and_pos));
             while (index >= 0) {
                 String fname = fnames.get(index);
                 File current_file = new File(fname);
@@ -113,6 +116,7 @@
                     if (line.startsWith("#")) {
                         Matcher matcher = new_text_pattern.matcher(line);
                         if (matcher.find()) {
+                            slidingWindowQueue.textBreak();
                             fold = deterministicRandomProvider.getFoldFromTextID(matcher.group(1)) + 1;
                             texts++;
                             if (fold == target_fold) {
@@ -124,7 +128,7 @@
                         if (strings.length < 4) {
                             continue;
                         }
-                        assert strings.length == 10 : "CoNLL-U Format must have 10 columns";
+                        assert strings.length == 10 : "Error: CoNLL-U Format must have 10 columns in line \""+line+"\"";
                         String token = ( downcase_tokens?
                                 Utils.unEscapeEntities(strings[1]).toLowerCase(Locale.ROOT) :
                                 Utils.unEscapeEntities(strings[1]));
@@ -156,6 +160,7 @@
                         }
                     }
                 }
+                slidingWindowQueue.textBreak();
                 pool.markFree(poolIndex);
                 if (texts > 0) {
                     logger.info(pool.getHost(poolIndex) + " finished " + fname + " with " + texts + " texts");
diff --git a/src/test/java/org/ids_mannheim/WorkerTest.java b/src/test/java/org/ids_mannheim/WorkerTest.java
index c4d8758..5e0623f 100644
--- a/src/test/java/org/ids_mannheim/WorkerTest.java
+++ b/src/test/java/org/ids_mannheim/WorkerTest.java
@@ -6,6 +6,9 @@
 import org.junit.jupiter.api.Test;
 
 import java.io.*;
+import java.nio.file.FileSystems;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.Map;
 import java.util.Objects;
@@ -13,10 +16,12 @@
 import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.logging.Logger;
+import java.util.stream.Collectors;
 
 import static org.junit.jupiter.api.Assertions.*;
 
 class WorkerTest {
+    public static final String splitFreqlistRegex = "\\t(?=[0-9]+$)";
     private ByteArrayOutputStream errContent;
     private final PrintStream originalErr = System.err;
     Worker worker;
@@ -68,7 +73,7 @@
                 false,
                 new WorkerNodePool(""),
                 new Progressbar(tempFile.length()),
-                Logger.getLogger(TotalNGrams.class.getSimpleName()));
+                Logger.getLogger(TotalNGrams.class.getSimpleName()), false);
 
         queue.add(0);
         queue.add(-1);
@@ -118,7 +123,8 @@
                 true,
                 new WorkerNodePool(""),
                 new Progressbar(tempFile.length()),
-                Logger.getLogger(TotalNGrams.class.getSimpleName()));
+                Logger.getLogger(TotalNGrams.class.getSimpleName()),
+                false);
 
         queue.add(0);
         queue.add(-1);
@@ -163,7 +169,8 @@
                 false,
                 new WorkerNodePool(""),
                 new Progressbar(tempFile.length()),
-                Logger.getLogger(TotalNGrams.class.getSimpleName()));
+                Logger.getLogger(TotalNGrams.class.getSimpleName()),
+                false);
 
         queue.add(0);
         queue.add(-1);
@@ -208,7 +215,8 @@
                 false,
                 new WorkerNodePool(""),
                 new Progressbar(tempFile.length()),
-                Logger.getLogger(TotalNGrams.class.getSimpleName()));
+                Logger.getLogger(TotalNGrams.class.getSimpleName()),
+                false);
 
         queue.add(0);
         queue.add(-1);
@@ -216,4 +224,53 @@
         gold.forEach((key, value) -> assertEquals(value, map.get(key).intValue()));
     }
 
+    // Compares padded 1- to 3-gram counts (with and without lemma/POS) against the simple_*gram_padded.freq gold files.
+    @Test
+    void paddingWorks() throws IOException {
+        File tempFile = File.createTempFile("simple", ".conllu");
+        tempFile.deleteOnExit();
+        try (FileOutputStream out = new FileOutputStream(tempFile)) {
+            IOUtils.copy(Objects.requireNonNull(Thread.currentThread().getContextClassLoader()
+                    .getResourceAsStream("simple.conllu")), out);
+        }
+        for (boolean with_lemma_and_pos : new boolean[]{false, true}) {
+            for (int n = 1; n <= 3; n++) {
+                ArrayList<String> fnames = new ArrayList<>();
+                fnames.add(tempFile.getAbsolutePath());
+                File tempFreqFile = File.createTempFile("simple", ".freq");
+                tempFreqFile.deleteOnExit();
+                try (FileOutputStream out = new FileOutputStream(tempFreqFile)) {
+                    IOUtils.copy(Objects.requireNonNull(Thread.currentThread().getContextClassLoader()
+                            .getResourceAsStream("simple_" + n + (with_lemma_and_pos ? "lp" : "") + "gram_padded.freq")), out);
+                }
+                // String.matches() must match the whole line and never did, leaving gold empty; split and keep "<n-gram>\t<count>" pairs.
+                Map<String, Integer> gold;
+                try (java.util.stream.Stream<String> lines = Files.lines(tempFreqFile.toPath())) {
+                    gold = lines.map(s -> s.split(splitFreqlistRegex)).filter(f -> f.length == 2)
+                            .collect(Collectors.toMap(f -> f[0], f -> Integer.parseInt(f[1])));
+                }
+                assertFalse(gold.isEmpty(), "gold frequency list must not be empty");
+
+                map = new ConcurrentHashMap<>();
+                LinkedBlockingQueue<Integer> queue = new LinkedBlockingQueue<>(2);
+                worker = new Worker(
+                        queue,
+                        fnames,
+                        n,
+                        1,
+                        1,
+                        map,
+                        with_lemma_and_pos,
+                        false,
+                        new WorkerNodePool(""),
+                        new Progressbar(tempFile.length()),
+                        Logger.getLogger(TotalNGrams.class.getSimpleName()),
+                        true);
+                queue.add(0);
+                queue.add(-1);
+                worker.run();
+                gold.forEach((key, value) -> assertEquals(value, map.get(key).intValue()));
+            }
+        }
+    }
 }
diff --git a/src/test/resources/simple.conllu b/src/test/resources/simple.conllu
new file mode 100644
index 0000000..2e52539
--- /dev/null
+++ b/src/test/resources/simple.conllu
@@ -0,0 +1,17 @@
+# text_id = TST_TST.00000
+1	ich	ich	PPER	PPER	_	_	_	_	1
+2	bin	sein	VAFIN	VAFIN	_	_	_	_	1.000000
+3	alex	alex	NE	NE	_	_	_	_	0.565630
+4	.	.	$.	$.	_	_	_	_	1.000000
+
+# text_id = TST_TST.00001
+1	alex	alex	NE	NE	_	_	_	_	0.565630
+2	bin	sein	VAFIN	VAFIN	_	_	_	_	1.000000
+3	ich	ich	PPER	PPER	_	_	_	_	1
+4	.	.	$.	$.	_	_	_	_	1.000000
+
+# text_id = TST_TST.00002
+1	ich	ich	PPER	PPER	_	_	_	_	1
+2	heiße	heißen	VAFIN	VAFIN	_	_	_	_	1.000000
+3	alex	alex	NE	NE	_	_	_	_	0.565630
+4	.	.	$.	$.	_	_	_	_	1.000000
diff --git a/src/test/resources/simple_1gram_padded.freq b/src/test/resources/simple_1gram_padded.freq
new file mode 100644
index 0000000..54522cb
--- /dev/null
+++ b/src/test/resources/simple_1gram_padded.freq
@@ -0,0 +1,7 @@
+.	3
+«END»	3
+«START»	3
+alex	3
+ich	3
+bin	2
+heiße	1
diff --git a/src/test/resources/simple_1lpgram_padded.freq b/src/test/resources/simple_1lpgram_padded.freq
new file mode 100644
index 0000000..ff8c4f7
--- /dev/null
+++ b/src/test/resources/simple_1lpgram_padded.freq
@@ -0,0 +1,7 @@
+.	.	$.	3
+alex	alex	NE	3
+ich	ich	PPER	3
+«END»	«END»	«STARTEND»	3
+«START»	«START»	«STARTEND»	3
+bin	sein	VAFIN	2
+heiße	heißen	VAFIN	1
diff --git a/src/test/resources/simple_2gram_padded.freq b/src/test/resources/simple_2gram_padded.freq
new file mode 100644
index 0000000..ba04a3a
--- /dev/null
+++ b/src/test/resources/simple_2gram_padded.freq
@@ -0,0 +1,13 @@
+.	«END»	3
+«END»	«END»	3
+«START»	«START»	3
+«START»	ich	2
+alex	.	2
+«START»	alex	1
+alex	bin	1
+bin	alex	1
+bin	ich	1
+heiße	alex	1
+ich	.	1
+ich	bin	1
+ich	heiße	1
diff --git a/src/test/resources/simple_2lpgram_padded.freq b/src/test/resources/simple_2lpgram_padded.freq
new file mode 100644
index 0000000..7b1bd84
--- /dev/null
+++ b/src/test/resources/simple_2lpgram_padded.freq
@@ -0,0 +1,13 @@
+.	.	$.	«END»	«END»	«STARTEND»	3
+«END»	«END»	«STARTEND»	«END»	«END»	«STARTEND»	3
+«START»	«START»	«STARTEND»	«START»	«START»	«STARTEND»	3
+alex	alex	NE	.	.	$.	2
+«START»	«START»	«STARTEND»	ich	ich	PPER	2
+alex	alex	NE	bin	sein	VAFIN	1
+bin	sein	VAFIN	alex	alex	NE	1
+bin	sein	VAFIN	ich	ich	PPER	1
+heiße	heißen	VAFIN	alex	alex	NE	1
+ich	ich	PPER	.	.	$.	1
+ich	ich	PPER	bin	sein	VAFIN	1
+ich	ich	PPER	heiße	heißen	VAFIN	1
+«START»	«START»	«STARTEND»	alex	alex	NE	1
diff --git a/src/test/resources/simple_3gram_padded.freq b/src/test/resources/simple_3gram_padded.freq
new file mode 100644
index 0000000..f54eede
--- /dev/null
+++ b/src/test/resources/simple_3gram_padded.freq
@@ -0,0 +1,16 @@
+.	«END»	«END»	3
+«END»	«END»	«END»	3
+«START»	«START»	«START»	3
+«START»	«START»	ich	2
+alex	.	«END»	2
+«START»	«START»	alex	1
+«START»	alex	bin	1
+«START»	ich	bin	1
+«START»	ich	heiße	1
+alex	bin	ich	1
+bin	alex	.	1
+bin	ich	.	1
+heiße	alex	.	1
+ich	.	«END»	1
+ich	bin	alex	1
+ich	heiße	alex	1
diff --git a/src/test/resources/simple_3lpgram_padded.freq b/src/test/resources/simple_3lpgram_padded.freq
new file mode 100644
index 0000000..19b899c
--- /dev/null
+++ b/src/test/resources/simple_3lpgram_padded.freq
@@ -0,0 +1,16 @@
+.	.	$.	«END»	«END»	«STARTEND»	«END»	«END»	«STARTEND»	3
+«END»	«END»	«STARTEND»	«END»	«END»	«STARTEND»	«END»	«END»	«STARTEND»	3
+«START»	«START»	«STARTEND»	«START»	«START»	«STARTEND»	«START»	«START»	«STARTEND»	3
+alex	alex	NE	.	.	$.	«END»	«END»	«STARTEND»	2
+«START»	«START»	«STARTEND»	«START»	«START»	«STARTEND»	ich	ich	PPER	2
+alex	alex	NE	bin	sein	VAFIN	ich	ich	PPER	1
+bin	sein	VAFIN	alex	alex	NE	.	.	$.	1
+bin	sein	VAFIN	ich	ich	PPER	.	.	$.	1
+heiße	heißen	VAFIN	alex	alex	NE	.	.	$.	1
+ich	ich	PPER	.	.	$.	«END»	«END»	«STARTEND»	1
+ich	ich	PPER	bin	sein	VAFIN	alex	alex	NE	1
+ich	ich	PPER	heiße	heißen	VAFIN	alex	alex	NE	1
+«START»	«START»	«STARTEND»	alex	alex	NE	bin	sein	VAFIN	1
+«START»	«START»	«STARTEND»	ich	ich	PPER	bin	sein	VAFIN	1
+«START»	«START»	«STARTEND»	ich	ich	PPER	heiße	heißen	VAFIN	1
+«START»	«START»	«STARTEND»	«START»	«START»	«STARTEND»	alex	alex	NE	1