Add option -Z to exclude empty texts

Change-Id: I1f2594ce839351205ce43b5047b349af7ba019bb
diff --git a/src/test/java/org/ids_mannheim/WorkerTest.java b/src/test/java/org/ids_mannheim/WorkerTest.java
index f1ec290..2bce13d 100644
--- a/src/test/java/org/ids_mannheim/WorkerTest.java
+++ b/src/test/java/org/ids_mannheim/WorkerTest.java
@@ -24,6 +24,7 @@
     public static final String splitFreqlistRegex = "\\t(?=[0-9]+$)";
     private ByteArrayOutputStream errContent;
     private final PrintStream originalErr = System.err;
+    private final boolean excludeEmptyTexts= true;
     Worker worker;
     ConcurrentHashMap<String, AtomicInteger> map;
     @BeforeEach
@@ -73,7 +74,7 @@
                 false,
                 new WorkerNodePool(""),
                 new Progressbar(tempFile.length()),
-                Logger.getLogger(TotalNGrams.class.getSimpleName()), false, false);
+                Logger.getLogger(TotalNGrams.class.getSimpleName()), false, false, excludeEmptyTexts);
 
         queue.add(0);
         queue.add(-1);
@@ -122,7 +123,7 @@
                 false,
                 new WorkerNodePool(""),
                 new Progressbar(tempFile.length()),
-                Logger.getLogger(TotalNGrams.class.getSimpleName()), false, true);
+                Logger.getLogger(TotalNGrams.class.getSimpleName()), false, true, excludeEmptyTexts);
 
         queue.add(0);
         queue.add(-1);
@@ -179,7 +180,7 @@
                 new WorkerNodePool(""),
                 new Progressbar(tempFile.length()),
                 Logger.getLogger(TotalNGrams.class.getSimpleName()),
-                false, false);
+                false, false, excludeEmptyTexts);
 
         queue.add(0);
         queue.add(-1);
@@ -225,7 +226,7 @@
                 new WorkerNodePool(""),
                 new Progressbar(tempFile.length()),
                 Logger.getLogger(TotalNGrams.class.getSimpleName()),
-                false, false);
+                false, false, excludeEmptyTexts);
 
         queue.add(0);
         queue.add(-1);
@@ -271,7 +272,7 @@
                 new WorkerNodePool(""),
                 new Progressbar(tempFile.length()),
                 Logger.getLogger(TotalNGrams.class.getSimpleName()),
-                false, false);
+                false, false, excludeEmptyTexts);
 
         queue.add(0);
         queue.add(-1);
@@ -287,57 +288,59 @@
             IOUtils.copy(Objects.requireNonNull(Thread.currentThread().getContextClassLoader()
                     .getResourceAsStream("simple.conllu")), out);
         }
-
         for (boolean with_padding : new boolean[]{false, true}) {
-            for (boolean with_lemma_and_pos : new boolean[]{false, true}) {
-                for (int n = 1; n <= 3; n++) {
-                    ArrayList<String> fnames = new ArrayList<>();
-                    fnames.add(tempFile.getAbsolutePath());
+            for (boolean excludeEmptyTexts : new boolean[]{false, true}) {
+                for (boolean with_lemma_and_pos : new boolean[]{false, true}) {
+                    for (int n = 1; n <= 3; n++) {
+                        ArrayList<String> fnames = new ArrayList<>();
+                        fnames.add(tempFile.getAbsolutePath());
 
-                    File tempFreqFile = File.createTempFile("simple", ".freq");
-                    tempFreqFile.deleteOnExit();
-                    String goldFileName = "simple_" + n + (with_lemma_and_pos ? "lp" : "") + "gram" + (with_padding? "_padded" : "") + ".freq";
-                    try (FileOutputStream out = new FileOutputStream(tempFreqFile)) {
-                        IOUtils.copy(Objects.requireNonNull(Thread.currentThread().getContextClassLoader()
-                                .getResourceAsStream(goldFileName)), out);
+                        File tempFreqFile = File.createTempFile("simple", ".freq");
+                        tempFreqFile.deleteOnExit();
+                        String goldFileName = "simple_" + n + (with_lemma_and_pos ? "lp" : "") + "gram" + (with_padding? "_padded" : "") + (with_padding && excludeEmptyTexts? "_ignore-empty-texts" : "") + ".freq";
+                        try (FileOutputStream out = new FileOutputStream(tempFreqFile)) {
+                            IOUtils.copy(Objects.requireNonNull(Thread.currentThread().getContextClassLoader()
+                                    .getResourceAsStream(goldFileName)), out);
+                        }
+
+                        Path path = FileSystems.getDefault().getPath(tempFreqFile.getAbsolutePath());
+                        Map<String, Integer> gold = Files.lines(path)
+                                .filter(s -> s.matches(".*\\t.*"))
+                                .collect(Collectors.toMap(k -> k.split(splitFreqlistRegex)[0], v -> Integer.parseInt(v.split(splitFreqlistRegex)[1])));
+                        assertTrue(gold.size() > 0, "Gold frequency test file '"+goldFileName + "' is parsed correctly");
+
+                        map = new ConcurrentHashMap<>();
+                        LinkedBlockingQueue<Integer> queue = new LinkedBlockingQueue<>(2);
+                        worker = new Worker(
+                                queue,
+                                fnames,
+                                n,
+                                1,
+                                1,
+                                map,
+                                with_lemma_and_pos,
+                                false,
+                                new WorkerNodePool(""),
+                                new Progressbar(tempFile.length()),
+                                Logger.getLogger(TotalNGrams.class.getSimpleName()),
+                                with_padding, false, excludeEmptyTexts);
+
+                        queue.add(0);
+                        queue.add(-1);
+                        worker.run();
+                        String conditionDescription =  " like in " +goldFileName + " for simple.conllu in condition: "
+                                + (with_padding ? "padding" : "no padding") + ", "
+                                + (excludeEmptyTexts ? "excluding empty texts" : "not excluding empty texts") + ", "
+                                + (with_lemma_and_pos ? "with lemma and POS" : "without lemma or pos");
+                        gold.forEach((key, value) -> {
+                            assertNotNull(map.get(key), "Key " + key + " exists " + conditionDescription);
+                            assertEquals(value, map.get(key).intValue(),
+                                    "Frequency for " + key + " is correct " + conditionDescription);
+                        });
+
+                        assertEquals(map.size(), gold.size(), "Actual map should not contain more keys than gold map, in "
+                                + conditionDescription);
                     }
-
-                    Path path = FileSystems.getDefault().getPath(tempFreqFile.getAbsolutePath());
-                    Map<String, Integer> gold = Files.lines(path)
-                            .filter(s -> s.matches(".*\\t.*"))
-                            .collect(Collectors.toMap(k -> k.split(splitFreqlistRegex)[0], v -> Integer.parseInt(v.split(splitFreqlistRegex)[1])));
-                    assertTrue(gold.size() > 0, "Gold frequency test file '"+goldFileName + "' is parsed correctly");
-
-                    map = new ConcurrentHashMap<>();
-                    LinkedBlockingQueue<Integer> queue = new LinkedBlockingQueue<>(2);
-                    worker = new Worker(
-                            queue,
-                            fnames,
-                            n,
-                            1,
-                            1,
-                            map,
-                            with_lemma_and_pos,
-                            false,
-                            new WorkerNodePool(""),
-                            new Progressbar(tempFile.length()),
-                            Logger.getLogger(TotalNGrams.class.getSimpleName()),
-                            with_padding, false);
-
-                    queue.add(0);
-                    queue.add(-1);
-                    worker.run();
-                    String conditionDescription =  " like in " +goldFileName + " for simple.conllu in condition: "
-                            + (with_padding ? "padding" : "no padding") + ", "
-                            + (with_lemma_and_pos ? "with lemma and POS" : "without lemma or pos");
-                    gold.forEach((key, value) -> {
-                        assertNotNull(map.get(key), "Key " + key + " exists " + conditionDescription);
-                        assertEquals(value, map.get(key).intValue(),
-                                "Frequency for " + key + " is correct " + conditionDescription);
-                    });
-
-                    assertEquals(map.size(), gold.size(), "Actual map should not contain more keys than gold map, in "
-                    + conditionDescription);
                 }
             }
         }
diff --git a/src/test/resources/simple.conllu b/src/test/resources/simple.conllu
index 5a907cc..48dc9fc 100644
--- a/src/test/resources/simple.conllu
+++ b/src/test/resources/simple.conllu
@@ -1,5 +1,5 @@
 # text_id = TST_TST.00001
-# empty texts are expected to count
+# empty texts are expected *not* to count
 
 # text_id = TST_TST.00002
 1	ich	ich	PPER	PPER	_	_	_	_	1
@@ -26,6 +26,6 @@
 # make sure that an empty text header does no harm
 
 # text_id = TST_TST.00007
-# in the unigrams we should have 7 start and end tags
+# in the unigrams we should have 3 start and end tags, because empty texts are no longer counted as of v2.2.3
 
 
diff --git a/src/test/resources/simple_1gram_padded_ignore-empty-texts.freq b/src/test/resources/simple_1gram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..637d66b
--- /dev/null
+++ b/src/test/resources/simple_1gram_padded_ignore-empty-texts.freq
@@ -0,0 +1,7 @@
+«END»	3
+«START»	3
+.	3
+alex	3
+ich	3
+bin	2
+heiße	1
diff --git a/src/test/resources/simple_1lpgram_padded_ignore-empty-texts.freq b/src/test/resources/simple_1lpgram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..793f882
--- /dev/null
+++ b/src/test/resources/simple_1lpgram_padded_ignore-empty-texts.freq
@@ -0,0 +1,7 @@
+«END»	«END»	«STARTEND»	3
+«START»	«START»	«STARTEND»	3
+.	.	$.	3
+alex	alex	NE	3
+ich	ich	PPER	3
+bin	sein	VAFIN	2
+heiße	heißen	VAFIN	1
diff --git a/src/test/resources/simple_2gram_padded_ignore-empty-texts.freq b/src/test/resources/simple_2gram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..1a6fb0d
--- /dev/null
+++ b/src/test/resources/simple_2gram_padded_ignore-empty-texts.freq
@@ -0,0 +1,13 @@
+«END»	«END»	3
+«START»	«START»	3
+.	«END»	3
+alex	.	2
+«START»	ich	2
+alex	bin	1
+bin	alex	1
+bin	ich	1
+heiße	alex	1
+ich	.	1
+ich	bin	1
+ich	heiße	1
+«START»	alex	1
diff --git a/src/test/resources/simple_2lpgram_padded_ignore-empty-texts.freq b/src/test/resources/simple_2lpgram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..98544ce
--- /dev/null
+++ b/src/test/resources/simple_2lpgram_padded_ignore-empty-texts.freq
@@ -0,0 +1,13 @@
+«END»	«END»	«STARTEND»	«END»	«END»	«STARTEND»	3
+«START»	«START»	«STARTEND»	«START»	«START»	«STARTEND»	3
+.	.	$.	«END»	«END»	«STARTEND»	3
+alex	alex	NE	.	.	$.	2
+«START»	«START»	«STARTEND»	ich	ich	PPER	2
+alex	alex	NE	bin	sein	VAFIN	1
+bin	sein	VAFIN	alex	alex	NE	1
+bin	sein	VAFIN	ich	ich	PPER	1
+heiße	heißen	VAFIN	alex	alex	NE	1
+ich	ich	PPER	.	.	$.	1
+ich	ich	PPER	bin	sein	VAFIN	1
+ich	ich	PPER	heiße	heißen	VAFIN	1
+«START»	«START»	«STARTEND»	alex	alex	NE	1
diff --git a/src/test/resources/simple_3gram_padded_ignore-empty-texts.freq b/src/test/resources/simple_3gram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..431eafa
--- /dev/null
+++ b/src/test/resources/simple_3gram_padded_ignore-empty-texts.freq
@@ -0,0 +1,16 @@
+«END»	«END»	«END»	3
+«START»	«START»	«START»	3
+.	«END»	«END»	3
+alex	.	«END»	2
+«START»	«START»	ich	2
+alex	bin	ich	1
+bin	alex	.	1
+bin	ich	.	1
+heiße	alex	.	1
+ich	.	«END»	1
+ich	bin	alex	1
+ich	heiße	alex	1
+«START»	alex	bin	1
+«START»	ich	bin	1
+«START»	ich	heiße	1
+«START»	«START»	alex	1
diff --git a/src/test/resources/simple_3lpgram_padded_ignore-empty-texts.freq b/src/test/resources/simple_3lpgram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..8e13026
--- /dev/null
+++ b/src/test/resources/simple_3lpgram_padded_ignore-empty-texts.freq
@@ -0,0 +1,16 @@
+«END»	«END»	«STARTEND»	«END»	«END»	«STARTEND»	«END»	«END»	«STARTEND»	3
+«START»	«START»	«STARTEND»	«START»	«START»	«STARTEND»	«START»	«START»	«STARTEND»	3
+.	.	$.	«END»	«END»	«STARTEND»	«END»	«END»	«STARTEND»	3
+alex	alex	NE	.	.	$.	«END»	«END»	«STARTEND»	2
+«START»	«START»	«STARTEND»	«START»	«START»	«STARTEND»	ich	ich	PPER	2
+alex	alex	NE	bin	sein	VAFIN	ich	ich	PPER	1
+bin	sein	VAFIN	alex	alex	NE	.	.	$.	1
+bin	sein	VAFIN	ich	ich	PPER	.	.	$.	1
+heiße	heißen	VAFIN	alex	alex	NE	.	.	$.	1
+ich	ich	PPER	.	.	$.	«END»	«END»	«STARTEND»	1
+ich	ich	PPER	bin	sein	VAFIN	alex	alex	NE	1
+ich	ich	PPER	heiße	heißen	VAFIN	alex	alex	NE	1
+«START»	«START»	«STARTEND»	alex	alex	NE	bin	sein	VAFIN	1
+«START»	«START»	«STARTEND»	ich	ich	PPER	bin	sein	VAFIN	1
+«START»	«START»	«STARTEND»	ich	ich	PPER	heiße	heißen	VAFIN	1
+«START»	«START»	«STARTEND»	«START»	«START»	«STARTEND»	alex	alex	NE	1