Add option -Z to exclude empty texts
Change-Id: I1f2594ce839351205ce43b5047b349af7ba019bb
diff --git a/src/test/java/org/ids_mannheim/WorkerTest.java b/src/test/java/org/ids_mannheim/WorkerTest.java
index f1ec290..2bce13d 100644
--- a/src/test/java/org/ids_mannheim/WorkerTest.java
+++ b/src/test/java/org/ids_mannheim/WorkerTest.java
@@ -24,6 +24,7 @@
public static final String splitFreqlistRegex = "\\t(?=[0-9]+$)";
private ByteArrayOutputStream errContent;
private final PrintStream originalErr = System.err;
+ private final boolean excludeEmptyTexts= true;
Worker worker;
ConcurrentHashMap<String, AtomicInteger> map;
@BeforeEach
@@ -73,7 +74,7 @@
false,
new WorkerNodePool(""),
new Progressbar(tempFile.length()),
- Logger.getLogger(TotalNGrams.class.getSimpleName()), false, false);
+ Logger.getLogger(TotalNGrams.class.getSimpleName()), false, false, excludeEmptyTexts);
queue.add(0);
queue.add(-1);
@@ -122,7 +123,7 @@
false,
new WorkerNodePool(""),
new Progressbar(tempFile.length()),
- Logger.getLogger(TotalNGrams.class.getSimpleName()), false, true);
+ Logger.getLogger(TotalNGrams.class.getSimpleName()), false, true, excludeEmptyTexts);
queue.add(0);
queue.add(-1);
@@ -179,7 +180,7 @@
new WorkerNodePool(""),
new Progressbar(tempFile.length()),
Logger.getLogger(TotalNGrams.class.getSimpleName()),
- false, false);
+ false, false, excludeEmptyTexts);
queue.add(0);
queue.add(-1);
@@ -225,7 +226,7 @@
new WorkerNodePool(""),
new Progressbar(tempFile.length()),
Logger.getLogger(TotalNGrams.class.getSimpleName()),
- false, false);
+ false, false, excludeEmptyTexts);
queue.add(0);
queue.add(-1);
@@ -271,7 +272,7 @@
new WorkerNodePool(""),
new Progressbar(tempFile.length()),
Logger.getLogger(TotalNGrams.class.getSimpleName()),
- false, false);
+ false, false, excludeEmptyTexts);
queue.add(0);
queue.add(-1);
@@ -287,57 +288,59 @@
IOUtils.copy(Objects.requireNonNull(Thread.currentThread().getContextClassLoader()
.getResourceAsStream("simple.conllu")), out);
}
-
for (boolean with_padding : new boolean[]{false, true}) {
- for (boolean with_lemma_and_pos : new boolean[]{false, true}) {
- for (int n = 1; n <= 3; n++) {
- ArrayList<String> fnames = new ArrayList<>();
- fnames.add(tempFile.getAbsolutePath());
+ for (boolean excludeEmptyTexts : new boolean[]{false, true}) {
+ for (boolean with_lemma_and_pos : new boolean[]{false, true}) {
+ for (int n = 1; n <= 3; n++) {
+ ArrayList<String> fnames = new ArrayList<>();
+ fnames.add(tempFile.getAbsolutePath());
- File tempFreqFile = File.createTempFile("simple", ".freq");
- tempFreqFile.deleteOnExit();
- String goldFileName = "simple_" + n + (with_lemma_and_pos ? "lp" : "") + "gram" + (with_padding? "_padded" : "") + ".freq";
- try (FileOutputStream out = new FileOutputStream(tempFreqFile)) {
- IOUtils.copy(Objects.requireNonNull(Thread.currentThread().getContextClassLoader()
- .getResourceAsStream(goldFileName)), out);
+ File tempFreqFile = File.createTempFile("simple", ".freq");
+ tempFreqFile.deleteOnExit();
+ String goldFileName = "simple_" + n + (with_lemma_and_pos ? "lp" : "") + "gram" + (with_padding? "_padded" : "") + (with_padding && excludeEmptyTexts? "_ignore-empty-texts" : "") + ".freq";
+ try (FileOutputStream out = new FileOutputStream(tempFreqFile)) {
+ IOUtils.copy(Objects.requireNonNull(Thread.currentThread().getContextClassLoader()
+ .getResourceAsStream(goldFileName)), out);
+ }
+
+ Path path = FileSystems.getDefault().getPath(tempFreqFile.getAbsolutePath());
+ Map<String, Integer> gold = Files.lines(path)
+ .filter(s -> s.matches(".*\\t.*"))
+ .collect(Collectors.toMap(k -> k.split(splitFreqlistRegex)[0], v -> Integer.parseInt(v.split(splitFreqlistRegex)[1])));
+ assertTrue(gold.size() > 0, "Gold frequency test file '"+goldFileName + "' is parsed correctly");
+
+ map = new ConcurrentHashMap<>();
+ LinkedBlockingQueue<Integer> queue = new LinkedBlockingQueue<>(2);
+ worker = new Worker(
+ queue,
+ fnames,
+ n,
+ 1,
+ 1,
+ map,
+ with_lemma_and_pos,
+ false,
+ new WorkerNodePool(""),
+ new Progressbar(tempFile.length()),
+ Logger.getLogger(TotalNGrams.class.getSimpleName()),
+ with_padding, false, excludeEmptyTexts);
+
+ queue.add(0);
+ queue.add(-1);
+ worker.run();
+ String conditionDescription = " like in " +goldFileName + " for simple.conllu in condition: "
+ + (with_padding ? "padding" : "no padding") + ", "
+ + (excludeEmptyTexts ? "excluding empty texts" : "not excluding empty texts") + ", "
+ + (with_lemma_and_pos ? "with lemma and POS" : "without lemma or pos");
+ gold.forEach((key, value) -> {
+ assertNotNull(map.get(key), "Key " + key + " exists " + conditionDescription);
+ assertEquals(value, map.get(key).intValue(),
+ "Frequency for " + key + " is correct " + conditionDescription);
+ });
+
+ assertEquals(map.size(), gold.size(), "Actual map should not contain more keys than gold map, in "
+ + conditionDescription);
}
-
- Path path = FileSystems.getDefault().getPath(tempFreqFile.getAbsolutePath());
- Map<String, Integer> gold = Files.lines(path)
- .filter(s -> s.matches(".*\\t.*"))
- .collect(Collectors.toMap(k -> k.split(splitFreqlistRegex)[0], v -> Integer.parseInt(v.split(splitFreqlistRegex)[1])));
- assertTrue(gold.size() > 0, "Gold frequency test file '"+goldFileName + "' is parsed correctly");
-
- map = new ConcurrentHashMap<>();
- LinkedBlockingQueue<Integer> queue = new LinkedBlockingQueue<>(2);
- worker = new Worker(
- queue,
- fnames,
- n,
- 1,
- 1,
- map,
- with_lemma_and_pos,
- false,
- new WorkerNodePool(""),
- new Progressbar(tempFile.length()),
- Logger.getLogger(TotalNGrams.class.getSimpleName()),
- with_padding, false);
-
- queue.add(0);
- queue.add(-1);
- worker.run();
- String conditionDescription = " like in " +goldFileName + " for simple.conllu in condition: "
- + (with_padding ? "padding" : "no padding") + ", "
- + (with_lemma_and_pos ? "with lemma and POS" : "without lemma or pos");
- gold.forEach((key, value) -> {
- assertNotNull(map.get(key), "Key " + key + " exists " + conditionDescription);
- assertEquals(value, map.get(key).intValue(),
- "Frequency for " + key + " is correct " + conditionDescription);
- });
-
- assertEquals(map.size(), gold.size(), "Actual map should not contain more keys than gold map, in "
- + conditionDescription);
}
}
}
diff --git a/src/test/resources/simple.conllu b/src/test/resources/simple.conllu
index 5a907cc..48dc9fc 100644
--- a/src/test/resources/simple.conllu
+++ b/src/test/resources/simple.conllu
@@ -1,5 +1,5 @@
# text_id = TST_TST.00001
-# empty texts are expected to count
+# empty texts are expected *not* to count
# text_id = TST_TST.00002
1 ich ich PPER PPER _ _ _ _ 1
@@ -26,6 +26,6 @@
# make sure that an empty text header does no harm
# text_id = TST_TST.00007
-# in the unigrams we should have 7 start and end tags
+# in the unigrams we should have 3 start and end tags, because empty texts do not count starting with v2.2.3
diff --git a/src/test/resources/simple_1gram_padded_ignore-empty-texts.freq b/src/test/resources/simple_1gram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..637d66b
--- /dev/null
+++ b/src/test/resources/simple_1gram_padded_ignore-empty-texts.freq
@@ -0,0 +1,7 @@
+«END» 3
+«START» 3
+. 3
+alex 3
+ich 3
+bin 2
+heiße 1
diff --git a/src/test/resources/simple_1lpgram_padded_ignore-empty-texts.freq b/src/test/resources/simple_1lpgram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..793f882
--- /dev/null
+++ b/src/test/resources/simple_1lpgram_padded_ignore-empty-texts.freq
@@ -0,0 +1,7 @@
+«END» «END» «STARTEND» 3
+«START» «START» «STARTEND» 3
+. . $. 3
+alex alex NE 3
+ich ich PPER 3
+bin sein VAFIN 2
+heiße heißen VAFIN 1
diff --git a/src/test/resources/simple_2gram_padded_ignore-empty-texts.freq b/src/test/resources/simple_2gram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..1a6fb0d
--- /dev/null
+++ b/src/test/resources/simple_2gram_padded_ignore-empty-texts.freq
@@ -0,0 +1,13 @@
+«END» «END» 3
+«START» «START» 3
+. «END» 3
+alex . 2
+«START» ich 2
+alex bin 1
+bin alex 1
+bin ich 1
+heiße alex 1
+ich . 1
+ich bin 1
+ich heiße 1
+«START» alex 1
diff --git a/src/test/resources/simple_2lpgram_padded_ignore-empty-texts.freq b/src/test/resources/simple_2lpgram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..98544ce
--- /dev/null
+++ b/src/test/resources/simple_2lpgram_padded_ignore-empty-texts.freq
@@ -0,0 +1,13 @@
+«END» «END» «STARTEND» «END» «END» «STARTEND» 3
+«START» «START» «STARTEND» «START» «START» «STARTEND» 3
+. . $. «END» «END» «STARTEND» 3
+alex alex NE . . $. 2
+«START» «START» «STARTEND» ich ich PPER 2
+alex alex NE bin sein VAFIN 1
+bin sein VAFIN alex alex NE 1
+bin sein VAFIN ich ich PPER 1
+heiße heißen VAFIN alex alex NE 1
+ich ich PPER . . $. 1
+ich ich PPER bin sein VAFIN 1
+ich ich PPER heiße heißen VAFIN 1
+«START» «START» «STARTEND» alex alex NE 1
diff --git a/src/test/resources/simple_3gram_padded_ignore-empty-texts.freq b/src/test/resources/simple_3gram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..431eafa
--- /dev/null
+++ b/src/test/resources/simple_3gram_padded_ignore-empty-texts.freq
@@ -0,0 +1,16 @@
+«END» «END» «END» 3
+«START» «START» «START» 3
+. «END» «END» 3
+alex . «END» 2
+«START» «START» ich 2
+alex bin ich 1
+bin alex . 1
+bin ich . 1
+heiße alex . 1
+ich . «END» 1
+ich bin alex 1
+ich heiße alex 1
+«START» alex bin 1
+«START» ich bin 1
+«START» ich heiße 1
+«START» «START» alex 1
diff --git a/src/test/resources/simple_3lpgram_padded_ignore-empty-texts.freq b/src/test/resources/simple_3lpgram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..8e13026
--- /dev/null
+++ b/src/test/resources/simple_3lpgram_padded_ignore-empty-texts.freq
@@ -0,0 +1,16 @@
+«END» «END» «STARTEND» «END» «END» «STARTEND» «END» «END» «STARTEND» 3
+«START» «START» «STARTEND» «START» «START» «STARTEND» «START» «START» «STARTEND» 3
+. . $. «END» «END» «STARTEND» «END» «END» «STARTEND» 3
+alex alex NE . . $. «END» «END» «STARTEND» 2
+«START» «START» «STARTEND» «START» «START» «STARTEND» ich ich PPER 2
+alex alex NE bin sein VAFIN ich ich PPER 1
+bin sein VAFIN alex alex NE . . $. 1
+bin sein VAFIN ich ich PPER . . $. 1
+heiße heißen VAFIN alex alex NE . . $. 1
+ich ich PPER . . $. «END» «END» «STARTEND» 1
+ich ich PPER bin sein VAFIN alex alex NE 1
+ich ich PPER heiße heißen VAFIN alex alex NE 1
+«START» «START» «STARTEND» alex alex NE bin sein VAFIN 1
+«START» «START» «STARTEND» ich ich PPER bin sein VAFIN 1
+«START» «START» «STARTEND» ich ich PPER heiße heißen VAFIN 1
+«START» «START» «STARTEND» «START» «START» «STARTEND» alex alex NE 1