Add option -Z to exclude empty texts

Change-Id: I1f2594ce839351205ce43b5047b349af7ba019bb

commit: 104c94b34d577adb1e51979ef9c12619f58382fb [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Thu Feb 02 19:53:53 2023 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Sun Feb 05 11:16:06 2023 +0100
tree: 1a9c2501101c3c1383f29da77af667e235232f77
parent: b028de453ff7bc84e558bdf05c331803b1cdb0a8 [diff]
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a1ba41c..b161766 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md

@@ -1,5 +1,7 @@
 # Changelog
 
+- added option `--exclude-empty-texts` (`-Z`)
+
 ## [2.2.2] - 2022-01-23
 - fixed empty cardinals (e.g. "1000" -> "") in FilterKeys result
 

diff --git a/Readme.md b/Readme.md
index 52fcc3f..9e018be 100644
--- a/Readme.md
+++ b/Readme.md

@@ -73,6 +73,7 @@
         -n $n \
         -f $f \
         -F $FOLDS \
+        --exclude-empty-texts \
         $l -o "$BASE/paddedlemmaposfreq/$n-gram-token$l-freqs.$f.tsv.xz" $BASE/conllu/*.conllu.gz
     done
   done

diff --git a/src/main/java/org/ids_mannheim/TotalNGrams.java b/src/main/java/org/ids_mannheim/TotalNGrams.java
index f6ca887..0823c32 100644
--- a/src/main/java/org/ids_mannheim/TotalNGrams.java
+++ b/src/main/java/org/ids_mannheim/TotalNGrams.java

@@ -54,7 +54,7 @@
 
     @SuppressWarnings("CanBeFinal")
     @CommandLine.Option(names = { "-l",
-            "--with-lemma-pos" }, description = "Use also lemma and part-of-speech annotations (default: ${DEFAULT-VALUE}")
+            "--with-lemma-pos" }, description = "Use also lemma and part-of-speech annotations (default: ${DEFAULT-VALUE})")
     boolean with_lemma_and_pos = false;
 
     @SuppressWarnings("CanBeFinal")
@@ -103,6 +103,11 @@
             "--exclude-punctuation" }, description = "Ignore all tokens tagged as punctuation (according to STTS tags set, i.e. starting with '$') (default: ${DEFAULT-VALUE})")
     boolean excludePunctuation = false;
 
+    @SuppressWarnings("CanBeFinal")
+    @CommandLine.Option(names = { "-Z",
+            "--exclude-empty-texts" }, description = "Ignore all texts without any tokens. This option only makes a difference in combination with --pad. If not set (and --pad is set), empty texts will be visible, e.g. in 2-gram frequencies as «START»-«END»-bigrams.")
+    boolean excludeEmptyTexts = false;
+
     private Progressbar etaPrinter;
 
     public TotalNGrams() {
@@ -179,7 +184,7 @@
         logger.info("Processing fold " + fold + "/" + FOLDS);
         logger.info("Using " + threads + " threads");
         IntStream.range(0, threads).forEach(unused -> es.execute(new Worker(queue, inputFiles, ngram_size, fold, FOLDS,
-                map, with_lemma_and_pos, downcase_tokens, workerNodePool, etaPrinter, logger, addPadding, excludePunctuation)));
+                map, with_lemma_and_pos, downcase_tokens, workerNodePool, etaPrinter, logger, addPadding, excludePunctuation, excludeEmptyTexts)));
         queue.addAll(IntStream.range(0, inputFiles.size()).boxed().collect(Collectors.toList()));
         IntStream.range(0, threads).forEach(unused -> {
             try {

diff --git a/src/main/java/org/ids_mannheim/Worker.java b/src/main/java/org/ids_mannheim/Worker.java
index e5b1f9b..aa0256c 100644
--- a/src/main/java/org/ids_mannheim/Worker.java
+++ b/src/main/java/org/ids_mannheim/Worker.java

@@ -33,11 +33,12 @@
 
     private final DeterministicRandomProvider deterministicRandomProvider;
     private final boolean excludePunctuation;
+    private final boolean excludeEmptyTexts;
 
     public Worker(BlockingQueue<Integer> queue, ArrayList<String> fnames, int ngram_size, int target_fold, int folds,
                   ConcurrentHashMap<String, AtomicInteger> map,
                   boolean with_lemma_and_pos, boolean downcase_tokens, WorkerNodePool pool,
-                  Progressbar etaPrinter, Logger logger, boolean addPadding, boolean excludePunctuation) {
+                  Progressbar etaPrinter, Logger logger, boolean addPadding, boolean excludePunctuation, boolean excludeEmptyTexts) {
         this.queue = queue;
         this.fnames = fnames;
         this.map = map;
@@ -51,6 +52,7 @@
         this.downcase_tokens = downcase_tokens;
         this.addPadding = addPadding;
         this.excludePunctuation = excludePunctuation;
+        this.excludeEmptyTexts = excludeEmptyTexts;
     }
 
     @Override
@@ -59,12 +61,14 @@
             int index = queue.take();
             int retries = MAX_RETRIES;
             int texts = 0;
+            boolean reset_required = true;
             SlidingWindowQueue slidingWindowQueue = (addPadding ? new PaddedSlidingWindowQueue(ngram_size, s -> FoldedEntry.incr(map, s), with_lemma_and_pos) :
                     new SlidingWindowQueue(ngram_size, s -> FoldedEntry.incr(map, s), with_lemma_and_pos));
             while (index >= 0) {
                 String fname = fnames.get(index);
                 File current_file = new File(fname);
                 long file_size = current_file.length();
+                int tokens = 0;
                 int poolIndex = 0;
                 BufferedReader in = null;
                 logger.info(String.format("Processing %d/%d %s %s", index, fnames.size(), pool.getHost(poolIndex), current_file.getName()));
@@ -114,17 +118,26 @@
                 }
                 String line;
                 int fold = -1;
+                String text_sigle = "";
                 while ((line = in.readLine()) != null) {
                     if (line.startsWith("#")) {
                         Matcher matcher = new_text_pattern.matcher(line);
                         if (matcher.find()) {
                             if (fold == target_fold) {
-                                slidingWindowQueue.textBreak();
+                                if (!excludeEmptyTexts || tokens > 0) {
+                                    slidingWindowQueue.textBreak();
+                                    tokens = 0;
+                                    reset_required = true;
+                                } else {
+                                    logger.info(pool.getHost(poolIndex) + ": text " + text_sigle + " is empty");
+                                }
                             }
-                            fold = deterministicRandomProvider.getFoldFromTextID(matcher.group(1)) + 1;
+                            text_sigle = matcher.group(1);
+                            fold = deterministicRandomProvider.getFoldFromTextID(text_sigle) + 1;
                             texts++;
-                            if (fold == target_fold) {
+                            if (!excludeEmptyTexts && fold == target_fold) {
                                 slidingWindowQueue.reset(fold);
+                                reset_required = false;
                             }
                         }
                     } else if (fold == target_fold) {
@@ -137,6 +150,11 @@
                                 Utils.unEscapeEntities(strings[1]).toLowerCase(Locale.ROOT) :
                                 Utils.unEscapeEntities(strings[1]));
                         if(!excludePunctuation || !Utils.isPunctuation(token, strings[2], strings[3])) {
+                            tokens++;
+                            if (reset_required) {
+                                slidingWindowQueue.reset(fold);
+                                reset_required = false;
+                            }
                             if (with_lemma_and_pos) {
                                 String lemma, pos;
 
@@ -167,7 +185,13 @@
                     }
                 }
                 if (fold == target_fold) {
-                    slidingWindowQueue.textBreak();
+                    if (!excludeEmptyTexts || tokens > 0) {
+                        slidingWindowQueue.textBreak();
+                        reset_required = true;
+                    } else {
+                        logger.info(pool.getHost(poolIndex) + ": last text " + text_sigle + " is empty");
+                        reset_required = false;
+                    }
                 }
                 pool.markFree(poolIndex);
                 if (texts > 0) {

diff --git a/src/test/java/org/ids_mannheim/WorkerTest.java b/src/test/java/org/ids_mannheim/WorkerTest.java
index f1ec290..2bce13d 100644
--- a/src/test/java/org/ids_mannheim/WorkerTest.java
+++ b/src/test/java/org/ids_mannheim/WorkerTest.java

@@ -24,6 +24,7 @@
     public static final String splitFreqlistRegex = "\\t(?=[0-9]+$)";
     private ByteArrayOutputStream errContent;
     private final PrintStream originalErr = System.err;
+    private final boolean excludeEmptyTexts= true;
     Worker worker;
     ConcurrentHashMap<String, AtomicInteger> map;
     @BeforeEach
@@ -73,7 +74,7 @@
                 false,
                 new WorkerNodePool(""),
                 new Progressbar(tempFile.length()),
-                Logger.getLogger(TotalNGrams.class.getSimpleName()), false, false);
+                Logger.getLogger(TotalNGrams.class.getSimpleName()), false, false, excludeEmptyTexts);
 
         queue.add(0);
         queue.add(-1);
@@ -122,7 +123,7 @@
                 false,
                 new WorkerNodePool(""),
                 new Progressbar(tempFile.length()),
-                Logger.getLogger(TotalNGrams.class.getSimpleName()), false, true);
+                Logger.getLogger(TotalNGrams.class.getSimpleName()), false, true, excludeEmptyTexts);
 
         queue.add(0);
         queue.add(-1);
@@ -179,7 +180,7 @@
                 new WorkerNodePool(""),
                 new Progressbar(tempFile.length()),
                 Logger.getLogger(TotalNGrams.class.getSimpleName()),
-                false, false);
+                false, false, excludeEmptyTexts);
 
         queue.add(0);
         queue.add(-1);
@@ -225,7 +226,7 @@
                 new WorkerNodePool(""),
                 new Progressbar(tempFile.length()),
                 Logger.getLogger(TotalNGrams.class.getSimpleName()),
-                false, false);
+                false, false, excludeEmptyTexts);
 
         queue.add(0);
         queue.add(-1);
@@ -271,7 +272,7 @@
                 new WorkerNodePool(""),
                 new Progressbar(tempFile.length()),
                 Logger.getLogger(TotalNGrams.class.getSimpleName()),
-                false, false);
+                false, false, excludeEmptyTexts);
 
         queue.add(0);
         queue.add(-1);
@@ -287,57 +288,59 @@
             IOUtils.copy(Objects.requireNonNull(Thread.currentThread().getContextClassLoader()
                     .getResourceAsStream("simple.conllu")), out);
         }
-
         for (boolean with_padding : new boolean[]{false, true}) {
-            for (boolean with_lemma_and_pos : new boolean[]{false, true}) {
-                for (int n = 1; n <= 3; n++) {
-                    ArrayList<String> fnames = new ArrayList<>();
-                    fnames.add(tempFile.getAbsolutePath());
+            for (boolean excludeEmptyTexts : new boolean[]{false, true}) {
+                for (boolean with_lemma_and_pos : new boolean[]{false, true}) {
+                    for (int n = 1; n <= 3; n++) {
+                        ArrayList<String> fnames = new ArrayList<>();
+                        fnames.add(tempFile.getAbsolutePath());
 
-                    File tempFreqFile = File.createTempFile("simple", ".freq");
-                    tempFreqFile.deleteOnExit();
-                    String goldFileName = "simple_" + n + (with_lemma_and_pos ? "lp" : "") + "gram" + (with_padding? "_padded" : "") + ".freq";
-                    try (FileOutputStream out = new FileOutputStream(tempFreqFile)) {
-                        IOUtils.copy(Objects.requireNonNull(Thread.currentThread().getContextClassLoader()
-                                .getResourceAsStream(goldFileName)), out);
+                        File tempFreqFile = File.createTempFile("simple", ".freq");
+                        tempFreqFile.deleteOnExit();
+                        String goldFileName = "simple_" + n + (with_lemma_and_pos ? "lp" : "") + "gram" + (with_padding? "_padded" : "") + (with_padding && excludeEmptyTexts? "_ignore-empty-texts" : "") + ".freq";
+                        try (FileOutputStream out = new FileOutputStream(tempFreqFile)) {
+                            IOUtils.copy(Objects.requireNonNull(Thread.currentThread().getContextClassLoader()
+                                    .getResourceAsStream(goldFileName)), out);
+                        }
+
+                        Path path = FileSystems.getDefault().getPath(tempFreqFile.getAbsolutePath());
+                        Map<String, Integer> gold = Files.lines(path)
+                                .filter(s -> s.matches(".*\\t.*"))
+                                .collect(Collectors.toMap(k -> k.split(splitFreqlistRegex)[0], v -> Integer.parseInt(v.split(splitFreqlistRegex)[1])));
+                        assertTrue(gold.size() > 0, "Gold frequency test file '"+goldFileName + "' is parsed correctly");
+
+                        map = new ConcurrentHashMap<>();
+                        LinkedBlockingQueue<Integer> queue = new LinkedBlockingQueue<>(2);
+                        worker = new Worker(
+                                queue,
+                                fnames,
+                                n,
+                                1,
+                                1,
+                                map,
+                                with_lemma_and_pos,
+                                false,
+                                new WorkerNodePool(""),
+                                new Progressbar(tempFile.length()),
+                                Logger.getLogger(TotalNGrams.class.getSimpleName()),
+                                with_padding, false, excludeEmptyTexts);
+
+                        queue.add(0);
+                        queue.add(-1);
+                        worker.run();
+                        String conditionDescription =  " like in " +goldFileName + " for simple.conllu in condition: "
+                                + (with_padding ? "padding" : "no padding") + ", "
+                                + (excludeEmptyTexts ? "excluding empty texts" : "not excluding empty texts") + ", "
+                                + (with_lemma_and_pos ? "with lemma and POS" : "without lemma or pos");
+                        gold.forEach((key, value) -> {
+                            assertNotNull(map.get(key), "Key " + key + " exists " + conditionDescription);
+                            assertEquals(value, map.get(key).intValue(),
+                                    "Frequency for " + key + " is correct " + conditionDescription);
+                        });
+
+                        assertEquals(map.size(), gold.size(), "Actual map should not contain more keys than gold map, in "
+                                + conditionDescription);
                     }
-
-                    Path path = FileSystems.getDefault().getPath(tempFreqFile.getAbsolutePath());
-                    Map<String, Integer> gold = Files.lines(path)
-                            .filter(s -> s.matches(".*\\t.*"))
-                            .collect(Collectors.toMap(k -> k.split(splitFreqlistRegex)[0], v -> Integer.parseInt(v.split(splitFreqlistRegex)[1])));
-                    assertTrue(gold.size() > 0, "Gold frequency test file '"+goldFileName + "' is parsed correctly");
-
-                    map = new ConcurrentHashMap<>();
-                    LinkedBlockingQueue<Integer> queue = new LinkedBlockingQueue<>(2);
-                    worker = new Worker(
-                            queue,
-                            fnames,
-                            n,
-                            1,
-                            1,
-                            map,
-                            with_lemma_and_pos,
-                            false,
-                            new WorkerNodePool(""),
-                            new Progressbar(tempFile.length()),
-                            Logger.getLogger(TotalNGrams.class.getSimpleName()),
-                            with_padding, false);
-
-                    queue.add(0);
-                    queue.add(-1);
-                    worker.run();
-                    String conditionDescription =  " like in " +goldFileName + " for simple.conllu in condition: "
-                            + (with_padding ? "padding" : "no padding") + ", "
-                            + (with_lemma_and_pos ? "with lemma and POS" : "without lemma or pos");
-                    gold.forEach((key, value) -> {
-                        assertNotNull(map.get(key), "Key " + key + " exists " + conditionDescription);
-                        assertEquals(value, map.get(key).intValue(),
-                                "Frequency for " + key + " is correct " + conditionDescription);
-                    });
-
-                    assertEquals(map.size(), gold.size(), "Actual map should not contain more keys than gold map, in "
-                    + conditionDescription);
                 }
             }
         }

diff --git a/src/test/resources/simple.conllu b/src/test/resources/simple.conllu
index 5a907cc..48dc9fc 100644
--- a/src/test/resources/simple.conllu
+++ b/src/test/resources/simple.conllu

@@ -1,5 +1,5 @@
 # text_id = TST_TST.00001
-# empty texts are expected to count
+# empty texts are expected *not* to count
 
 # text_id = TST_TST.00002
 1	ich	ich	PPER	PPER	_	_	_	_	1
@@ -26,6 +26,6 @@
 # make sure that an empty text header does no harm
 
 # text_id = TST_TST.00007
-# in the unigrams we should have 7 start and end tags
+# in the unigrams we should have 3 start and end tags, because empty texts do not count starting with v2.2.3
 
 

diff --git a/src/test/resources/simple_1gram_padded_ignore-empty-texts.freq b/src/test/resources/simple_1gram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..637d66b
--- /dev/null
+++ b/src/test/resources/simple_1gram_padded_ignore-empty-texts.freq

@@ -0,0 +1,7 @@
+«END»	3
+«START»	3
+.	3
+alex	3
+ich	3
+bin	2
+heiße	1

diff --git a/src/test/resources/simple_1lpgram_padded_ignore-empty-texts.freq b/src/test/resources/simple_1lpgram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..793f882
--- /dev/null
+++ b/src/test/resources/simple_1lpgram_padded_ignore-empty-texts.freq

@@ -0,0 +1,7 @@
+«END»	«END»	«STARTEND»	3
+«START»	«START»	«STARTEND»	3
+.	.	$.	3
+alex	alex	NE	3
+ich	ich	PPER	3
+bin	sein	VAFIN	2
+heiße	heißen	VAFIN	1

diff --git a/src/test/resources/simple_2gram_padded_ignore-empty-texts.freq b/src/test/resources/simple_2gram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..1a6fb0d
--- /dev/null
+++ b/src/test/resources/simple_2gram_padded_ignore-empty-texts.freq

@@ -0,0 +1,13 @@
+«END»	«END»	3
+«START»	«START»	3
+.	«END»	3
+alex	.	2
+«START»	ich	2
+alex	bin	1
+bin	alex	1
+bin	ich	1
+heiße	alex	1
+ich	.	1
+ich	bin	1
+ich	heiße	1
+«START»	alex	1

diff --git a/src/test/resources/simple_2lpgram_padded_ignore-empty-texts.freq b/src/test/resources/simple_2lpgram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..98544ce
--- /dev/null
+++ b/src/test/resources/simple_2lpgram_padded_ignore-empty-texts.freq

@@ -0,0 +1,13 @@
+«END»	«END»	«STARTEND»	«END»	«END»	«STARTEND»	3
+«START»	«START»	«STARTEND»	«START»	«START»	«STARTEND»	3
+.	.	$.	«END»	«END»	«STARTEND»	3
+alex	alex	NE	.	.	$.	2
+«START»	«START»	«STARTEND»	ich	ich	PPER	2
+alex	alex	NE	bin	sein	VAFIN	1
+bin	sein	VAFIN	alex	alex	NE	1
+bin	sein	VAFIN	ich	ich	PPER	1
+heiße	heißen	VAFIN	alex	alex	NE	1
+ich	ich	PPER	.	.	$.	1
+ich	ich	PPER	bin	sein	VAFIN	1
+ich	ich	PPER	heiße	heißen	VAFIN	1
+«START»	«START»	«STARTEND»	alex	alex	NE	1

diff --git a/src/test/resources/simple_3gram_padded_ignore-empty-texts.freq b/src/test/resources/simple_3gram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..431eafa
--- /dev/null
+++ b/src/test/resources/simple_3gram_padded_ignore-empty-texts.freq

@@ -0,0 +1,16 @@
+«END»	«END»	«END»	3
+«START»	«START»	«START»	3
+.	«END»	«END»	3
+alex	.	«END»	2
+«START»	«START»	ich	2
+alex	bin	ich	1
+bin	alex	.	1
+bin	ich	.	1
+heiße	alex	.	1
+ich	.	«END»	1
+ich	bin	alex	1
+ich	heiße	alex	1
+«START»	alex	bin	1
+«START»	ich	bin	1
+«START»	ich	heiße	1
+«START»	«START»	alex	1

diff --git a/src/test/resources/simple_3lpgram_padded_ignore-empty-texts.freq b/src/test/resources/simple_3lpgram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..8e13026
--- /dev/null
+++ b/src/test/resources/simple_3lpgram_padded_ignore-empty-texts.freq

@@ -0,0 +1,16 @@
+«END»	«END»	«STARTEND»	«END»	«END»	«STARTEND»	«END»	«END»	«STARTEND»	3
+«START»	«START»	«STARTEND»	«START»	«START»	«STARTEND»	«START»	«START»	«STARTEND»	3
+.	.	$.	«END»	«END»	«STARTEND»	«END»	«END»	«STARTEND»	3
+alex	alex	NE	.	.	$.	«END»	«END»	«STARTEND»	2
+«START»	«START»	«STARTEND»	«START»	«START»	«STARTEND»	ich	ich	PPER	2
+alex	alex	NE	bin	sein	VAFIN	ich	ich	PPER	1
+bin	sein	VAFIN	alex	alex	NE	.	.	$.	1
+bin	sein	VAFIN	ich	ich	PPER	.	.	$.	1
+heiße	heißen	VAFIN	alex	alex	NE	.	.	$.	1
+ich	ich	PPER	.	.	$.	«END»	«END»	«STARTEND»	1
+ich	ich	PPER	bin	sein	VAFIN	alex	alex	NE	1
+ich	ich	PPER	heiße	heißen	VAFIN	alex	alex	NE	1
+«START»	«START»	«STARTEND»	alex	alex	NE	bin	sein	VAFIN	1
+«START»	«START»	«STARTEND»	ich	ich	PPER	bin	sein	VAFIN	1
+«START»	«START»	«STARTEND»	ich	ich	PPER	heiße	heißen	VAFIN	1
+«START»	«START»	«STARTEND»	«START»	«START»	«STARTEND»	alex	alex	NE	1
commit	104c94b34d577adb1e51979ef9c12619f58382fb	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Feb 02 19:53:53 2023 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Sun Feb 05 11:16:06 2023 +0100
tree	1a9c2501101c3c1383f29da77af667e235232f77
parent	b028de453ff7bc84e558bdf05c331803b1cdb0a8 [diff]