Add option -Z to exclude empty texts
Change-Id: I1f2594ce839351205ce43b5047b349af7ba019bb
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a1ba41c..b161766 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,7 @@
# Changelog
+- added option `--exclude-empty-texts` (`-Z`)
+
## [2.2.2] - 2022-01-23
- fixed empty cardinals (e.g. "1000" -> "") in FilterKeys result
diff --git a/Readme.md b/Readme.md
index 52fcc3f..9e018be 100644
--- a/Readme.md
+++ b/Readme.md
@@ -73,6 +73,7 @@
-n $n \
-f $f \
-F $FOLDS \
+ --exclude-empty-texts \
$l -o "$BASE/paddedlemmaposfreq/$n-gram-token$l-freqs.$f.tsv.xz" $BASE/conllu/*.conllu.gz
done
done
diff --git a/src/main/java/org/ids_mannheim/TotalNGrams.java b/src/main/java/org/ids_mannheim/TotalNGrams.java
index f6ca887..0823c32 100644
--- a/src/main/java/org/ids_mannheim/TotalNGrams.java
+++ b/src/main/java/org/ids_mannheim/TotalNGrams.java
@@ -54,7 +54,7 @@
@SuppressWarnings("CanBeFinal")
@CommandLine.Option(names = { "-l",
- "--with-lemma-pos" }, description = "Use also lemma and part-of-speech annotations (default: ${DEFAULT-VALUE}")
+ "--with-lemma-pos" }, description = "Use also lemma and part-of-speech annotations (default: ${DEFAULT-VALUE})")
boolean with_lemma_and_pos = false;
@SuppressWarnings("CanBeFinal")
@@ -103,6 +103,11 @@
"--exclude-punctuation" }, description = "Ignore all tokens tagged as punctuation (according to STTS tags set, i.e. starting with '$') (default: ${DEFAULT-VALUE})")
boolean excludePunctuation = false;
+ @SuppressWarnings("CanBeFinal")
+ @CommandLine.Option(names = { "-Z",
+ "--exclude-empty-texts" }, description = "Ignore all texts without any tokens. This option only makes a difference in combination with --pad. If not set (and --pad is set), empty texts will be visible, e.g. in 2-gram frequencies as «START»-«END»-bigrams. (default: ${DEFAULT-VALUE})")
+ boolean excludeEmptyTexts = false; // off by default, so empty texts keep contributing padding n-grams unless -Z is given
+
private Progressbar etaPrinter;
public TotalNGrams() {
@@ -179,7 +184,7 @@
logger.info("Processing fold " + fold + "/" + FOLDS);
logger.info("Using " + threads + " threads");
IntStream.range(0, threads).forEach(unused -> es.execute(new Worker(queue, inputFiles, ngram_size, fold, FOLDS,
- map, with_lemma_and_pos, downcase_tokens, workerNodePool, etaPrinter, logger, addPadding, excludePunctuation)));
+ map, with_lemma_and_pos, downcase_tokens, workerNodePool, etaPrinter, logger, addPadding, excludePunctuation, excludeEmptyTexts)));
queue.addAll(IntStream.range(0, inputFiles.size()).boxed().collect(Collectors.toList()));
IntStream.range(0, threads).forEach(unused -> {
try {
diff --git a/src/main/java/org/ids_mannheim/Worker.java b/src/main/java/org/ids_mannheim/Worker.java
index e5b1f9b..aa0256c 100644
--- a/src/main/java/org/ids_mannheim/Worker.java
+++ b/src/main/java/org/ids_mannheim/Worker.java
@@ -33,11 +33,12 @@
private final DeterministicRandomProvider deterministicRandomProvider;
private final boolean excludePunctuation;
+ private final boolean excludeEmptyTexts;
public Worker(BlockingQueue<Integer> queue, ArrayList<String> fnames, int ngram_size, int target_fold, int folds,
ConcurrentHashMap<String, AtomicInteger> map,
boolean with_lemma_and_pos, boolean downcase_tokens, WorkerNodePool pool,
- Progressbar etaPrinter, Logger logger, boolean addPadding, boolean excludePunctuation) {
+ Progressbar etaPrinter, Logger logger, boolean addPadding, boolean excludePunctuation, boolean excludeEmptyTexts) {
this.queue = queue;
this.fnames = fnames;
this.map = map;
@@ -51,6 +52,7 @@
this.downcase_tokens = downcase_tokens;
this.addPadding = addPadding;
this.excludePunctuation = excludePunctuation;
+ this.excludeEmptyTexts = excludeEmptyTexts;
}
@Override
@@ -59,12 +61,14 @@
int index = queue.take();
int retries = MAX_RETRIES;
int texts = 0;
+ boolean reset_required = true;
SlidingWindowQueue slidingWindowQueue = (addPadding ? new PaddedSlidingWindowQueue(ngram_size, s -> FoldedEntry.incr(map, s), with_lemma_and_pos) :
new SlidingWindowQueue(ngram_size, s -> FoldedEntry.incr(map, s), with_lemma_and_pos));
while (index >= 0) {
String fname = fnames.get(index);
File current_file = new File(fname);
long file_size = current_file.length();
+ int tokens = 0;
int poolIndex = 0;
BufferedReader in = null;
logger.info(String.format("Processing %d/%d %s %s", index, fnames.size(), pool.getHost(poolIndex), current_file.getName()));
@@ -114,17 +118,26 @@
}
String line;
int fold = -1;
+ String text_sigle = "";
while ((line = in.readLine()) != null) {
if (line.startsWith("#")) {
Matcher matcher = new_text_pattern.matcher(line);
if (matcher.find()) {
if (fold == target_fold) {
- slidingWindowQueue.textBreak();
+ if (!excludeEmptyTexts || tokens > 0) {
+ slidingWindowQueue.textBreak();
+ tokens = 0;
+ reset_required = true;
+ } else {
+ logger.info(pool.getHost(poolIndex) + ": text " + text_sigle + " is empty");
+ }
}
- fold = deterministicRandomProvider.getFoldFromTextID(matcher.group(1)) + 1;
+ text_sigle = matcher.group(1);
+ fold = deterministicRandomProvider.getFoldFromTextID(text_sigle) + 1;
texts++;
- if (fold == target_fold) {
+ if (!excludeEmptyTexts && fold == target_fold) {
slidingWindowQueue.reset(fold);
+ reset_required = false;
}
}
} else if (fold == target_fold) {
@@ -137,6 +150,11 @@
Utils.unEscapeEntities(strings[1]).toLowerCase(Locale.ROOT) :
Utils.unEscapeEntities(strings[1]));
if(!excludePunctuation || !Utils.isPunctuation(token, strings[2], strings[3])) {
+ tokens++;
+ if (reset_required) {
+ slidingWindowQueue.reset(fold);
+ reset_required = false;
+ }
if (with_lemma_and_pos) {
String lemma, pos;
@@ -167,7 +185,13 @@
}
}
if (fold == target_fold) {
- slidingWindowQueue.textBreak();
+ if (!excludeEmptyTexts || tokens > 0) {
+ slidingWindowQueue.textBreak();
+ reset_required = true;
+ } else {
+ logger.info(pool.getHost(poolIndex) + ": last text " + text_sigle + " is empty");
+ reset_required = true; // the skipped text never triggered reset(fold); the queue outlives this file, so the next non-empty text still needs it
+ }
}
pool.markFree(poolIndex);
if (texts > 0) {
diff --git a/src/test/java/org/ids_mannheim/WorkerTest.java b/src/test/java/org/ids_mannheim/WorkerTest.java
index f1ec290..2bce13d 100644
--- a/src/test/java/org/ids_mannheim/WorkerTest.java
+++ b/src/test/java/org/ids_mannheim/WorkerTest.java
@@ -24,6 +24,7 @@
public static final String splitFreqlistRegex = "\\t(?=[0-9]+$)";
private ByteArrayOutputStream errContent;
private final PrintStream originalErr = System.err;
+ private final boolean excludeEmptyTexts= true;
Worker worker;
ConcurrentHashMap<String, AtomicInteger> map;
@BeforeEach
@@ -73,7 +74,7 @@
false,
new WorkerNodePool(""),
new Progressbar(tempFile.length()),
- Logger.getLogger(TotalNGrams.class.getSimpleName()), false, false);
+ Logger.getLogger(TotalNGrams.class.getSimpleName()), false, false, excludeEmptyTexts);
queue.add(0);
queue.add(-1);
@@ -122,7 +123,7 @@
false,
new WorkerNodePool(""),
new Progressbar(tempFile.length()),
- Logger.getLogger(TotalNGrams.class.getSimpleName()), false, true);
+ Logger.getLogger(TotalNGrams.class.getSimpleName()), false, true, excludeEmptyTexts);
queue.add(0);
queue.add(-1);
@@ -179,7 +180,7 @@
new WorkerNodePool(""),
new Progressbar(tempFile.length()),
Logger.getLogger(TotalNGrams.class.getSimpleName()),
- false, false);
+ false, false, excludeEmptyTexts);
queue.add(0);
queue.add(-1);
@@ -225,7 +226,7 @@
new WorkerNodePool(""),
new Progressbar(tempFile.length()),
Logger.getLogger(TotalNGrams.class.getSimpleName()),
- false, false);
+ false, false, excludeEmptyTexts);
queue.add(0);
queue.add(-1);
@@ -271,7 +272,7 @@
new WorkerNodePool(""),
new Progressbar(tempFile.length()),
Logger.getLogger(TotalNGrams.class.getSimpleName()),
- false, false);
+ false, false, excludeEmptyTexts);
queue.add(0);
queue.add(-1);
@@ -287,57 +288,59 @@
IOUtils.copy(Objects.requireNonNull(Thread.currentThread().getContextClassLoader()
.getResourceAsStream("simple.conllu")), out);
}
-
for (boolean with_padding : new boolean[]{false, true}) {
- for (boolean with_lemma_and_pos : new boolean[]{false, true}) {
- for (int n = 1; n <= 3; n++) {
- ArrayList<String> fnames = new ArrayList<>();
- fnames.add(tempFile.getAbsolutePath());
+ for (boolean excludeEmptyTexts : new boolean[]{false, true}) {
+ for (boolean with_lemma_and_pos : new boolean[]{false, true}) {
+ for (int n = 1; n <= 3; n++) {
+ ArrayList<String> fnames = new ArrayList<>();
+ fnames.add(tempFile.getAbsolutePath());
- File tempFreqFile = File.createTempFile("simple", ".freq");
- tempFreqFile.deleteOnExit();
- String goldFileName = "simple_" + n + (with_lemma_and_pos ? "lp" : "") + "gram" + (with_padding? "_padded" : "") + ".freq";
- try (FileOutputStream out = new FileOutputStream(tempFreqFile)) {
- IOUtils.copy(Objects.requireNonNull(Thread.currentThread().getContextClassLoader()
- .getResourceAsStream(goldFileName)), out);
+ File tempFreqFile = File.createTempFile("simple", ".freq");
+ tempFreqFile.deleteOnExit();
+ String goldFileName = "simple_" + n + (with_lemma_and_pos ? "lp" : "") + "gram" + (with_padding? "_padded" : "") + (with_padding && excludeEmptyTexts? "_ignore-empty-texts" : "") + ".freq";
+ try (FileOutputStream out = new FileOutputStream(tempFreqFile)) {
+ IOUtils.copy(Objects.requireNonNull(Thread.currentThread().getContextClassLoader()
+ .getResourceAsStream(goldFileName)), out);
+ }
+
+ Path path = FileSystems.getDefault().getPath(tempFreqFile.getAbsolutePath());
+ Map<String, Integer> gold = Files.lines(path)
+ .filter(s -> s.matches(".*\\t.*"))
+ .collect(Collectors.toMap(k -> k.split(splitFreqlistRegex)[0], v -> Integer.parseInt(v.split(splitFreqlistRegex)[1])));
+ assertTrue(gold.size() > 0, "Gold frequency test file '"+goldFileName + "' is parsed correctly");
+
+ map = new ConcurrentHashMap<>();
+ LinkedBlockingQueue<Integer> queue = new LinkedBlockingQueue<>(2);
+ worker = new Worker(
+ queue,
+ fnames,
+ n,
+ 1,
+ 1,
+ map,
+ with_lemma_and_pos,
+ false,
+ new WorkerNodePool(""),
+ new Progressbar(tempFile.length()),
+ Logger.getLogger(TotalNGrams.class.getSimpleName()),
+ with_padding, false, excludeEmptyTexts);
+
+ queue.add(0);
+ queue.add(-1);
+ worker.run();
+ String conditionDescription = " like in " +goldFileName + " for simple.conllu in condition: "
+ + (with_padding ? "padding" : "no padding") + ", "
+ + (excludeEmptyTexts ? "excluding empty texts" : "not excluding empty texts") + ", "
+ + (with_lemma_and_pos ? "with lemma and POS" : "without lemma or pos");
+ gold.forEach((key, value) -> {
+ assertNotNull(map.get(key), "Key " + key + " exists " + conditionDescription);
+ assertEquals(value, map.get(key).intValue(),
+ "Frequency for " + key + " is correct " + conditionDescription);
+ });
+
+ assertEquals(map.size(), gold.size(), "Actual map should not contain more keys than gold map, in "
+ + conditionDescription);
}
-
- Path path = FileSystems.getDefault().getPath(tempFreqFile.getAbsolutePath());
- Map<String, Integer> gold = Files.lines(path)
- .filter(s -> s.matches(".*\\t.*"))
- .collect(Collectors.toMap(k -> k.split(splitFreqlistRegex)[0], v -> Integer.parseInt(v.split(splitFreqlistRegex)[1])));
- assertTrue(gold.size() > 0, "Gold frequency test file '"+goldFileName + "' is parsed correctly");
-
- map = new ConcurrentHashMap<>();
- LinkedBlockingQueue<Integer> queue = new LinkedBlockingQueue<>(2);
- worker = new Worker(
- queue,
- fnames,
- n,
- 1,
- 1,
- map,
- with_lemma_and_pos,
- false,
- new WorkerNodePool(""),
- new Progressbar(tempFile.length()),
- Logger.getLogger(TotalNGrams.class.getSimpleName()),
- with_padding, false);
-
- queue.add(0);
- queue.add(-1);
- worker.run();
- String conditionDescription = " like in " +goldFileName + " for simple.conllu in condition: "
- + (with_padding ? "padding" : "no padding") + ", "
- + (with_lemma_and_pos ? "with lemma and POS" : "without lemma or pos");
- gold.forEach((key, value) -> {
- assertNotNull(map.get(key), "Key " + key + " exists " + conditionDescription);
- assertEquals(value, map.get(key).intValue(),
- "Frequency for " + key + " is correct " + conditionDescription);
- });
-
- assertEquals(map.size(), gold.size(), "Actual map should not contain more keys than gold map, in "
- + conditionDescription);
}
}
}
diff --git a/src/test/resources/simple.conllu b/src/test/resources/simple.conllu
index 5a907cc..48dc9fc 100644
--- a/src/test/resources/simple.conllu
+++ b/src/test/resources/simple.conllu
@@ -1,5 +1,5 @@
# text_id = TST_TST.00001
-# empty texts are expected to count
+# empty texts are expected *not* to count when --exclude-empty-texts (-Z) is set
# text_id = TST_TST.00002
1 ich ich PPER PPER _ _ _ _ 1
@@ -26,6 +26,6 @@
# make sure that an empty text header does no harm
# text_id = TST_TST.00007
-# in the unigrams we should have 7 start and end tags
+# in the padded unigrams we should have 3 start and end tags when empty texts are excluded (--exclude-empty-texts, available starting with v2.2.3), and 7 otherwise
diff --git a/src/test/resources/simple_1gram_padded_ignore-empty-texts.freq b/src/test/resources/simple_1gram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..637d66b
--- /dev/null
+++ b/src/test/resources/simple_1gram_padded_ignore-empty-texts.freq
@@ -0,0 +1,7 @@
+«END» 3
+«START» 3
+. 3
+alex 3
+ich 3
+bin 2
+heiße 1
diff --git a/src/test/resources/simple_1lpgram_padded_ignore-empty-texts.freq b/src/test/resources/simple_1lpgram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..793f882
--- /dev/null
+++ b/src/test/resources/simple_1lpgram_padded_ignore-empty-texts.freq
@@ -0,0 +1,7 @@
+«END» «END» «STARTEND» 3
+«START» «START» «STARTEND» 3
+. . $. 3
+alex alex NE 3
+ich ich PPER 3
+bin sein VAFIN 2
+heiße heißen VAFIN 1
diff --git a/src/test/resources/simple_2gram_padded_ignore-empty-texts.freq b/src/test/resources/simple_2gram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..1a6fb0d
--- /dev/null
+++ b/src/test/resources/simple_2gram_padded_ignore-empty-texts.freq
@@ -0,0 +1,13 @@
+«END» «END» 3
+«START» «START» 3
+. «END» 3
+alex . 2
+«START» ich 2
+alex bin 1
+bin alex 1
+bin ich 1
+heiße alex 1
+ich . 1
+ich bin 1
+ich heiße 1
+«START» alex 1
diff --git a/src/test/resources/simple_2lpgram_padded_ignore-empty-texts.freq b/src/test/resources/simple_2lpgram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..98544ce
--- /dev/null
+++ b/src/test/resources/simple_2lpgram_padded_ignore-empty-texts.freq
@@ -0,0 +1,13 @@
+«END» «END» «STARTEND» «END» «END» «STARTEND» 3
+«START» «START» «STARTEND» «START» «START» «STARTEND» 3
+. . $. «END» «END» «STARTEND» 3
+alex alex NE . . $. 2
+«START» «START» «STARTEND» ich ich PPER 2
+alex alex NE bin sein VAFIN 1
+bin sein VAFIN alex alex NE 1
+bin sein VAFIN ich ich PPER 1
+heiße heißen VAFIN alex alex NE 1
+ich ich PPER . . $. 1
+ich ich PPER bin sein VAFIN 1
+ich ich PPER heiße heißen VAFIN 1
+«START» «START» «STARTEND» alex alex NE 1
diff --git a/src/test/resources/simple_3gram_padded_ignore-empty-texts.freq b/src/test/resources/simple_3gram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..431eafa
--- /dev/null
+++ b/src/test/resources/simple_3gram_padded_ignore-empty-texts.freq
@@ -0,0 +1,16 @@
+«END» «END» «END» 3
+«START» «START» «START» 3
+. «END» «END» 3
+alex . «END» 2
+«START» «START» ich 2
+alex bin ich 1
+bin alex . 1
+bin ich . 1
+heiße alex . 1
+ich . «END» 1
+ich bin alex 1
+ich heiße alex 1
+«START» alex bin 1
+«START» ich bin 1
+«START» ich heiße 1
+«START» «START» alex 1
diff --git a/src/test/resources/simple_3lpgram_padded_ignore-empty-texts.freq b/src/test/resources/simple_3lpgram_padded_ignore-empty-texts.freq
new file mode 100644
index 0000000..8e13026
--- /dev/null
+++ b/src/test/resources/simple_3lpgram_padded_ignore-empty-texts.freq
@@ -0,0 +1,16 @@
+«END» «END» «STARTEND» «END» «END» «STARTEND» «END» «END» «STARTEND» 3
+«START» «START» «STARTEND» «START» «START» «STARTEND» «START» «START» «STARTEND» 3
+. . $. «END» «END» «STARTEND» «END» «END» «STARTEND» 3
+alex alex NE . . $. «END» «END» «STARTEND» 2
+«START» «START» «STARTEND» «START» «START» «STARTEND» ich ich PPER 2
+alex alex NE bin sein VAFIN ich ich PPER 1
+bin sein VAFIN alex alex NE . . $. 1
+bin sein VAFIN ich ich PPER . . $. 1
+heiße heißen VAFIN alex alex NE . . $. 1
+ich ich PPER . . $. «END» «END» «STARTEND» 1
+ich ich PPER bin sein VAFIN alex alex NE 1
+ich ich PPER heiße heißen VAFIN alex alex NE 1
+«START» «START» «STARTEND» alex alex NE bin sein VAFIN 1
+«START» «START» «STARTEND» ich ich PPER bin sein VAFIN 1
+«START» «START» «STARTEND» ich ich PPER heiße heißen VAFIN 1
+«START» «START» «STARTEND» «START» «START» «STARTEND» alex alex NE 1