Add --pad option to add padding symbols at text edges

Example: with --pad, counting 3-grams in the input
# text_id = TST_TST.00000
1 ich ich PPER PPER _ _ _ _ 1
2 bin sein VAFIN VAFIN _ _ _ _ 1.000000
3 alex alex NE NE _ _ _ _ 0.565630
4 . . $. $. _ _ _ _ 1.000000
# text_id = TST_TST.00001
1 alex alex NE NE _ _ _ _ 0.565630
2 bin sein VAFIN VAFIN _ _ _ _ 1.000000
3 ich ich PPER PPER _ _ _ _ 1
4 . . $. $. _ _ _ _ 1.000000
# text_id = TST_TST.00002
1 ich ich PPER PPER _ _ _ _ 1
2 heiße heißen VAFIN VAFIN _ _ _ _ 1.000000
3 alex alex NE NE _ _ _ _ 0.565630
4 . . $. $. _ _ _ _ 1.000000
----> 3-gram frequency list with --pad:
. «END» «END» 3
«END» «END» «END» 3
«START» «START» «START» 3
«START» «START» ich 2
alex . «END» 2
«START» «START» alex 1
«START» alex bin 1
«START» ich bin 1
«START» ich heiße 1
alex bin ich 1
bin alex . 1
bin ich . 1
heiße alex . 1
ich . «END» 1
ich bin alex 1
ich heiße alex 1
Change-Id: Ib4826400da657787940805c616a0000ac089120d
diff --git a/src/main/java/org/ids_mannheim/PaddedSlidingWindowQueue.java b/src/main/java/org/ids_mannheim/PaddedSlidingWindowQueue.java
new file mode 100644
index 0000000..7c141bd
--- /dev/null
+++ b/src/main/java/org/ids_mannheim/PaddedSlidingWindowQueue.java
@@ -0,0 +1,84 @@
+package org.ids_mannheim;
+
+import static org.ids_mannheim.TotalNGrams.logger;
+
+/**
+ * A {@link SlidingWindowQueue} that pads texts with start and end symbols.
+ *
+ * <p>{@link #reset(int)} pre-fills the window with {@code maxSize} start entries and
+ * {@link #textBreak()} appends {@code maxSize} end entries, so that n-grams at text
+ * edges are counted together with their padding.
+ */
+public class PaddedSlidingWindowQueue extends SlidingWindowQueue {
+    public static final String TEXT_END_SYMBOL = "«END»";
+    public static final String TEXT_START_SYMBOL = "«START»";
+    public static final String TEXT_STARTEND_POS = "«STARTEND»";
+
+    // Per-instance (not static): the entry shape depends on this queue's with_lemma_and_pos,
+    // so queues created with different settings must not overwrite each other's values.
+    private final String textStartCombo;
+    private final String textEndCombo;
+
+    /** True if entries were added since the last end padding was written. */
+    private boolean needsFlush = false;
+
+    /**
+     * Creates a padding queue.
+     *
+     * @param size               window size, passed on to {@link SlidingWindowQueue}
+     * @param flush              callback, passed on to {@link SlidingWindowQueue}
+     * @param with_lemma_and_pos if true, padding entries are tab-separated triples
+     *                           (symbol, symbol, {@link #TEXT_STARTEND_POS}); otherwise
+     *                           the bare symbol is used
+     */
+    public PaddedSlidingWindowQueue(int size, Increaser flush, boolean with_lemma_and_pos) {
+        super(size, flush, with_lemma_and_pos);
+        if (with_lemma_and_pos) {
+            textStartCombo = TEXT_START_SYMBOL + '\t' + TEXT_START_SYMBOL + '\t' + TEXT_STARTEND_POS;
+            textEndCombo = TEXT_END_SYMBOL + '\t' + TEXT_END_SYMBOL + '\t' + TEXT_STARTEND_POS;
+        } else {
+            textStartCombo = TEXT_START_SYMBOL;
+            textEndCombo = TEXT_END_SYMBOL;
+        }
+        logger.info("Using padding.");
+    }
+
+    /**
+     * Clears the queue, adds {@code maxSize} start entries and then stores {@code fold}.
+     *
+     * @param fold fold number to store in {@link #fold}
+     */
+    @Override
+    public void reset(int fold) {
+        this.clear();
+        for (int i = 0; i < maxSize; i++) {
+            add(textStartCombo);
+        }
+        this.fold = fold;
+    }
+
+    /** Adds {@code k} via the superclass and records that end padding is pending. */
+    @Override
+    public void add(String k) {
+        super.add(k);
+        needsFlush = true;
+    }
+
+    /**
+     * Appends {@code maxSize} end entries if anything was added since the last call.
+     *
+     * <p>The pending flag is cleared afterwards, so consecutive calls without
+     * intervening additions do not emit end padding twice.
+     */
+    @Override
+    public void textBreak() {
+        if (!needsFlush) {
+            return;
+        }
+        for (int i = 0; i < maxSize; i++) {
+            add(textEndCombo);
+        }
+        // add() sets the flag again, so clear it only after the padding is written.
+        needsFlush = false;
+    }
+}
diff --git a/src/main/java/org/ids_mannheim/SlidingWindowQueue.java b/src/main/java/org/ids_mannheim/SlidingWindowQueue.java
index 93acd58..32e0f60 100644
--- a/src/main/java/org/ids_mannheim/SlidingWindowQueue.java
+++ b/src/main/java/org/ids_mannheim/SlidingWindowQueue.java
@@ -3,17 +3,15 @@
import java.util.LinkedList;
public class SlidingWindowQueue extends LinkedList {
- private final int maxSize;
- private final Increaser flush;
+ final int maxSize;
+ final Increaser flush;
+ final boolean with_lemma_and_pos;
public int fold;
- interface Increaser {
- void accept(String s);
- }
-
- public SlidingWindowQueue(int size, Increaser flush) {
+ public SlidingWindowQueue(int size, Increaser flush, boolean with_lemma_and_pos) {
this.maxSize = size;
this.flush = flush;
+ this.with_lemma_and_pos = with_lemma_and_pos;
}
public void add(String k) {
@@ -28,4 +26,11 @@
this.clear();
this.fold = fold;
}
+
+ public void textBreak() { // Text-boundary hook; does nothing in this class, subclasses may override it.
+ }
+
+ interface Increaser { // Callback type of the flush field; accept is handed a String.
+ void accept(String s);
+ }
}
diff --git a/src/main/java/org/ids_mannheim/TotalNGrams.java b/src/main/java/org/ids_mannheim/TotalNGrams.java
index 86932d4..ee1b665 100644
--- a/src/main/java/org/ids_mannheim/TotalNGrams.java
+++ b/src/main/java/org/ids_mannheim/TotalNGrams.java
@@ -92,6 +92,11 @@
"--numeric-secondary-sort"}, description = "Sort entries with same frequency numerically (default: ${DEFAULT-VALUE})")
boolean numericSecondarySort = false;
+ @SuppressWarnings("CanBeFinal")
+ @CommandLine.Option(names = {
+ "--pad"}, description = "Add padding " + PaddedSlidingWindowQueue.TEXT_START_SYMBOL + " and " + PaddedSlidingWindowQueue.TEXT_END_SYMBOL + " symbols at text edges (default: ${DEFAULT-VALUE})")
+ boolean addPadding = false;
+
private Progressbar etaPrinter;
public TotalNGrams() {
@@ -164,7 +169,7 @@
int threads = Math.min(max_threads, inputFiles.size());
logger.info("Processing fold " + fold + "/" + FOLDS);
logger.info("Using " + threads + " threads");
- IntStream.range(0, threads).forEach(unused -> es.execute(new Worker(queue, inputFiles, ngram_size, fold, FOLDS, map, with_lemma_and_pos, downcase_tokens, workerNodePool, etaPrinter, logger)));
+ IntStream.range(0, threads).forEach(unused -> es.execute(new Worker(queue, inputFiles, ngram_size, fold, FOLDS, map, with_lemma_and_pos, downcase_tokens, workerNodePool, etaPrinter, logger, addPadding)));
queue.addAll(IntStream.range(0, inputFiles.size()).boxed().collect(Collectors.toList()));
IntStream.range(0, threads).forEach(unused -> {
try {
diff --git a/src/main/java/org/ids_mannheim/Worker.java b/src/main/java/org/ids_mannheim/Worker.java
index ca7f27c..2f6b32b 100644
--- a/src/main/java/org/ids_mannheim/Worker.java
+++ b/src/main/java/org/ids_mannheim/Worker.java
@@ -29,13 +29,14 @@
private final WorkerNodePool pool;
private final boolean with_lemma_and_pos;
private final boolean downcase_tokens;
+ private final boolean addPadding;
private final DeterministicRandomProvider deterministicRandomProvider;
public Worker(BlockingQueue<Integer> queue, ArrayList<String> fnames, int ngram_size, int target_fold, int folds,
ConcurrentHashMap<String, AtomicInteger> map,
boolean with_lemma_and_pos, boolean downcase_tokens, WorkerNodePool pool,
- Progressbar etaPrinter, Logger logger) {
+ Progressbar etaPrinter, Logger logger, boolean addPadding) {
this.queue = queue;
this.fnames = fnames;
this.map = map;
@@ -47,6 +48,7 @@
this.logger = logger;
this.deterministicRandomProvider = new DeterministicRandomProvider(folds);
this.downcase_tokens = downcase_tokens;
+ this.addPadding = addPadding;
}
@Override
@@ -55,7 +57,8 @@
int index = queue.take();
int retries = MAX_RETRIES;
int texts = 0;
- SlidingWindowQueue slidingWindowQueue = new SlidingWindowQueue(ngram_size, s -> FoldedEntry.incr(map, s));
+ SlidingWindowQueue slidingWindowQueue = (addPadding ? new PaddedSlidingWindowQueue(ngram_size, s -> FoldedEntry.incr(map, s), with_lemma_and_pos) :
+ new SlidingWindowQueue(ngram_size, s -> FoldedEntry.incr(map, s), with_lemma_and_pos));
while (index >= 0) {
String fname = fnames.get(index);
File current_file = new File(fname);
@@ -113,6 +116,7 @@
if (line.startsWith("#")) {
Matcher matcher = new_text_pattern.matcher(line);
if (matcher.find()) {
+ slidingWindowQueue.textBreak();
fold = deterministicRandomProvider.getFoldFromTextID(matcher.group(1)) + 1;
texts++;
if (fold == target_fold) {
@@ -124,7 +128,7 @@
if (strings.length < 4) {
continue;
}
- assert strings.length == 10 : "CoNLL-U Format must have 10 columns";
+ assert strings.length == 10 : "Error: CoNLL-U Format must have 10 columns in line \""+line+"\"";
String token = ( downcase_tokens?
Utils.unEscapeEntities(strings[1]).toLowerCase(Locale.ROOT) :
Utils.unEscapeEntities(strings[1]));
@@ -156,6 +160,7 @@
}
}
}
+ slidingWindowQueue.textBreak();
pool.markFree(poolIndex);
if (texts > 0) {
logger.info(pool.getHost(poolIndex) + " finished " + fname + " with " + texts + " texts");
diff --git a/src/test/java/org/ids_mannheim/WorkerTest.java b/src/test/java/org/ids_mannheim/WorkerTest.java
index c4d8758..5e0623f 100644
--- a/src/test/java/org/ids_mannheim/WorkerTest.java
+++ b/src/test/java/org/ids_mannheim/WorkerTest.java
@@ -6,6 +6,9 @@
import org.junit.jupiter.api.Test;
import java.io.*;
+import java.nio.file.FileSystems;
+import java.nio.file.Files;
+import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Map;
import java.util.Objects;
@@ -13,10 +16,12 @@
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
+import java.util.stream.Collectors;
import static org.junit.jupiter.api.Assertions.*;
class WorkerTest {
+ public static final String splitFreqlistRegex = "\\t(?=[0-9]+$)";
private ByteArrayOutputStream errContent;
private final PrintStream originalErr = System.err;
Worker worker;
@@ -68,7 +73,7 @@
false,
new WorkerNodePool(""),
new Progressbar(tempFile.length()),
- Logger.getLogger(TotalNGrams.class.getSimpleName()));
+ Logger.getLogger(TotalNGrams.class.getSimpleName()), false);
queue.add(0);
queue.add(-1);
@@ -118,7 +123,8 @@
true,
new WorkerNodePool(""),
new Progressbar(tempFile.length()),
- Logger.getLogger(TotalNGrams.class.getSimpleName()));
+ Logger.getLogger(TotalNGrams.class.getSimpleName()),
+ false);
queue.add(0);
queue.add(-1);
@@ -163,7 +169,8 @@
false,
new WorkerNodePool(""),
new Progressbar(tempFile.length()),
- Logger.getLogger(TotalNGrams.class.getSimpleName()));
+ Logger.getLogger(TotalNGrams.class.getSimpleName()),
+ false);
queue.add(0);
queue.add(-1);
@@ -208,7 +215,8 @@
false,
new WorkerNodePool(""),
new Progressbar(tempFile.length()),
- Logger.getLogger(TotalNGrams.class.getSimpleName()));
+ Logger.getLogger(TotalNGrams.class.getSimpleName()),
+ false);
queue.add(0);
queue.add(-1);
@@ -216,4 +224,53 @@
gold.forEach((key, value) -> assertEquals(value, map.get(key).intValue()));
}
+    @Test
+    void paddingWorks() throws IOException {
+        File tempFile = File.createTempFile("simple", ".conllu");
+        tempFile.deleteOnExit();
+        try (FileOutputStream out = new FileOutputStream(tempFile)) {
+            IOUtils.copy(Objects.requireNonNull(Thread.currentThread().getContextClassLoader()
+                    .getResourceAsStream("simple.conllu")), out);
+        }
+        // Compare padded 1- to 3-gram counts, without and with lemma and POS, against the gold lists.
+        for (boolean with_lemma_and_pos : new boolean[]{false, true}) {
+            for (int n = 1; n <= 3; n++) {
+                ArrayList<String> fnames = new ArrayList<>();
+                fnames.add(tempFile.getAbsolutePath());
+
+                File tempFreqFile = File.createTempFile("simple", ".freq");
+                tempFreqFile.deleteOnExit();
+                try (FileOutputStream out = new FileOutputStream(tempFreqFile)) {
+                    IOUtils.copy(Objects.requireNonNull(Thread.currentThread().getContextClassLoader()
+                            .getResourceAsStream("simple_" + n + (with_lemma_and_pos ? "lp" : "") + "gram_padded.freq")), out);
+                }
+                // String.matches() would need the whole line to match the separator pattern and so
+                // reject every entry; keep the lines that split into exactly an n-gram and a count.
+                Map<String, Integer> gold = Files.readAllLines(tempFreqFile.toPath()).stream()
+                        .filter(s -> s.split(splitFreqlistRegex).length == 2)
+                        .collect(Collectors.toMap(k -> k.split(splitFreqlistRegex)[0], v -> Integer.parseInt(v.split(splitFreqlistRegex)[1])));
+                assertFalse(gold.isEmpty(), "no gold entries parsed from " + tempFreqFile);
+                map = new ConcurrentHashMap<>();
+                LinkedBlockingQueue<Integer> queue = new LinkedBlockingQueue<>(2);
+                worker = new Worker(
+                        queue,
+                        fnames,
+                        n,
+                        1,
+                        1,
+                        map,
+                        with_lemma_and_pos,
+                        false,
+                        new WorkerNodePool(""),
+                        new Progressbar(tempFile.length()),
+                        Logger.getLogger(TotalNGrams.class.getSimpleName()),
+                        true);
+
+                queue.add(0);
+                queue.add(-1);
+                worker.run();
+                gold.forEach((key, value) -> assertEquals(value, map.get(key).intValue()));
+            }
+        }
+    }
}
diff --git a/src/test/resources/simple.conllu b/src/test/resources/simple.conllu
new file mode 100644
index 0000000..2e52539
--- /dev/null
+++ b/src/test/resources/simple.conllu
@@ -0,0 +1,17 @@
+# text_id = TST_TST.00000
+1 ich ich PPER PPER _ _ _ _ 1
+2 bin sein VAFIN VAFIN _ _ _ _ 1.000000
+3 alex alex NE NE _ _ _ _ 0.565630
+4 . . $. $. _ _ _ _ 1.000000
+
+# text_id = TST_TST.00001
+1 alex alex NE NE _ _ _ _ 0.565630
+2 bin sein VAFIN VAFIN _ _ _ _ 1.000000
+3 ich ich PPER PPER _ _ _ _ 1
+4 . . $. $. _ _ _ _ 1.000000
+
+# text_id = TST_TST.00002
+1 ich ich PPER PPER _ _ _ _ 1
+2 heiße heißen VAFIN VAFIN _ _ _ _ 1.000000
+3 alex alex NE NE _ _ _ _ 0.565630
+4 . . $. $. _ _ _ _ 1.000000
diff --git a/src/test/resources/simple_1gram_padded.freq b/src/test/resources/simple_1gram_padded.freq
new file mode 100644
index 0000000..54522cb
--- /dev/null
+++ b/src/test/resources/simple_1gram_padded.freq
@@ -0,0 +1,7 @@
+. 3
+«END» 3
+«START» 3
+alex 3
+ich 3
+bin 2
+heiße 1
diff --git a/src/test/resources/simple_1lpgram_padded.freq b/src/test/resources/simple_1lpgram_padded.freq
new file mode 100644
index 0000000..ff8c4f7
--- /dev/null
+++ b/src/test/resources/simple_1lpgram_padded.freq
@@ -0,0 +1,7 @@
+. . $. 3
+alex alex NE 3
+ich ich PPER 3
+«END» «END» «STARTEND» 3
+«START» «START» «STARTEND» 3
+bin sein VAFIN 2
+heiße heißen VAFIN 1
diff --git a/src/test/resources/simple_2gram_padded.freq b/src/test/resources/simple_2gram_padded.freq
new file mode 100644
index 0000000..ba04a3a
--- /dev/null
+++ b/src/test/resources/simple_2gram_padded.freq
@@ -0,0 +1,13 @@
+. «END» 3
+«END» «END» 3
+«START» «START» 3
+«START» ich 2
+alex . 2
+«START» alex 1
+alex bin 1
+bin alex 1
+bin ich 1
+heiße alex 1
+ich . 1
+ich bin 1
+ich heiße 1
diff --git a/src/test/resources/simple_2lpgram_padded.freq b/src/test/resources/simple_2lpgram_padded.freq
new file mode 100644
index 0000000..7b1bd84
--- /dev/null
+++ b/src/test/resources/simple_2lpgram_padded.freq
@@ -0,0 +1,13 @@
+. . $. «END» «END» «STARTEND» 3
+«END» «END» «STARTEND» «END» «END» «STARTEND» 3
+«START» «START» «STARTEND» «START» «START» «STARTEND» 3
+alex alex NE . . $. 2
+«START» «START» «STARTEND» ich ich PPER 2
+alex alex NE bin sein VAFIN 1
+bin sein VAFIN alex alex NE 1
+bin sein VAFIN ich ich PPER 1
+heiße heißen VAFIN alex alex NE 1
+ich ich PPER . . $. 1
+ich ich PPER bin sein VAFIN 1
+ich ich PPER heiße heißen VAFIN 1
+«START» «START» «STARTEND» alex alex NE 1
diff --git a/src/test/resources/simple_3gram_padded.freq b/src/test/resources/simple_3gram_padded.freq
new file mode 100644
index 0000000..f54eede
--- /dev/null
+++ b/src/test/resources/simple_3gram_padded.freq
@@ -0,0 +1,16 @@
+. «END» «END» 3
+«END» «END» «END» 3
+«START» «START» «START» 3
+«START» «START» ich 2
+alex . «END» 2
+«START» «START» alex 1
+«START» alex bin 1
+«START» ich bin 1
+«START» ich heiße 1
+alex bin ich 1
+bin alex . 1
+bin ich . 1
+heiße alex . 1
+ich . «END» 1
+ich bin alex 1
+ich heiße alex 1
diff --git a/src/test/resources/simple_3lpgram_padded.freq b/src/test/resources/simple_3lpgram_padded.freq
new file mode 100644
index 0000000..19b899c
--- /dev/null
+++ b/src/test/resources/simple_3lpgram_padded.freq
@@ -0,0 +1,16 @@
+. . $. «END» «END» «STARTEND» «END» «END» «STARTEND» 3
+«END» «END» «STARTEND» «END» «END» «STARTEND» «END» «END» «STARTEND» 3
+«START» «START» «STARTEND» «START» «START» «STARTEND» «START» «START» «STARTEND» 3
+alex alex NE . . $. «END» «END» «STARTEND» 2
+«START» «START» «STARTEND» «START» «START» «STARTEND» ich ich PPER 2
+alex alex NE bin sein VAFIN ich ich PPER 1
+bin sein VAFIN alex alex NE . . $. 1
+bin sein VAFIN ich ich PPER . . $. 1
+heiße heißen VAFIN alex alex NE . . $. 1
+ich ich PPER . . $. «END» «END» «STARTEND» 1
+ich ich PPER bin sein VAFIN alex alex NE 1
+ich ich PPER heiße heißen VAFIN alex alex NE 1
+«START» «START» «STARTEND» alex alex NE bin sein VAFIN 1
+«START» «START» «STARTEND» ich ich PPER bin sein VAFIN 1
+«START» «START» «STARTEND» ich ich PPER heiße heißen VAFIN 1
+«START» «START» «STARTEND» «START» «START» «STARTEND» alex alex NE 1