Add --pad option to add padding symbols at text edges
1 ich ich PPER PPER _ _ _ _ 1
2 bin sein VAFIN VAFIN _ _ _ _ 1.000000
3 alex alex NE NE _ _ _ _ 0.565630
4 . . $. $. _ _ _ _ 1.000000
# text_id = TST_TST.00001
1 alex alex NE NE _ _ _ _ 0.565630
2 bin sein VAFIN VAFIN _ _ _ _ 1.000000
3 ich ich PPER PPER _ _ _ _ 1
4 . . $. $. _ _ _ _ 1.000000
# text_id = TST_TST.00002
1 ich ich PPER PPER _ _ _ _ 1
2 heiße heißen VAFIN VAFIN _ _ _ _ 1.000000
3 alex alex NE NE _ _ _ _ 0.565630
4 . . $. $. _ _ _ _ 1.000000
---->
. «END» «END» 3
«END» «END» «END» 3
«START» «START» «START» 3
«START» «START» ich 2
alex . «END» 2
«START» «START» alex 1
«START» alex bin 1
«START» ich bin 1
«START» ich heiße 1
alex bin ich 1
bin alex . 1
bin ich . 1
heiße alex . 1
ich . «END» 1
ich bin alex 1
ich heiße alex 1
Change-Id: Ib4826400da657787940805c616a0000ac089120d
diff --git a/src/main/java/org/ids_mannheim/PaddedSlidingWindowQueue.java b/src/main/java/org/ids_mannheim/PaddedSlidingWindowQueue.java
new file mode 100644
index 0000000..7c141bd
--- /dev/null
+++ b/src/main/java/org/ids_mannheim/PaddedSlidingWindowQueue.java
@@ -0,0 +1,47 @@
+package org.ids_mannheim;
+
+
+import static org.ids_mannheim.TotalNGrams.logger;
+
+public class PaddedSlidingWindowQueue extends SlidingWindowQueue {
+ public static final String TEXT_END_SYMBOL = "«END»";
+ public static final String TEXT_START_SYMBOL = "«START»";
+ public static final String TEXT_STARTEND_POS = "«STARTEND»";
+ private static String TEXT_START_COMBO, TEXT_END_COMBO;
+ private boolean needsFlush;
+
+ public PaddedSlidingWindowQueue(int size, Increaser flush, boolean with_lemma_and_pos) {
+ super(size, flush, with_lemma_and_pos);
+ if (with_lemma_and_pos) {
+ TEXT_START_COMBO = TEXT_START_SYMBOL + '\t' + TEXT_START_SYMBOL + '\t' + TEXT_STARTEND_POS;
+ TEXT_END_COMBO = TEXT_END_SYMBOL + '\t' + TEXT_END_SYMBOL + '\t' + TEXT_STARTEND_POS;
+ } else {
+ TEXT_START_COMBO = TEXT_START_SYMBOL;
+ TEXT_END_COMBO = TEXT_END_SYMBOL;
+ }
+ logger.info("Using padding.");
+ needsFlush = false;
+ }
+
+ @Override
+ public void reset(int fold) {
+ this.clear();
+ for (int i = 0; i < maxSize; i++)
+ add(TEXT_START_COMBO);
+ this.fold = fold;
+ }
+
+ @Override
+ public void add(String k) {
+ super.add(k);
+ needsFlush = true;
+ }
+
+ @Override
+ public void textBreak() {
+ if (needsFlush)
+ for (int i = 0; i < maxSize; i++)
+ add(TEXT_END_COMBO);
+ }
+
+}
diff --git a/src/main/java/org/ids_mannheim/SlidingWindowQueue.java b/src/main/java/org/ids_mannheim/SlidingWindowQueue.java
index 93acd58..32e0f60 100644
--- a/src/main/java/org/ids_mannheim/SlidingWindowQueue.java
+++ b/src/main/java/org/ids_mannheim/SlidingWindowQueue.java
@@ -3,17 +3,15 @@
import java.util.LinkedList;
public class SlidingWindowQueue extends LinkedList {
- private final int maxSize;
- private final Increaser flush;
+ final int maxSize;
+ final Increaser flush;
+ final boolean with_lemma_and_pos;
public int fold;
- interface Increaser {
- void accept(String s);
- }
-
- public SlidingWindowQueue(int size, Increaser flush) {
+ public SlidingWindowQueue(int size, Increaser flush, boolean with_lemma_and_pos) {
this.maxSize = size;
this.flush = flush;
+ this.with_lemma_and_pos = with_lemma_and_pos;
}
public void add(String k) {
@@ -28,4 +26,11 @@
this.clear();
this.fold = fold;
}
+
+ public void textBreak() {
+ }
+
+ interface Increaser {
+ void accept(String s);
+ }
}
diff --git a/src/main/java/org/ids_mannheim/TotalNGrams.java b/src/main/java/org/ids_mannheim/TotalNGrams.java
index 86932d4..ee1b665 100644
--- a/src/main/java/org/ids_mannheim/TotalNGrams.java
+++ b/src/main/java/org/ids_mannheim/TotalNGrams.java
@@ -92,6 +92,11 @@
"--numeric-secondary-sort"}, description = "Sort entries with same frequency numerically (default: ${DEFAULT-VALUE})")
boolean numericSecondarySort = false;
+ @SuppressWarnings("CanBeFinal")
+ @CommandLine.Option(names = {
+ "--pad"}, description = "Add padding " + PaddedSlidingWindowQueue.TEXT_START_SYMBOL + " and " + PaddedSlidingWindowQueue.TEXT_END_SYMBOL + " symbols at text edges (default: ${DEFAULT-VALUE})")
+ boolean addPadding = false;
+
private Progressbar etaPrinter;
public TotalNGrams() {
@@ -164,7 +169,7 @@
int threads = Math.min(max_threads, inputFiles.size());
logger.info("Processing fold " + fold + "/" + FOLDS);
logger.info("Using " + threads + " threads");
- IntStream.range(0, threads).forEach(unused -> es.execute(new Worker(queue, inputFiles, ngram_size, fold, FOLDS, map, with_lemma_and_pos, downcase_tokens, workerNodePool, etaPrinter, logger)));
+ IntStream.range(0, threads).forEach(unused -> es.execute(new Worker(queue, inputFiles, ngram_size, fold, FOLDS, map, with_lemma_and_pos, downcase_tokens, workerNodePool, etaPrinter, logger, addPadding)));
queue.addAll(IntStream.range(0, inputFiles.size()).boxed().collect(Collectors.toList()));
IntStream.range(0, threads).forEach(unused -> {
try {
diff --git a/src/main/java/org/ids_mannheim/Worker.java b/src/main/java/org/ids_mannheim/Worker.java
index ca7f27c..2f6b32b 100644
--- a/src/main/java/org/ids_mannheim/Worker.java
+++ b/src/main/java/org/ids_mannheim/Worker.java
@@ -29,13 +29,14 @@
private final WorkerNodePool pool;
private final boolean with_lemma_and_pos;
private final boolean downcase_tokens;
+ private final boolean addPadding;
private final DeterministicRandomProvider deterministicRandomProvider;
public Worker(BlockingQueue<Integer> queue, ArrayList<String> fnames, int ngram_size, int target_fold, int folds,
ConcurrentHashMap<String, AtomicInteger> map,
boolean with_lemma_and_pos, boolean downcase_tokens, WorkerNodePool pool,
- Progressbar etaPrinter, Logger logger) {
+ Progressbar etaPrinter, Logger logger, boolean addPadding) {
this.queue = queue;
this.fnames = fnames;
this.map = map;
@@ -47,6 +48,7 @@
this.logger = logger;
this.deterministicRandomProvider = new DeterministicRandomProvider(folds);
this.downcase_tokens = downcase_tokens;
+ this.addPadding = addPadding;
}
@Override
@@ -55,7 +57,8 @@
int index = queue.take();
int retries = MAX_RETRIES;
int texts = 0;
- SlidingWindowQueue slidingWindowQueue = new SlidingWindowQueue(ngram_size, s -> FoldedEntry.incr(map, s));
+ SlidingWindowQueue slidingWindowQueue = (addPadding ? new PaddedSlidingWindowQueue(ngram_size, s -> FoldedEntry.incr(map, s), with_lemma_and_pos) :
+ new SlidingWindowQueue(ngram_size, s -> FoldedEntry.incr(map, s), with_lemma_and_pos));
while (index >= 0) {
String fname = fnames.get(index);
File current_file = new File(fname);
@@ -113,6 +116,7 @@
if (line.startsWith("#")) {
Matcher matcher = new_text_pattern.matcher(line);
if (matcher.find()) {
+ slidingWindowQueue.textBreak();
fold = deterministicRandomProvider.getFoldFromTextID(matcher.group(1)) + 1;
texts++;
if (fold == target_fold) {
@@ -124,7 +128,7 @@
if (strings.length < 4) {
continue;
}
- assert strings.length == 10 : "CoNLL-U Format must have 10 columns";
+ assert strings.length == 10 : "Error: CoNLL-U Format must have 10 columns in line \""+line+"\"";
String token = ( downcase_tokens?
Utils.unEscapeEntities(strings[1]).toLowerCase(Locale.ROOT) :
Utils.unEscapeEntities(strings[1]));
@@ -156,6 +160,7 @@
}
}
}
+ slidingWindowQueue.textBreak();
pool.markFree(poolIndex);
if (texts > 0) {
logger.info(pool.getHost(poolIndex) + " finished " + fname + " with " + texts + " texts");