Add --downcase/-d option to convert all token characters to lower case
Change-Id: I236cf61369faead4b9e4d955b7190b25f88d4a46
diff --git a/src/main/java/org/ids_mannheim/TotalNGrams.java b/src/main/java/org/ids_mannheim/TotalNGrams.java
index 408b0a4..9d868c0 100644
--- a/src/main/java/org/ids_mannheim/TotalNGrams.java
+++ b/src/main/java/org/ids_mannheim/TotalNGrams.java
@@ -44,6 +44,10 @@
String output_fillename = "-";
@SuppressWarnings("CanBeFinal")
+ @CommandLine.Option(names = {"-d", "--downcase"}, description = "Convert all token characters into lower case (default: ${DEFAULT-VALUE})")
+ boolean downcase_tokens = false;
+
+ @SuppressWarnings("CanBeFinal")
@CommandLine.Option(names = {"--force"}, description = "Force overwrite (default: ${DEFAULT-VALUE})")
boolean force_overwrite = false;
@@ -151,7 +155,7 @@
int threads = Math.min(max_threads, inputFiles.size());
logger.info("Processing fold " + fold + "/" + FOLDS);
logger.info("Using " + threads + " threads");
- IntStream.range(0, threads).forEach(unused -> es.execute(new Worker(queue, inputFiles, ngram_size, fold, FOLDS, map, with_lemma_and_pos, workerNodePool, etaPrinter, logger)));
+ IntStream.range(0, threads).forEach(unused -> es.execute(new Worker(queue, inputFiles, ngram_size, fold, FOLDS, map, with_lemma_and_pos, downcase_tokens, workerNodePool, etaPrinter, logger)));
queue.addAll(IntStream.range(0, inputFiles.size()).boxed().collect(Collectors.toList()));
IntStream.range(0, threads).forEach(unused -> {
try {
diff --git a/src/main/java/org/ids_mannheim/Worker.java b/src/main/java/org/ids_mannheim/Worker.java
index f342eef..b468a65 100644
--- a/src/main/java/org/ids_mannheim/Worker.java
+++ b/src/main/java/org/ids_mannheim/Worker.java
@@ -4,6 +4,7 @@
import java.io.*;
import java.util.ArrayList;
+import java.util.Locale;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
@@ -26,11 +27,13 @@
private final Logger logger;
private final WorkerNodePool pool;
private final boolean with_lemma_and_pos;
+ private final boolean downcase_tokens;
+
private final DeterministicRandomProvider deterministicRandomProvider;
public Worker(BlockingQueue<Integer> queue, ArrayList<String> fnames, int ngram_size, int target_fold, int folds,
ConcurrentHashMap<String, AtomicInteger> map,
- boolean with_lemma_and_pos, WorkerNodePool pool,
+ boolean with_lemma_and_pos, boolean downcase_tokens, WorkerNodePool pool,
Progressbar etaPrinter, Logger logger) {
this.queue = queue;
this.fnames = fnames;
@@ -42,6 +45,7 @@
this.etaPrinter = etaPrinter;
this.logger = logger;
this.deterministicRandomProvider = new DeterministicRandomProvider(folds);
+ this.downcase_tokens = downcase_tokens;
}
@Override
@@ -109,7 +113,9 @@
continue;
}
assert strings.length == 10 : "CoNLL-U Format must have 10 columns";
- String token = Utils.unEscapeEntities(strings[1]);
+ String token = ( downcase_tokens?
+ Utils.unEscapeEntities(strings[1]).toLowerCase(Locale.ROOT) :
+ Utils.unEscapeEntities(strings[1]));
if (with_lemma_and_pos) {
String lemma, pos;
diff --git a/src/test/java/org/ids_mannheim/WorkerTest.java b/src/test/java/org/ids_mannheim/WorkerTest.java
index a80b6d6..80474ae 100644
--- a/src/test/java/org/ids_mannheim/WorkerTest.java
+++ b/src/test/java/org/ids_mannheim/WorkerTest.java
@@ -63,6 +63,7 @@
10,
map,
true,
+ false,
new WorkerNodePool(""),
new Progressbar(tempFile.length()),
Logger.getLogger(TotalNGrams.class.getSimpleName()));
@@ -70,12 +71,70 @@
queue.add(0);
queue.add(-1);
worker.run();
- gold.forEach((key, value) -> assertEquals(value, map.get(key).intValue()));
+ gold.forEach((key, value) -> {
+ AtomicInteger observed = map.get(key);
+ assertNotNull(observed);
+ if (observed != null) {
+ assertEquals(value, observed.intValue());
+ }
+ });
assertTrue(errContent.toString().contains("with 1 text"));
assertTrue(errContent.toString().contains("100%"));
}
@Test
+ void downcasedResultAndOutputAreCorrect() throws IOException {
+ Map<String, Integer> gold = Map.of(
+ "und und KON fluchen Fluchen NN", 1,
+ "bestreben Bestreben NN , , $,", 1,
+ "bürger Bürger NN sich sich PRF", 1,
+ "dieses dies PDAT würdigen würdig ADJA", 1,
+ "im in APPRART kriegshandwerk Kriegshandwerk NN", 1,
+ "man man PIS nur nur ADV", 1,
+ "von von APPR longwy -- NE", 3,
+ "nicht nicht PTKNEG ungeschickt ungeschickt ADJD", 1,
+ "republikaner Republikaner NN und und KON", 1,
+ "patriotismus Patriotismus NN derer die PDS", 1
+ );
+
+ File tempFile = File.createTempFile("goe_sample", ".conllu.gz");
+ tempFile.deleteOnExit();
+ try (FileOutputStream out = new FileOutputStream(tempFile)) {
+ IOUtils.copy(Thread.currentThread().getContextClassLoader()
+ .getResourceAsStream("goe_sample.conllu.gz"), out);
+ }
+ ArrayList<String> fnames = new ArrayList<>();
+ fnames.add(tempFile.getAbsolutePath());
+ map = new ConcurrentHashMap<>();
+ LinkedBlockingQueue<Integer> queue = new LinkedBlockingQueue<>(2);
+ worker = new Worker(
+ queue,
+ fnames,
+ 2,
+ 8,
+ 10,
+ map,
+ true,
+ true,
+ new WorkerNodePool(""),
+ new Progressbar(tempFile.length()),
+ Logger.getLogger(TotalNGrams.class.getSimpleName()));
+
+ queue.add(0);
+ queue.add(-1);
+ worker.run();
+ gold.forEach((key, value) -> {
+ // Every expected (gold) key must be present in the result map with exactly the expected count.
+ AtomicInteger observed = map.get(key);
+ assertNotNull(observed);
+ if (observed != null) {
+ assertEquals(value, observed.intValue());
+ }
+ });
+ assertTrue(errContent.toString().contains("100%"));
+ }
+
+ @Test
void resultAndOutputAreCorrectForEntities() throws IOException {
Map<String, Integer> gold = Map.of(
"\"\t\"\t$(", 1,
@@ -103,6 +162,7 @@
10,
map,
true,
+ false,
new WorkerNodePool(""),
new Progressbar(tempFile.length()),
Logger.getLogger(TotalNGrams.class.getSimpleName()));
diff --git a/src/test/resources/entities.conllu b/src/test/resources/entities.conllu
new file mode 100644
index 0000000..fe4731d
--- /dev/null
+++ b/src/test/resources/entities.conllu
@@ -0,0 +1,15 @@
+# foundry = tree_tagger
+# filename = A00/JAN/00001/tree_tagger/morpho.xml
+# text_id = A00_JAN.00001
+# start_offsets = 0 0 3 7 14 18 28 31 38 45 51 63
+# end_offsets = 64 2 6 13 17 27 30 37 44 50 63 64
+1 In in APPR APPR _ _ _ _ 1.000000
+2 den die ART ART _ _ _ _ 0.999974
+3 Farben Farbe NN NN _ _ _ _ 1.000000
+4 der die ART ART _ _ _ _ 0.999973
+5 Dämmerung Dämmerung NN NN _ _ _ _ 1.000000
+6 ' -- NN NN _ _ _ _ 1.0
+7 < -- NN NN _ _ _ _ 1.0
+8 > -- NN NN _ _ _ _ 1.0
+9 & -- NN NN _ _ _ _ 1.0
+10 " -- NN NN _ _ _ _ 1.0