Add --downcase/-d option to convert all token characters to lower case

Change-Id: I236cf61369faead4b9e4d955b7190b25f88d4a46
diff --git a/src/main/java/org/ids_mannheim/TotalNGrams.java b/src/main/java/org/ids_mannheim/TotalNGrams.java
index 408b0a4..9d868c0 100644
--- a/src/main/java/org/ids_mannheim/TotalNGrams.java
+++ b/src/main/java/org/ids_mannheim/TotalNGrams.java
@@ -44,6 +44,10 @@
     String output_fillename = "-";
 
     @SuppressWarnings("CanBeFinal")
+    @CommandLine.Option(names = {"-d", "--downcase"}, description = "Convert all token characters into lower case (default: ${DEFAULT-VALUE})")
+    boolean downcase_tokens = false;
+
+    @SuppressWarnings("CanBeFinal")
     @CommandLine.Option(names = {"--force"}, description = "Force overwrite (default: ${DEFAULT-VALUE})")
     boolean force_overwrite = false;
 
@@ -151,7 +155,7 @@
         int threads = Math.min(max_threads, inputFiles.size());
         logger.info("Processing fold " + fold + "/" + FOLDS);
         logger.info("Using " + threads + " threads");
-        IntStream.range(0, threads).forEach(unused -> es.execute(new Worker(queue, inputFiles, ngram_size, fold, FOLDS, map, with_lemma_and_pos, workerNodePool, etaPrinter, logger)));
+        IntStream.range(0, threads).forEach(unused -> es.execute(new Worker(queue, inputFiles, ngram_size, fold, FOLDS, map, with_lemma_and_pos, downcase_tokens, workerNodePool, etaPrinter, logger)));
         queue.addAll(IntStream.range(0, inputFiles.size()).boxed().collect(Collectors.toList()));
         IntStream.range(0, threads).forEach(unused -> {
             try {
diff --git a/src/main/java/org/ids_mannheim/Worker.java b/src/main/java/org/ids_mannheim/Worker.java
index f342eef..b468a65 100644
--- a/src/main/java/org/ids_mannheim/Worker.java
+++ b/src/main/java/org/ids_mannheim/Worker.java
@@ -4,6 +4,7 @@
 
 import java.io.*;
 import java.util.ArrayList;
+import java.util.Locale;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -26,11 +27,13 @@
     private final Logger logger;
     private final WorkerNodePool pool;
     private final boolean with_lemma_and_pos;
+    private final boolean downcase_tokens;
+
     private final DeterministicRandomProvider deterministicRandomProvider;
 
     public Worker(BlockingQueue<Integer> queue, ArrayList<String> fnames, int ngram_size, int target_fold, int folds,
                   ConcurrentHashMap<String, AtomicInteger> map,
-                  boolean with_lemma_and_pos, WorkerNodePool pool,
+                  boolean with_lemma_and_pos, boolean downcase_tokens, WorkerNodePool pool,
                   Progressbar etaPrinter, Logger logger) {
         this.queue = queue;
         this.fnames = fnames;
@@ -42,6 +45,7 @@
         this.etaPrinter = etaPrinter;
         this.logger = logger;
         this.deterministicRandomProvider = new DeterministicRandomProvider(folds);
+        this.downcase_tokens = downcase_tokens;
     }
 
     @Override
@@ -109,7 +113,9 @@
                             continue;
                         }
                         assert strings.length == 10 : "CoNLL-U Format must have 10 columns";
-                        String token = Utils.unEscapeEntities(strings[1]);
+                        String token = ( downcase_tokens?
+                                Utils.unEscapeEntities(strings[1]).toLowerCase(Locale.ROOT) :
+                                Utils.unEscapeEntities(strings[1]));
                         if (with_lemma_and_pos) {
                             String lemma, pos;
 
diff --git a/src/test/java/org/ids_mannheim/WorkerTest.java b/src/test/java/org/ids_mannheim/WorkerTest.java
index a80b6d6..80474ae 100644
--- a/src/test/java/org/ids_mannheim/WorkerTest.java
+++ b/src/test/java/org/ids_mannheim/WorkerTest.java
@@ -63,6 +63,7 @@
                 10,
                 map,
                 true,
+                false,
                 new WorkerNodePool(""),
                 new Progressbar(tempFile.length()),
                 Logger.getLogger(TotalNGrams.class.getSimpleName()));
@@ -70,12 +71,70 @@
         queue.add(0);
         queue.add(-1);
         worker.run();
-        gold.forEach((key, value) -> assertEquals(value, map.get(key).intValue()));
+        gold.forEach((key, value) -> {
+            AtomicInteger observed = map.get(key);
+            assertNotNull(observed);
+            if (observed != null) {
+                assertEquals(value, observed.intValue());
+            }
+        });
         assertTrue(errContent.toString().contains("with 1 text"));
         assertTrue(errContent.toString().contains("100%"));
     }
 
     @Test
+    void downcasedResultAndOutputAreCorrect() throws IOException {
+        Map<String, Integer> gold = Map.of( // expected counts; in each key only the surface token is lower-cased, lemma and POS keep their case
+                "und	und	KON	fluchen	Fluchen	NN", 1,
+                "bestreben	Bestreben	NN	,	,	$,", 1,
+                "bürger	Bürger	NN	sich	sich	PRF", 1,
+                "dieses	dies	PDAT	würdigen	würdig	ADJA", 1,
+                "im	in	APPRART	kriegshandwerk	Kriegshandwerk	NN", 1,
+                "man	man	PIS	nur	nur	ADV", 1,
+                "von	von	APPR	longwy	--	NE", 3,
+                "nicht	nicht	PTKNEG	ungeschickt	ungeschickt	ADJD", 1,
+                "republikaner	Republikaner	NN	und	und	KON", 1,
+                "patriotismus	Patriotismus	NN	derer	die	PDS", 1
+        );
+        // Copy the bundled goe_sample.conllu.gz resource into a temp file so it can be handed to the worker by path.
+        File tempFile = File.createTempFile("goe_sample", ".conllu.gz");
+        tempFile.deleteOnExit();
+        try (FileOutputStream out = new FileOutputStream(tempFile)) {
+            IOUtils.copy(Thread.currentThread().getContextClassLoader()
+                    .getResourceAsStream("goe_sample.conllu.gz"), out);
+        }
+        ArrayList<String> fnames = new ArrayList<>();
+        fnames.add(tempFile.getAbsolutePath());
+        map = new ConcurrentHashMap<>();
+        LinkedBlockingQueue<Integer> queue = new LinkedBlockingQueue<>(2);
+        worker = new Worker(
+                queue,
+                fnames,
+                2, // ngram_size
+                8, // target_fold
+                10, // folds
+                map,
+                true, // with_lemma_and_pos
+                true, // downcase_tokens: the option under test
+                new WorkerNodePool(""),
+                new Progressbar(tempFile.length()),
+                Logger.getLogger(TotalNGrams.class.getSimpleName()));
+        // Queue file index 0, then -1 (presumably the stop marker; confirm against Worker.run), and run the worker synchronously.
+        queue.add(0);
+        queue.add(-1);
+        worker.run();
+        gold.forEach((key, value) -> {
+            // Every gold key must be present in the result map with exactly the expected count.
+            AtomicInteger observed = map.get(key);
+            assertNotNull(observed);
+            if (observed != null) { // NOTE(review): looks unreachable as false once assertNotNull has passed; possibly kept for static analysis
+                assertEquals(value, observed.intValue());
+            }
+        });
+        assertTrue(errContent.toString().contains("100%"));
+    }
+
+    @Test
     void resultAndOutputAreCorrectForEntities() throws IOException {
         Map<String, Integer> gold = Map.of(
                 "\"\t\"\t$(", 1,
@@ -103,6 +162,7 @@
                 10,
                 map,
                 true,
+                false,
                 new WorkerNodePool(""),
                 new Progressbar(tempFile.length()),
                 Logger.getLogger(TotalNGrams.class.getSimpleName()));
diff --git a/src/test/resources/entities.conllu b/src/test/resources/entities.conllu
new file mode 100644
index 0000000..fe4731d
--- /dev/null
+++ b/src/test/resources/entities.conllu
@@ -0,0 +1,15 @@
+# foundry = tree_tagger
+# filename = A00/JAN/00001/tree_tagger/morpho.xml  
+# text_id = A00_JAN.00001
+# start_offsets = 0 0 3 7 14 18 28 31 38 45 51 63
+# end_offsets = 64 2 6 13 17 27 30 37 44 50 63 64
+1	In	in	APPR	APPR	_	_	_	_	1.000000
+2	den	die	ART	ART	_	_	_	_	0.999974
+3	Farben	Farbe	NN	NN	_	_	_	_	1.000000
+4	der	die	ART	ART	_	_	_	_	0.999973
+5	Dämmerung	Dämmerung	NN	NN	_	_	_	_	1.000000
+6	&apos;	--	NN	NN	_	_	_	_	1.0
+7	&lt;	--	NN	NN	_	_	_	_	1.0
+8	&gt;	--	NN	NN	_	_	_	_	1.0
+9	&amp;	--	NN	NN	_	_	_	_	1.0
+10	&quot;	--	NN	NN	_	_	_	_	1.0