Add --downcase/-d option to convert all token characters to lower case
Change-Id: I236cf61369faead4b9e4d955b7190b25f88d4a46
diff --git a/src/test/java/org/ids_mannheim/WorkerTest.java b/src/test/java/org/ids_mannheim/WorkerTest.java
index a80b6d6..80474ae 100644
--- a/src/test/java/org/ids_mannheim/WorkerTest.java
+++ b/src/test/java/org/ids_mannheim/WorkerTest.java
@@ -63,6 +63,7 @@
10,
map,
true,
+ false,
new WorkerNodePool(""),
new Progressbar(tempFile.length()),
Logger.getLogger(TotalNGrams.class.getSimpleName()));
@@ -70,12 +71,70 @@
queue.add(0);
queue.add(-1);
worker.run();
- gold.forEach((key, value) -> assertEquals(value, map.get(key).intValue()));
+ gold.forEach((key, value) -> {
+ AtomicInteger observed = map.get(key);
+ assertNotNull(observed);
+ if (observed != null) {
+ assertEquals(value, observed.intValue());
+ }
+ });
assertTrue(errContent.toString().contains("with 1 text"));
assertTrue(errContent.toString().contains("100%"));
}
@Test
+ void downcasedResultAndOutputAreCorrect() throws IOException {
+ Map<String, Integer> gold = Map.of( // expected counts; the first field of each token is lower case, the second keeps its case
+ "und und KON fluchen Fluchen NN", 1,
+ "bestreben Bestreben NN , , $,", 1,
+ "bürger Bürger NN sich sich PRF", 1,
+ "dieses dies PDAT würdigen würdig ADJA", 1,
+ "im in APPRART kriegshandwerk Kriegshandwerk NN", 1,
+ "man man PIS nur nur ADV", 1,
+ "von von APPR longwy -- NE", 3,
+ "nicht nicht PTKNEG ungeschickt ungeschickt ADJD", 1,
+ "republikaner Republikaner NN und und KON", 1,
+ "patriotismus Patriotismus NN derer die PDS", 1
+ );
+ // Copy the gzipped sample from the classpath into a temp file whose path is passed to the Worker.
+ File tempFile = File.createTempFile("goe_sample", ".conllu.gz");
+ tempFile.deleteOnExit();
+ try (FileOutputStream out = new FileOutputStream(tempFile)) {
+ IOUtils.copy(Thread.currentThread().getContextClassLoader()
+ .getResourceAsStream("goe_sample.conllu.gz"), out);
+ }
+ ArrayList<String> fnames = new ArrayList<>();
+ fnames.add(tempFile.getAbsolutePath());
+ map = new ConcurrentHashMap<>();
+ LinkedBlockingQueue<Integer> queue = new LinkedBlockingQueue<>(2);
+ worker = new Worker(
+ queue,
+ fnames,
+ 2,
+ 8,
+ 10,
+ map,
+ true,
+ true, // presumably the new downcase flag (sibling tests pass false in this position) - confirm against Worker
+ new WorkerNodePool(""),
+ new Progressbar(tempFile.length()),
+ Logger.getLogger(TotalNGrams.class.getSimpleName()));
+ // Enqueue 0 and then -1; presumably a file index followed by a stop marker - confirm against Worker.
+ queue.add(0);
+ queue.add(-1);
+ worker.run();
+ gold.forEach((key, value) -> {
+// Each gold key must be present in the map with exactly the expected count.
+ AtomicInteger observed = map.get(key);
+ assertNotNull(observed);
+ if (observed != null) {
+ assertEquals(value, observed.intValue());
+ }
+ });
+ assertTrue(errContent.toString().contains("100%"));
+ }
+
+ @Test
void resultAndOutputAreCorrectForEntities() throws IOException {
Map<String, Integer> gold = Map.of(
"\"\t\"\t$(", 1,
@@ -103,6 +162,7 @@
10,
map,
true,
+ false,
new WorkerNodePool(""),
new Progressbar(tempFile.length()),
Logger.getLogger(TotalNGrams.class.getSimpleName()));
diff --git a/src/test/resources/entities.conllu b/src/test/resources/entities.conllu
new file mode 100644
index 0000000..fe4731d
--- /dev/null
+++ b/src/test/resources/entities.conllu
@@ -0,0 +1,15 @@
+# foundry = tree_tagger
+# filename = A00/JAN/00001/tree_tagger/morpho.xml
+# text_id = A00_JAN.00001
+# start_offsets = 0 0 3 7 14 18 28 31 38 45 51 63
+# end_offsets = 64 2 6 13 17 27 30 37 44 50 63 64
+1 In in APPR APPR _ _ _ _ 1.000000
+2 den die ART ART _ _ _ _ 0.999974
+3 Farben Farbe NN NN _ _ _ _ 1.000000
+4 der die ART ART _ _ _ _ 0.999973
+5 Dämmerung Dämmerung NN NN _ _ _ _ 1.000000
+6 ' -- NN NN _ _ _ _ 1.0
+7 < -- NN NN _ _ _ _ 1.0
+8 > -- NN NN _ _ _ _ 1.0
+9 & -- NN NN _ _ _ _ 1.0
+10 " -- NN NN _ _ _ _ 1.0