totalngrams: add unit test for almost the whole pipeline
diff --git a/src/test/java/org/ids_mannheim/WorkerTest.java b/src/test/java/org/ids_mannheim/WorkerTest.java
new file mode 100644
index 0000000..8f3020f
--- /dev/null
+++ b/src/test/java/org/ids_mannheim/WorkerTest.java
@@ -0,0 +1,103 @@
+package org.ids_mannheim;
+
+import org.apache.commons.io.IOUtils;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.logging.Logger;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Feeds the bundled {@code goe_sample.conllu.gz} resource through a {@link Worker} and checks
+ * a selection of the resulting counts together with what was written to {@code System.err}.
+ */
+class WorkerTest {
+    private final ByteArrayOutputStream errContent = new ByteArrayOutputStream();
+    private final PrintStream originalErr = System.err;
+    Worker worker;
+    ConcurrentHashMap<String, AtomicInteger> map;
+    /** Expected counts for a selection of keys in {@link #map} after the worker has run. */
+    Map<String, Integer> gold = Map.of(
+            "und und KON Fluchen Fluchen NN", 1,
+            "Bestreben Bestreben NN , , $,", 1,
+            "Bürger Bürger NN sich sich PRF", 1,
+            "dieses dies PDAT würdigen würdig ADJA", 1,
+            "im in APPRART Kriegshandwerk Kriegshandwerk NN", 1,
+            "man man PIS nur nur ADV", 1,
+            "von von APPR Longwy -- NE", 3,
+            "nicht nicht PTKNEG ungeschickt ungeschickt ADJD", 1,
+            "Republikaner Republikaner NN und und KON", 1,
+            "Patriotismus Patriotismus NN derer die PDS", 1
+    );
+
+    /** Captures {@code System.err} in {@link #errContent} so the test can inspect it. */
+    @BeforeEach
+    public void setUpStreams() {
+        System.setErr(new PrintStream(errContent));
+    }
+
+    /** Reinstates the {@code System.err} that was active when this instance was created. */
+    @AfterEach
+    public void restoreStreams() {
+        System.setErr(originalErr);
+    }
+
+    /**
+     * Copies the sample to a temporary file, runs one {@link Worker} over it on the calling
+     * thread, and verifies the {@link #gold} counts and the captured stderr text.
+     *
+     * @throws IOException if the temporary file cannot be created or written
+     */
+    @Test
+    void resultAndOutputAreCorrect() throws IOException {
+        File tempFile = File.createTempFile("goe_sample", ".conllu.gz");
+        tempFile.deleteOnExit();
+        // Close the resource stream as well as the file; a null resource is skipped on close.
+        try (InputStream in = Thread.currentThread().getContextClassLoader()
+                .getResourceAsStream("goe_sample.conllu.gz");
+             FileOutputStream out = new FileOutputStream(tempFile)) {
+            assertTrue(in != null, "goe_sample.conllu.gz is missing from the test classpath");
+            IOUtils.copy(in, out);
+        }
+        ArrayList<String> fnames = new ArrayList<>();
+        fnames.add(tempFile.getAbsolutePath());
+        map = new ConcurrentHashMap<>();
+        LinkedBlockingQueue<Integer> queue = new LinkedBlockingQueue<>(2);
+        worker = new Worker(
+                queue,
+                fnames,
+                2,
+                8,
+                10,
+                map,
+                true,
+                new WorkerNodePool(""),
+                new Progressbar(tempFile.length()),
+                Logger.getLogger(TotalNGram.class.getSimpleName()));
+
+        // Both queue entries are added before run() is invoked directly on the test thread.
+        // NOTE(review): 0 looks like an index into fnames and -1 like a stop marker - confirm
+        // against Worker.run().
+        queue.add(0);
+        queue.add(-1);
+        worker.run();
+        gold.forEach((key, value) -> {
+            AtomicInteger count = map.get(key);
+            // Fail with the offending key instead of a bare NullPointerException.
+            assertTrue(count != null, "no count recorded for key: " + key);
+            assertEquals(value.intValue(), count.get(), "wrong count for key: " + key);
+        });
+        String stderr = errContent.toString();
+        assertTrue(stderr.contains("with 1 text"), "stderr was: " + stderr);
+        assertTrue(stderr.contains("100%"), "stderr was: " + stderr);
+    }
+}
diff --git a/src/test/resources/goe_sample.conllu.gz b/src/test/resources/goe_sample.conllu.gz
new file mode 100644
index 0000000..252904a
--- /dev/null
+++ b/src/test/resources/goe_sample.conllu.gz
Binary files differ