totalNGrams: unescape all XML entities (&, <, >, ")
diff --git a/src/main/java/org/ids_mannheim/Utils.java b/src/main/java/org/ids_mannheim/Utils.java
index bde120a..e8413b5 100644
--- a/src/main/java/org/ids_mannheim/Utils.java
+++ b/src/main/java/org/ids_mannheim/Utils.java
@@ -37,5 +37,23 @@
}
return f;
}
+
+ @org.jetbrains.annotations.NotNull
+ public static String unEscapeEntities(String s) {
+ return s.replace("&amp;", "&")
+ .replace("&apos;", "'")
+ .replace("&quot;", "\"")
+ .replace("&lt;", "<")
+ .replace("&gt;", ">");
+ }
+
+ public static String fixEscapedConlluEntities(String s) {
+ return s.replaceAll("^(\\d+)\t&apos;\t--\t[^\t]+", "$1\t'\t\"\t\\$(")
+ .replaceAll("^(\\d+)\t&quot;\t--\t[^\t]+", "$1\t\"\t\"\t\\$(")
+ .replaceAll("^(\\d+)\t&amp;\t--\t[^\t]+", "$1\t&\t&\tKON")
+ .replaceAll("^(\\d+)\t&lt;\t--\t[^\t]+", "$1\t<\t<\t\\$(")
+ .replaceAll("^(\\d+)\t&gt;\t--\t[^\t]+", "$1\t>\t>\t\\$(");
+ }
+
}
diff --git a/src/main/java/org/ids_mannheim/Worker.java b/src/main/java/org/ids_mannheim/Worker.java
index 635934d..3e89faa 100644
--- a/src/main/java/org/ids_mannheim/Worker.java
+++ b/src/main/java/org/ids_mannheim/Worker.java
@@ -93,11 +93,32 @@
continue;
}
assert strings.length == 10 : "CoNLL-U Format must have 10 columns";
+ String token = Utils.unEscapeEntities(strings[1]);
if (with_lemma_and_pos) {
+ String lemma, pos;
+
+ if (token.equals("\"") || token.equals("'")) {
+ lemma = "\"";
+ } else if (token.equals("&")) {
+ lemma = "&";
+ } else if (token.equals("<")) {
+ lemma = "<";
+ } else if (token.equals(">")) {
+ lemma = ">";
+ } else {
+ lemma = strings[2];
+ }
+ if (token.equals("\"") || token.equals("'") || token.equals("<") || token.equals(">")) {
+ pos = "$(";
+ } else if (token.equals("&")) {
+ pos = "KON";
+ } else {
+ pos = strings[3];
+ }
//noinspection ConstantCondition
- slidingWindowQueue.add(join("\t", strings[1], strings[2], strings[3]));
+ slidingWindowQueue.add(join("\t", token, lemma, pos));
} else {
- slidingWindowQueue.add(strings[1]);
+ slidingWindowQueue.add(token);
}
}
}
diff --git a/src/test/java/org/ids_mannheim/UtilsTest.java b/src/test/java/org/ids_mannheim/UtilsTest.java
index a93db3c..65dbca7 100644
--- a/src/test/java/org/ids_mannheim/UtilsTest.java
+++ b/src/test/java/org/ids_mannheim/UtilsTest.java
@@ -20,4 +20,19 @@
assertEquals(53, drp.getFoldFromTextID("RPO05_JUN.00001"));
assertEquals(94, drp.getFoldFromTextID("RPO05_JUN.00002"));
}
+
+ @Test
+ void unEscapeEntitiesWorks() {
+ assertEquals("\'\"&<>\"\'", Utils.unEscapeEntities("&apos;&quot;&amp;&lt;&gt;&quot;&apos;"));
+ }
+
+ @Test
+ void fixEscapedConlluEntitiesWorks() {
+ assertEquals("1\t\'\t\"\t$(\txxx", Utils.fixEscapedConlluEntities("1\t&apos;\t--\tNN\txxx"));
+ assertEquals("22\t\"\t\"\t$(\txxx", Utils.fixEscapedConlluEntities("22\t&quot;\t--\tNN\txxx"));
+ assertEquals("333\t<\t<\t$(\txxx", Utils.fixEscapedConlluEntities("333\t&lt;\t--\tNN\txxx"));
+ assertEquals("4444\t>\t>\t$(\txxx", Utils.fixEscapedConlluEntities("4444\t&gt;\t--\tNN\txxx"));
+ assertEquals("55555\t&\t&\tKON\txxx", Utils.fixEscapedConlluEntities("55555\t&amp;\t--\tNN\txxx"));
+ }
+
}
\ No newline at end of file
diff --git a/src/test/java/org/ids_mannheim/WorkerTest.java b/src/test/java/org/ids_mannheim/WorkerTest.java
index 40bf344..a80b6d6 100644
--- a/src/test/java/org/ids_mannheim/WorkerTest.java
+++ b/src/test/java/org/ids_mannheim/WorkerTest.java
@@ -13,27 +13,13 @@
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.*;
class WorkerTest {
private final ByteArrayOutputStream errContent = new ByteArrayOutputStream();
private final PrintStream originalErr = System.err;
Worker worker;
ConcurrentHashMap<String, AtomicInteger> map;
- Map<String, Integer> gold = Map.of(
- "und und KON Fluchen Fluchen NN", 1,
- "Bestreben Bestreben NN , , $,", 1,
- "Bürger Bürger NN sich sich PRF", 1,
- "dieses dies PDAT würdigen würdig ADJA", 1,
- "im in APPRART Kriegshandwerk Kriegshandwerk NN", 1,
- "man man PIS nur nur ADV", 1,
- "von von APPR Longwy -- NE", 3,
- "nicht nicht PTKNEG ungeschickt ungeschickt ADJD", 1,
- "Republikaner Republikaner NN und und KON", 1,
- "Patriotismus Patriotismus NN derer die PDS", 1
- );
-
@BeforeEach
public void setUpStreams() {
System.setErr(new PrintStream(errContent));
@@ -46,6 +32,19 @@
@Test
void resultAndOutputAreCorrect() throws IOException {
+ Map<String, Integer> gold = Map.of(
+ "und und KON Fluchen Fluchen NN", 1,
+ "Bestreben Bestreben NN , , $,", 1,
+ "Bürger Bürger NN sich sich PRF", 1,
+ "dieses dies PDAT würdigen würdig ADJA", 1,
+ "im in APPRART Kriegshandwerk Kriegshandwerk NN", 1,
+ "man man PIS nur nur ADV", 1,
+ "von von APPR Longwy -- NE", 3,
+ "nicht nicht PTKNEG ungeschickt ungeschickt ADJD", 1,
+ "Republikaner Republikaner NN und und KON", 1,
+ "Patriotismus Patriotismus NN derer die PDS", 1
+ );
+
File tempFile = File.createTempFile("goe_sample", ".conllu.gz");
tempFile.deleteOnExit();
try (FileOutputStream out = new FileOutputStream(tempFile)) {
@@ -75,4 +74,43 @@
assertTrue(errContent.toString().contains("with 1 text"));
assertTrue(errContent.toString().contains("100%"));
}
+
+ @Test
+ void resultAndOutputAreCorrectForEntities() throws IOException {
+ Map<String, Integer> gold = Map.of(
+ "\"\t\"\t$(", 1,
+ "\'\t\"\t$(", 1,
+ "&\t&\tKON", 1,
+ "<\t<\t$(", 1,
+ ">\t>\t$(", 1
+ );
+
+ File tempFile = File.createTempFile("entities", ".conllu");
+ tempFile.deleteOnExit();
+ try (FileOutputStream out = new FileOutputStream(tempFile)) {
+ IOUtils.copy(Thread.currentThread().getContextClassLoader()
+ .getResourceAsStream("entities.conllu"), out);
+ }
+ ArrayList<String> fnames = new ArrayList<>();
+ fnames.add(tempFile.getAbsolutePath());
+ map = new ConcurrentHashMap<>();
+ LinkedBlockingQueue<Integer> queue = new LinkedBlockingQueue<>(2);
+ worker = new Worker(
+ queue,
+ fnames,
+ 1,
+ 5,
+ 10,
+ map,
+ true,
+ new WorkerNodePool(""),
+ new Progressbar(tempFile.length()),
+ Logger.getLogger(TotalNGrams.class.getSimpleName()));
+
+ queue.add(0);
+ queue.add(-1);
+ worker.run();
+ gold.forEach((key, value) -> assertNotNull(map.get(key)));
+ gold.forEach((key, value) -> assertEquals(value, map.get(key).intValue()));
+ }
}