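Mark non-word tokens in the Krill index
Tag tokens rejected by shouldKeepTokenForKrill (punctuation, symbols, etc.) with a "base/p:_" annotation and, when non-word tokens are included, declare the "base/p=tokens" layer in the layer infos.
This is necessary for distance operator compatibility with COSMAS II.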
Change-Id: Ie5b3ffdf9058011859142c917572523b266a0624
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
index d858858..e131a9b 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
@@ -252,6 +252,9 @@
if (textData.sentences != null) {
layerInfos.add("dereko/s=spans")
}
+ if (includeNonWordTokens) {
+ layerInfos.add("base/p=tokens")
+ }
externalSentenceFoundries.forEach { layerInfos.add("$it/s=spans") }
externalConstitFoundries.forEach { layerInfos.add("$it/c=spans") }
@@ -635,6 +638,11 @@
tokenAnnotations.add(jsonString("i:${surfaceForm.lowercase().escapeKrillValue()}"))
}
+ // Mark non-word tokens (punctuation, symbols, etc.) with a base-layer flag
+ if (!shouldKeepTokenForKrill(text, token)) {
+ tokenAnnotations.add(jsonString("base/p:_"))
+ }
+
// Add inverse dependency annotations (<:) for dependents pointing to this token as head
inverseDeps[index]?.sortedBy { "${it.foundry}/${it.deprel}" }?.forEach { inv ->
tokenAnnotations.add(jsonString("<:${inv.foundry}/d:${inv.deprel.escapeKrillValue()}\$<b>32<i>${inv.dependentIndex}"))
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
index cbe9ef5..c859c64 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
@@ -313,6 +313,8 @@
assertTrue(defaultJsons.isNotEmpty())
assertTrue(defaultJsons.all { !it.contains("\"s:,\"") })
assertTrue(defaultJsons.all { !it.contains("\"s:!\"") })
+ assertTrue(defaultJsons.all { !it.contains("\"base/p:_\"") }, "Default output should not have base/p:_ marker")
+ assertTrue(defaultJsons.all { !it.contains("base/p=tokens") }, "Default output should not declare base/p=tokens layer")
val flagTar = ensureKrillTar("wud24_default_corenlp_nwt") { outputDir ->
arrayOf("-t", "krill", "-q", "--non-word-tokens", "-D", outputDir.path, baseZip, spacyZip, wud24Corenlp)
@@ -323,6 +325,8 @@
assertTrue(flagJsons.isNotEmpty())
assertTrue(flagJsons.any { it.contains("\"s:,\"") })
assertTrue(flagJsons.any { it.contains("\"s:!\"") })
+ assertTrue(flagJsons.any { it.contains("\"base/p:_\"") }, "NWT output should mark non-word tokens with base/p:_")
+ assertTrue(flagJsons.all { it.contains("base/p=tokens") }, "NWT output should declare base/p=tokens in layerInfos")
}
@Test