Add --non-word-tokens option and change default
Default to not indexing non-word-tokens, like korapxml2krill
Change-Id: Iac9e6a1a55bd22f03914e9a32a93417a7804b615
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index c0d0899..58306c3 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -157,6 +157,12 @@
)
var tokenSeparator: String = if (outputFormat == OutputFormat.WORD2VEC || outputFormat == OutputFormat.NOW) " " else "\n"
+ /**
+  * Controls whether Krill stream generation keeps tokens whose surface text
+  * contains no letter, digit or underscore (e.g. pure punctuation).
+  * Defaults to false, in which case generateKrillStream filters such tokens
+  * out via shouldKeepTokenForKrill, unless the text content is empty.
+  */
+ @Option(
+ names = ["--non-word-tokens", "--nwt", "-nwt"],
+ description = ["Include punctuation and other non-word tokens when generating Krill output (matches korapxml2krill --non-word-tokens flag)."]
+ )
+ var includeNonWordTokens: Boolean = false
+
@Option(names = ["--offsets"], description = ["Not yet implemented: offsets"])
var offsets: Boolean = false
@@ -3632,9 +3638,18 @@
}
private fun generateKrillStream(textData: KrillTextData): List<String> {
- val tokens = textData.tokens ?: return emptyList()
+ val rawTokens = textData.tokens ?: return emptyList()
val text = textData.textContent ?: ""
val sentences = textData.sentences ?: emptyArray()
+ val tokens: List<Span> = if (includeNonWordTokens || text.isEmpty()) {
+ rawTokens.toList()
+ } else {
+ rawTokens.filter { span -> shouldKeepTokenForKrill(text, span) }
+ }
+ if (tokens.isEmpty()) {
+ LOGGER.fine("No tokens remained for ${textData.textId} after filtering non-word tokens")
+ return emptyList()
+ }
val result = mutableListOf<String>()
// Build offset-to-index map for resolving dependency heads and structural spans
@@ -3918,6 +3933,15 @@
return result
}
+    /**
+     * Decides whether a token counts as a "word" token and therefore stays in
+     * the Krill stream when non-word tokens are excluded.
+     *
+     * The span offsets are clamped to the bounds of [text] before the surface
+     * string is extracted, so out-of-range offsets never throw.
+     *
+     * @param text full text content the span offsets refer to
+     * @param span token span; only its `from` and `to` offsets are read
+     * @return `true` if [text] is empty (nothing to inspect) or the token surface
+     *         contains at least one letter, digit or underscore; `false` for
+     *         empty spans and for tokens made up solely of other characters
+     */
+    private fun shouldKeepTokenForKrill(text: String, span: Span): Boolean {
+        if (text.isEmpty()) return true
+        val safeFrom = span.from.coerceIn(0, text.length)
+        val safeTo = span.to.coerceIn(safeFrom, text.length)
+        if (safeFrom >= safeTo) return false
+        // Scan code points, not UTF-16 chars: letters and digits outside the BMP are
+        // stored as surrogate pairs, and a lone surrogate is never a letter or digit,
+        // so a char-wise check would wrongly discard such tokens.
+        return text.substring(safeFrom, safeTo)
+            .codePoints()
+            .anyMatch { cp -> cp == '_'.code || Character.isLetterOrDigit(cp) }
+    }
+
} // End of KorapXmlTool class
fun main(args: Array<String>): Unit {
@@ -4009,4 +4033,3 @@
fun String.urlEncode(): String {
return java.net.URLEncoder.encode(this, "UTF-8")
}
-
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
index 062ab14..c4ad2c1 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
@@ -8,6 +8,7 @@
import java.net.URL
import kotlin.test.Test
import kotlin.test.assertContains
+import kotlin.test.assertEquals
import kotlin.test.assertFalse
import kotlin.test.assertTrue
@@ -669,4 +670,184 @@
tempDir.deleteRecursively()
}
}
+
+    /**
+     * Runs the Krill conversion twice on the same sample archives, once with
+     * default options and once with `--non-word-tokens`, and checks that comma
+     * and exclamation mark surface terms are absent in the first run and present
+     * in the second.
+     */
+    @Test
+    fun krillRespectsNonWordTokenOption() {
+        val baseZip = loadResource("wud24_sample.zip").path
+        val spacyZip = loadResource("wud24_sample.spacy.zip").path
+
+        // Files.createTempDirectory creates the directory in one step and throws on failure,
+        // unlike createTempFile + delete() + mkdirs(), which is racy and ignores return values.
+        val defaultDir = java.nio.file.Files.createTempDirectory("krill_default").toFile()
+
+        try {
+            val defaultArgs = arrayOf("-f", "krill", "-D", defaultDir.path, baseZip, spacyZip)
+            val defaultExit = debug(defaultArgs)
+            // assertEquals reports the actual exit code when the conversion fails
+            assertEquals(0, defaultExit, "Krill conversion should succeed without --non-word-tokens")
+
+            val defaultTar = File(defaultDir, "wud24_sample.krill.tar")
+            assertTrue(defaultTar.exists(), "Default krill tar should exist")
+
+            val defaultJsons = readKrillJson(defaultTar).values
+            assertTrue(defaultJsons.isNotEmpty(), "Default Krill tar should contain JSON files")
+            assertTrue(
+                defaultJsons.none { it.contains("\"s:,\"") },
+                "Default Krill output should skip comma tokens"
+            )
+            assertTrue(
+                defaultJsons.none { it.contains("\"s:!\"") },
+                "Default Krill output should skip exclamation mark tokens"
+            )
+        } finally {
+            defaultDir.deleteRecursively()
+        }
+
+        val flagDir = java.nio.file.Files.createTempDirectory("krill_nwt").toFile()
+
+        try {
+            val flagArgs = arrayOf("-f", "krill", "--non-word-tokens", "-D", flagDir.path, baseZip, spacyZip)
+            val flagExit = debug(flagArgs)
+            assertEquals(0, flagExit, "Krill conversion should succeed with --non-word-tokens")
+
+            val flagTar = File(flagDir, "wud24_sample.krill.tar")
+            assertTrue(flagTar.exists(), "Krill tar should exist when --non-word-tokens is set")
+
+            val flagJsons = readKrillJson(flagTar).values
+            assertTrue(flagJsons.isNotEmpty(), "Krill tar should contain JSON files when --non-word-tokens is set")
+            assertTrue(
+                flagJsons.any { it.contains("\"s:,\"") },
+                "Krill output should include commas when --non-word-tokens is set"
+            )
+            assertTrue(
+                flagJsons.any { it.contains("\"s:!\"") },
+                "Krill output should include exclamation marks when --non-word-tokens is set"
+            )
+        } finally {
+            flagDir.deleteRecursively()
+        }
+    }
+
+    /**
+     * Converts the sample archives with default options and compares the result
+     * against the bundled reference tar: both must contain the same JSON entries,
+     * and for every entry the presence of the comma and full stop surface terms
+     * must agree with the reference.
+     */
+    @Test
+    fun krillDefaultMatchesPerlReference() {
+        val baseZip = loadResource("wud24_sample.zip").path
+        val spacyZip = loadResource("wud24_sample.spacy.zip").path
+        val referenceTar = File(loadResource("wud24_sample.wonwtopt.krill.tar").toURI())
+        assertTrue(referenceTar.exists(), "Reference Krill tar is missing: ${referenceTar.path}")
+
+        // Files.createTempDirectory creates the directory in one step and throws on failure,
+        // unlike createTempFile + delete() + mkdirs(), which is racy and ignores return values.
+        val kotlinDir = java.nio.file.Files.createTempDirectory("krill_reference_cmp").toFile()
+
+        try {
+            val args = arrayOf("-f", "krill", "-D", kotlinDir.path, baseZip, spacyZip)
+            val exitCode = debug(args)
+            // assertEquals reports the actual exit code when the conversion fails
+            assertEquals(0, exitCode, "Krill conversion should succeed for reference comparison")
+
+            val kotlinTar = File(kotlinDir, "wud24_sample.krill.tar")
+            assertTrue(kotlinTar.exists(), "Kotlin-produced Krill tar should exist at ${kotlinTar.path}")
+
+            val kotlinJsons = readKrillJson(kotlinTar)
+            val referenceJsons = readKrillJson(referenceTar)
+
+            assertEquals(referenceJsons.keys, kotlinJsons.keys, "Kotlin and reference JSON sets differ")
+
+            val tokensToCheck = listOf("\"s:,\"", "\"s:.\"")
+            for ((doc, referenceJson) in referenceJsons) {
+                // Safe: the key sets were asserted equal above
+                val kotlinJson = kotlinJsons.getValue(doc)
+                for (token in tokensToCheck) {
+                    assertEquals(
+                        referenceJson.contains(token),
+                        kotlinJson.contains(token),
+                        "Mismatch for $token in document $doc compared to reference"
+                    )
+                }
+            }
+        } finally {
+            kotlinDir.deleteRecursively()
+        }
+    }
+
+    /**
+     * Converts the sample archives with `--non-word-tokens` and compares the
+     * result against the bundled non-word-token reference tar: both must contain
+     * the same JSON entries, and for every entry the presence of the comma, full
+     * stop and exclamation mark surface terms must agree with the reference.
+     */
+    @Test
+    fun krillNonWordTokensMatchesPerlReference() {
+        val baseZip = loadResource("wud24_sample.zip").path
+        val spacyZip = loadResource("wud24_sample.spacy.zip").path
+        val referenceTar = File(loadResource("wud24_sample.nwt.krill.tar").toURI())
+        assertTrue(referenceTar.exists(), "Non-word-token reference tar missing: ${referenceTar.path}")
+
+        // Files.createTempDirectory creates the directory in one step and throws on failure,
+        // unlike createTempFile + delete() + mkdirs(), which is racy and ignores return values.
+        val kotlinDir = java.nio.file.Files.createTempDirectory("krill_reference_nwt").toFile()
+
+        try {
+            val args = arrayOf("-f", "krill", "--non-word-tokens", "-D", kotlinDir.path, baseZip, spacyZip)
+            val exitCode = debug(args)
+            // assertEquals reports the actual exit code when the conversion fails
+            assertEquals(0, exitCode, "Krill conversion with --non-word-tokens should succeed for reference comparison")
+
+            val kotlinTar = File(kotlinDir, "wud24_sample.krill.tar")
+            assertTrue(kotlinTar.exists(), "Kotlin-produced Krill tar (nwt) should exist at ${kotlinTar.path}")
+
+            val kotlinJsons = readKrillJson(kotlinTar)
+            val referenceJsons = readKrillJson(referenceTar)
+
+            assertEquals(referenceJsons.keys, kotlinJsons.keys, "Kotlin and reference JSON sets differ (nwt)")
+
+            val tokensToCheck = listOf("\"s:,\"", "\"s:.\"", "\"s:!\"")
+            for ((doc, referenceJson) in referenceJsons) {
+                // Safe: the key sets were asserted equal above
+                val kotlinJson = kotlinJsons.getValue(doc)
+                for (token in tokensToCheck) {
+                    assertEquals(
+                        referenceJson.contains(token),
+                        kotlinJson.contains(token),
+                        "Mismatch for $token in document $doc compared to nwt reference"
+                    )
+                }
+            }
+        } finally {
+            kotlinDir.deleteRecursively()
+        }
+    }
+
+    /**
+     * Extracts a Krill tar archive into a temporary directory and returns the
+     * decompressed text of every top-level `*.json.gz` file, keyed by file name.
+     *
+     * Extraction uses an external `tar` executable; decompression is done
+     * in-process with the JDK's GZIPInputStream, so a corrupt member raises an
+     * exception instead of silently producing empty content. The temporary
+     * directory is deleted before the function returns.
+     *
+     * @param tarFile Krill tar archive to read
+     * @return map from `*.json.gz` file name to its decompressed text
+     */
+    private fun readKrillJson(tarFile: File): Map<String, String> {
+        // Files.createTempDirectory creates the directory in one step and throws on failure,
+        // unlike createTempFile + delete() + mkdirs(), which is racy and ignores return values.
+        val extractDir = java.nio.file.Files.createTempDirectory("krill_extract").toFile()
+
+        return try {
+            val tarProcess = ProcessBuilder("tar", "-xf", tarFile.path, "-C", extractDir.path)
+                .redirectErrorStream(true)
+                .start()
+            // Drain the merged stdout/stderr before waiting; an unread pipe can fill up and block tar
+            tarProcess.inputStream.use { it.readBytes() }
+            assertTrue(tarProcess.waitFor() == 0, "Tar extraction should succeed for ${tarFile.path}")
+
+            val jsonFiles = extractDir.listFiles()?.filter { it.name.endsWith(".json.gz") }.orEmpty()
+            assertTrue(jsonFiles.isNotEmpty(), "No JSON files found in ${tarFile.path}")
+
+            jsonFiles.associate { jsonFile ->
+                // In-process gunzip: needs no external binary and fails loudly on bad input
+                val jsonContent = jsonFile.inputStream().use { raw ->
+                    java.util.zip.GZIPInputStream(raw).bufferedReader().readText()
+                }
+                jsonFile.name to jsonContent
+            }
+        } finally {
+            extractDir.deleteRecursively()
+        }
+    }
}
diff --git a/app/src/test/resources/wud24_sample.corenlp.zip b/app/src/test/resources/wud24_sample.corenlp.zip
new file mode 100644
index 0000000..e5f6df1
--- /dev/null
+++ b/app/src/test/resources/wud24_sample.corenlp.zip
Binary files differ
diff --git a/app/src/test/resources/wud24_sample.nwt.krill.tar b/app/src/test/resources/wud24_sample.nwt.krill.tar
new file mode 100644
index 0000000..df498e0
--- /dev/null
+++ b/app/src/test/resources/wud24_sample.nwt.krill.tar
Binary files differ
diff --git a/app/src/test/resources/wud24_sample.wonwtopt.krill.tar b/app/src/test/resources/wud24_sample.wonwtopt.krill.tar
new file mode 100644
index 0000000..b3642ae
--- /dev/null
+++ b/app/src/test/resources/wud24_sample.wonwtopt.krill.tar
Binary files differ