Add --non-word-tokens option and change default

Krill output now omits non-word tokens (e.g. punctuation) by default, like korapxml2krill; pass --non-word-tokens to index them.

Change-Id: Iac9e6a1a55bd22f03914e9a32a93417a7804b615
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index c0d0899..58306c3 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -157,6 +157,12 @@
     )
     var tokenSeparator: String = if (outputFormat == OutputFormat.WORD2VEC || outputFormat == OutputFormat.NOW) " " else "\n"
 
+    @Option(
+        names = ["--non-word-tokens", "--nwt", "-nwt"],
+        description = ["Include punctuation and other non-word tokens when generating Krill output (matches korapxml2krill --non-word-tokens flag)."]
+    )
+    var includeNonWordTokens: Boolean = false // false (default): generateKrillStream filters tokens through shouldKeepTokenForKrill when text is non-empty
+
     @Option(names = ["--offsets"], description = ["Not yet implemented: offsets"])
     var offsets: Boolean = false
 
@@ -3632,9 +3638,18 @@
     }
 
     private fun generateKrillStream(textData: KrillTextData): List<String> {
-        val tokens = textData.tokens ?: return emptyList()
+        val rawTokens = textData.tokens ?: return emptyList()
         val text = textData.textContent ?: ""
         val sentences = textData.sentences ?: emptyArray()
+        val tokens: List<Span> = if (includeNonWordTokens || text.isEmpty()) {
+            rawTokens.toList()
+        } else {
+            rawTokens.filter { span -> shouldKeepTokenForKrill(text, span) }
+        }
+        if (tokens.isEmpty()) {
+            LOGGER.fine("No tokens remained for ${textData.textId} after filtering non-word tokens")
+            return emptyList()
+        }
         val result = mutableListOf<String>()
 
         // Build offset-to-index map for resolving dependency heads and structural spans
@@ -3918,6 +3933,15 @@
         return result
     }
 
+    /** Keeps [span] if its surface in [text] has a letter, digit or '_' (tested per code point); keeps everything when [text] is empty. */
+    private fun shouldKeepTokenForKrill(text: String, span: Span): Boolean {
+        if (text.isEmpty()) return true
+        val safeFrom = span.from.coerceIn(0, text.length)
+        val safeTo = span.to.coerceIn(safeFrom, text.length)
+        // Scan code points, not UTF-16 chars: a surrogate half is never a letter, so supplementary-plane words were dropped. Empty span -> false.
+        return text.substring(safeFrom, safeTo).codePoints().anyMatch { Character.isLetterOrDigit(it) || it == '_'.code }
+    }
+
 }  // End of KorapXmlTool class
 
 fun main(args: Array<String>): Unit {
@@ -4009,4 +4033,3 @@
 fun String.urlEncode(): String {
     return java.net.URLEncoder.encode(this, "UTF-8")
 }
-
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
index 062ab14..c4ad2c1 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
@@ -8,6 +8,7 @@
 import java.net.URL
 import kotlin.test.Test
 import kotlin.test.assertContains
+import kotlin.test.assertEquals
 import kotlin.test.assertFalse
 import kotlin.test.assertTrue
 
@@ -669,4 +670,184 @@
             tempDir.deleteRecursively()
         }
     }
+
+    /** Krill output omits comma and exclamation-mark tokens by default and contains them when `--non-word-tokens` is passed. */
+    @Test
+    fun krillRespectsNonWordTokenOption() {
+        // Resolve resources via toURI(): a raw URL path stays percent-encoded and breaks on checkouts with spaces.
+        val baseZip = File(loadResource("wud24_sample.zip").toURI()).path
+        val spacyZip = File(loadResource("wud24_sample.spacy.zip").toURI()).path
+
+        val defaultDir = File.createTempFile("krill_default", "").apply {
+            delete()
+            mkdirs()
+        }
+
+        try {
+            val defaultArgs = arrayOf("-f", "krill", "-D", defaultDir.path, baseZip, spacyZip)
+            val defaultExit = debug(defaultArgs)
+            assertEquals(0, defaultExit, "Krill conversion should succeed without --non-word-tokens")
+
+            val defaultTar = File(defaultDir, "wud24_sample.krill.tar")
+            assertTrue(defaultTar.exists(), "Default krill tar should exist")
+
+            val defaultJsons = readKrillJson(defaultTar).values
+            assertTrue(defaultJsons.isNotEmpty(), "Default Krill tar should contain JSON files")
+            assertTrue(
+                defaultJsons.none { "\"s:,\"" in it },
+                "Default Krill output should skip comma tokens"
+            )
+            assertTrue(
+                defaultJsons.none { "\"s:!\"" in it },
+                "Default Krill output should skip exclamation mark tokens"
+            )
+        } finally {
+            defaultDir.deleteRecursively()
+        }
+
+        val flagDir = File.createTempFile("krill_nwt", "").apply {
+            delete()
+            mkdirs()
+        }
+
+        try {
+            val flagArgs = arrayOf("-f", "krill", "--non-word-tokens", "-D", flagDir.path, baseZip, spacyZip)
+            val flagExit = debug(flagArgs)
+            assertEquals(0, flagExit, "Krill conversion should succeed with --non-word-tokens")
+
+            val flagTar = File(flagDir, "wud24_sample.krill.tar")
+            assertTrue(flagTar.exists(), "Krill tar should exist when --non-word-tokens is set")
+
+            val flagJsons = readKrillJson(flagTar).values
+            assertTrue(flagJsons.isNotEmpty(), "Krill tar should contain JSON files when --non-word-tokens is set")
+            assertTrue(
+                flagJsons.any { "\"s:,\"" in it },
+                "Krill output should include commas when --non-word-tokens is set"
+            )
+            assertTrue(
+                flagJsons.any { "\"s:!\"" in it },
+                "Krill output should include exclamation marks when --non-word-tokens is set"
+            )
+        } finally {
+            flagDir.deleteRecursively()
+        }
+    }
+
+    /** Default Krill output must match `wud24_sample.wonwtopt.krill.tar` in document set and in presence of "," and "." tokens. */
+    @Test
+    fun krillDefaultMatchesPerlReference() {
+        // Resolve resources via toURI(): a raw URL path stays percent-encoded and breaks on checkouts with spaces.
+        val baseZip = File(loadResource("wud24_sample.zip").toURI()).path
+        val spacyZip = File(loadResource("wud24_sample.spacy.zip").toURI()).path
+        val referenceTar = File(loadResource("wud24_sample.wonwtopt.krill.tar").toURI())
+        assertTrue(referenceTar.exists(), "Reference Krill tar is missing: ${referenceTar.path}")
+
+        val kotlinDir = File.createTempFile("krill_reference_cmp", "").apply {
+            delete()
+            mkdirs()
+        }
+
+        try {
+            val args = arrayOf("-f", "krill", "-D", kotlinDir.path, baseZip, spacyZip)
+            val exitCode = debug(args)
+            assertEquals(0, exitCode, "Krill conversion should succeed for reference comparison")
+
+            val kotlinTar = File(kotlinDir, "wud24_sample.krill.tar")
+            assertTrue(kotlinTar.exists(), "Kotlin-produced Krill tar should exist at ${kotlinTar.path}")
+
+            val kotlinJsons = readKrillJson(kotlinTar)
+            val referenceJsons = readKrillJson(referenceTar)
+
+            assertEquals(referenceJsons.keys, kotlinJsons.keys, "Kotlin and reference JSON sets differ")
+
+            // Only the presence (not the count or position) of each surface token is compared per document.
+            val tokensToCheck = listOf("\"s:,\"", "\"s:.\"")
+            for ((doc, referenceJson) in referenceJsons) {
+                val kotlinJson = kotlinJsons.getValue(doc)
+                for (token in tokensToCheck) {
+                    assertEquals(
+                        token in referenceJson,
+                        token in kotlinJson,
+                        "Mismatch for $token in document $doc compared to reference"
+                    )
+                }
+            }
+        } finally {
+            kotlinDir.deleteRecursively()
+        }
+    }
+
+    /** With `--non-word-tokens`, output must match `wud24_sample.nwt.krill.tar` in document set and in presence of ",", "." and "!". */
+    @Test
+    fun krillNonWordTokensMatchesPerlReference() {
+        // Resolve resources via toURI(): a raw URL path stays percent-encoded and breaks on checkouts with spaces.
+        val baseZip = File(loadResource("wud24_sample.zip").toURI()).path
+        val spacyZip = File(loadResource("wud24_sample.spacy.zip").toURI()).path
+        val referenceTar = File(loadResource("wud24_sample.nwt.krill.tar").toURI())
+        assertTrue(referenceTar.exists(), "Non-word-token reference tar missing: ${referenceTar.path}")
+
+        val kotlinDir = File.createTempFile("krill_reference_nwt", "").apply {
+            delete()
+            mkdirs()
+        }
+
+        try {
+            val args = arrayOf("-f", "krill", "--non-word-tokens", "-D", kotlinDir.path, baseZip, spacyZip)
+            val exitCode = debug(args)
+            assertEquals(0, exitCode, "Krill conversion with --non-word-tokens should succeed for reference comparison")
+
+            val kotlinTar = File(kotlinDir, "wud24_sample.krill.tar")
+            assertTrue(kotlinTar.exists(), "Kotlin-produced Krill tar (nwt) should exist at ${kotlinTar.path}")
+
+            val kotlinJsons = readKrillJson(kotlinTar)
+            val referenceJsons = readKrillJson(referenceTar)
+
+            assertEquals(referenceJsons.keys, kotlinJsons.keys, "Kotlin and reference JSON sets differ (nwt)")
+
+            // Only the presence (not the count or position) of each surface token is compared per document.
+            val tokensToCheck = listOf("\"s:,\"", "\"s:.\"", "\"s:!\"")
+            for ((doc, referenceJson) in referenceJsons) {
+                val kotlinJson = kotlinJsons.getValue(doc)
+                for (token in tokensToCheck) {
+                    assertEquals(
+                        token in referenceJson,
+                        token in kotlinJson,
+                        "Mismatch for $token in document $doc compared to nwt reference"
+                    )
+                }
+            }
+        } finally {
+            kotlinDir.deleteRecursively()
+        }
+    }
+
+    /** Extracts [tarFile] with the system `tar` and maps each top-level `*.json.gz` entry name to its decompressed text. */
+    private fun readKrillJson(tarFile: File): Map<String, String> {
+        val extractDir = File.createTempFile("krill_extract", "").let {
+            it.delete()
+            it.mkdirs()
+            it
+        }
+
+        return try {
+            // inheritIO: nothing is piped, so tar cannot block on an unread pipe and its errors appear in the test log.
+            val tarProcess = ProcessBuilder("tar", "-xf", tarFile.path, "-C", extractDir.path)
+                .inheritIO()
+                .start()
+            assertTrue(tarProcess.waitFor() == 0, "Tar extraction should succeed for ${tarFile.path}")
+
+            val jsonFiles = extractDir.listFiles()?.filter { it.name.endsWith(".json.gz") }.orEmpty()
+            assertTrue(jsonFiles.isNotEmpty(), "No JSON files found in ${tarFile.path}")
+
+            jsonFiles.associate { jsonFile ->
+                // Decompress in-process: no external gunzip needed, and a corrupt archive throws instead of yielding partial text.
+                val jsonContent = jsonFile.inputStream().use { raw ->
+                    java.util.zip.GZIPInputStream(raw).bufferedReader().readText()
+                }
+                jsonFile.name to jsonContent
+            }
+        } finally {
+            extractDir.deleteRecursively()
+        }
+    }
 }
diff --git a/app/src/test/resources/wud24_sample.corenlp.zip b/app/src/test/resources/wud24_sample.corenlp.zip
new file mode 100644
index 0000000..e5f6df1
--- /dev/null
+++ b/app/src/test/resources/wud24_sample.corenlp.zip
Binary files differ
diff --git a/app/src/test/resources/wud24_sample.nwt.krill.tar b/app/src/test/resources/wud24_sample.nwt.krill.tar
new file mode 100644
index 0000000..df498e0
--- /dev/null
+++ b/app/src/test/resources/wud24_sample.nwt.krill.tar
Binary files differ
diff --git a/app/src/test/resources/wud24_sample.wonwtopt.krill.tar b/app/src/test/resources/wud24_sample.wonwtopt.krill.tar
new file mode 100644
index 0000000..b3642ae
--- /dev/null
+++ b/app/src/test/resources/wud24_sample.wonwtopt.krill.tar
Binary files differ