Support non-BMP tokens (emojis, …) in krill output

Resolves #7

Change-Id: I6045053b7e6f7497287c538df8bf1116fab1afc5
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index aff3db5..6a5ff6b 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -4563,7 +4563,7 @@
             }
             val text = texts[docId]
             if (text != null) {
-                textData.textContent = text.toString()
+                textData.textContent = text
             }
             val tokenArray = tokens[docId]
             if (tokenArray != null) {
@@ -4780,7 +4780,7 @@
         // Collect text content (only from base foundry)
         if (foundry == "base" && texts[docId] != null) {
             synchronized(textData) {
-                textData.textContent = texts[docId]!!.toString()
+                textData.textContent = texts[docId]!!
                 textData.tokens = tokens[docId]
                 textData.sentences = sentences[docId]
                 LOGGER.info("  Collected base text data for $docId: ${textData.textContent?.length ?: 0} chars, ${textData.tokens?.size ?: 0} tokens")
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
index 8ccc285..7871341 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
@@ -2,6 +2,7 @@
 
 import de.ids_mannheim.korapxmltools.KorapXmlTool.MorphoSpan
 import de.ids_mannheim.korapxmltools.KorapXmlTool.Span
+import de.ids_mannheim.korapxmltools.NonBmpString
 import java.util.logging.Logger
 
 /**
@@ -18,7 +19,7 @@
      */
     data class KrillTextData(
         var textId: String,
-        var textContent: String? = null,
+        var textContent: NonBmpString? = null,
         var headerMetadata: MutableMap<String, Any> = mutableMapOf(),
         var tokens: Array<Span>? = null,
         var sentences: Array<Span>? = null,
@@ -207,7 +208,7 @@
 
         // data section
         sb.append("\"data\":{")
-        sb.append("\"text\":${jsonString(textData.textContent ?: "")},")
+        sb.append("\"text\":${jsonString(textData.textContent?.toString() ?: "")},")
 
         val sentenceSpanFoundries = textData.structureSpans.foundriesWithSentenceSpans()
         val constituencySpanFoundries = textData.structureSpans.foundriesWithConstituencySpans()
@@ -354,9 +355,9 @@
 
     private fun generateStream(textData: KrillTextData, includeNonWordTokens: Boolean): List<String> {
         val rawTokens = textData.tokens ?: return emptyList()
-        val text = textData.textContent ?: ""
+        val text = textData.textContent ?: NonBmpString("")
         val sentences = textData.sentences ?: emptyArray()
-        val tokens: List<Span> = if (includeNonWordTokens || text.isEmpty()) {
+        val tokens: List<Span> = if (includeNonWordTokens || text.length == 0) {
             rawTokens.toList()
         } else {
             rawTokens.filter { span -> shouldKeepTokenForKrill(text, span) }
@@ -568,8 +569,9 @@
             tokenAnnotations.add(jsonString("_$index\$<i>${token.from}<i>${token.to}"))
 
             // Get surface form (used for both i: and s: annotations)
+            // The slice is taken via subSequence and converted back to a String
             val surfaceForm = if (token.to <= text.length) {
-                text.substring(token.from, token.to)
+                text.subSequence(token.from, token.to).toString()
             } else {
                 ""
             }
@@ -660,13 +662,19 @@
         return result
     }
 
-    private fun shouldKeepTokenForKrill(text: String, span: Span): Boolean {
-        if (text.isEmpty()) return true
+    /** Keeps a token only if its surface has a letter, digit, '_' or So-category symbol; always true for empty text. */
+    private fun shouldKeepTokenForKrill(text: NonBmpString, span: Span): Boolean {
+        if (text.length == 0) return true
         val safeFrom = span.from.coerceIn(0, text.length)
         val safeTo = span.to.coerceIn(safeFrom, text.length)
         if (safeFrom >= safeTo) return false
-        val surface = text.substring(safeFrom, safeTo)
-        return surface.any { it.isLetterOrDigit() || it == '_' }
+        val surface = text.subSequence(safeFrom, safeTo).toString()
+        // Classify whole code points: a single surrogate half says nothing about the character's category.
+        return surface.codePoints().anyMatch { cp ->
+            Character.isLetterOrDigit(cp) ||
+                cp == '_'.code ||
+                Character.getType(cp) == Character.OTHER_SYMBOL.toInt()
+        }
     }
 
     // Extension functions for StructureSpan collections
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
index d7837a0..cdad437 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
@@ -593,4 +593,51 @@
             envOutputDir.deleteRecursively()
         }
     }
+    /** Krill export without --non-word-tokens: emoji must appear in the output and as "s:" entries. */
+    @Test
+    fun krillCanHandleNonBmpText() {
+        val samplePath = loadResource("wdd17sample.zip").path
+        val krillTar = ensureKrillTar("wdd17_non_bmp", "wdd17sample.krill.tar") { outputDir ->
+            arrayOf("-t", "krill", "-q", "-D", outputDir.path, samplePath)
+        }
+        assertTrue(krillTar.exists())
+
+        val krillJsons = readKrillJson(krillTar)
+        assertTrue(krillJsons.isNotEmpty())
+        val allJson = krillJsons.values.joinToString("\n")
+
+        // The output must contain each of the three monkey emoji (🙈 🙉 🙊),
+        // spelled here as UTF-16 surrogate pairs.
+        assertTrue(allJson.contains("\uD83D\uDE48"), "Should contain 🙈")
+        assertTrue(allJson.contains("\uD83D\uDE49"), "Should contain 🙉")
+        assertTrue(allJson.contains("\uD83D\uDE4A"), "Should contain 🙊")
+
+        // Ordinary text from the same sample must still be present
+        assertTrue(allJson.contains("mach"), "Should contain 'mach'")
+        assertTrue(allJson.contains("Bereinige wenigstens die allergröbsten Sachen"), "Should contain German text")
+
+        // Each emoji must also show up as a quoted "s:" entry of its own
+        assertTrue(allJson.contains("\"s:\uD83D\uDE48\""), "Should contain token 🙈")
+        assertTrue(allJson.contains("\"s:\uD83D\uDE49\""), "Should contain token 🙉")
+        assertTrue(allJson.contains("\"s:\uD83D\uDE4A\""), "Should contain token 🙊")
+    }
+
+    /** Krill export with --non-word-tokens must still emit each emoji as a quoted "s:" entry. */
+    @Test
+    fun krillCanHandleNonBmpTextWithNonWordTokens() {
+        val samplePath = loadResource("wdd17sample.zip").path
+        val krillTar = ensureKrillTar("wdd17_non_bmp_nwt", "wdd17sample.krill.tar") { outputDir ->
+            arrayOf("-t", "krill", "-q", "--non-word-tokens", "-D", outputDir.path, samplePath)
+        }
+        assertTrue(krillTar.exists())
+
+        val krillJsons = readKrillJson(krillTar)
+        assertTrue(krillJsons.isNotEmpty())
+        val allJson = krillJsons.values.joinToString("\n")
+
+        // The three monkey emoji must each appear as an "s:" entry
+        assertTrue(allJson.contains("\"s:\uD83D\uDE48\""), "Should contain token 🙈 with --non-word-tokens")
+        assertTrue(allJson.contains("\"s:\uD83D\uDE49\""), "Should contain token 🙉 with --non-word-tokens")
+        assertTrue(allJson.contains("\"s:\uD83D\uDE4A\""), "Should contain token 🙊 with --non-word-tokens")
+    }
 }