Support non-BMP tokens (emojis, …) in krill output
Resolves #7
Change-Id: I6045053b7e6f7497287c538df8bf1116fab1afc5
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index aff3db5..6a5ff6b 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -4563,7 +4563,7 @@
}
val text = texts[docId]
if (text != null) {
- textData.textContent = text.toString()
+ textData.textContent = text
}
val tokenArray = tokens[docId]
if (tokenArray != null) {
@@ -4780,7 +4780,7 @@
// Collect text content (only from base foundry)
if (foundry == "base" && texts[docId] != null) {
synchronized(textData) {
- textData.textContent = texts[docId]!!.toString()
+ textData.textContent = texts[docId]!!
textData.tokens = tokens[docId]
textData.sentences = sentences[docId]
LOGGER.info(" Collected base text data for $docId: ${textData.textContent?.length ?: 0} chars, ${textData.tokens?.size ?: 0} tokens")
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
index 8ccc285..7871341 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
@@ -2,6 +2,7 @@
import de.ids_mannheim.korapxmltools.KorapXmlTool.MorphoSpan
import de.ids_mannheim.korapxmltools.KorapXmlTool.Span
+import de.ids_mannheim.korapxmltools.NonBmpString
import java.util.logging.Logger
/**
@@ -18,7 +19,7 @@
*/
data class KrillTextData(
var textId: String,
- var textContent: String? = null,
+ var textContent: NonBmpString? = null,
var headerMetadata: MutableMap<String, Any> = mutableMapOf(),
var tokens: Array<Span>? = null,
var sentences: Array<Span>? = null,
@@ -207,7 +208,7 @@
// data section
sb.append("\"data\":{")
- sb.append("\"text\":${jsonString(textData.textContent ?: "")},")
+ sb.append("\"text\":${jsonString(textData.textContent?.toString() ?: "")},")
val sentenceSpanFoundries = textData.structureSpans.foundriesWithSentenceSpans()
val constituencySpanFoundries = textData.structureSpans.foundriesWithConstituencySpans()
@@ -354,9 +355,9 @@
private fun generateStream(textData: KrillTextData, includeNonWordTokens: Boolean): List<String> {
val rawTokens = textData.tokens ?: return emptyList()
- val text = textData.textContent ?: ""
+ val text = textData.textContent ?: NonBmpString("")
val sentences = textData.sentences ?: emptyArray()
- val tokens: List<Span> = if (includeNonWordTokens || text.isEmpty()) {
+ val tokens: List<Span> = if (includeNonWordTokens || text.length == 0) {
rawTokens.toList()
} else {
rawTokens.filter { span -> shouldKeepTokenForKrill(text, span) }
@@ -568,8 +569,9 @@
tokenAnnotations.add(jsonString("_$index\$<i>${token.from}<i>${token.to}"))
// Get surface form (used for both i: and s: annotations)
+ // Falls back to an empty string when the token's end offset lies beyond the text length
val surfaceForm = if (token.to <= text.length) {
- text.substring(token.from, token.to)
+ text.subSequence(token.from, token.to).toString()
} else {
""
}
@@ -660,13 +662,19 @@
return result
}
- private fun shouldKeepTokenForKrill(text: String, span: Span): Boolean {
- if (text.isEmpty()) return true
+ private fun shouldKeepTokenForKrill(text: NonBmpString, span: Span): Boolean {
+ if (text.length == 0) return true
val safeFrom = span.from.coerceIn(0, text.length)
val safeTo = span.to.coerceIn(safeFrom, text.length)
if (safeFrom >= safeTo) return false
- val surface = text.substring(safeFrom, safeTo)
- return surface.any { it.isLetterOrDigit() || it == '_' }
+ val surface = text.subSequence(safeFrom, safeTo).toString()
+ val keep = surface.any {
+ it.isLetterOrDigit() ||
+ it == '_' ||
+ it.isSurrogate() ||
+ Character.getType(it) == Character.OTHER_SYMBOL.toInt()
+ }
+ return keep
}
// Extension functions for StructureSpan collections
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
index d7837a0..cdad437 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
@@ -593,4 +593,51 @@
envOutputDir.deleteRecursively()
}
}
+ @Test
+ fun krillCanHandleNonBmpText() {
+ val wdd17 = loadResource("wdd17sample.zip").path
+ val generatedTar = ensureKrillTar("wdd17_non_bmp", "wdd17sample.krill.tar") { outputDir ->
+ arrayOf("-t", "krill", "-q", "-D", outputDir.path, wdd17)
+ }
+ assertTrue(generatedTar.exists())
+
+ val jsons = readKrillJson(generatedTar)
+ assertTrue(jsons.isNotEmpty())
+
+ val combinedJsonContent = jsons.values.joinToString("\n")
+
+ // Check for the presence of the emoji sequence
+ // 🙈 🙉 🙊
+ assertTrue(combinedJsonContent.contains("\uD83D\uDE48"), "Should contain 🙈")
+ assertTrue(combinedJsonContent.contains("\uD83D\uDE49"), "Should contain 🙉")
+ assertTrue(combinedJsonContent.contains("\uD83D\uDE4A"), "Should contain 🙊")
+
+ // Check for the text context
+ assertTrue(combinedJsonContent.contains("mach"), "Should contain 'mach'")
+ assertTrue(combinedJsonContent.contains("Bereinige wenigstens die allergröbsten Sachen"), "Should contain German text")
+
+ // Check if emojis are indexed as tokens
+ assertTrue(combinedJsonContent.contains("\"s:\uD83D\uDE48\""), "Should contain token 🙈")
+ assertTrue(combinedJsonContent.contains("\"s:\uD83D\uDE49\""), "Should contain token 🙉")
+ assertTrue(combinedJsonContent.contains("\"s:\uD83D\uDE4A\""), "Should contain token 🙊")
+ }
+
+ @Test
+ fun krillCanHandleNonBmpTextWithNonWordTokens() {
+ val wdd17 = loadResource("wdd17sample.zip").path
+ val generatedTar = ensureKrillTar("wdd17_non_bmp_nwt", "wdd17sample.krill.tar") { outputDir ->
+ arrayOf("-t", "krill", "-q", "--non-word-tokens", "-D", outputDir.path, wdd17)
+ }
+ assertTrue(generatedTar.exists())
+
+ val jsons = readKrillJson(generatedTar)
+ assertTrue(jsons.isNotEmpty())
+
+ val combinedJsonContent = jsons.values.joinToString("\n")
+
+ // Check if emojis are indexed as tokens
+ assertTrue(combinedJsonContent.contains("\"s:\uD83D\uDE48\""), "Should contain token 🙈 with --non-word-tokens")
+ assertTrue(combinedJsonContent.contains("\"s:\uD83D\uDE49\""), "Should contain token 🙉 with --non-word-tokens")
+ assertTrue(combinedJsonContent.contains("\"s:\uD83D\uDE4A\""), "Should contain token 🙊 with --non-word-tokens")
+ }
}