Fix handling of non-BMP characters (e.g. emojis)

Resolves #1

Should fix the broken DeReKoVecs-2024-I – once it has been trained again

Change-Id: I99bf065832d82ac9b62f3995402dd96e501e3037
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index 104572c..93ca24b 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -206,7 +206,7 @@
 
     private var annotationWorkerPool : AnnotationWorkerPool? = null
 
-    val texts: ConcurrentHashMap<String, String> = ConcurrentHashMap()
+    val texts: ConcurrentHashMap<String, NonBmpString> = ConcurrentHashMap()
     val sentences: ConcurrentHashMap<String, Array<Span>> = ConcurrentHashMap()
     val tokens: ConcurrentHashMap<String, Array<Span>> = ConcurrentHashMap()
     val morpho: ConcurrentHashMap<String, MutableMap<String, MorphoSpan>> = ConcurrentHashMap()
@@ -371,7 +371,7 @@
                     "data.xml" -> {
                         val textsList: NodeList = doc.getElementsByTagName("text")
                         if (textsList.length > 0) {
-                            texts[docId] = textsList.item(0).textContent
+                            texts[docId] = NonBmpString(textsList.item(0).textContent)
                         }
                     }
 
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/NonBmpString.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/NonBmpString.kt
new file mode 100644
index 0000000..44c7282
--- /dev/null
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/NonBmpString.kt
@@ -0,0 +1,62 @@
+package de.ids_mannheim.korapxmltools
+
+/**
+ * Immutable text stored as one `Int` per Unicode code point (UTF-32), so that
+ * [length] and all indices count code points rather than UTF-16 code units and
+ * are not shifted by preceding characters outside the BMP (e.g. emojis).
+ */
+class NonBmpString private constructor(source: IntArray, copy: Boolean) : CharSequence {
+
+    // Copied only when the array comes from a caller that could still mutate it.
+    private val utf32Chars: IntArray = if (copy) source.copyOf() else source
+
+    /** Decodes [input], merging each surrogate pair into a single code point. */
+    constructor(input: String) : this(input.codePoints().toArray(), false)
+
+    /** Wraps a defensive copy of [utf32Chars], one code point per element. */
+    constructor(utf32Chars: IntArray) : this(utf32Chars, true)
+
+    /** Number of code points, not UTF-16 code units. */
+    override val length: Int get() = utf32Chars.size
+
+    /**
+     * Returns the code point at [index] as a [Char].
+     *
+     * @throws IndexOutOfBoundsException if [index] is outside `0 until length`.
+     * @throws UnsupportedOperationException if that code point lies outside the BMP
+     * and therefore does not fit into a single [Char].
+     */
+    override fun get(index: Int): Char {
+        if (index !in utf32Chars.indices) {
+            throw IndexOutOfBoundsException("Index $index is out of bounds for NonBmpString with length $length")
+        }
+        val codePoint = utf32Chars[index]
+        if (!Character.isBmpCodePoint(codePoint)) {
+            throw UnsupportedOperationException("Non-BMP characters not supported directly as Char")
+        }
+        return codePoint.toChar()
+    }
+
+    /**
+     * Returns the code points in `[startIndex, endIndex)` as a new [NonBmpString].
+     *
+     * @throws IndexOutOfBoundsException if the range is reversed or exceeds the text.
+     */
+    override fun subSequence(startIndex: Int, endIndex: Int): CharSequence {
+        if (startIndex < 0 || endIndex > length || startIndex > endIndex) {
+            throw IndexOutOfBoundsException("Invalid substring range [$startIndex, $endIndex) for length $length")
+        }
+        // copyOfRange already yields a private array, so skip the second copy.
+        return NonBmpString(utf32Chars.copyOfRange(startIndex, endIndex), false)
+    }
+
+    /** Re-encodes the text as UTF-16, writing non-BMP code points as surrogate pairs. */
+    override fun toString(): String {
+        val builder = StringBuilder(utf32Chars.size)
+        for (codePoint in utf32Chars) {
+            if (Character.isBmpCodePoint(codePoint)) builder.append(codePoint.toChar())
+            else builder.append(Character.highSurrogate(codePoint)).append(Character.lowSurrogate(codePoint))
+        }
+        return builder.toString()
+    }
+}
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/ParserToolBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/ParserToolBridge.kt
index 8b1fa8f..ab79da1 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/ParserToolBridge.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/ParserToolBridge.kt
@@ -5,7 +5,7 @@
         tokens: Array<KorapXml2Conllu.Span>,
         morpho: MutableMap<String, KorapXml2Conllu.MorphoSpan>?,
         sentenceSpans: Array<KorapXml2Conllu.Span>?,
-        text: String
+        text: NonBmpString
     ): MutableMap<String, KorapXml2Conllu.MorphoSpan> {
         val sentence_tokens = mutableListOf<String>()
         val sentence_token_offsets = mutableListOf<String>()
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/TaggerToolBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/TaggerToolBridge.kt
index 03e04e8..95a9315 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/TaggerToolBridge.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/TaggerToolBridge.kt
@@ -3,7 +3,7 @@
 abstract class TaggerToolBridge : AnnotationToolBridge {
 
     fun tagText(
-        tokens: Array<KorapXml2Conllu.Span>, sentenceSpans: Array<KorapXml2Conllu.Span>?, text: String
+        tokens: Array<KorapXml2Conllu.Span>, sentenceSpans: Array<KorapXml2Conllu.Span>?, text: NonBmpString
     ): MutableMap<String, KorapXml2Conllu.MorphoSpan> {
         val sentence_tokens = mutableListOf<String>()
         val sentence_token_offsets = mutableListOf<String>()
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
index 39d8974..cf4941a 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
@@ -21,6 +21,7 @@
     val goeTreeTagger = loadResource("goe.tree_tagger.zip").path
     val zca20scrambled = loadResource("zca20-scrambled.zip").path
     val wdf19 = loadResource("wdf19.zip").path
+    val wdd17 = loadResource("wdd17sample.zip").path
 
     @Before
     fun setUpStreams() {
@@ -188,6 +189,20 @@
     }
 
     @Test
+    fun canHandleNonBmpText() {
+        // Run the word2vec export on the wdd17 sample archive.
+        debug(arrayOf("--word2vec", wdd17))
+        val output = outContent.toString()
+
+        // The emojis 🙈 🙉 🙊 are non-BMP, i.e. surrogate pairs in the expected string.
+        val emojiLine = "\n-- mach \uD83D\uDE48 \uD83D\uDE49 \uD83D\uDE4A 20 : 45 , 1. Feb .\n"
+        assertContains(output, emojiLine)
+
+        // A plain sentence from the same run must appear in the output as well.
+        assertContains(output, "\nBereinige wenigstens die allergröbsten Sachen .\n")
+    }
+
+    @Test
     fun canExtractExtraFeaturesByRegex() {
         val args = arrayOf("-e" ,"(posting/id|div/id)",loadResource("wdf19.zip").path)
         debug(args)
diff --git a/app/src/test/resources/wdd17sample.zip b/app/src/test/resources/wdd17sample.zip
new file mode 100644
index 0000000..8c92a18
--- /dev/null
+++ b/app/src/test/resources/wdd17sample.zip
Binary files differ