Fix handling of non-BMP characters (e.g. emojis)
Resolves #1
Should fix broken DeReKoVecs-2024-I – after training
Change-Id: I99bf065832d82ac9b62f3995402dd96e501e3037
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index 104572c..93ca24b 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -206,7 +206,7 @@
private var annotationWorkerPool : AnnotationWorkerPool? = null
- val texts: ConcurrentHashMap<String, String> = ConcurrentHashMap()
+ val texts: ConcurrentHashMap<String, NonBmpString> = ConcurrentHashMap()
val sentences: ConcurrentHashMap<String, Array<Span>> = ConcurrentHashMap()
val tokens: ConcurrentHashMap<String, Array<Span>> = ConcurrentHashMap()
val morpho: ConcurrentHashMap<String, MutableMap<String, MorphoSpan>> = ConcurrentHashMap()
@@ -371,7 +371,7 @@
"data.xml" -> {
val textsList: NodeList = doc.getElementsByTagName("text")
if (textsList.length > 0) {
- texts[docId] = textsList.item(0).textContent
+ texts[docId] = NonBmpString(textsList.item(0).textContent)
}
}
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/NonBmpString.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/NonBmpString.kt
new file mode 100644
index 0000000..44c7282
--- /dev/null
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/NonBmpString.kt
@@ -0,0 +1,80 @@
+package de.ids_mannheim.korapxmltools
+
+/**
+ * An immutable character sequence indexed by Unicode code point rather than by UTF-16 code unit.
+ *
+ * Every element of the backing array is one code point, so [length], [get] and [subSequence]
+ * count a non-BMP character (e.g. an emoji) as a single position instead of as a surrogate pair.
+ *
+ * Note that [get] can only return BMP code points; use [subSequence] followed by [toString]
+ * to extract text that may contain non-BMP characters.
+ */
+class NonBmpString : CharSequence {
+
+    private val utf32Chars: IntArray
+
+    /** Decodes [input] into code points; surrogate pairs are combined into one element each. */
+    constructor(input: String) {
+        utf32Chars = input.codePoints().toArray()
+    }
+
+    /** Wraps a defensive copy of [utf32Chars], one code point per element. */
+    constructor(utf32Chars: IntArray) {
+        this.utf32Chars = utf32Chars.copyOf()
+    }
+
+    /** Copies only the requested range, so [subSequence] allocates a single array. */
+    private constructor(source: IntArray, startIndex: Int, endIndex: Int) {
+        utf32Chars = source.copyOfRange(startIndex, endIndex)
+    }
+
+    /** Number of code points (not UTF-16 code units) in this sequence. */
+    override val length: Int
+        get() = utf32Chars.size
+
+    /**
+     * Returns the code point at [index] as a [Char].
+     *
+     * @throws IndexOutOfBoundsException if [index] is outside `0 until length`.
+     * @throws UnsupportedOperationException if the code point lies outside the BMP and therefore
+     * does not fit into a single [Char].
+     */
+    override fun get(index: Int): Char {
+        if (index < 0 || index >= length) {
+            throw IndexOutOfBoundsException("Index $index is out of bounds for NonBmpString with length $length")
+        }
+        val codePoint = utf32Chars[index]
+        if (!Character.isBmpCodePoint(codePoint)) {
+            throw UnsupportedOperationException("Non-BMP characters not supported directly as Char")
+        }
+        return codePoint.toChar()
+    }
+
+    /**
+     * Returns the code points from [startIndex] (inclusive) to [endIndex] (exclusive) as a new
+     * [NonBmpString].
+     *
+     * @throws IndexOutOfBoundsException if the range is not within `0..length` or is reversed.
+     */
+    override fun subSequence(startIndex: Int, endIndex: Int): CharSequence {
+        if (startIndex < 0 || endIndex > length || startIndex > endIndex) {
+            throw IndexOutOfBoundsException("Invalid substring range")
+        }
+        return NonBmpString(utf32Chars, startIndex, endIndex)
+    }
+
+    /** Encodes the code points back to UTF-16, emitting a surrogate pair for each non-BMP one. */
+    override fun toString(): String {
+        // Pre-size for the common all-BMP case; the builder grows if surrogate pairs are needed.
+        val builder = StringBuilder(utf32Chars.size)
+        for (codePoint in utf32Chars) {
+            if (Character.isBmpCodePoint(codePoint)) {
+                builder.append(codePoint.toChar())
+            } else {
+                builder.append(Character.highSurrogate(codePoint))
+                builder.append(Character.lowSurrogate(codePoint))
+            }
+        }
+        return builder.toString()
+    }
+}
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/ParserToolBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/ParserToolBridge.kt
index 8b1fa8f..ab79da1 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/ParserToolBridge.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/ParserToolBridge.kt
@@ -5,7 +5,7 @@
tokens: Array<KorapXml2Conllu.Span>,
morpho: MutableMap<String, KorapXml2Conllu.MorphoSpan>?,
sentenceSpans: Array<KorapXml2Conllu.Span>?,
- text: String
+ text: NonBmpString
): MutableMap<String, KorapXml2Conllu.MorphoSpan> {
val sentence_tokens = mutableListOf<String>()
val sentence_token_offsets = mutableListOf<String>()
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/TaggerToolBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/TaggerToolBridge.kt
index 03e04e8..95a9315 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/TaggerToolBridge.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/TaggerToolBridge.kt
@@ -3,7 +3,7 @@
abstract class TaggerToolBridge : AnnotationToolBridge {
fun tagText(
- tokens: Array<KorapXml2Conllu.Span>, sentenceSpans: Array<KorapXml2Conllu.Span>?, text: String
+ tokens: Array<KorapXml2Conllu.Span>, sentenceSpans: Array<KorapXml2Conllu.Span>?, text: NonBmpString
): MutableMap<String, KorapXml2Conllu.MorphoSpan> {
val sentence_tokens = mutableListOf<String>()
val sentence_token_offsets = mutableListOf<String>()
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
index 39d8974..cf4941a 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
@@ -21,6 +21,7 @@
val goeTreeTagger = loadResource("goe.tree_tagger.zip").path
val zca20scrambled = loadResource("zca20-scrambled.zip").path
val wdf19 = loadResource("wdf19.zip").path
+ val wdd17 = loadResource("wdd17sample.zip").path
@Before
fun setUpStreams() {
@@ -188,6 +189,20 @@
}
@Test
+    fun canHandleNonBmpText() {
+        // Run with --word2vec on the wdd17 sample; the expected output contains non-BMP emojis.
+        debug(arrayOf("--word2vec", wdd17))
+        val output = outContent.toString()
+        val expectedLines = listOf(
+            "\n-- mach \uD83D\uDE48 \uD83D\uDE49 \uD83D\uDE4A 20 : 45 , 1. Feb .\n", // 🙈 🙉 🙊
+            "\nBereinige wenigstens die allergröbsten Sachen .\n",
+        )
+        expectedLines.forEach { expected ->
+            assertContains(output, expected)
+        }
+    }
+
+ @Test
fun canExtractExtraFeaturesByRegex() {
val args = arrayOf("-e" ,"(posting/id|div/id)",loadResource("wdf19.zip").path)
debug(args)
diff --git a/app/src/test/resources/wdd17sample.zip b/app/src/test/resources/wdd17sample.zip
new file mode 100644
index 0000000..8c92a18
--- /dev/null
+++ b/app/src/test/resources/wdd17sample.zip
Binary files differ