Optimize Krill stream generation

Precompute the sorted foundry list and per-foundry prefixes once instead
of re-deriving them per token, replace the linear indexOfFirst/indexOfLast
token scans with binary searches (lowerBoundTokenFrom/upperBoundTokenTo)
when resolving structural span boundaries, build sentence spans with a
single forward cursor over the token list instead of filtering all tokens
per sentence, and pre-sort the structural spans attached to each token
once up front rather than sorting them at every emission point.

Change-Id: If696d898d966e00e1dc9ba7289ecd897d2f3051b
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
index e131a9b..295265a 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KrillJsonGenerator.kt
@@ -404,6 +404,25 @@
return emptyList()
}
val result = mutableListOf<String>()
+ data class FoundryMorphoData(
+ val foundry: String,
+ val prefix: String?,
+ val morphoSpans: MutableMap<String, MorphoSpan>
+ )
+ val sortedFoundries = textData.morphoByFoundry.entries
+ .sortedBy { it.key }
+ .map { (foundry, morphoSpans) ->
+ FoundryMorphoData(
+ foundry = foundry,
+ prefix = when (foundry) {
+ "tree_tagger" -> "tt"
+ "marmot-malt" -> "marmot"
+ "base" -> null
+ else -> foundry
+ },
+ morphoSpans = morphoSpans
+ )
+ }
// Build offset-to-index map for resolving dependency heads and structural spans
val offsetToIndex = mutableMapOf<String, Int>()
@@ -413,26 +432,18 @@
// Collect inverse dependency relations and ROOT dependencies
data class InverseDep(val dependentIndex: Int, val foundry: String, val deprel: String)
- data class RootDep(val tokenIndex: Int, val foundry: String)
val inverseDeps = mutableMapOf<Int, MutableList<InverseDep>>()
- val rootTokens = mutableListOf<RootDep>()
tokens.forEachIndexed { index, token ->
val spanKey = "${token.from}-${token.to}"
- textData.morphoByFoundry.keys.forEach { foundry ->
- val morphoSpan = textData.morphoByFoundry[foundry]?.get(spanKey)
+ sortedFoundries.forEach { foundryData ->
+ val morphoSpan = foundryData.morphoSpans[spanKey]
if (morphoSpan != null && morphoSpan.head != null && morphoSpan.head != "_" && morphoSpan.deprel != null && morphoSpan.deprel != "_") {
val headStr = morphoSpan.head!!
- val prefix = when(foundry) {
- "tree_tagger" -> "tt"
- "marmot-malt" -> "marmot"
- else -> foundry
- }
+ val prefix = foundryData.prefix ?: foundryData.foundry
// Check if this is a ROOT dependency (head == 0)
- if (headStr == "0" || (headStr.contains("-") && headStr.startsWith("0-"))) {
- rootTokens.add(RootDep(index, prefix))
- } else {
+ if (!(headStr == "0" || (headStr.contains("-") && headStr.startsWith("0-")))) {
val resolvedHeadIndex = if (headStr.contains("-")) {
offsetToIndex[headStr]
} else {
@@ -483,39 +494,37 @@
))
}
-
- // Build token-to-sentence map for ROOT edge generation
- data class SentenceInfo(val from: Int, val to: Int, val tokenFrom: Int, val tokenTo: Int)
- val tokenToSentence = mutableMapOf<Int, SentenceInfo>()
-
// Add sentence spans (tokenTo is exclusive: first token after the span)
- sentences.forEachIndexed { sentIdx, sentence ->
- val sentTokens = tokens.filter { it.from >= sentence.from && it.to <= sentence.to }
- if (sentTokens.isNotEmpty()) {
- val firstTokenIdx = tokens.indexOf(sentTokens.first())
- val lastTokenIdx = tokens.indexOf(sentTokens.last())
- val sentInfo = SentenceInfo(
- from = sentTokens.first().from,
- to = sentTokens.last().to,
- tokenFrom = firstTokenIdx,
- tokenTo = lastTokenIdx + 1 // Exclusive end
- )
+ var sentenceTokenCursor = 0
+ sentences.forEach { sentence ->
+ while (sentenceTokenCursor < tokens.size && tokens[sentenceTokenCursor].to <= sentence.from) {
+ sentenceTokenCursor++
+ }
- // Map all tokens in this sentence to the sentence info
- for (i in firstTokenIdx until sentInfo.tokenTo) {
- tokenToSentence[i] = sentInfo
+ var firstTokenIdx = -1
+ var lastTokenIdx = -1
+ var idx = sentenceTokenCursor
+ while (idx < tokens.size && tokens[idx].from < sentence.to) {
+ val token = tokens[idx]
+ if (token.from >= sentence.from && token.to <= sentence.to) {
+ if (firstTokenIdx == -1) firstTokenIdx = idx
+ lastTokenIdx = idx
}
+ idx++
+ }
+ if (firstTokenIdx != -1) {
baseStructureSpans.add(StructureSpan(
layer = "base/s:s",
- from = sentInfo.from,
- to = sentInfo.to,
- tokenFrom = sentInfo.tokenFrom,
- tokenTo = sentInfo.tokenTo,
+ from = tokens[firstTokenIdx].from,
+ to = tokens[lastTokenIdx].to,
+ tokenFrom = firstTokenIdx,
+ tokenTo = lastTokenIdx + 1,
depth = 2,
attributes = emptyMap()
))
}
+ sentenceTokenCursor = idx
}
// Combine base structure spans with dereko spans
@@ -528,9 +537,15 @@
// Already resolved
span
} else {
- // Find first and last token covered by this span
- var tokenFrom = tokens.indexOfFirst { it.from >= span.from && it.from < span.to }
- var lastTokenIndex = tokens.indexOfLast { it.to > span.from && it.to <= span.to }
+ var tokenFrom = lowerBoundTokenFrom(tokens, span.from)
+ if (tokenFrom >= tokens.size || tokens[tokenFrom].from >= span.to) {
+ tokenFrom = -1
+ }
+
+ var lastTokenIndex = upperBoundTokenTo(tokens, span.to) - 1
+ if (lastTokenIndex < 0 || tokens[lastTokenIndex].to <= span.from) {
+ lastTokenIndex = -1
+ }
// Handle edge cases
if (tokenFrom == -1) tokenFrom = 0
@@ -553,6 +568,8 @@
resolvedStructureSpans.forEach { span ->
spansByToken.getOrPut(span.tokenFrom) { mutableListOf() }.add(span)
}
+ val spanComparator = compareByDescending<StructureSpan> { it.depth }.thenBy { it.layer }
+ spansByToken.values.forEach { spans -> spans.sortWith(spanComparator) }
// Count paragraph spans (name="p") from original document structure only
// Don't count the base/s:p wrapper we added programmatically
@@ -580,8 +597,8 @@
tokenAnnotations.add(jsonString("-:tokens\$<i>${tokens.size}"))
// Add all structural spans that start at token 0 or cover the whole document
- val spansAtZero = spansByToken[0] ?: emptyList()
- spansAtZero.sortedWith(compareBy({ -it.depth }, { it.layer })).forEach { span ->
+ val spansAtZero = spansByToken[0].orEmpty()
+ spansAtZero.forEach { span ->
val spanAnnotation = if (span.attributes.isEmpty()) {
"<>:${span.layer}\$<b>64<i>${span.from}<i>${span.to}<i>${span.tokenTo}<b>${span.depth}"
} else {
@@ -603,7 +620,7 @@
}
} else {
// Add structural spans that start at this token
- spansByToken[index]?.sortedWith(compareBy({ -it.depth }, { it.layer }))?.forEach { span ->
+ spansByToken[index]?.forEach { span ->
val spanAnnotation = if (span.attributes.isEmpty()) {
"<>:${span.layer}\$<b>64<i>${span.from}<i>${span.to}<i>${span.tokenTo}<b>${span.depth}"
} else {
@@ -649,16 +666,11 @@
}
// Collect annotations from all foundries for this token
- val sortedFoundries = textData.morphoByFoundry.keys.sorted()
- sortedFoundries.forEach { foundry ->
- val morphoSpan = textData.morphoByFoundry[foundry]?.get(spanKey)
+ sortedFoundries.forEach { foundryData ->
+ val foundry = foundryData.foundry
+ val morphoSpan = foundryData.morphoSpans[spanKey]
if (morphoSpan != null) {
- val prefix = when(foundry) {
- "tree_tagger" -> "tt"
- "marmot-malt" -> "marmot"
- "base" -> null // Skip base for most annotations
- else -> foundry
- }
+ val prefix = foundryData.prefix
if (prefix != null) {
// Morphological features (sorted)
@@ -781,6 +793,34 @@
return result
}
+ private fun lowerBoundTokenFrom(tokens: List<Span>, target: Int): Int {
+ var low = 0
+ var high = tokens.size
+ while (low < high) {
+ val mid = (low + high) ushr 1
+ if (tokens[mid].from < target) {
+ low = mid + 1
+ } else {
+ high = mid
+ }
+ }
+ return low
+ }
+
+ private fun upperBoundTokenTo(tokens: List<Span>, target: Int): Int {
+ var low = 0
+ var high = tokens.size
+ while (low < high) {
+ val mid = (low + high) ushr 1
+ if (tokens[mid].to <= target) {
+ low = mid + 1
+ } else {
+ high = mid
+ }
+ }
+ return low
+ }
+
private fun shouldKeepTokenForKrill(text: NonBmpString, span: Span): Boolean {
if (text.length == 0) return true
val safeFrom = span.from.coerceIn(0, text.length)