Let CoreNLP use the base tokenization
Change-Id: I383a8b12a96f7a33d6a76a603bd919f6f732c4ba
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f8d6aeb..132c65e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@
- Plain NOW export startup and progress diagnostics now log ZIP open time and first-output timing more explicitly, making it easier to distinguish ZIP indexing overhead from actual extraction work
- Plain Word2Vec export now uses the same archive-order streaming ZIP path as plain NOW output, including the faster `java.util.zip.ZipFile` opener for large archives with many entries
- Plain CoNLL-U export now also uses the archive-order streaming ZIP path when exactly one base ZIP is given, while multi-ZIP and foundry-paired CoNLL-U input stays on the ordered pipeline
+- CoreNLP now uses the base tokenization and sentence boundaries instead of its own, preventing morpho/constituency drifts (#38).
## [v3.3.0] - 2026-03-26
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/CoreNLPBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/CoreNLPBridge.kt
index 2654ccd..3c6b9ea 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/CoreNLPBridge.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/CoreNLPBridge.kt
@@ -17,6 +17,93 @@
override val foundry = "corenlp"
private val pipeline: StanfordCoreNLP
+ internal data class PosAssignment(val from: Int, val to: Int, val pos: String)
+
+ internal companion object {
+ internal fun mergeMorphoPosAssignmentsByExactOffsets(
+ assignments: List<PosAssignment>,
+ tokens: List<KorapXmlTool.Span>,
+ morpho: MutableMap<String, KorapXmlTool.MorphoSpan>,
+ logger: Logger
+ ) {
+ val tokenSpans = tokens.associateBy { "${it.from}-${it.to}" }
+ var skippedRetokenizedLeaves = 0
+
+ assignments.forEach { assignment ->
+ val spanKey = "${assignment.from}-${assignment.to}"
+ if (!tokenSpans.containsKey(spanKey)) {
+ skippedRetokenizedLeaves++
+ return@forEach
+ }
+
+ val existing = morpho[spanKey]
+ morpho[spanKey] = KorapXmlTool.MorphoSpan(
+ lemma = existing?.lemma ?: "_",
+ upos = existing?.upos ?: "_",
+ xpos = assignment.pos.ifBlank { existing?.xpos ?: "_" },
+ feats = existing?.feats ?: "_",
+ head = existing?.head ?: "_",
+ deprel = existing?.deprel ?: "_",
+ misc = existing?.misc
+ )
+ }
+
+ if (skippedRetokenizedLeaves > 0) {
+ logger.fine(
+ "Skipped $skippedRetokenizedLeaves CoreNLP POS leaf updates without exact token span match"
+ )
+ }
+ }
+
+ internal fun mergeMorphoPosAssignmentsByTokenOrder(
+ posTags: List<String>,
+ tokens: List<KorapXmlTool.Span>,
+ morpho: MutableMap<String, KorapXmlTool.MorphoSpan>
+ ) {
+ posTags.forEachIndexed { index, pos ->
+ if (index >= tokens.size) return@forEachIndexed
+ val spanKey = "${tokens[index].from}-${tokens[index].to}"
+ val existing = morpho[spanKey]
+ morpho[spanKey] = KorapXmlTool.MorphoSpan(
+ lemma = existing?.lemma ?: "_",
+ upos = existing?.upos ?: "_",
+ xpos = pos.ifBlank { existing?.xpos ?: "_" },
+ feats = existing?.feats ?: "_",
+ head = existing?.head ?: "_",
+ deprel = existing?.deprel ?: "_",
+ misc = existing?.misc
+ )
+ }
+ }
+
+ internal fun updateMorphoFromTreeByExactOffsets(
+ tree: Tree,
+ tokens: List<KorapXmlTool.Span>,
+ morpho: MutableMap<String, KorapXmlTool.MorphoSpan>,
+ logger: Logger
+ ) {
+ val assignments = mutableListOf<PosAssignment>()
+ val leaves = tree.getLeaves<Tree>()
+ for (leafObj in leaves) {
+ val leaf = leafObj as Tree
+ val parent = leaf.parent(tree) ?: continue
+ if (!parent.isPreTerminal) continue
+
+ val label = leaf.label()
+ if (label !is HasOffset) continue
+
+ val pos = parent.label()?.value()
+ assignments.add(
+ PosAssignment(
+ from = label.beginPosition(),
+ to = label.endPosition(),
+ pos = pos ?: "_"
+ )
+ )
+ }
+ mergeMorphoPosAssignmentsByExactOffsets(assignments, tokens, morpho, logger)
+ }
+ }
init {
logger.info("Initializing CoreNLP parser with model $model" + (if (taggerModel != null) " and tagger model $taggerModel" else ""))
@@ -46,9 +133,10 @@
logger.warning("No POS model specified for parser - CoreNLP may fail. Use both -t and -P with corenlp.")
}
- // Use default sentence splitting (not eolonly)
- // tokenize.whitespace=true would prevent CoreNLP from retokenizing, but we want it to
- // retokenize for better parse quality
+ // Use the tool's tokenization and sentence boundaries to keep constituency
+ // and morpho output aligned with the input ZIP token layer.
+ props.setProperty("tokenize.whitespace", "true")
+ props.setProperty("ssplit.eolonly", "true")
pipeline = StanfordCoreNLP(props)
logger.info("CoreNLP parser initialized successfully")
@@ -68,43 +156,43 @@
}
try {
- // Annotate the ENTIRE document text at once, like the Java implementation does
- // This ensures CoreNLP gives us document-level offsets directly
- val docText = text.toString()
- val annotation = Annotation(docText)
- pipeline.annotate(annotation)
-
- // Get all sentences from CoreNLP
- val sentences = annotation.get(CoreAnnotations.SentencesAnnotation::class.java)
-
- if (sentences.isEmpty()) {
- logger.warning("CoreNLP produced no sentences")
- return trees
- }
-
- // Process each sentence
- sentences.forEachIndexed { sentenceIdx, sentence ->
+ sentenceSpans.forEachIndexed { sentenceIdx, sentenceSpan ->
val sentenceId = "s${sentenceIdx + 1}"
+ val sentenceTokens = tokens.filter { token ->
+ token.from >= sentenceSpan.from && token.to <= sentenceSpan.to
+ }
+ if (sentenceTokens.isEmpty()) {
+ return@forEachIndexed
+ }
+
+ val sentenceText = sentenceTokens.joinToString(" ") { token ->
+ text.substring(token.from, token.to)
+ }
+ val annotation = Annotation(sentenceText)
+ pipeline.annotate(annotation)
+ val sentences = annotation.get(CoreAnnotations.SentencesAnnotation::class.java)
+ if (sentences.isEmpty()) {
+ logger.warning("CoreNLP produced no sentences for sentence $sentenceId")
+ return@forEachIndexed
+ }
+ val sentence = sentences[0]
val tree = sentence.get(TreeCoreAnnotations.TreeAnnotation::class.java)
if (tree != null) {
- // Get tokens for this sentence based on CoreNLP's sentence boundaries
val coreLabels = sentence.get(CoreAnnotations.TokensAnnotation::class.java)
+ if (coreLabels.size != sentenceTokens.size) {
+ logger.warning(
+ "CoreNLP token count mismatch for $sentenceId: parser=${coreLabels.size}, input=${sentenceTokens.size}; " +
+ "skipping constituency tree to avoid misalignment"
+ )
+ return@forEachIndexed
+ }
+
if (coreLabels.isNotEmpty()) {
- val sentStart = coreLabels[0].beginPosition()
- val sentEnd = coreLabels[coreLabels.size - 1].endPosition()
-
- val sentenceTokens = tokens.filter { token ->
- token.from >= sentStart && token.to <= sentEnd
- }
-
- // Convert Stanford tree to our ConstituencyTree format
- // No offset adjustment needed since CoreNLP already has document-level offsets
- val constituencyTree = convertTree(tree, sentenceId, sentenceTokens, 0)
+ val constituencyTree = convertTree(tree, sentenceId, sentenceTokens)
trees.add(constituencyTree)
- // Optionally update morpho with POS tags from CoreNLP
if (morpho != null) {
updatePOSFromTree(tree, sentenceTokens, morpho)
}
@@ -122,14 +210,15 @@
private fun convertTree(
tree: Tree,
sentenceId: String,
- tokens: List<KorapXmlTool.Span>,
- sentenceOffsetInDoc: Int
+ tokens: List<KorapXmlTool.Span>
): ConstituencyTree {
val nodes = mutableListOf<ConstituencyNode>()
+ val rootLeaves = tree.getLeaves<Tree>().map { it as Tree }
+ val leafIndices = java.util.IdentityHashMap<Tree, Int>()
+ rootLeaves.forEachIndexed { index, leaf -> leafIndices[leaf] = index }
// Recursively convert tree nodes
- // We need to pass offset adjustment since CoreNLP gives offsets relative to sentence text
- convertNode(tree, tree, sentenceId, tokens, nodes, sentenceOffsetInDoc)
+ convertNode(tree, tree, sentenceId, tokens, nodes, leafIndices)
return ConstituencyTree(sentenceId, nodes)
}
@@ -140,33 +229,25 @@
sentenceId: String,
tokens: List<KorapXmlTool.Span>,
nodes: MutableList<ConstituencyNode>,
- sentenceOffsetInDoc: Int
+ leafIndices: java.util.IdentityHashMap<Tree, Int>
) {
val nodeNumber = node.nodeNumber(root)
val nodeId = "${sentenceId}_n${nodeNumber}"
- // Get character offsets from leaves
- val leaves: List<Tree> = node.getLeaves()
+ // Map constituent span boundaries back to the original token spans.
+ val leaves: List<Tree> = node.getLeaves<Tree>().map { it as Tree }
if (leaves.isEmpty()) return
- val firstLeafLabel: Label = leaves[0].label()
- val lastLeafLabel: Label = leaves[leaves.size - 1].label()
-
- // Get offsets from the leaf labels
- // CoreNLP gives offsets relative to the sentence text we fed it,
- // so we need to add sentenceOffsetInDoc to get document-level offsets
- val from: Int
- val to: Int
-
- if (firstLeafLabel is HasOffset && lastLeafLabel is HasOffset) {
- from = (firstLeafLabel as HasOffset).beginPosition() + sentenceOffsetInDoc
- to = (lastLeafLabel as HasOffset).endPosition() + sentenceOffsetInDoc
- } else {
- // Fallback: use first and last tokens from sentence
- from = tokens[0].from
- to = tokens[tokens.size - 1].to
+ val firstLeafIndex = leafIndices[leaves.first()] ?: return
+ val lastLeafIndex = leafIndices[leaves.last()] ?: return
+ if (firstLeafIndex >= tokens.size || lastLeafIndex >= tokens.size) {
+ logger.warning("CoreNLP leaf/token index mismatch in $sentenceId: $firstLeafIndex..$lastLeafIndex vs ${tokens.size} tokens")
+ return
}
+ val from = tokens[firstLeafIndex].from
+ val to = tokens[lastLeafIndex].to
+
// Get children
val children = mutableListOf<ConstituencyChild>()
for (child in node.children()) {
@@ -197,7 +278,7 @@
// Recursively process children
for (child in node.children()) {
if (!child.isLeaf()) {
- convertNode(child, root, sentenceId, tokens, nodes, sentenceOffsetInDoc)
+ convertNode(child, root, sentenceId, tokens, nodes, leafIndices)
}
}
}
@@ -207,28 +288,12 @@
tokens: List<KorapXmlTool.Span>,
morpho: MutableMap<String, KorapXmlTool.MorphoSpan>
) {
- // Get POS tags from pre-terminal nodes
- val leaves: List<Tree> = tree.getLeaves()
- leaves.forEachIndexed { idx, leaf: Tree ->
- if (idx < tokens.size) {
- val parent: Tree? = leaf.parent(tree)
- if (parent != null && parent.isPreTerminal()) {
- val pos: String? = parent.label()?.value()
- val spanKey = "${tokens[idx].from}-${tokens[idx].to}"
- val existing = morpho[spanKey]
-
- // Update or create morpho entry with POS tag
- morpho[spanKey] = KorapXmlTool.MorphoSpan(
- lemma = existing?.lemma ?: "_",
- upos = existing?.upos ?: "_",
- xpos = pos ?: existing?.xpos ?: "_",
- feats = existing?.feats ?: "_",
- head = existing?.head ?: "_",
- deprel = existing?.deprel ?: "_",
- misc = existing?.misc
- )
- }
- }
+ val posTags = tree.getLeaves<Tree>().map { leafObj ->
+ val leaf = leafObj as Tree
+ val parent = leaf.parent(tree)
+ if (parent != null && parent.isPreTerminal) parent.label()?.value() ?: "_"
+ else "_"
}
+ mergeMorphoPosAssignmentsByTokenOrder(posTags, tokens, morpho)
}
}