Fix constituency parsing Change-Id: Ifa0389864a40657a867e0a1b1f63273e4f279914

commit: 6cb3f27425e6347a1d06afb4d66ca07c71e8c35b [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Sat Nov 15 23:12:18 2025 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Sat Nov 15 23:12:18 2025 +0100
tree: aa6549202b2433592762f7cbf2bb85242abe4926
parent: 319f3d549aec526ad7b1b177974181f81956c044 [diff]
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/ConstituencyParserBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/ConstituencyParserBridge.kt
new file mode 100644
index 0000000..35b2490
--- /dev/null
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/ConstituencyParserBridge.kt

@@ -0,0 +1,60 @@
+package de.ids_mannheim.korapxmltools
+
+/**
+ * Base class for annotation bridges that produce constituency (phrase-structure) trees.
+ *
+ * Subclasses implement [parseConstituency]; the token-level [tagSentence] entry point required by
+ * [AnnotationToolBridge] is deliberately unsupported here and always throws.
+ */
+abstract class ConstituencyParserBridge : AnnotationToolBridge {
+
+    /** A reference from a [ConstituencyNode] to one of its children. */
+    sealed class ConstituencyChild {
+        /** Child that is itself a constituent node, identified by that node's id. */
+        data class NodeRef(val targetId: String) : ConstituencyChild()
+
+        /** Child that refers to a morphological (token-level) annotation. */
+        data class MorphoRef(val morphoId: String) : ConstituencyChild()
+    }
+
+    /**
+     * One node of a constituency tree.
+     *
+     * @property id identifier of this node
+     * @property label category label of the node
+     * @property from start offset of the span covered by the node
+     * @property to end offset of the span covered by the node
+     * @property children references to the node's children
+     */
+    data class ConstituencyNode(
+        val id: String,
+        val label: String,
+        val from: Int,
+        val to: Int,
+        val children: List<ConstituencyChild>
+    )
+
+    /**
+     * The constituency analysis of a single sentence.
+     *
+     * @property sentenceId identifier of the sentence the tree belongs to
+     * @property nodes all nodes of the tree as a flat list
+     */
+    data class ConstituencyTree(
+        val sentenceId: String,
+        val nodes: List<ConstituencyNode>
+    )
+
+    /**
+     * Parses a text and returns its constituency trees.
+     *
+     * @param tokens token spans of the text
+     * @param morpho morphological annotations; may be null or incomplete
+     * @param sentenceSpans sentence spans of the text; may be null
+     * @param text the full text as [NonBmpString]
+     * @return the constituency trees, one per sentence
+     */
+    abstract fun parseConstituency(
+        tokens: Array<KorapXmlTool.Span>,
+        morpho: MutableMap<String, KorapXmlTool.MorphoSpan>?,
+        sentenceSpans: Array<KorapXmlTool.Span>?,
+        text: NonBmpString
+    ): List<ConstituencyTree>
+
+    /**
+     * Hook for feeding POS tags found by the constituency parser back into the morpho map.
+     *
+     * This default is a no-op that hands back [morpho] unchanged; subclasses may override it.
+     *
+     * @return the (possibly updated) morpho map, or null if none was supplied
+     */
+    open fun updateMorphoFromConstituency(
+        tokens: Array<KorapXmlTool.Span>,
+        morpho: MutableMap<String, KorapXmlTool.MorphoSpan>?,
+        trees: List<ConstituencyTree>,
+        text: NonBmpString
+    ): MutableMap<String, KorapXmlTool.MorphoSpan>? = morpho
+
+    /**
+     * Required by [AnnotationToolBridge] but not applicable to constituency parsers.
+     *
+     * @throws UnsupportedOperationException always; callers must use [parseConstituency]
+     */
+    override fun tagSentence(
+        sentenceTokens: MutableList<String>,
+        sentenceTokenOffsets: MutableList<String>,
+        morphoMap: MutableMap<String, KorapXmlTool.MorphoSpan>?
+    ): Unit = throw UnsupportedOperationException("Constituency parsers use parseConstituency() instead")
+}

diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/CoreNLPBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/CoreNLPBridge.kt
new file mode 100644
index 0000000..2654ccd
--- /dev/null
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/CoreNLPBridge.kt

@@ -0,0 +1,234 @@
+package de.ids_mannheim.korapxmltools
+
+import edu.stanford.nlp.ling.CoreAnnotations
+import edu.stanford.nlp.ling.CoreLabel
+import edu.stanford.nlp.ling.HasOffset
+import edu.stanford.nlp.ling.Label
+import edu.stanford.nlp.pipeline.Annotation
+import edu.stanford.nlp.pipeline.StanfordCoreNLP
+import edu.stanford.nlp.trees.Tree
+import edu.stanford.nlp.trees.TreeCoreAnnotations
+import edu.stanford.nlp.util.CoreMap
+import java.io.File
+import java.util.*
+import java.util.logging.Logger
+
+/**
+ * [ConstituencyParserBridge] implementation backed by a Stanford CoreNLP pipeline running the
+ * `tokenize,ssplit,pos,parse` annotators.
+ *
+ * The whole document text is annotated in a single pass. CoreNLP performs its own tokenization
+ * and sentence splitting; the resulting trees are related to the caller's tokens through the
+ * begin/end positions CoreNLP reports.
+ *
+ * NOTE(review): those positions index into `text.toString()`. The code assumes the caller's token
+ * offsets use the same units - confirm this for texts containing non-BMP characters.
+ *
+ * @property model path of the CoreNLP parser model file
+ * @property logger destination for progress messages and warnings
+ * @property taggerModel optional path of a POS tagger model file for the `pos` annotator
+ * @throws IllegalArgumentException if [model] does not name an existing file
+ */
+class CoreNLPBridge(override val model: String, override val logger: Logger, val taggerModel: String? = null) : ConstituencyParserBridge() {
+    override val foundry = "corenlp"
+
+    private val pipeline: StanfordCoreNLP
+
+    init {
+        logger.info("Initializing CoreNLP parser with model $model" + (if (taggerModel != null) " and tagger model $taggerModel" else ""))
+        val props = Properties()
+
+        // Annotators needed for constituency parsing. Tokenization and sentence splitting are
+        // left to CoreNLP (no whitespace tokenization, no eol-only splitting).
+        props.setProperty("annotators", "tokenize,ssplit,pos,parse")
+
+        // The parser model is mandatory.
+        if (!File(model).exists()) {
+            throw IllegalArgumentException("Parser model file not found: $model")
+        }
+        props.setProperty("parse.model", model)
+        logger.info("Loading parse model from $model")
+
+        // POS model: prefer an existing taggerModel file, otherwise fall back to the bundled
+        // German tagger when the parser model name suggests German.
+        val usableTaggerModel = taggerModel?.takeIf { File(it).exists() }
+        if (taggerModel != null && usableTaggerModel == null) {
+            // Previously a wrong path was ignored without any hint to the user.
+            logger.warning("POS model file not found: $taggerModel - falling back to default POS model selection")
+        }
+        when {
+            usableTaggerModel != null -> {
+                props.setProperty("pos.model", usableTaggerModel)
+                logger.info("Loading POS model from $usableTaggerModel for parser")
+            }
+            model.contains("german") || model.contains("German") || model.contains("SR") -> {
+                props.setProperty("pos.model", "edu/stanford/nlp/models/pos-tagger/german/german-hgc.tagger")
+                logger.info("Using default German POS model for parser")
+            }
+            else ->
+                logger.warning("No POS model specified for parser - CoreNLP may fail. Use both -t and -P with corenlp.")
+        }
+
+        pipeline = StanfordCoreNLP(props)
+        logger.info("CoreNLP parser initialized successfully")
+    }
+
+    /**
+     * Parses the whole document with CoreNLP and converts every parsed sentence into a
+     * [ConstituencyTree].
+     *
+     * Sentence ids (`s1`, `s2`, ...) follow the order of the sentences found by CoreNLP.
+     * [sentenceSpans] is only checked for presence; the spans themselves are not used for
+     * splitting. NOTE(review): ids therefore match the caller's sentences only if both
+     * segmentations agree - confirm against the consumer.
+     *
+     * As a side effect, `xpos` values in [morpho] are updated from the parse when [morpho] is
+     * not null. Exceptions raised during annotation or conversion are logged and end processing;
+     * trees built so far are still returned.
+     *
+     * @param tokens token spans of the document
+     * @param morpho morphological annotations to update with POS tags, or null
+     * @param sentenceSpans sentence spans; if null or empty nothing is parsed
+     * @param text the full document text
+     * @return the trees of all sentences for which CoreNLP produced a parse
+     */
+    override fun parseConstituency(
+        tokens: Array<KorapXmlTool.Span>,
+        morpho: MutableMap<String, KorapXmlTool.MorphoSpan>?,
+        sentenceSpans: Array<KorapXmlTool.Span>?,
+        text: NonBmpString
+    ): List<ConstituencyTree> {
+        val trees = mutableListOf<ConstituencyTree>()
+
+        if (sentenceSpans == null || sentenceSpans.isEmpty()) {
+            logger.warning("No sentence spans provided for constituency parsing")
+            return trees
+        }
+
+        try {
+            // Annotate the entire document at once so that CoreNLP's begin/end positions are
+            // relative to the document text rather than to a single sentence.
+            val annotation = Annotation(text.toString())
+            pipeline.annotate(annotation)
+
+            // Annotation values come from Java and may be null when an annotator did not run.
+            val sentences: List<CoreMap> =
+                annotation.get(CoreAnnotations.SentencesAnnotation::class.java) ?: emptyList()
+            if (sentences.isEmpty()) {
+                logger.warning("CoreNLP produced no sentences")
+                return trees
+            }
+
+            for ((sentenceIdx, sentence) in sentences.withIndex()) {
+                val tree: Tree = sentence.get(TreeCoreAnnotations.TreeAnnotation::class.java) ?: continue
+                val coreLabels: List<CoreLabel> =
+                    sentence.get(CoreAnnotations.TokensAnnotation::class.java) ?: continue
+                if (coreLabels.isEmpty()) continue
+
+                // Caller tokens lying completely inside CoreNLP's sentence boundaries.
+                val sentStart = coreLabels.first().beginPosition()
+                val sentEnd = coreLabels.last().endPosition()
+                val sentenceTokens = tokens.filter { it.from >= sentStart && it.to <= sentEnd }
+
+                // Offset adjustment is 0: positions are already document-level.
+                trees.add(convertTree(tree, "s${sentenceIdx + 1}", sentenceTokens, 0))
+
+                if (morpho != null) {
+                    updatePOSFromTree(tree, sentenceTokens, morpho)
+                }
+            }
+        } catch (e: Exception) {
+            // Route the stack trace through the logger instead of printing it to stderr.
+            logger.log(java.util.logging.Level.WARNING, "Failed to parse document: ${e.message}", e)
+        }
+
+        return trees
+    }
+
+    /**
+     * Flattens a CoreNLP [tree] into a [ConstituencyTree] for the sentence [sentenceId].
+     *
+     * @param tokens caller tokens of the sentence, used only as an offset fallback
+     * @param sentenceOffsetInDoc value added to every CoreNLP position (0 for document-level positions)
+     */
+    private fun convertTree(
+        tree: Tree,
+        sentenceId: String,
+        tokens: List<KorapXmlTool.Span>,
+        sentenceOffsetInDoc: Int
+    ): ConstituencyTree {
+        val nodes = mutableListOf<ConstituencyNode>()
+        convertNode(tree, tree, sentenceId, tokens, nodes, sentenceOffsetInDoc)
+        return ConstituencyTree(sentenceId, nodes)
+    }
+
+    /**
+     * Appends [node] and, recursively, all of its non-leaf descendants to [nodes] (pre-order).
+     *
+     * Node ids have the form `<sentenceId>_n<number>`, where the number is the node's
+     * `nodeNumber` relative to [root]. Leaves are not emitted as nodes. Non-leaf children are
+     * referenced as [ConstituencyChild.MorphoRef] when they are pre-terminals and as
+     * [ConstituencyChild.NodeRef] otherwise.
+     */
+    private fun convertNode(
+        node: Tree,
+        root: Tree,
+        sentenceId: String,
+        tokens: List<KorapXmlTool.Span>,
+        nodes: MutableList<ConstituencyNode>,
+        sentenceOffsetInDoc: Int
+    ) {
+        val leaves: List<Tree> = node.getLeaves()
+        if (leaves.isEmpty()) return
+
+        val nodeId = "${sentenceId}_n${node.nodeNumber(root)}"
+
+        // The span of a node runs from its first leaf's start to its last leaf's end.
+        val firstLeafLabel: Label? = leaves.first().label()
+        val lastLeafLabel: Label? = leaves.last().label()
+
+        val from: Int
+        val to: Int
+        if (firstLeafLabel is HasOffset && lastLeafLabel is HasOffset) {
+            from = firstLeafLabel.beginPosition() + sentenceOffsetInDoc
+            to = lastLeafLabel.endPosition() + sentenceOffsetInDoc
+        } else {
+            // Fallback: span of the sentence's caller tokens. There may be none when no caller
+            // token fits inside CoreNLP's sentence boundaries; indexing blindly used to throw
+            // and abort the remainder of the document.
+            val firstToken = tokens.firstOrNull()
+            if (firstToken == null) {
+                logger.warning("Cannot determine offsets for node $nodeId - skipping subtree")
+                return
+            }
+            from = firstToken.from
+            to = tokens.last().to
+        }
+
+        val nonLeafChildren: List<Tree> = node.children().filterNot { it.isLeaf() }
+
+        val children: List<ConstituencyChild> = nonLeafChildren.map { child ->
+            val childId = "${sentenceId}_n${child.nodeNumber(root)}"
+            if (child.isPreTerminal) {
+                // Points to morpho.xml
+                ConstituencyChild.MorphoRef(childId)
+            } else {
+                // Points to another constituent node
+                ConstituencyChild.NodeRef(childId)
+            }
+        }
+
+        nodes.add(
+            ConstituencyNode(
+                id = nodeId,
+                label = node.value() ?: "UNKNOWN",
+                from = from,
+                to = to,
+                children = children
+            )
+        )
+
+        for (child in nonLeafChildren) {
+            convertNode(child, root, sentenceId, tokens, nodes, sentenceOffsetInDoc)
+        }
+    }
+
+    /**
+     * Copies the POS tags of [tree]'s pre-terminals into the `xpos` field of [morpho].
+     *
+     * Entries are keyed `"<from>-<to>"` using the caller token's offsets. Existing values of all
+     * other fields are kept; missing ones default to `"_"`.
+     *
+     * Alignment of leaves and [tokens]: when both lists have the same length they are paired by
+     * position. When the lengths differ (CoreNLP tokenized differently), pairing by position
+     * would attach tags to the wrong tokens, so a leaf is only used if a caller token has
+     * exactly the same begin/end offsets.
+     */
+    private fun updatePOSFromTree(
+        tree: Tree,
+        tokens: List<KorapXmlTool.Span>,
+        morpho: MutableMap<String, KorapXmlTool.MorphoSpan>
+    ) {
+        val leaves: List<Tree> = tree.getLeaves()
+        val alignByIndex = leaves.size == tokens.size
+        val tokensByOffsets: Map<Pair<Int, Int>, KorapXmlTool.Span> =
+            if (alignByIndex) emptyMap() else tokens.associateBy { it.from to it.to }
+
+        for ((idx, leaf) in leaves.withIndex()) {
+            val parent: Tree? = leaf.parent(tree)
+            if (parent == null || !parent.isPreTerminal()) continue
+
+            val token: KorapXmlTool.Span? = if (alignByIndex) {
+                tokens[idx]
+            } else {
+                val leafLabel: Label? = leaf.label()
+                if (leafLabel is HasOffset) {
+                    tokensByOffsets[leafLabel.beginPosition() to leafLabel.endPosition()]
+                } else {
+                    null
+                }
+            }
+            if (token == null) continue
+
+            val pos: String? = parent.label()?.value()
+            val spanKey = "${token.from}-${token.to}"
+            val existing = morpho[spanKey]
+
+            // Update or create the morpho entry; only xpos is taken from the parse.
+            morpho[spanKey] = KorapXmlTool.MorphoSpan(
+                lemma = existing?.lemma ?: "_",
+                upos = existing?.upos ?: "_",
+                xpos = pos ?: existing?.xpos ?: "_",
+                feats = existing?.feats ?: "_",
+                head = existing?.head ?: "_",
+                deprel = existing?.deprel ?: "_",
+                misc = existing?.misc
+            )
+        }
+    }
+}

diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/CoreNLPTaggerBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/CoreNLPTaggerBridge.kt
new file mode 100644
index 0000000..090d725
--- /dev/null
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/CoreNLPTaggerBridge.kt

@@ -0,0 +1,91 @@
+package de.ids_mannheim.korapxmltools
+
+import edu.stanford.nlp.ling.CoreAnnotations
+import edu.stanford.nlp.ling.CoreLabel
+import edu.stanford.nlp.pipeline.Annotation
+import edu.stanford.nlp.pipeline.StanfordCoreNLP
+import edu.stanford.nlp.util.CoreMap
+import java.io.File
+import java.util.*
+import java.util.logging.Logger
+
+/**
+ * POS tagger bridge backed by a Stanford CoreNLP pipeline running `tokenize,ssplit,pos`.
+ *
+ * The pipeline is configured for whitespace tokenization and end-of-line sentence splitting, so
+ * the caller's tokens - joined by single spaces - are intended to be tagged one-to-one.
+ *
+ * @property model path of the CoreNLP POS tagger model file
+ * @property logger destination for progress messages and warnings
+ * @throws IllegalArgumentException if [model] does not name an existing file
+ */
+class CoreNLPTaggerBridge(override val model: String, override val logger: Logger) : TaggerToolBridge() {
+    override val foundry = "corenlp"
+
+    private val pipeline: StanfordCoreNLP
+
+    // Whitespace or Unicode separator characters inside a single token. Left as is, they would
+    // make the whitespace tokenizer split the token and shift every following tag.
+    private val tokenInternalWhitespace = Regex("[\\s\\p{Z}]+")
+
+    init {
+        logger.info("Initializing CoreNLP tagger with model $model")
+        val props = Properties()
+
+        // Basic annotators for POS tagging
+        props.setProperty("annotators", "tokenize,ssplit,pos")
+
+        // The POS model is mandatory.
+        if (!File(model).exists()) {
+            throw IllegalArgumentException("Model file not found: $model")
+        }
+        props.setProperty("pos.model", model)
+        logger.info("Loading POS model from $model")
+
+        // Informational only: no German-specific settings are applied here.
+        if (model.contains("german") || model.contains("German")) {
+            logger.info("Detected German model")
+        }
+
+        // Use whitespace tokenization since we have our own tokens
+        props.setProperty("tokenize.whitespace", "true")
+        props.setProperty("ssplit.eolonly", "true")
+
+        pipeline = StanfordCoreNLP(props)
+        logger.info("CoreNLP tagger initialized successfully")
+    }
+
+    /**
+     * Tags one sentence and stores the result in [morphoMap].
+     *
+     * The i-th CoreNLP token is stored under the i-th entry of [sentenceTokenOffsets]. Tokens
+     * that are empty or contain whitespace are sent to CoreNLP with `_` placeholders so that
+     * token counts stay aligned; the keys written to [morphoMap] are unaffected by this.
+     * Tokens of every sentence CoreNLP reports are used, not just the first one.
+     *
+     * @param sentenceTokens surface forms of the sentence's tokens; nothing happens if empty
+     * @param sentenceTokenOffsets map keys for the tokens, parallel to [sentenceTokens]
+     * @param morphoMap map receiving one [KorapXmlTool.MorphoSpan] per token; may be null
+     * @throws Exception any failure raised by CoreNLP is logged and rethrown
+     */
+    @Throws(java.lang.ArrayIndexOutOfBoundsException::class, java.lang.Exception::class)
+    override fun tagSentence(
+        sentenceTokens: MutableList<String>,
+        sentenceTokenOffsets: MutableList<String>,
+        morphoMap: MutableMap<String, KorapXmlTool.MorphoSpan>?
+    ) {
+        if (sentenceTokens.isEmpty()) return
+
+        // Build the sentence text; every input token must remain exactly one whitespace-free chunk.
+        val sentenceText = sentenceTokens.joinToString(" ") { token ->
+            token.replace(tokenInternalWhitespace, "_").ifEmpty { "_" }
+        }
+
+        try {
+            val annotation = Annotation(sentenceText)
+            pipeline.annotate(annotation)
+
+            // Annotation values come from Java and may be null.
+            val sentences: List<CoreMap> =
+                annotation.get(CoreAnnotations.SentencesAnnotation::class.java) ?: emptyList()
+            if (sentences.isEmpty()) {
+                logger.warning("CoreNLP produced no sentences for: $sentenceText")
+                return
+            }
+
+            // Collect the tokens of all sentences in order, in case CoreNLP split the input.
+            val tokens: List<CoreLabel> = sentences.flatMap { sentence ->
+                sentence.get(CoreAnnotations.TokensAnnotation::class.java) ?: emptyList<CoreLabel>()
+            }
+            if (tokens.size != sentenceTokens.size) {
+                logger.warning(
+                    "CoreNLP returned ${tokens.size} tokens for ${sentenceTokens.size} input tokens; POS tags may be misaligned"
+                )
+            }
+
+            // Map POS tags back to our tokens by position.
+            tokens.forEachIndexed { idx, token ->
+                if (idx < sentenceTokenOffsets.size) {
+                    val pos = token.get(CoreAnnotations.PartOfSpeechAnnotation::class.java)
+                    // The annotator list above contains no lemmatizer, so this is normally
+                    // null and the lemma falls back to "_".
+                    val lemma = token.get(CoreAnnotations.LemmaAnnotation::class.java)
+
+                    val taggedWord = KorapXmlTool.MorphoSpan(
+                        lemma = lemma ?: "_",
+                        xpos = pos ?: "_",
+                        upos = "_",  // CoreNLP doesn't provide universal POS by default
+                        feats = "_"
+                    )
+                    morphoMap?.set(sentenceTokenOffsets[idx], taggedWord)
+                }
+            }
+        } catch (e: Exception) {
+            logger.warning("Failed to tag sentence: ${e.message}")
+            throw e
+        }
+    }
+}
commit	6cb3f27425e6347a1d06afb4d66ca07c71e8c35b	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Sat Nov 15 23:12:18 2025 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Sat Nov 15 23:12:18 2025 +0100
tree	aa6549202b2433592762f7cbf2bb85242abe4926
parent	319f3d549aec526ad7b1b177974181f81956c044 [diff]