Add constituencies and non-base sentences to krill output. Change-Id: Ib56c852efca64dd28f7f487a87384df21b248e7c

commit: bf622e959b104bb33cd5e589b39869ddb40ee42a [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Thu Nov 13 19:23:03 2025 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Thu Nov 13 19:23:03 2025 +0100
tree: f9cc246066e6f841b9e45b4a1e48f2d14a779038
parent: f1d1e7f4a496528471217a72c3ca5c5b3ead4ba1 [diff]
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 966736b..c5701e9 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt

@@ -466,7 +466,9 @@
         var sentences: Array<Span>? = null,
         var morphoByFoundry: MutableMap<String, MutableMap<String, MorphoSpan>> = mutableMapOf(),
         var structureSpans: MutableList<StructureSpan> = mutableListOf(),
-        var extractedAttributes: MutableMap<String, String> = mutableMapOf()
+        var extractedAttributes: MutableMap<String, String> = mutableMapOf(),
+        var corenlpSentencesCollected: Boolean = false,
+        var corenlpConstituencyCollected: Boolean = false
     )
 
     data class StructureSpan(
@@ -1177,8 +1179,6 @@
         val entriesByTextId = entries.groupBy { getTextIdFromPath(it.name) }
         val textIds = entriesByTextId.keys.sorted()  // Process text IDs in lexicographic order
 
-        LOGGER.info("processZipEntriesWithPool: processing ${entries.size} entries (${textIds.size} texts) with foundry=$foundry")
-
         // Initialize watermark for this foundry if not exists (set to first text ID)
         if (!foundryWatermarks.containsKey(foundry) && textIds.isNotEmpty()) {
             foundryWatermarks.putIfAbsent(foundry, textIds.first())
@@ -1252,7 +1252,7 @@
         }
 
         try {
-            if (zipEntry.name.matches(Regex(".*(data|tokens|structure|morpho|dependency)\\.xml$"))) {
+            if (zipEntry.name.matches(Regex(".*(data|tokens|structure|morpho|dependency|sentences|constituency)\\.xml$"))) {
                 LOGGER.finer("Processing entry: ${zipEntry.name}, foundry=$foundry")
                 // Ensure the entry stream and reader are closed to avoid native memory buildup
                 val dbFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance()
@@ -1399,6 +1399,21 @@
                             LOGGER.info("Dependency merge complete: $mergedCount merged, $newCount new entries (heads will be resolved during output)")
                         }
                     }
+
+                    "sentences.xml" -> {
+                        println("sentences entry foundry=$foundry for $docId from ${zipEntry.name}")
+                        if (outputFormat == OutputFormat.KRILL && foundry.startsWith("corenlp")) {
+                            val sentenceSpans: NodeList = doc.getElementsByTagName("span")
+                            collectCorenlpSentences(docId, sentenceSpans)
+                        }
+                    }
+
+                    "constituency.xml" -> {
+                        if (outputFormat == OutputFormat.KRILL && foundry.startsWith("corenlp")) {
+                            val constituencySpans: NodeList = doc.getElementsByTagName("span")
+                            collectCorenlpConstituency(docId, constituencySpans)
+                        }
+                    }
                 }
 
                 // Mark text as processed from this ZIP for incremental output
@@ -2508,6 +2523,127 @@
         }
     }
 
+    private data class CorenlpConstituencyNode( // one constituency <span>, buffered by collectCorenlpConstituency before traversal
+        val id: String, // the span's "id" attribute; also the key under which the node is stored
+        val from: Int, // the span's "from" attribute, parsed as Int
+        val to: Int, // the span's "to" attribute, parsed as Int
+        val label: String, // trimmed text of the span's <f name="const"> feature
+        val children: MutableList<String> = mutableListOf() // "target" ids of the span's "dominates" <rel> elements, in encounter order
+    )
+
+    /**
+     * Adds one `corenlp/s:s` [StructureSpan] (depth 0, no attributes) to the Krill data of
+     * [docId] for every element in [spans] that has integer `from` and `to` attributes.
+     *
+     * Returns without changes when [docId] is already in `outputTexts` or when the text's
+     * `corenlpSentencesCollected` flag is set. Token positions are stored as -1 here.
+     */
+    private fun collectCorenlpSentences(docId: String, spans: NodeList) {
+        if (outputTexts.contains(docId)) return
+
+        val textData = krillData.getOrPut(docId) { KrillTextData(textId = docId) }
+        synchronized(textData) {
+            // The flag is read and written under the same lock as the span list.
+            if (textData.corenlpSentencesCollected) return
+            (0 until spans.length)
+                .mapNotNull { index -> spans.item(index) as? Element }
+                .forEach { sentence ->
+                    val start = sentence.getAttribute("from").toIntOrNull() ?: return@forEach
+                    val end = sentence.getAttribute("to").toIntOrNull() ?: return@forEach
+                    val sentenceSpan = StructureSpan(
+                        layer = "corenlp/s:s", from = start, to = end,
+                        tokenFrom = -1, tokenTo = -1, depth = 0, attributes = emptyMap()
+                    )
+                    textData.structureSpans.add(sentenceSpan)
+                }
+            textData.corenlpSentencesCollected = true
+        }
+    }
+
+    /**
+     * Turns the `<span>` elements of a corenlp constituency.xml into `corenlp/c:<label>`
+     * [StructureSpan]s for [docId]; `depth` is the node's distance from its root node.
+     *
+     * A span is used only if it has a non-blank `id`, integer `from`/`to` attributes and a
+     * non-blank `<f name="const">` label. A `dominates` relation with a `target` links the
+     * node to a child; one with only a `uri` just removes that id from the root candidates.
+     * Does nothing if [docId] is in `outputTexts`, if no usable span exists, or if
+     * constituency was already collected. Token positions are stored as -1 here.
+     */
+    private fun collectCorenlpConstituency(docId: String, spans: NodeList) {
+        if (outputTexts.contains(docId)) return
+
+        val nodesById = mutableMapOf<String, CorenlpConstituencyNode>()
+        val nonRootIds = mutableSetOf<String>()
+
+        for (i in 0 until spans.length) {
+            val span = spans.item(i) as? Element ?: continue
+            val id = span.getAttribute("id")
+            if (id.isNullOrBlank()) continue
+            val from = span.getAttribute("from").toIntOrNull() ?: continue
+            val to = span.getAttribute("to").toIntOrNull() ?: continue
+
+            val fsElement = span.getElementsByTagName("fs").item(0) as? Element ?: continue
+            val fElements = fsElement.getElementsByTagName("f")
+            val label = (0 until fElements.length).asSequence()
+                .mapNotNull { fElements.item(it) as? Element }
+                .firstOrNull { it.getAttribute("name") == "const" }
+                ?.textContent?.trim()
+            if (label.isNullOrBlank()) continue
+
+            val node = CorenlpConstituencyNode(id, from, to, label)
+
+            val relElements = span.getElementsByTagName("rel")
+            for (j in 0 until relElements.length) {
+                val rel = relElements.item(j) as? Element ?: continue
+                if (rel.getAttribute("label") != "dominates") continue
+                val target = rel.getAttribute("target")
+                if (!target.isNullOrBlank()) {
+                    node.children.add(target)
+                    nonRootIds.add(target)
+                } else {
+                    val uri = rel.getAttribute("uri")
+                    if (!uri.isNullOrBlank()) {
+                        val normalized = uri.removePrefix("morpho.xml#")
+                        if (normalized.isNotBlank()) {
+                            nonRootIds.add(normalized)
+                        }
+                    }
+                }
+            }
+            nodesById[id] = node
+        }
+
+        if (nodesById.isEmpty()) return
+
+        val textData = krillData.getOrPut(docId) { KrillTextData(textId = docId) }
+
+        synchronized(textData) {
+            if (textData.corenlpConstituencyCollected) return
+            val rootIds = nodesById.keys.filter { it !in nonRootIds }
+            LOGGER.fine("Collecting corenlp constituency for $docId: ${nodesById.size} nodes, roots=${rootIds.size}")
+
+            // Ids on the path from the current root; a node that "dominates" one of its own
+            // ancestors (malformed input) would otherwise recurse until the stack overflows.
+            val onPath = mutableSetOf<String>()
+            fun traverse(nodeId: String, depth: Int) {
+                val node = nodesById[nodeId] ?: return
+                if (!onPath.add(nodeId)) return
+                textData.structureSpans.add(
+                    StructureSpan(
+                        layer = "corenlp/c:${node.label}", from = node.from, to = node.to,
+                        tokenFrom = -1, tokenTo = -1, depth = depth, attributes = emptyMap()
+                    )
+                )
+                node.children.forEach { childId -> traverse(childId, depth + 1) }
+                onPath.remove(nodeId)
+            }
+
+            rootIds.forEach { traverse(it, 0) }
+            textData.corenlpConstituencyCollected = true
+        }
+    }
+
     // Collect rich metadata from header.xml for krill format
     private fun collectKrillMetadata(docId: String, headerXml: String) {
         // Skip if already output (thread-safe check with ConcurrentHashMap.KeySet)
@@ -3518,11 +3654,18 @@
         sb.append("\"data\":{")
         sb.append("\"text\":${jsonString(textData.textContent ?: "")},")
 
-        // layerInfos - list all foundries
+        val hasCorenlpSentences = textData.structureSpans.any { it.layer == "corenlp/s:s" }
+        val hasCorenlpConstituency = textData.structureSpans.any { it.layer.startsWith("corenlp/c:") }
         val layerInfos = mutableListOf<String>()
         if (textData.sentences != null) {
             layerInfos.add("dereko/s=spans")
         }
+        if (hasCorenlpSentences) {
+            layerInfos.add("corenlp/s=spans")
+        }
+        if (hasCorenlpConstituency) {
+            layerInfos.add("corenlp/c=spans")
+        }
 
         // Collect layers by foundry type (checking what data actually exists)
         val foundryLayers = mutableMapOf<String, MutableSet<String>>()
@@ -3600,6 +3743,18 @@
             foundries.add("dereko/structure/base-sentences-paragraphs-pagebreaks")
         }
 
+        if (hasCorenlpSentences || hasCorenlpConstituency) {
+            if (!foundries.contains("corenlp")) {
+                foundries.add("corenlp")
+            }
+            if (hasCorenlpSentences && !foundries.contains("corenlp/sentences")) {
+                foundries.add("corenlp/sentences")
+            }
+            if (hasCorenlpConstituency && !foundries.contains("corenlp/structure")) {
+                foundries.add("corenlp/structure")
+            }
+        }
+
         // Add annotation foundries with their layers
         foundryLayers.keys.sorted().forEach { foundry ->
             // Use full name "treetagger" instead of "tt" in foundries list
@@ -3771,6 +3926,19 @@
             }
         }
 
+        val hasCorenlpSentences = resolvedStructureSpans.any { it.layer == "corenlp/s:s" }
+        val hasCorenlpConstituency = resolvedStructureSpans.any { it.layer.startsWith("corenlp/c:") }
+        val layerInfos = mutableListOf<String>()
+        if (textData.sentences != null) {
+            layerInfos.add("dereko/s=spans")
+        }
+        if (hasCorenlpSentences) {
+            layerInfos.add("corenlp/s=spans")
+        }
+        if (hasCorenlpConstituency) {
+            layerInfos.add("corenlp/c=spans")
+        }
+
         // Group structural spans by their starting token
         val spansByToken = mutableMapOf<Int, MutableList<StructureSpan>>()
         resolvedStructureSpans.forEach { span ->
@@ -3779,6 +3947,7 @@
 
         // Count paragraph spans (name="p")
         val paragraphCount = allStructureSpans.count { it.layer.endsWith(":p") }
+        val corenlpSentenceCount = resolvedStructureSpans.count { it.layer == "corenlp/s:s" }
 
         tokens.forEachIndexed { index, token ->
             val tokenAnnotations = mutableListOf<String>()
@@ -3792,6 +3961,9 @@
                 if (sentences.isNotEmpty()) {
                     tokenAnnotations.add(jsonString("-:base/sentences\$<i>${sentences.size}"))
                 }
+                if (corenlpSentenceCount > 0) {
+                    tokenAnnotations.add(jsonString("-:corenlp/sentences\$<i>$corenlpSentenceCount"))
+                }
                 tokenAnnotations.add(jsonString("-:tokens\$<i>${tokens.size}"))
 
                 // Add all structural spans that start at token 0 or cover the whole document

diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
index 6eddd09..26572c2 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt

@@ -25,6 +25,7 @@
     val zca20scrambled = loadResource("zca20-scrambled.zip").path
     val wdf19 = loadResource("wdf19.zip").path
     val wdd17 = loadResource("wdd17sample.zip").path
+    val wud24Corenlp = loadResource("wud24_sample.corenlp.zip").path
 
     @Before
     fun setUpStreams() {
@@ -683,7 +684,7 @@
         }
 
         try {
-            val defaultArgs = arrayOf("-f", "krill", "-D", defaultDir.path, baseZip, spacyZip)
+            val defaultArgs = arrayOf("-f", "krill", "-D", defaultDir.path, baseZip, spacyZip, wud24Corenlp)
             val defaultExit = debug(defaultArgs)
             assertTrue(defaultExit == 0, "Krill conversion should succeed without --non-word-tokens")
 
@@ -711,7 +712,7 @@
         }
 
         try {
-            val flagArgs = arrayOf("-f", "krill", "--non-word-tokens", "-D", flagDir.path, baseZip, spacyZip)
+            val flagArgs = arrayOf("-f", "krill", "--non-word-tokens", "-D", flagDir.path, baseZip, spacyZip, wud24Corenlp)
             val flagExit = debug(flagArgs)
             assertTrue(flagExit == 0, "Krill conversion should succeed with --non-word-tokens")
 
@@ -737,6 +738,10 @@
     fun krillDefaultMatchesPerlReference() {
         val baseZip = loadResource("wud24_sample.zip").path
         val spacyZip = loadResource("wud24_sample.spacy.zip").path
+        val marmotMaltZip = loadResource("wud24_sample.marmot-malt.zip").path
+        val opennlpZip = loadResource("wud24_sample.opennlp.zip").path
+        val treeTaggerZip = loadResource("wud24_sample.tree_tagger.zip").path
+        val corenlpZip = wud24Corenlp
         val referenceTar = File(loadResource("wud24_sample.wonwtopt.krill.tar").toURI())
         assertTrue(referenceTar.exists(), "Reference Krill tar is missing: ${referenceTar.path}")
 
@@ -747,7 +752,15 @@
         }
 
         try {
-            val args = arrayOf("-f", "krill", "-D", kotlinDir.path, baseZip, spacyZip)
+            val args = arrayOf(
+                "-f", "krill",
+                "-D", kotlinDir.path,
+                baseZip,
+                spacyZip,
+                marmotMaltZip,
+                treeTaggerZip,
+                corenlpZip
+            )
             val exitCode = debug(args)
             assertTrue(exitCode == 0, "Krill conversion should succeed for reference comparison")
 
@@ -781,6 +794,9 @@
     fun krillNonWordTokensMatchesPerlReference() {
         val baseZip = loadResource("wud24_sample.zip").path
         val spacyZip = loadResource("wud24_sample.spacy.zip").path
+        val marmotMaltZip = loadResource("wud24_sample.marmot-malt.zip").path
+        val treeTaggerZip = loadResource("wud24_sample.tree_tagger.zip").path
+        val corenlpZipNwt = wud24Corenlp
         val referenceTar = File(loadResource("wud24_sample.nwt.krill.tar").toURI())
         assertTrue(referenceTar.exists(), "Non-word-token reference tar missing: ${referenceTar.path}")
 
@@ -791,7 +807,16 @@
         }
 
         try {
-            val args = arrayOf("-f", "krill", "--non-word-tokens", "-D", kotlinDir.path, baseZip, spacyZip)
+            val args = arrayOf(
+                "-f", "krill",
+                "--non-word-tokens",
+                "-D", kotlinDir.path,
+                baseZip,
+                spacyZip,
+                marmotMaltZip,
+                treeTaggerZip,
+                corenlpZipNwt
+            )
             val exitCode = debug(args)
             assertTrue(exitCode == 0, "Krill conversion with --non-word-tokens should succeed for reference comparison")
 
@@ -809,8 +834,10 @@
                 "\"s:!\"",
                 "\"marmot/p:\\$,\"",
                 "\"spacy/p:\\$,\"",
-                "\"opennlp/p:\\$,\"",
-                "\"tt/p:\\$,\""
+                "\"tt/p:\\$,\"",
+                "\"-:corenlp/sentences\$<i>11\"",
+                "corenlp/s=spans",
+                "corenlp/c=spans"
             )
             referenceJsons.forEach { (doc, referenceJson) ->
                 val kotlinJson = kotlinJsons.getValue(doc)

diff --git a/app/src/test/resources/wud24_sample.krill.tar b/app/src/test/resources/wud24_sample.krill.tar
index 0a1745e..aa153aa 100644
--- a/app/src/test/resources/wud24_sample.krill.tar
+++ b/app/src/test/resources/wud24_sample.krill.tar
Binary files differ

diff --git a/app/src/test/resources/wud24_sample.nwt.krill.tar b/app/src/test/resources/wud24_sample.nwt.krill.tar
index df498e0..cd51749 100644
--- a/app/src/test/resources/wud24_sample.nwt.krill.tar
+++ b/app/src/test/resources/wud24_sample.nwt.krill.tar
Binary files differ
commit	bf622e959b104bb33cd5e589b39869ddb40ee42a	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Nov 13 19:23:03 2025 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Nov 13 19:23:03 2025 +0100
tree	f9cc246066e6f841b9e45b4a1e48f2d14a779038
parent	f1d1e7f4a496528471217a72c3ca5c5b3ead4ba1 [diff]