Neutralize identifiers for constituency and sentence annotations Change-Id: Ic52f032ba76e75ebe724610cb223802047b17985

commit: 9d2c64e3fd062c1bf7d5150dc53ed7ace47add36 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Thu Nov 13 19:44:40 2025 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Thu Nov 13 19:44:40 2025 +0100
tree: ac1f02e56deb634b6891f326a32dc122aa41a380
parent: bf622e959b104bb33cd5e589b39869ddb40ee42a [diff]
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index c5701e9..3a936ec 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt

@@ -471,6 +471,44 @@
         var corenlpConstituencyCollected: Boolean = false
     )
 
+    private val BASE_STRUCTURE_FOUNDRIES = setOf("base", "dereko")
+
+    private fun StructureSpan.splitFoundryAndLayer(): Pair<String, String>? {
+        val separatorIdx = layer.indexOf('/')
+        if (separatorIdx <= 0 || separatorIdx == layer.length - 1) {
+            return null
+        }
+        val foundry = layer.substring(0, separatorIdx)
+        val descriptor = layer.substring(separatorIdx + 1)
+        if (foundry.isBlank() || descriptor.isBlank()) {
+            return null
+        }
+        return foundry to descriptor
+    }
+
+    private fun Collection<StructureSpan>.foundriesWithSentenceSpans(): Set<String> =
+        this.mapNotNull { span ->
+            val (foundry, descriptor) = span.splitFoundryAndLayer() ?: return@mapNotNull null
+            if (descriptor == "s:s") foundry else null
+        }.toSet()
+
+    private fun Collection<StructureSpan>.foundriesWithConstituencySpans(): Set<String> =
+        this.mapNotNull { span ->
+            val (foundry, descriptor) = span.splitFoundryAndLayer() ?: return@mapNotNull null
+            if (descriptor.startsWith("c:")) foundry else null
+        }.toSet()
+
+    private fun Collection<StructureSpan>.sentenceCountsByFoundry(): Map<String, Int> {
+        val counts = mutableMapOf<String, Int>()
+        this.forEach { span ->
+            val (foundry, descriptor) = span.splitFoundryAndLayer() ?: return@forEach
+            if (descriptor == "s:s") {
+                counts[foundry] = counts.getOrDefault(foundry, 0) + 1
+            }
+        }
+        return counts
+    }
+
     data class StructureSpan(
         val layer: String,  // e.g., "base/s:s", "dereko/s:p"
         val from: Int,
@@ -1401,7 +1439,7 @@
                     }
 
                     "sentences.xml" -> {
-                        println("sentences entry foundry=$foundry for $docId from ${zipEntry.name}")
+                        LOGGER.fine("Sentences entry foundry=$foundry for $docId from ${zipEntry.name}")
                         if (outputFormat == OutputFormat.KRILL && foundry.startsWith("corenlp")) {
                             val sentenceSpans: NodeList = doc.getElementsByTagName("span")
                             collectCorenlpSentences(docId, sentenceSpans)
@@ -3654,18 +3692,16 @@
         sb.append("\"data\":{")
         sb.append("\"text\":${jsonString(textData.textContent ?: "")},")
 
-        val hasCorenlpSentences = textData.structureSpans.any { it.layer == "corenlp/s:s" }
-        val hasCorenlpConstituency = textData.structureSpans.any { it.layer.startsWith("corenlp/c:") }
+        val sentenceSpanFoundries = textData.structureSpans.foundriesWithSentenceSpans()
+        val constituencySpanFoundries = textData.structureSpans.foundriesWithConstituencySpans()
+        val externalSentenceFoundries = sentenceSpanFoundries.filterNot { it in BASE_STRUCTURE_FOUNDRIES }.toSortedSet()
+        val externalConstitFoundries = constituencySpanFoundries.filterNot { it in BASE_STRUCTURE_FOUNDRIES }.toSortedSet()
         val layerInfos = mutableListOf<String>()
         if (textData.sentences != null) {
             layerInfos.add("dereko/s=spans")
         }
-        if (hasCorenlpSentences) {
-            layerInfos.add("corenlp/s=spans")
-        }
-        if (hasCorenlpConstituency) {
-            layerInfos.add("corenlp/c=spans")
-        }
+        externalSentenceFoundries.forEach { layerInfos.add("$it/s=spans") }
+        externalConstitFoundries.forEach { layerInfos.add("$it/c=spans") }
 
         // Collect layers by foundry type (checking what data actually exists)
         val foundryLayers = mutableMapOf<String, MutableSet<String>>()
@@ -3743,15 +3779,22 @@
             foundries.add("dereko/structure/base-sentences-paragraphs-pagebreaks")
         }
 
-        if (hasCorenlpSentences || hasCorenlpConstituency) {
-            if (!foundries.contains("corenlp")) {
-                foundries.add("corenlp")
+        val advertisedStructureFoundries = (externalSentenceFoundries + externalConstitFoundries).toSortedSet()
+        advertisedStructureFoundries.forEach { foundry ->
+            if (!foundries.contains(foundry)) {
+                foundries.add(foundry)
             }
-            if (hasCorenlpSentences && !foundries.contains("corenlp/sentences")) {
-                foundries.add("corenlp/sentences")
+            if (externalSentenceFoundries.contains(foundry)) {
+                val sentencesEntry = "$foundry/sentences"
+                if (!foundries.contains(sentencesEntry)) {
+                    foundries.add(sentencesEntry)
+                }
             }
-            if (hasCorenlpConstituency && !foundries.contains("corenlp/structure")) {
-                foundries.add("corenlp/structure")
+            if (externalConstitFoundries.contains(foundry)) {
+                val structureEntry = "$foundry/structure"
+                if (!foundries.contains(structureEntry)) {
+                    foundries.add(structureEntry)
+                }
             }
         }
 
@@ -3926,18 +3969,16 @@
             }
         }
 
-        val hasCorenlpSentences = resolvedStructureSpans.any { it.layer == "corenlp/s:s" }
-        val hasCorenlpConstituency = resolvedStructureSpans.any { it.layer.startsWith("corenlp/c:") }
+        val resolvedSentenceFoundries = resolvedStructureSpans.foundriesWithSentenceSpans()
+        val resolvedConstitFoundries = resolvedStructureSpans.foundriesWithConstituencySpans()
+        val externalSentenceFoundries = resolvedSentenceFoundries.filterNot { it in BASE_STRUCTURE_FOUNDRIES }.toSortedSet()
+        val externalConstitFoundries = resolvedConstitFoundries.filterNot { it in BASE_STRUCTURE_FOUNDRIES }.toSortedSet()
         val layerInfos = mutableListOf<String>()
         if (textData.sentences != null) {
             layerInfos.add("dereko/s=spans")
         }
-        if (hasCorenlpSentences) {
-            layerInfos.add("corenlp/s=spans")
-        }
-        if (hasCorenlpConstituency) {
-            layerInfos.add("corenlp/c=spans")
-        }
+        externalSentenceFoundries.forEach { layerInfos.add("$it/s=spans") }
+        externalConstitFoundries.forEach { layerInfos.add("$it/c=spans") }
 
         // Group structural spans by their starting token
         val spansByToken = mutableMapOf<Int, MutableList<StructureSpan>>()
@@ -3947,7 +3988,10 @@
 
         // Count paragraph spans (name="p")
         val paragraphCount = allStructureSpans.count { it.layer.endsWith(":p") }
-        val corenlpSentenceCount = resolvedStructureSpans.count { it.layer == "corenlp/s:s" }
+        val sentenceCountsByFoundry = resolvedStructureSpans.sentenceCountsByFoundry()
+        val externalSentenceCounts = sentenceCountsByFoundry.entries
+            .filter { (foundry, _) -> foundry !in BASE_STRUCTURE_FOUNDRIES }
+            .sortedBy { it.key }
 
         tokens.forEachIndexed { index, token ->
             val tokenAnnotations = mutableListOf<String>()
@@ -3961,8 +4005,8 @@
                 if (sentences.isNotEmpty()) {
                     tokenAnnotations.add(jsonString("-:base/sentences\$<i>${sentences.size}"))
                 }
-                if (corenlpSentenceCount > 0) {
-                    tokenAnnotations.add(jsonString("-:corenlp/sentences\$<i>$corenlpSentenceCount"))
+                externalSentenceCounts.forEach { (foundry, count) ->
+                    tokenAnnotations.add(jsonString("-:$foundry/sentences\$<i>$count"))
                 }
                 tokenAnnotations.add(jsonString("-:tokens\$<i>${tokens.size}"))
commit	9d2c64e3fd062c1bf7d5150dc53ed7ace47add36	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Nov 13 19:44:40 2025 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Nov 13 19:44:40 2025 +0100
tree	ac1f02e56deb634b6891f326a32dc122aa41a380
parent	bf622e959b104bb33cd5e589b39869ddb40ee42a [diff]