Add constituencies and non base sentences to krill output
Change-Id: Ib56c852efca64dd28f7f487a87384df21b248e7c
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 966736b..c5701e9 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -466,7 +466,9 @@
var sentences: Array<Span>? = null,
var morphoByFoundry: MutableMap<String, MutableMap<String, MorphoSpan>> = mutableMapOf(),
var structureSpans: MutableList<StructureSpan> = mutableListOf(),
- var extractedAttributes: MutableMap<String, String> = mutableMapOf()
+ var extractedAttributes: MutableMap<String, String> = mutableMapOf(),
+ var corenlpSentencesCollected: Boolean = false,
+ var corenlpConstituencyCollected: Boolean = false
)
data class StructureSpan(
@@ -1177,8 +1179,6 @@
val entriesByTextId = entries.groupBy { getTextIdFromPath(it.name) }
val textIds = entriesByTextId.keys.sorted() // Process text IDs in lexicographic order
- LOGGER.info("processZipEntriesWithPool: processing ${entries.size} entries (${textIds.size} texts) with foundry=$foundry")
-
// Initialize watermark for this foundry if not exists (set to first text ID)
if (!foundryWatermarks.containsKey(foundry) && textIds.isNotEmpty()) {
foundryWatermarks.putIfAbsent(foundry, textIds.first())
@@ -1252,7 +1252,7 @@
}
try {
- if (zipEntry.name.matches(Regex(".*(data|tokens|structure|morpho|dependency)\\.xml$"))) {
+ if (zipEntry.name.matches(Regex(".*(data|tokens|structure|morpho|dependency|sentences|constituency)\\.xml$"))) {
LOGGER.finer("Processing entry: ${zipEntry.name}, foundry=$foundry")
// Ensure the entry stream and reader are closed to avoid native memory buildup
val dbFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance()
@@ -1399,6 +1399,21 @@
LOGGER.info("Dependency merge complete: $mergedCount merged, $newCount new entries (heads will be resolved during output)")
}
}
+
+ "sentences.xml" -> {
+ println("sentences entry foundry=$foundry for $docId from ${zipEntry.name}")
+ if (outputFormat == OutputFormat.KRILL && foundry.startsWith("corenlp")) {
+ val sentenceSpans: NodeList = doc.getElementsByTagName("span")
+ collectCorenlpSentences(docId, sentenceSpans)
+ }
+ }
+
+ "constituency.xml" -> {
+ if (outputFormat == OutputFormat.KRILL && foundry.startsWith("corenlp")) {
+ val constituencySpans: NodeList = doc.getElementsByTagName("span")
+ collectCorenlpConstituency(docId, constituencySpans)
+ }
+ }
}
// Mark text as processed from this ZIP for incremental output
@@ -2508,6 +2523,127 @@
}
}
+ private data class CorenlpConstituencyNode(
+ val id: String,
+ val from: Int,
+ val to: Int,
+ val label: String,
+ val children: MutableList<String> = mutableListOf()
+ )
+
+ private fun collectCorenlpSentences(docId: String, spans: NodeList) {
+ if (outputTexts.contains(docId)) return
+
+ val textData = krillData.getOrPut(docId) {
+ KrillTextData(textId = docId)
+ }
+
+ synchronized(textData) {
+ if (textData.corenlpSentencesCollected) return
+ for (i in 0 until spans.length) {
+ val span = spans.item(i) as? Element ?: continue
+ val from = span.getAttribute("from").toIntOrNull() ?: continue
+ val to = span.getAttribute("to").toIntOrNull() ?: continue
+ textData.structureSpans.add(
+ StructureSpan(
+ layer = "corenlp/s:s",
+ from = from,
+ to = to,
+ tokenFrom = -1,
+ tokenTo = -1,
+ depth = 0,
+ attributes = emptyMap()
+ )
+ )
+ }
+ textData.corenlpSentencesCollected = true
+ }
+ }
+
+ private fun collectCorenlpConstituency(docId: String, spans: NodeList) {
+ if (outputTexts.contains(docId)) return
+
+ val nodesById = mutableMapOf<String, CorenlpConstituencyNode>()
+ val nonRootIds = mutableSetOf<String>()
+
+ for (i in 0 until spans.length) {
+ val span = spans.item(i) as? Element ?: continue
+ val id = span.getAttribute("id")
+ if (id.isNullOrBlank()) continue
+ val from = span.getAttribute("from").toIntOrNull() ?: continue
+ val to = span.getAttribute("to").toIntOrNull() ?: continue
+
+ val fsList = span.getElementsByTagName("fs")
+ if (fsList.length == 0) continue
+ val fsElement = fsList.item(0) as? Element ?: continue
+ val fElements = fsElement.getElementsByTagName("f")
+ var label: String? = null
+ for (j in 0 until fElements.length) {
+ val f = fElements.item(j) as? Element ?: continue
+ if (f.getAttribute("name") == "const") {
+ label = f.textContent?.trim()
+ break
+ }
+ }
+ if (label.isNullOrBlank()) continue
+
+ val node = CorenlpConstituencyNode(id, from, to, label)
+
+ val relElements = span.getElementsByTagName("rel")
+ for (j in 0 until relElements.length) {
+ val rel = relElements.item(j) as? Element ?: continue
+ if (rel.getAttribute("label") != "dominates") continue
+ val target = rel.getAttribute("target")
+ if (!target.isNullOrBlank()) {
+ node.children.add(target)
+ nonRootIds.add(target)
+ } else {
+ val uri = rel.getAttribute("uri")
+ if (!uri.isNullOrBlank()) {
+ val normalized = uri.removePrefix("morpho.xml#")
+ if (normalized.isNotBlank()) {
+ nonRootIds.add(normalized)
+ }
+ }
+ }
+ }
+ nodesById[id] = node
+ }
+
+ if (nodesById.isEmpty()) return
+
+ val textData = krillData.getOrPut(docId) {
+ KrillTextData(textId = docId)
+ }
+
+ synchronized(textData) {
+ if (textData.corenlpConstituencyCollected) return
+ LOGGER.fine("Collecting corenlp constituency for $docId: ${nodesById.size} nodes, roots=${nodesById.keys.count { it !in nonRootIds }}")
+
+ fun traverse(nodeId: String, depth: Int) {
+ val node = nodesById[nodeId] ?: return
+ textData.structureSpans.add(
+ StructureSpan(
+ layer = "corenlp/c:${node.label}",
+ from = node.from,
+ to = node.to,
+ tokenFrom = -1,
+ tokenTo = -1,
+ depth = depth,
+ attributes = emptyMap()
+ )
+ )
+ node.children.forEach { childId ->
+ traverse(childId, depth + 1)
+ }
+ }
+
+ val rootIds = nodesById.keys.filter { it !in nonRootIds }
+ rootIds.forEach { traverse(it, 0) }
+ textData.corenlpConstituencyCollected = true
+ }
+ }
+
// Collect rich metadata from header.xml for krill format
private fun collectKrillMetadata(docId: String, headerXml: String) {
// Skip if already output (thread-safe check with ConcurrentHashMap.KeySet)
@@ -3518,11 +3654,18 @@
sb.append("\"data\":{")
sb.append("\"text\":${jsonString(textData.textContent ?: "")},")
- // layerInfos - list all foundries
+ val hasCorenlpSentences = textData.structureSpans.any { it.layer == "corenlp/s:s" }
+ val hasCorenlpConstituency = textData.structureSpans.any { it.layer.startsWith("corenlp/c:") }
val layerInfos = mutableListOf<String>()
if (textData.sentences != null) {
layerInfos.add("dereko/s=spans")
}
+ if (hasCorenlpSentences) {
+ layerInfos.add("corenlp/s=spans")
+ }
+ if (hasCorenlpConstituency) {
+ layerInfos.add("corenlp/c=spans")
+ }
// Collect layers by foundry type (checking what data actually exists)
val foundryLayers = mutableMapOf<String, MutableSet<String>>()
@@ -3600,6 +3743,18 @@
foundries.add("dereko/structure/base-sentences-paragraphs-pagebreaks")
}
+ if (hasCorenlpSentences || hasCorenlpConstituency) {
+ if (!foundries.contains("corenlp")) {
+ foundries.add("corenlp")
+ }
+ if (hasCorenlpSentences && !foundries.contains("corenlp/sentences")) {
+ foundries.add("corenlp/sentences")
+ }
+ if (hasCorenlpConstituency && !foundries.contains("corenlp/structure")) {
+ foundries.add("corenlp/structure")
+ }
+ }
+
// Add annotation foundries with their layers
foundryLayers.keys.sorted().forEach { foundry ->
// Use full name "treetagger" instead of "tt" in foundries list
@@ -3771,6 +3926,19 @@
}
}
+ val hasCorenlpSentences = resolvedStructureSpans.any { it.layer == "corenlp/s:s" }
+ val hasCorenlpConstituency = resolvedStructureSpans.any { it.layer.startsWith("corenlp/c:") }
+ val layerInfos = mutableListOf<String>()
+ if (textData.sentences != null) {
+ layerInfos.add("dereko/s=spans")
+ }
+ if (hasCorenlpSentences) {
+ layerInfos.add("corenlp/s=spans")
+ }
+ if (hasCorenlpConstituency) {
+ layerInfos.add("corenlp/c=spans")
+ }
+
// Group structural spans by their starting token
val spansByToken = mutableMapOf<Int, MutableList<StructureSpan>>()
resolvedStructureSpans.forEach { span ->
@@ -3779,6 +3947,7 @@
// Count paragraph spans (name="p")
val paragraphCount = allStructureSpans.count { it.layer.endsWith(":p") }
+ val corenlpSentenceCount = resolvedStructureSpans.count { it.layer == "corenlp/s:s" }
tokens.forEachIndexed { index, token ->
val tokenAnnotations = mutableListOf<String>()
@@ -3792,6 +3961,9 @@
if (sentences.isNotEmpty()) {
tokenAnnotations.add(jsonString("-:base/sentences\$<i>${sentences.size}"))
}
+ if (corenlpSentenceCount > 0) {
+ tokenAnnotations.add(jsonString("-:corenlp/sentences\$<i>$corenlpSentenceCount"))
+ }
tokenAnnotations.add(jsonString("-:tokens\$<i>${tokens.size}"))
// Add all structural spans that start at token 0 or cover the whole document
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
index 6eddd09..26572c2 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
@@ -25,6 +25,7 @@
val zca20scrambled = loadResource("zca20-scrambled.zip").path
val wdf19 = loadResource("wdf19.zip").path
val wdd17 = loadResource("wdd17sample.zip").path
+ val wud24Corenlp = loadResource("wud24_sample.corenlp.zip").path
@Before
fun setUpStreams() {
@@ -683,7 +684,7 @@
}
try {
- val defaultArgs = arrayOf("-f", "krill", "-D", defaultDir.path, baseZip, spacyZip)
+ val defaultArgs = arrayOf("-f", "krill", "-D", defaultDir.path, baseZip, spacyZip, wud24Corenlp)
val defaultExit = debug(defaultArgs)
assertTrue(defaultExit == 0, "Krill conversion should succeed without --non-word-tokens")
@@ -711,7 +712,7 @@
}
try {
- val flagArgs = arrayOf("-f", "krill", "--non-word-tokens", "-D", flagDir.path, baseZip, spacyZip)
+ val flagArgs = arrayOf("-f", "krill", "--non-word-tokens", "-D", flagDir.path, baseZip, spacyZip, wud24Corenlp)
val flagExit = debug(flagArgs)
assertTrue(flagExit == 0, "Krill conversion should succeed with --non-word-tokens")
@@ -737,6 +738,10 @@
fun krillDefaultMatchesPerlReference() {
val baseZip = loadResource("wud24_sample.zip").path
val spacyZip = loadResource("wud24_sample.spacy.zip").path
+ val marmotMaltZip = loadResource("wud24_sample.marmot-malt.zip").path
+ val opennlpZip = loadResource("wud24_sample.opennlp.zip").path
+ val treeTaggerZip = loadResource("wud24_sample.tree_tagger.zip").path
+ val corenlpZip = wud24Corenlp
val referenceTar = File(loadResource("wud24_sample.wonwtopt.krill.tar").toURI())
assertTrue(referenceTar.exists(), "Reference Krill tar is missing: ${referenceTar.path}")
@@ -747,7 +752,15 @@
}
try {
- val args = arrayOf("-f", "krill", "-D", kotlinDir.path, baseZip, spacyZip)
+ val args = arrayOf(
+ "-f", "krill",
+ "-D", kotlinDir.path,
+ baseZip,
+ spacyZip,
+ marmotMaltZip,
+ treeTaggerZip,
+ corenlpZip
+ )
val exitCode = debug(args)
assertTrue(exitCode == 0, "Krill conversion should succeed for reference comparison")
@@ -781,6 +794,9 @@
fun krillNonWordTokensMatchesPerlReference() {
val baseZip = loadResource("wud24_sample.zip").path
val spacyZip = loadResource("wud24_sample.spacy.zip").path
+ val marmotMaltZip = loadResource("wud24_sample.marmot-malt.zip").path
+ val treeTaggerZip = loadResource("wud24_sample.tree_tagger.zip").path
+ val corenlpZipNwt = wud24Corenlp
val referenceTar = File(loadResource("wud24_sample.nwt.krill.tar").toURI())
assertTrue(referenceTar.exists(), "Non-word-token reference tar missing: ${referenceTar.path}")
@@ -791,7 +807,16 @@
}
try {
- val args = arrayOf("-f", "krill", "--non-word-tokens", "-D", kotlinDir.path, baseZip, spacyZip)
+ val args = arrayOf(
+ "-f", "krill",
+ "--non-word-tokens",
+ "-D", kotlinDir.path,
+ baseZip,
+ spacyZip,
+ marmotMaltZip,
+ treeTaggerZip,
+ corenlpZipNwt
+ )
val exitCode = debug(args)
assertTrue(exitCode == 0, "Krill conversion with --non-word-tokens should succeed for reference comparison")
@@ -809,8 +834,10 @@
"\"s:!\"",
"\"marmot/p:\\$,\"",
"\"spacy/p:\\$,\"",
- "\"opennlp/p:\\$,\"",
- "\"tt/p:\\$,\""
+ "\"tt/p:\\$,\"",
+ "\"-:corenlp/sentences\$<i>11\"",
+ "corenlp/s=spans",
+ "corenlp/c=spans"
)
referenceJsons.forEach { (doc, referenceJson) ->
val kotlinJson = kotlinJsons.getValue(doc)
diff --git a/app/src/test/resources/wud24_sample.krill.tar b/app/src/test/resources/wud24_sample.krill.tar
index 0a1745e..aa153aa 100644
--- a/app/src/test/resources/wud24_sample.krill.tar
+++ b/app/src/test/resources/wud24_sample.krill.tar
Binary files differ
diff --git a/app/src/test/resources/wud24_sample.nwt.krill.tar b/app/src/test/resources/wud24_sample.nwt.krill.tar
index df498e0..cd51749 100644
--- a/app/src/test/resources/wud24_sample.nwt.krill.tar
+++ b/app/src/test/resources/wud24_sample.nwt.krill.tar
Binary files differ