Also add a rough constituency parse as a CoNLL-U output comment
Change-Id: If6a2049a6800828abeebee98d860267a13b51b68
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index e5af333..759c94b 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -1592,7 +1592,7 @@
}
"constituency.xml" -> {
- if (outputFormat == OutputFormat.KRILL) {
+ if (outputFormat == OutputFormat.KRILL || outputFormat == OutputFormat.CONLLU) {
val constituencySpans: NodeList = doc.getElementsByTagName("span")
collectConstituency(docId, foundry, constituencySpans)
}
@@ -2143,15 +2143,23 @@
var token_index = 0
var real_token_index = 0
var sentence_index = 0
- val output: StringBuilder
val sentencesArr = sentences[docId]
val tokensArr = tokens[docId]
- output =
+ val textVal = texts[docId]
+ val constituencyComments = buildConstituencyComments(docId, tokensArr, sentencesArr, textVal)
+ val output =
StringBuilder("# foundry = $foundry\n# filename = ${fnames[docId]}\n# text_id = $docId\n").append(
tokenOffsetsInSentence(
sentences, docId, sentence_index, real_token_index, tokens
)
)
+ fun appendConstituencyComment(sentenceIdx: Int) { // appends the "# constituency = ..." comment line for this sentence index, if one was built
+ val comment = constituencyComments[sentenceIdx] // null when no rendered tree is mapped to this sentence index
+ if (!comment.isNullOrBlank()) { // emit nothing for missing or blank comments
+ output.append("# constituency = ").append(comment).append("\n")
+ }
+ }
+ appendConstituencyComment(sentence_index)
if (extractMetadataRegex.isNotEmpty()) {
output.append(metadata[docId]?.joinToString("\t", prefix = "# metadata=", postfix = "\n") ?: "")
}
@@ -2175,19 +2183,19 @@
} else raw
}
- val textVal = texts[docId]
tokensArr.forEach { span ->
token_index++
if (sentencesArr != null && (sentence_index >= sentencesArr.size || span.from >= sentencesArr[sentence_index].to)) {
output.append("\n")
sentence_index++
- token_index = 1
- output.append(
- tokenOffsetsInSentence(
- sentences, docId, sentence_index, real_token_index, tokens
- )
- )
- }
+ token_index = 1
+ output.append(
+ tokenOffsetsInSentence(
+ sentences, docId, sentence_index, real_token_index, tokens
+ )
+ )
+ appendConstituencyComment(sentence_index)
+ }
if (extractAttributesRegex.isNotEmpty() && extraFeatures[docId] != null) {
for (i in previousSpanStart until span.from + 1) {
if (extraFeatures[docId]?.containsKey("$i") == true) {
@@ -2271,6 +2279,101 @@
return output
}
+ private fun buildConstituencyComments( // renders the cached constituency trees of docId as bracketed strings, keyed by sentence index
+ docId: String,
+ tokensArr: Array<Span>?,
+ sentencesArr: Array<Span>?,
+ textVal: NonBmpString?
+ ): Map<Int, String> {
+ if (tokensArr.isNullOrEmpty() || textVal == null) return emptyMap() // nothing to render without tokens or text
+ val trees = constituencyTrees[docId] ?: return emptyMap() // no trees cached for this document
+ if (trees.isEmpty()) return emptyMap()
+
+ data class TokenInfo(val from: Int, val to: Int, val surface: String) // clamped token offsets plus surface text ("_" when empty or blank)
+
+ val tokenInfos = tokensArr.map { span ->
+ val safeFrom = span.from.coerceIn(0, textVal.length) // clamp start into [0, text length]
+ val safeTo = span.to.coerceIn(safeFrom, textVal.length) // clamp end so it never precedes the start
+ val surface = if (safeFrom < safeTo) {
+ textVal.substring(safeFrom, safeTo)
+ } else {
+ "_"
+ }
+ TokenInfo(safeFrom, safeTo, surface.ifBlank { "_" })
+ }
+
+ fun tokensInRange(from: Int, to: Int): List<TokenInfo> = // tokens lying entirely inside [from, to]
+ tokenInfos.filter { it.from >= from && it.to <= to }
+
+ fun escapeParens(value: String): String = // literal parentheses become -LRB-/-RRB- placeholders in the bracketed output
+ value.replace("(", "-LRB-").replace(")", "-RRB-")
+
+ val comments = mutableMapOf<Int, String>() // sentence index -> rendered tree(s)
+
+ trees.forEach { tree ->
+ if (tree.nodes.isEmpty()) return@forEach
+
+ val nodeById = tree.nodes.associateBy { it.id }
+ val referencedNodeIds = tree.nodes.flatMap { node -> // ids that some node lists as a child
+ node.children.mapNotNull { child ->
+ when (child) {
+ is ConstituencyParserBridge.ConstituencyChild.NodeRef -> child.targetId
+ is ConstituencyParserBridge.ConstituencyChild.MorphoRef -> child.morphoId.takeIf { nodeById.containsKey(it) } // counted only if the id names a node of this tree
+ }
+ }
+ }.toSet()
+ val rootNode = tree.nodes.firstOrNull { it.id !in referencedNodeIds } ?: tree.nodes.first() // first unreferenced node; falls back to the first node
+ val visited = mutableSetOf<String>() // ids already rendered; a repeated id is skipped, which also stops cycles
+ val sentenceIdx = sentencesArr
+ ?.indexOfFirst { rootNode.from >= it.from && rootNode.to <= it.to } // first sentence whose span contains the root span
+ ?.takeIf { it >= 0 }
+ ?: 0 // no containing sentence (or no sentences at all): tree is attributed to index 0
+ val sentenceTokens = sentencesArr
+ ?.getOrNull(sentenceIdx)
+ ?.let { sentSpan -> tokensInRange(sentSpan.from, sentSpan.to) }
+ ?: tokenInfos // without a sentence span, all tokens of the text are used
+ var sentenceTokenCursor = 0 // index of the next sentence token handed to a MorphoRef leaf
+
+ fun render(node: ConstituencyParserBridge.ConstituencyNode): String? { // bracketed string for node, or null if it was already rendered
+ if (!visited.add(node.id)) return null
+
+ val childStrings = node.children.mapNotNull { child ->
+ when (child) {
+ is ConstituencyParserBridge.ConstituencyChild.NodeRef -> {
+ val childNode = nodeById[child.targetId] ?: return@mapNotNull null // dangling reference is dropped
+ render(childNode)
+ }
+ is ConstituencyParserBridge.ConstituencyChild.MorphoRef -> {
+ val nextToken = sentenceTokens.getOrNull(sentenceTokenCursor++) // leaves take sentence tokens in encounter order, not by morpho id
+ ?: return@mapNotNull null // sentence tokens exhausted: leaf is dropped
+ val tokenText = escapeParens(nextToken.surface)
+ val label = escapeParens(nodeById[child.morphoId]?.label ?: "TOK") // label of the node with this id, else "TOK"
+ if (label == "TOK") tokenText else "($label $tokenText)" // "TOK" leaves are printed as the bare token
+ }
+ }
+ }.filter { it.isNotBlank() }
+
+ val label = escapeParens(node.label.ifBlank { "ROOT" }) // blank labels are printed as "ROOT"
+ if (childStrings.isEmpty()) { // nothing rendered below: fall back to the tokens inside the node span
+ val fallbackTokens = tokensInRange(node.from, node.to)
+ return if (fallbackTokens.isNotEmpty()) {
+ "($label ${fallbackTokens.joinToString(" ") { escapeParens(it.surface) }})"
+ } else {
+ "($label)"
+ }
+ }
+
+ return "($label ${childStrings.joinToString(" ")})"
+ }
+
+ val rendered = render(rootNode) ?: return@forEach
+
+ comments.merge(sentenceIdx, rendered) { old, new -> "$old | $new" } // trees mapped to the same sentence are joined with " | "
+ }
+
+ return comments
+ }
+
private fun lmTrainingOutput(docId: String): StringBuilder {
var token_index = 0
var real_token_index = 0
@@ -2807,7 +2910,7 @@
val from: Int,
val to: Int,
val label: String,
- val children: MutableList<String> = mutableListOf()
+ val children: MutableList<ConstituencyParserBridge.ConstituencyChild> = mutableListOf()
)
private fun collectSentences(docId: String, foundry: String, spans: NodeList) {
@@ -2874,13 +2977,14 @@
if (rel.getAttribute("label") != "dominates") continue
val target = rel.getAttribute("target")
if (!target.isNullOrBlank()) {
- node.children.add(target)
+ node.children.add(ConstituencyParserBridge.ConstituencyChild.NodeRef(target))
nonRootIds.add(target)
} else {
val uri = rel.getAttribute("uri")
if (!uri.isNullOrBlank()) {
val normalized = uri.removePrefix("morpho.xml#")
if (normalized.isNotBlank()) {
+ node.children.add(ConstituencyParserBridge.ConstituencyChild.MorphoRef(normalized))
nonRootIds.add(normalized)
}
}
@@ -2912,8 +3016,10 @@
attributes = emptyMap()
)
)
- node.children.forEach { childId ->
- traverse(childId, depth + 1)
+ node.children.forEach { child ->
+ if (child is ConstituencyParserBridge.ConstituencyChild.NodeRef) {
+ traverse(child.targetId, depth + 1)
+ }
}
}
@@ -2921,6 +3027,70 @@
rootIds.forEach { traverse(it, 0) }
textData.constituencyCollectedByFoundry.add(foundry)
}
+
+ // Also cache constituency trees for downstream outputs (e.g., CoNLL-U comments)
+ if (nodesById.isNotEmpty()) { // only when at least one node was collected
+ val nodeRefTargets = nodesById.values // ids that appear as the target of some NodeRef
+ .flatMap { node -> node.children.mapNotNull { child -> (child as? ConstituencyParserBridge.ConstituencyChild.NodeRef)?.targetId } }
+ .toMutableSet()
+ nodesById.values.forEach { node ->
+ node.children.forEach { child ->
+ if (child is ConstituencyParserBridge.ConstituencyChild.MorphoRef && nodesById.containsKey(child.morphoId)) { // a MorphoRef counts only if its id names a collected node
+ nodeRefTargets.add(child.morphoId)
+ }
+ }
+ }
+ val rootIds = nodesById.keys.filter { it !in nodeRefTargets } // roots are nodes never referenced as a child
+
+ val trees = mutableListOf<ConstituencyParserBridge.ConstituencyTree>()
+
+ fun copySubtree(nodeId: String, visited: MutableSet<String>): ConstituencyParserBridge.ConstituencyNode? { // copies ONE node with a fresh children list; descent happens in collect() below
+ if (!visited.add(nodeId)) return null // already copied for this root
+ val source = nodesById[nodeId] ?: return null // unknown id
+ val copiedChildren = mutableListOf<ConstituencyParserBridge.ConstituencyChild>()
+ source.children.forEach { child ->
+ when (child) { // both kinds of child reference are carried over unchanged
+ is ConstituencyParserBridge.ConstituencyChild.NodeRef -> copiedChildren.add(child)
+ is ConstituencyParserBridge.ConstituencyChild.MorphoRef -> copiedChildren.add(child)
+ }
+ }
+ return ConstituencyParserBridge.ConstituencyNode(
+ id = source.id,
+ label = source.label,
+ from = source.from,
+ to = source.to,
+ children = copiedChildren
+ )
+ }
+
+ rootIds.forEachIndexed { idx, rootId -> // one tree per root
+ val visited = mutableSetOf<String>() // visited set is per root
+ val collectedNodes = mutableListOf<ConstituencyParserBridge.ConstituencyNode>()
+
+ fun collect(nodeId: String) { // depth-first, parent before children, following NodeRef children only
+ val node = copySubtree(nodeId, visited) ?: return
+ collectedNodes.add(node)
+ node.children.forEach { child ->
+ if (child is ConstituencyParserBridge.ConstituencyChild.NodeRef) {
+ collect(child.targetId)
+ }
+ }
+ }
+
+ collect(rootId)
+
+ if (collectedNodes.isNotEmpty()) {
+ val sentenceId = rootId.substringBefore("_n").takeIf { it.isNotBlank() } ?: "s${idx + 1}" // root id up to "_n"; "s<idx+1>" only when that prefix is blank
+ trees.add(ConstituencyParserBridge.ConstituencyTree(sentenceId = sentenceId, nodes = collectedNodes))
+ }
+ }
+
+ if (trees.isNotEmpty()) {
+ constituencyTrees.compute(docId) { _, existing -> // an existing non-empty entry for docId is kept; otherwise the new trees are stored
+ existing?.takeIf { it.isNotEmpty() } ?: trees
+ }
+ }
+ }
}
// Collect rich metadata from header.xml for krill format
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
index ba9d375..0a322aa 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
@@ -913,6 +913,25 @@
}
}
+ @Test
+ fun conlluIncludesConstituencyCommentsWhenAvailable() { // checks that the output for wud24Corenlp contains "# constituency =" comment lines
+ outContent.reset() // presumably the captured stdout buffer (read back below) - confirm against test setup
+ errContent.reset() // presumably the captured stderr buffer - confirm against test setup
+
+ val args = arrayOf(wud24Corenlp) // NOTE(review): no format option passed; relies on CoNLL-U being the default output - confirm
+ val exitCode = debug(args)
+ assertEquals(0, exitCode, "CoNLL-U conversion should succeed when constituency annotations are present")
+
+ val output = outContent.toString("UTF-8")
+ val constituencyLines = output.lineSequence().filter { it.startsWith("# constituency =") }.toList() // all constituency comment lines in the output
+
+ assertTrue(constituencyLines.isNotEmpty(), "CoNLL-U output should include constituency comment lines")
+ assertTrue(
+ constituencyLines.first().contains("("), // only the first comment line is checked for a bracket
+ "Constituency comment should contain bracketed structure"
+ )
+ }
+
private fun readKrillJson(tarFile: File): Map<String, String> {
val extractDir = File.createTempFile("krill_extract", "").let {
it.delete()