Add dependency output to CoNLL-U
Change-Id: If4e1fda5c0466495b327247f6fd21b94ff33ff7d
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index f7b2c0f..f335ae7 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -806,7 +806,7 @@
}
try {
- if (zipEntry.name.matches(Regex(".*(data|tokens|structure|morpho)\\.xml$"))) {
+ if (zipEntry.name.matches(Regex(".*(data|tokens|structure|morpho|dependency)\\.xml$"))) {
// Ensure the entry stream and reader are closed to avoid native memory buildup
val dbFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance()
val dBuilder: DocumentBuilder = dbFactory.newDocumentBuilder()
@@ -862,9 +862,59 @@
waitForMorpho = true
fnames[docId] = zipEntry.name
val fsSpans: NodeList = doc.getElementsByTagName("span")
- morpho[docId] = extractMorphoSpans(fsSpans)
+ val morphoSpans = extractMorphoSpans(fsSpans)
+
+ // Merge with existing morpho data (e.g., from dependency.xml)
+ // instead of replacing it
+ if (morpho[docId] == null) {
+ morpho[docId] = morphoSpans
+ } else {
+ // Merge: add morpho data while preserving existing dependency data
+ morphoSpans.forEach { (key, mfs) ->
+ val existing = morpho[docId]?.get(key)
+ if (existing != null) {
+ // Preserve head and deprel from existing (dependency.xml)
+ mfs.head = existing.head
+ mfs.deprel = existing.deprel
+ }
+ morpho[docId]!![key] = mfs
+ }
+ LOGGER.fine("Merged morpho.xml with existing data for $docId (preserved ${morpho[docId]!!.count { it.value.head != "_" }} dependency relations)")
+ }
tokens[docId] = extractSpans(fsSpans)
}
+
+ "dependency.xml" -> {
+ LOGGER.info("Processing dependency.xml for $docId from ${zipEntry.name}")
+ val depSpans: NodeList = doc.getElementsByTagName("span")
+ LOGGER.info("Found ${depSpans.length} spans in dependency.xml")
+ val depMap = extractDependencySpans(depSpans)
+ LOGGER.info("Extracted ${depMap.size} dependency relations")
+
+ // Merge dependency info into existing morpho data
+ // Note: heads are stored as offsets (e.g., "100-110") and will be resolved
+ // to token indices later during CoNLL-U output
+ if (morpho[docId] == null) {
+ morpho[docId] = mutableMapOf()
+ LOGGER.info("Created new morpho map for $docId")
+ }
+ var mergedCount = 0
+ var newCount = 0
+ depMap.forEach { (key, depSpan) ->
+ val existing = morpho[docId]?.get(key)
+ if (existing != null) {
+ // Update existing morpho with dependency info (head is still offset-based)
+ existing.head = depSpan.head
+ existing.deprel = depSpan.deprel
+ mergedCount++
+ } else {
+ // Create new entry with just dependency info
+ morpho[docId]!![key] = depSpan
+ newCount++
+ }
+ }
+ LOGGER.info("Dependency merge complete: $mergedCount merged, $newCount new entries (heads will be resolved during output)")
+ }
}
val morphoRequired = waitForMorpho || useLemma || taggerName != null || parserName != null || (outputFormat == OutputFormat.KORAPXML && annotationWorkerPool == null)
@@ -1224,6 +1274,35 @@
if (tokensArr == null || tokensArr.isEmpty()) {
return output
}
+
+ // Build offset-to-index mapping for resolving dependency heads
+ val offsetToIndex = mutableMapOf<String, Int>()
+ tokensArr.forEachIndexed { index, span ->
+ offsetToIndex["${span.from}-${span.to}"] = index + 1 // CoNLL-U is 1-indexed
+ }
+
+ // Resolve offset-based heads to token indices
+ if (morpho[docId] != null) {
+ var resolvedCount = 0
+ morpho[docId]!!.forEach { (key, mfs) ->
+ if (mfs.head != null && mfs.head != "_" && mfs.head!!.contains("-")) {
+ // This is an offset-based head, resolve it
+ val resolvedIndex = offsetToIndex[mfs.head]
+ if (resolvedIndex != null) {
+ mfs.head = resolvedIndex.toString()
+ resolvedCount++
+ } else {
+ // Could not resolve, set to root
+ LOGGER.fine("Could not resolve head offset ${mfs.head} for token $key in $docId, setting to 0 (root)")
+ mfs.head = "0"
+ }
+ }
+ }
+ if (resolvedCount > 0) {
+ LOGGER.fine("Resolved $resolvedCount offset-based heads to token indices for $docId")
+ }
+ }
+
val textVal = texts[docId]
tokensArr.forEach { span ->
token_index++
@@ -1552,6 +1631,44 @@
return res
}
+    /**
+     * Extracts dependency relations from the `<span>` nodes of a dependency layer.
+     *
+     * For each element that has a `<rel>` descendant, the first `<rel>` supplies the
+     * relation name (its `label` attribute) and the first `<span>` nested inside it
+     * supplies the head as a "from-to" offset string. Heads stay offset-based here;
+     * they are mapped to token indices later, when CoNLL-U output is generated.
+     *
+     * @param depSpans nodes to scan; non-element nodes and elements without `<rel>` are skipped
+     * @return map from the dependent's "from-to" offsets to a [MorphoSpan] built with only
+     *   `head` and `deprel`, with "_" standing in for a value that is absent
+     */
+    private fun extractDependencySpans(
+        depSpans: NodeList
+    ): MutableMap<String, MorphoSpan> {
+        val res: MutableMap<String, MorphoSpan> = HashMap()
+        for (i in 0 until depSpans.length) {
+            val node = depSpans.item(i) as? Element ?: continue
+            val rel = node.getElementsByTagName("rel").item(0) as? Element ?: continue
+
+            // Element.getAttribute returns "" (never null) for a missing attribute, so an
+            // absent label must be detected by emptiness; otherwise an empty DEPREL
+            // would be stored instead of the "_" placeholder.
+            val deprel = rel.getAttribute("label").takeIf { it.isNotEmpty() }
+
+            // The head is encoded as the first <span> nested inside <rel>.
+            val head = (rel.getElementsByTagName("span").item(0) as? Element)
+                ?.let { headSpan -> "${headSpan.getAttribute("from")}-${headSpan.getAttribute("to")}" }
+
+            // Nothing worth recording when both the head and the relation are absent.
+            if (head == null && deprel == null) continue
+
+            val fromTo = "${node.getAttribute("from")}-${node.getAttribute("to")}"
+            res[fromTo] = MorphoSpan(head = head ?: "_", deprel = deprel ?: "_")
+        }
+        return res
+    }
+
private fun extractSentenceSpans(spans: NodeList): Array<Span> {
return IntStream.range(0, spans.length).mapToObj(spans::item)
.filter { node -> node is Element && node.getElementsByTagName("f").item(0).textContent.equals("s") }
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
index 2e49fc5..ec8f5b1 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
@@ -18,6 +18,7 @@
private val originalErr: PrintStream = System.err
val goe = loadResource("goe.zip").path
+ val goeSpacy = loadResource("goe.spacy.zip").path
val goeMarmot = loadResource("goe.marmot.zip").path
val goeTreeTagger = loadResource("goe.tree_tagger.zip").path
val zca20scrambled = loadResource("zca20-scrambled.zip").path
@@ -352,4 +353,72 @@
assertTrue(rc != 0)
assertContains(errContent.toString(), "--sequential is supported only with -f word2vec or -f now")
}
+
+    /**
+     * Runs the tool on the goe.spacy.zip test archive and verifies that the HEAD and
+     * DEPREL columns of the CoNLL-U output are filled for most tokens.
+     *
+     * Thresholds are below 100%, so a few tokens without dependencies are tolerated.
+     */
+    @Test
+    fun dependencyColumnsArePopulatedFromSpacyZip() {
+        val args = arrayOf(goeSpacy)
+        debug(args)
+        val out = outContent.toString()
+
+        // Check that output is CoNLL-U format
+        assertContains(out, "# foundry = spacy")
+        assertContains(out, "# text_id = GOE_AGA.00000")
+
+        // Get data lines (non-comment, non-empty)
+        val dataLines = out.lines()
+            .filter { !it.startsWith("#") && it.isNotBlank() }
+
+        assertTrue(dataLines.isNotEmpty(), "Should have data lines in output")
+
+        // CoNLL-U fields are separated by single TABs and FORM/LEMMA may contain spaces,
+        // so split on tabs only; splitting on arbitrary whitespace could shift columns.
+        val tokenRows = dataLines
+            .map { it.split("\t") }
+            .filter { it.size >= 8 }
+
+        // Column 7 (index 6) is HEAD, column 8 (index 7) is DEPREL
+        val totalTokens = tokenRows.size
+        val tokensWithHead = tokenRows.count { it[6] != "_" }
+        val tokensWithDeprel = tokenRows.count { it[7] != "_" }
+
+        // Assert that we have tokens (this also guards the divisions below)
+        assertTrue(totalTokens > 0, "Should have parsed at least some tokens")
+
+        // Print diagnostic information
+        System.err.println("=== Dependency Test Diagnostics ===")
+        System.err.println("Total tokens: $totalTokens")
+        System.err.println("Tokens with HEAD (!= '_'): $tokensWithHead")
+        System.err.println("Tokens with DEPREL (!= '_'): $tokensWithDeprel")
+        System.err.println("First 5 data lines:")
+        dataLines.take(5).forEach { System.err.println(" $it") }
+
+        // Assert that HEAD column (col 7) is populated for most tokens:
+        // more than 80% of tokens must carry a head
+        val headCoverage = (tokensWithHead.toDouble() / totalTokens) * 100
+        assertTrue(
+            headCoverage > 80.0,
+            "HEAD column should be populated for most tokens. Found: $tokensWithHead/$totalTokens (${headCoverage}%)"
+        )
+
+        // Assert that DEPREL column (col 8) is populated for most tokens:
+        // more than 85% of tokens must carry a relation label
+        val deprelCoverage = (tokensWithDeprel.toDouble() / totalTokens) * 100
+        assertTrue(
+            deprelCoverage > 85.0,
+            "DEPREL column should be populated for most tokens. Found: $tokensWithDeprel/$totalTokens (${deprelCoverage}%)"
+        )
+
+        // Require at least one token line whose HEAD column holds a resolved,
+        // numeric token index (i.e. neither "_" nor an unresolved offset pair)
+        assertTrue(
+            out.contains(Regex("\\n\\d+\\t\\S+\\t\\S+\\t\\S+\\t\\S+\\t\\S+\\t\\d+\\t\\S+\\t")),
+            "Should find tokens with numeric HEAD values in column 7"
+        )
+    }
}
diff --git a/app/src/test/resources/goe.spacy.zip b/app/src/test/resources/goe.spacy.zip
new file mode 100644
index 0000000..b934e51
--- /dev/null
+++ b/app/src/test/resources/goe.spacy.zip
Binary files differ