Add dependency output to CoNLL-U

Parse dependency.xml layers (e.g. from a spacy foundry) and merge the
extracted HEAD/DEPREL information into the per-document morpho data,
preserving it when morpho.xml is processed afterwards. Heads are first
stored as character-offset keys ("from-to") and resolved to 1-based
token indices when the CoNLL-U output is generated; unresolvable heads
fall back to 0 (root). Adds goe.spacy.zip as a test resource together
with a test that checks HEAD/DEPREL coverage of the generated CoNLL-U.

Change-Id: If4e1fda5c0466495b327247f6fd21b94ff33ff7d
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index f7b2c0f..f335ae7 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -806,7 +806,7 @@
         }
 
         try {
-            if (zipEntry.name.matches(Regex(".*(data|tokens|structure|morpho)\\.xml$"))) {
+            if (zipEntry.name.matches(Regex(".*(data|tokens|structure|morpho|dependency)\\.xml$"))) {
                 // Ensure the entry stream and reader are closed to avoid native memory buildup
                 val dbFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance()
                 val dBuilder: DocumentBuilder = dbFactory.newDocumentBuilder()
@@ -862,9 +862,59 @@
                         waitForMorpho = true
                         fnames[docId] = zipEntry.name
                         val fsSpans: NodeList = doc.getElementsByTagName("span")
-                        morpho[docId] = extractMorphoSpans(fsSpans)
+                        val morphoSpans = extractMorphoSpans(fsSpans)
+
+                        // Merge with existing morpho data (e.g., from dependency.xml)
+                        // instead of replacing it
+                        if (morpho[docId] == null) {
+                            morpho[docId] = morphoSpans
+                        } else {
+                            // Merge: add morpho data while preserving existing dependency data
+                            morphoSpans.forEach { (key, mfs) ->
+                                val existing = morpho[docId]?.get(key)
+                                if (existing != null) {
+                                    // Preserve head and deprel from existing (dependency.xml)
+                                    mfs.head = existing.head
+                                    mfs.deprel = existing.deprel
+                                }
+                                morpho[docId]!![key] = mfs
+                            }
+                            LOGGER.fine("Merged morpho.xml with existing data for $docId (preserved ${morpho[docId]!!.count { it.value.head != "_" }} dependency relations)")
+                        }
                         tokens[docId] = extractSpans(fsSpans)
                     }
+
+                    "dependency.xml" -> {
+                        LOGGER.info("Processing dependency.xml for $docId from ${zipEntry.name}")
+                        val depSpans: NodeList = doc.getElementsByTagName("span")
+                        LOGGER.info("Found ${depSpans.length} spans in dependency.xml")
+                        val depMap = extractDependencySpans(depSpans)
+                        LOGGER.info("Extracted ${depMap.size} dependency relations")
+
+                        // Merge dependency info into existing morpho data
+                        // Note: heads are stored as offsets (e.g., "100-110") and will be resolved
+                        // to token indices later during CoNLL-U output
+                        if (morpho[docId] == null) {
+                            morpho[docId] = mutableMapOf()
+                            LOGGER.info("Created new morpho map for $docId")
+                        }
+                        var mergedCount = 0
+                        var newCount = 0
+                        depMap.forEach { (key, depSpan) ->
+                            val existing = morpho[docId]?.get(key)
+                            if (existing != null) {
+                                // Update existing morpho with dependency info (head is still offset-based)
+                                existing.head = depSpan.head
+                                existing.deprel = depSpan.deprel
+                                mergedCount++
+                            } else {
+                                // Create new entry with just dependency info
+                                morpho[docId]!![key] = depSpan
+                                newCount++
+                            }
+                        }
+                        LOGGER.info("Dependency merge complete: $mergedCount merged, $newCount new entries (heads will be resolved during output)")
+                    }
                 }
 
                 val morphoRequired = waitForMorpho || useLemma || taggerName != null || parserName != null || (outputFormat == OutputFormat.KORAPXML && annotationWorkerPool == null)
@@ -1224,6 +1274,35 @@
         if (tokensArr == null || tokensArr.isEmpty()) {
             return output
         }
+
+        // Build offset-to-index mapping for resolving dependency heads
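+        // e.g. a token covering character offsets 100..110 that is the 7th token
+        // of the text ends up as offsetToIndex["100-110"] == 7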
+        val offsetToIndex = mutableMapOf<String, Int>()
+        tokensArr.forEachIndexed { index, span ->
+            offsetToIndex["${span.from}-${span.to}"] = index + 1 // CoNLL-U is 1-indexed
+        }
+
+        // Resolve offset-based heads to token indices
+        if (morpho[docId] != null) {
+            var resolvedCount = 0
+            morpho[docId]!!.forEach { (key, mfs) ->
+                // Capture the mutable property in a local so it can be smart-cast
+                val headOffset = mfs.head
+                if (headOffset != null && headOffset != "_" && headOffset.contains("-")) {
+                    // This is an offset-based head, resolve it to a 1-based token index
+                    val resolvedIndex = offsetToIndex[headOffset]
+                    if (resolvedIndex != null) {
+                        mfs.head = resolvedIndex.toString()
+                        resolvedCount++
+                    } else {
+                        // Could not resolve, set to root
+                        LOGGER.fine("Could not resolve head offset $headOffset for token $key in $docId, setting to 0 (root)")
+                        mfs.head = "0"
+                    }
+                }
+            }
+            if (resolvedCount > 0) {
+                LOGGER.fine("Resolved $resolvedCount offset-based heads to token indices for $docId")
+            }
+        }
+
         val textVal = texts[docId]
         tokensArr.forEach { span ->
             token_index++
@@ -1552,6 +1631,44 @@
         return res
     }
 
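+    /**
+     * Extracts dependency relations from the <span> nodes of a dependency.xml layer.
+     *
+     * Each span is assumed to look roughly like this (simplified sketch, element and
+     * attribute names as consumed by the code below):
+     *
+     *   <span from="100" to="110">
+     *     <rel label="nsubj">
+     *       <span from="120" to="125"/>   <!-- character offsets of the head token -->
+     *     </rel>
+     *   </span>
+     *
+     * The returned map is keyed by the "from-to" offsets of the dependent token; heads
+     * are kept as "from-to" offset strings and only resolved to 1-based token indices
+     * when the CoNLL-U output is generated.
+     */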
+    private fun extractDependencySpans(
+        depSpans: NodeList
+    ): MutableMap<String, MorphoSpan> {
+        val res: MutableMap<String, MorphoSpan> = HashMap()
+        IntStream.range(0, depSpans.length).mapToObj(depSpans::item)
+            .filter { node -> node is Element }
+            .forEach { node ->
+                node as Element
+                val fromTo = "${node.getAttribute("from")}-${node.getAttribute("to")}"
+
+                // Look for <rel> element which contains the dependency relation
+                val relElements = node.getElementsByTagName("rel")
+                if (relElements.length > 0) {
+                    val rel = relElements.item(0) as Element
+                    // getAttribute() returns "" (not null) when the attribute is absent
+                    val deprel: String? = rel.getAttribute("label").ifEmpty { null }
+
+                    // The head is encoded as an inner <span> element with from/to attributes
+                    val innerSpans = rel.getElementsByTagName("span")
+                    var head: String? = null
+                    if (innerSpans.length > 0) {
+                        val innerSpan = innerSpans.item(0) as Element
+                        val headFrom = innerSpan.getAttribute("from")
+                        val headTo = innerSpan.getAttribute("to")
+                        // Store as offset key for now, will need to resolve to token index later
+                        head = "$headFrom-$headTo"
+                    }
+
+                    if (head != null || deprel != null) {
+                        res[fromTo] = MorphoSpan(
+                            head = head ?: "_",
+                            deprel = deprel ?: "_"
+                        )
+                    }
+                }
+            }
+        return res
+    }
+
     private fun extractSentenceSpans(spans: NodeList): Array<Span> {
         return IntStream.range(0, spans.length).mapToObj(spans::item)
             .filter { node -> node is Element && node.getElementsByTagName("f").item(0).textContent.equals("s") }
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
index 2e49fc5..ec8f5b1 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
@@ -18,6 +18,7 @@
     private val originalErr: PrintStream = System.err
 
     val goe = loadResource("goe.zip").path
+    val goeSpacy = loadResource("goe.spacy.zip").path
     val goeMarmot = loadResource("goe.marmot.zip").path
     val goeTreeTagger = loadResource("goe.tree_tagger.zip").path
     val zca20scrambled = loadResource("zca20-scrambled.zip").path
@@ -352,4 +353,72 @@
         assertTrue(rc != 0)
         assertContains(errContent.toString(), "--sequential is supported only with -f word2vec or -f now")
     }
+
+    @Test
+    fun dependencyColumnsArePopulatedFromSpacyZip() {
+        val args = arrayOf(goeSpacy)
+        debug(args)
+        val out = outContent.toString()
+
+        // Check that output is CoNLL-U format
+        assertContains(out, "# foundry = spacy")
+        assertContains(out, "# text_id = GOE_AGA.00000")
+
+        // Get data lines (non-comment, non-empty)
+        val dataLines = out.lines()
+            .filter { !it.startsWith("#") && it.isNotBlank() }
+
+        assertTrue(dataLines.isNotEmpty(), "Should have data lines in output")
+
+        // Parse tokens and check dependency columns (column 7 = HEAD, column 8 = DEPREL)
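+        // Schematic CoNLL-U column layout:
+        // ID  FORM  LEMMA  UPOS  XPOS  FEATS  HEAD  DEPREL  DEPS  MISC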
+        var tokensWithHead = 0
+        var tokensWithDeprel = 0
+        var totalTokens = 0
+
+        for (line in dataLines) {
+            val columns = line.split(Regex("\\s+"))
+            if (columns.size >= 8) {
+                totalTokens++
+                // Column 7 (index 6) is HEAD, column 8 (index 7) is DEPREL
+                val head = columns[6]
+                val deprel = columns[7]
+
+                if (head != "_") tokensWithHead++
+                if (deprel != "_") tokensWithDeprel++
+            }
+        }
+
+        // Assert that we have tokens
+        assertTrue(totalTokens > 0, "Should have parsed at least some tokens")
+
+        // Print diagnostic information
+        System.err.println("=== Dependency Test Diagnostics ===")
+        System.err.println("Total tokens: $totalTokens")
+        System.err.println("Tokens with HEAD (!= '_'): $tokensWithHead")
+        System.err.println("Tokens with DEPREL (!= '_'): $tokensWithDeprel")
+        System.err.println("First 5 data lines:")
+        dataLines.take(5).forEach { System.err.println("  $it") }
+
+        // Assert that HEAD column (col 7) is populated for most tokens
+        // We expect most tokens (> 80%) to have dependency information
+        val headCoverage = (tokensWithHead.toDouble() / totalTokens) * 100
+        assertTrue(
+            headCoverage > 80.0,
+            "HEAD column should be populated for most tokens. Found: $tokensWithHead/$totalTokens (${headCoverage}%)"
+        )
+
+        // Assert that DEPREL column (col 8) is populated for most tokens
+        val deprelCoverage = (tokensWithDeprel.toDouble() / totalTokens) * 100
+        assertTrue(
+            deprelCoverage > 85.0,
+            "DEPREL column should be populated for most tokens. Found: $tokensWithDeprel/$totalTokens (${deprelCoverage}%)"
+        )
+
+        // Check for specific dependency relations and head indices in output
+        // Look for numeric head indices (not "_")
+        assertTrue(
+            out.contains(Regex("\\n\\d+\\t\\S+\\t\\S+\\t\\S+\\t\\S+\\t\\S+\\t\\d+\\t\\S+\\t")),
+            "Should find tokens with numeric HEAD values in column 7"
+        )
+    }
 }
diff --git a/app/src/test/resources/goe.spacy.zip b/app/src/test/resources/goe.spacy.zip
new file mode 100644
index 0000000..b934e51
--- /dev/null
+++ b/app/src/test/resources/goe.spacy.zip
Binary files differ