Fixed sparse annotations being assigned to wrong tokens

Resolves #23

Change-Id: I1019a22ce2b09a74d3a9ac809866dd616015f5fc

commit: a22faa68aed357c36ce235c41aadfec47a5199b8 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Thu Dec 18 10:56:05 2025 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Thu Dec 18 16:07:46 2025 +0100
tree: 44a50b18d3f35f99eab49888c49617ca6a574eb4
parent: 0eefedb3251bee8af9a957716a3c8a3502dc6592 [diff]
diff --git a/.gitignore b/.gitignore
index 8517aff..1618857 100644
--- a/.gitignore
+++ b/.gitignore

@@ -14,3 +14,4 @@
 *.md
 !Readme.md
 !ChangeLog.md
+!CHANGELOG.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a510e0d..71e23de 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md

@@ -1,5 +1,10 @@
 # Changelog
 
+## [Unreleased]
+
+### Fixed
+- `conllu2korapxml`: Fixed sparse annotations being assigned to wrong tokens by correctly handling the CoNLL-U ID column ([#23](https://github.com/KorAP/korapxmltool/issues/23))
+
 ## [v3.1.1] - 2025-12-17
 
 ### Fixed

diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 6d01807..a7b9dec 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt

@@ -1001,7 +1001,8 @@
 
     var dbFactory: DocumentBuilderFactory? = null
     var dBuilder: DocumentBuilder? = null
-    var morphoZipOutputStream: ZipArchiveOutputStream? = null
+    // Output stream for the "morpho" layer ZIP entries
+    internal var morphoZipOutputStream: ZipArchiveOutputStream? = null
     var krillTarOutputStream: TarArchiveOutputStream? = null
     var krillOutputFileName: String? = null
     private var krillOutputPath: String? = null
@@ -3965,7 +3966,7 @@
         var misc: String? = "_"
     )
 
-    private fun parseAndWriteAnnotatedConllu(annotatedConllu: String, task: AnnotationWorkerPool.AnnotationTask?) {
+    internal fun parseAndWriteAnnotatedConllu(annotatedConllu: String, task: AnnotationWorkerPool.AnnotationTask?) {
         LOGGER.fine("parseAndWriteAnnotatedConllu called with ${annotatedConllu.length} chars, task=$task")
 
         val docId = task?.docId
@@ -4025,6 +4026,9 @@
                     val fields = line.split("\t")
                     if (fields.size < 10) continue
 
+                    val idStr = fields[0]
+                    val id = idStr.toIntOrNull()
+
                     val lemma = if (fields.size > 2) fields[2] else "_"
                     val upos = if (fields.size > 3) fields[3] else "_"
                     val xpos = if (fields.size > 4) fields[4] else "_"
@@ -4034,17 +4038,21 @@
                     val deps = if (fields.size > 8) fields[8] else "_"
                     val misc = if (fields.size > 9) fields[9] else "_"
 
-                if (currentStartOffsets != null && currentEndOffsets != null &&
-                    tokenIndexInSentence < currentStartOffsets.size &&
-                    tokenIndexInSentence < currentEndOffsets.size) {
-
-                    val spanFrom = currentStartOffsets[tokenIndexInSentence]
-                    val spanTo = currentEndOffsets[tokenIndexInSentence]
-                    val spanKey = "$spanFrom-$spanTo"
-
-                    morphoSpans[spanKey] = MorphoSpan(lemma, upos, xpos, feats, head, deprel, deps, misc)
-                    tokenIndexInSentence++
-                }
+                    if (id != null) {
+                        val tokenIndex = id - 1
+                        
+                        if (currentStartOffsets != null && currentEndOffsets != null &&
+                            tokenIndex >= 0 &&
+                            tokenIndex < currentStartOffsets.size &&
+                            tokenIndex < currentEndOffsets.size) {
+        
+                            val spanFrom = currentStartOffsets[tokenIndex]
+                            val spanTo = currentEndOffsets[tokenIndex]
+                            val spanKey = "$spanFrom-$spanTo"
+        
+                            morphoSpans[spanKey] = MorphoSpan(lemma, upos, xpos, feats, head, deprel, deps, misc)
+                        }
+                    }
                 }
             }
         }
@@ -4325,29 +4333,45 @@
                             val fields = line.split("\t")
                             if (fields.size < 10) continue
 
-                            val lemma = if (fields.size > 2) fields[2] else "_"
-                            val upos = if (fields.size > 3) fields[3] else "_"
-                            val xpos = if (fields.size > 4) fields[4] else "_"
-                            val feats = if (fields.size > 5) fields[5] else "_"
-                            val head = if (fields.size > 6) fields[6] else "_"
-                            val deprel = if (fields.size > 7) fields[7] else "_"
-                            val deps = if (fields.size > 8) fields[8] else "_"
-                            val misc = if (fields.size > 9) fields[9] else "_"
+                            val idStr = fields[0]
+                            val rangeMatch = Regex("^([0-9]+)-([0-9]+)$").find(idStr)
+                            
+                            // A range ID (e.g. "1-2") denotes a multiword token (MWT) whose surface form
+                            // covers the following token lines. The offsets lists are expected to map
+                            // 1-to-1 to single tokens, so range IDs are not annotated: toIntOrNull() below
+                            // returns null for them and the line is skipped. Only single integer IDs are processed.
+                            // (Supporting MWT annotations would require mapping the range to the start of its first and the end of its last token.)
+                            
+                            val id = idStr.toIntOrNull()
+                            
+                            if (id != null) {
+                                // CoNLL-U IDs are 1-based, offsets lists are 0-based relative to tokens
+                                val tokenIndex = id - 1
+                                
+                                val lemma = if (fields.size > 2) fields[2] else "_"
+                                val upos = if (fields.size > 3) fields[3] else "_"
+                                val xpos = if (fields.size > 4) fields[4] else "_"
+                                val feats = if (fields.size > 5) fields[5] else "_"
+                                val head = if (fields.size > 6) fields[6] else "_"
+                                val deprel = if (fields.size > 7) fields[7] else "_"
+                                val deps = if (fields.size > 8) fields[8] else "_"
+                                val misc = if (fields.size > 9) fields[9] else "_"
 
-                            if (currentStartOffsets == null || currentEndOffsets == null) {
-                                LOGGER.severe("Token found before offset comments in text ${doc.textId}")
-                                throw IllegalArgumentException("CoNLL-U format error: tokens found before offset comments in text ${doc.textId}")
-                            }
+                                if (currentStartOffsets == null || currentEndOffsets == null) {
+                                    LOGGER.severe("Token found before offset comments in text ${doc.textId}")
+                                    throw IllegalArgumentException("CoNLL-U format error: tokens found before offset comments in text ${doc.textId}")
+                                }
 
-                            if (tokenIndexInSentence < currentStartOffsets.size &&
-                                tokenIndexInSentence < currentEndOffsets.size) {
+                                if (tokenIndex >= 0 && 
+                                    tokenIndex < currentStartOffsets.size &&
+                                    tokenIndex < currentEndOffsets.size) {
 
-                                val spanFrom = currentStartOffsets[tokenIndexInSentence]
-                                val spanTo = currentEndOffsets[tokenIndexInSentence]
-                                val spanKey = "$spanFrom-$spanTo"
+                                    val spanFrom = currentStartOffsets[tokenIndex]
+                                    val spanTo = currentEndOffsets[tokenIndex]
+                                    val spanKey = "$spanFrom-$spanTo"
 
-                                morphoSpans[spanKey] = MorphoSpan(lemma, upos, xpos, feats, head, deprel, deps, misc)
-                                tokenIndexInSentence++
+                                    morphoSpans[spanKey] = MorphoSpan(lemma, upos, xpos, feats, head, deprel, deps, misc)
+                                }
                             }
                         }
                     }

diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/ConlluConversionTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/ConlluConversionTest.kt
index c623d6b..fc6f806 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/ConlluConversionTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/ConlluConversionTest.kt

@@ -416,4 +416,46 @@
             "Filename should match pattern '*/spacy/morpho.xml', but was: $filenameLine"
         )
     }
+    @Test
+    fun sparseAnnotationRespectsTokenIds() {
+        val outputDir = createTempDir("conllu_sparse")
+        try {
+            // Create CoNLL-U with sparse annotation (only for token 7)
+            val sparseConllu = File(outputDir, "sparse.conllu")
+            sparseConllu.writeText("""
+                # foundry = cmc
+                # filename = NDY/115/005255/base/tokens.xml
+                # text_id = NDY_115.005255
+                # start_offsets = 0 0 4 11 18 22 27 32 35 41 46 50 56 64
+                # end_offsets = 65 3 10 17 21 26 31 34 40 45 49 55 64 65
+                7	:)	_	_	EMOASC	_	_	_	_	_
+                
+            """.trimIndent())
+            
+            val outputZip = File(outputDir, "output.zip")
+            val args = arrayOf(
+                "-t", "zip",
+                "-o", outputZip.path,
+                sparseConllu.path
+            )
+            val exitCode = debug(args)
+            assertEquals(0, exitCode, "Sparse conversion should succeed")
+            
+            // Extract morpho.xml
+            val morphoXml = extractFileFromZip(outputZip, "NDY/115/005255/cmc/morpho.xml")
+            
+            // Verify that the annotation is on the correct span (32-34)
+            // Offset for ID 7 is start=32 (index 7), end=34 (index 7)
+            // Note: Attribute order is not guaranteed, so check for attributes individually
+            assertTrue(
+                morphoXml.contains("""from="32"""") && morphoXml.contains("""to="34""""),
+                "Annotation should be on span 32-34 (ID 7), but morpho.xml content was:\n$morphoXml"
+            )
+            
+            // Verify the content of the annotation
+            assertTrue(morphoXml.contains(">EMOASC<"), "Should contain the annotation EMOASC")
+        } finally {
+            outputDir.deleteRecursively()
+        }
+    }
 }
\ No newline at end of file

diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/SparseAnnotationExternalTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/SparseAnnotationExternalTest.kt
new file mode 100644
index 0000000..f6fd655
--- /dev/null
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/SparseAnnotationExternalTest.kt

@@ -0,0 +1,86 @@
+package de.ids_mannheim.korapxmltools
+
+import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream
+import org.junit.Test
+import java.io.File
+import java.io.FileOutputStream
+import kotlin.test.assertEquals
+import kotlin.test.assertTrue
+
+class SparseAnnotationExternalTest {
+
+    private fun extractFileFromZip(zipFile: File, regex: Regex): String? {
+        val zip = org.apache.commons.compress.archivers.zip.ZipFile(zipFile)
+        val entry = zip.entries.asSequence().firstOrNull { regex.matches(it.name) }
+        return if (entry != null) {
+            zip.getInputStream(entry).bufferedReader().use { it.readText() }
+        } else {
+            null
+        }
+    }
+
+    @Test
+    fun sparseAnnotationRespectsTokenIdsWithExternalTool() {
+        val outputDir = createTempDir("conllu_sparse_external")
+        try {
+            val outputZip = File(outputDir, "output.zip")
+            val tool = KorapXmlTool()
+            
+            // Setup internal state
+            tool.morphoZipOutputStream = ZipArchiveOutputStream(FileOutputStream(outputZip))
+            tool.tokenSeparator = "\n"
+            
+            // Create a fake task
+            val task = AnnotationWorkerPool.AnnotationTask(
+                text = "", // Not used in parseAndWriteAnnotatedConllu logic concerning parsing content
+                docId = "NDY_115.005255", 
+                entryPath = "NDY/115/005255|cmc" // path|foundry
+            )
+            
+            // Valid sparse CONLL-U content (same as in ConlluConversionTest)
+            // Token 7 (0-based token index 6, or index 7 when indexing the raw offset lists directly)
+            // Offsets list has 14 items.
+            // ID 7: 32-34
+            val annotatedConllu = """
+                # foundry = cmc
+                # filename = NDY/115/005255/base/tokens.xml
+                # text_id = NDY_115.005255
+                # start_offsets = 0 0 4 11 18 22 27 32 35 41 46 50 56 64
+                # end_offsets = 65 3 10 17 21 26 31 34 40 45 49 55 64 65
+                7	:)	_	_	EMOASC	_	_	_	_	_
+                
+            """.trimIndent()
+
+            // Invoke the internal method
+            tool.parseAndWriteAnnotatedConllu(annotatedConllu, task)
+            
+            // Close stream to flush to disk
+            tool.morphoZipOutputStream?.close()
+            
+            // Extract morpho.xml
+            // Path structure: NDY/115/005255/cmc/morpho.xml
+            val morphoXml = extractFileFromZip(outputZip, Regex(".*cmc/morpho.xml"))
+            
+            assertTrue(morphoXml != null, "morpho.xml should exist locally")
+            
+            // Verify that the annotation is on the correct span (32-34)
+            // Note: Attribute order is not guaranteed, so check for attributes individually
+            assertTrue(
+                morphoXml!!.contains("""from="32"""") && morphoXml.contains("""to="34""""),
+                "Annotation should be on span 32-34 (ID 7), but morpho.xml content was:\n$morphoXml"
+            )
+            
+            // Verify the content of the annotation
+            assertTrue(morphoXml.contains(">EMOASC<"), "Should contain the annotation EMOASC")
+        } finally {
+            outputDir.deleteRecursively()
+        }
+    }
+    
+    // Local helper: the stdlib createTempDir (kotlin.io) is deprecated, so the directory is created via java.nio.file.Files
+    private fun createTempDir(prefix: String): File {
+        val f = java.nio.file.Files.createTempDirectory(prefix).toFile()
+        f.deleteOnExit()
+        return f
+    }
+}
commit	a22faa68aed357c36ce235c41aadfec47a5199b8	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Dec 18 10:56:05 2025 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Dec 18 16:07:46 2025 +0100
tree	44a50b18d3f35f99eab49888c49617ca6a574eb4
parent	0eefedb3251bee8af9a957716a3c8a3502dc6592 [diff]