Fixed sparse annotations being assigned to wrong tokens
Resolves #23
Change-Id: I1019a22ce2b09a74d3a9ac809866dd616015f5fc
diff --git a/.gitignore b/.gitignore
index 8517aff..1618857 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,4 @@
*.md
!Readme.md
!ChangeLog.md
+!CHANGELOG.md
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a510e0d..71e23de 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
# Changelog
+## [Unreleased]
+
+### Fixed
+- `conllu2korapxml`: Fixed sparse annotations being assigned to wrong tokens by correctly handling the CoNLL-U ID column ([#23](https://github.com/KorAP/korapxmltool/issues/23))
+
## [v3.1.1] - 2025-12-17
### Fixed
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 6d01807..a7b9dec 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -1001,7 +1001,8 @@
var dbFactory: DocumentBuilderFactory? = null
var dBuilder: DocumentBuilder? = null
- var morphoZipOutputStream: ZipArchiveOutputStream? = null
+ // Output stream for the "morpho" layer ZIP entries
+ internal var morphoZipOutputStream: ZipArchiveOutputStream? = null
var krillTarOutputStream: TarArchiveOutputStream? = null
var krillOutputFileName: String? = null
private var krillOutputPath: String? = null
@@ -3965,7 +3966,7 @@
var misc: String? = "_"
)
- private fun parseAndWriteAnnotatedConllu(annotatedConllu: String, task: AnnotationWorkerPool.AnnotationTask?) {
+ internal fun parseAndWriteAnnotatedConllu(annotatedConllu: String, task: AnnotationWorkerPool.AnnotationTask?) {
LOGGER.fine("parseAndWriteAnnotatedConllu called with ${annotatedConllu.length} chars, task=$task")
val docId = task?.docId
@@ -4025,6 +4026,9 @@
val fields = line.split("\t")
if (fields.size < 10) continue
+ val idStr = fields[0]
+ val id = idStr.toIntOrNull()
+
val lemma = if (fields.size > 2) fields[2] else "_"
val upos = if (fields.size > 3) fields[3] else "_"
val xpos = if (fields.size > 4) fields[4] else "_"
@@ -4034,17 +4038,21 @@
val deps = if (fields.size > 8) fields[8] else "_"
val misc = if (fields.size > 9) fields[9] else "_"
- if (currentStartOffsets != null && currentEndOffsets != null &&
- tokenIndexInSentence < currentStartOffsets.size &&
- tokenIndexInSentence < currentEndOffsets.size) {
-
- val spanFrom = currentStartOffsets[tokenIndexInSentence]
- val spanTo = currentEndOffsets[tokenIndexInSentence]
- val spanKey = "$spanFrom-$spanTo"
-
- morphoSpans[spanKey] = MorphoSpan(lemma, upos, xpos, feats, head, deprel, deps, misc)
- tokenIndexInSentence++
- }
+ if (id != null) {
+ val tokenIndex = id - 1
+
+ if (currentStartOffsets != null && currentEndOffsets != null &&
+ tokenIndex >= 0 &&
+ tokenIndex < currentStartOffsets.size &&
+ tokenIndex < currentEndOffsets.size) {
+
+ val spanFrom = currentStartOffsets[tokenIndex]
+ val spanTo = currentEndOffsets[tokenIndex]
+ val spanKey = "$spanFrom-$spanTo"
+
+ morphoSpans[spanKey] = MorphoSpan(lemma, upos, xpos, feats, head, deprel, deps, misc)
+ }
+ }
}
}
}
@@ -4325,29 +4333,45 @@
val fields = line.split("\t")
if (fields.size < 10) continue
- val lemma = if (fields.size > 2) fields[2] else "_"
- val upos = if (fields.size > 3) fields[3] else "_"
- val xpos = if (fields.size > 4) fields[4] else "_"
- val feats = if (fields.size > 5) fields[5] else "_"
- val head = if (fields.size > 6) fields[6] else "_"
- val deprel = if (fields.size > 7) fields[7] else "_"
- val deps = if (fields.size > 8) fields[8] else "_"
- val misc = if (fields.size > 9) fields[9] else "_"
+ val idStr = fields[0]
+ val rangeMatch = Regex("^([0-9]+)-([0-9]+)$").find(idStr)
+
+ // A range ID (e.g. 1-2) marks a multiword token (MWT) whose surface form covers the following token lines.
+ // The offsets lists are expected to map 1-to-1 onto tokens, so a range has no entry of its own there.
+ // For now only single integer IDs are processed; toIntOrNull() below returns null for a range, which skips it.
+ // Supporting MWT annotations would require mapping the range to the start of the first and the end of the last covered token.
+ // NOTE(review): rangeMatch above appears to be unused - confirm, then remove it or use it to handle ranges explicitly.
+
+ val id = idStr.toIntOrNull()
+
+ if (id != null) {
+ // CoNLL-U IDs are 1-based, offsets lists are 0-based relative to tokens
+ val tokenIndex = id - 1
+
+ val lemma = if (fields.size > 2) fields[2] else "_"
+ val upos = if (fields.size > 3) fields[3] else "_"
+ val xpos = if (fields.size > 4) fields[4] else "_"
+ val feats = if (fields.size > 5) fields[5] else "_"
+ val head = if (fields.size > 6) fields[6] else "_"
+ val deprel = if (fields.size > 7) fields[7] else "_"
+ val deps = if (fields.size > 8) fields[8] else "_"
+ val misc = if (fields.size > 9) fields[9] else "_"
- if (currentStartOffsets == null || currentEndOffsets == null) {
- LOGGER.severe("Token found before offset comments in text ${doc.textId}")
- throw IllegalArgumentException("CoNLL-U format error: tokens found before offset comments in text ${doc.textId}")
- }
+ if (currentStartOffsets == null || currentEndOffsets == null) {
+ LOGGER.severe("Token found before offset comments in text ${doc.textId}")
+ throw IllegalArgumentException("CoNLL-U format error: tokens found before offset comments in text ${doc.textId}")
+ }
- if (tokenIndexInSentence < currentStartOffsets.size &&
- tokenIndexInSentence < currentEndOffsets.size) {
+ if (tokenIndex >= 0 &&
+ tokenIndex < currentStartOffsets.size &&
+ tokenIndex < currentEndOffsets.size) {
- val spanFrom = currentStartOffsets[tokenIndexInSentence]
- val spanTo = currentEndOffsets[tokenIndexInSentence]
- val spanKey = "$spanFrom-$spanTo"
+ val spanFrom = currentStartOffsets[tokenIndex]
+ val spanTo = currentEndOffsets[tokenIndex]
+ val spanKey = "$spanFrom-$spanTo"
- morphoSpans[spanKey] = MorphoSpan(lemma, upos, xpos, feats, head, deprel, deps, misc)
- tokenIndexInSentence++
+ morphoSpans[spanKey] = MorphoSpan(lemma, upos, xpos, feats, head, deprel, deps, misc)
+ }
}
}
}
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/ConlluConversionTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/ConlluConversionTest.kt
index c623d6b..fc6f806 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/ConlluConversionTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/ConlluConversionTest.kt
@@ -416,4 +416,46 @@
"Filename should match pattern '*/spacy/morpho.xml', but was: $filenameLine"
)
}
+ @Test
+ fun sparseAnnotationRespectsTokenIds() {
+ val outputDir = createTempDir("conllu_sparse")
+ try {
+ // Create CoNLL-U with sparse annotation (only for token 7)
+ val sparseConllu = File(outputDir, "sparse.conllu")
+ sparseConllu.writeText("""
+ # foundry = cmc
+ # filename = NDY/115/005255/base/tokens.xml
+ # text_id = NDY_115.005255
+ # start_offsets = 0 0 4 11 18 22 27 32 35 41 46 50 56 64
+ # end_offsets = 65 3 10 17 21 26 31 34 40 45 49 55 64 65
+ 7 :) _ _ EMOASC _ _ _ _ _
+
+ """.trimIndent())
+
+ val outputZip = File(outputDir, "output.zip")
+ val args = arrayOf(
+ "-t", "zip",
+ "-o", outputZip.path,
+ sparseConllu.path
+ )
+ val exitCode = debug(args)
+ assertEquals(0, exitCode, "Sparse conversion should succeed")
+
+ // Extract morpho.xml
+ val morphoXml = extractFileFromZip(outputZip, "NDY/115/005255/cmc/morpho.xml")
+
+ // Verify that the annotation is on the correct span (32-34)
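+ // ID 7 maps to start=32 / end=34, i.e. entry 7 (0-based) of the raw offset comments; presumably entry 0 (0-65) is the enclosing span, not a token - TODO confirm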
+ // Note: Attribute order is not guaranteed, so check for attributes individually
+ assertTrue(
+ morphoXml.contains("""from="32"""") && morphoXml.contains("""to="34""""),
+ "Annotation should be on span 32-34 (ID 7), but morpho.xml content was:\n$morphoXml"
+ )
+
+ // Verify the content of the annotation
+ assertTrue(morphoXml.contains(">EMOASC<"), "Should contain the annotation EMOASC")
+ } finally {
+ outputDir.deleteRecursively()
+ }
+ }
}
\ No newline at end of file
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/SparseAnnotationExternalTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/SparseAnnotationExternalTest.kt
new file mode 100644
index 0000000..f6fd655
--- /dev/null
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/SparseAnnotationExternalTest.kt
@@ -0,0 +1,86 @@
+package de.ids_mannheim.korapxmltools
+
+import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream
+import org.junit.Test
+import java.io.File
+import java.io.FileOutputStream
+import kotlin.test.assertEquals
+import kotlin.test.assertTrue
+
+class SparseAnnotationExternalTest {
+
+ private fun extractFileFromZip(zipFile: File, regex: Regex): String? {
+ val zip = org.apache.commons.compress.archivers.zip.ZipFile(zipFile)
+ val entry = zip.entries.asSequence().firstOrNull { regex.matches(it.name) }
+ return if (entry != null) {
+ zip.getInputStream(entry).bufferedReader().use { it.readText() }
+ } else {
+ null
+ }
+ }
+
+ @Test
+ fun sparseAnnotationRespectsTokenIdsWithExternalTool() {
+ val outputDir = createTempDir("conllu_sparse_external")
+ try {
+ val outputZip = File(outputDir, "output.zip")
+ val tool = KorapXmlTool()
+
+ // Setup internal state
+ tool.morphoZipOutputStream = ZipArchiveOutputStream(FileOutputStream(outputZip))
+ tool.tokenSeparator = "\n"
+
+ // Create a fake task
+ val task = AnnotationWorkerPool.AnnotationTask(
+ text = "", // Not used in parseAndWriteAnnotatedConllu logic concerning parsing content
+ docId = "NDY_115.005255",
+ entryPath = "NDY/115/005255|cmc" // path|foundry
+ )
+
+ // Valid sparse CONLL-U content (same as in ConlluConversionTest)
+ // Token ID 7 is 1-based: it is index 6 among the tokens, or entry 7 (0-based) when reading the raw offset lists directly.
+ // Each raw offset list has 14 entries.
+ // ID 7: 32-34
+ val annotatedConllu = """
+ # foundry = cmc
+ # filename = NDY/115/005255/base/tokens.xml
+ # text_id = NDY_115.005255
+ # start_offsets = 0 0 4 11 18 22 27 32 35 41 46 50 56 64
+ # end_offsets = 65 3 10 17 21 26 31 34 40 45 49 55 64 65
+ 7 :) _ _ EMOASC _ _ _ _ _
+
+ """.trimIndent()
+
+ // Invoke the internal method
+ tool.parseAndWriteAnnotatedConllu(annotatedConllu, task)
+
+ // Close stream to flush to disk
+ tool.morphoZipOutputStream?.close()
+
+ // Extract morpho.xml
+ // Path structure: NDY/115/005255/cmc/morpho.xml
+ val morphoXml = extractFileFromZip(outputZip, Regex(".*cmc/morpho.xml"))
+
+ assertTrue(morphoXml != null, "morpho.xml should exist locally")
+
+ // Verify that the annotation is on the correct span (32-34)
+ // Note: Attribute order is not guaranteed, so check for attributes individually
+ assertTrue(
+ morphoXml!!.contains("""from="32"""") && morphoXml.contains("""to="34""""),
+ "Annotation should be on span 32-34 (ID 7), but morpho.xml content was:\n$morphoXml"
+ )
+
+ // Verify the content of the annotation
+ assertTrue(morphoXml.contains(">EMOASC<"), "Should contain the annotation EMOASC")
+ } finally {
+ outputDir.deleteRecursively()
+ }
+ }
+
+ // Local helper: createTempDir from kotlin.io is deprecated in the Kotlin stdlib, so the directory is created via java.nio.file.Files instead
+ private fun createTempDir(prefix: String): File {
+ val f = java.nio.file.Files.createTempDirectory(prefix).toFile()
+ f.deleteOnExit()
+ return f
+ }
+}