Fix sentence spans in dependency zip output

Change-Id: I8605d6c0143040d797ba13b8bcaba60d935b1913
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 146ba48..984e0d2 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -1807,21 +1807,32 @@
         var currentStartOffsets: List<Int>? = null
         var currentEndOffsets: List<Int>? = null
         var tokenIndexInSentence = 0
+        val sentenceSpans = mutableListOf<Span>()
+        var sentenceStartOffset: Int? = null
+        var sentenceEndOffset: Int? = null
 
         for (line in lines) {
             when {
                 line.startsWith("# start_offsets =") -> {
                     val offsetsStr = line.substring("# start_offsets =".length).trim()
                     val allOffsets = offsetsStr.split(Regex("\\s+")).mapNotNull { it.toIntOrNull() }
+                    sentenceStartOffset = allOffsets.firstOrNull()
                     currentStartOffsets = if (allOffsets.size > 1) allOffsets.drop(1) else allOffsets
                     tokenIndexInSentence = 0
                 }
                 line.startsWith("# end_offsets =") -> {
                     val offsetsStr = line.substring("# end_offsets =".length).trim()
                     val allOffsets = offsetsStr.split(Regex("\\s+")).mapNotNull { it.toIntOrNull() }
+                    sentenceEndOffset = allOffsets.firstOrNull()
                     currentEndOffsets = if (allOffsets.size > 1) allOffsets.drop(1) else emptyList()
                 }
                 line.isEmpty() -> {
+                    // Sentence boundary: record the sentence span if available
+                    if (sentenceStartOffset != null && sentenceEndOffset != null) {
+                        sentenceSpans.add(Span(sentenceStartOffset!!, sentenceEndOffset!!))
+                    }
+                    sentenceStartOffset = null
+                    sentenceEndOffset = null
                     currentStartOffsets = null
                     currentEndOffsets = null
                     tokenIndexInSentence = 0
@@ -1854,6 +1865,11 @@
             }
         }
 
+        // If the last sentence was not terminated by an empty line, record its span now
+        if (sentenceStartOffset != null && sentenceEndOffset != null) {
+            sentenceSpans.add(Span(sentenceStartOffset!!, sentenceEndOffset!!))
+        }
+
         if (morphoSpans.isEmpty()) {
             LOGGER.warning("No morpho spans found in annotated output for $docId, skipping")
             return
@@ -1876,14 +1892,19 @@
             span.head != null && span.head != "_" && span.deprel != null && span.deprel != "_"
         }
 
+        // Prefer sentence spans parsed from the "# start_offsets" / "# end_offsets" comment lines; fall back to a single whole-document span if none were detected
         if (hasDependencies && morphoSpans.isNotEmpty()) {
-            val allOffsets = morphoSpans.keys.map { key ->
-                val parts = key.split("-")
-                Pair(parts[0].toInt(), parts[1].toInt())
+            if (sentenceSpans.isNotEmpty()) {
+                sentences[tempDocId] = sentenceSpans.toTypedArray()
+            } else {
+                val allOffsets = morphoSpans.keys.map { key ->
+                    val parts = key.split("-")
+                    Pair(parts[0].toInt(), parts[1].toInt())
+                }
+                val minOffset = allOffsets.minOfOrNull { it.first } ?: 0
+                val maxOffset = allOffsets.maxOfOrNull { it.second } ?: 0
+                sentences[tempDocId] = arrayOf(Span(minOffset, maxOffset))
             }
-            val minOffset = allOffsets.minOfOrNull { it.first } ?: 0
-            val maxOffset = allOffsets.maxOfOrNull { it.second } ?: 0
-            sentences[tempDocId] = arrayOf(Span(minOffset, maxOffset))
         }
 
         try {