Allow ZIP output for combined POS tagging + dependency parsing
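
Previously, KorapXML ZIP output was named after either the tagger foundry
or the parser, and only one annotation layer was written per document.
When both a tagger and a parser are active, the output ZIP is now labeled
with the combined foundry (tagger foundry and parser name joined with "-"),
and each document gets both a morpho.xml and a dependency.xml entry.
Documents are counted once for progress reporting even when two entries
are written.
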
Change-Id: Ibd2969c5dc764a5f35e44b6652bf869e9ef82c86
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 05df874..fb17a38 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -597,14 +597,21 @@
LOGGER.info("Processing zip ${if (ord>0) ord else "?"}/$totalZips: ${zipFilePath} (${humanBytes(size)}) in thread ${Thread.currentThread().threadId()}")
LOGGER.info("Foundry: $foundry $dbFactory")
if (outputFormat == OutputFormat.KORAPXML && dbFactory == null) {
+ // Determine the output ZIP label, preferring a combined label when both tagger and parser are active
var targetFoundry = "base"
+ val labelParts = mutableListOf<String>()
if (taggerName != null) {
val tagger = AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge?
if (tagger != null) {
- targetFoundry = tagger.foundry
+ labelParts.add(tagger.foundry)
}
- } else if (parserName != null) {
- targetFoundry = parserName!!
+ }
+ if (parserName != null) {
+ labelParts.add(parserName!!)
+ }
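+ // e.g. tagger foundry "tree_tagger" + parser "malt" -> "tree_tagger-malt"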
+ if (labelParts.isNotEmpty()) {
+ targetFoundry = labelParts.joinToString("-")
} else if (annotateWith.isNotEmpty()) {
// Try to detect foundry from external annotation command
when {
@@ -621,11 +627,8 @@
}
dbFactory = DocumentBuilderFactory.newInstance()
dBuilder = dbFactory!!.newDocumentBuilder()
- val outputMorphoZipFileName =
- if (parserName != null)
- zipFilePath.replace(Regex("(\\.(opennlp|marmot|tree_tagger|corenlp|spacy))?\\.zip$"), ".".plus(parserName).plus(".zip"))
- else
- zipFilePath.replace(Regex("\\.zip$"), ".".plus(targetFoundry).plus(".zip"))
+ val outputMorphoZipFileName = zipFilePath.replace(Regex("\\.zip$"), "." + targetFoundry + ".zip")
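+ // e.g. corpus.zip -> corpus.tree_tagger-malt.zip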
LOGGER.info("Output ZIP file: $outputMorphoZipFileName")
if (File(outputMorphoZipFileName).exists() && !overwrite) {
LOGGER.severe("Output file $outputMorphoZipFileName already exists. Use --overwrite to overwrite.")
@@ -821,6 +823,15 @@
}
}
+ // Ensure foundry reflects the active tagger/parser even if it was initialized earlier on this thread
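+ // e.g. a plain "tree_tagger" foundry becomes "tree_tagger dependency:malt" when a parser runs on this thread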
+ taggerToolBridges[Thread.currentThread().threadId()]?.let { activeTagger ->
+ foundry = activeTagger.foundry
+ }
+ parserToolBridges[Thread.currentThread().threadId()]?.let { activeParser ->
+ foundry = "$foundry dependency:${activeParser.foundry}"
+ }
+
try {
if (zipEntry.name.matches(Regex(".*(data|tokens|structure|morpho|dependency)\\.xml$"))) {
// Ensure the entry stream and reader are closed to avoid native memory buildup
@@ -1067,40 +1077,59 @@
// Release internal char[] early
output.setLength(0)
} else {
- // Direct ZIP output without external annotation
- val entryPath = if (parserName != null) docId.replace(Regex("[_.]"), "/").plus("/$parserName/").plus("dependency.xml")
- else
- docId.replace(Regex("[_.]"), "/").plus("/$morphoFoundry/").plus("morpho.xml")
- val zipEntry = ZipArchiveEntry(entryPath)
- zipEntry.unixMode = ZIP_ENTRY_UNIX_MODE
- synchronized(morphoZipOutputStream!!) {
- morphoZipOutputStream!!.putArchiveEntry(zipEntry)
- morphoZipOutputStream!!.write(output.toString().toByteArray())
- morphoZipOutputStream!!.closeArchiveEntry()
- }
- output.clear()
- // Track written docs and update progress like with --annotate-with
- val written = docsWrittenToZip.incrementAndGet()
- if (!quiet) progressBar?.step()
- if (totalDocsInInput.get() > 0) {
- val total = totalDocsInInput.get()
- val percent = (written * 100.0) / total
- val elapsed = (System.currentTimeMillis() - annotationStartTime.get()) / 1000.0
- val docsPerSec = if (elapsed > 0) written / elapsed else 0.0
- val remaining = total - written
- val etaSec = if (docsPerSec > 0) remaining / docsPerSec else 0.0
- if (written % 10 == 0 || written == total) {
- LOGGER.info(String.format(Locale.ROOT,
- "Progress: %d/%d (%.1f%%), %.1f docs/s, ETA %02d:%02d",
- written, total, percent, docsPerSec, (etaSec / 60).toInt(), (etaSec % 60).toInt()))
- }
- }
- }
+ // Direct ZIP output without external annotation: write morpho.xml and, if a parser is active, dependency.xml
+ val morphoDir = taggerToolBridges[Thread.currentThread().threadId()]?.foundry ?: morphoFoundry
+ val depDir = parserName ?: morphoDir
+ var wroteOne = false
+ // Always write morpho.xml if morpho annotations are present (from the tagger or from the input)
+ if (morpho[docId] != null && morpho[docId]!!.isNotEmpty()) {
+ val morphoXml = korapXmlMorphoOutput(morphoDir, docId).toString()
+ val morphoPath = docId.replace(Regex("[_.]"), "/") + "/$morphoDir/morpho.xml"
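+ // e.g. docId "GOE_AGA.00000" -> "GOE/AGA/00000/tree_tagger/morpho.xml"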
+ val morphoEntry = ZipArchiveEntry(morphoPath)
+ morphoEntry.unixMode = ZIP_ENTRY_UNIX_MODE
+ synchronized(morphoZipOutputStream!!) {
+ morphoZipOutputStream!!.putArchiveEntry(morphoEntry)
+ morphoZipOutputStream!!.write(morphoXml.toByteArray())
+ morphoZipOutputStream!!.closeArchiveEntry()
+ }
+ wroteOne = true
+ }
+ // Write dependency.xml when a parser is active on this thread
+ if (parserToolBridges[Thread.currentThread().threadId()] != null) {
+ val depXml = korapXmlDependencyOutput(depDir, docId).toString()
+ val depPath = docId.replace(Regex("[_.]"), "/") + "/$depDir/dependency.xml"
+ val depEntry = ZipArchiveEntry(depPath)
+ depEntry.unixMode = ZIP_ENTRY_UNIX_MODE
+ synchronized(morphoZipOutputStream!!) {
+ morphoZipOutputStream!!.putArchiveEntry(depEntry)
+ morphoZipOutputStream!!.write(depXml.toByteArray())
+ morphoZipOutputStream!!.closeArchiveEntry()
+ }
+ wroteOne = true
+ }
+ output.clear()
+ // Count each written document once and update progress, as with --annotate-with
+ val written = if (wroteOne) docsWrittenToZip.incrementAndGet() else docsWrittenToZip.get()
+ if (!quiet) progressBar?.step()
+ if (totalDocsInInput.get() > 0) {
+ val total = totalDocsInInput.get()
+ val percent = (written * 100.0) / total
+ val elapsed = (System.currentTimeMillis() - annotationStartTime.get()) / 1000.0
+ val docsPerSec = if (elapsed > 0) written / elapsed else 0.0
+ val remaining = total - written
+ val etaSec = if (docsPerSec > 0) remaining / docsPerSec else 0.0
+ if (written % 10 == 0 || written == total) {
+ LOGGER.info(String.format(Locale.ROOT,
+ "Progress: %d/%d (%.1f%%), %.1f docs/s, ETA %02d:%02d",
+ written, total, percent, docsPerSec, (etaSec / 60).toInt(), (etaSec % 60).toInt()))
+ }
+ }
+ }
-
+ // Release per-document data to free memory early
arrayOf(tokens, texts, sentences, morpho, fnames, metadata, extraFeatures).forEach { map ->
if (map === morpho) {
- // Clear inner map to release references early
morpho[docId]?.clear()
}
map.remove(docId)
@@ -1111,11 +1139,11 @@
if (!quiet) progressBar?.step()
}
- // Periodic GC hint after processing many docs (lightweight safeguard)
- if ((processedDocs.incrementAndGet() % 2000) == 0) {
- LOGGER.fine("Processed ${processedDocs.get()} docs – requesting GC hint")
- System.gc()
- }
+ // Periodic GC hint after processing many docs (lightweight safeguard)
+ if ((processedDocs.incrementAndGet() % 2000) == 0) {
+ LOGGER.fine("Processed ${processedDocs.get()} docs – requesting GC hint")
+ System.gc()
+ }
// Memory / cache statistics logging
if (memStatsInterval > 0) {
val count = processedDocs.get()
@@ -1227,7 +1255,7 @@
}
private fun korapXmlMorphoOutput(foundry: String, docId: String): StringBuilder {
- val doc: Document = dBuilder!!.newDocument()
+ val doc: Document = dBuilder!!.newDocument()
// Root element
val layer = doc.createElement("layer")
@@ -1327,7 +1355,7 @@
if (tokensArr == null || tokensArr.isEmpty()) {
return output
}
-
+
// Build offset-to-index mapping for resolving dependency heads
val offsetToIndex = mutableMapOf<String, Int>()
tokensArr.forEachIndexed { index, span ->
@@ -1378,27 +1406,24 @@
}
previousSpanStart = span.from + 1
}
- // Bestimme den Token-Text sicher
+ // Determine the token text safely, with offsets clamped to the text bounds
var tokenText: String = if (textVal != null) {
val safeFrom = span.from.coerceIn(0, textVal.length)
val safeTo = span.to.coerceIn(safeFrom, textVal.length)
textVal.substring(safeFrom, safeTo)
} else "_"
- // Validate and fix empty/whitespace-only tokens that cause SpaCy to crash
if (tokenText.isBlank()) {
LOGGER.fine("Replacing empty/blank token at offset ${span.from}-${span.to} in document $docId with underscore")
- tokenText = "_" // Replace with underscore instead of skipping
+ tokenText = "_"
}
if (morpho[docId]?.containsKey("${span.from}-${span.to}") == true) {
val mfs = morpho[docId]?.get("${span.from}-${span.to}")
if (mfs != null) {
- // Add offset info to MISC field for external annotation with ZIP output
val miscWithOffset = if (annotationWorkerPool != null && outputFormat == OutputFormat.KORAPXML) {
val existing = mfs.misc ?: "_"
- if (existing == "_") "Offset=${span.from}-${span.to}"
- else "${existing}|Offset=${span.from}-${span.to}"
+ if (existing == "_") "Offset=${span.from}-${span.to}" else "${existing}|Offset=${span.from}-${span.to}"
} else mfs.misc ?: "_"
try {
@@ -1419,7 +1444,6 @@
)
} catch (e: NullPointerException) {
LOGGER.warning("NPE processing morpho for $docId at ${span.from}-${span.to}: ${e.message}")
- // Fallback to token without morpho
val miscWithOffset = if (annotationWorkerPool != null && outputFormat == OutputFormat.KORAPXML) {
"Offset=${span.from}-${span.to}"
} else "_"
@@ -1430,7 +1454,6 @@
)
}
} else {
- // Fallback if mfs is null
val miscWithOffset = if (annotationWorkerPool != null && outputFormat == OutputFormat.KORAPXML) {
"Offset=${span.from}-${span.to}"
} else "_"
@@ -1442,7 +1465,6 @@
)
}
} else {
- // Add offset info for tokens without morpho data when needed
val miscWithOffset = if (annotationWorkerPool != null && outputFormat == OutputFormat.KORAPXML) {
"Offset=${span.from}-${span.to}"
} else "_"
@@ -1462,12 +1484,10 @@
var token_index = 0
var real_token_index = 0
var sentence_index = 0
- val output: StringBuilder
- output = StringBuilder()
+ val output = StringBuilder()
if (extractMetadataRegex.isNotEmpty()) {
output.append(metadata[docId]?.joinToString("\t", postfix = "\t") ?: "")
}
- // If no text is available (e.g., lemma-only mode), emit lemmas
if (texts[docId] == null) {
tokens[docId]?.forEach { span ->
val key = "${span.from}-${span.to}"
@@ -1480,25 +1500,19 @@
tokens[docId]?.forEach { span ->
token_index++
if (sentences[docId] != null && (sentence_index >= sentences[docId]!!.size || span.from >= sentences[docId]!![sentence_index].to)) {
- if (output.isNotEmpty()) {
- output.setCharAt(output.length - 1, '\n')
- } else {
- output.append("\n")
- }
+ if (output.isNotEmpty()) output.setCharAt(output.length - 1, '\n') else output.append("\n")
if (extractMetadataRegex.isNotEmpty() && real_token_index < tokens[docId]!!.size - 1) {
output.append(metadata[docId]?.joinToString("\t", postfix = "\t") ?: "")
}
sentence_index++
}
- // Bounds safety
val safeFrom = span.from.coerceIn(0, texts[docId]!!.length)
val safeTo = span.to.coerceIn(safeFrom, texts[docId]!!.length)
if (useLemma && morpho[docId] != null) {
val key = "${span.from}-${span.to}"
val lemmaVal = morpho[docId]!![key]?.lemma
if (lemmaVal != null && lemmaVal != "_") {
- output.append(lemmaVal)
- output.append(' ')
+ output.append(lemmaVal).append(' ')
} else {
texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)
output.append(' ')
@@ -1509,9 +1523,7 @@
}
real_token_index++
}
- if (output.isNotEmpty()) {
- output.deleteCharAt(output.length - 1)
- }
+ if (output.isNotEmpty()) output.deleteCharAt(output.length - 1)
return output
}
@@ -1519,48 +1531,37 @@
var token_index = 0
var real_token_index = 0
var sentence_index = 0
- val output: StringBuilder = StringBuilder()
+ val output = StringBuilder()
- // Add the text sigle prefix
output.append("@@$docId ")
if (texts[docId] == null) {
- // Lemma-only fallback when original text is not loaded
tokens[docId]?.forEach { span ->
if (sentences[docId] != null && (sentence_index >= sentences[docId]!!.size || span.from >= sentences[docId]!![sentence_index].to)) {
- if (output.isNotEmpty() && !output.endsWith("@@$docId ")) {
- output.append(" <p> ")
- }
+ if (output.isNotEmpty() && !output.endsWith("@@$docId ")) output.append(" <p> ")
sentence_index++
}
val key = "${span.from}-${span.to}"
val lemmaVal = morpho[docId]?.get(key)?.lemma
output.append((lemmaVal?.takeIf { it != "_" } ?: "_"), " ")
}
- if (output.isNotEmpty() && output.endsWith(" ")) {
- output.deleteCharAt(output.length - 1)
- }
+ if (output.isNotEmpty() && output.endsWith(" ")) output.deleteCharAt(output.length - 1)
return output
}
-
+
tokens[docId]?.forEach { span ->
token_index++
if (sentences[docId] != null && (sentence_index >= sentences[docId]!!.size || span.from >= sentences[docId]!![sentence_index].to)) {
- // Replace sentence end with <p> tag instead of newline
- if (output.isNotEmpty() && !output.endsWith("@@$docId ")) {
- output.append(" <p> ")
- }
+ if (output.isNotEmpty() && !output.endsWith("@@$docId ")) output.append(" <p> ")
sentence_index++
}
- // Bounds safety
val safeFrom = span.from.coerceIn(0, texts[docId]!!.length)
val safeTo = span.to.coerceIn(safeFrom, texts[docId]!!.length)
if (useLemma && morpho[docId] != null) {
val key = "${span.from}-${span.to}"
val lemmaVal = morpho[docId]!![key]?.lemma
if (lemmaVal != null && lemmaVal != "_") {
- output.append(lemmaVal)
- output.append(' ')
+ output.append(lemmaVal).append(' ')
} else {
texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)
output.append(' ')
@@ -1571,16 +1572,10 @@
}
real_token_index++
}
-
- // Remove trailing space and add final newline
- if (output.isNotEmpty() && output.endsWith(" ")) {
- output.deleteCharAt(output.length - 1)
- }
-
+ if (output.isNotEmpty() && output.endsWith(" ")) output.deleteCharAt(output.length - 1)
return output
}
-
private fun printConlluToken(
token_index: Int,
token: String,
@@ -1675,7 +1670,6 @@
"xpos", "ctag", "pos" -> if(fs.xpos == "_") fs.xpos = value.replace(UNKNOWN, "--")
"feats", "msd" -> if(fs.feats == "_" ) fs.feats = value
"type" -> if(fs.feats == "_") fs.feats = feature.getElementsByTagName("symbol").item(0).attributes.getNamedItem("value").textContent.trim()
- // "subtype" -> if(fs.feats == "_") fs.feats += ":" + feature.getElementsByTagName("symbol").item(0).attributes.getNamedItem("value").textContent
"certainty" -> if(fs.misc == "_") fs.misc = value
}
}
@@ -1693,24 +1687,21 @@
.forEach { node ->
node as Element
val fromTo = "${node.getAttribute("from")}-${node.getAttribute("to")}"
-
- // Look for <rel> element which contains the dependency relation
+
val relElements = node.getElementsByTagName("rel")
if (relElements.length > 0) {
val rel = relElements.item(0) as Element
val deprel = rel.getAttribute("label")
-
- // The head is encoded as an inner <span> element with from/to attributes
+
val innerSpans = rel.getElementsByTagName("span")
var head: String? = null
if (innerSpans.length > 0) {
val innerSpan = innerSpans.item(0) as Element
val headFrom = innerSpan.getAttribute("from")
val headTo = innerSpan.getAttribute("to")
- // Store as offset key for now, will need to resolve to token index later
head = "$headFrom-$headTo"
}
-
+
if (head != null || deprel != null) {
res[fromTo] = MorphoSpan(
head = head ?: "_",
@@ -1732,21 +1723,6 @@
}.toArray { size -> arrayOfNulls(size) }
}
- /*
- <span id="s15" from="370" to="394" l="5">
- <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
- <f name="name">posting</f>
- <f name="attr">
- <fs type="attr">
- <f name="id">i.10894_1_3</f>
- <f name="indentLevel">0</f>
- <f name="who">WU00000000</f>
- </fs>
- </f>
- </fs>
- </span>
-
- */
private fun extractMiscSpans(spans: NodeList): MutableMap<String, String> {
val miscLocal: MutableMap<String, String> = HashMap()
@@ -1767,13 +1743,11 @@
val attrName = "$elementName/${(attr as Element).getAttribute("name")}"
if (attrName.matches(Regex(extractAttributesRegex))) {
res.append("# $attrName = ${attr.textContent}\n")
- //LOGGER.info("" + from + ": $attrName = " + attr.textContent)
}
}
if (res.isNotEmpty()) {
if (miscLocal.containsKey(from)) {
- // LOGGER.info("ADDING TO $from: ${miscLocal[from]}")
miscLocal[from] += res.toString()
} else {
miscLocal[from] = res.toString()
@@ -1800,7 +1774,6 @@
private fun parseAndWriteAnnotatedConllu(annotatedConllu: String, task: AnnotationWorkerPool.AnnotationTask?) {
LOGGER.fine("parseAndWriteAnnotatedConllu called with ${annotatedConllu.length} chars, task=$task")
- // Extract metadata from task
val docId = task?.docId
val entryPathAndFoundry = task?.entryPath?.split("|") ?: listOf(null, null)
val entryPath = entryPathAndFoundry.getOrNull(0)
@@ -1811,11 +1784,7 @@
return
}
- LOGGER.fine("Processing annotated document: docId=$docId, entryPath=$entryPath, foundry=$foundry")
-
val morphoSpans = mutableMapOf<String, MorphoSpan>()
-
- // Parse the annotated CoNLL-U to extract morpho data
val lines = annotatedConllu.lines()
var currentStartOffsets: List<Int>? = null
var currentEndOffsets: List<Int>? = null
@@ -1824,32 +1793,24 @@
for (line in lines) {
when {
line.startsWith("# start_offsets =") -> {
- // Parse start offsets from comment
- // Format: # start_offsets = <first_token_from> <token1_from> <token2_from> ...
- // The first value is duplicated, so skip it and use values from index 1 onwards
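+ // The comment line duplicates the first start offset, so drop it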
val offsetsStr = line.substring("# start_offsets =".length).trim()
val allOffsets = offsetsStr.split(Regex("\\s+")).mapNotNull { it.toIntOrNull() }
currentStartOffsets = if (allOffsets.size > 1) allOffsets.drop(1) else allOffsets
tokenIndexInSentence = 0
- LOGGER.fine("Found start offsets: $currentStartOffsets")
}
line.startsWith("# end_offsets =") -> {
- // Parse end offsets from comment
- // Format: # end_offsets = <sentence_end> <token1_to> <token2_to> ...
- // First value is sentence end, actual token ends start from index 1
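+ // The first value is the sentence end; token end offsets start at index 1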
val offsetsStr = line.substring("# end_offsets =".length).trim()
val allOffsets = offsetsStr.split(Regex("\\s+")).mapNotNull { it.toIntOrNull() }
currentEndOffsets = if (allOffsets.size > 1) allOffsets.drop(1) else emptyList()
- LOGGER.fine("Found end offsets: $currentEndOffsets")
}
line.isEmpty() -> {
- // Reset for next sentence
currentStartOffsets = null
currentEndOffsets = null
tokenIndexInSentence = 0
}
!line.startsWith("#") -> {
- // This is a token line
val fields = line.split("\t")
if (fields.size < 10) continue
@@ -1862,7 +1821,6 @@
val deps = if (fields.size > 8) fields[8] else "_"
val misc = if (fields.size > 9) fields[9] else "_"
- // Get offset from the comment-based offset arrays
if (currentStartOffsets != null && currentEndOffsets != null &&
tokenIndexInSentence < currentStartOffsets.size &&
tokenIndexInSentence < currentEndOffsets.size) {
@@ -1872,68 +1830,47 @@
val spanKey = "$spanFrom-$spanTo"
morphoSpans[spanKey] = MorphoSpan(lemma, upos, xpos, feats, head, deprel, deps, misc)
- LOGGER.fine("Added morpho span: $spanKey -> $lemma/$upos")
tokenIndexInSentence++
- } else {
- LOGGER.fine("No offset information for token at index $tokenIndexInSentence in sentence (starts=${currentStartOffsets?.size}, ends=${currentEndOffsets?.size})")
}
}
}
}
- LOGGER.fine("Extracted ${morphoSpans.size} morpho spans for $docId")
-
if (morphoSpans.isEmpty()) {
LOGGER.warning("No morpho spans found in annotated output for $docId, skipping")
- LOGGER.warning("Sample lines: ${lines.take(10).joinToString("\\n")}")
return
}
- // Check if the data contains dependency information (non-empty head/deprel fields)
- val hasDependencies = morphoSpans.values.any { span ->
- span.head != null && span.head != "_" && span.deprel != null && span.deprel != "_"
- }
- LOGGER.fine("Document has dependencies: $hasDependencies")
-
if (morphoZipOutputStream == null) {
LOGGER.severe("morphoZipOutputStream is null! Cannot write to ZIP. This should have been initialized in processZipFile.")
return
}
if (dBuilder == null) {
- LOGGER.warning("dBuilder is null, initializing now...")
dbFactory = DocumentBuilderFactory.newInstance()
dBuilder = dbFactory!!.newDocumentBuilder()
}
- // Create a temporary document context for korapXmlOutput
- // Store the morpho data temporarily and copy necessary supporting data
val tempDocId = "_temp_annotated_$docId"
morpho[tempDocId] = morphoSpans
- // For dependency output, we need sentences data
- // Create dummy sentence spans covering the entire document based on tokens
+ val hasDependencies = morphoSpans.values.any { span ->
+ span.head != null && span.head != "_" && span.deprel != null && span.deprel != "_"
+ }
+
if (hasDependencies && morphoSpans.isNotEmpty()) {
- // Get min and max offsets from all tokens
val allOffsets = morphoSpans.keys.map { key ->
val parts = key.split("-")
Pair(parts[0].toInt(), parts[1].toInt())
}
val minOffset = allOffsets.minOfOrNull { it.first } ?: 0
val maxOffset = allOffsets.maxOfOrNull { it.second } ?: 0
-
- // Create a single sentence span covering all tokens
- // This is a simplification - ideally we'd track sentence boundaries from CoNLL-U
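+ // Simplification: a single span covering all tokens, not real sentence boundaries from the CoNLL-U input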
sentences[tempDocId] = arrayOf(Span(minOffset, maxOffset))
}
- LOGGER.fine("Generating KorapXML for $docId with ${morphoSpans.size} spans")
-
- // Generate morpho.xml (always)
try {
val morphoXmlOutput = korapXmlMorphoOutput(foundry, tempDocId)
-
- // Fix the docid attribute - replace temp prefix with actual docId
val fixedMorphoXml = morphoXmlOutput.toString().replace(
"docid=\"$tempDocId\"",
"docid=\"$docId\""
@@ -1943,50 +1879,21 @@
val morphoZipEntry = ZipArchiveEntry(morphoEntryPath)
morphoZipEntry.unixMode = ZIP_ENTRY_UNIX_MODE
- LOGGER.fine("Writing ${fixedMorphoXml.length} bytes to ZIP entry: $morphoEntryPath")
synchronized(morphoZipOutputStream!!) {
morphoZipOutputStream!!.putArchiveEntry(morphoZipEntry)
morphoZipOutputStream!!.write(fixedMorphoXml.toByteArray())
morphoZipOutputStream!!.closeArchiveEntry()
}
- LOGGER.fine("Successfully wrote morpho.xml for $docId")
val written = docsWrittenToZip.incrementAndGet()
-
- // Update progress bar
if (!quiet) progressBar?.step()
-
- // Show progress with ETA at INFO level
- if (annotationWorkerPool != null && totalDocsInInput.get() > 0) {
- val total = totalDocsInInput.get()
- val percent = (written * 100.0) / total
- val elapsed = (System.currentTimeMillis() - annotationStartTime.get()) / 1000.0
- val docsPerSec = if (elapsed > 0) written / elapsed else 0.0
- val remaining = total - written
- val etaSec = if (docsPerSec > 0) remaining / docsPerSec else 0.0
-
- // Calculate estimated finish time
- val finishTime = LocalDateTime.now().plusSeconds(etaSec.toLong())
- val timeFormatter = DateTimeFormatter.ofPattern("HH:mm:ss")
-
- if (written % 10 == 0 || written == total) {
- val etaMin = (etaSec / 60).toInt()
- val etaSec2 = (etaSec % 60).toInt()
- LOGGER.info(String.format(Locale.ROOT,
- "Progress: %d/%d (%.1f%%), %.1f docs/s, ETA %02d:%02d, finish ~%s",
- written, total, percent, docsPerSec, etaMin, etaSec2, finishTime.format(timeFormatter)))
- }
- }
} catch (e: Exception) {
LOGGER.severe("ERROR generating/writing morpho.xml: ${e.message}")
e.printStackTrace()
}
- // Generate dependency.xml if dependencies are present
if (hasDependencies) {
try {
val dependencyXmlOutput = korapXmlDependencyOutput(foundry, tempDocId)
-
- // Fix the docid attribute - replace temp prefix with actual docId
val fixedDependencyXml = dependencyXmlOutput.toString().replace(
"docid=\"$tempDocId\"",
"docid=\"$docId\""
@@ -1996,20 +1903,17 @@
val dependencyZipEntry = ZipArchiveEntry(dependencyEntryPath)
dependencyZipEntry.unixMode = ZIP_ENTRY_UNIX_MODE
- LOGGER.fine("Writing ${fixedDependencyXml.length} bytes to ZIP entry: $dependencyEntryPath")
synchronized(morphoZipOutputStream!!) {
morphoZipOutputStream!!.putArchiveEntry(dependencyZipEntry)
morphoZipOutputStream!!.write(fixedDependencyXml.toByteArray())
morphoZipOutputStream!!.closeArchiveEntry()
}
- LOGGER.fine("Successfully wrote dependency.xml for $docId")
} catch (e: Exception) {
LOGGER.severe("ERROR generating/writing dependency.xml: ${e.message}")
e.printStackTrace()
}
}
- // Clean up temporary data
morpho.remove(tempDocId)
sentences.remove(tempDocId)
}