Krill output: improve handling of corpora with many texts

Track the foundries expected from the input ZIP file names (including
compound names such as "marmot-malt"), move the per-text JSON/TAR
writing into a thread-safe outputKrillText() that merges corpus- and
doc-level metadata, count the texts written, and reset the Krill state
at the start of each run. Incremental output of complete texts via
checkAndOutputCompleteTexts() is prepared but still disabled (see
TODOs) until edge cases with incomplete texts are resolved.

Change-Id: I364db313839c6a61185f62007145381382100919
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 1feebcd..2f02e59 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -388,6 +388,9 @@
     val krillData: ConcurrentHashMap<String, KrillTextData> = ConcurrentHashMap()
     val corpusMetadata: ConcurrentHashMap<String, MutableMap<String, Any>> = ConcurrentHashMap()
     val docMetadata: ConcurrentHashMap<String, MutableMap<String, Any>> = ConcurrentHashMap()
+    val expectedFoundries: MutableSet<String> = mutableSetOf("base")
+    val processedFoundries: MutableSet<String> = mutableSetOf()
+    val krillOutputCount = java.util.concurrent.atomic.AtomicInteger(0)
 
     fun String.hasCorrespondingBaseZip(): Boolean {
         if (!this.matches(Regex(".*\\.([^/.]+)\\.zip$"))) return false
@@ -402,6 +405,17 @@
     }
 
     fun korapxml2conllu(args: Array<String>) {
+        // Reset Krill state for fresh run (important for tests)
+        if (outputFormat == OutputFormat.KRILL) {
+            expectedFoundries.clear()
+            expectedFoundries.add("base")
+            processedFoundries.clear()
+            krillOutputCount.set(0)
+            krillData.clear()
+            corpusMetadata.clear()
+            docMetadata.clear()
+        }
+
         // Initialize shared entry executor (used inside each zip)
         entryExecutor = Executors.newFixedThreadPool(maxThreads)
 
@@ -430,6 +444,30 @@
             val fileOutputStream = FileOutputStream(krillOutputFileName!!)
             krillTarOutputStream = TarArchiveOutputStream(fileOutputStream)
             LOGGER.info("Initialized krill TAR output stream")
+
+            // Extract expected foundries from input ZIP filenames for incremental output
+            args.forEach { zipPath ->
+                val zipName = File(zipPath).name
+                // Match pattern: name.foundry.zip (e.g., corpus.spacy.zip)
+                val foundryMatch = Regex("^(.+?)\\.([^.]+)\\.zip$").find(zipName)
+                if (foundryMatch != null) {
+                    val foundry = foundryMatch.groupValues[2]
+                    // Handle compound foundries like "marmot-malt" which contains both "marmot" and "malt"
+                    if (foundry.contains("-")) {
+                        foundry.split("-").forEach { subFoundry ->
+                            expectedFoundries.add(subFoundry)
+                            LOGGER.info("Expecting foundry: $subFoundry from $zipName")
+                        }
+                    } else {
+                        expectedFoundries.add(foundry)
+                        LOGGER.info("Expecting foundry: $foundry from $zipName")
+                    }
+                } else if (zipName.endsWith(".zip")) {
+                    // Base ZIP without foundry suffix
+                    LOGGER.info("Base ZIP detected: $zipName")
+                }
+            }
+            LOGGER.info("Expected foundries for Krill output: ${expectedFoundries.sorted()}")
         }
 
         if (annotateWith.isNotEmpty()) {
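
A minimal sketch of the filename-to-foundry mapping introduced above; the helper and the sample ZIP names are illustrative only and not part of the patch:

    // Mirrors the regex and compound-foundry splitting used above.
    val foundryPattern = Regex("^(.+?)\\.([^.]+)\\.zip$")

    fun foundriesOf(zipName: String): List<String> =
        foundryPattern.find(zipName)
            ?.groupValues?.get(2)
            ?.split("-")            // "marmot-malt" contributes both "marmot" and "malt"
            ?: emptyList()          // base ZIPs without a foundry suffix yield nothing

    foundriesOf("corpus.spacy.zip")        // [spacy]
    foundriesOf("corpus.marmot-malt.zip")  // [marmot, malt]
    foundriesOf("corpus.zip")              // []
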
@@ -599,46 +637,35 @@
         // Shutdown entry executor
         entryExecutor?.shutdown()
 
-        // Finalize krill output: generate JSON files and close TAR
+        // Finalize krill output: output any remaining incomplete texts and close TAR
         if (outputFormat == OutputFormat.KRILL && krillTarOutputStream != null) {
             try {
-                LOGGER.info("Generating krill JSON files for ${krillData.size} texts")
+                val remainingCount = krillData.size
+                if (remainingCount > 0) {
+                    LOGGER.info("Outputting $remainingCount remaining incomplete texts (already output: $krillOutputCount)")
+                } else {
+                    LOGGER.info("All texts already output incrementally ($krillOutputCount total)")
+                }
 
-                // Initialize progress bar for krill output
-                val krillProgressBar = if (!quiet) {
+                // Initialize progress bar for remaining texts
+                val krillProgressBar = if (!quiet && remainingCount > 0) {
                     ProgressBarBuilder()
-                        .setTaskName(krillOutputFileName?.let { File(it).name } ?: "Krill export")
-                        .setInitialMax(krillData.size.toLong())
+                        .setTaskName("Remaining texts")
+                        .setInitialMax(remainingCount.toLong())
                         .setStyle(ProgressBarStyle.COLORFUL_UNICODE_BAR)
                         .setUpdateIntervalMillis(500)
                         .build()
                 } else null
 
                 try {
+                    // Output remaining texts (may be incomplete if not all ZIPs covered all texts)
                     krillData.keys.sorted().forEach { textId ->
                         val textData = krillData[textId]!!
-                        LOGGER.info("Generating JSON for $textId, foundries=${textData.morphoByFoundry.keys}")
-                        val json = generateKrillJson(textData)
-                        // Convert textId to proper filename format with dashes
-                        val jsonFileName = textId.replace("_", "-").replace(".", "-") + ".json.gz"
-
-                        // Compress JSON with GZIP
-                        val byteOut = ByteArrayOutputStream()
-                        val gzipOut = GZIPOutputStream(byteOut)
-                        gzipOut.write(json.toByteArray(Charsets.UTF_8))
-                        gzipOut.close()
-                        val compressedData = byteOut.toByteArray()
-
-                        // Write to TAR
-                        val tarEntry = TarArchiveEntry(jsonFileName)
-                        tarEntry.size = compressedData.size.toLong()
-                        krillTarOutputStream!!.putArchiveEntry(tarEntry)
-                        krillTarOutputStream!!.write(compressedData)
-                        krillTarOutputStream!!.closeArchiveEntry()
-
-                        LOGGER.fine("Wrote krill JSON for $textId (${compressedData.size} bytes compressed)")
-
-                        // Update progress bar
+                        val textFoundries = textData.morphoByFoundry.keys.toSet() + setOf("base")
+                        if (!textFoundries.containsAll(expectedFoundries)) {
+                            LOGGER.warning("Outputting incomplete text $textId with foundries $textFoundries (expected: $expectedFoundries)")
+                        }
+                        outputKrillText(textId, textData)
                         krillProgressBar?.step()
                     }
                 } finally {
@@ -647,7 +674,7 @@
 
                 krillTarOutputStream!!.finish()
                 krillTarOutputStream!!.close()
-                LOGGER.info("Closed krill TAR file: $krillOutputFileName")
+                LOGGER.info("Closed krill TAR file: $krillOutputFileName (total texts output: $krillOutputCount)")
             } catch (e: Exception) {
                 LOGGER.severe("ERROR generating krill output: ${e.message}")
                 e.printStackTrace()
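
For reference, the gzip-then-tar pattern the finalization relies on (and which outputKrillText below implements) in isolation; the entry name and JSON payload are placeholders:

    import org.apache.commons.compress.archivers.tar.TarArchiveEntry
    import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream
    import java.io.ByteArrayOutputStream
    import java.io.FileOutputStream
    import java.util.zip.GZIPOutputStream

    // Compress a JSON string with GZIP and store it as a single TAR entry.
    fun writeGzippedJsonEntry(tar: TarArchiveOutputStream, entryName: String, json: String) {
        val byteOut = ByteArrayOutputStream()
        GZIPOutputStream(byteOut).use { it.write(json.toByteArray(Charsets.UTF_8)) }
        val compressed = byteOut.toByteArray()
        val entry = TarArchiveEntry(entryName)
        entry.size = compressed.size.toLong()   // size must be set before putArchiveEntry
        tar.putArchiveEntry(entry)
        tar.write(compressed)
        tar.closeArchiveEntry()
    }

    fun main() {
        TarArchiveOutputStream(FileOutputStream("sample.krill.tar")).use { tar ->
            writeGzippedJsonEntry(tar, "sample-text.json.gz", """{"textSigle":"sample/text"}""")
            tar.finish()
        }
    }
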
@@ -814,6 +841,13 @@
             LOGGER.fine("NOT closing ZIP in processZipFile - will close after worker pool finishes")
         }
         logZipProgress(zipFilePath)
+
+        // For Krill format, check if any texts are now complete and output them
+        // TODO: Re-enable incremental output after fixing edge cases with incomplete texts
+        // if (outputFormat == OutputFormat.KRILL) {
+        //     processedFoundries.add(foundry)
+        //     checkAndOutputCompleteTexts()
+        // }
     }
 
     private fun processZipFileSequentially(zipFilePath: String, foundry: String = "base") {
@@ -851,6 +885,13 @@
             }
         }
         logZipProgress(zipFilePath)
+
+        // For Krill format, check if any texts are now complete and output them
+        // TODO: Re-enable incremental output after fixing edge cases with incomplete texts
+        // if (outputFormat == OutputFormat.KRILL) {
+        //     processedFoundries.add(foundry)
+        //     checkAndOutputCompleteTexts()
+        // }
     }
 
     private fun logZipProgress(zipFilePath: String) {
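
The disabled hooks above would delegate to checkAndOutputCompleteTexts(), added further down. Its completeness criterion, sketched in isolation with hypothetical foundry sets:

    // A text is ready for output once its foundries, plus the implicit "base",
    // cover everything announced by the input ZIP names.
    fun isComplete(textFoundries: Set<String>, expectedFoundries: Set<String>): Boolean =
        (textFoundries + "base").containsAll(expectedFoundries)

    val expected = setOf("base", "spacy", "marmot", "malt")  // e.g. from corpus.spacy.zip + corpus.marmot-malt.zip
    isComplete(setOf("spacy"), expected)                     // false: "marmot" and "malt" still pending
    isComplete(setOf("spacy", "marmot", "malt"), expected)   // true
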
@@ -2786,6 +2827,82 @@
         }
     }
 
+    // Check which texts now have all expected foundries and output the complete ones
+    private fun checkAndOutputCompleteTexts() {
+        if (outputFormat != OutputFormat.KRILL || krillTarOutputStream == null) return
+
+        val textsToOutput = mutableListOf<String>()
+
+        // Find texts that have all expected foundries
+        krillData.forEach { (textId, textData) ->
+            val textFoundries = textData.morphoByFoundry.keys.toSet() + setOf("base")
+            if (textFoundries.containsAll(expectedFoundries)) {
+                textsToOutput.add(textId)
+            }
+        }
+
+        // Output and remove complete texts
+        textsToOutput.sorted().forEach { textId ->
+            val textData = krillData.remove(textId)
+            if (textData != null) {
+                outputKrillText(textId, textData)
+            }
+        }
+
+        if (textsToOutput.isNotEmpty()) {
+            LOGGER.info("Output ${textsToOutput.size} complete texts (${krillData.size} still pending)")
+        }
+    }
+
+    // Output a single text to Krill TAR (thread-safe)
+    private fun outputKrillText(textId: String, textData: KrillTextData) {
+        try {
+            // Merge corpus and doc metadata
+            val textIdWithSlashes = textData.textId.replace("_", "/").replace(".", "/")
+            val corpusSigle = textIdWithSlashes.split("/")[0]
+            val docSigle = textIdWithSlashes.split("/").take(2).joinToString("/")
+
+            // Apply corpus-level metadata
+            corpusMetadata[corpusSigle]?.forEach { (key, value) ->
+                if (!textData.headerMetadata.containsKey(key)) {
+                    textData.headerMetadata[key] = value
+                }
+            }
+
+            // Apply doc-level metadata
+            docMetadata[docSigle]?.forEach { (key, value) ->
+                if (!textData.headerMetadata.containsKey(key)) {
+                    textData.headerMetadata[key] = value
+                }
+            }
+
+            val json = generateKrillJson(textData)
+            val jsonFileName = textId.replace("_", "-").replace(".", "-") + ".json.gz"
+
+            // Compress JSON with GZIP
+            val byteOut = ByteArrayOutputStream()
+            val gzipOut = GZIPOutputStream(byteOut)
+            gzipOut.write(json.toByteArray(Charsets.UTF_8))
+            gzipOut.close()
+            val compressedData = byteOut.toByteArray()
+
+            // Write to TAR (synchronized for thread safety)
+            synchronized(krillTarOutputStream!!) {
+                val tarEntry = TarArchiveEntry(jsonFileName)
+                tarEntry.size = compressedData.size.toLong()
+                krillTarOutputStream!!.putArchiveEntry(tarEntry)
+                krillTarOutputStream!!.write(compressedData)
+                krillTarOutputStream!!.closeArchiveEntry()
+            }
+
+            val count = krillOutputCount.incrementAndGet()
+            LOGGER.fine("Output Krill JSON for $textId ($count total)")
+        } catch (e: Exception) {
+            LOGGER.severe("ERROR outputting Krill JSON for $textId: ${e.message}")
+            e.printStackTrace()
+        }
+    }
+
     private fun generateKrillJson(textData: KrillTextData): String {
         val sb = StringBuilder()
         sb.append("{")