Add krill output format
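
Introduce a new output format, selected with -f krill, that converts
a base KorAP-XML ZIP plus any number of foundry ZIPs (e.g.
wud24_sample.spacy.zip, wud24_sample.marmot-malt.zip) into a single
<base>.krill.tar archive containing one gzipped Krill JSON file per
text. Metadata is taken from header.xml, structural spans from
structure.xml, and token-level annotations are merged per foundry.

Illustrative invocation (CLI entry-point name assumed):

  korapxmltool -f krill wud24_sample.zip wud24_sample.spacy.zip \
      wud24_sample.marmot-malt.zip
  # writes wud24_sample.krill.tar into the output directory
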
Change-Id: I779ebe74eb677fe6630caf37a33bdfc833fc5b66
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 9ba9e73..e374c6a 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -2,8 +2,12 @@
import de.ids_mannheim.korapxmltools.AnnotationToolBridgeFactory.Companion.parserFoundries
import de.ids_mannheim.korapxmltools.AnnotationToolBridgeFactory.Companion.taggerFoundries
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry
+import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream
import org.apache.commons.compress.archivers.zip.Zip64Mode
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry
+import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream
+import org.apache.commons.compress.archivers.zip.ZipFile as ApacheZipFile
import org.w3c.dom.Document
import org.w3c.dom.Element
import org.w3c.dom.NodeList
@@ -11,10 +15,7 @@
import org.xml.sax.SAXParseException
import picocli.CommandLine
import picocli.CommandLine.*
-import java.io.File
-import java.io.FileOutputStream
-import java.io.InputStream
-import java.io.StringWriter
+import java.io.*
import java.lang.Integer.parseInt
import java.util.*
import java.util.concurrent.Callable
@@ -28,15 +29,12 @@
import java.util.regex.Matcher
import java.util.regex.Pattern
import java.util.stream.IntStream
-import java.util.zip.ZipEntry
-
-import java.util.zip.ZipFile
+import java.util.zip.GZIPOutputStream
import me.tongfei.progressbar.ProgressBar
import me.tongfei.progressbar.ProgressBarBuilder
import me.tongfei.progressbar.ProgressBarStyle
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
-import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream
import javax.xml.parsers.DocumentBuilder
import javax.xml.parsers.DocumentBuilderFactory
import javax.xml.transform.OutputKeys
@@ -73,11 +71,12 @@
@Option(
names = ["-f", "--output-format"],
- description = ["Output format: ${ConlluOutputFormat.NAME}, ${Word2VecOutputFormat.NAME}, ${KorapXmlOutputFormat.NAME}, ${NowOutputFormat.NAME}",
+ description = ["Output format: ${ConlluOutputFormat.NAME}, ${Word2VecOutputFormat.NAME}, ${KorapXmlOutputFormat.NAME}, ${NowOutputFormat.NAME}, ${KrillOutputFormat.NAME}",
"conllu: CoNLL-U format",
"korapxml, xml, zip: KorAP-XML format zip",
"word2vec, w2v: Print text in LM training format: tokens separated by space, sentences separated by newlines",
"now, NOW: NOW corpus export format: w2v-like format with <p> tags for sentence ends and @@<text-sigle> prefix",
+ "krill: Krill JSON format (tar file with gzipped JSON files, one per text)",
],
converter = [OutputFormatConverter::class]
)
@@ -89,6 +88,7 @@
"word2vec", "w2v" -> OutputFormat.WORD2VEC
"korapxml", "korap", "xml", "zip" -> OutputFormat.KORAPXML
"now", "NOW" -> OutputFormat.NOW
+ "krill" -> OutputFormat.KRILL
else -> throw IllegalArgumentException("Unknown output format: `$value'. Use one of: ${OutputFormat.entries.joinToString(", ") { it.name }}")
}
}
@@ -360,6 +360,32 @@
var dbFactory: DocumentBuilderFactory? = null
var dBuilder: DocumentBuilder? = null
var morphoZipOutputStream: ZipArchiveOutputStream? = null
+ var krillTarOutputStream: TarArchiveOutputStream? = null
+ var krillOutputFileName: String? = null
+
+ // Krill format data structures - collect all data from all ZIPs before output
+ data class KrillTextData(
+ var textId: String,
+ var textContent: String? = null,
+ var headerMetadata: MutableMap<String, Any> = mutableMapOf(),
+ var tokens: Array<Span>? = null,
+ var sentences: Array<Span>? = null,
+ var morphoByFoundry: MutableMap<String, MutableMap<String, MorphoSpan>> = mutableMapOf(),
+ var structureSpans: MutableList<StructureSpan> = mutableListOf(),
+ var extractedAttributes: MutableMap<String, String> = mutableMapOf()
+ )
+
+ data class StructureSpan(
+ val layer: String, // e.g., "base/s:s", "dereko/s:p"
+ val from: Int,
+ val to: Int,
+ val tokenFrom: Int,
+ val tokenTo: Int,
+ val depth: Int,
+ val attributes: Map<String, String> = emptyMap()
+ )
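+
+    // Illustrative instance (values assumed): a paragraph element covering
+    // character offsets 0..42 and tokens 0..7 at nesting depth 1 becomes
+    // StructureSpan(layer = "dereko/s:p", from = 0, to = 42,
+    //               tokenFrom = 0, tokenTo = 7, depth = 1)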
+
+ val krillData: ConcurrentHashMap<String, KrillTextData> = ConcurrentHashMap()
fun String.hasCorrespondingBaseZip(): Boolean {
if (!this.matches(Regex(".*\\.([^/.]+)\\.zip$"))) return false
@@ -377,6 +403,32 @@
// Initialize shared entry executor (used inside each zip)
entryExecutor = Executors.newFixedThreadPool(maxThreads)
+ // Initialize TAR output for krill format
+ if (outputFormat == OutputFormat.KRILL) {
+ // Find the base ZIP (one without a foundry suffix)
+ val baseZip = args.firstOrNull { zip ->
+ val name = File(zip).name
+ name.matches(Regex(".*\\.zip$")) && !name.matches(Regex(".*\\.[^/.]+\\.zip$"))
+ } ?: args[0]
+ val baseZipName = File(baseZip).name.replace(Regex("\\.zip$"), "")
+ krillOutputFileName = File(outputDir, "$baseZipName.krill.tar").absolutePath
+ LOGGER.info("Initializing krill TAR output: $krillOutputFileName")
+
+ if (File(krillOutputFileName!!).exists() && !overwrite) {
+ LOGGER.severe("Output file $krillOutputFileName already exists. Use --overwrite to overwrite.")
+ exitProcess(1)
+ }
+
+ if (File(krillOutputFileName!!).exists()) {
+ LOGGER.info("Deleting existing file: $krillOutputFileName")
+ File(krillOutputFileName!!).delete()
+ }
+
+ val fileOutputStream = FileOutputStream(krillOutputFileName!!)
+ krillTarOutputStream = TarArchiveOutputStream(fileOutputStream)
+ LOGGER.info("Initialized krill TAR output stream")
+ }
+
if (annotateWith.isNotEmpty()) {
// Detect external foundry label once from annotateWith command
externalFoundry = detectFoundryFromAnnotateCmd(annotateWith)
@@ -543,6 +595,43 @@
}
// Shutdown entry executor
entryExecutor?.shutdown()
+
+ // Finalize krill output: generate JSON files and close TAR
+ if (outputFormat == OutputFormat.KRILL && krillTarOutputStream != null) {
+ try {
+ LOGGER.info("Generating krill JSON files for ${krillData.size} texts")
+ krillData.keys.sorted().forEach { textId ->
+ val textData = krillData[textId]!!
+ LOGGER.info("Generating JSON for $textId, foundries=${textData.morphoByFoundry.keys}")
+ val json = generateKrillJson(textData)
+ // Convert textId to proper filename format with dashes
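+                    // e.g. "WUD24_I0083.95367" -> "WUD24-I0083-95367.json.gz"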
+ val jsonFileName = textId.replace("_", "-").replace(".", "-") + ".json.gz"
+
+ // Compress JSON with GZIP
+ val byteOut = ByteArrayOutputStream()
+ val gzipOut = GZIPOutputStream(byteOut)
+ gzipOut.write(json.toByteArray(Charsets.UTF_8))
+ gzipOut.close()
+ val compressedData = byteOut.toByteArray()
+
+ // Write to TAR
+ val tarEntry = TarArchiveEntry(jsonFileName)
+ tarEntry.size = compressedData.size.toLong()
+ krillTarOutputStream!!.putArchiveEntry(tarEntry)
+ krillTarOutputStream!!.write(compressedData)
+ krillTarOutputStream!!.closeArchiveEntry()
+
+ LOGGER.fine("Wrote krill JSON for $textId (${compressedData.size} bytes compressed)")
+ }
+
+ krillTarOutputStream!!.finish()
+ krillTarOutputStream!!.close()
+ LOGGER.info("Closed krill TAR file: $krillOutputFileName")
+ } catch (e: Exception) {
+ LOGGER.severe("ERROR generating krill output: ${e.message}")
+ e.printStackTrace()
+ }
+ }
}
private fun processZipsWithQueue(zips: Array<String>, foundry: String, parallelism: Int) {
@@ -559,10 +648,17 @@
if (zipPath == null) {
if (queue.isEmpty()) break else continue
}
- if (sequentialInZip) {
- processZipFileSequentially(zipPath, foundry)
+ // For krill format, use per-ZIP foundry; otherwise use shared foundry
+ val zipFoundry = if (outputFormat == OutputFormat.KRILL) {
+ getFoundryFromZipFileName(zipPath)
} else {
- processZipFile(zipPath, foundry)
+ foundry
+ }
+ LOGGER.info("Processing ZIP: $zipPath with foundry=$zipFoundry")
+ if (sequentialInZip) {
+ processZipFileSequentially(zipPath, zipFoundry)
+ } else {
+ processZipFile(zipPath, zipFoundry)
}
}
} finally {
@@ -661,17 +757,32 @@
} else {
LOGGER.info("Skipping ZIP initialization: dbFactory=${dbFactory != null}, outputFormat=$outputFormat")
}
+ LOGGER.fine("About to process ZIP entries: hasCorrespondingBaseZip=${zipFilePath.hasCorrespondingBaseZip()}")
if (zipFilePath.hasCorrespondingBaseZip()) {
val relatedZips = arrayOf(zipFilePath, zipFilePath.correspondingBaseZip()!!)
// Process related zips one after another to keep the ZipFile lifetime strictly bounded
relatedZips.forEach { zip ->
- ZipFile(zip).use { zipFile ->
- processZipEntriesWithPool(zipFile, foundry, true)
+ // For krill format, use per-ZIP foundry; for other formats, use the original foundry
+ val zipFoundry = if (outputFormat == OutputFormat.KRILL) {
+ if (zip == zipFilePath.correspondingBaseZip()) "base" else foundry
+ } else {
+ foundry // Keep original foundry for non-krill formats
+ }
+ ApacheZipFile(File(zip)).use { zipFile ->
+ processZipEntriesWithPool(zipFile, zipFoundry, true)
}
}
} else {
- ZipFile(zipFilePath).use { zipFile ->
- processZipEntriesWithPool(zipFile, foundry, false)
+ LOGGER.fine("Opening ZipFile for processing: $zipFilePath")
+ try {
+ ApacheZipFile(File(zipFilePath)).use { zipFile ->
+ LOGGER.fine("Calling processZipEntriesWithPool, foundry=$foundry")
+ processZipEntriesWithPool(zipFile, foundry, false)
+ LOGGER.fine("Returned from processZipEntriesWithPool")
+ }
+ } catch (e: Exception) {
+ LOGGER.severe("Error processing ZIP: ${e.message}")
+ e.printStackTrace()
}
}
// Don't close the ZIP here if using external annotation - it will be closed after worker pool finishes
@@ -692,22 +803,28 @@
// Process the two related zips strictly sequentially to limit memory growth
val zips = arrayOf(zipFilePath, zipFilePath.correspondingBaseZip()!!)
zips.forEach { zip ->
- ZipFile(zip).use { zipFile ->
+ // For krill format, use per-ZIP foundry; for other formats, use the original foundry
+ val zipFoundry = if (outputFormat == OutputFormat.KRILL) {
+ if (zip == zipFilePath.correspondingBaseZip()) "base" else foundry
+ } else {
+ foundry // Keep original foundry for non-krill formats
+ }
+ ApacheZipFile(File(zip)).use { zipFile ->
// Iterate entries in a deterministic order to keep related files close together
- zipFile.stream()
+ zipFile.entries.toList()
.filter { extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") }
- .sorted(Comparator.comparing<ZipEntry, String> { it.name })
- .forEachOrdered { zipEntry ->
- processZipEntry(zipFile, foundry, zipEntry, true)
+ .sortedBy { it.name }
+ .forEach { zipEntry ->
+ processZipEntry(zipFile, zipFoundry, zipEntry, true)
}
}
}
} else {
- ZipFile(zipFilePath).use { zipFile ->
- zipFile.stream()
+ ApacheZipFile(File(zipFilePath)).use { zipFile ->
+ zipFile.entries.toList()
.filter { extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") }
- .sorted(Comparator.comparing<ZipEntry, String> { it.name })
- .forEachOrdered { zipEntry ->
+ .sortedBy { it.name }
+ .forEach { zipEntry ->
processZipEntry(zipFile, foundry, zipEntry, false)
}
}
@@ -755,16 +872,18 @@
return String.format(Locale.ROOT, "%02d:%02d:%02d", h, m, sec)
}
- private fun processZipEntriesWithPool(zipFile: ZipFile, foundry: String, waitForMorpho: Boolean) {
+ private fun processZipEntriesWithPool(zipFile: ApacheZipFile, foundry: String, waitForMorpho: Boolean) {
// Collect entries first to avoid lazy evaluation surprises, filter header.xml unless metadata extraction is requested
- val entries: MutableList<ZipEntry> = ArrayList()
+ val entries: MutableList<ZipArchiveEntry> = ArrayList()
var documentCount = 0
- val enumEntries = zipFile.entries()
+ val enumEntries = zipFile.entries
while (enumEntries.hasMoreElements()) {
val e = enumEntries.nextElement()
- if (extractMetadataRegex.isEmpty() && e.name.contains("header.xml")) continue
+ // Skip header.xml unless metadata extraction is requested OR output format is KRILL
+ if (extractMetadataRegex.isEmpty() && outputFormat != OutputFormat.KRILL && e.name.contains("header.xml")) continue
entries.add(e)
}
+ LOGGER.fine("Collected ${entries.size} entries from ZIP, foundry=$foundry")
if (entries.isEmpty()) return
// Determine document count for progress: prefer data.xml, fallback to tokens.xml
@@ -804,6 +923,7 @@
// Submit all entry tasks to the shared executor and await completion before closing the zip
val latch = java.util.concurrent.CountDownLatch(entries.size)
+ LOGGER.info("processZipEntriesWithPool: processing ${entries.size} entries with foundry=$foundry")
entries.forEach { entry ->
entryExecutor?.execute {
try {
@@ -822,7 +942,7 @@
}
}
- fun processZipEntry(zipFile: ZipFile, _foundry: String, zipEntry: ZipEntry, passedWaitForMorpho: Boolean) {
+ fun processZipEntry(zipFile: ApacheZipFile, _foundry: String, zipEntry: ZipArchiveEntry, passedWaitForMorpho: Boolean) {
var foundry = _foundry
var waitForMorpho = passedWaitForMorpho
LOGGER.finer("Processing ${zipEntry.name} in thread ${Thread.currentThread().threadId()}")
@@ -853,6 +973,7 @@
try {
if (zipEntry.name.matches(Regex(".*(data|tokens|structure|morpho|dependency)\\.xml$"))) {
+ LOGGER.finer("Processing entry: ${zipEntry.name}, foundry=$foundry")
// Ensure the entry stream and reader are closed to avoid native memory buildup
val dbFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance()
val dBuilder: DocumentBuilder = dbFactory.newDocumentBuilder()
@@ -894,6 +1015,10 @@
extraFeatures[docId] = extractMiscSpans(spans)
sentences[docId] = extractSentenceSpans(spans)
+ // For krill format, collect structural spans (only from base foundry to avoid duplicates)
+ if (outputFormat == OutputFormat.KRILL && foundry == "base") {
+ collectKrillStructureSpans(docId, spans)
+ }
}
"tokens.xml" -> {
@@ -902,32 +1027,48 @@
}
val tokenSpans: NodeList = doc.getElementsByTagName("span")
tokens[docId] = extractSpans(tokenSpans)
+
+ // For krill format with base foundry, collect base text data immediately
+ if (outputFormat == OutputFormat.KRILL && foundry == "base") {
+ collectKrillBaseData(docId)
+ }
}
"morpho.xml" -> {
waitForMorpho = true
fnames[docId] = zipEntry.name
+ LOGGER.info("Processing morpho.xml for $docId with foundry=$foundry from ${zipEntry.name}")
val fsSpans: NodeList = doc.getElementsByTagName("span")
val morphoSpans = extractMorphoSpans(fsSpans)
// Merge with existing morpho data (e.g., from dependency.xml)
- // instead of replacing it
- if (morpho[docId] == null) {
- morpho[docId] = morphoSpans
- } else {
- // Merge: add morpho data while preserving existing dependency data
- morphoSpans.forEach { (key, mfs) ->
- val existing = morpho[docId]?.get(key)
- if (existing != null) {
- // Preserve head and deprel from existing (dependency.xml)
- mfs.head = existing.head
- mfs.deprel = existing.deprel
+ // Synchronize access to morpho[docId] to avoid race conditions
+ val morphoMap = synchronized(morpho) {
+ morpho.getOrPut(docId) { morphoSpans }
+ }
+
+ if (morphoMap !== morphoSpans) {
+ // Map already existed, need to merge
+ synchronized(morphoMap) {
+ morphoSpans.forEach { (key, mfs) ->
+ val existing = morphoMap[key]
+ if (existing != null) {
+ // Preserve head and deprel from existing (dependency.xml)
+ mfs.head = existing.head
+ mfs.deprel = existing.deprel
+ }
+ morphoMap[key] = mfs
}
- morpho[docId]!![key] = mfs
+ LOGGER.fine("Merged morpho.xml with existing data for $docId (preserved ${morphoMap.count { it.value.head != "_" }} dependency relations)")
}
- LOGGER.fine("Merged morpho.xml with existing data for $docId (preserved ${morpho[docId]!!.count { it.value.head != "_" }} dependency relations)")
}
tokens[docId] = extractSpans(fsSpans)
+
+ // For krill format, collect morpho data immediately with the correct foundry
+ if (outputFormat == OutputFormat.KRILL) {
+ val morphoFoundry = getFoundryForLayer(foundry, "morpho")
+ collectKrillMorphoData(docId, morphoFoundry, "morpho")
+ }
}
"dependency.xml" -> {
@@ -940,26 +1081,38 @@
// Merge dependency info into existing morpho data
// Note: heads are stored as offsets (e.g., "100-110") and will be resolved
// to token indices later during CoNLL-U output
- if (morpho[docId] == null) {
- morpho[docId] = mutableMapOf()
- LOGGER.info("Created new morpho map for $docId")
+ // Synchronize access to morpho[docId] to avoid race conditions
+ val morphoMap = synchronized(morpho) {
+ morpho.getOrPut(docId) {
+ LOGGER.info("Created new morpho map for $docId")
+ mutableMapOf()
+ }
}
+
var mergedCount = 0
var newCount = 0
- depMap.forEach { (key, depSpan) ->
- val existing = morpho[docId]?.get(key)
- if (existing != null) {
- // Update existing morpho with dependency info (head is still offset-based)
- existing.head = depSpan.head
- existing.deprel = depSpan.deprel
- mergedCount++
- } else {
- // Create new entry with just dependency info
- morpho[docId]!![key] = depSpan
- newCount++
+ synchronized(morphoMap) {
+ depMap.forEach { (key, depSpan) ->
+ val existing = morphoMap[key]
+ if (existing != null) {
+ // Update existing morpho with dependency info (head is still offset-based)
+ existing.head = depSpan.head
+ existing.deprel = depSpan.deprel
+ mergedCount++
+ } else {
+ // Create new entry with just dependency info
+ morphoMap[key] = depSpan
+ newCount++
+ }
}
}
LOGGER.info("Dependency merge complete: $mergedCount merged, $newCount new entries (heads will be resolved during output)")
+
+ // For krill format, collect dependency data with the correct foundry
+ if (outputFormat == OutputFormat.KRILL) {
+ val depFoundry = getFoundryForLayer(foundry, "dependency")
+ collectKrillMorphoData(docId, depFoundry, "dependency")
+ }
}
}
@@ -972,15 +1125,18 @@
waitForMorpho -> true
// For direct KorAPXML output without external annotator, require morpho unless -t/-P (handled above)
outputFormat == OutputFormat.KORAPXML && annotationWorkerPool == null -> true
+ // For krill format, morpho is not required - we collect whatever is available
+ outputFormat == OutputFormat.KRILL -> false
else -> false
}
// For lemma-only/lemma-based word2vec/now, we can proceed without full text
val textRequired = when (outputFormat) {
OutputFormat.WORD2VEC, OutputFormat.NOW -> !(useLemma || lemmaOnly)
+ OutputFormat.KRILL -> true // Krill needs text from base ZIP
else -> true
}
- LOGGER.fine("Checking if ready to process $docId: texts=${texts[docId] != null}, sentences=${sentences[docId] != null}, tokens=${tokens[docId] != null}, morpho=${morpho[docId] != null}, morphoRequired=$morphoRequired, textRequired=$textRequired, annotationWorkerPool=${annotationWorkerPool != null}")
+ LOGGER.fine("Checking if ready to process $docId: texts=${texts[docId] != null}, sentences=${sentences[docId] != null}, tokens=${tokens[docId] != null}, morpho=${morpho[docId] != null}, morphoRequired=$morphoRequired, textRequired=$textRequired")
if ((texts[docId] != null || !textRequired) && sentences[docId] != null && tokens[docId] != null
&& (!morphoRequired || morpho[docId] != null)
@@ -991,13 +1147,19 @@
} else {
LOGGER.fine("NOT ready to process $docId yet: textOK=${texts[docId] != null || !textRequired}, sentencesOK=${sentences[docId] != null}, tokensOK=${tokens[docId] != null}, morphoOK=${!morphoRequired || morpho[docId] != null}")
}
- } else if (extractMetadataRegex.isNotEmpty() && zipEntry.name.matches(Regex(".*/header\\.xml$"))) {
+ } else if ((extractMetadataRegex.isNotEmpty() || outputFormat == OutputFormat.KRILL) && zipEntry.name.matches(Regex(".*/header\\.xml$"))) {
//LOGGER.info("Processing header file: " + zipEntry.name)
val text = zipFile.getInputStream(zipEntry).bufferedReader().use { it.readText() }
val docId =
Regex("<textSigle>([^<]+)</textSigle>").find(text)?.destructured?.component1()
?.replace(Regex("/"), "_")
LOGGER.fine("Processing header file: " + zipEntry.name + " docId: " + docId)
+
+ // For krill format, extract rich metadata
+ if (outputFormat == OutputFormat.KRILL && docId != null) {
+ collectKrillMetadata(docId, text)
+ }
+
val meta = ArrayList<String>()
extractMetadataRegex.forEach { regex ->
val match = Regex(regex).find(text)
@@ -1012,10 +1174,12 @@
useLemma -> true
waitForMorpho -> true
outputFormat == OutputFormat.KORAPXML && annotationWorkerPool == null -> true
+ outputFormat == OutputFormat.KRILL -> false
else -> false
}
val textRequired = when (outputFormat) {
OutputFormat.WORD2VEC, OutputFormat.NOW -> !(useLemma || lemmaOnly)
+ OutputFormat.KRILL -> true
else -> true
}
if ((texts[docId] != null || !textRequired) && sentences[docId] != null && tokens[docId] != null
@@ -1050,8 +1214,14 @@
docId: String,
foundry: String,
) {
- LOGGER.fine("Processing text: $docId in thread ${Thread.currentThread().threadId()}")
+ LOGGER.fine("processText called: $docId, foundry=$foundry, outputFormat=$outputFormat")
var morphoFoundry = getMorphoFoundry()
+
+ // Special handling for krill format: data is collected immediately when parsed, not here
+ if (outputFormat == OutputFormat.KRILL) {
+ return
+ }
+
val output =
if (outputFormat == OutputFormat.WORD2VEC) {
lmTrainingOutput(docId)
@@ -1965,7 +2135,674 @@
sentences.remove(tempDocId)
}
-}
+ // Collect structural spans from structure.xml for krill format
+ private fun collectKrillStructureSpans(docId: String, spans: NodeList) {
+ val textData = krillData.getOrPut(docId) {
+ KrillTextData(textId = docId)
+ }
+
+ synchronized(textData) {
+            // Only collect once; avoids duplicates when the same text appears in multiple ZIPs
+ if (textData.structureSpans.isNotEmpty()) {
+ LOGGER.fine("Structure spans already collected for $docId, skipping")
+ return
+ }
+ for (i in 0 until spans.length) {
+ val span = spans.item(i) as? Element ?: continue
+
+ val from = span.getAttribute("from").toIntOrNull() ?: continue
+ val to = span.getAttribute("to").toIntOrNull() ?: continue
+ val level = span.getAttribute("l").toIntOrNull() ?: 0
+
+ // Extract span name from fs/f[@name='name']
+ val fsElements = span.getElementsByTagName("fs")
+ if (fsElements.length == 0) continue
+
+ val fs = fsElements.item(0) as Element
+ val fElements = fs.getElementsByTagName("f")
+
+ var spanName: String? = null
+ val attributes = mutableMapOf<String, String>()
+
+ for (j in 0 until fElements.length) {
+ val f = fElements.item(j) as Element
+ val fName = f.getAttribute("name")
+
+ when (fName) {
+ "name" -> spanName = f.textContent
+ "attr" -> {
+ // Extract attributes from nested fs
+ val attrFs = f.getElementsByTagName("fs")
+ if (attrFs.length > 0) {
+ val attrFsElement = attrFs.item(0) as Element
+ val attrFElements = attrFsElement.getElementsByTagName("f")
+ for (k in 0 until attrFElements.length) {
+ val attrF = attrFElements.item(k) as Element
+ val attrName = attrF.getAttribute("name")
+ val attrValue = attrF.textContent
+ attributes[attrName] = attrValue
+ }
+ }
+ }
+ }
+ }
+
+ if (spanName != null) {
+ textData.structureSpans.add(StructureSpan(
+ layer = "dereko/s:$spanName",
+ from = from,
+ to = to,
+ tokenFrom = -1, // Will be resolved later
+ tokenTo = -1, // Will be resolved later
+ depth = level,
+ attributes = attributes
+ ))
+ }
+ }
+
+ LOGGER.fine("Collected ${textData.structureSpans.size} structural spans for $docId")
+ }
+ }
+
+ // Collect rich metadata from header.xml for krill format
+ private fun collectKrillMetadata(docId: String, headerXml: String) {
+ val textData = krillData.getOrPut(docId) {
+ KrillTextData(textId = docId)
+ }
+
+ synchronized(textData) {
+ // Extract various metadata fields
+ Regex("<editor>([^<]+)</editor>").find(headerXml)?.let {
+ textData.headerMetadata["corpusEditor"] = it.groupValues[1]
+ textData.headerMetadata["editor"] = it.groupValues[1]
+ }
+
+ // Publisher
+ Regex("<publisher>([^<]+)</publisher>").find(headerXml)?.let {
+ textData.headerMetadata["publisher"] = it.groupValues[1]
+ }
+
+ // Availability (license)
+ Regex("<availability[^>]*>([^<]+)</availability>").find(headerXml)?.let {
+ textData.headerMetadata["availability"] = it.groupValues[1]
+ }
+
+ // Author
+ Regex("<h\\.author>([^<]+)</h\\.author>").find(headerXml)?.let {
+ textData.headerMetadata["author"] = it.groupValues[1].trim()
+ }
+
+ // Title (from analytic section)
+ Regex("<analytic>.*?<h\\.title[^>]*>([^<]+)</h\\.title>", RegexOption.DOT_MATCHES_ALL).find(headerXml)?.let {
+ textData.headerMetadata["title"] = it.groupValues[1]
+ }
+
+ // Corpus title (from monogr section)
+ Regex("<monogr>.*?<h\\.title[^>]*>([^<]+)</h\\.title>", RegexOption.DOT_MATCHES_ALL).find(headerXml)?.let {
+ textData.headerMetadata["corpusTitle"] = it.groupValues[1]
+ }
+
+ // Reference
+ Regex("<reference type=\"complete\"[^>]*>([^<]+)</reference>").find(headerXml)?.let {
+ textData.headerMetadata["reference"] = it.groupValues[1]
+ }
+
+ // Creation date
+ Regex("<creatDate>([^<]+)</creatDate>").find(headerXml)?.let {
+ val dateStr = it.groupValues[1].replace(".", "-")
+ textData.headerMetadata["creationDate"] = dateStr
+ }
+
+ // Publication date (from year/month/day elements)
+ val year = Regex("<pubDate type=\"year\">([^<]+)</pubDate>").find(headerXml)?.groupValues?.get(1)
+ val month = Regex("<pubDate type=\"month\">([^<]+)</pubDate>").find(headerXml)?.groupValues?.get(1)
+ val day = Regex("<pubDate type=\"day\">([^<]+)</pubDate>").find(headerXml)?.groupValues?.get(1)
+ if (year != null && month != null && day != null) {
+ val monthPadded = month.padStart(2, '0')
+ val dayPadded = day.padStart(2, '0')
+ textData.headerMetadata["pubDate"] = "$year-$monthPadded-$dayPadded"
+ }
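+            // e.g. year "2024", month "3", day "7" -> pubDate "2024-03-07"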
+
+ // Text class (from catRef)
+ Regex("<catRef[^>]+target=\"([^\"]+)\"").find(headerXml)?.let {
+ val target = it.groupValues[1]
+                // Extract topics from a target like "topic.staat-gesellschaft.biographien-interviews"
+ val topics = target.split(".").drop(1) // Drop "topic" prefix
+ if (topics.isNotEmpty()) {
+ textData.headerMetadata["textClass"] = topics
+ }
+ }
+
+ // Text type
+ Regex("<textTypeArt>([^<]+)</textTypeArt>").find(headerXml)?.let {
+ textData.headerMetadata["textTypeArt"] = it.groupValues[1]
+ }
+
+ // External link (from page_url ref)
+ Regex("<ref type=\"page_url\" target=\"([^\"]+)\"").find(headerXml)?.let {
+ textData.headerMetadata["externalLink"] = it.groupValues[1]
+ }
+
+            // Language: not yet inferred from the header; defaults to "de" (German corpora)
+            textData.headerMetadata["language"] = "de"
+
+            // Text type (plural form): naively derived by appending "en" to textTypeArt,
+            // falling back to "Diskussion" (-> "Diskussionen") when textTypeArt is absent
+            textData.headerMetadata["textType"] = textData.headerMetadata.getOrDefault("textTypeArt", "Diskussion") as String + "en"
+
+ // Distributor - default value for IDS corpora
+ textData.headerMetadata["distributor"] = "Leibniz-Institut für Deutsche Sprache"
+
+ LOGGER.fine("Collected ${textData.headerMetadata.size} metadata fields for $docId")
+ }
+ }
+
+ // Collect base text data (text, tokens, sentences) for krill format
+ private fun collectKrillBaseData(docId: String) {
+ LOGGER.info("Collecting krill base data for $docId: text=${texts[docId] != null}, tokens=${tokens[docId] != null}, sentences=${sentences[docId] != null}")
+
+ val textData = krillData.getOrPut(docId) {
+ KrillTextData(textId = docId)
+ }
+
+ synchronized(textData) {
+ if (texts[docId] != null) {
+ textData.textContent = texts[docId]!!.toString()
+ }
+ if (tokens[docId] != null) {
+ textData.tokens = tokens[docId]
+ }
+ if (sentences[docId] != null) {
+ textData.sentences = sentences[docId]
+ }
+ // Collect metadata
+ if (metadata[docId] != null) {
+ metadata[docId]!!.forEachIndexed { index, value ->
+ textData.headerMetadata["field_$index"] = value
+ }
+ }
+ LOGGER.info(" Collected base text data for $docId: ${textData.textContent?.length ?: 0} chars, ${textData.tokens?.size ?: 0} tokens")
+ }
+
+ // Release base data from memory (but keep morpho for later foundries)
+ texts.remove(docId)
+ tokens.remove(docId)
+ sentences.remove(docId)
+ fnames.remove(docId)
+ metadata.remove(docId)
+ extraFeatures.remove(docId)
+ }
+
+ // Extract the appropriate foundry name for a given annotation layer
+ // For combined foundries like "marmot-malt", split into morpho (marmot) and dependency (malt) parts
+ private fun getFoundryForLayer(foundry: String, layer: String): String {
+ return if ("-" in foundry) {
+ val parts = foundry.split("-")
+ when (layer) {
+ "morpho" -> parts[0] // First part is morphology tagger (e.g., "marmot")
+ "dependency" -> parts.getOrElse(1) { parts[0] } // Second part is parser (e.g., "malt")
+ else -> foundry
+ }
+ } else {
+ foundry // Single foundry used for all layers
+ }
+ }
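+    // e.g. getFoundryForLayer("marmot-malt", "morpho") == "marmot",
+    //      getFoundryForLayer("marmot-malt", "dependency") == "malt",
+    //      getFoundryForLayer("spacy", "dependency") == "spacy"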
+
+ // Collect morpho data from a specific foundry for krill format
+ // annotationType: "morpho" = collect POS/lemma/features, "dependency" = collect head/deprel only
+ private fun collectKrillMorphoData(docId: String, foundry: String, annotationType: String = "morpho") {
+ LOGGER.info("Collecting krill $annotationType data for $docId, foundry=$foundry, morpho=${morpho[docId]?.size ?: 0}")
+
+ val textData = krillData.getOrPut(docId) {
+ KrillTextData(textId = docId)
+ }
+
+ val morphoDataMap = morpho[docId]
+ if (morphoDataMap != null && morphoDataMap.isNotEmpty()) {
+ // Synchronize on morpho map to avoid concurrent modification
+ synchronized(morphoDataMap) {
+ // Copy the data while holding the lock, filtering by annotation type
+ val morphoDataCopy = morphoDataMap.mapValues { (_, span) ->
+ // Create a filtered copy of the span based on annotation type
+ val filteredSpan = MorphoSpan()
+ if (annotationType == "morpho") {
+ // Copy only morphological annotations (POS, lemma, features)
+ filteredSpan.lemma = span.lemma
+ filteredSpan.upos = span.upos
+ filteredSpan.xpos = span.xpos
+ filteredSpan.feats = span.feats
+ filteredSpan.misc = span.misc
+ } else if (annotationType == "dependency") {
+ // Copy only dependency annotations (head, deprel)
+ filteredSpan.head = span.head
+ filteredSpan.deprel = span.deprel
+ }
+ filteredSpan
+ }.toMutableMap()
+
+ synchronized(textData) {
+ // Merge with existing morpho data for this foundry (don't overwrite)
+ val existingFoundryData = textData.morphoByFoundry[foundry]
+ if (existingFoundryData == null) {
+ // First time collecting this foundry - just copy
+ textData.morphoByFoundry[foundry] = morphoDataCopy
+ LOGGER.info(" Added ${morphoDataCopy.size} $annotationType annotations for $docId from foundry $foundry, total foundries=${textData.morphoByFoundry.keys}")
+ } else {
+ // Merge with existing data (e.g., adding dependencies to existing morpho)
+ var mergedCount = 0
+ var newCount = 0
+ morphoDataCopy.forEach { (key, newSpan) ->
+ val existingSpan = existingFoundryData[key]
+ if (existingSpan != null) {
+ // Merge: add new annotations based on type
+ if (annotationType == "dependency") {
+ // Only update dependency fields
+ if (newSpan.head != null && newSpan.head != "_") existingSpan.head = newSpan.head
+ if (newSpan.deprel != null && newSpan.deprel != "_") existingSpan.deprel = newSpan.deprel
+ } else if (annotationType == "morpho") {
+ // Only update morphological fields (check for "_" since MorphoSpan defaults to "_", not null)
+ if (newSpan.lemma != null && newSpan.lemma != "_" && (existingSpan.lemma == null || existingSpan.lemma == "_")) existingSpan.lemma = newSpan.lemma
+ if (newSpan.upos != null && newSpan.upos != "_" && (existingSpan.upos == null || existingSpan.upos == "_")) existingSpan.upos = newSpan.upos
+ if (newSpan.xpos != null && newSpan.xpos != "_" && (existingSpan.xpos == null || existingSpan.xpos == "_")) existingSpan.xpos = newSpan.xpos
+ if (newSpan.feats != null && newSpan.feats != "_" && (existingSpan.feats == null || existingSpan.feats == "_")) existingSpan.feats = newSpan.feats
+ if (newSpan.misc != null && newSpan.misc != "_" && (existingSpan.misc == null || existingSpan.misc == "_")) existingSpan.misc = newSpan.misc
+ }
+ mergedCount++
+ } else {
+ // New span not in existing data
+ existingFoundryData[key] = newSpan
+ newCount++
+ }
+ }
+ LOGGER.info(" Merged ${morphoDataCopy.size} $annotationType annotations for $docId from foundry $foundry ($mergedCount merged, $newCount new), total foundries=${textData.morphoByFoundry.keys}")
+ }
+ }
+ }
+ }
+
+ // Note: Don't clear morpho[docId] here because we might need it for subsequent layers
+ // (e.g., when processing marmot-malt, morpho.xml is collected as "marmot" first,
+ // then dependency.xml needs the data to collect as "malt")
+ // The data will be cleared when the document is fully processed
+ }
+
+ // Old collectKrillData - no longer used, kept for reference
+ private fun collectKrillData(docId: String, foundry: String) {
+ LOGGER.info("Collecting krill data for $docId, foundry=$foundry, morpho=${morpho[docId]?.size ?: 0}")
+
+ // Get or create KrillTextData for this text
+ val wasNew = krillData[docId] == null
+ val textData = krillData.getOrPut(docId) {
+ LOGGER.info(" Creating new KrillTextData for $docId")
+ KrillTextData(textId = docId)
+ }
+ if (!wasNew) {
+ LOGGER.info(" Found existing KrillTextData for $docId, foundries=${textData.morphoByFoundry.keys}")
+ }
+
+ // Collect text content (only from base foundry)
+ if (foundry == "base" && texts[docId] != null) {
+ synchronized(textData) {
+ textData.textContent = texts[docId]!!.toString()
+ textData.tokens = tokens[docId]
+ textData.sentences = sentences[docId]
+ LOGGER.info(" Collected base text data for $docId: ${textData.textContent?.length ?: 0} chars, ${textData.tokens?.size ?: 0} tokens")
+
+ // Collect metadata
+ if (metadata[docId] != null) {
+ metadata[docId]!!.forEachIndexed { index, value ->
+ textData.headerMetadata["field_$index"] = value
+ }
+ }
+ }
+ }
+
+ // Collect morpho annotations from this foundry
+ if (morpho[docId] != null && morpho[docId]!!.isNotEmpty()) {
+ synchronized(textData) {
+ // Make a copy of the morpho data to preserve it
+ textData.morphoByFoundry[foundry] = morpho[docId]!!.toMutableMap()
+ LOGGER.info(" Added ${morpho[docId]!!.size} morpho spans for $docId from foundry $foundry, total foundries=${textData.morphoByFoundry.keys}")
+ }
+ }
+
+ // Release memory for this document
+ arrayOf(tokens, texts, sentences, morpho, fnames, metadata, extraFeatures).forEach { map ->
+ if (map === morpho) {
+ morpho[docId]?.clear()
+ }
+ map.remove(docId)
+ }
+ }
+
+ private fun generateKrillJson(textData: KrillTextData): String {
+ val sb = StringBuilder()
+ sb.append("{")
+
+ // @context and version
+ sb.append("\"@context\":\"http://korap.ids-mannheim.de/ns/koral/0.4/context.jsonld\",")
+ sb.append("\"version\":\"0.4\",")
+
+ // fields (metadata)
+ sb.append("\"fields\":[")
+ val fields = mutableListOf<String>()
+
+ // Extract corpus, doc, and text sigle from textId (e.g., "WUD24_I0083.95367")
+ // Convert underscores to slashes for proper format
+ val textIdWithSlashes = textData.textId.replace("_", "/").replace(".", "/")
+ val sigleParts = textIdWithSlashes.split("/")
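+        // e.g. "WUD24_I0083.95367" -> "WUD24/I0083/95367": corpusSigle "WUD24",
+        // docSigle "WUD24/I0083", textSigle "WUD24/I0083/95367"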
+ if (sigleParts.size >= 3) {
+ fields.add(jsonObject(listOf(
+ "value" to jsonString(sigleParts[0]),
+ "type" to jsonString("type:string"),
+ "@type" to jsonString("koral:field"),
+ "key" to jsonString("corpusSigle")
+ )))
+ fields.add(jsonObject(listOf(
+ "@type" to jsonString("koral:field"),
+ "value" to jsonString("${sigleParts[0]}/${sigleParts[1]}"),
+ "type" to jsonString("type:string"),
+ "key" to jsonString("docSigle")
+ )))
+ fields.add(jsonObject(listOf(
+ "@type" to jsonString("koral:field"),
+ "type" to jsonString("type:string"),
+ "value" to jsonString(textIdWithSlashes),
+ "key" to jsonString("textSigle")
+ )))
+ }
+
+ // Add additional metadata fields from header with correct types
+ val fieldOrder = listOf(
+ "corpusEditor", "distributor", "editor", "externalLink", "publisher", "reference",
+ "creationDate", "pubDate", "textClass", "availability", "language", "textType",
+ "textTypeArt", "author", "corpusTitle", "title"
+ )
+
+ fieldOrder.forEach { key ->
+ val value = textData.headerMetadata[key] ?: return@forEach
+
+ // Determine field type and value format
+ val (fieldType, fieldValue) = when (key) {
+ "creationDate", "pubDate" -> {
+ "type:date" to jsonString(value.toString())
+ }
+ "textClass" -> {
+ "type:keywords" to when (value) {
+ is List<*> -> jsonArray(value.map { jsonString(it.toString()) })
+ else -> jsonArray(listOf(jsonString(value.toString())))
+ }
+ }
+ "availability", "language" -> {
+ "type:string" to jsonString(value.toString())
+ }
+ "author", "corpusTitle", "title" -> {
+ "type:text" to jsonString(value.toString())
+ }
+ "externalLink" -> {
+ val url = value.toString()
+                    // Use the publisher as the link title if available
+ val title = textData.headerMetadata["publisher"]?.toString() ?: "Link"
+ val encodedUrl = url.replace(":", "%3A").replace("/", "%2F")
+ "type:attachement" to jsonString("data:application/x.korap-link;title=$title,$encodedUrl")
+ }
+ else -> {
+ // corpusEditor, distributor, editor, publisher, reference, textType, textTypeArt
+ "type:attachement" to jsonString("data:,${value.toString()}")
+ }
+ }
+
+ fields.add(jsonObject(listOf(
+ "key" to jsonString(key),
+ "@type" to jsonString("koral:field"),
+ "value" to fieldValue,
+ "type" to jsonString(fieldType)
+ )))
+ }
+
+ sb.append(fields.joinToString(","))
+ sb.append("],")
+
+ // data section
+ sb.append("\"data\":{")
+ sb.append("\"text\":${jsonString(textData.textContent ?: "")},")
+
+ // layerInfos - list all foundries
+ val layerInfos = mutableListOf<String>()
+ if (textData.sentences != null) {
+ layerInfos.add("dereko/s=spans")
+ }
+
+ // Collect layers by foundry type (with dependency check)
+ val foundryLayers = mutableMapOf<String, MutableSet<String>>()
+ textData.morphoByFoundry.keys.sorted().forEach { foundry ->
+ val shortFoundry = when(foundry) {
+ "base" -> null
+ "tree_tagger" -> "tt"
+ "marmot-malt" -> "marmot"
+ else -> foundry
+ }
+ if (shortFoundry != null) {
+ val layers = foundryLayers.getOrPut(shortFoundry) { mutableSetOf() }
+
+ // Check if this foundry has dependency annotations
+ val hasDependencies = textData.morphoByFoundry[foundry]?.values?.any {
+ it.head != null && it.head != "_" && it.deprel != null && it.deprel != "_"
+ } ?: false
+
+ if (hasDependencies) {
+ layers.add("d=rels")
+ }
+ layers.add("l=tokens")
+ layers.add("p=tokens")
+ layers.add("m=tokens")
+ }
+ }
+
+ // Add foundry layers in sorted order
+ foundryLayers.keys.sorted().forEach { foundry ->
+ foundryLayers[foundry]?.sorted()?.forEach { layer ->
+ layerInfos.add("$foundry/$layer")
+ }
+ }
+ sb.append("\"layerInfos\":${jsonString(layerInfos.joinToString(" "))},")
+
+ // stream - token-level annotations
+ sb.append("\"stream\":[")
+ if (textData.tokens != null) {
+ val streamItems = generateKrillStream(textData)
+ sb.append(streamItems.joinToString(","))
+ }
+ sb.append("]")
+
+ sb.append("}") // close data
+ sb.append("}") // close root
+
+ return sb.toString()
+ }
+
+ private fun generateKrillStream(textData: KrillTextData): List<String> {
+ val tokens = textData.tokens ?: return emptyList()
+ val text = textData.textContent ?: ""
+ val sentences = textData.sentences ?: emptyArray()
+ val result = mutableListOf<String>()
+
+ // Build offset-to-index map for resolving dependency heads and structural spans
+ val offsetToIndex = mutableMapOf<String, Int>()
+ tokens.forEachIndexed { index, token ->
+ offsetToIndex["${token.from}-${token.to}"] = index
+ }
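+        // e.g. a token spanning characters 100..110 is keyed "100-110", matching
+        // the offset-based head references from dependency.xml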
+
+ // Resolve tokenFrom and tokenTo for structural spans
+ val resolvedStructureSpans = textData.structureSpans.map { span ->
+ // Find first and last token covered by this span
+ var tokenFrom = tokens.indexOfFirst { it.from >= span.from && it.from < span.to }
+ var tokenTo = tokens.indexOfLast { it.to > span.from && it.to <= span.to }
+
+ // Handle edge cases
+ if (tokenFrom == -1) tokenFrom = 0
+ if (tokenTo == -1) tokenTo = tokens.size - 1
+
+ span.copy(tokenFrom = tokenFrom, tokenTo = tokenTo)
+ }
+
+ // Group structural spans by their starting token
+ val spansByToken = mutableMapOf<Int, MutableList<StructureSpan>>()
+ resolvedStructureSpans.forEach { span ->
+ spansByToken.getOrPut(span.tokenFrom) { mutableListOf() }.add(span)
+ }
+
+ // Count paragraph spans (name="p")
+ val paragraphCount = textData.structureSpans.count { it.layer.endsWith(":p") }
+
+ tokens.forEachIndexed { index, token ->
+ val tokenAnnotations = mutableListOf<String>()
+ val spanKey = "${token.from}-${token.to}"
+
+            // Add counts and structural spans only for the first token
+ if (index == 0) {
+ if (paragraphCount > 0) {
+ tokenAnnotations.add(jsonString("-:base/paragraphs\$<i>$paragraphCount"))
+ }
+ if (sentences.isNotEmpty()) {
+ tokenAnnotations.add(jsonString("-:base/sentences\$<i>${sentences.size}"))
+ }
+ tokenAnnotations.add(jsonString("-:tokens\$<i>${tokens.size}"))
+
+ // Add all structural spans that start at token 0 or cover the whole document
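+                // Span term layout (field meanings assumed from Krill's index format):
+                // <>:layer$<b>64<i>charFrom<i>charTo<i>tokenTo<b>depth
+                // e.g. <>:dereko/s:p$<b>64<i>0<i>42<i>7<b>1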
+ val spansAtZero = spansByToken[0] ?: emptyList()
+ spansAtZero.sortedWith(compareBy({ -it.depth }, { it.layer })).forEach { span ->
+ val spanAnnotation = if (span.attributes.isEmpty()) {
+ "<>:${span.layer}\$<b>64<i>${span.from}<i>${span.to}<i>${span.tokenTo}<b>${span.depth}"
+ } else {
+                        // Spans with attributes reference an ID; the nesting depth is reused as that ID here
+                        val attrId = span.depth
+ "<>:${span.layer}\$<b>64<i>${span.from}<i>${span.to}<i>${span.tokenTo}<b>${span.depth}<s>$attrId"
+ }
+ tokenAnnotations.add(jsonString(spanAnnotation))
+
+ // Add attribute annotations
+ span.attributes.forEach { (key, value) ->
+ val attrAnnotation = if (value.isEmpty()) {
+ "@:dereko/s:$key\$<b>17<s>${span.depth}<i>${span.tokenTo}"
+ } else {
+ "@:dereko/s:$key:$value\$<b>17<s>${span.depth}<i>${span.tokenTo}"
+ }
+ tokenAnnotations.add(jsonString(attrAnnotation))
+ }
+ }
+ } else {
+ // Add structural spans that start at this token
+ spansByToken[index]?.sortedWith(compareBy({ -it.depth }, { it.layer }))?.forEach { span ->
+ val spanAnnotation = if (span.attributes.isEmpty()) {
+ "<>:${span.layer}\$<b>64<i>${span.from}<i>${span.to}<i>${span.tokenTo}<b>${span.depth}"
+ } else {
+ "<>:${span.layer}\$<b>64<i>${span.from}<i>${span.to}<i>${span.tokenTo}<b>${span.depth}<s>${span.depth}"
+ }
+ tokenAnnotations.add(jsonString(spanAnnotation))
+
+ span.attributes.forEach { (key, value) ->
+ val attrAnnotation = if (value.isEmpty()) {
+ "@:dereko/s:$key\$<b>17<s>${span.depth}<i>${span.tokenTo}"
+ } else {
+ "@:dereko/s:$key:$value\$<b>17<s>${span.depth}<i>${span.tokenTo}"
+ }
+ tokenAnnotations.add(jsonString(attrAnnotation))
+ }
+ }
+ }
+
+ // Token offset annotation
+ tokenAnnotations.add(jsonString("_$index\$<i>${token.from}<i>${token.to}"))
+
+        // Lowercased lemma from the base foundry (emitted as the "i:" annotation)
+ val baseMorpho = textData.morphoByFoundry["base"]?.get(spanKey)
+ val lemma = baseMorpho?.lemma?.takeIf { it != "_" }
+ if (lemma != null) {
+ tokenAnnotations.add(jsonString("i:${lemma.lowercase()}"))
+ }
+
+ // Collect annotations from all foundries for this token
+ val sortedFoundries = textData.morphoByFoundry.keys.sorted()
+ sortedFoundries.forEach { foundry ->
+ val morphoSpan = textData.morphoByFoundry[foundry]?.get(spanKey)
+ if (morphoSpan != null) {
+ val prefix = when(foundry) {
+ "tree_tagger" -> "tt"
+ "marmot-malt" -> "marmot"
+ "base" -> null // Skip base for most annotations
+ else -> foundry
+ }
+
+ if (prefix != null) {
+ // Morphological features (sorted)
+ if (morphoSpan.feats != null && morphoSpan.feats != "_") {
+ val features = mutableListOf<String>()
+ morphoSpan.feats!!.split("|").forEach { feat ->
+ val parts = feat.split("=")
+ if (parts.size == 2) {
+ val key = parts[0].lowercase()
+ val value = parts[1].lowercase()
+ features.add("$prefix/m:$key:$value")
+ }
+ }
+ features.sorted().forEach { tokenAnnotations.add(jsonString(it)) }
+ }
+
+                    // POS (xpos), emitted as a plain term without a payload
+ if (morphoSpan.xpos != null && morphoSpan.xpos != "_") {
+ tokenAnnotations.add(jsonString("$prefix/p:${morphoSpan.xpos}"))
+ }
+
+ // Lemma
+ if (morphoSpan.lemma != null && morphoSpan.lemma != "_") {
+ tokenAnnotations.add(jsonString("$prefix/l:${morphoSpan.lemma}"))
+ }
+
+ // UPOS
+ if (morphoSpan.upos != null && morphoSpan.upos != "_") {
+ tokenAnnotations.add(jsonString("$prefix/u:${morphoSpan.upos}"))
+ }
+ }
+
+ // Dependency relations
+ if (morphoSpan.head != null && morphoSpan.head != "_" && morphoSpan.deprel != null && morphoSpan.deprel != "_") {
+ // Head can be either an offset (e.g., "100-110") or a token index (e.g., "1")
+ val headStr = morphoSpan.head!!
+ val resolvedHeadIndex = if (headStr.contains("-")) {
+ // Offset format - resolve to token index
+ offsetToIndex[headStr]
+ } else {
+ // Already a token index (1-based CoNLL-U format)
+ val idx = headStr.toIntOrNull()
+ if (idx != null && idx > 0) idx - 1 else null // Convert 1-based to 0-based
+ }
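+                    // e.g. head "100-110" resolves via offsetToIndex, while a
+                    // 1-based CoNLL-U head "3" becomes 0-based index 2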
+
+ if (resolvedHeadIndex != null) {
+ // Regular dependency - outgoing edge to head
+ tokenAnnotations.add(jsonString(">:$prefix/d:${morphoSpan.deprel}\$<b>32<i>$resolvedHeadIndex"))
+ } else if (headStr == "0" || (headStr.contains("-") && headStr.startsWith("0-"))) {
+ // ROOT node - use incoming edge format with full span info
+ tokenAnnotations.add(jsonString("<:$prefix/d:${morphoSpan.deprel}\$<b>34<i>${token.from}<i>${token.to}<i>$index<i>1"))
+ }
+ }
+ }
+ }
+
+ // Surface form (always last)
+ val surfaceForm = if (token.to <= text.length) {
+ text.substring(token.from, token.to)
+ } else {
+ ""
+ }
+ tokenAnnotations.add(jsonString("s:$surfaceForm"))
+
+ result.add(jsonArray(tokenAnnotations))
+ }
+
+ return result
+ }
+
+} // End of KorapXmlTool class
fun main(args: Array<String>): Unit {
try { Locale.setDefault(Locale.ROOT) } catch (_: Exception) {}
@@ -1977,7 +2814,7 @@
}
enum class OutputFormat {
- CONLLU, WORD2VEC, KORAPXML, NOW
+ CONLLU, WORD2VEC, KORAPXML, NOW, KRILL
}
object ConlluOutputFormat {
@@ -1996,3 +2833,40 @@
const val NAME = "now"
}
+object KrillOutputFormat {
+ const val NAME = "krill"
+}
+
+// JSON utility functions for krill output (no external JSON library dependency)
+fun String.escapeJson(): String {
+ val sb = StringBuilder()
+ for (c in this) {
+ when (c) {
+ '\\' -> sb.append("\\\\")
+ '"' -> sb.append("\\\"")
+ '\b' -> sb.append("\\b")
+ '\n' -> sb.append("\\n")
+ '\r' -> sb.append("\\r")
+ '\t' -> sb.append("\\t")
+ else -> {
+ if (c < ' ') {
+ sb.append(String.format("\\u%04x", c.code))
+ } else {
+ sb.append(c)
+ }
+ }
+ }
+ }
+ return sb.toString()
+}
+
+fun jsonString(value: String): String = "\"${value.escapeJson()}\""
+
+fun jsonArray(items: List<String>): String = items.joinToString(",", "[", "]")
+
+fun jsonObject(pairs: List<Pair<String, String>>): String {
+ return pairs.joinToString(",", "{", "}") { (key, value) ->
+ "${jsonString(key)}:${value}"
+ }
+}
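+
+// Example: jsonObject(listOf("key" to jsonString("value"))) yields {"key":"value"};
+// jsonArray(listOf(jsonString("a"), jsonString("b"))) yields ["a","b"]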
+
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
index ec8f5b1..a4f9174 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
@@ -421,4 +421,127 @@
"Should find tokens with numeric HEAD values in column 7"
)
}
+
+ @Test
+ fun krillOutputMatchesExpectedStructure() {
+ // Test krill format output against expected reference
+ val baseZip = loadResource("wud24_sample.zip").path
+ val spacyZip = loadResource("wud24_sample.spacy.zip").path
+ val marmotMaltZip = loadResource("wud24_sample.marmot-malt.zip").path
+ val expectedTar = loadResource("wud24_sample.krill.tar").path
+
+ // Create temporary output file
+ val outputTar = File.createTempFile("wud24_krill_test", ".tar")
+ outputTar.deleteOnExit()
+
+ // Generate krill output
+ val args = arrayOf("-f", "krill", "-o", baseZip, spacyZip, marmotMaltZip)
+ val exitCode = debug(args)
+
+ // Check that generation succeeded
+ assertTrue(exitCode == 0, "Krill conversion should succeed")
+
+        // Generated output file name (derived from the base ZIP path)
+ val generatedTar = File(baseZip.replace(".zip", ".krill.tar"))
+ assertTrue(generatedTar.exists(), "Generated krill tar should exist at ${generatedTar.path}")
+
+ // Extract both tars to temp directories
+ val expectedDir = File.createTempFile("expected", "").let {
+ it.delete()
+ it.mkdirs()
+ it
+ }
+ val generatedDir = File.createTempFile("generated", "").let {
+ it.delete()
+ it.mkdirs()
+ it
+ }
+
+ try {
+            // Extract both tars using the system tar command
+ ProcessBuilder("tar", "-xf", expectedTar, "-C", expectedDir.path).start().waitFor()
+ ProcessBuilder("tar", "-xf", generatedTar.path, "-C", generatedDir.path).start().waitFor()
+
+ // Get list of JSON files in both directories
+ val expectedFiles = expectedDir.listFiles()?.filter { it.name.endsWith(".json.gz") }?.sorted() ?: emptyList()
+ val generatedFiles = generatedDir.listFiles()?.filter { it.name.endsWith(".json.gz") }?.sorted() ?: emptyList()
+
+ // Check same number of files
+ assertTrue(
+ expectedFiles.size == generatedFiles.size,
+ "Should have same number of JSON files. Expected: ${expectedFiles.size}, Got: ${generatedFiles.size}"
+ )
+
+ // Compare each JSON file
+ expectedFiles.zip(generatedFiles).forEach { (expectedFile, generatedFile) ->
+ System.err.println("Comparing: ${expectedFile.name} vs ${generatedFile.name}")
+
+                // Decompress both JSON files for comparison
+ val expectedJson = ProcessBuilder("gunzip", "-c", expectedFile.path)
+ .redirectOutput(ProcessBuilder.Redirect.PIPE)
+ .start()
+ .inputStream
+ .bufferedReader()
+ .readText()
+
+ val generatedJson = ProcessBuilder("gunzip", "-c", generatedFile.path)
+ .redirectOutput(ProcessBuilder.Redirect.PIPE)
+ .start()
+ .inputStream
+ .bufferedReader()
+ .readText()
+
+ // Check basic structure with simple string checks
+ // Rather than parsing JSON, just verify key elements are present
+ assertTrue(expectedJson.contains("\"@context\""), "Expected should have @context")
+ assertTrue(generatedJson.contains("\"@context\""), "Generated should have @context")
+ assertTrue(generatedJson.contains("\"version\""), "Generated should have version")
+ assertTrue(generatedJson.contains("\"fields\""), "Generated should have fields")
+ assertTrue(generatedJson.contains("\"data\""), "Generated should have data")
+ assertTrue(generatedJson.contains("\"text\""), "Generated should have text")
+ assertTrue(generatedJson.contains("\"stream\""), "Generated should have stream")
+
+ // Count metadata fields in both
+ val expectedFieldCount = Regex("\"@type\"\\s*:\\s*\"koral:field\"").findAll(expectedJson).count()
+ val generatedFieldCount = Regex("\"@type\"\\s*:\\s*\"koral:field\"").findAll(generatedJson).count()
+ assertTrue(
+ expectedFieldCount == generatedFieldCount,
+ "Should have same number of metadata fields in ${expectedFile.name}. Expected: $expectedFieldCount, Got: $generatedFieldCount"
+ )
+
+ // Count stream tokens (approximate by counting array entries)
+ // Stream format: [[...],[...],...] so count "],["
+ val expectedTokenCount = expectedJson.substringAfter("\"stream\"").let {
+ Regex("\\]\\s*,\\s*\\[").findAll(it).count() + 1
+ }
+ val generatedTokenCount = generatedJson.substringAfter("\"stream\"").let {
+ Regex("\\]\\s*,\\s*\\[").findAll(it).count() + 1
+ }
+ assertTrue(
+ expectedTokenCount == generatedTokenCount,
+ "Should have same token count in ${expectedFile.name}. Expected: $expectedTokenCount, Got: $generatedTokenCount"
+ )
+
+ // Check that we have multi-foundry annotations (spacy and malt)
+ val streamStr = generatedJson
+ assertTrue(
+ streamStr.contains("spacy/"),
+ "Should have spacy foundry annotations"
+ )
+ assertTrue(
+ streamStr.contains("malt/") || streamStr.contains("marmot/"),
+ "Should have malt or marmot foundry annotations"
+ )
+
+ System.err.println(" ✓ ${expectedFile.name} matches structure")
+ }
+
+ System.err.println("All krill output files match expected structure!")
+ } finally {
+ // Cleanup
+ expectedDir.deleteRecursively()
+ generatedDir.deleteRecursively()
+ generatedTar.delete()
+ }
+ }
}
diff --git a/app/src/test/resources/wud24_sample.cfg b/app/src/test/resources/wud24_sample.cfg
new file mode 100644
index 0000000..f534b7b
--- /dev/null
+++ b/app/src/test/resources/wud24_sample.cfg
@@ -0,0 +1,14 @@
+overwrite 0
+input-base .
+token Base#tokens
+base-sentences DeReKo#Structure
+base-paragraphs DeReKo#Structure
+base-pagebreaks DeReKo#Structure
+sequential-extraction 1
+to-tar 1
+jobs -1
+meta I5
+gzip 1
+log WARN
+koral 0.4
+output .
diff --git a/app/src/test/resources/wud24_sample.krill.tar b/app/src/test/resources/wud24_sample.krill.tar
new file mode 100644
index 0000000..0a1745e
--- /dev/null
+++ b/app/src/test/resources/wud24_sample.krill.tar
Binary files differ
diff --git a/app/src/test/resources/wud24_sample.marmot-malt.zip b/app/src/test/resources/wud24_sample.marmot-malt.zip
new file mode 100644
index 0000000..3ae661a
--- /dev/null
+++ b/app/src/test/resources/wud24_sample.marmot-malt.zip
Binary files differ
diff --git a/app/src/test/resources/wud24_sample.opennlp.zip b/app/src/test/resources/wud24_sample.opennlp.zip
new file mode 100644
index 0000000..53115dd
--- /dev/null
+++ b/app/src/test/resources/wud24_sample.opennlp.zip
Binary files differ
diff --git a/app/src/test/resources/wud24_sample.spacy.zip b/app/src/test/resources/wud24_sample.spacy.zip
new file mode 100644
index 0000000..d65720a
--- /dev/null
+++ b/app/src/test/resources/wud24_sample.spacy.zip
Binary files differ
diff --git a/app/src/test/resources/wud24_sample.tree_tagger.zip b/app/src/test/resources/wud24_sample.tree_tagger.zip
new file mode 100644
index 0000000..38934a6
--- /dev/null
+++ b/app/src/test/resources/wud24_sample.tree_tagger.zip
Binary files differ
diff --git a/app/src/test/resources/wud24_sample.zip b/app/src/test/resources/wud24_sample.zip
new file mode 100644
index 0000000..90f9c15
--- /dev/null
+++ b/app/src/test/resources/wud24_sample.zip
Binary files differ