Fix second Krill output route for UTF-8 buffer overflows

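Both Krill compression call sites encoded the whole JSON string with
json.toByteArray(Charsets.UTF_8) before compressing it, which fails for
very large texts. They now share compressKrillJson(), which streams the
string through an OutputStreamWriter into the LZ4/GZIP stream instead.

This should fix exceptions like: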

dnb24.krill.log:java.lang.NegativeArraySizeException: -1394181943
dnb24.krill.log-	at java.base/java.lang.String.encodeUTF8_UTF16(String.java:1326)
dnb24.krill.log-	at java.base/java.lang.String.encodeUTF8(String.java:1299)
dnb24.krill.log-	at java.base/java.lang.String.encode(String.java:867)
dnb24.krill.log-	at java.base/java.lang.String.getBytes(String.java:1818)
dnb24.krill.log-	at de.ids_mannheim.korapxmltools.KorapXmlTool.compressKrillText(KorapXmlTool.kt:5531)
dnb24.krill.log-	at de.ids_mannheim.korapxmltools.KorapXmlTool.enqueueKrillCompression$lambda$0(KorapXmlTool.kt:5570)
dnb24.krill.log-	at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:572)
dnb24.krill.log-	at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:317)
dnb24.krill.log-	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
dnb24.krill.log-	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)

Change-Id: I0a4029d9c8df3cbe2ceff4e7c250f1482fde3fe8
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index d089d0b..a90213c 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -5517,31 +5517,7 @@
                 KrillJsonGenerator.generate(textData, corpusMetadata, docMetadata, includeNonWordTokens)
             }
             
-            // Choose compression format based on --lz4 flag
-            val (jsonFileName, compressedData) = if (useLz4) {
-                val fileName = textId.replace("_", "-").replace(".", "-") + ".json.lz4"
-                val jsonBytes = json.toByteArray(Charsets.UTF_8)
-                val byteOut = ByteArrayOutputStream()
-                net.jpountz.lz4.LZ4FrameOutputStream(byteOut).use { lz4Out ->
-                    lz4Out.write(jsonBytes)
-                }
-                Pair(fileName, byteOut.toByteArray())
-            } else {
-                // Use GZIP with level 1 compression for speed
-                val fileName = textId.replace("_", "-").replace(".", "-") + ".json.gz"
-                val jsonBytes = json.toByteArray(Charsets.UTF_8)
-                val byteOut = ByteArrayOutputStream(jsonBytes.size)
-                
-                // Create GZIPOutputStream with level 1 (fast) compression
-                val gzipOut = object : java.util.zip.GZIPOutputStream(byteOut) {
-                    init {
-                        def.setLevel(1)
-                    }
-                }
-                gzipOut.use { it.write(jsonBytes) }
-                
-                Pair(fileName, byteOut.toByteArray())
-            }
+            val (jsonFileName, compressedData) = compressKrillJson(textId, json)
 
             // Store compressed data for sequential TAR writing
             krillCompressedData[textId] = CompressedKrillData(textId, jsonFileName, compressedData)
@@ -5585,6 +5561,33 @@
         }
     }
 
+    private fun compressKrillJson(textId: String, json: String): Pair<String, ByteArray> {
+        return if (useLz4) {
+            val fileName = textId.replace("_", "-").replace(".", "-") + ".json.lz4"
+            val byteOut = ByteArrayOutputStream()
+            net.jpountz.lz4.LZ4FrameOutputStream(byteOut).use { lz4Out ->
+                OutputStreamWriter(lz4Out, StandardCharsets.UTF_8).use { writer ->
+                    writer.write(json)
+                }
+            }
+            Pair(fileName, byteOut.toByteArray())
+        } else {
+            val fileName = textId.replace("_", "-").replace(".", "-") + ".json.gz"
+            val byteOut = ByteArrayOutputStream()
+            val gzipOut = object : java.util.zip.GZIPOutputStream(byteOut) {
+                init {
+                    def.setLevel(1)
+                }
+            }
+            gzipOut.use { gzip ->
+                OutputStreamWriter(gzip, StandardCharsets.UTF_8).use { writer ->
+                    writer.write(json)
+                }
+            }
+            Pair(fileName, byteOut.toByteArray())
+        }
+    }
+
     private fun writeReadyKrillTexts(textIds: Collection<String>): Int {
         var outputCount = 0
 
@@ -5864,31 +5867,7 @@
 
             val json = KrillJsonGenerator.generate(textData, corpusMetadata, docMetadata, includeNonWordTokens)
             
-            // Choose compression format based on --lz4 flag
-            val (jsonFileName, compressedData) = if (useLz4) {
-                val fileName = textId.replace("_", "-").replace(".", "-") + ".json.lz4"
-                val jsonBytes = json.toByteArray(Charsets.UTF_8)
-                val byteOut = ByteArrayOutputStream()
-                net.jpountz.lz4.LZ4FrameOutputStream(byteOut).use { lz4Out ->
-                    lz4Out.write(jsonBytes)
-                }
-                Pair(fileName, byteOut.toByteArray())
-            } else {
-                // Use GZIP with level 1 compression for speed
-                val fileName = textId.replace("_", "-").replace(".", "-") + ".json.gz"
-                val jsonBytes = json.toByteArray(Charsets.UTF_8)
-                val byteOut = ByteArrayOutputStream(jsonBytes.size)
-                
-                // Create GZIPOutputStream with level 1 (fast) compression
-                val gzipOut = object : java.util.zip.GZIPOutputStream(byteOut) {
-                    init {
-                        def.setLevel(1)
-                    }
-                }
-                gzipOut.use { it.write(jsonBytes) }
-                
-                Pair(fileName, byteOut.toByteArray())
-            }
+            val (jsonFileName, compressedData) = compressKrillJson(textId, json)
 
             // Write to TAR (synchronized for thread safety)
             synchronized(krillTarOutputStream!!) {
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
index 180b332..a703331 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
@@ -1,5 +1,6 @@
 package de.ids_mannheim.korapxmltools
 
+import net.jpountz.lz4.LZ4FrameInputStream
 import org.junit.After
 import org.junit.AfterClass
 import org.junit.Before
@@ -662,6 +663,35 @@
     }
 
     @Test
+    fun krillCanWriteLz4CompressedJson() {
+        val baseZip = loadResource("wud24_sample.zip").path
+        val generatedTar = ensureKrillTar("wud24_lz4", "wud24_sample.krill.tar") { outputDir ->
+            arrayOf("-t", "krill", "-q", "--lz4", "-D", outputDir.path, baseZip)
+        }
+        assertTrue(generatedTar.exists())
+
+        val extractDir = File.createTempFile("extract_lz4", "").let { it.delete(); it.mkdirs(); it }
+        try {
+            val tarProcess = ProcessBuilder("tar", "-xf", generatedTar.path, "-C", extractDir.path)
+                .redirectErrorStream(true)
+                .start()
+            assertTrue(tarProcess.waitFor() == 0, "Tar extraction should succeed for ${generatedTar.path}")
+
+            val jsonFiles = extractDir.listFiles()?.filter { it.name.endsWith(".json.lz4") }.orEmpty()
+            assertTrue(jsonFiles.isNotEmpty(), "Expected LZ4-compressed JSON files in ${generatedTar.path}")
+
+            jsonFiles.forEach { jsonFile ->
+                val jsonContent = LZ4FrameInputStream(jsonFile.inputStream()).bufferedReader().use { it.readText() }
+                assertTrue(jsonContent.contains("\"@context\""))
+                assertTrue(jsonContent.contains("\"@type\":\"koral:corpus\""))
+                assertTrue(jsonContent.contains("\"text\""))
+            }
+        } finally {
+            extractDir.deleteRecursively()
+        }
+    }
+
+    @Test
     fun testProbabilitySortingInKrillJsonOutput() {
         // Test that multiple POS annotations are sorted by descending probability in Krill JSON output
         // Use the base sample ZIP which should contain POS annotations with probabilities