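Add --lz4 option for Krill JSON output

With --lz4, each Krill JSON document is compressed as an LZ4 frame
(lz4-java) and written to the tar as <text-id>.json.lz4. Without the
flag, the output remains gzip-compressed (.json.gz).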

Change-Id: I99750a780bf0898b52fa0c2e83e587e89cbaa086
diff --git a/app/build.gradle b/app/build.gradle
index 52cc72c..97af808 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -47,6 +47,7 @@
     implementation 'org.apache.ant:ant:1.10.15'
     implementation 'org.apache.commons:commons-compress:1.28.0'
     implementation 'me.tongfei:progressbar:0.10.1'
+    implementation 'org.lz4:lz4-java:1.8.0'
 
     // Fix XML APIs conflict - force resolution to the non-relocated version
     implementation('xml-apis:xml-apis:1.0.b2') {
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index f6a003f..5147734 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -19,6 +19,9 @@
 import picocli.CommandLine.*
 import java.io.*
 import java.lang.Integer.parseInt
+import java.nio.ByteBuffer
+import java.nio.ByteOrder
+import java.nio.charset.StandardCharsets
 import java.util.*
 import java.util.concurrent.Callable
 import java.util.concurrent.ConcurrentHashMap
@@ -33,7 +36,6 @@
 import java.util.stream.IntStream
 import java.util.zip.GZIPOutputStream
 import java.util.zip.ZipFile
-import java.nio.charset.StandardCharsets
 import kotlin.text.Charsets
 import me.tongfei.progressbar.ProgressBar
 import me.tongfei.progressbar.ProgressBarBuilder
@@ -199,6 +201,12 @@
     )
     var includeNonWordTokens: Boolean = false
 
+    @Option(
+        names = ["--lz4"],
+        description = ["Use LZ4 compression for Krill JSON output instead of gzip (faster but larger files)."]
+    )
+    var useLz4: Boolean = false
+
     @Option(names = ["--offsets"], description = ["Not yet implemented: offsets"])
     var offsets: Boolean = false
 
@@ -1033,6 +1041,7 @@
                     if (!textFoundries.containsAll(expectedForThisText)) {
                         LOGGER.warning("Outputting incomplete text $textId with foundries ${textFoundries.sorted()} (expected: ${expectedForThisText.sorted()})")
                     }
+                    
                     outputKrillText(textId, textData)
                     // Continue stepping the same progress bar
                     incrementalProgressBar?.step()
@@ -3472,6 +3481,8 @@
         }
     }
 
+
+
     // Start timer-based scanner for incremental output
     private fun startIncrementalWriterThread() {
         if (outputFormat != OutputFormat.KRILL || krillTarOutputStream == null) return
@@ -3732,39 +3743,37 @@
             }
 
             val json = KrillJsonGenerator.generate(textData, corpusMetadata, docMetadata, includeNonWordTokens)
-            val jsonFileName = textId.replace("_", "-").replace(".", "-") + ".json.gz"
-
-            // Compress JSON with GZIP (fast compression level 1 for better performance)
-            val byteOut = ByteArrayOutputStream()
-            val deflater = java.util.zip.Deflater(1, true)  // Level 1 (fast), nowrap=true for gzip
-            val deflaterOut = java.util.zip.DeflaterOutputStream(byteOut, deflater)
             
-            // Write gzip header manually (required for nowrap mode)
-            byteOut.write(byteArrayOf(0x1f, 0x8b.toByte()))  // Magic number
-            byteOut.write(0x08)  // Compression method (deflate)
-            byteOut.write(0x00)  // Flags
-            byteOut.write(ByteArray(6))  // Timestamp + extra flags + OS (all zeros)
-            
-            // Compress data
-            val jsonBytes = json.toByteArray(Charsets.UTF_8)
-            deflaterOut.write(jsonBytes)
-            deflaterOut.finish()
-            
-            // Write gzip trailer (CRC32 + uncompressed size)
-            val crc = java.util.zip.CRC32()
-            crc.update(jsonBytes)
-            val crcValue = crc.value.toInt()
-            byteOut.write(crcValue and 0xFF)
-            byteOut.write((crcValue shr 8) and 0xFF)
-            byteOut.write((crcValue shr 16) and 0xFF)
-            byteOut.write((crcValue shr 24) and 0xFF)
-            val size = jsonBytes.size
-            byteOut.write(size and 0xFF)
-            byteOut.write((size shr 8) and 0xFF)
-            byteOut.write((size shr 16) and 0xFF)
-            byteOut.write((size shr 24) and 0xFF)
-            
-            val compressedData = byteOut.toByteArray()
+            // Choose compression format based on --lz4 flag
+            val (jsonFileName, compressedData) = if (useLz4) {
+                val fileName = textId.replace("_", "-").replace(".", "-") + ".json.lz4"
+                val jsonBytes = json.toByteArray(Charsets.UTF_8)
+                val byteOut = ByteArrayOutputStream()
+                net.jpountz.lz4.LZ4FrameOutputStream(byteOut).use { lz4Out ->
+                    lz4Out.write(jsonBytes)
+                }
+                Pair(fileName, byteOut.toByteArray())
+            } else {
+                // Use fast GZIP (level 1) for better performance
+                val fileName = textId.replace("_", "-").replace(".", "-") + ".json.gz"
+                val jsonBytes = json.toByteArray(Charsets.UTF_8)
+                val byteOut = ByteArrayOutputStream()
+                val deflater = java.util.zip.Deflater(1, true) // level 1, nowrap=true for raw deflate
+                java.util.zip.DeflaterOutputStream(byteOut, deflater).use { deflateOut ->
+                    // Write GZIP header
+                    byteOut.write(byteArrayOf(0x1f, 0x8b.toByte(), 8, 0, 0, 0, 0, 0, 0, 0))
+                    val crc = java.util.zip.CRC32()
+                    crc.update(jsonBytes)
+                    deflateOut.write(jsonBytes)
+                    deflateOut.finish()
+                    // Write GZIP trailer (CRC32 and uncompressed size)
+                    val trailer = ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN)
+                    trailer.putInt(crc.value.toInt())
+                    trailer.putInt(jsonBytes.size)
+                    byteOut.write(trailer.array())
+                }
+                Pair(fileName, byteOut.toByteArray())
+            }
 
             // Write to TAR (synchronized for thread safety)
             synchronized(krillTarOutputStream!!) {
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
index 2c7cd6d..bfbc622 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
@@ -1051,7 +1051,7 @@
                     .inputStream
                     .bufferedReader()
                     .use { it.readText() }
-                jsonFile.name to jsonContent
+                jsonFile.name.removeSuffix(".gz") to jsonContent
             }
         } finally {
             extractDir.deleteRecursively()