Fix UTF-8 buffer overflows in the 2nd Krill output route
Should fix exceptions such as:
dnb24.krill.log:java.lang.NegativeArraySizeException: -1394181943
dnb24.krill.log- at
java.base/java.lang.String.encodeUTF8_UTF16(String.java:1326)
dnb24.krill.log- at
java.base/java.lang.String.encodeUTF8(String.java:1299)
dnb24.krill.log- at java.base/java.lang.String.encode(String.java:867)
dnb24.krill.log- at
java.base/java.lang.String.getBytes(String.java:1818)
dnb24.krill.log- at
de.ids_mannheim.korapxmltools.KorapXmlTool.compressKrillText(KorapXmlTool.kt:5531)
dnb24.krill.log- at
de.ids_mannheim.korapxmltools.KorapXmlTool.enqueueKrillCompression$lambda$0(KorapXmlTool.kt:5570)
dnb24.krill.log- at
java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:572)
dnb24.krill.log- at
java.base/java.util.concurrent.FutureTask.run(FutureTask.java:317)
dnb24.krill.log- at
java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
dnb24.krill.log- at
java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
Change-Id: I0a4029d9c8df3cbe2ceff4e7c250f1482fde3fe8
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index d089d0b..a90213c 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -5517,31 +5517,7 @@
KrillJsonGenerator.generate(textData, corpusMetadata, docMetadata, includeNonWordTokens)
}
- // Choose compression format based on --lz4 flag
- val (jsonFileName, compressedData) = if (useLz4) {
- val fileName = textId.replace("_", "-").replace(".", "-") + ".json.lz4"
- val jsonBytes = json.toByteArray(Charsets.UTF_8)
- val byteOut = ByteArrayOutputStream()
- net.jpountz.lz4.LZ4FrameOutputStream(byteOut).use { lz4Out ->
- lz4Out.write(jsonBytes)
- }
- Pair(fileName, byteOut.toByteArray())
- } else {
- // Use GZIP with level 1 compression for speed
- val fileName = textId.replace("_", "-").replace(".", "-") + ".json.gz"
- val jsonBytes = json.toByteArray(Charsets.UTF_8)
- val byteOut = ByteArrayOutputStream(jsonBytes.size)
-
- // Create GZIPOutputStream with level 1 (fast) compression
- val gzipOut = object : java.util.zip.GZIPOutputStream(byteOut) {
- init {
- def.setLevel(1)
- }
- }
- gzipOut.use { it.write(jsonBytes) }
-
- Pair(fileName, byteOut.toByteArray())
- }
+ val (jsonFileName, compressedData) = compressKrillJson(textId, json)
// Store compressed data for sequential TAR writing
krillCompressedData[textId] = CompressedKrillData(textId, jsonFileName, compressedData)
@@ -5585,6 +5561,33 @@
}
}
+ private fun compressKrillJson(textId: String, json: String): Pair<String, ByteArray> { // Compresses json as UTF-8 (LZ4 frame if useLz4, otherwise GZIP); returns (entry file name, compressed bytes).
+ return if (useLz4) {
+ val fileName = textId.replace("_", "-").replace(".", "-") + ".json.lz4" // entry name: textId with '_' and '.' replaced by '-'
+ val byteOut = ByteArrayOutputStream()
+ net.jpountz.lz4.LZ4FrameOutputStream(byteOut).use { lz4Out ->
+ OutputStreamWriter(lz4Out, StandardCharsets.UTF_8).use { writer -> // encode through a writer rather than json.toByteArray(Charsets.UTF_8), which was used here previously
+ writer.write(json)
+ }
+ } // the LZ4 stream is closed here, before byteOut is read
+ Pair(fileName, byteOut.toByteArray())
+ } else {
+ val fileName = textId.replace("_", "-").replace(".", "-") + ".json.gz" // same name mapping as the LZ4 branch, different suffix
+ val byteOut = ByteArrayOutputStream()
+ val gzipOut = object : java.util.zip.GZIPOutputStream(byteOut) { // anonymous subclass: the protected 'def' deflater is only reachable from a subclass
+ init {
+ def.setLevel(1) // compression level 1, chosen for speed
+ }
+ }
+ gzipOut.use { gzip ->
+ OutputStreamWriter(gzip, StandardCharsets.UTF_8).use { writer -> // same streaming UTF-8 encoding as the LZ4 branch
+ writer.write(json)
+ }
+ } // the GZIP stream is closed here, before byteOut is read
+ Pair(fileName, byteOut.toByteArray())
+ }
+ }
+
private fun writeReadyKrillTexts(textIds: Collection<String>): Int {
var outputCount = 0
@@ -5864,31 +5867,7 @@
val json = KrillJsonGenerator.generate(textData, corpusMetadata, docMetadata, includeNonWordTokens)
- // Choose compression format based on --lz4 flag
- val (jsonFileName, compressedData) = if (useLz4) {
- val fileName = textId.replace("_", "-").replace(".", "-") + ".json.lz4"
- val jsonBytes = json.toByteArray(Charsets.UTF_8)
- val byteOut = ByteArrayOutputStream()
- net.jpountz.lz4.LZ4FrameOutputStream(byteOut).use { lz4Out ->
- lz4Out.write(jsonBytes)
- }
- Pair(fileName, byteOut.toByteArray())
- } else {
- // Use GZIP with level 1 compression for speed
- val fileName = textId.replace("_", "-").replace(".", "-") + ".json.gz"
- val jsonBytes = json.toByteArray(Charsets.UTF_8)
- val byteOut = ByteArrayOutputStream(jsonBytes.size)
-
- // Create GZIPOutputStream with level 1 (fast) compression
- val gzipOut = object : java.util.zip.GZIPOutputStream(byteOut) {
- init {
- def.setLevel(1)
- }
- }
- gzipOut.use { it.write(jsonBytes) }
-
- Pair(fileName, byteOut.toByteArray())
- }
+ val (jsonFileName, compressedData) = compressKrillJson(textId, json)
// Write to TAR (synchronized for thread safety)
synchronized(krillTarOutputStream!!) {
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
index 180b332..a703331 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KrillJsonGeneratorTest.kt
@@ -1,5 +1,6 @@
package de.ids_mannheim.korapxmltools
+import net.jpountz.lz4.LZ4FrameInputStream
import org.junit.After
import org.junit.AfterClass
import org.junit.Before
@@ -662,6 +663,35 @@
}
@Test
+ fun krillCanWriteLz4CompressedJson() { // Runs the tool with -t krill --lz4 and checks that the extracted .json.lz4 entries decode to the expected JSON markers.
+ val baseZip = loadResource("wud24_sample.zip").path
+ val generatedTar = ensureKrillTar("wud24_lz4", "wud24_sample.krill.tar") { outputDir ->
+ arrayOf("-t", "krill", "-q", "--lz4", "-D", outputDir.path, baseZip)
+ }
+ assertTrue(generatedTar.exists())
+
+ val extractDir = File.createTempFile("extract_lz4", "").let { it.delete(); it.mkdirs(); it } // createTempFile reserves a unique name; the file is then replaced by a directory of that name
+ try {
+ val tarProcess = ProcessBuilder("tar", "-xf", generatedTar.path, "-C", extractDir.path) // requires an external 'tar' executable on the PATH
+ .redirectErrorStream(true)
+ .start() // NOTE(review): the process output is never read; assumes tar prints too little to fill the pipe and block — confirm
+ assertTrue(tarProcess.waitFor() == 0, "Tar extraction should succeed for ${generatedTar.path}")
+
+ val jsonFiles = extractDir.listFiles()?.filter { it.name.endsWith(".json.lz4") }.orEmpty() // listFiles() is not recursive: only top-level entries of extractDir are considered
+ assertTrue(jsonFiles.isNotEmpty(), "Expected LZ4-compressed JSON files in ${generatedTar.path}")
+
+ jsonFiles.forEach { jsonFile ->
+ val jsonContent = LZ4FrameInputStream(jsonFile.inputStream()).bufferedReader().use { it.readText() } // decodes the LZ4 frame; bufferedReader() defaults to UTF-8
+ assertTrue(jsonContent.contains("\"@context\""))
+ assertTrue(jsonContent.contains("\"@type\":\"koral:corpus\""))
+ assertTrue(jsonContent.contains("\"text\""))
+ }
+ } finally {
+ extractDir.deleteRecursively() // always remove the extraction directory, including when an assertion fails
+ }
+ }
+
+ @Test
fun testProbabilitySortingInKrillJsonOutput() {
// Test that multiple POS annotations are sorted by descending probability in Krill JSON output
// Use the base sample ZIP which should contain POS annotations with probabilities