Fix heap issues with 2krill conversion Change-Id: Idc0a28e1f23762abc8fef5c8b48b127449deb1a5

commit: 81d53cae939c23dc85a1f887262b1e09a90fe9a5 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Thu Mar 05 12:07:15 2026 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Thu Mar 05 12:07:15 2026 +0100
tree: 04c4545321ffc769514e3a1a53a9149169f5928b
parent: b5f0b39c09c13b07ca1cd317dec0befa10af7bd6 [diff]
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 5f593f8..4dc5133 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt

@@ -1533,13 +1533,17 @@
 
                 // Continue using the same progress bar for remaining texts (no separate bar)
                 // Output remaining texts (these weren't output incrementally, possibly incomplete)
-                // Copy keys to avoid ConcurrentModificationException if scanner is still running
-                val remainingKeys = krillData.keys.sortedWith(this::compareTextIds)
+                // Copy keys to avoid ConcurrentModificationException if scanner is still running.
+                // NOTE: after compressKrillText() removes from krillData, a completed-but-not-yet-
+                // written text lives only in krillCompressedData, so we must merge both key sets.
+                val remainingRawKeys = krillData.keys.toSet()
+                val remainingCompressedKeys = krillCompressedData.keys.toSet()
+                val remainingKeys = (remainingRawKeys + remainingCompressedKeys).sortedWith(this::compareTextIds)
                 
-                // Phase 1: Submit all remaining texts for parallel compression
+                // Phase 1: Submit all remaining RAW texts for parallel compression
                 if (compressionExecutor != null && !compressionExecutor!!.isShutdown) {
-                    LOGGER.info("Submitting ${remainingKeys.size} remaining texts for parallel compression")
-                    remainingKeys.forEach { textId ->
+                    LOGGER.info("Submitting ${remainingRawKeys.size} remaining texts for parallel compression")
+                    remainingRawKeys.forEach { textId ->
                         val textData = krillData[textId]
                         if (textData != null && !krillCompressedData.containsKey(textId)) {
                             compressionExecutor!!.submit {
@@ -1566,23 +1570,24 @@
                 
                 // Phase 2: Write compressed data to TAR sequentially
                 remainingKeys.forEach { textId ->
-                    val textData = krillData.remove(textId) ?: return@forEach  // Skip if already removed
+                    // textData may be null if compressKrillText already removed it from krillData
+                    val textData = krillData.remove(textId)
                     val compressedData = krillCompressedData.remove(textId)
                     
                     if (compressedData != null) {
                         // Already compressed - just write to TAR
-                        val textFoundries = textData.morphoByFoundry.keys.toSet() + setOf("base")
-                        val expectedForThisText = zipInventory.filter { (_, texts) -> texts.contains(textId) }.keys
-                            .flatMap { zipPath ->
-                                val foundry = getFoundryFromZipFileName(File(zipPath).name)
-                                if (foundry.contains("-")) foundry.split("-") else listOf(foundry)
+                        if (textData != null) {
+                            val textFoundries = textData.morphoByFoundry.keys.toSet() + setOf("base")
+                            val expectedForThisText = zipInventory.filter { (_, texts) -> texts.contains(textId) }.keys
+                                .flatMap { zipPath ->
+                                    val foundry = getFoundryFromZipFileName(File(zipPath).name)
+                                    if (foundry.contains("-")) foundry.split("-") else listOf(foundry)
+                                }
+                                .toSet()
+                            if (!textFoundries.containsAll(expectedForThisText)) {
+                                LOGGER.warning("Outputting incomplete text $textId with foundries ${textFoundries.sorted()} (expected: ${expectedForThisText.sorted()})")
                             }
-                            .toSet()
-
-                        if (!textFoundries.containsAll(expectedForThisText)) {
-                            LOGGER.warning("Outputting incomplete text $textId with foundries ${textFoundries.sorted()} (expected: ${expectedForThisText.sorted()})")
                         }
-                        
                         // Write pre-compressed data directly to TAR
                         try {
                             synchronized(krillTarOutputStream!!) {
@@ -1598,7 +1603,7 @@
                             LOGGER.severe("ERROR writing $textId to TAR: ${e.message}")
                             e.printStackTrace()
                         }
-                    } else {
+                    } else if (textData != null) {
                         // Fallback: compress inline if not in compressed cache (shouldn't happen)
                         LOGGER.warning("Text $textId not in compressed cache, compressing inline")
                         val textFoundries = textData.morphoByFoundry.keys.toSet() + setOf("base")
@@ -1615,6 +1620,9 @@
                         
                         outputKrillText(textId, textData)
                         incrementalProgressBar?.step()
+                    } else {
+                        // Both null: text was already written by the incremental writer - skip
+                        LOGGER.fine("Text $textId already written incrementally, skipping in finalization")
                     }
                 }
 
@@ -5362,7 +5370,12 @@
 
             // Store compressed data for sequential TAR writing
             krillCompressedData[textId] = CompressedKrillData(textId, jsonFileName, compressedData)
-            LOGGER.finer("Compressed text $textId (${compressedData.size} bytes)")
+            // Free the raw KrillTextData immediately - we no longer need it now that
+            // the JSON is serialised and compressed. This is the main memory saver for
+            // large corpora: without this, raw data and compressed bytes both sit in
+            // memory simultaneously until the incremental writer catches up.
+            krillData.remove(textId)
+            LOGGER.finer("Compressed text $textId (${compressedData.size} bytes), released raw data")
         } catch (e: Exception) {
             LOGGER.severe("ERROR compressing $textId: ${e.message}")
             e.printStackTrace()

diff --git a/korapxmltool.shebang b/korapxmltool.shebang
index a435461..5fb2768 100644
--- a/korapxmltool.shebang
+++ b/korapxmltool.shebang

@@ -119,12 +119,27 @@
     fi
     
     # Detect annotation workloads (memory-intensive)
+    has_krill=false
     for arg in "$@"; do
       case "$arg" in
         -T|--tag-with|-P|--parse-with) has_annotation=true ;;
         *) ;;
       esac
     done
+    # Also treat invocation as korapxml2krill as a krill workload
+    [[ "$(basename "$0")" == *krill* ]] && has_krill=true
+    # Detect -f krill / -t krill / --format krill / --output-format krill flag+value pairs
+    # and bare "krill" as a positional argument (shouldn't normally appear, but be safe)
+    prev_arg=""
+    for arg in "$@"; do
+      case "$arg" in
+        krill) has_krill=true ;;
+      esac
+      if [[ ( "$prev_arg" == "-f" || "$prev_arg" == "-t" || "$prev_arg" == "--output-format" || "$prev_arg" == "--format" ) && "$arg" == "krill" ]]; then
+        has_krill=true
+      fi
+      prev_arg="$arg"
+    done
     
     # Calculate memory based on workload
     if [[ "$has_annotation" == true ]]; then
@@ -141,6 +156,14 @@
         (( xmx_mb > 65536 )) && xmx_mb=65536
       fi
       workload_type="annotation"
+    elif [[ "$has_krill" == true ]]; then
+      # Krill JSON generation: holds morphological data for all in-flight texts in memory
+      # while merging multiple annotation ZIPs. Use 80% of available RAM with no
+      # artificial upper bound – on large-memory machines (e.g. 1.5 TB) the job
+      # genuinely needs a large heap and should not be silently capped at 32 GB.
+      xmx_mb=$(( mem_mb * 80 / 100 ))
+      (( xmx_mb < 4096 )) && xmx_mb=4096
+      workload_type="krill"
     elif [[ "$large_corpus" == true ]]; then
       # Large corpus without annotation: use 80% memory, min 4GB, max 96GB
       xmx_mb=$(( mem_mb * 80 / 100 ))
commit	81d53cae939c23dc85a1f887262b1e09a90fe9a5	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Mar 05 12:07:15 2026 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Mar 05 12:07:15 2026 +0100
tree	04c4545321ffc769514e3a1a53a9149169f5928b
parent	b5f0b39c09c13b07ca1cd317dec0befa10af7bd6 [diff]