Calculate heap demand for krill conversion
Change-Id: I5c9b4d73ba7f30ff93349368095204e221e7cf21
diff --git a/korapxmltool.shebang b/korapxmltool.shebang
index 5fb2768..3633107 100644
--- a/korapxmltool.shebang
+++ b/korapxmltool.shebang
@@ -157,13 +157,32 @@
fi
workload_type="annotation"
elif [[ "$has_krill" == true ]]; then
- # Krill JSON generation: holds morphological data for all in-flight texts in memory
- # while merging multiple annotation ZIPs. Use 80% of available RAM with no
- # artificial upper bound – on large-memory machines (e.g. 1.5 TB) the job
- # genuinely needs a large heap and should not be silently capped at 32 GB.
- xmx_mb=$(( mem_mb * 80 / 100 ))
- (( xmx_mb < 4096 )) && xmx_mb=4096
+ # Krill merges multiple annotation ZIPs while holding uncompressed, deserialized
+ # data in memory. KorAP-XML decompresses to roughly 10-20x its on-disk size, so we
+ # target 22x (20x worst case plus ~10% headroom) the compressed input size without hogging
+ # the whole machine (which would prevent running parallel krill jobs).
+ # Falls back to 80% of RAM when no input size can be determined.
+ # Floor: 4 GB. Ceiling: 90% of available RAM.
+ # Use ParallelGC to avoid GCLocker contention ("Retried waiting for GCLocker too
+ # often") that plagues G1GC under JNI critical sections.
workload_type="krill"
+ if (( total_input_size > 0 )); then
+ xmx_mb=$(( total_input_size * 22 / 1024 / 1024 ))
+ workload_type="krill_input_based"
+ else
+ xmx_mb=$(( mem_mb * 80 / 100 ))
+ fi
+ (( xmx_mb < 4096 )) && xmx_mb=4096
+ max_krill_mb=$(( mem_mb * 90 / 100 ))
+ (( xmx_mb > max_krill_mb )) && xmx_mb=$max_krill_mb
+ EXTRA_OPTS+=("-XX:+UseParallelGC" "-XX:-UseGCOverheadLimit")
+ # ParallelGC is stop-the-world and therefore immune to the GCLocker contention
+ # ("Retried waiting for GCLocker too often") that plagues G1GC when JNI critical
+ # sections are active. It is available in every JDK 21 distribution and requires
+ # no extra vm.max_map_count allowance, unlike ZGC.
+ # -UseGCOverheadLimit disables the "GC overhead limit exceeded" safety valve that
+ # fires when >98% of CPU time is spent in GC; without it, the JVM throws an error
+ # instead of continuing to collect, which is counterproductive for large krill jobs.
elif [[ "$large_corpus" == true ]]; then
# Large corpus without annotation: use 80% memory, min 4GB, max 96GB
xmx_mb=$(( mem_mb * 80 / 100 ))