Calculate defaults for heap and threads adaptively
Change-Id: I55bf914647e06e7205b49cd4a1818d70a06fd1b0
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 5b30088..adb1fec 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -236,9 +236,9 @@
@Option(
names = ["-j", "--jobs", "--threads"],
paramLabel = "THREADS",
- description = ["Maximum number of threads to use. Default: ${"$"}{DEFAULT-VALUE}"]
+ description = ["Maximum number of threads to use. Default: intelligent detection based on format and available memory"]
)
- var maxThreads: Int = Runtime.getRuntime().availableProcessors() / 2
+ var maxThreads: Int = 0 // 0 = auto-detect in call()
fun setThreads(threads: Int) {
if (threads < 1) {
throw ParameterException(spec.commandLine(), String.format(Locale.ROOT, "Invalid value `%d' for option '--threads': must be at least 1", threads))
@@ -323,6 +323,110 @@
"corenlp" to "germanSR.ser.gz"
)
+ // Calculate optimal thread count based on format, memory, and input characteristics
+ private fun calculateOptimalThreads(): Int {
+ val availableCores = Runtime.getRuntime().availableProcessors()
+ val availableMemoryGB = getAvailableMemoryGB()
+
+ // Detect program name for compatibility mode
+ val programName = System.getProperty("sun.java.command")?.split(" ")?.first()?.split("/")?.last() ?: "korapxmltool"
+
+ return when {
+ // CoNLL-U conversion: lightweight, max 10 threads
+ programName == "korapxml2conllu" || outputFormat == OutputFormat.CONLLU -> {
+ minOf(10, availableCores)
+ }
+
+ // Krill export: depends on corpus size
+ programName == "korapxml2krill" || outputFormat == OutputFormat.KRILL -> {
+ calculateKrillThreads(availableCores, availableMemoryGB)
+ }
+
+ // Tagging/Parsing: memory-intensive, needs careful balancing
+ taggerName != null || parserName != null -> {
+ calculateAnnotationThreads(availableCores, availableMemoryGB)
+ }
+
+ // Default case: conservative threading for general processing
+ else -> {
+                (availableCores / 2).coerceIn(1, 8) // never 0 threads on single-core machines
+ }
+ }.also { threads ->
+ LOGGER.info("Thread calculation: format=$outputFormat, cores=$availableCores, memory=${availableMemoryGB}GB, result=$threads")
+ }
+ }
+
+ private fun calculateKrillThreads(cores: Int, memoryGB: Double): Int {
+ // Analyze input corpus size if possible
+ val totalInputSize = zipFileNames?.sumOf { File(it).length() } ?: 0L
+ val totalInputGB = totalInputSize / (1024.0 * 1024.0 * 1024.0)
+
+ return when {
+ // Very large corpora (>30GB): scale with cores, up to 75% utilization
+ totalInputGB > 30 -> {
+ minOf(cores, maxOf(32, cores * 3 / 4))
+ }
+ // Large corpora (>20GB): scale based on size and available cores
+ totalInputGB > 20 -> {
+ minOf(cores, maxOf(16, cores * 3 / 4))
+ }
+ // Medium-large corpora (10-20GB): scale threads with input size and cores
+ totalInputGB > 10 -> {
+ // For large machines (64+ cores), scale aggressively; medium machines (32+) moderately
+ when {
+ cores >= 64 -> minOf(cores * 3 / 4, maxOf(16, (totalInputGB * 1.2).toInt()))
+ cores >= 32 -> minOf(cores / 2, maxOf(10, (totalInputGB * 0.8).toInt()))
+ else -> minOf(cores, 16)
+ }
+ }
+ // Medium corpora (1-10GB): 10 threads on small machines, scale on large machines
+ totalInputGB > 1 -> {
+ when {
+ cores >= 64 -> minOf(cores / 2, 32)
+ cores >= 32 -> minOf(cores / 4, 16)
+ else -> minOf(cores, 10)
+ }
+ }
+ // Small-medium corpora (0.1-1GB): use 8 threads
+ totalInputGB > 0.1 -> {
+ minOf(cores, 8)
+ }
+ // Very small corpora (<0.1GB): minimal threading + overhead for compression/writing
+ else -> {
+                (cores / 2).coerceIn(1, 4) // never 0 threads on single-core machines
+ }
+ }.also { threads ->
+ LOGGER.info("Krill threading: input=${"%.1f".format(totalInputGB)}GB, cores=$cores, threads=$threads")
+ }
+ }
+
+ private fun calculateAnnotationThreads(cores: Int, memoryGB: Double): Int {
+ // Annotation tools are very memory-intensive (especially parsers)
+ // Estimate memory per thread: parsing ~1.5GB, tagging ~0.8GB
+ val memoryPerThreadGB = when {
+ parserName != null -> 1.5 // 1.5GB per parser thread
+ taggerName != null -> 0.8 // 0.8GB per tagger thread
+ else -> 0.5 // 0.5GB for other annotation
+ }
+
+        val memoryBasedThreads = maxOf(1, (memoryGB / memoryPerThreadGB).toInt()) // memoryGB already carries the 80% safety margin from getAvailableMemoryGB()
+ val coreBasedThreads = maxOf(1, cores / 2) // Leave cores for I/O and GC
+
+ return minOf(memoryBasedThreads, coreBasedThreads, 16).also { threads ->
+ LOGGER.info("Annotation threading: ${parserName ?: taggerName}, memory=${"%.1f".format(memoryGB)}GB, " +
+ "memLimit=${memoryBasedThreads}t, coreLimit=${coreBasedThreads}t, result=${threads}t")
+ }
+ }
+
+ private fun getAvailableMemoryGB(): Double {
+ // Get heap size that was actually allocated by JVM
+ val runtime = Runtime.getRuntime()
+ val maxMemory = runtime.maxMemory()
+
+ // Convert to GB, use 80% for safety (leave room for GC, off-heap, etc.)
+ return maxMemory * 0.8 / (1024.0 * 1024.0 * 1024.0)
+ }
+
// Helper function to resolve model path with default search directory
private fun resolveModelPath(modelPath: String): String? {
// If absolute path or relative path exists as-is, return it
@@ -462,6 +566,12 @@
LOGGER.level = level
handler.level = level // Handler also needs to be set to the same level
+ // Auto-detect optimal thread count if not specified
+ if (maxThreads == 0) {
+ maxThreads = calculateOptimalThreads()
+ LOGGER.info("Auto-detected optimal thread count: $maxThreads threads")
+ }
+
// Log model path resolutions that occurred during parameter parsing
modelPathResolutions.forEach { (original, resolved) ->
LOGGER.info("Resolved model path '$original' to '$resolved'")
diff --git a/korapxmltool.shebang b/korapxmltool.shebang
index 4019c5e..a435461 100644
--- a/korapxmltool.shebang
+++ b/korapxmltool.shebang
@@ -76,9 +76,86 @@
# If no valid XMX was provided or parsing failed, use auto-detection
if [[ -z ${xmx_mb:-} ]]; then
mem_mb=$(detect_mem_limit_mb)
- xmx_mb=$(( mem_mb * 75 / 100 ))
- (( xmx_mb < 1024 )) && xmx_mb=1024
- (( xmx_mb > 65536 )) && xmx_mb=65536
+
+ # Intelligent memory allocation based on use case
+ # Check command line arguments for workload hints
+ workload_type="default"
+ large_corpus=false
+ has_annotation=false
+
+ # Detect large corpus (>5GB input files)
+ # Note: This is a rough estimate - exact size will be calculated in Kotlin
+ total_input_size=0
+ for arg in "$@"; do
+ # Skip options and flags, only process files
+ if [[ "$arg" != -* && "$arg" == *.zip ]]; then
+ if [[ -e "$arg" ]]; then
+ # Use -L to follow symlinks (many corpora use symlinked ZIPs)
+ size=$(stat -L -c%s "$arg" 2>/dev/null || stat -L -f%z "$arg" 2>/dev/null || echo 0)
+ total_input_size=$((total_input_size + size))
+ fi
+ fi
+ done
+
+ if (( total_input_size > 0 )); then
+ # Calculate with one decimal place for better accuracy with smaller files
+ if command -v bc >/dev/null 2>&1; then
+ total_input_gb_precise=$(echo "scale=1; $total_input_size / (1024*1024*1024)" | bc -l)
+ # Ensure we always have .X format, even for values < 1
+ [[ "$total_input_gb_precise" != *.* ]] && total_input_gb_precise="${total_input_gb_precise}.0"
+ else
+ # Fallback using awk for better precision
+ total_input_gb_precise=$(awk "BEGIN {printf \"%.1f\", $total_input_size / (1024*1024*1024)}")
+ fi
+ # Integer part for comparison (convert 1.5 -> 1, 0.1 -> 0, etc.)
+ total_input_gb=$(echo "$total_input_gb_precise" | cut -d. -f1)
+ # Handle case where total_input_gb might be empty for very small files
+ [[ -z "$total_input_gb" ]] && total_input_gb=0
+ (( total_input_gb > 5 )) && large_corpus=true
+ else
+ # Cannot determine size reliably at shell level
+ total_input_gb=0
+ total_input_gb_precise="0.0"
+ fi
+
+ # Detect annotation workloads (memory-intensive)
+ for arg in "$@"; do
+ case "$arg" in
+            -T|--tag-with|--tag-with=*|-P|--parse-with|--parse-with=*) has_annotation=true ;;
+ *) ;;
+ esac
+ done
+
+ # Calculate memory based on workload
+ if [[ "$has_annotation" == true ]]; then
+ # Annotation: need substantial memory for models + processing
+ if [[ "$large_corpus" == true ]]; then
+ # Large corpus + annotation: use 85% of memory, min 8GB, max 128GB
+ xmx_mb=$(( mem_mb * 85 / 100 ))
+ (( xmx_mb < 8192 )) && xmx_mb=8192
+ (( xmx_mb > 131072 )) && xmx_mb=131072
+ else
+ # Small corpus + annotation: use 80% of memory, min 4GB, max 64GB
+ xmx_mb=$(( mem_mb * 80 / 100 ))
+ (( xmx_mb < 4096 )) && xmx_mb=4096
+ (( xmx_mb > 65536 )) && xmx_mb=65536
+ fi
+ workload_type="annotation"
+ elif [[ "$large_corpus" == true ]]; then
+ # Large corpus without annotation: use 80% memory, min 4GB, max 96GB
+ xmx_mb=$(( mem_mb * 80 / 100 ))
+ (( xmx_mb < 4096 )) && xmx_mb=4096
+ (( xmx_mb > 98304 )) && xmx_mb=98304
+ workload_type="large_corpus"
+ else
+ # Default: lightweight processing, use 75% memory, min 1GB, max 32GB
+ xmx_mb=$(( mem_mb * 75 / 100 ))
+ (( xmx_mb < 1024 )) && xmx_mb=1024
+ (( xmx_mb > 32768 )) && xmx_mb=32768
+ workload_type="default"
+ fi
+
+ echo "Auto-detected: workload=$workload_type, input=${total_input_gb_precise}GB, memory=${xmx_mb}MB" >&2
fi
EXTRA_OPTS+=("-Xmx${xmx_mb}m")
fi