Calculate defaults for heap and threads adaptively

Change-Id: I55bf914647e06e7205b49cd4a1818d70a06fd1b0
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 5b30088..adb1fec 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -236,9 +236,9 @@
     @Option(
         names = ["-j", "--jobs", "--threads"],
         paramLabel = "THREADS",
-        description = ["Maximum number of threads to use. Default: ${"$"}{DEFAULT-VALUE}"]
+        description = ["Maximum number of threads to use. Default: intelligent detection based on format and available memory"]
     )
-    var maxThreads: Int = Runtime.getRuntime().availableProcessors() / 2
+    var maxThreads: Int = 0  // 0 = auto-detect in call()
     fun setThreads(threads: Int) {
         if (threads < 1) {
             throw ParameterException(spec.commandLine(), String.format(Locale.ROOT, "Invalid value `%d' for option '--threads': must be at least 1", threads))
@@ -323,6 +323,110 @@
         "corenlp" to "germanSR.ser.gz"
     )
 
+    // Calculate optimal thread count based on format, memory, and input characteristics
+    private fun calculateOptimalThreads(): Int {
+        val availableCores = Runtime.getRuntime().availableProcessors()
+        val availableMemoryGB = getAvailableMemoryGB()
+        
+        // Detect invoked program name for compatibility mode (NOTE(review): sun.java.command reflects the main class/JAR, not a shell symlink name — verify the korapxml2conllu/korapxml2krill aliases are actually detectable this way)
+        val programName = System.getProperty("sun.java.command")?.split(" ")?.first()?.split("/")?.last() ?: "korapxmltool"
+        
+        return when {
+            // CoNLL-U conversion: lightweight, max 10 threads (NOTE(review): this branch also wins when tagging/parsing to CoNLL-U, bypassing the memory-based annotation limits below — consider checking taggerName/parserName first)
+            programName == "korapxml2conllu" || outputFormat == OutputFormat.CONLLU -> {
+                minOf(10, availableCores)
+            }
+            
+            // Krill export: depends on corpus size
+            programName == "korapxml2krill" || outputFormat == OutputFormat.KRILL -> {
+                calculateKrillThreads(availableCores, availableMemoryGB)
+            }
+            
+            // Tagging/Parsing: memory-intensive, needs careful balancing
+            taggerName != null || parserName != null -> {
+                calculateAnnotationThreads(availableCores, availableMemoryGB)
+            }
+            
+            // Default case: conservative threading for general processing
+            else -> {
+                maxOf(1, minOf(availableCores / 2, 8))
+            }
+        }.also { threads ->
+            LOGGER.info("Thread calculation: format=$outputFormat, cores=$availableCores, memory=${availableMemoryGB}GB, result=$threads")
+        }
+    }
+    
+    private fun calculateKrillThreads(cores: Int, memoryGB: Double): Int {
+        // Analyze input corpus size if possible
+        val totalInputSize = zipFileNames?.sumOf { File(it).length() } ?: 0L
+        val totalInputGB = totalInputSize / (1024.0 * 1024.0 * 1024.0)
+        
+        return when {
+            // Very large corpora (>30GB): at least 32 threads (capped at core count), ~75% utilization on large machines
+            totalInputGB > 30 -> {
+                minOf(cores, maxOf(32, cores * 3 / 4))
+            }
+            // Large corpora (>20GB): scale based on size and available cores
+            totalInputGB > 20 -> {
+                minOf(cores, maxOf(16, cores * 3 / 4))
+            }
+            // Medium-large corpora (10-20GB): scale threads with input size and cores
+            totalInputGB > 10 -> {
+                // For large machines (64+ cores), scale aggressively; medium machines (32+) moderately
+                when {
+                    cores >= 64 -> minOf(cores * 3 / 4, maxOf(16, (totalInputGB * 1.2).toInt()))
+                    cores >= 32 -> minOf(cores / 2, maxOf(10, (totalInputGB * 0.8).toInt()))
+                    else -> minOf(cores, 16)
+                }
+            }
+            // Medium corpora (1-10GB): 10 threads on small machines, scale on large machines
+            totalInputGB > 1 -> {
+                when {
+                    cores >= 64 -> minOf(cores / 2, 32)
+                    cores >= 32 -> minOf(cores / 4, 16)
+                    else -> minOf(cores, 10)
+                }
+            }
+            // Small-medium corpora (0.1-1GB): use 8 threads  
+            totalInputGB > 0.1 -> {
+                minOf(cores, 8)
+            }
+            // Very small corpora (<0.1GB): minimal threading + overhead for compression/writing
+            else -> {
+                maxOf(1, minOf(4, cores / 2))
+            }
+        }.also { threads ->
+            LOGGER.info("Krill threading: input=${"%.1f".format(totalInputGB)}GB, cores=$cores, threads=$threads")
+        }
+    }
+    
+    private fun calculateAnnotationThreads(cores: Int, memoryGB: Double): Int {
+        // Annotation tools are very memory-intensive (especially parsers)
+        // Estimate memory per thread: parsing ~1.5GB, tagging ~0.8GB
+        val memoryPerThreadGB = when {
+            parserName != null -> 1.5  // 1.5GB per parser thread
+            taggerName != null -> 0.8  // 0.8GB per tagger thread  
+            else -> 0.5                // 0.5GB for other annotation
+        }
+        
+        val memoryBasedThreads = maxOf(1, ((memoryGB * 0.8) / memoryPerThreadGB).toInt())
+        val coreBasedThreads = maxOf(1, cores / 2)  // Leave cores for I/O and GC
+        
+        return minOf(memoryBasedThreads, coreBasedThreads, 16).also { threads ->
+            LOGGER.info("Annotation threading: ${parserName ?: taggerName}, memory=${"%.1f".format(memoryGB)}GB, " +
+                       "memLimit=${memoryBasedThreads}t, coreLimit=${coreBasedThreads}t, result=${threads}t")
+        }
+    }
+    
+    private fun getAvailableMemoryGB(): Double {
+        // Get heap size that was actually allocated by JVM
+        val runtime = Runtime.getRuntime()
+        val maxMemory = runtime.maxMemory()
+        
+        // Convert to GB, use 80% for safety (leave room for GC, off-heap, etc.)
+        return maxMemory * 0.8 / (1024.0 * 1024.0 * 1024.0)
+    }
+
     // Helper function to resolve model path with default search directory
     private fun resolveModelPath(modelPath: String): String? {
         // If absolute path or relative path exists as-is, return it
@@ -462,6 +566,12 @@
         LOGGER.level = level
         handler.level = level  // Handler also needs to be set to the same level
 
+        // Auto-detect optimal thread count if not specified
+        if (maxThreads == 0) {
+            maxThreads = calculateOptimalThreads()
+            LOGGER.info("Auto-detected optimal thread count: $maxThreads threads")
+        }
+
         // Log model path resolutions that occurred during parameter parsing
         modelPathResolutions.forEach { (original, resolved) ->
             LOGGER.info("Resolved model path '$original' to '$resolved'")
diff --git a/korapxmltool.shebang b/korapxmltool.shebang
index 4019c5e..a435461 100644
--- a/korapxmltool.shebang
+++ b/korapxmltool.shebang
@@ -76,9 +76,86 @@
   # If no valid XMX was provided or parsing failed, use auto-detection
   if [[ -z ${xmx_mb:-} ]]; then
     mem_mb=$(detect_mem_limit_mb)
-    xmx_mb=$(( mem_mb * 75 / 100 ))
-    (( xmx_mb < 1024 )) && xmx_mb=1024
-    (( xmx_mb > 65536 )) && xmx_mb=65536
+    
+    # Intelligent memory allocation based on use case
+    # Check command line arguments for workload hints
+    workload_type="default"
+    large_corpus=false
+    has_annotation=false
+    
+    # Detect large corpus (>5GB input files)
+    # Note: This is a rough estimate - exact size will be calculated in Kotlin
+    total_input_size=0
+    for arg in "$@"; do
+      # Skip options and flags, only process files
+      if [[ "$arg" != -* && "$arg" == *.zip ]]; then
+        if [[ -e "$arg" ]]; then
+          # Use -L to follow symlinks (many corpora use symlinked ZIPs)
+          size=$(stat -L -c%s "$arg" 2>/dev/null || stat -L -f%z "$arg" 2>/dev/null || echo 0)
+          total_input_size=$((total_input_size + size))
+        fi
+      fi
+    done
+    
+    if (( total_input_size > 0 )); then
+      # Calculate with one decimal place for better accuracy with smaller files
+      if command -v bc >/dev/null 2>&1; then
+        total_input_gb_precise=$(echo "scale=1; $total_input_size / (1024*1024*1024)" | bc -l)
+        # Ensure we always have .X format, even for values < 1
+        [[ "$total_input_gb_precise" != *.* ]] && total_input_gb_precise="${total_input_gb_precise}.0"
+      else
+        # Fallback using awk for better precision
+        total_input_gb_precise=$(awk "BEGIN {printf \"%.1f\", $total_input_size / (1024*1024*1024)}")
+      fi
+      # Integer part for comparison (convert 1.5 -> 1, 0.1 -> 0, etc.)
+      total_input_gb=$(echo "$total_input_gb_precise" | cut -d. -f1)
+      # Handle case where total_input_gb might be empty for very small files
+      [[ -z "$total_input_gb" ]] && total_input_gb=0
+      (( total_input_gb > 5 )) && large_corpus=true
+    else
+      # Cannot determine size reliably at shell level
+      total_input_gb=0
+      total_input_gb_precise="0.0"
+    fi
+    
+    # Detect annotation workloads (memory-intensive)
+    for arg in "$@"; do
+      case "$arg" in
+        -T|--tag-with|-P|--parse-with|--tag-with=*|--parse-with=*) has_annotation=true ;;
+        *) ;;
+      esac
+    done
+    
+    # Calculate memory based on workload
+    if [[ "$has_annotation" == true ]]; then
+      # Annotation: need substantial memory for models + processing
+      if [[ "$large_corpus" == true ]]; then
+        # Large corpus + annotation: use 85% of memory, min 8GB, max 128GB
+        xmx_mb=$(( mem_mb * 85 / 100 ))
+        (( xmx_mb < 8192 )) && xmx_mb=8192
+        (( xmx_mb > 131072 )) && xmx_mb=131072
+      else
+        # Small corpus + annotation: use 80% of memory, min 4GB, max 64GB
+        xmx_mb=$(( mem_mb * 80 / 100 ))
+        (( xmx_mb < 4096 )) && xmx_mb=4096
+        (( xmx_mb > 65536 )) && xmx_mb=65536
+      fi
+      workload_type="annotation"
+    elif [[ "$large_corpus" == true ]]; then
+      # Large corpus without annotation: use 80% memory, min 4GB, max 96GB
+      xmx_mb=$(( mem_mb * 80 / 100 ))
+      (( xmx_mb < 4096 )) && xmx_mb=4096
+      (( xmx_mb > 98304 )) && xmx_mb=98304
+      workload_type="large_corpus"
+    else
+      # Default: lightweight processing, use 75% memory, min 1GB, max 32GB
+      xmx_mb=$(( mem_mb * 75 / 100 ))
+      (( xmx_mb < 1024 )) && xmx_mb=1024
+      (( xmx_mb > 32768 )) && xmx_mb=32768
+      workload_type="default"
+    fi
+    
+    echo "Auto-detected: workload=$workload_type, input=${total_input_gb_precise}GB, memory=${xmx_mb}MB" >&2
   fi
   EXTRA_OPTS+=("-Xmx${xmx_mb}m")
 fi