| Marc Kupietz | fbfcd04 | 2025-11-16 13:33:32 +0100 | [diff] [blame] | 1 | #!/usr/bin/env bash |
| Marc Kupietz | 02cd8bf | 2025-11-16 14:21:25 +0100 | [diff] [blame] | 2 | # Shebang header for korapxmltool: auto-enables native access (Java 25+) and chooses a default -Xmx. |
| Marc Kupietz | fbfcd04 | 2025-11-16 13:33:32 +0100 | [diff] [blame] | 3 | # Usage: |
| 4 | # cat korapxmltool.shebang app/build/libs/korapxmltool.jar > korapxmltool |
| 5 | # chmod +x korapxmltool |
| 6 | # (Run `zip -A korapxmltool` if your unzip/java complains about prepended bytes.) |
| Marc Kupietz | 02cd8bf | 2025-11-16 14:21:25 +0100 | [diff] [blame] | 7 | # |
| 8 | # Override memory: |
| Marc Kupietz | 570d0e0 | 2025-11-17 10:42:41 +0100 | [diff] [blame] | 9 | # export KORAPXMLTOOL_XMX=20g # sets -Xmx20g |
| 10 | # export KORAPXMLTOOL_XMX=8192m # sets -Xmx8192m |
| Marc Kupietz | 02cd8bf | 2025-11-16 14:21:25 +0100 | [diff] [blame] | 11 | # export KORAPXMLTOOL_JAVA_OPTS="... -Xmx4g" # full custom opts |
| 12 | # Otherwise -Xmx is auto-sized from detected memory (cgroup aware): 75-85% depending on the workload, clamped to workload-specific bounds (default workload: 75%, [1024m, 32768m]). |
| Marc Kupietz | fbfcd04 | 2025-11-16 13:33:32 +0100 | [diff] [blame] | 13 | |
set -euo pipefail

# Find out whether a maximum heap size was already requested, either via one
# of the JVM option environment variables or as a command-line argument.
# has_xmx is consulted further down to decide whether a default -Xmx is added.
has_xmx=false
for opts in "${JDK_JAVA_OPTIONS:-}" "${JAVA_TOOL_OPTIONS:-}" "${KORAPXMLTOOL_JAVA_OPTS:-}"; do
  case "$opts" in
    *-Xmx*)
      has_xmx=true
      break
      ;;
  esac
done
for cli_arg in "$@"; do
  if [[ "$cli_arg" == -Xmx* ]]; then
    has_xmx=true
    break
  fi
done
| 23 | |
#######################################
# Print the amount of memory usable by this process, in MiB.
#
# Considers the cgroup memory limit (v2 memory.max, else v1
# memory.limit_in_bytes) and the physical memory of the machine
# (/proc/meminfo, else sysctl on macOS/BSD). When both are known the smaller
# one wins; when neither can be determined, 4096 is printed.
#
# Arguments: none
# Outputs:   a single integer (MiB) on stdout
#######################################
detect_mem_limit_mb() {
  local cgroup_limit=""
  if [[ -f /sys/fs/cgroup/memory.max ]]; then
    cgroup_limit=$(< /sys/fs/cgroup/memory.max)
    # cgroup v2 reports the literal string "max" when no limit is set.
    if [[ $cgroup_limit == "max" ]]; then
      cgroup_limit=""
    fi
  elif [[ -f /sys/fs/cgroup/memory/memory.limit_in_bytes ]]; then
    cgroup_limit=$(< /sys/fs/cgroup/memory/memory.limit_in_bytes)
  fi

  # cgroup v1 reports a huge page-aligned number (9223372036854771712) for
  # "unlimited"; only values below it are treated as a real limit.
  local limit_mb=""
  if [[ $cgroup_limit =~ ^[0-9]+$ && $cgroup_limit -lt 9223372036854771712 ]]; then
    limit_mb=$(( cgroup_limit / 1024 / 1024 ))
  fi

  # Physical memory: Linux exposes MemTotal in KiB.
  local memtotal_kb
  memtotal_kb=$(awk '/MemTotal/ { print $2; exit }' /proc/meminfo 2>/dev/null || echo "")
  local total_mb=""
  if [[ $memtotal_kb =~ ^[0-9]+$ ]]; then
    total_mb=$(( memtotal_kb / 1024 ))
  fi

  # No /proc/meminfo (macOS, BSD): ask sysctl, which reports bytes.
  if [[ -z $total_mb ]] && command -v sysctl >/dev/null 2>&1; then
    local phys_bytes
    phys_bytes=$(sysctl -n hw.memsize 2>/dev/null \
      || sysctl -n hw.physmem 2>/dev/null \
      || true)
    if [[ $phys_bytes =~ ^[0-9]+$ ]]; then
      total_mb=$(( phys_bytes / 1024 / 1024 ))
    fi
  fi

  if [[ -n $limit_mb && -n $total_mb ]]; then
    if (( limit_mb < total_mb )); then
      echo "$limit_mb"
    else
      echo "$total_mb"
    fi
  else
    echo "${limit_mb:-${total_mb:-4096}}"
  fi
}
| 51 | |
# JVM options this launcher adds on its own. Native access is enabled for
# unnamed modules unless JDK_JAVA_OPTIONS already carries an explicit
# --enable-native-access= setting.
EXTRA_OPTS=()
case "${JDK_JAVA_OPTIONS:-}" in
  *--enable-native-access=*)
    ;;
  *)
    EXTRA_OPTS+=(--enable-native-access=ALL-UNNAMED)
    ;;
esac
| 56 | |
# ---------------------------------------------------------------------------
# Default heap size (-Xmx).
#
# Skipped entirely when the caller already supplied an -Xmx (has_xmx).
# Otherwise KORAPXMLTOOL_XMX is honoured if it parses; failing that, the heap
# is derived from the detected memory and the kind of workload requested on
# the command line. The result is appended to EXTRA_OPTS.
# ---------------------------------------------------------------------------

# Print the MiB value of a KORAPXMLTOOL_XMX-style string ($1): "<n>g"/"<n>G"
# is GiB, "<n>m"/"<n>M" or a bare number is MiB. Returns 1 on any other shape.
parse_xmx_mb() {
  local spec=$1
  if [[ $spec =~ ^([0-9]+)([gGmM]?)$ ]]; then
    local number=${BASH_REMATCH[1]}
    local unit=${BASH_REMATCH[2]}
    # 10# forces base 10, so a leading zero ("08g") is not parsed as octal.
    case "$unit" in
      g|G) echo $(( 10#$number * 1024 )) ;;
      *) echo $(( 10#$number )) ;;
    esac
    return 0
  fi
  return 1
}

# Print the combined size in bytes of every existing *.zip argument.
sum_zip_input_bytes() {
  local candidate size total=0
  for candidate in "$@"; do
    # Options/flags and non-ZIP arguments are not input corpora.
    [[ $candidate != -* && $candidate == *.zip && -e $candidate ]] || continue
    # GNU stat first, then BSD/macOS stat. -L follows symlinks, because
    # corpora are frequently symlinked ZIPs.
    size=$(stat -L -c%s "$candidate" 2>/dev/null \
      || stat -L -f%z "$candidate" 2>/dev/null \
      || echo 0)
    [[ $size =~ ^[0-9]+$ ]] || size=0
    total=$(( total + size ))
  done
  echo "$total"
}

# Succeed when the byte count ($1) exceeds 5 GiB. The comparison is done in
# bytes: comparing a truncated GiB integer would miss everything in (5, 6).
is_large_corpus() {
  (( $1 > 5 * 1024 * 1024 * 1024 ))
}

# Print a byte count ($1) as GiB rounded to one decimal, e.g. 1610612736 -> 1.5.
format_gib_tenths() {
  local bytes=$1
  local tenths=$(( (bytes * 10 + 536870912) / 1073741824 ))
  echo "$(( tenths / 10 )).$(( tenths % 10 ))"
}

# Print "<xmx_mb> <workload>" for detected memory $1 (MiB), annotation flag $2
# and large-corpus flag $3 (both "true"/"false").
choose_auto_xmx() {
  local avail_mb=$1 annotating=$2 large=$3
  local percent floor_mb ceil_mb workload
  if [[ $annotating == true && $large == true ]]; then
    # Large corpus + annotation: 85% of memory, min 8 GiB, max 128 GiB.
    percent=85 floor_mb=8192 ceil_mb=131072 workload=annotation
  elif [[ $annotating == true ]]; then
    # Small corpus + annotation (models need room): 80%, min 4 GiB, max 64 GiB.
    percent=80 floor_mb=4096 ceil_mb=65536 workload=annotation
  elif [[ $large == true ]]; then
    # Large corpus without annotation: 80%, min 4 GiB, max 96 GiB.
    percent=80 floor_mb=4096 ceil_mb=98304 workload=large_corpus
  else
    # Lightweight processing: 75%, min 1 GiB, max 32 GiB.
    percent=75 floor_mb=1024 ceil_mb=32768 workload=default
  fi
  local heap_mb=$(( avail_mb * percent / 100 ))
  (( heap_mb >= floor_mb )) || heap_mb=$floor_mb
  (( heap_mb <= ceil_mb )) || heap_mb=$ceil_mb
  echo "$heap_mb $workload"
}

if ! $has_xmx; then
  xmx_mb=""
  if [[ -n ${KORAPXMLTOOL_XMX:-} ]]; then
    if ! xmx_mb=$(parse_xmx_mb "$KORAPXMLTOOL_XMX"); then
      echo "Warning: Invalid KORAPXMLTOOL_XMX format '${KORAPXMLTOOL_XMX}'. Use formats like '20g', '8192m', or '8192'." >&2
      xmx_mb=""
    fi
  fi

  # No usable KORAPXMLTOOL_XMX: size the heap from memory and workload hints.
  if [[ -z $xmx_mb ]]; then
    mem_mb=$(detect_mem_limit_mb)

    # Rough input size estimate; the exact size is computed by the tool itself.
    total_input_size=$(sum_zip_input_bytes "$@")
    large_corpus=false
    if is_large_corpus "$total_input_size"; then
      large_corpus=true
    fi

    # Annotation (tagging/parsing) workloads are memory-intensive.
    has_annotation=false
    for arg in "$@"; do
      case "$arg" in
        -T|--tag-with|-P|--parse-with) has_annotation=true ;;
        *) ;;
      esac
    done

    read -r xmx_mb workload_type \
      <<< "$(choose_auto_xmx "$mem_mb" "$has_annotation" "$large_corpus")"
    total_input_gb_precise=$(format_gib_tenths "$total_input_size")

    echo "Auto-detected: workload=$workload_type, input=${total_input_gb_precise}GB, memory=${xmx_mb}MB" >&2
  fi
  EXTRA_OPTS+=("-Xmx${xmx_mb}m")
fi
| 162 | |
# Default KORAPXMLTOOL_MODELS_PATH to <directory of this script>/../lib/models
# unless the caller has already set it.
if [[ -z ${KORAPXMLTOOL_MODELS_PATH:-} ]]; then
  # CDPATH is cleared for this cd: with CDPATH set, a relative script directory
  # could be resolved against it, and cd would then also print the directory,
  # corrupting the captured value. "--" guards against paths starting with "-".
  SCRIPT_DIR="$(CDPATH='' cd -- "$(dirname -- "$0")" && pwd)"
  export KORAPXMLTOOL_MODELS_PATH="${SCRIPT_DIR}/../lib/models"
fi
| 168 | |
# Replace this shell with the JVM, running the jar appended to this script.
# - ${EXTRA_OPTS[@]+...}: expanding an empty array under `set -u` is an
#   "unbound variable" error in bash < 4.4 (e.g. macOS /bin/bash 3.2); this
#   form expands to nothing when the array has no elements.
# - KORAPXMLTOOL_JAVA_OPTS is intentionally unquoted so that it is split into
#   separate JVM options.
# shellcheck disable=SC2086
exec java ${EXTRA_OPTS[@]+"${EXTRA_OPTS[@]}"} ${KORAPXMLTOOL_JAVA_OPTS:-} -jar "$0" "$@"