#!/usr/bin/env bash
# Shebang header for korapxmltool: auto-enables native access (Java 25+) and chooses a default -Xmx.
# Usage:
# cat korapxmltool.shebang app/build/libs/korapxmltool.jar > korapxmltool
# chmod +x korapxmltool
# (Run `zip -A korapxmltool` if your unzip/java complains about prepended bytes.)
#
# Override memory:
# export KORAPXMLTOOL_XMX=20g # sets -Xmx20g
# export KORAPXMLTOOL_XMX=8192m # sets -Xmx8192m
# export KORAPXMLTOOL_XMX=8192 # a plain number is taken as megabytes
# export KORAPXMLTOOL_JAVA_OPTS="... -Xmx4g" # full custom opts
# Otherwise we pick 75-85% of detected memory (cgroup aware), depending on the workload:
# 75% clamped to [1024m, 32768m] by default, 80% clamped to [4096m, 98304m] for input
# ZIPs larger than 5 GB, 80% clamped to [4096m, 65536m] for annotation runs
# (-T/--tag-with, -P/--parse-with), and 85% clamped to [8192m, 131072m] when both apply.

# Strict mode: abort on errors, on unset variables and on failed pipeline stages.
set -euo pipefail

#######################################
# Report whether a maximum heap size (-Xmx) has already been requested.
# Globals:   JDK_JAVA_OPTIONS, JAVA_TOOL_OPTIONS, KORAPXMLTOOL_JAVA_OPTS (read)
# Arguments: the command-line arguments of the script
# Returns:   0 if one of the option variables contains "-Xmx" or one of the
#            arguments starts with "-Xmx", 1 otherwise
#######################################
_kxt_heap_configured() {
  local candidate
  for candidate in "${JDK_JAVA_OPTIONS:-}" "${JAVA_TOOL_OPTIONS:-}" "${KORAPXMLTOOL_JAVA_OPTS:-}"; do
    if [[ $candidate == *"-Xmx"* ]]; then
      return 0
    fi
  done
  for candidate in "$@"; do
    if [[ $candidate == -Xmx* ]]; then
      return 0
    fi
  done
  return 1
}

# Holds the word "true" or "false"; the heap-sizing logic runs it as a command.
has_xmx=false
if _kxt_heap_configured "$@"; then
  has_xmx=true
fi

#######################################
# Print the amount of memory, in MB, that is available to this process.
# The smaller of the cgroup memory limit (v2 memory.max, else v1
# memory/memory.limit_in_bytes) and the total system memory is reported.
# If only one of the two can be determined, that one is printed; if neither
# can, 4096 is printed.
# Arguments: $1 - cgroup mount point (optional, default /sys/fs/cgroup)
#            $2 - meminfo file (optional, default /proc/meminfo)
# Outputs:   a single decimal integer (MB) on stdout
#######################################
detect_mem_limit_mb() {
  local cgroup_root=${1:-/sys/fs/cgroup}
  local meminfo=${2:-/proc/meminfo}
  local cgroup_limit="" limit_mb="" memtotal_kb="" mem_bytes="" total_mb=""

  if [[ -f $cgroup_root/memory.max ]]; then
    cgroup_limit=$(cat "$cgroup_root/memory.max" 2>/dev/null || true)
    # cgroup v2 writes the word "max" when no limit is set.
    if [[ $cgroup_limit == "max" ]]; then
      cgroup_limit=""
    fi
  elif [[ -f $cgroup_root/memory/memory.limit_in_bytes ]]; then
    cgroup_limit=$(cat "$cgroup_root/memory/memory.limit_in_bytes" 2>/dev/null || true)
  fi

  # Accept the limit only if it is a number below 9223372036854771712 (values
  # at or above that are treated as "no limit"). Numbers of up to 18 digits
  # always qualify; 19-digit numbers are compared as strings, which for equal
  # lengths matches numeric order and cannot overflow 64-bit arithmetic.
  if [[ $cgroup_limit =~ ^[0-9]+$ ]]; then
    if (( ${#cgroup_limit} < 19 )) \
      || [[ ${#cgroup_limit} -eq 19 && $cgroup_limit < 9223372036854771712 ]]; then
      # 10# forces base 10 so that leading zeros are never read as octal.
      limit_mb=$(( 10#$cgroup_limit / 1024 / 1024 ))
    fi
  fi

  memtotal_kb=$(awk '/MemTotal/ { print $2; exit }' "$meminfo" 2>/dev/null || true)
  if [[ $memtotal_kb =~ ^[0-9]+$ ]]; then
    total_mb=$(( 10#$memtotal_kb / 1024 ))
  else
    # No usable meminfo file (e.g. macOS/BSD): ask sysctl for the physical
    # memory size in bytes instead.
    mem_bytes=$(sysctl -n hw.memsize 2>/dev/null || sysctl -n hw.physmem 2>/dev/null || true)
    if [[ $mem_bytes =~ ^[0-9]+$ ]]; then
      total_mb=$(( 10#$mem_bytes / 1024 / 1024 ))
    fi
  fi

  if [[ -n $limit_mb && -n $total_mb ]]; then
    if (( limit_mb < total_mb )); then
      printf '%s\n' "$limit_mb"
    else
      printf '%s\n' "$total_mb"
    fi
  else
    printf '%s\n' "${limit_mb:-${total_mb:-4096}}"
  fi
}

# JVM options that the launcher puts in front of KORAPXMLTOOL_JAVA_OPTS.
EXTRA_OPTS=()
# Add --enable-native-access=ALL-UNNAMED unless JDK_JAVA_OPTIONS already
# carries an --enable-native-access setting of its own.
case "${JDK_JAVA_OPTIONS:-}" in
  *"--enable-native-access="*) ;;
  *) EXTRA_OPTS+=(--enable-native-access=ALL-UNNAMED) ;;
esac

Marc Kupietz02cd8bf2025-11-16 14:21:25 +010057if ! $has_xmx; then
Marc Kupietz570d0e02025-11-17 10:42:41 +010058 if [[ -n ${KORAPXMLTOOL_XMX:-} ]]; then
59 # Handle KORAPXMLTOOL_XMX with units (g/G for GB, m/M for MB, or just number for MB)
60 if [[ ${KORAPXMLTOOL_XMX} =~ ^[0-9]+[gG]$ ]]; then
61 # Convert GB to MB
62 xmx_gb=${KORAPXMLTOOL_XMX%[gG]}
63 xmx_mb=$((xmx_gb * 1024))
64 elif [[ ${KORAPXMLTOOL_XMX} =~ ^[0-9]+[mM]$ ]]; then
65 # Extract MB value
66 xmx_mb=${KORAPXMLTOOL_XMX%[mM]}
67 elif [[ ${KORAPXMLTOOL_XMX} =~ ^[0-9]+$ ]]; then
68 # Treat plain number as MB for backward compatibility
69 xmx_mb=${KORAPXMLTOOL_XMX}
70 else
71 echo "Warning: Invalid KORAPXMLTOOL_XMX format '${KORAPXMLTOOL_XMX}'. Use formats like '20g', '8192m', or '8192'." >&2
72 xmx_mb=""
73 fi
74 fi
75
76 # If no valid XMX was provided or parsing failed, use auto-detection
77 if [[ -z ${xmx_mb:-} ]]; then
Marc Kupietz02cd8bf2025-11-16 14:21:25 +010078 mem_mb=$(detect_mem_limit_mb)
Marc Kupietz803394d2025-11-20 10:41:24 +010079
80 # Intelligent memory allocation based on use case
81 # Check command line arguments for workload hints
82 workload_type="default"
83 large_corpus=false
84 has_annotation=false
85
86 # Detect large corpus (>5GB input files)
87 # Note: This is a rough estimate - exact size will be calculated in Kotlin
88 total_input_size=0
89 for arg in "$@"; do
90 # Skip options and flags, only process files
91 if [[ "$arg" != -* && "$arg" == *.zip ]]; then
92 if [[ -e "$arg" ]]; then
93 # Use -L to follow symlinks (many corpora use symlinked ZIPs)
94 size=$(stat -L -c%s "$arg" 2>/dev/null || stat -L -f%z "$arg" 2>/dev/null || echo 0)
95 total_input_size=$((total_input_size + size))
96 fi
97 fi
98 done
99
100 if (( total_input_size > 0 )); then
101 # Calculate with one decimal place for better accuracy with smaller files
102 if command -v bc >/dev/null 2>&1; then
103 total_input_gb_precise=$(echo "scale=1; $total_input_size / (1024*1024*1024)" | bc -l)
104 # Ensure we always have .X format, even for values < 1
105 [[ "$total_input_gb_precise" != *.* ]] && total_input_gb_precise="${total_input_gb_precise}.0"
106 else
107 # Fallback using awk for better precision
108 total_input_gb_precise=$(awk "BEGIN {printf \"%.1f\", $total_input_size / (1024*1024*1024)}")
109 fi
110 # Integer part for comparison (convert 1.5 -> 1, 0.1 -> 0, etc.)
111 total_input_gb=$(echo "$total_input_gb_precise" | cut -d. -f1)
112 # Handle case where total_input_gb might be empty for very small files
113 [[ -z "$total_input_gb" ]] && total_input_gb=0
114 (( total_input_gb > 5 )) && large_corpus=true
115 else
116 # Cannot determine size reliably at shell level
117 total_input_gb=0
118 total_input_gb_precise="0.0"
119 fi
120
121 # Detect annotation workloads (memory-intensive)
122 for arg in "$@"; do
123 case "$arg" in
124 -T|--tag-with|-P|--parse-with) has_annotation=true ;;
125 *) ;;
126 esac
127 done
128
129 # Calculate memory based on workload
130 if [[ "$has_annotation" == true ]]; then
131 # Annotation: need substantial memory for models + processing
132 if [[ "$large_corpus" == true ]]; then
133 # Large corpus + annotation: use 85% of memory, min 8GB, max 128GB
134 xmx_mb=$(( mem_mb * 85 / 100 ))
135 (( xmx_mb < 8192 )) && xmx_mb=8192
136 (( xmx_mb > 131072 )) && xmx_mb=131072
137 else
138 # Small corpus + annotation: use 80% of memory, min 4GB, max 64GB
139 xmx_mb=$(( mem_mb * 80 / 100 ))
140 (( xmx_mb < 4096 )) && xmx_mb=4096
141 (( xmx_mb > 65536 )) && xmx_mb=65536
142 fi
143 workload_type="annotation"
144 elif [[ "$large_corpus" == true ]]; then
145 # Large corpus without annotation: use 80% memory, min 4GB, max 96GB
146 xmx_mb=$(( mem_mb * 80 / 100 ))
147 (( xmx_mb < 4096 )) && xmx_mb=4096
148 (( xmx_mb > 98304 )) && xmx_mb=98304
149 workload_type="large_corpus"
150 else
151 # Default: lightweight processing, use 75% memory, min 1GB, max 32GB
152 xmx_mb=$(( mem_mb * 75 / 100 ))
153 (( xmx_mb < 1024 )) && xmx_mb=1024
154 (( xmx_mb > 32768 )) && xmx_mb=32768
155 workload_type="default"
156 fi
157
158 echo "Auto-detected: workload=$workload_type, input=${total_input_gb_precise}GB, memory=${xmx_mb}MB" >&2
Marc Kupietz02cd8bf2025-11-16 14:21:25 +0100159 fi
160 EXTRA_OPTS+=("-Xmx${xmx_mb}m")
161fi
162
#######################################
# Print the absolute directory that contains the given file.
# Arguments: $1 - path of the file (here: the launcher itself)
# Outputs:   the directory on stdout
# Returns:   non-zero if the directory cannot be entered
#######################################
_kxt_script_dir() {
  # CDPATH is cleared for the cd: otherwise a relative directory could be
  # resolved against CDPATH, in which case cd would enter a different
  # directory and additionally print its name into the captured output.
  CDPATH= cd -- "$(dirname -- "$1")" && pwd
}

# Set default KORAPXMLTOOL_MODELS_PATH relative to executable if not already set
if [[ -z ${KORAPXMLTOOL_MODELS_PATH:-} ]]; then
  SCRIPT_DIR=$(_kxt_script_dir "$0")
  KORAPXMLTOOL_MODELS_PATH="${SCRIPT_DIR}/../lib/models"
  export KORAPXMLTOOL_MODELS_PATH
fi

# Hand over to the JVM: the launcher's own options first, then the user's
# KORAPXMLTOOL_JAVA_OPTS, then this file (launcher + jar) as the jar to run,
# followed by the original command-line arguments.
#
# KORAPXMLTOOL_JAVA_OPTS is deliberately left unquoted so that it is split on
# whitespace into separate options. Pathname expansion is switched off first so
# that wildcard characters in it (e.g. a "lib/*" class path) are passed through
# verbatim instead of being expanded by the shell.
set -f
# The ${array[@]+...} form expands to nothing for an empty EXTRA_OPTS; a plain
# "${EXTRA_OPTS[@]}" aborts under `set -u` in bash versions before 4.4.
# shellcheck disable=SC2086
exec java ${EXTRA_OPTS[@]+"${EXTRA_OPTS[@]}"} ${KORAPXMLTOOL_JAVA_OPTS:-} -jar "$0" "$@"