blob: a4354611638d51749aa302b37744893b23f288e9 [file] [log] [blame]
#!/usr/bin/env bash
# Shebang header for korapxmltool: auto-enables native access (Java 25+) and chooses a default -Xmx.
# Usage:
# cat korapxmltool.shebang app/build/libs/korapxmltool.jar > korapxmltool
# chmod +x korapxmltool
# (Run `zip -A korapxmltool` if your unzip/java complains about prepended bytes.)
#
# Override memory:
# export KORAPXMLTOOL_XMX=20g # sets -Xmx20g
# export KORAPXMLTOOL_XMX=8192m # sets -Xmx8192m
# export KORAPXMLTOOL_JAVA_OPTS="... -Xmx4g" # full custom opts
# Otherwise we pick 75-85% of detected memory (cgroup aware) depending on the workload,
# clamped to a workload-specific range (default workload: 75%, [1024m, 32768m]).
set -euo pipefail
# Decide whether a maximum heap size has already been requested, either through
# one of the JVM option environment variables or through a command-line
# argument that starts with -Xmx. The result ("true"/"false") is left in has_xmx.
has_xmx=false
case "${JDK_JAVA_OPTIONS:-} ${JAVA_TOOL_OPTIONS:-} ${KORAPXMLTOOL_JAVA_OPTS:-}" in
  *-Xmx*) has_xmx=true ;;
esac
if [[ $has_xmx == false ]]; then
  for cli_arg in "$@"; do
    if [[ $cli_arg == -Xmx* ]]; then
      has_xmx=true
      break
    fi
  done
fi
#######################################
# Print the amount of memory, in MB, that this process may reasonably use.
#
# Sources, in order:
#   * cgroup limit: /sys/fs/cgroup/memory.max (v2), else
#     /sys/fs/cgroup/memory/memory.limit_in_bytes (v1)
#   * physical RAM: MemTotal from /proc/meminfo, else `sysctl -n hw.memsize`
#     (systems without /proc, e.g. macOS)
# If both are known the smaller one wins; if neither is known, 4096 is printed.
#
# Arguments: none
# Outputs:   one non-negative integer (MB) on stdout
# Returns:   0
#######################################
detect_mem_limit_mb() {
  local cgroup_limit=""
  if [[ -f /sys/fs/cgroup/memory.max ]]; then
    cgroup_limit=$(< /sys/fs/cgroup/memory.max) || cgroup_limit=""
  elif [[ -f /sys/fs/cgroup/memory/memory.limit_in_bytes ]]; then
    cgroup_limit=$(< /sys/fs/cgroup/memory/memory.limit_in_bytes) || cgroup_limit=""
  fi

  # "max" (cgroup v2, unlimited) is rejected by the digits-only check.
  # Values with 19+ digits (cgroup v1 reports 9223372036854771712 for
  # "unlimited") are treated as no limit; checking the length instead of
  # comparing numerically avoids 64-bit overflow in shell arithmetic.
  local limit_mb=""
  if [[ $cgroup_limit =~ ^[0-9]+$ ]] && (( ${#cgroup_limit} <= 18 )); then
    limit_mb=$(( 10#$cgroup_limit / 1024 / 1024 ))
  fi

  local total_mb="" memtotal_kb="" memsize_bytes=""
  if [[ -r /proc/meminfo ]]; then
    memtotal_kb=$(awk '$1 == "MemTotal:" { print $2; exit }' /proc/meminfo 2>/dev/null) \
      || memtotal_kb=""
  fi
  if [[ $memtotal_kb =~ ^[0-9]+$ ]]; then
    total_mb=$(( 10#$memtotal_kb / 1024 ))
  elif command -v sysctl >/dev/null 2>&1; then
    # No usable /proc/meminfo: hw.memsize is the physical memory in bytes on macOS.
    memsize_bytes=$(sysctl -n hw.memsize 2>/dev/null) || memsize_bytes=""
    if [[ $memsize_bytes =~ ^[0-9]+$ ]]; then
      total_mb=$(( 10#$memsize_bytes / 1024 / 1024 ))
    fi
  fi

  if [[ -n $limit_mb && -n $total_mb ]]; then
    if (( limit_mb < total_mb )); then
      printf '%s\n' "$limit_mb"
    else
      printf '%s\n' "$total_mb"
    fi
  else
    printf '%s\n' "${limit_mb:-${total_mb:-4096}}"
  fi
}
# JVM options injected by this launcher. Native access is enabled for the
# unnamed module unless JDK_JAVA_OPTIONS already carries such a setting.
EXTRA_OPTS=()
case "${JDK_JAVA_OPTIONS:-}" in
  *--enable-native-access=*)
    : # already configured by the caller; leave it alone
    ;;
  *)
    EXTRA_OPTS+=(--enable-native-access=ALL-UNNAMED)
    ;;
esac
# ---------------------------------------------------------------------------
# Heap sizing (-Xmx)
#
# Skipped entirely when has_xmx is true. Otherwise:
#   1. KORAPXMLTOOL_XMX ("20g", "8192m", or a plain number meaning MB) is used
#      if it parses; a malformed value only produces a warning.
#   2. Failing that, the heap is a percentage of detect_mem_limit_mb, chosen
#      from workload hints found on the command line:
#        annotation + large corpus : 85%, clamped to [8192, 131072] MB
#        annotation                : 80%, clamped to [4096,  65536] MB
#        large corpus              : 80%, clamped to [4096,  98304] MB
#        default                   : 75%, clamped to [1024,  32768] MB
# The result is appended to EXTRA_OPTS as -Xmx<N>m.
# ---------------------------------------------------------------------------

# Convert a KORAPXMLTOOL_XMX value to MB.
# Arguments: $1 - "<digits>g|G" (GB), "<digits>m|M" (MB) or "<digits>" (MB)
# Outputs:   the size in MB on stdout
# Returns:   0 on success, 1 (no output) if the value is malformed
kxt_parse_xmx_mb() {
  local spec=${1-}
  [[ $spec =~ ^([0-9]+)([gGmM]?)$ ]] || return 1
  local digits=${BASH_REMATCH[1]} unit=${BASH_REMATCH[2]}
  # 10# forces decimal so a leading zero ("08g") is not read as octal.
  local value=$(( 10#$digits ))
  case "$unit" in
    g | G) value=$(( value * 1024 )) ;;
  esac
  printf '%s\n' "$value"
}

# Print the combined size in bytes of all existing *.zip operands in "$@".
# Arguments starting with "-" are ignored. Symlinks are followed.
kxt_zip_input_bytes() {
  local total=0 arg size
  for arg in "$@"; do
    [[ $arg != -* && $arg == *.zip && -e $arg ]] || continue
    # GNU stat first, BSD/macOS stat second; an unreadable file counts as 0.
    size=$(stat -L -c%s "$arg" 2>/dev/null || stat -L -f%z "$arg" 2>/dev/null || echo 0)
    [[ $size =~ ^[0-9]+$ ]] || size=0
    total=$(( total + size ))
  done
  printf '%s\n' "$total"
}

# Succeed if $1 (bytes) is strictly more than 5 GiB.
# Comparing bytes avoids losing the fractional part: 5.5 GiB is "large".
kxt_is_large_corpus() {
  (( $1 > 5 * 1024 * 1024 * 1024 ))
}

# Print $1 (bytes) as GiB with exactly one decimal digit (truncated), always
# with a leading integer part, e.g. "0.4", "12.0".
kxt_format_gb() {
  local tenths=$(( $1 * 10 / 1073741824 ))
  printf '%d.%d\n' "$(( tenths / 10 ))" "$(( tenths % 10 ))"
}

# Succeed if "$@" contains one of the options this launcher treats as an
# annotation workload: -T/--tag-with or -P/--parse-with.
# The --opt=value spelling is matched too (assumes the CLI accepts it; the
# extra patterns are harmless if it does not).
kxt_has_annotation_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      -T | --tag-with | --tag-with=* | -P | --parse-with | --parse-with=*)
        return 0
        ;;
    esac
  done
  return 1
}

# Compute the automatic heap size.
# Arguments: $1 - usable memory in MB
#            $2 - "true" if this is an annotation workload
#            $3 - "true" if the input is a large corpus
# Outputs:   heap size in MB on stdout
kxt_auto_xmx_mb() {
  local mem_mb=$1 annotation=$2 large=$3
  local percent floor_mb cap_mb
  if [[ $annotation == true && $large == true ]]; then
    percent=85 floor_mb=8192 cap_mb=131072
  elif [[ $annotation == true ]]; then
    percent=80 floor_mb=4096 cap_mb=65536
  elif [[ $large == true ]]; then
    percent=80 floor_mb=4096 cap_mb=98304
  else
    percent=75 floor_mb=1024 cap_mb=32768
  fi
  local heap_mb=$(( mem_mb * percent / 100 ))
  if (( heap_mb < floor_mb )); then heap_mb=$floor_mb; fi
  if (( heap_mb > cap_mb )); then heap_mb=$cap_mb; fi
  printf '%s\n' "$heap_mb"
}

if ! $has_xmx; then
  # Start from a known state so an xmx_mb inherited from the environment is
  # never mistaken for a parsed KORAPXMLTOOL_XMX.
  xmx_mb=""
  if [[ -n ${KORAPXMLTOOL_XMX:-} ]]; then
    if ! xmx_mb=$(kxt_parse_xmx_mb "$KORAPXMLTOOL_XMX"); then
      echo "Warning: Invalid KORAPXMLTOOL_XMX format '${KORAPXMLTOOL_XMX}'. Use formats like '20g', '8192m', or '8192'." >&2
      xmx_mb=""
    fi
  fi

  # No usable explicit size: derive one from detected memory and workload hints.
  if [[ -z $xmx_mb ]]; then
    mem_mb=$(detect_mem_limit_mb)

    # Rough shell-level estimate of the input size from *.zip arguments.
    total_input_size=$(kxt_zip_input_bytes "$@")
    total_input_gb_precise=$(kxt_format_gb "$total_input_size")

    large_corpus=false
    if kxt_is_large_corpus "$total_input_size"; then
      large_corpus=true
    fi

    has_annotation=false
    if kxt_has_annotation_args "$@"; then
      has_annotation=true
    fi

    if [[ $has_annotation == true ]]; then
      workload_type="annotation"
    elif [[ $large_corpus == true ]]; then
      workload_type="large_corpus"
    else
      workload_type="default"
    fi

    xmx_mb=$(kxt_auto_xmx_mb "$mem_mb" "$has_annotation" "$large_corpus")
    echo "Auto-detected: workload=$workload_type, input=${total_input_gb_precise}GB, memory=${xmx_mb}MB" >&2
  fi
  EXTRA_OPTS+=("-Xmx${xmx_mb}m")
fi
# Default KORAPXMLTOOL_MODELS_PATH to ../lib/models relative to this executable
# unless the caller has already set it.
if [[ -z ${KORAPXMLTOOL_MODELS_PATH:-} ]]; then
  # CDPATH= and the stdout redirect stop `cd` from echoing a directory name
  # into the captured output when the user has CDPATH set.
  SCRIPT_DIR="$(CDPATH= cd -- "$(dirname -- "$0")" >/dev/null && pwd)"
  export KORAPXMLTOOL_MODELS_PATH="${SCRIPT_DIR}/../lib/models"
fi

# KORAPXMLTOOL_JAVA_OPTS is intentionally expanded unquoted so that it splits
# into individual JVM options; `set -f` keeps that splitting but prevents
# characters such as * or ? inside an option from being glob-expanded against
# the current directory. No need to restore it: exec replaces this shell.
set -f

# ${EXTRA_OPTS[@]+...}: bash older than 4.4 treats an empty array as unset
# under `set -u`, which would abort here when no option had to be injected.
# This file doubles as the JAR ("$0") once the archive has been appended.
# shellcheck disable=SC2086
exec java ${EXTRA_OPTS[@]+"${EXTRA_OPTS[@]}"} ${KORAPXMLTOOL_JAVA_OPTS:-} -jar "$0" "$@"