Make KorAP tokenizer max heap size adjustable
via the environment variable KORAPXMLTEI_TOKENIZER_HEAP_SIZE
Resolves #8
Change-Id: I600de1006acd6c00eab9b2b1c9c8b5492f20f4f2
diff --git a/Changes b/Changes
index c2de592..f5904b6 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+ - Make KorAP-Tokenizer maximum heap size configurable via environment
+ variable KORAPXMLTEI_TOKENIZER_HEAP_SIZE (defaults to 512m).
+
2.6.0 2024-11-11
- Add -o parameter.
- Add support for inline dependency relations.
diff --git a/Readme.pod b/Readme.pod
index a587374..35f643e 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -293,6 +293,11 @@
Activate minimal debugging.
Defaults to C<false>.
+=item B<KORAPXMLTEI_TOKENIZER_HEAP_SIZE>
+
+Set the maximum Java heap size (passed to the JVM as C<-Xmx>) for the
+KorAP tokenizer process, e.g. C<1g>. Defaults to C<512m>.
+
=back
=head1 COPYRIGHT AND LICENSE
diff --git a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
index 13baa9d..ba39fc7 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
@@ -39,8 +39,9 @@
# Construct a new KorAP Tokenizer
sub new {
my ($class, $sentence_split) = @_;
- my $self = $class->SUPER::new("$java -Xmx512m -jar $tokenizer_jar --no-tokens --positions" .
- ($sentence_split? " --sentence-boundaries" : ""));
+ my $heap_size = $ENV{KORAPXMLTEI_TOKENIZER_HEAP_SIZE} // '512m';
+ my $self = $class->SUPER::new("$java -Xmx$heap_size -jar $tokenizer_jar --no-tokens --positions" .
+ ($sentence_split ? " --sentence-boundaries" : ""));
$self->{sentence_split} = $sentence_split;
$self->{name} = 'korap';
$self->{sep} = "\n\x{04}\n";
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 7d079ca..7d9cfbd 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -804,6 +804,11 @@
Activate minimal debugging.
Defaults to C<false>.
+=item B<KORAPXMLTEI_TOKENIZER_HEAP_SIZE>
+
+Set the maximum Java heap size (passed to the JVM as C<-Xmx>) for the
+KorAP tokenizer process, e.g. C<1g>. Defaults to C<512m>.
+
=back
=head1 COPYRIGHT AND LICENSE