Make KorAP tokenizer max heap size adjustable

The limit is now read from the environment variable
KORAPXMLTEI_TOKENIZER_HEAP_SIZE and defaults to 512m.

Resolves #8

Change-Id: I600de1006acd6c00eab9b2b1c9c8b5492f20f4f2
diff --git a/Changes b/Changes
index c2de592..f5904b6 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+        - Make KorAP-Tokenizer maximum heap size configurable via the
+          environment variable KORAPXMLTEI_TOKENIZER_HEAP_SIZE.
+
 2.6.0 2024-11-11
         - Add -o parameter.
         - Add support for inline dependency relations.
diff --git a/Readme.pod b/Readme.pod
index a587374..35f643e 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -293,6 +293,11 @@
 Activate minimal debugging.
 Defaults to C<false>.
 
+=item B<KORAPXMLTEI_TOKENIZER_HEAP_SIZE>
+
+Set the maximum Java heap size (C<-Xmx>) of the KorAP tokenizer process.
+Defaults to C<512m>.
+
 =back
 
 =head1 COPYRIGHT AND LICENSE
diff --git a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
index 13baa9d..ba39fc7 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
@@ -39,8 +39,9 @@
 # Construct a new KorAP Tokenizer
 sub new {
   my ($class, $sentence_split) = @_;
-  my $self = $class->SUPER::new("$java -Xmx512m -jar $tokenizer_jar --no-tokens --positions" .
-      ($sentence_split? " --sentence-boundaries" : ""));
+  my $heap_size = $ENV{KORAPXMLTEI_TOKENIZER_HEAP_SIZE} // '512m';
+  my $self = $class->SUPER::new("$java -Xmx$heap_size -jar $tokenizer_jar --no-tokens --positions" .
+      ($sentence_split ? " --sentence-boundaries" : ""));
   $self->{sentence_split} = $sentence_split;
   $self->{name} = 'korap';
   $self->{sep} = "\n\x{04}\n";
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 7d079ca..7d9cfbd 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -804,6 +804,11 @@
 Activate minimal debugging.
 Defaults to C<false>.
 
+=item B<KORAPXMLTEI_TOKENIZER_HEAP_SIZE>
+
+Set the maximum Java heap size (C<-Xmx>) of the KorAP tokenizer process.
+Defaults to C<512m>.
+
 =back
 
 =head1 COPYRIGHT AND LICENSE