blob: b9f36105c3e4b6d76c1334ffca9fa4b8a9a2afd8 [file] [log] [blame]
#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
use Encode qw(decode_utf8);
BEGIN {
unshift @INC, "$FindBin::Bin/../../lib";
};
use KorAP::XML::TEI::Tokenizer::Aggressive;
# Turn on autoflush for the currently selected output handle (STDOUT unless
# something has called select) so every result line is written immediately.
$| = 1;
# Optional first command-line argument: path of the file used to persist
# state between runs. Undefined when no argument was given.
my $state_file = shift @ARGV;
# One tokenizer instance, reused for every input text.
my $tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
# Read the value persisted in the state file.
#
# Returns 0 when no state file is configured, when the file cannot be
# opened, or when its first line is missing or false. Otherwise returns
# the first line with its trailing newline removed.
sub _state {
  if (!$state_file) {
    return 0;
  }

  my $in;
  if (!open($in, '<', $state_file)) {
    # A missing or unreadable file simply means "no state recorded yet".
    return 0;
  }

  my $first_line = <$in>;
  close($in);

  # An empty file yields undef; treat it like a stored 0.
  $first_line = 0 unless defined $first_line;
  chomp $first_line;

  return $first_line ? $first_line : 0;
}
# Persist a value in the state file, replacing any previous content.
#
# Parameters:
#   $count - value to store; written verbatim, without a trailing newline.
#
# Returns nothing when no state file is configured, and a true value once
# the data has been written. Dies when the file cannot be opened, written
# or closed. Write and close are checked because buffered output errors
# only surface there, and the caller exits right after this call - a
# silently lost write would go unnoticed.
sub _set_state {
  my ($count) = @_;
  return unless $state_file;

  open(my $fh, '>', $state_file)
    or die "Can't write state file '$state_file': $!";
  print {$fh} $count
    or die "Can't write state file '$state_file': $!";
  close($fh)
    or die "Can't write state file '$state_file': $!";

  return 1;
}
# Main loop: read STDIN line by line, decode each line as UTF-8, split it
# on the EOT (U+0004) separator and, for every non-empty text, print the
# tokenizer's boundaries as one space-separated line.
#
# Two marker strings trigger a deliberate failure with exit status 9:
#   __CRASH_ONCE__   - exits only while the persisted state is false, and
#                      records state 1 first (when a state file is
#                      configured) so that a later run gets past it
#   __ALWAYS_CRASH__ - exits every time
#
# Input is read from STDIN explicitly, and the loop ends when the read
# returns undef, so the end-of-input test and the read always refer to the
# same handle.
while (defined(my $raw = <STDIN>)) {
  my $line = decode_utf8($raw);

  # The separator may carry one newline on either side; both are consumed.
  for my $text (split(/\n?\x{04}\n?/, $line)) {
    next if $text eq '';

    if (index($text, '__CRASH_ONCE__') >= 0 && !_state()) {
      _set_state(1);
      exit 9;
    }

    if (index($text, '__ALWAYS_CRASH__') >= 0) {
      exit 9;
    }

    $tok->tokenize($text);
    print join(' ', $tok->boundaries), "\n";

    # Clear the tokenizer before the next text is processed.
    $tok->reset;
  }
}
1;