| Marc Kupietz | cfaefb9 | 2026-04-07 12:13:11 +0200 | [diff] [blame^] | 1 | #!/usr/bin/env perl |
| 2 | use strict; |
| 3 | use warnings; |
| 4 | use FindBin; |
| 5 | use Encode qw(decode_utf8); |
| 6 | |
| 7 | BEGIN { |
| 8 | unshift @INC, "$FindBin::Bin/../../lib"; |
| 9 | }; |
| 10 | |
| 11 | use KorAP::XML::TEI::Tokenizer::Aggressive; |
| 12 | |
# Tokenizer instance reused for every text segment read from the input.
my $tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;

# Optional first command-line argument: path of the file that _state()
# reads and _set_state() writes. When absent, no state is persisted.
my $state_file = shift @ARGV;

# Enable autoflush on the currently selected output handle so that each
# printed result line is emitted immediately rather than buffered.
$| = 1;
| 17 | |
# Return the value stored on the first line of the state file.
#
# Returns 0 when no state file was configured, when the file cannot be
# opened for reading, when it is empty, or when the stored value is false.
sub _state {
    if (!$state_file) {
        return 0;
    }

    my $opened = open(my $in, '<', $state_file);
    return 0 if !$opened;

    # Only the first line is relevant; an empty file yields undef here.
    my $first_line = <$in>;
    close($in);

    $first_line = 0 unless defined $first_line;
    chomp $first_line;

    return $first_line ? $first_line : 0;
}
| 26 | |
# Persist $count as the sole content of the state file.
#
# Parameters:
#   $count - value to store (the caller in this file passes 1).
#
# Does nothing (and returns an empty value) when no state file was
# configured. Returns a true value on success.
#
# Dies if the file cannot be opened, written, or closed. Write errors on a
# buffered handle may only surface at close(), so close() is checked too;
# otherwise a failed write would go unnoticed and _state() would keep
# reporting 0.
sub _set_state {
    my $count = shift;
    return unless $state_file;

    open(my $fh, '>', $state_file)
        or die "Can't write state file '$state_file': $!";
    print {$fh} $count
        or die "Can't write state file '$state_file': $!";
    close($fh)
        or die "Can't close state file '$state_file': $!";

    return 1;
}
| 34 | |
# Main loop: read UTF-8 encoded lines from STDIN, split each line into text
# segments on the EOT character (U+0004, optionally surrounded by newlines),
# tokenize every non-empty segment and print its boundaries on one line.
#
# Input is read from STDIN explicitly. The previous form tested eof(STDIN)
# but read through <>, which only refers to STDIN while @ARGV is empty; any
# leftover argument would have made the test and the read use different
# handles.
#
# Crash injection (exit status 9):
#   __CRASH_ONCE__   - exit only if no state has been recorded yet; the
#                      state is recorded before exiting, so a later run with
#                      the same state file processes the segment normally.
#   __ALWAYS_CRASH__ - exit every time the marker is seen.
while (defined(my $raw = <STDIN>)) {
    my $line = decode_utf8($raw);

    for my $text (split(/\n?\x{04}\n?/, $line)) {
        next if !defined $text || $text eq '';

        if (index($text, '__CRASH_ONCE__') >= 0 && !_state()) {
            _set_state(1);
            exit 9;
        }

        if (index($text, '__ALWAYS_CRASH__') >= 0) {
            exit 9;
        }

        $tok->tokenize($text);
        print join(' ', $tok->boundaries), "\n";

        # Clear the tokenizer before the next segment.
        $tok->reset;
    }
}

1;