blob: b9f36105c3e4b6d76c1334ffca9fa4b8a9a2afd8 [file] [log] [blame]
Marc Kupietzcfaefb92026-04-07 12:13:11 +02001#!/usr/bin/env perl
2use strict;
3use warnings;
4use FindBin;
5use Encode qw(decode_utf8);
6
7BEGIN {
8 unshift @INC, "$FindBin::Bin/../../lib";
9};
10
11use KorAP::XML::TEI::Tokenizer::Aggressive;
12
# Turn on autoflush for the currently selected output handle so every
# result line is written out as soon as it is printed.
$| = 1;

# Optional first command line argument: path of the file used to remember
# that a simulated one-time crash has already happened. Undefined when no
# argument was passed.
my $state_file = shift(@ARGV);

# Single tokenizer instance, reused (and reset) for every text.
my $tok = KorAP::XML::TEI::Tokenizer::Aggressive->new();
17
# Read the persisted crash marker.
#
# Returns the first line of the state file (trailing newline removed) if
# that value is true. Returns 0 in every other case: no state file was
# configured, the file cannot be opened for reading, it is empty, or its
# first line is a false value.
sub _state {
  if ($state_file && open(my $in, '<', $state_file)) {
    my $first = <$in>;
    close($in);

    # An empty file yields undef; treat it like a stored 0.
    $first = 0 unless defined $first;
    chomp $first;

    return $first ? $first : 0;
  }

  return 0;
}
26
# Persist the crash marker.
#
# Overwrites the state file with $count. Does nothing (and returns
# nothing) when no state file was configured. Returns 1 on success.
#
# Dies if the file cannot be opened, written or closed. Buffered write
# errors only surface at close time, so print and close are checked as
# well as open; otherwise a lost marker would go unnoticed.
sub _set_state {
  my $count = shift;
  return unless $state_file;

  open(my $fh, '>', $state_file)
    or die "Can't write state file '$state_file': $!";
  print {$fh} $count
    or die "Can't write state file '$state_file': $!";
  close($fh)
    or die "Can't close state file '$state_file': $!";

  return 1;
}
34
# Main loop: read STDIN line by line until end of input.
#
# Each line is decoded from UTF-8 and split into texts at the \x{04}
# separator (optionally surrounded by newlines); empty pieces are skipped.
# For every remaining text:
#
#   * if it contains '__CRASH_ONCE__' and no true state has been stored
#     yet, the state is set to 1 and the process exits with status 9;
#   * if it contains '__ALWAYS_CRASH__', the process exits with status 9;
#   * otherwise the text is tokenized and the list returned by
#     $tok->boundaries is printed space-separated on one line, after
#     which the tokenizer is reset.
#
# Input is read explicitly from STDIN and the loop ends when the read
# returns undef. The read handle and the end-of-input check therefore
# always refer to the same stream, even if further arguments are left
# in @ARGV.
while (defined(my $raw = <STDIN>)) {
  my $line = decode_utf8($raw);

  for my $text (split(/\n?\x{04}\n?/, $line)) {
    next if !defined $text || $text eq '';

    # Simulate a crash that happens only on the first encounter.
    if (index($text, '__CRASH_ONCE__') >= 0 && !_state()) {
      _set_state(1);
      exit 9;
    }

    # Simulate a crash that happens on every encounter.
    if (index($text, '__ALWAYS_CRASH__') >= 0) {
      exit 9;
    }

    $tok->tokenize($text);
    print join(' ', $tok->boundaries), "\n";
    $tok->reset;
  }
}

1;