Add missing tokenizer_faulty.pl
Change-Id: Iccb2f2a3418bbc03e88e5e9cd577ecc3afc96107
diff --git a/t/cmd/tokenizer_faulty.pl b/t/cmd/tokenizer_faulty.pl
new file mode 100644
index 0000000..b9f3610
--- /dev/null
+++ b/t/cmd/tokenizer_faulty.pl
@@ -0,0 +1,85 @@
+#!/usr/bin/env perl
+#
+# Test helper: a tokenizer command that can be made to fail on demand.
+#
+# Usage: tokenizer_faulty.pl [STATE_FILE]
+#
+# Reads UTF-8 text from STDIN line by line, splits each line on EOT (\x04)
+# separators and, for every non-empty piece, prints one line to STDOUT with
+# the space-joined result of the Aggressive tokenizer's boundaries().
+#
+# Two marker strings in the input make the process exit with status 9:
+#   __ALWAYS_CRASH__  exits every time the marker is seen.
+#   __CRASH_ONCE__    exits only while STATE_FILE does not yet hold a true
+#                     value; the flag is written right before exiting.
+#                     Without a STATE_FILE this marker exits every time.
+use strict;
+use warnings;
+use FindBin;
+use Encode qw(decode_utf8);
+
+BEGIN {
+  unshift @INC, "$FindBin::Bin/../../lib";
+};
+
+use KorAP::XML::TEI::Tokenizer::Aggressive;
+
+# Autoflush STDOUT so every result line is written out immediately.
+$| = 1;
+
+my $state_file = shift @ARGV;
+my $tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
+
+# Return the flag stored on the first line of the state file, or 0 when no
+# state file was given, it cannot be opened, or it holds a false value.
+sub _state {
+  return 0 unless $state_file;
+  open(my $fh, '<', $state_file) or return 0;
+  my $count = <$fh>;
+  close($fh);
+  return 0 unless defined $count;
+  chomp $count;
+  return $count || 0;
+}
+
+# Store $count in the state file; does nothing when no state file was given.
+# Dies if the file cannot be opened, written, or closed, because a write
+# error may only surface on close and would otherwise lose the flag silently.
+sub _set_state {
+  my $count = shift;
+  return unless $state_file;
+  open(my $fh, '>', $state_file)
+    or die "Can't write state file '$state_file': $!";
+  print {$fh} $count
+    or die "Can't write state file '$state_file': $!";
+  close($fh)
+    or die "Can't write state file '$state_file': $!";
+  return;
+}
+
+# Read from STDIN explicitly: <> would switch to files named in @ARGV if any
+# arguments were left over after the state file.
+while (defined(my $raw = <STDIN>)) {
+  my $line = decode_utf8($raw);
+
+  for my $text (split(/\n?\x{04}\n?/, $line)) {
+    next if $text eq '';
+
+    # Simulated crash that stops recurring once the state flag is set.
+    if (index($text, '__CRASH_ONCE__') >= 0 && !_state()) {
+      _set_state(1);
+      exit 9;
+    }
+
+    # Simulated crash that recurs on every occurrence of the marker.
+    if (index($text, '__ALWAYS_CRASH__') >= 0) {
+      exit 9;
+    }
+
+    $tok->tokenize($text);
+    print join(' ', $tok->boundaries), "\n";
+    $tok->reset;
+  }
+}
+
+1;