Add missing tokenizer_faulty.pl

Change-Id: Iccb2f2a3418bbc03e88e5e9cd577ecc3afc96107
diff --git a/t/cmd/tokenizer_faulty.pl b/t/cmd/tokenizer_faulty.pl
new file mode 100644
index 0000000..b9f3610
--- /dev/null
+++ b/t/cmd/tokenizer_faulty.pl
@@ -0,0 +1,55 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use FindBin;
+use Encode qw(decode_utf8);
+
+BEGIN {  # runs at compile time, i.e. before the KorAP module below is loaded
+  unshift @INC, "$FindBin::Bin/../../lib";  # prefer the lib/ two levels above this script's directory
+};
+
+use KorAP::XML::TEI::Tokenizer::Aggressive;
+
+$| = 1;  # autoflush the selected output handle so every result line is written immediately
+
+my $state_file = shift @ARGV;  # optional first argument: path of the file holding the crash marker
+my $tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;  # one tokenizer instance, reset after each text
+
+sub _state {  # Return the marker stored in the state file, or 0 if none is configured/readable/set.
+  $state_file or return 0;
+  open(my $in, '<', $state_file) or return 0;  # a missing or unreadable file counts as "no state"
+  my $marker = readline($in) // 0;  # an empty file yields undef
+  close($in);
+  chomp $marker;
+  return $marker ? $marker : 0;  # normalise '' and '0' to numeric 0
+}
+
+sub _set_state {  # Overwrite the state file with $count (no-op without a state file); dies on any I/O error.
+  my $count = shift;
+  return unless $state_file;
+  open(my $fh, '>', $state_file) or die "Can't write state file '$state_file': $!";
+  print {$fh} $count or die "Can't write state file '$state_file': $!";
+  close($fh) or die "Can't close state file '$state_file': $!";  # buffered write errors only surface at close
+}
+
+while (defined(my $raw = <STDIN>)) {  # read and test the same handle; '<>' would switch to files named in leftover @ARGV
+  my $line = decode_utf8($raw);
+  for my $text (split(/\n?\x{04}\n?/, $line)) {  # \x04 (EOT), optionally wrapped in newlines, separates texts
+    next if !defined $text || $text eq '';
+
+    if (index($text, '__CRASH_ONCE__') >= 0 && !_state()) {  # fail only while no marker has been recorded yet
+      _set_state(1);
+      exit 9;
+    }
+
+    if (index($text, '__ALWAYS_CRASH__') >= 0) {  # fail on every occurrence, regardless of state
+      exit 9;
+    }
+
+    $tok->tokenize($text);
+    print join(' ', $tok->boundaries), "\n";  # one space-separated line of boundaries per text
+    $tok->reset;
+  }
+}
+
+1;