Fixed bug in tokenizer to recognize non-word-tokenizations

Change-Id: I4d9d5ffaefc45dc2220c17273dee70e05080137e
diff --git a/t/script/archive.t b/t/script/archive.t
index aee0860..a0b4dd6 100644
--- a/t/script/archive.t
+++ b/t/script/archive.t
@@ -128,6 +128,24 @@
 };
 
 ok(-d $output, 'Ouput directory exists');
+
+
+$input = catfile($f, '..', 'corpus', 'WDD15', 'A79', '83946');
+$call = join(
+  ' ',
+  'perl', $script,
+  '--input' => $input
+);
+
+# Expect a "no base tokenization" message on STDERR for this input
+{
+  local $SIG{__WARN__} = sub {};
+  my $out = stderr_from(sub { system($call); });
+
+  like($out, qr!no base tokenization!s, $call);
+};
+
+
 unlink($output);
 
 done_testing;