Faster processing of UTF-8 characters

Change-Id: I53ebfbf6a54e319dfeb1569b1ac070278059b0dc
diff --git a/xt/benchmark.pl b/xt/benchmark.pl
index 4bdc255..3407451 100644
--- a/xt/benchmark.pl
+++ b/xt/benchmark.pl
@@ -5,6 +5,7 @@
 use File::Basename 'dirname';
 use File::Spec::Functions qw/catfile rel2abs/;
 use File::Temp 'tempfile';
+use Encode qw!decode!;
 use FindBin;
 use Getopt::Long;
 
@@ -59,6 +60,7 @@
 my $t_dataf = catfile(dirname(__FILE__), '..', 't', 'data', 'wikipedia.txt');
 my $t_data = '';
 if ((open(FH, '<' . $t_dataf))) {
+  binmode(FH);
   while (!eof(FH)) {
     $t_data .= <FH>
   };
@@ -68,6 +70,8 @@
   die "Unable to load $t_dataf";
 };
 
+my $t_data_utf_8 = decode('utf-8',$t_data);
+
 my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
 my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
 
@@ -111,12 +115,26 @@
     }
   ),
   Dumbbench::Instance::PerlSub->new(
+    name => 'Tokenizer-conservative-utf-8',
+    code => sub {
+      $result = $cons_tok->reset->tokenize($t_data_utf_8);
+      $result = 0;
+    }
+  ),
+  Dumbbench::Instance::PerlSub->new(
     name => 'Tokenizer-aggressive',
     code => sub {
       $result = $aggr_tok->reset->tokenize($t_data);
       $result = 0;
     }
   ),
+  Dumbbench::Instance::PerlSub->new(
+    name => 'Tokenizer-aggressive-utf-8',
+    code => sub {
+      $result = $aggr_tok->reset->tokenize($t_data_utf_8);
+      $result = 0;
+    }
+  )
 );
 
 # Run benchmarks