Speed up processing of UTF-8 characters in the tokenizers

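Both tokenizers repeatedly evaluated the Unicode property classes
\p{Punct} and \s against the full UTF-8 input while matching. Each
input character is now classified exactly once and mapped to a
one-byte code (p = punctuation, P = word character, s = whitespace,
o = other), so that the tokenization regexes run over a plain byte
string. Since the class string contains exactly one byte per input
character, all match offsets still correspond to character offsets
in the original text.

A standalone sketch of the mapping (the sample string and expected
output are illustrative only and not part of the patch):

  use strict;
  use warnings;
  use utf8;

  my $txt_utf8 = "Der älteste Mann.";
  my $txt = '';
  foreach my $char (split //, $txt_utf8) {
    if    ($char =~ /\p{Punct}/)      { $txt .= 'p' }
    elsif ($char =~ /[^\p{Punct}\s]/) { $txt .= 'P' }
    elsif ($char =~ /\s/)             { $txt .= 's' }
    else                              { $txt .= 'o' }
  }
  print "$txt\n"; # prints "PPPsPPPPPPPsPPPPp"

This change also re-enables the full wikipedia.txt tests, which had
been replaced by a smaller test file because of the slow processing,
and adds UTF-8 variants of the tokenizer benchmarks.
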
Change-Id: I53ebfbf6a54e319dfeb1569b1ac070278059b0dc
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
index 517a901..63a9ce0 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
@@ -8,12 +8,30 @@
 # Tokenize string "aggressively" and return an array
 # with character boundaries.
 sub tokenize {
-  my ($self, $txt) = @_;
+  my ($self, $txt_utf8) = @_;
+
+  my $txt;
+
+  # For faster matching, map each character to a one-byte class code
+  foreach my $char (split //, $txt_utf8) {
+    if ($char =~ /\p{Punct}/) {
+      $txt .= "p"
+    } elsif ($char =~ /[^\p{Punct}\s]/) {
+      $txt .= "P"
+    } elsif ($char =~ /\s/) {
+      $txt .= "s"
+    } else {
+      $txt .= "o" # other: should actually only happen for string end (0 byte)
+      # check could be 'ord($char)==0'
+    }
+  };
 
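+  # $txt now holds exactly one class byte per character of $txt_utf8,
+  # so the match offsets below equal character offsets in the original.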
   # Iterate over the whole string
-  while ($txt =~ /([^\p{Punct}\s]+)
-                  (?:(\p{Punct})|\s?)|
-                  (\p{Punct})/gx){
+  while ($txt =~ /(P+)
+                  (?:(p)|s?)|
+                  (p)/gx){
 
     # Starts with a character sequence
     if (defined $1){
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
index 36cc14a..c89c4fd 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
@@ -8,13 +8,31 @@
 # Tokenize string "conservatively" and return an array
 # with character boundaries.
 sub tokenize {
-  my ($self, $txt) = @_;
+  my ($self, $txt_utf8) = @_;
+
+  my $txt;
+
+  # For faster matching, map each character to a one-byte class code
+  foreach my $char (split //, $txt_utf8) {
+    if ($char =~ /\p{Punct}/) {
+      $txt .= "p"
+    } elsif ($char =~ /[^\p{Punct}\s]/) {
+      $txt .= "P"
+    } elsif ($char =~ /\s/) {
+      $txt .= "s"
+    } else {
+      $txt .= "o" # other: should actually only happen for string end (0 byte)
+      # check could be 'ord($char)==0'
+    }
+  };
 
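+  # $txt now holds exactly one class byte per character of $txt_utf8,
+  # so the match offsets below equal character offsets in the original.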
   # Iterate over the whole string
-  while ($txt =~ /(\p{Punct}*)
-                  ([^\p{Punct}\s]+(?:\p{Punct}+[^\p{Punct}\s]+)*)?
-                  (\p{Punct}*)
-                  \s?/gx) {
+  while ($txt =~ /(p*)
+                  (P+(?:p+P+)*)?
+                  (p*)
+                  s?/gx) {
 
     # Punctuation preceding a token
     $self->_add_surroundings($txt, $-[1], $+[1], 1) if $1;
@@ -43,22 +59,21 @@
     if ($preceding) {
 
       $pr = 1; # the first punctuation character should always be tokenized
-      # note: this also fixes the bug with '.Der', where '.' was not tokenized (see t/tokenization.t)
 
       # Punctuation character doesn't start at first position
       if ($p1 != 0) {
         # Check char before punctuation char
-        $pr = ( substr( $txt, $p1-1, 1 ) =~ /[\p{Punct}\s]/ );
+        $pr = ( substr( $txt, $p1-1, 1 ) =~ /[ps]/ );
       }
     }
 
     else {
       # Check char after punctuation char
-      $pr = ( substr( $txt, $p2, 1 ) =~ /[\p{Punct}\s]?/ ); # the last punctuation character should always be tokenized (signified by the ?)
+      $pr = ( substr( $txt, $p2, 1 ) =~ /[ps]?/ ); # the ? makes this always true: the last punctuation character should always be tokenized
 
       # Check char before punctuation char
       unless ($pr) {
-        $pr = ( substr ( $txt, $p1-1, 1 ) =~ /[\p{Punct}\s]/ );
+        $pr = ( substr ( $txt, $p1-1, 1 ) =~ /[ps]/ );
       };
     };
 
diff --git a/t/tokenization.t b/t/tokenization.t
index 42f7840..1d75e5f 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t
@@ -1,10 +1,9 @@
 use strict;
 use warnings;
-#use open qw(:std :utf8); # see perlunifaq: What is the difference between ":encoding" and ":utf8"?
-use open qw(:std :encoding(UTF-8)); # assume utf-8 encoding (see utf8 in Test::More)
 use Test::More;
 use File::Basename 'dirname';
 use File::Spec::Functions qw/catfile/;
+use open qw(:std :utf8); # assume utf-8 encoding
 
 use FindBin;
 BEGIN {
@@ -45,7 +44,6 @@
 $cons->reset->tokenize("... Der");
 is_deeply($cons, [0,1,1,2,2,3,4,7]);
 
-# done: '.' is now tokenized
 $cons->reset->tokenize(".Der");
 is_deeply($cons, [0,1,1,4]);
 
@@ -64,58 +62,41 @@
 $cons->reset->tokenize("Der\talte\nMann");
 is_deeply($cons, [0,3,4,8,9,13]);
 
-
-##### TODO: big wikipedia.txt leads to very slow processing => use smaller test file as temporary solution (see below)
 ## Test data
-#my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia.txt');
-#my $data = '';
-#
-#ok(open(my $fh, '<' . $dataf), 'Open file wikipedia.txt');
-#while (!eof($fh)) {
-#  $data .= <$fh>
-#};
-#
-### DEBUG
-##my @layers = PerlIO::get_layers($fh); # see 'man PerlIO': Querying the layers of filehandles
-##foreach my $l(@layers){print STDERR "DEBUG (filehandle layer): $l\n"};
-#
-#ok(close($fh), 'Close file wikipedia.txt');
-#
-#is(134996, length($data)); # mind that each UTF-8 character counts only once
-#
-## TODO: With then necessary open-pragma (see above), this is extremely slow ... Where's the bottleneck?
-## No performance-issue, when piping 'wikipedia.txt' into a perl one-liner (also not, when using while-loop from Aggressive.pm):
-## cat t/data/wikipedia.txt | perl -ne 'use open qw(:std :utf8); chomp; for($i=0;$i<length;$i++){$c=substr $_,$i,1; print ">$c<\n" if $c=~/\p{Punct}/}' >/dev/null
-## cat t/data/wikipedia.txt | perl -ne 'use open qw(:std :utf8); chomp; while($_=~/([^\p{Punct} \x{9}\n]+)(?:(\p{Punct})|(?:[ \x{9}\n])?)|(\p{Punct})/gx){ print "$1\n" if $1}' >/dev/null
-## note
-## check different output with/without additional UTF-8 layer
-##  echo "„Wikipedia-Artikel brauchen Fotos“" | perl -ne 'chomp; for($i=0;$i<length;$i++){$c=substr $_,$i,1; print ">$c<\n" if $c=~/\p{Punct}/}'
-##  echo "„Wikipedia-Artikel brauchen Fotos“" | perl -ne 'use open qw(:std :utf8); chomp; for($i=0;$i<length;$i++){$c=substr $_,$i,1; print ">$c<\n" if $c=~/\p{Punct}/}'
-#
-#diag("DEBUG (aggr): Tokenizing Wikipedia Text (134K). Because of an additional PerlIO layer (utf8) on the filehandle, this takes significant more time. Please wait ...\n");
-#$aggr->reset->tokenize($data);
-#is_deeply([@{$aggr}[0..25]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,49,49,50,50,57,58,66,67,72,72,73]);
-#is(47112, scalar(@$aggr));
-#
-#diag("DEBUG (cons): Tokenizing Wikipedia Text (134K). Because of an additional PerlIO layer (utf8) on the filehandle, this takes significant more time. Please wait ...\n");
-#$cons->reset->tokenize($data);
-#is_deeply([@{$cons}[0..21]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73]);
-#is(42412, scalar(@$cons));
-#
+my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia.txt');
+my $data = '';
+
+ok(open(my $fh, '<' . $dataf), 'Open file wikipedia.txt');
+
+while (!eof($fh)) {
+  $data .= <$fh>
+};
+
+ok(close($fh), 'Close file wikipedia.txt');
+
+is(134996, length($data));
+
+$aggr->reset->tokenize($data);
+is_deeply([@{$aggr}[0..25]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,49,49,50,50,57,58,66,67,72,72,73]);
+is(47112, scalar(@$aggr));
+
+$cons->reset->tokenize($data);
+is_deeply([@{$cons}[0..21]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73]);
+is(42412, scalar(@$cons));
+
 ## check tokenization of 'Community-Ämter aufgestiegen'
 ##  from @{cons}[19518] (=66070) to @{cons}[19519] (=66085) => 'Community-Ämter'
 ##  from @{cons}[19520] (=66086) to @{cons}[19521] (=66098) => 'aufgestiegen'
-#my @vals_got=(66070,66085,66086,66098);
-#my @vals_exp; push @vals_exp, @{$cons}[$_] for(19518,19519,19520,19521);
-#is_deeply([@vals_exp], [@vals_got]);
-##
-##### TODO: use smaller test file as temporary workaround (until problem solved)
+my @vals_got = (66070, 66085, 66086, 66098);
+my @vals_exp; push @vals_exp, @{$cons}[$_] for (19518, 19519, 19520, 19521);
+is_deeply([@vals_exp], [@vals_got]);
+
 $cons->reset->tokenize("Community-\xc4mter aufgestiegen");
 is_deeply($cons, [0,15,16,28]);
 
-my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia_small.txt');
-my $data = '';
-ok(open(my $fh, '<' . $dataf), 'Open file wikipedia_small.txt');
+$dataf = catfile(dirname(__FILE__), 'data', 'wikipedia_small.txt');
+$data = '';
+ok(open($fh, '<' . $dataf), 'Open file wikipedia_small.txt');
 while (!eof($fh)) {
   $data .= <$fh>
 };
@@ -128,7 +109,6 @@
 $cons->reset->tokenize($data);
 is_deeply([@{$cons}[0..21]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73]);
 is(302, scalar(@$cons));
-#####
 
 
 done_testing;
diff --git a/xt/benchmark.pl b/xt/benchmark.pl
index 4bdc255..3407451 100644
--- a/xt/benchmark.pl
+++ b/xt/benchmark.pl
@@ -5,6 +5,7 @@
 use File::Basename 'dirname';
 use File::Spec::Functions qw/catfile rel2abs/;
 use File::Temp 'tempfile';
+use Encode qw/decode/;
 use FindBin;
 use Getopt::Long;
 
@@ -59,6 +60,7 @@
 my $t_dataf = catfile(dirname(__FILE__), '..', 't', 'data', 'wikipedia.txt');
 my $t_data = '';
 if ((open(FH, '<' . $t_dataf))) {
+  binmode(FH);
   while (!eof(FH)) {
     $t_data .= <FH>
   };
@@ -68,6 +70,8 @@
   die "Unable to load $t_dataf";
 };
 
+my $t_data_utf_8 = decode('utf-8', $t_data);
+
 my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
 my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
 
@@ -111,12 +115,26 @@
     }
   ),
   Dumbbench::Instance::PerlSub->new(
+    name => 'Tokenizer-conservative-utf-8',
+    code => sub {
+      $result = $cons_tok->reset->tokenize($t_data_utf_8);
+      $result = 0;
+    }
+  ),
+  Dumbbench::Instance::PerlSub->new(
     name => 'Tokenizer-aggressive',
     code => sub {
       $result = $aggr_tok->reset->tokenize($t_data);
       $result = 0;
     }
   ),
+  Dumbbench::Instance::PerlSub->new(
+    name => 'Tokenizer-aggressive-utf-8',
+    code => sub {
+      $result = $aggr_tok->reset->tokenize($t_data_utf_8);
+      $result = 0;
+    }
+  )
 );
 
 # Run benchmarks