Faster tokenization of UTF-8 text: match on per-character class codes instead of Unicode properties
Change-Id: I53ebfbf6a54e319dfeb1569b1ac070278059b0dc
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
index 517a901..63a9ce0 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
@@ -8,12 +8,28 @@
# Tokenize string "aggressively" and return an array
# with character boundaries.
sub tokenize {
- my ($self, $txt) = @_;
+ my ($self, $txt_utf8) = @_;
+
+ my $txt;
+
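+ # Faster processing of UTF-8 chars: build a same-length string of one-letter character classes and match the regex below against it (NOTE(review): $txt stays undef for empty input)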
+ foreach my $char (split //, $txt_utf8) {
+ if ($char =~ /\p{Punct}/) {
+ $txt .= "p"
+ } elsif ($char =~ /[^\p{Punct}\s]/) {
+ $txt .= "P"
+ } elsif ($char =~ /\s/) {
+ $txt .= "s"
+ } else {
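+ $txt .= "o" # other: fallback for a character matched by none of the tests above
+ # NOTE(review): a 0 byte does not end up here; it is neither punctuation nor whitespace, so it is encoded as "P" above - confirm which inputs can reach this branch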
+ }
+ };
# Iterate over the whole string
- while ($txt =~ /([^\p{Punct}\s]+)
- (?:(\p{Punct})|\s?)|
- (\p{Punct})/gx){
+ while ($txt =~ /(P+)
+ (?:(p)|s?)|
+ (p)/gx){
# Starts with a character sequence
if (defined $1){
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
index 36cc14a..c89c4fd 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
@@ -8,13 +8,29 @@
# Tokenize string "conservatively" and return an array
# with character boundaries.
sub tokenize {
- my ($self, $txt) = @_;
+ my ($self, $txt_utf8) = @_;
+
+ my $txt;
+
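+ # Faster processing of UTF-8 chars: build a same-length string of one-letter character classes and match the regex below against it (NOTE(review): $txt stays undef for empty input)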
+ foreach my $char (split //, $txt_utf8) {
+ if ($char =~ /\p{Punct}/) {
+ $txt .= "p"
+ } elsif ($char =~ /[^\p{Punct}\s]/) {
+ $txt .= "P"
+ } elsif ($char =~ /\s/) {
+ $txt .= "s"
+ } else {
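+ $txt .= "o" # other: fallback for a character matched by none of the tests above
+ # NOTE(review): a 0 byte does not end up here; it is neither punctuation nor whitespace, so it is encoded as "P" above - confirm which inputs can reach this branch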
+ }
+ };
# Iterate over the whole string
- while ($txt =~ /(\p{Punct}*)
- ([^\p{Punct}\s]+(?:\p{Punct}+[^\p{Punct}\s]+)*)?
- (\p{Punct}*)
- \s?/gx) {
+ while ($txt =~ /(p*)
+ (P+(?:p+P+)*)?
+ (p*)
+ s?/gx) {
# Punctuation preceding a token
$self->_add_surroundings($txt, $-[1], $+[1], 1) if $1;
@@ -43,22 +59,21 @@
if ($preceding) {
$pr = 1; # the first punctuation character should always be tokenized
- # note: this also fixes the bug with '.Der', where '.' was not tokenized (see t/tokenization.t)
# Punctuation character doesn't start at first position
if ($p1 != 0) {
# Check char before punctuation char
- $pr = ( substr( $txt, $p1-1, 1 ) =~ /[\p{Punct}\s]/ );
+ $pr = ( substr( $txt, $p1-1, 1 ) =~ /[ps]/ );
}
}
else {
# Check char after punctuation char
- $pr = ( substr( $txt, $p2, 1 ) =~ /[\p{Punct}\s]?/ ); # the last punctuation character should always be tokenized (signified by the ?)
+ $pr = ( substr( $txt, $p2, 1 ) =~ /[ps]?/ ); # the last punctuation character should always be tokenized (signified by the ?)
# Check char before punctuation char
unless ($pr) {
- $pr = ( substr ( $txt, $p1-1, 1 ) =~ /[\p{Punct}\s]/ );
+ $pr = ( substr ( $txt, $p1-1, 1 ) =~ /[ps]/ );
};
};
diff --git a/t/tokenization.t b/t/tokenization.t
index 42f7840..1d75e5f 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t
@@ -1,10 +1,9 @@
use strict;
use warnings;
-#use open qw(:std :utf8); # see perlunifaq: What is the difference between ":encoding" and ":utf8"?
-use open qw(:std :encoding(UTF-8)); # assume utf-8 encoding (see utf8 in Test::More)
use Test::More;
use File::Basename 'dirname';
use File::Spec::Functions qw/catfile/;
+use open qw(:std :utf8); # assume utf-8 encoding
use FindBin;
BEGIN {
@@ -45,7 +44,6 @@
$cons->reset->tokenize("... Der");
is_deeply($cons, [0,1,1,2,2,3,4,7]);
-# done: '.' is now tokenized
$cons->reset->tokenize(".Der");
is_deeply($cons, [0,1,1,4]);
@@ -64,58 +62,41 @@
$cons->reset->tokenize("Der\talte\nMann");
is_deeply($cons, [0,3,4,8,9,13]);
-
-##### TODO: big wikipedia.txt leads to very slow processing => use smaller test file as temporary solution (see below)
## Test data
-#my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia.txt');
-#my $data = '';
-#
-#ok(open(my $fh, '<' . $dataf), 'Open file wikipedia.txt');
-#while (!eof($fh)) {
-# $data .= <$fh>
-#};
-#
-### DEBUG
-##my @layers = PerlIO::get_layers($fh); # see 'man PerlIO': Querying the layers of filehandles
-##foreach my $l(@layers){print STDERR "DEBUG (filehandle layer): $l\n"};
-#
-#ok(close($fh), 'Close file wikipedia.txt');
-#
-#is(134996, length($data)); # mind that each UTF-8 character counts only once
-#
-## TODO: With then necessary open-pragma (see above), this is extremely slow ... Where's the bottleneck?
-## No performance-issue, when piping 'wikipedia.txt' into a perl one-liner (also not, when using while-loop from Aggressive.pm):
-## cat t/data/wikipedia.txt | perl -ne 'use open qw(:std :utf8); chomp; for($i=0;$i<length;$i++){$c=substr $_,$i,1; print ">$c<\n" if $c=~/\p{Punct}/}' >/dev/null
-## cat t/data/wikipedia.txt | perl -ne 'use open qw(:std :utf8); chomp; while($_=~/([^\p{Punct} \x{9}\n]+)(?:(\p{Punct})|(?:[ \x{9}\n])?)|(\p{Punct})/gx){ print "$1\n" if $1}' >/dev/null
-## note
-## check different output with/without additional UTF-8 layer
-## echo "„Wikipedia-Artikel brauchen Fotos“" | perl -ne 'chomp; for($i=0;$i<length;$i++){$c=substr $_,$i,1; print ">$c<\n" if $c=~/\p{Punct}/}'
-## echo "„Wikipedia-Artikel brauchen Fotos“" | perl -ne 'use open qw(:std :utf8); chomp; for($i=0;$i<length;$i++){$c=substr $_,$i,1; print ">$c<\n" if $c=~/\p{Punct}/}'
-#
-#diag("DEBUG (aggr): Tokenizing Wikipedia Text (134K). Because of an additional PerlIO layer (utf8) on the filehandle, this takes significant more time. Please wait ...\n");
-#$aggr->reset->tokenize($data);
-#is_deeply([@{$aggr}[0..25]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,49,49,50,50,57,58,66,67,72,72,73]);
-#is(47112, scalar(@$aggr));
-#
-#diag("DEBUG (cons): Tokenizing Wikipedia Text (134K). Because of an additional PerlIO layer (utf8) on the filehandle, this takes significant more time. Please wait ...\n");
-#$cons->reset->tokenize($data);
-#is_deeply([@{$cons}[0..21]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73]);
-#is(42412, scalar(@$cons));
-#
+my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia.txt');
+my $data = '';
+
+ok(open(my $fh, '<' . $dataf), 'Open file wikipedia.txt');
+
+while (!eof($fh)) {
+ $data .= <$fh>
+};
+
+ok(close($fh), 'Close file wikipedia.txt');
+
+is(134996, length($data));
+
+$aggr->reset->tokenize($data);
+is_deeply([@{$aggr}[0..25]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,49,49,50,50,57,58,66,67,72,72,73]);
+is(47112, scalar(@$aggr));
+
+$cons->reset->tokenize($data);
+is_deeply([@{$cons}[0..21]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73]);
+is(42412, scalar(@$cons));
+
## check tokenization of 'Community-Ämter aufgestiegen'
## from @{cons}[19518] (=66070) to @{cons}[19519] (=66085) => 'Community-Ämter'
## from @{cons}[19520] (=66086) to @{cons}[19521] (=66098) => 'aufgestiegen'
-#my @vals_got=(66070,66085,66086,66098);
-#my @vals_exp; push @vals_exp, @{$cons}[$_] for(19518,19519,19520,19521);
-#is_deeply([@vals_exp], [@vals_got]);
-##
-##### TODO: use smaller test file as temporary workaround (until problem solved)
+my @vals_got=(66070,66085,66086,66098);
+my @vals_exp; push @vals_exp, @{$cons}[$_] for(19518,19519,19520,19521);
+is_deeply([@vals_exp], [@vals_got]);
+
$cons->reset->tokenize("Community-\xc4mter aufgestiegen");
is_deeply($cons, [0,15,16,28]);
-my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia_small.txt');
-my $data = '';
-ok(open(my $fh, '<' . $dataf), 'Open file wikipedia_small.txt');
+$dataf = catfile(dirname(__FILE__), 'data', 'wikipedia_small.txt');
+$data = '';
+ok(open($fh, '<' . $dataf), 'Open file wikipedia_small.txt');
while (!eof($fh)) {
$data .= <$fh>
};
@@ -128,7 +109,6 @@
$cons->reset->tokenize($data);
is_deeply([@{$cons}[0..21]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73]);
is(302, scalar(@$cons));
-#####
done_testing;
diff --git a/xt/benchmark.pl b/xt/benchmark.pl
index 4bdc255..3407451 100644
--- a/xt/benchmark.pl
+++ b/xt/benchmark.pl
@@ -5,6 +5,7 @@
use File::Basename 'dirname';
use File::Spec::Functions qw/catfile rel2abs/;
use File::Temp 'tempfile';
+use Encode qw!decode!;
use FindBin;
use Getopt::Long;
@@ -59,6 +60,7 @@
my $t_dataf = catfile(dirname(__FILE__), '..', 't', 'data', 'wikipedia.txt');
my $t_data = '';
if ((open(FH, '<' . $t_dataf))) {
+ binmode(FH);
while (!eof(FH)) {
$t_data .= <FH>
};
@@ -68,6 +70,8 @@
die "Unable to load $t_dataf";
};
+my $t_data_utf_8 = decode('utf-8',$t_data);
+
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
@@ -111,12 +115,26 @@
}
),
Dumbbench::Instance::PerlSub->new(
+ name => 'Tokenizer-conservative-utf-8',
+ code => sub {
+ $result = $cons_tok->reset->tokenize($t_data_utf_8);
+ $result = 0;
+ }
+ ),
+ Dumbbench::Instance::PerlSub->new(
name => 'Tokenizer-aggressive',
code => sub {
$result = $aggr_tok->reset->tokenize($t_data);
$result = 0;
}
),
+ Dumbbench::Instance::PerlSub->new(
+ name => 'Tokenizer-aggressive-utf-8',
+ code => sub {
+ $result = $aggr_tok->reset->tokenize($t_data_utf_8);
+ $result = 0;
+ }
+ )
);
# Run benchmarks