faster processing of UTF8-chars

Change-Id: I53ebfbf6a54e319dfeb1569b1ac070278059b0dc

commit: 994aff7addeadc7be57b848e9d8c50bdafb11bd3 [log] [tgz]
author: Peter Harders <harders@ids-mannheim.de> Sat Jul 25 09:53:35 2020 +0200
committer: Akron <nils@diewald-online.de> Sat Jul 25 17:58:27 2020 +0200
tree: 90392a860269ead824f4409bd70475a1488d5b07
parent: 854a11503d3c0e9d22f322c26572d2e032192fc0 [diff]
diff --git a/t/tokenization.t b/t/tokenization.t
index 42f7840..1d75e5f 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t

@@ -1,10 +1,9 @@
 use strict;
 use warnings;
-#use open qw(:std :utf8); # see perlunifaq: What is the difference between ":encoding" and ":utf8"?
-use open qw(:std :encoding(UTF-8)); # assume utf-8 encoding (see utf8 in Test::More)
 use Test::More;
 use File::Basename 'dirname';
 use File::Spec::Functions qw/catfile/;
+use open qw(:std :utf8); # assume utf-8 encoding
 
 use FindBin;
 BEGIN {
@@ -45,7 +44,6 @@
 $cons->reset->tokenize("... Der");
 is_deeply($cons, [0,1,1,2,2,3,4,7]);
 
-# done: '.' is now tokenized
 $cons->reset->tokenize(".Der");
 is_deeply($cons, [0,1,1,4]);
 
@@ -64,58 +62,41 @@
 $cons->reset->tokenize("Der\talte\nMann");
 is_deeply($cons, [0,3,4,8,9,13]);
 
-
-##### TODO: big wikipedia.txt leads to very slow processing => use smaller test file as temporary solution (see below)
 ## Test data
-#my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia.txt');
-#my $data = '';
-#
-#ok(open(my $fh, '<' . $dataf), 'Open file wikipedia.txt');
-#while (!eof($fh)) {
-#  $data .= <$fh>
-#};
-#
-### DEBUG
-##my @layers = PerlIO::get_layers($fh); # see 'man PerlIO': Querying the layers of filehandles
-##foreach my $l(@layers){print STDERR "DEBUG (filehandle layer): $l\n"};
-#
-#ok(close($fh), 'Close file wikipedia.txt');
-#
-#is(134996, length($data)); # mind that each UTF-8 character counts only once
-#
-## TODO: With then necessary open-pragma (see above), this is extremely slow ... Where's the bottleneck?
-## No performance-issue, when piping 'wikipedia.txt' into a perl one-liner (also not, when using while-loop from Aggressive.pm):
-## cat t/data/wikipedia.txt | perl -ne 'use open qw(:std :utf8); chomp; for($i=0;$i<length;$i++){$c=substr $_,$i,1; print ">$c<\n" if $c=~/\p{Punct}/}' >/dev/null
-## cat t/data/wikipedia.txt | perl -ne 'use open qw(:std :utf8); chomp; while($_=~/([^\p{Punct} \x{9}\n]+)(?:(\p{Punct})|(?:[ \x{9}\n])?)|(\p{Punct})/gx){ print "$1\n" if $1}' >/dev/null
-## note
-## check different output with/without additional UTF-8 layer
-##  echo "„Wikipedia-Artikel brauchen Fotos“" | perl -ne 'chomp; for($i=0;$i<length;$i++){$c=substr $_,$i,1; print ">$c<\n" if $c=~/\p{Punct}/}'
-##  echo "„Wikipedia-Artikel brauchen Fotos“" | perl -ne 'use open qw(:std :utf8); chomp; for($i=0;$i<length;$i++){$c=substr $_,$i,1; print ">$c<\n" if $c=~/\p{Punct}/}'
-#
-#diag("DEBUG (aggr): Tokenizing Wikipedia Text (134K). Because of an additional PerlIO layer (utf8) on the filehandle, this takes significant more time. Please wait ...\n");
-#$aggr->reset->tokenize($data);
-#is_deeply([@{$aggr}[0..25]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,49,49,50,50,57,58,66,67,72,72,73]);
-#is(47112, scalar(@$aggr));
-#
-#diag("DEBUG (cons): Tokenizing Wikipedia Text (134K). Because of an additional PerlIO layer (utf8) on the filehandle, this takes significant more time. Please wait ...\n");
-#$cons->reset->tokenize($data);
-#is_deeply([@{$cons}[0..21]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73]);
-#is(42412, scalar(@$cons));
-#
+my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia.txt');
+my $data = '';
+
+ok(open(my $fh, '<' . $dataf), 'Open file wikipedia.txt');
+
+while (!eof($fh)) {
+  $data .= <$fh>
+};
+
+ok(close($fh), 'Close file wikipedia.txt');
+
+is(134996, length($data));
+
+$aggr->reset->tokenize($data);
+is_deeply([@{$aggr}[0..25]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,49,49,50,50,57,58,66,67,72,72,73]);
+is(47112, scalar(@$aggr));
+
+$cons->reset->tokenize($data);
+is_deeply([@{$cons}[0..21]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73]);
+is(42412, scalar(@$cons));
+
 ## check tokenization of 'Community-Ämter aufgestiegen'
 ##  from @{cons}[19518] (=66070) to @{cons}[19519] (=66085) => 'Community-Ämter'
 ##  from @{cons}[19520] (=66086) to @{cons}[19521] (=66098) => 'aufgestiegen'
-#my @vals_got=(66070,66085,66086,66098);
-#my @vals_exp; push @vals_exp, @{$cons}[$_] for(19518,19519,19520,19521);
-#is_deeply([@vals_exp], [@vals_got]);
-##
-##### TODO: use smaller test file as temporary workaround (until problem solved)
+my @vals_got=(66070,66085,66086,66098);
+my @vals_exp; push @vals_exp, @{$cons}[$_] for(19518,19519,19520,19521);
+is_deeply([@vals_exp], [@vals_got]);
+
 $cons->reset->tokenize("Community-\xc4mter aufgestiegen");
 is_deeply($cons, [0,15,16,28]);
 
-my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia_small.txt');
-my $data = '';
-ok(open(my $fh, '<' . $dataf), 'Open file wikipedia_small.txt');
+$dataf = catfile(dirname(__FILE__), 'data', 'wikipedia_small.txt');
+$data = '';
+ok(open($fh, '<' . $dataf), 'Open file wikipedia_small.txt');
 while (!eof($fh)) {
   $data .= <$fh>
 };
@@ -128,7 +109,6 @@
 $cons->reset->tokenize($data);
 is_deeply([@{$cons}[0..21]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73]);
 is(302, scalar(@$cons));
-#####
 
 
 done_testing;
commit	994aff7addeadc7be57b848e9d8c50bdafb11bd3	[log] [tgz]
author	Peter Harders <harders@ids-mannheim.de>	Sat Jul 25 09:53:35 2020 +0200
committer	Akron <nils@diewald-online.de>	Sat Jul 25 17:58:27 2020 +0200
tree	90392a860269ead824f4409bd70475a1488d5b07
parent	854a11503d3c0e9d22f322c26572d2e032192fc0 [diff]