Fix tokenization bugs in Conservative.pm
1. identified wrong tokenization caused by an incorrect pattern match ($3)
(wrote a test in t/tokenization.t that demonstrates the wrong tokenization)
2. removed the incorrect pattern match ($3) and adjusted the test in t/tokenization.t
3. cleaned up the code (also revised some comments)
4. fixed the missing tokenization of the first punctuation character
5. replaced [^A-Za-z0-9] with [\p{Punct}\s] (see the sketch below)
(TODO: no appropriate test found for this yet)
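
Illustration only (not part of the patch): a minimal sketch of why the
character-class swap in step 5 matters. The ASCII class [^A-Za-z0-9]
treats every non-ASCII letter as a boundary character, whereas
[\p{Punct}\s] restricts boundaries to punctuation and whitespace. The
sample character is made up for the demo.

    use strict; use warnings; use utf8;
    my $ch = 'Ä';  # a letter, but outside A-Za-z0-9
    # old class: misclassifies the non-ASCII letter as a boundary
    print 'old: ', ($ch =~ /^[^A-Za-z0-9]$/ ? 'boundary' : 'letter'), "\n";  # boundary (wrong)
    # new class: only punctuation and whitespace count as boundaries
    print 'new: ', ($ch =~ /^[\p{Punct}\s]$/ ? 'boundary' : 'letter'), "\n"; # letter (correct)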
Change-Id: Ib494c79c3e6971a57ad874fc62583c625095cf28
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
index 75b11eb..517a901 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
@@ -11,8 +11,8 @@
my ($self, $txt) = @_;
# Iterate over the whole string
- while ($txt =~ /([^\p{Punct} \x{9}\n]+)
- (?:(\p{Punct})|(?:[ \x{9}\n])?)|
+ while ($txt =~ /([^\p{Punct}\s]+)
+ (?:(\p{Punct})|\s?)|
(\p{Punct})/gx){
# Starts with a character sequence
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
index b3373f5..36cc14a 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
@@ -12,9 +12,9 @@
# Iterate over the whole string
while ($txt =~ /(\p{Punct}*)
- ([^\p{Punct} \x{9}\n]+(?:(\p{Punct}+)[^\p{Punct} \x{9}\n]+)*)?
+ ([^\p{Punct}\s]+(?:\p{Punct}+[^\p{Punct}\s]+)*)?
(\p{Punct}*)
- (?:[ \x{9}\n])?/gx) {
+ \s?/gx) {
# Punctuation preceding a token
$self->_add_surroundings($txt, $-[1], $+[1], 1) if $1;
@@ -24,52 +24,47 @@
# Punctuation following a token
$self->_add_surroundings($txt, $-[3], $+[3]) if $3;
-
- # Special chars after token
- $self->_add_surroundings($txt, $-[4], $+[4]) if $4;
};
return
};
-# Check if surrounding characters are token-worthy
+# Check whether the characters surrounding a punctuation character
+# justify tokenizing it (in that case $pr is set)
sub _add_surroundings {
my ($self, $txt, $p1, $p2, $preceding) = @_;
- my $pr;
+ my $pr; # "print" (tokenize) the punctuation character (set when one of the tests below justifies it)
- if ($p2 == $p1+1) {
+ if ($p2 == $p1+1) { # single punctuation character
# Variant for preceding characters
if ($preceding) {
- # Character doesn't start and first position
+
+ $pr = 1; # the first punctuation character should always be tokenized
+ # note: this also fixes the bug with '.Der', where '.' was not tokenized (see t/tokenization.t)
+
+ # Punctuation character doesn't start at the first position
if ($p1 != 0) {
-
- # Check if there is something to print
- $pr = ( substr( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]$/ );
- };
-
- # There is nothing to print
- unless ($pr){
-
- # Check, if the first character following the special char is a character?
- $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]$/ );
- };
+ # Check char before punctuation char
+ $pr = ( substr( $txt, $p1-1, 1 ) =~ /[\p{Punct}\s]/ );
+ }
}
else {
- # Check the char after the match
- $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]?$/ );
+ # Check char after punctuation char
+ $pr = ( substr( $txt, $p2, 1 ) =~ /[\p{Punct}\s]?/ ); # the last punctuation character should always be tokenized (signified by the ?)
- # Check the char before the match
+ # Check char before punctuation char
unless ($pr) {
- $pr = ( substr ( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]/ );
+ $pr = ( substr ( $txt, $p1-1, 1 ) =~ /[\p{Punct}\s]/ );
};
};
- # Either before or after the char there is a token
+ # tokenize the punctuation character (justified by one of the checks above)
push @$self, ($p1, $p2) if $pr; # from and to
+
return;
};
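
Side note for review (not part of the patch): a standalone sketch of the
new Conservative.pm match loop, using the committed regex on a made-up
sample string. The real module records offsets via _add_surroundings;
this demo only prints the captures and their positions.

    use strict; use warnings;
    my $txt = ".Der alte Mann...";
    while ($txt =~ /(\p{Punct}*)
                    ([^\p{Punct}\s]+(?:\p{Punct}+[^\p{Punct}\s]+)*)?
                    (\p{Punct}*)
                    \s?/gx) {
        printf "punct before <%s> at %d-%d\n", $1, $-[1], $+[1] if $1;  # '.' before 'Der'
        printf "token        <%s> at %d-%d\n", $2, $-[2], $+[2] if $2;  # 'Der', 'alte', 'Mann'
        printf "punct after  <%s> at %d-%d\n", $3, $-[3], $+[3] if $3;  # '...' after 'Mann'
    }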
diff --git a/t/data/wikipedia_small.txt b/t/data/wikipedia_small.txt
new file mode 100644
index 0000000..384bdf9
--- /dev/null
+++ b/t/data/wikipedia_small.txt
@@ -0,0 +1,25 @@
+
+Banner logo
+
+Mach mit beim Wettbewerb „Wikipedia-Artikel brauchen Fotos“ und gewinne einen Preis.
+Ausblenden
+Dieser Artikel existiert auch als Audiodatei.
+Wikipedia
+Zur Navigation springen
+Zur Suche springen
+Dieser Artikel behandelt die freie Online-Enzyklopädie Wikipedia. Für die deutschsprachige Ausgabe siehe Deutschsprachige Wikipedia, für den gleichnamigen Asteroiden siehe (274301) Wikipedia.
+Wikipedia
+Eine aus Puzzleteilen zusammengesetzte, oben noch nicht ganz geschlossene, hellgraue Kugel; die Puzzleteile tragen Schriftzeichen in verschiedenen Sprachen
+Logo der Wikipedia
+www.wikipedia.org (Startseite)
+de.wikipedia.org (deutschsprachige Version)
+Motto Die freie Enzyklopädie
+Beschreibung Wiki einer freien, kollektiv erstellten Online-Enzyklopädie
+Registrierung optional
+Sprachen 294 aktive und 10 geschlossene Sprachversionen[1]
+Eigentümer Wikimedia Foundation
+Urheber angemeldete und nicht angemeldete Autoren
+Erschienen 15. Januar 2001
+Artikel Über 49,3 Millionen (Stand: Januar 2019)[2]
+
+davon deutschsprachig: de.wikipedia.org 2.452.974 (aktuell zum Zeitpunkt des Seitenaufrufs im Browser)[3]
diff --git a/t/tokenization.t b/t/tokenization.t
index 503eeee..42f7840 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t
@@ -45,51 +45,90 @@
$cons->reset->tokenize("... Der");
is_deeply($cons, [0,1,1,2,2,3,4,7]);
-# TODO:
-# bug: '.' is not tokenized
+# done: '.' is now tokenized
$cons->reset->tokenize(".Der");
-is_deeply($cons, [1,4]);
+is_deeply($cons, [0,1,1,4]);
$cons->reset->tokenize(".Der.... ");
-is_deeply($cons, [1,4,4,5,5,6,6,7,7,8]);
+is_deeply($cons, [0,1,1,4,4,5,5,6,6,7,7,8]);
$cons->reset->tokenize("..Der.... ");
is_deeply($cons, [0,1,1,2,2,5,5,6,6,7,7,8,8,9]);
-# Test data
-my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia.txt');
-my $data = '';
+$cons->reset->tokenize(". Der.... ");
+is_deeply($cons, [0,1,2,5,5,6,6,7,7,8,8,9]);
-ok(open(my $fh, '<' . $dataf), 'Open file');
+$cons->reset->tokenize(". .Der.... ");
+is_deeply($cons, [0,1,2,3,3,6,6,7,7,8,8,9,9,10]);
+
+$cons->reset->tokenize("Der\talte\nMann");
+is_deeply($cons, [0,3,4,8,9,13]);
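+# (each token is recorded as a pair of character offsets, from and to:
+#  here 0-3 = 'Der', 4-8 = 'alte', 9-13 = 'Mann')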
+
+
+##### TODO: the big wikipedia.txt leads to very slow processing => use a smaller test file as a temporary solution (see below)
+## Test data
+#my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia.txt');
+#my $data = '';
+#
+#ok(open(my $fh, '<' . $dataf), 'Open file wikipedia.txt');
+#while (!eof($fh)) {
+# $data .= <$fh>
+#};
+#
+### DEBUG
+##my @layers = PerlIO::get_layers($fh); # see 'man PerlIO': Querying the layers of filehandles
+##foreach my $l(@layers){print STDERR "DEBUG (filehandle layer): $l\n"};
+#
+#ok(close($fh), 'Close file wikipedia.txt');
+#
+#is(134996, length($data)); # note that each UTF-8 character counts only once
+#
+## TODO: With the then-necessary open-pragma (see above), this is extremely slow ... Where's the bottleneck?
+## There is no performance issue when piping 'wikipedia.txt' into a perl one-liner (nor when using the while-loop from Aggressive.pm):
+## cat t/data/wikipedia.txt | perl -ne 'use open qw(:std :utf8); chomp; for($i=0;$i<length;$i++){$c=substr $_,$i,1; print ">$c<\n" if $c=~/\p{Punct}/}' >/dev/null
+## cat t/data/wikipedia.txt | perl -ne 'use open qw(:std :utf8); chomp; while($_=~/([^\p{Punct} \x{9}\n]+)(?:(\p{Punct})|(?:[ \x{9}\n])?)|(\p{Punct})/gx){ print "$1\n" if $1}' >/dev/null
+## note
+## check different output with/without additional UTF-8 layer
+## echo "„Wikipedia-Artikel brauchen Fotos“" | perl -ne 'chomp; for($i=0;$i<length;$i++){$c=substr $_,$i,1; print ">$c<\n" if $c=~/\p{Punct}/}'
+## echo "„Wikipedia-Artikel brauchen Fotos“" | perl -ne 'use open qw(:std :utf8); chomp; for($i=0;$i<length;$i++){$c=substr $_,$i,1; print ">$c<\n" if $c=~/\p{Punct}/}'
+#
+#diag("DEBUG (aggr): Tokenizing Wikipedia Text (134K). Because of an additional PerlIO layer (utf8) on the filehandle, this takes significant more time. Please wait ...\n");
+#$aggr->reset->tokenize($data);
+#is_deeply([@{$aggr}[0..25]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,49,49,50,50,57,58,66,67,72,72,73]);
+#is(47112, scalar(@$aggr));
+#
+#diag("DEBUG (cons): Tokenizing Wikipedia Text (134K). Because of an additional PerlIO layer (utf8) on the filehandle, this takes significant more time. Please wait ...\n");
+#$cons->reset->tokenize($data);
+#is_deeply([@{$cons}[0..21]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73]);
+#is(42412, scalar(@$cons));
+#
+## check tokenization of 'Community-Ämter aufgestiegen'
+## from @{cons}[19518] (=66070) to @{cons}[19519] (=66085) => 'Community-Ämter'
+## from @{cons}[19520] (=66086) to @{cons}[19521] (=66098) => 'aufgestiegen'
+#my @vals_got=(66070,66085,66086,66098);
+#my @vals_exp; push @vals_exp, @{$cons}[$_] for(19518,19519,19520,19521);
+#is_deeply([@vals_exp], [@vals_got]);
+##
+##### TODO: use a smaller test file as a temporary workaround (until the problem is solved)
+$cons->reset->tokenize("Community-\xc4mter aufgestiegen");
+is_deeply($cons, [0,15,16,28]);
+
+my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia_small.txt');
+my $data = '';
+ok(open(my $fh, '<' . $dataf), 'Open file wikipedia_small.txt');
while (!eof($fh)) {
$data .= <$fh>
};
+ok(close($fh), 'Close file wikipedia_small.txt');
-## DEBUG
-#my @layers = PerlIO::get_layers($fh); # see 'man PerlIO': Querying the layers of filehandles
-#foreach my $l(@layers){print STDERR "DEBUG (filehandle layer): $l\n"};
-
-ok(close($fh), 'Close file');
-
-is(134996, length($data)); # mind that each UTF-8 character counts only once
-
-## note
-# check different output with/without additional UTF-8 layer
-# echo "„Wikipedia-Artikel brauchen Fotos“" | perl -ne 'chomp; for($i=0;$i<length;$i++){$c=substr $_,$i,1; print ">$c<\n" if $c=~/\p{Punct}/}'
-# echo "„Wikipedia-Artikel brauchen Fotos“" | perl -ne 'use open qw(:std :utf8); chomp; for($i=0;$i<length;$i++){$c=substr $_,$i,1; print ">$c<\n" if $c=~/\p{Punct}/}'
-
-# TODO: With then necessary open-pragma (see above), this is extremely slow ... Where's the bottleneck?
-# No performance-issue, when piping 'wikipedia.txt' into a perl one-liner (also not, when using while-loop from Aggressive.pm):
-# cat t/data/wikipedia.txt | perl -ne 'use open qw(:std :utf8); chomp; for($i=0;$i<length;$i++){$c=substr $_,$i,1; print ">$c<\n" if $c=~/\p{Punct}/}' >/dev/null
-# cat t/data/wikipedia.txt | perl -ne 'use open qw(:std :utf8); chomp; while($_=~/([^\p{Punct} \x{9}\n]+)(?:(\p{Punct})|(?:[ \x{9}\n])?)|(\p{Punct})/gx){ print "$1\n" if $1}' >/dev/null
-diag("DEBUG: Tokenizing Wikipedia Text (134K). Because of an additional PerlIO layer (utf8) on the filehandle, this takes significant more time. Please wait ...\n");
$aggr->reset->tokenize($data);
is_deeply([@{$aggr}[0..25]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,49,49,50,50,57,58,66,67,72,72,73]);
-is(47112, scalar(@$aggr));
+is(366, scalar(@$aggr));
-diag("DEBUG: Tokenizing Wikipedia Text (134K). Because of an additional PerlIO layer (utf8) on the filehandle, this takes significant more time. Please wait ...\n");
$cons->reset->tokenize($data);
is_deeply([@{$cons}[0..21]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73]);
-is(43218, scalar(@$cons));
+is(302, scalar(@$cons));
+#####
+
done_testing;
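
To exercise the adjusted assertions locally (standard Perl test tooling,
assuming a checkout of the repository with lib/ and t/ in place):

    prove -l t/tokenization.t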