Fix tokenization bugs in Conservative.pm
1. identified wrong tokenization caused by an incorrect pattern match ($3)
(wrote a test in t/tokenization.t that demonstrates the wrong tokenization)
2. removed the incorrect pattern match ($3) and adjusted the test in t/tokenization.t
3. cleaned up the code (also revised some comments)
4. fixed the missing tokenization of the first punctuation character
5. replaced [^A-Za-z0-9] with [\p{Punct}\s] (see the sketch below)
(TODO: no appropriate test found for this yet)
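
Illustration only (not part of the patch): a minimal sketch of why the
character-class swap in step 5 matters. The ASCII class [^A-Za-z0-9]
treats every non-ASCII letter as a boundary character, whereas
[\p{Punct}\s] restricts boundaries to punctuation and whitespace. The
sample character is made up for the demo.

    use strict; use warnings; use utf8;
    my $ch = 'Ä';  # a letter, but outside A-Za-z0-9
    # old class: misclassifies the non-ASCII letter as a boundary
    print 'old: ', ($ch =~ /^[^A-Za-z0-9]$/ ? 'boundary' : 'letter'), "\n";  # boundary (wrong)
    # new class: only punctuation and whitespace count as boundaries
    print 'new: ', ($ch =~ /^[\p{Punct}\s]$/ ? 'boundary' : 'letter'), "\n"; # letter (correct)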
Change-Id: Ib494c79c3e6971a57ad874fc62583c625095cf28
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
index 75b11eb..517a901 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
@@ -11,8 +11,8 @@
my ($self, $txt) = @_;
# Iterate over the whole string
- while ($txt =~ /([^\p{Punct} \x{9}\n]+)
- (?:(\p{Punct})|(?:[ \x{9}\n])?)|
+ while ($txt =~ /([^\p{Punct}\s]+)
+ (?:(\p{Punct})|\s?)|
(\p{Punct})/gx){
# Starts with a character sequence
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
index b3373f5..36cc14a 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
@@ -12,9 +12,9 @@
# Iterate over the whole string
while ($txt =~ /(\p{Punct}*)
- ([^\p{Punct} \x{9}\n]+(?:(\p{Punct}+)[^\p{Punct} \x{9}\n]+)*)?
+ ([^\p{Punct}\s]+(?:\p{Punct}+[^\p{Punct}\s]+)*)?
(\p{Punct}*)
- (?:[ \x{9}\n])?/gx) {
+ \s?/gx) {
# Punctuation preceding a token
$self->_add_surroundings($txt, $-[1], $+[1], 1) if $1;
@@ -24,52 +24,47 @@
# Punctuation following a token
$self->_add_surroundings($txt, $-[3], $+[3]) if $3;
-
- # Special chars after token
- $self->_add_surroundings($txt, $-[4], $+[4]) if $4;
};
return
};
-# Check if surrounding characters are token-worthy
+# Check whether the characters surrounding a punctuation character
+# justify tokenizing it (in that case $pr is set)
sub _add_surroundings {
my ($self, $txt, $p1, $p2, $preceding) = @_;
- my $pr;
+ my $pr; # "print" (tokenize) the punctuation character (set when one of the tests below justifies it)
- if ($p2 == $p1+1) {
+ if ($p2 == $p1+1) { # single punctuation character
# Variant for preceding characters
if ($preceding) {
- # Character doesn't start and first position
+
+ $pr = 1; # the first punctuation character should always be tokenized
+ # note: this also fixes the bug with '.Der', where '.' was not tokenized (see t/tokenization.t)
+
+ # Punctuation character doesn't start at the first position
if ($p1 != 0) {
-
- # Check if there is something to print
- $pr = ( substr( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]$/ );
- };
-
- # There is nothing to print
- unless ($pr){
-
- # Check, if the first character following the special char is a character?
- $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]$/ );
- };
+ # Check char before punctuation char
+ $pr = ( substr( $txt, $p1-1, 1 ) =~ /[\p{Punct}\s]/ );
+ }
}
else {
- # Check the char after the match
- $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]?$/ );
+ # Check char after punctuation char
+ $pr = ( substr( $txt, $p2, 1 ) =~ /[\p{Punct}\s]?/ ); # the last punctuation character should always be tokenized (signified by the ?)
- # Check the char before the match
+ # Check char before punctuation char
unless ($pr) {
- $pr = ( substr ( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]/ );
+ $pr = ( substr ( $txt, $p1-1, 1 ) =~ /[\p{Punct}\s]/ );
};
};
- # Either before or after the char there is a token
+ # tokenize the punctuation character (justified by one of the checks above)
push @$self, ($p1, $p2) if $pr; # from and to
+
return;
};
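
Side note for review (not part of the patch): a standalone sketch of the
new Conservative.pm match loop, using the committed regex on a made-up
sample string. The real module records offsets via _add_surroundings;
this demo only prints the captures and their positions.

    use strict; use warnings;
    my $txt = ".Der alte Mann...";
    while ($txt =~ /(\p{Punct}*)
                    ([^\p{Punct}\s]+(?:\p{Punct}+[^\p{Punct}\s]+)*)?
                    (\p{Punct}*)
                    \s?/gx) {
        printf "punct before <%s> at %d-%d\n", $1, $-[1], $+[1] if $1;  # '.' before 'Der'
        printf "token        <%s> at %d-%d\n", $2, $-[2], $+[2] if $2;  # 'Der', 'alte', 'Mann'
        printf "punct after  <%s> at %d-%d\n", $3, $-[3], $+[3] if $3;  # '...' after 'Mann'
    }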
diff --git a/t/data/wikipedia_small.txt b/t/data/wikipedia_small.txt
new file mode 100644
index 0000000..384bdf9
--- /dev/null
+++ b/t/data/wikipedia_small.txt
@@ -0,0 +1,25 @@
+
+Banner logo
+
+Mach mit beim Wettbewerb „Wikipedia-Artikel brauchen Fotos“ und gewinne einen Preis.
+Ausblenden
+Dieser Artikel existiert auch als Audiodatei.
+Wikipedia
+Zur Navigation springen
+Zur Suche springen
+Dieser Artikel behandelt die freie Online-Enzyklopädie Wikipedia. Für die deutschsprachige Ausgabe siehe Deutschsprachige Wikipedia, für den gleichnamigen Asteroiden siehe (274301) Wikipedia.
+Wikipedia
+Eine aus Puzzleteilen zusammengesetzte, oben noch nicht ganz geschlossene, hellgraue Kugel; die Puzzleteile tragen Schriftzeichen in verschiedenen Sprachen
+Logo der Wikipedia
+www.wikipedia.org (Startseite)
+de.wikipedia.org (deutschsprachige Version)
+Motto Die freie Enzyklopädie
+Beschreibung Wiki einer freien, kollektiv erstellten Online-Enzyklopädie
+Registrierung optional
+Sprachen 294 aktive und 10 geschlossene Sprachversionen[1]
+Eigentümer Wikimedia Foundation
+Urheber angemeldete und nicht angemeldete Autoren
+Erschienen 15. Januar 2001
+Artikel Über 49,3 Millionen (Stand: Januar 2019)[2]
+
+davon deutschsprachig: de.wikipedia.org 2.452.974 (aktuell zum Zeitpunkt des Seitenaufrufs im Browser)[3]
diff --git a/t/tokenization.t b/t/tokenization.t
index 503eeee..42f7840 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t
@@ -45,51 +45,90 @@
$cons->reset->tokenize("... Der");
is_deeply($cons, [0,1,1,2,2,3,4,7]);
-# TODO:
-# bug: '.' is not tokenized
+# done: '.' is now tokenized
$cons->reset->tokenize(".Der");
-is_deeply($cons, [1,4]);
+is_deeply($cons, [0,1,1,4]);
$cons->reset->tokenize(".Der.... ");
-is_deeply($cons, [1,4,4,5,5,6,6,7,7,8]);
+is_deeply($cons, [0,1,1,4,4,5,5,6,6,7,7,8]);
$cons->reset->tokenize("..Der.... ");
is_deeply($cons, [0,1,1,2,2,5,5,6,6,7,7,8,8,9]);
-# Test data
-my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia.txt');
-my $data = '';
+$cons->reset->tokenize(". Der.... ");
+is_deeply($cons, [0,1,2,5,5,6,6,7,7,8,8,9]);
-ok(open(my $fh, '<' . $dataf), 'Open file');
+$cons->reset->tokenize(". .Der.... ");
+is_deeply($cons, [0,1,2,3,3,6,6,7,7,8,8,9,9,10]);
+
+$cons->reset->tokenize("Der\talte\nMann");
+is_deeply($cons, [0,3,4,8,9,13]);
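+# (each token is recorded as a pair of character offsets, from and to:
+#  here 0-3 = 'Der', 4-8 = 'alte', 9-13 = 'Mann')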
+
+
+##### TODO: the big wikipedia.txt leads to very slow processing => use a smaller test file as a temporary solution (see below)
+## Test data
+#my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia.txt');
+#my $data = '';
+#
+#ok(open(my $fh, '<' . $dataf), 'Open file wikipedia.txt');
+#while (!eof($fh)) {
+# $data .= <$fh>
+#};
+#
+### DEBUG
+##my @layers = PerlIO::get_layers($fh); # see 'man PerlIO': Querying the layers of filehandles
+##foreach my $l(@layers){print STDERR "DEBUG (filehandle layer): $l\n"};
+#
+#ok(close($fh), 'Close file wikipedia.txt');
+#
+#is(134996, length($data)); # note that each UTF-8 character counts only once
+#
+## TODO: With the then-necessary open-pragma (see above), this is extremely slow ... Where's the bottleneck?
+## There is no performance issue when piping 'wikipedia.txt' into a perl one-liner (nor when using the while-loop from Aggressive.pm):
+## cat t/data/wikipedia.txt | perl -ne 'use open qw(:std :utf8); chomp; for($i=0;$i<length;$i++){$c=substr $_,$i,1; print ">$c<\n" if $c=~/\p{Punct}/}' >/dev/null
+## cat t/data/wikipedia.txt | perl -ne 'use open qw(:std :utf8); chomp; while($_=~/([^\p{Punct} \x{9}\n]+)(?:(\p{Punct})|(?:[ \x{9}\n])?)|(\p{Punct})/gx){ print "$1\n" if $1}' >/dev/null
+## note
+## check different output with/without additional UTF-8 layer
+## echo "„Wikipedia-Artikel brauchen Fotos“" | perl -ne 'chomp; for($i=0;$i<length;$i++){$c=substr $_,$i,1; print ">$c<\n" if $c=~/\p{Punct}/}'
+## echo "„Wikipedia-Artikel brauchen Fotos“" | perl -ne 'use open qw(:std :utf8); chomp; for($i=0;$i<length;$i++){$c=substr $_,$i,1; print ">$c<\n" if $c=~/\p{Punct}/}'
+#
+#diag("DEBUG (aggr): Tokenizing Wikipedia Text (134K). Because of an additional PerlIO layer (utf8) on the filehandle, this takes significant more time. Please wait ...\n");
+#$aggr->reset->tokenize($data);
+#is_deeply([@{$aggr}[0..25]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,49,49,50,50,57,58,66,67,72,72,73]);
+#is(47112, scalar(@$aggr));
+#
+#diag("DEBUG (cons): Tokenizing Wikipedia Text (134K). Because of an additional PerlIO layer (utf8) on the filehandle, this takes significant more time. Please wait ...\n");
+#$cons->reset->tokenize($data);
+#is_deeply([@{$cons}[0..21]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73]);
+#is(42412, scalar(@$cons));
+#
+## check tokenization of 'Community-Ämter aufgestiegen'
+## from @{cons}[19518] (=66070) to @{cons}[19519] (=66085) => 'Community-Ämter'
+## from @{cons}[19520] (=66086) to @{cons}[19521] (=66098) => 'aufgestiegen'
+#my @vals_got=(66070,66085,66086,66098);
+#my @vals_exp; push @vals_exp, @{$cons}[$_] for(19518,19519,19520,19521);
+#is_deeply([@vals_exp], [@vals_got]);
+##
+##### TODO: use a smaller test file as a temporary workaround (until the problem is solved)
+$cons->reset->tokenize("Community-\xc4mter aufgestiegen");
+is_deeply($cons, [0,15,16,28]);
+
+my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia_small.txt');
+my $data = '';
+ok(open(my $fh, '<' . $dataf), 'Open file wikipedia_small.txt');
while (!eof($fh)) {
$data .= <$fh>
};
+ok(close($fh), 'Close file wikipedia_small.txt');
-## DEBUG
-#my @layers = PerlIO::get_layers($fh); # see 'man PerlIO': Querying the layers of filehandles
-#foreach my $l(@layers){print STDERR "DEBUG (filehandle layer): $l\n"};
-
-ok(close($fh), 'Close file');
-
-is(134996, length($data)); # mind that each UTF-8 character counts only once
-
-## note
-# check different output with/without additional UTF-8 layer
-# echo "„Wikipedia-Artikel brauchen Fotos“" | perl -ne 'chomp; for($i=0;$i<length;$i++){$c=substr $_,$i,1; print ">$c<\n" if $c=~/\p{Punct}/}'
-# echo "„Wikipedia-Artikel brauchen Fotos“" | perl -ne 'use open qw(:std :utf8); chomp; for($i=0;$i<length;$i++){$c=substr $_,$i,1; print ">$c<\n" if $c=~/\p{Punct}/}'
-
-# TODO: With then necessary open-pragma (see above), this is extremely slow ... Where's the bottleneck?
-# No performance-issue, when piping 'wikipedia.txt' into a perl one-liner (also not, when using while-loop from Aggressive.pm):
-# cat t/data/wikipedia.txt | perl -ne 'use open qw(:std :utf8); chomp; for($i=0;$i<length;$i++){$c=substr $_,$i,1; print ">$c<\n" if $c=~/\p{Punct}/}' >/dev/null
-# cat t/data/wikipedia.txt | perl -ne 'use open qw(:std :utf8); chomp; while($_=~/([^\p{Punct} \x{9}\n]+)(?:(\p{Punct})|(?:[ \x{9}\n])?)|(\p{Punct})/gx){ print "$1\n" if $1}' >/dev/null
-diag("DEBUG: Tokenizing Wikipedia Text (134K). Because of an additional PerlIO layer (utf8) on the filehandle, this takes significant more time. Please wait ...\n");
$aggr->reset->tokenize($data);
is_deeply([@{$aggr}[0..25]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,49,49,50,50,57,58,66,67,72,72,73]);
-is(47112, scalar(@$aggr));
+is(366, scalar(@$aggr));
-diag("DEBUG: Tokenizing Wikipedia Text (134K). Because of an additional PerlIO layer (utf8) on the filehandle, this takes significant more time. Please wait ...\n");
$cons->reset->tokenize($data);
is_deeply([@{$cons}[0..21]], [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73]);
-is(43218, scalar(@$cons));
+is(302, scalar(@$cons));
+#####
+
done_testing;
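
To exercise the adjusted assertions locally (standard Perl test tooling,
assuming a checkout of the repository with lib/ and t/ in place):

    prove -l t/tokenization.t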