Improve utf-8-preprocessing for tokenizers

Change-Id: If18ab6d40b98d372f161a377a0d4efd17444fa69

commit: 190d02213a60059aa1f5641c86fd89c5265abd00 [log] [tgz]
author: Akron <nils@diewald-online.de> Sat Jul 25 22:44:33 2020 +0200
committer: Akron <nils@diewald-online.de> Sun Jul 26 09:26:20 2020 +0200
tree: d92d407cb57797a43cca4788d1402dedbedaf6dd
parent: 994aff7addeadc7be57b848e9d8c50bdafb11bd3 [diff]
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
index 63a9ce0..5099eeb 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm

@@ -8,28 +8,18 @@
 # Tokenize string "aggressively" and return an array
 # with character boundaries.
 sub tokenize {
-  my ($self, $txt_utf8) = @_;
+  my ($self, $txt) = @_;
 
-  my $txt;
-
-  # faster processing of UTF8-chars
-  foreach my $char (split //, $txt_utf8) {
-    if ($char =~ /\p{Punct}/) {
-      $txt .= "p"
-    } elsif ($char =~ /[^\p{Punct}\s]/) {
-      $txt .= "P"
-    } elsif ($char =~ /\s/) {
-      $txt .= "s"
-    } else {
-      $txt .= "o" # other: should actually only happen for string end (0 byte)
-      # check could be 'ord($char)==0'
-    }
-  };
+  # Replace MBCs with single bytes
+  $txt =~ s/\p{Punct}/./g;
+  $txt =~ s/\s/~/g;
+  $txt =~ s/[^\.\~]/_/g;
+  utf8::downgrade($txt);
 
   # Iterate over the whole string
-  while ($txt =~ /(P+)
-                  (?:(p)|s?)|
-                  (p)/gx){
+  while ($txt =~ /(_+)
+                  (?:(\.)|\~?)|
+                  (\.)/gx){
 
     # Starts with a character sequence
     if (defined $1){

diff --git a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
index c89c4fd..0efb648 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm

@@ -8,29 +8,19 @@
 # Tokenize string "conservatively" and return an array
 # with character boundaries.
 sub tokenize {
-  my ($self, $txt_utf8) = @_;
+  my ($self, $txt) = @_;
 
-  my $txt;
-
-  # faster processing of UTF8-chars
-  foreach my $char (split //, $txt_utf8) {
-    if ($char =~ /\p{Punct}/) {
-      $txt .= "p"
-    } elsif ($char =~ /[^\p{Punct}\s]/) {
-      $txt .= "P"
-    } elsif ($char =~ /\s/) {
-      $txt .= "s"
-    } else {
-      $txt .= "o" # other: should actually only happen for string end (0 byte)
-      # check could be 'ord($char)==0'
-    }
-  };
+  # Replace MBCs with single bytes
+  $txt =~ s/\p{Punct}/./g;
+  $txt =~ s/\s/~/g;
+  $txt =~ s/[^\.\~]/_/g;
+  utf8::downgrade($txt);
 
   # Iterate over the whole string
-  while ($txt =~ /(p*)
-                  (P+(?:p+P+)*)?
-                  (p*)
-                  s?/gx) {
+  while ($txt =~ /(\.*)
+                  (_+(?:\.+_+)*)?
+                  (\.*)
+                  \~?/gx) {
 
     # Punctuation preceding a token
     $self->_add_surroundings($txt, $-[1], $+[1], 1) if $1;
@@ -54,6 +44,7 @@
   my $pr; # "print" (tokenize) punctuation character (if one of the below tests justified it)
 
   if ($p2 == $p1+1) { # single punctuation character
+    my $char;
 
     # Variant for preceding characters
     if ($preceding) {
@@ -62,18 +53,24 @@
 
       # Punctuation character doesn't start at first position
       if ($p1 != 0) {
+
         # Check char before punctuation char
-        $pr = ( substr( $txt, $p1-1, 1 ) =~ /[ps]/ );
+        $char = substr( $txt, $p1-1, 1 );
+        $pr = ($char eq '.' || $char eq '~') ? 1 : 0;
       }
     }
 
     else {
       # Check char after punctuation char
-      $pr = ( substr( $txt, $p2, 1 ) =~ /[ps]?/ ); # the last punctuation character should always be tokenized (signified by the ?)
+      $char = substr( $txt, $p2, 1 );
+
+      # The last punctuation character should always be tokenized
+      $pr = (!$char || $char eq '.' || $char eq '~') ? 1 : 0;
 
       # Check char before punctuation char
       unless ($pr) {
-        $pr = ( substr ( $txt, $p1-1, 1 ) =~ /[ps]/ );
+        $char = substr ( $txt, $p1-1, 1);
+        $pr = ($char eq '.' || $char eq '~' ) ? 1 : 0;
       };
     };
commit	190d02213a60059aa1f5641c86fd89c5265abd00	[log] [tgz]
author	Akron <nils@diewald-online.de>	Sat Jul 25 22:44:33 2020 +0200
committer	Akron <nils@diewald-online.de>	Sun Jul 26 09:26:20 2020 +0200
tree	d92d407cb57797a43cca4788d1402dedbedaf6dd
parent	994aff7addeadc7be57b848e9d8c50bdafb11bd3 [diff]