Simplify conservative tokenization code Change-Id: I957c21064ef17691b74de3d57361df62197350ef

commit: 3479082d7eb20d3f120a89344667aa59a335c2b9 [log] [tgz]
author: Akron <nils@diewald-online.de> Tue Jul 07 15:32:50 2020 +0200
committer: Peter Harders <harders@ids-mannheim.de> Thu Jul 09 14:07:25 2020 +0200
tree: 2badaeaee75b583d82ffb82c6b46777317e6c52f
parent: 510a88cfddf2901fac4c9353b8dbfb73d4fb7c64 [diff]
diff --git a/lib/KorAP/XML/TEI/Tokenization.pm b/lib/KorAP/XML/TEI/Tokenization.pm
index 5b09cdd..a340471 100644
--- a/lib/KorAP/XML/TEI/Tokenization.pm
+++ b/lib/KorAP/XML/TEI/Tokenization.pm

@@ -38,6 +38,57 @@
 };
 
 
+sub _check_surroundings {
+  my ($txt, $offset, $p1, $p2, $preceeding) = @_;
+  # Returns a flat list of (from, to) pairs, shifted by $offset, for the span from $p1 up to (excluding) $p2 in $txt
+  my $pr;
+  # A span of exactly one character only yields a pair if the surroundings check below succeeds
+  if ($p2 == $p1+1) {
+
+    # Variant for preceding characters (used when $preceeding is true)
+    if ($preceeding) {
+      # Character doesn't start at first position
+      if ($p1 != 0) {
+
+        # Check if the character before the span is not an ASCII letter or digit
+        $pr = ( substr( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]$/ );
+      };
+
+      # No non-alphanumeric character before the span (or the span starts the string)
+      unless ($pr){
+
+        # Check if the character following the span is not an ASCII letter or digit (fails at end of string)
+        $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]$/ );
+      };
+    }
+
+    else {
+      # Check that the char after the span is not an ASCII letter or digit (end of string also passes)
+      $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]?$/ );
+
+      # Otherwise check that the char before the span is not an ASCII letter or digit
+      unless ($pr) {
+        $pr = ( substr ( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]/ );
+      };
+    };
+
+    return () unless $pr;
+
+    # The surroundings check succeeded: return the single-character span
+    return ($p1+$offset, $p2+$offset);  # from and to
+  };
+
+  my @list;
+
+  # Span length differs from one: every position becomes its own one-character pair
+  for (my $i = $p1; $i < $p2; $i++ ){
+    push @list, $i+$offset, $i+1+$offset; # from and to
+  };
+
+  return @list;
+};
+
+
 # Tokenize string "conservatively" and return an array
 # with character boundaries.
 sub conservative {
@@ -45,9 +96,6 @@
   $offset //= 0;
 
   my @tokens;
-  my ($tmp, $p1, $p2, $pr);
-
-  my $i;
 
   # Iterate over the whole string
   while ($txt =~ /(\p{Punct}*)
@@ -56,109 +104,16 @@
                   (?:[ \x{9}\n])?/gx) {
 
     # Punctuation preceding a token
-    if ($1) {
-      ($p1,$p2) = ($-[1], $+[1]);
-
-      # Only a single character
-      if ($p2 == $p1+1) {
-
-        # Character doesn't start and first position
-        if ($p1 != 0) {
-
-          # Check if the prefix is a character
-          $pr = ( substr( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]$/ );
-        }
-
-        # Prefix is empty
-        else {
-          $pr = 0
-        };
-
-        # There is no prefix
-        unless ($pr){
-
-          # Check, if the first character following the special char is a character?
-          $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]$/ );
-        };
-
-        if ($pr){
-          push @tokens, $p1+$offset, $p2+$offset; # from and to
-        };
-
-      } else {
-
-        # Iterate over all single punctuation symbols
-        for ($i = $p1; $i < $p2; $i++) {
-          push @tokens, $i+$offset, $i+1+$offset; # from and to
-        }
-      }
-    };
+    push @tokens, _check_surroundings($txt, $offset, $-[1], $+[1], 1) if $1;
 
     # Token sequence
-    if ($2){
-      push @tokens, $-[2]+$offset, $+[2]+$offset; # from and to
-    };
+    push @tokens, ($-[2]+$offset, $+[2]+$offset) if $2; # from and to
 
     # Punctuation following a token
-    if ($3){
-      ($p1,$p2) = ($-[3], $+[3]);
+    push @tokens, _check_surroundings($txt, $offset, $-[3], $+[3]) if $3;
 
-      # Only a single character
-      if ($p2 == $p1+1){
-
-        # Check the char after the match
-        $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]?$/ );
-
-        # Check the char before the match
-        unless ($pr){
-          $pr = ( substr( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]/ );
-        };
-
-        # Either before or after the char there is a token
-        if ($pr) {
-          push @tokens, $p1+$offset, $p2+$offset; # from and to
-        };
-
-      }
-
-      else {
-
-        # Iterate over all single punctuation symbols
-        for ( $i = $p1; $i < $p2; $i++) {
-          push @tokens, $i+$offset, $i+1+$offset; # from and to
-        };
-      };
-    };
-
-    if ($4) { # special chars after token
-
-      ($p1,$p2) = ($-[4], $+[4]);
-
-      if ($p2 == $p1+1) {
-
-        # Check the char after the match
-        $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]?$/ );
-
-        # Check the char before the match
-        unless ($pr) {
-          $pr = ( substr ( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]/ );
-        };
-
-        # Either before or after the char there is a token
-        if ($pr){
-          push @tokens, $p1+$offset, $p2+$offset;  # from and to
-        };
-
-      }
-
-      else {
-
-        # Iterate over all single punctuation symbols
-        for ( $i = $p1; $i < $p2; $i++ ){
-          push @tokens, $i+$offset, $i+1+$offset; # from and to
-        };
-      };
-    };
+    # Special chars after token
+    push @tokens, _check_surroundings($txt, $offset, $-[4], $+[4]) if $4;
   };
 
   return \@tokens
commit	3479082d7eb20d3f120a89344667aa59a335c2b9	[log] [tgz]
author	Akron <nils@diewald-online.de>	Tue Jul 07 15:32:50 2020 +0200
committer	Peter Harders <harders@ids-mannheim.de>	Thu Jul 09 14:07:25 2020 +0200
tree	2badaeaee75b583d82ffb82c6b46777317e6c52f
parent	510a88cfddf2901fac4c9353b8dbfb73d4fb7c64 [diff]