Simplify conservative tokenization code

Change-Id: I957c21064ef17691b74de3d57361df62197350ef
diff --git a/lib/KorAP/XML/TEI/Tokenization.pm b/lib/KorAP/XML/TEI/Tokenization.pm
index 5b09cdd..a340471 100644
--- a/lib/KorAP/XML/TEI/Tokenization.pm
+++ b/lib/KorAP/XML/TEI/Tokenization.pm
@@ -38,6 +38,57 @@
 };
 
 
+# Map the span $p1..$p2 (end exclusive) of $txt to a flat list of (from, to)
+# positions, each shifted by $offset; the list may be empty. A true
+# $preceeding selects the check for punctuation in front of a token.
+sub _check_surroundings {
+  my ($txt, $offset, $p1, $p2, $preceeding) = @_;
+  my $pr;
+
+  if ($p2 == $p1+1) {
+    # The span is a single character: keep it only if a neighbour qualifies.
+    # Variant for preceding characters
+    if ($preceeding) {
+      # Look backwards only if the span is not at the first position
+      if ($p1 != 0) {
+        # True if the previous character is not an ASCII letter or digit
+        $pr = ( substr( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]$/ );
+      };
+
+      # The previous character did not qualify (or there is none)
+      unless ($pr){
+        # True if a next character exists and is not an ASCII letter or digit
+        $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]$/ );
+      };
+    }
+
+    else {
+      # Next character: empty (end of text) or not an ASCII letter or digit
+      $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]?$/ );
+
+      # Otherwise check the previous character. NOTE(review): for $p1 == 0
+      # substr gets index -1 (the last character) - confirm that cannot happen
+      unless ($pr) {
+        $pr = ( substr ( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]/ );
+      };
+    };
+
+    return () unless $pr;
+
+    # A neighbour qualified: report the single character
+    return ($p1+$offset, $p2+$offset);  # from and to
+  };
+
+  my @list;
+  # Any other span length: one (from, to) pair per character position
+  for (my $i = $p1; $i < $p2; $i++ ){
+    push @list, $i+$offset, $i+1+$offset; # from and to
+  };
+
+  return @list;
+};
+
+
 # Tokenize string "conservatively" and return an array
 # with character boundaries.
 sub conservative {
@@ -45,9 +96,6 @@
   $offset //= 0;
 
   my @tokens;
-  my ($tmp, $p1, $p2, $pr);
-
-  my $i;
 
   # Iterate over the whole string
   while ($txt =~ /(\p{Punct}*)
@@ -56,109 +104,16 @@
                   (?:[ \x{9}\n])?/gx) {
 
     # Punctuation preceding a token
-    if ($1) {
-      ($p1,$p2) = ($-[1], $+[1]);
-
-      # Only a single character
-      if ($p2 == $p1+1) {
-
-        # Character doesn't start and first position
-        if ($p1 != 0) {
-
-          # Check if the prefix is a character
-          $pr = ( substr( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]$/ );
-        }
-
-        # Prefix is empty
-        else {
-          $pr = 0
-        };
-
-        # There is no prefix
-        unless ($pr){
-
-          # Check, if the first character following the special char is a character?
-          $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]$/ );
-        };
-
-        if ($pr){
-          push @tokens, $p1+$offset, $p2+$offset; # from and to
-        };
-
-      } else {
-
-        # Iterate over all single punctuation symbols
-        for ($i = $p1; $i < $p2; $i++) {
-          push @tokens, $i+$offset, $i+1+$offset; # from and to
-        }
-      }
-    };
+    push @tokens, _check_surroundings($txt, $offset, $-[1], $+[1], 1) if $1;
 
     # Token sequence
-    if ($2){
-      push @tokens, $-[2]+$offset, $+[2]+$offset; # from and to
-    };
+    push @tokens, ($-[2]+$offset, $+[2]+$offset) if $2; # from and to
 
     # Punctuation following a token
-    if ($3){
-      ($p1,$p2) = ($-[3], $+[3]);
+    push @tokens, _check_surroundings($txt, $offset, $-[3], $+[3]) if $3;
 
-      # Only a single character
-      if ($p2 == $p1+1){
-
-        # Check the char after the match
-        $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]?$/ );
-
-        # Check the char before the match
-        unless ($pr){
-          $pr = ( substr( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]/ );
-        };
-
-        # Either before or after the char there is a token
-        if ($pr) {
-          push @tokens, $p1+$offset, $p2+$offset; # from and to
-        };
-
-      }
-
-      else {
-
-        # Iterate over all single punctuation symbols
-        for ( $i = $p1; $i < $p2; $i++) {
-          push @tokens, $i+$offset, $i+1+$offset; # from and to
-        };
-      };
-    };
-
-    if ($4) { # special chars after token
-
-      ($p1,$p2) = ($-[4], $+[4]);
-
-      if ($p2 == $p1+1) {
-
-        # Check the char after the match
-        $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]?$/ );
-
-        # Check the char before the match
-        unless ($pr) {
-          $pr = ( substr ( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]/ );
-        };
-
-        # Either before or after the char there is a token
-        if ($pr){
-          push @tokens, $p1+$offset, $p2+$offset;  # from and to
-        };
-
-      }
-
-      else {
-
-        # Iterate over all single punctuation symbols
-        for ( $i = $p1; $i < $p2; $i++ ){
-          push @tokens, $i+$offset, $i+1+$offset; # from and to
-        };
-      };
-    };
+    # Special chars after token
+    push @tokens, _check_surroundings($txt, $offset, $-[4], $+[4]) if $4;
   };
 
   return \@tokens