Simplify conservative tokenization code
Change-Id: I957c21064ef17691b74de3d57361df62197350ef
diff --git a/lib/KorAP/XML/TEI/Tokenization.pm b/lib/KorAP/XML/TEI/Tokenization.pm
index 5b09cdd..a340471 100644
--- a/lib/KorAP/XML/TEI/Tokenization.pm
+++ b/lib/KorAP/XML/TEI/Tokenization.pm
@@ -38,6 +38,57 @@
};
+sub _check_surroundings {
+ my ($txt, $offset, $p1, $p2, $preceeding) = @_;
+ # Return a flat list of (from, to) positions, each shifted by $offset, for the span from $p1 up to (excluding) $p2 in $txt
+ my $pr;
+ # $pr becomes true if a one-character span has a neighbouring character that is not an ASCII letter or digit
+ if ($p2 == $p1+1) {
+ # Only a single character
+ # Variant for preceding characters (selected by a true $preceeding flag)
+ if ($preceeding) {
+ # Character doesn't start at the first position
+ if ($p1 != 0) {
+
+ # Check if the character before the match is not an ASCII letter or digit
+ $pr = ( substr( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]$/ );
+ };
+
+ # The character before the match did not qualify (or the match starts at position 0)
+ unless ($pr){
+
+ # Check if the character following the match is not an ASCII letter or digit
+ $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]$/ );
+ };
+ }
+
+ else {
+ # Check the char after the match (not an ASCII letter or digit; the trailing "?" also accepts an empty string)
+ $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]?$/ );
+ # NOTE(review): there is no $p1 != 0 guard in this branch; with $p1 == 0 the substr offset below is -1, which counts from the end of $txt - confirm callers never hit this
+ # Check the char before the match (not an ASCII letter or digit)
+ unless ($pr) {
+ $pr = ( substr ( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]/ );
+ };
+ };
+
+ return () unless $pr;
+
+ # A neighbouring character is not alphanumeric: return the single character as one span
+ return ($p1+$offset, $p2+$offset); # from and to
+ };
+
+ my @list;
+
+ # Span is not exactly one character long: emit every position as its own one-character (from, to) pair
+ for (my $i = $p1; $i < $p2; $i++ ){
+ push @list, $i+$offset, $i+1+$offset; # from and to
+ };
+
+ return @list;
+};
+
+
# Tokenize string "conservatively" and return an array
# with character boundaries.
sub conservative {
@@ -45,9 +96,6 @@
$offset //= 0;
my @tokens;
- my ($tmp, $p1, $p2, $pr);
-
- my $i;
# Iterate over the whole string
while ($txt =~ /(\p{Punct}*)
@@ -56,109 +104,16 @@
(?:[ \x{9}\n])?/gx) {
# Punctuation preceding a token
- if ($1) {
- ($p1,$p2) = ($-[1], $+[1]);
-
- # Only a single character
- if ($p2 == $p1+1) {
-
- # Character doesn't start and first position
- if ($p1 != 0) {
-
- # Check if the prefix is a character
- $pr = ( substr( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]$/ );
- }
-
- # Prefix is empty
- else {
- $pr = 0
- };
-
- # There is no prefix
- unless ($pr){
-
- # Check, if the first character following the special char is a character?
- $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]$/ );
- };
-
- if ($pr){
- push @tokens, $p1+$offset, $p2+$offset; # from and to
- };
-
- } else {
-
- # Iterate over all single punctuation symbols
- for ($i = $p1; $i < $p2; $i++) {
- push @tokens, $i+$offset, $i+1+$offset; # from and to
- }
- }
- };
+ push @tokens, _check_surroundings($txt, $offset, $-[1], $+[1], 1) if $1;
# Token sequence
- if ($2){
- push @tokens, $-[2]+$offset, $+[2]+$offset; # from and to
- };
+ push @tokens, ($-[2]+$offset, $+[2]+$offset) if $2; # from and to
# Punctuation following a token
- if ($3){
- ($p1,$p2) = ($-[3], $+[3]);
+ push @tokens, _check_surroundings($txt, $offset, $-[3], $+[3]) if $3;
- # Only a single character
- if ($p2 == $p1+1){
-
- # Check the char after the match
- $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]?$/ );
-
- # Check the char before the match
- unless ($pr){
- $pr = ( substr( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]/ );
- };
-
- # Either before or after the char there is a token
- if ($pr) {
- push @tokens, $p1+$offset, $p2+$offset; # from and to
- };
-
- }
-
- else {
-
- # Iterate over all single punctuation symbols
- for ( $i = $p1; $i < $p2; $i++) {
- push @tokens, $i+$offset, $i+1+$offset; # from and to
- };
- };
- };
-
- if ($4) { # special chars after token
-
- ($p1,$p2) = ($-[4], $+[4]);
-
- if ($p2 == $p1+1) {
-
- # Check the char after the match
- $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]?$/ );
-
- # Check the char before the match
- unless ($pr) {
- $pr = ( substr ( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]/ );
- };
-
- # Either before or after the char there is a token
- if ($pr){
- push @tokens, $p1+$offset, $p2+$offset; # from and to
- };
-
- }
-
- else {
-
- # Iterate over all single punctuation symbols
- for ( $i = $p1; $i < $p2; $i++ ){
- push @tokens, $i+$offset, $i+1+$offset; # from and to
- };
- };
- };
+ # Special chars after token
+ push @tokens, _check_surroundings($txt, $offset, $-[4], $+[4]) if $4;
};
return \@tokens