Separate dummy tokenization from main script with minimal changes

Change-Id: I74ca0a47ad897ef639c90f8af564d08dd5050c63

commit: eac374d7e4d87f6d73f67b5658c9084dfead2d12 [log] [tgz]
author: Akron <nils@diewald-online.de> Tue Jul 07 09:00:44 2020 +0200
committer: Akron <nils@diewald-online.de> Tue Jul 07 09:43:11 2020 +0200
tree: d092ee684de76f63768e8df3af20aa13510a5a26
parent: 7fab93bf003e4737d0b9b260e73948eac1368add [diff]
diff --git a/script/tei2korapxml b/script/tei2korapxml
index d50dc24..305388b 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -50,6 +50,7 @@
 };
 
 use KorAP::XML::TEI;
+use KorAP::XML::TEI::Tokenization;
 
 our $VERSION = '0.01';
 
@@ -107,7 +108,7 @@
 ##
 
 ## dummy tokenization (only for testing)
-my $_GEN_TOK_DUMMY             = 0;      # use dummy base tokenization for testing (base tokenization is normally done by external tools)
+my $_GEN_TOK_DUMMY             = 1;      # use dummy base tokenization for testing (base tokenization is normally done by external tools)
   my $_tok_file_con            = "tokens_conservative.xml";
   my $_tok_file_agg            = "tokens_aggressive.xml";
   my ( @tok_tokens_con, @tok_tokens_agg, $m1, $m2, $m3, $m4, $tmp, $p1, $p2, $pr, $txt, $offset );
@@ -1092,126 +1093,14 @@
 
         $txt = $e->[1];
 
+
         if ( substr( $txt, 0, 1 ) ne ' ' || substr( $txt, 1, 1) ne ' ' ){ # $txt has at least 2 chars, if it's not empty or equal to ' '
 
+          my $tok = KorAP::XML::TEI::Tokenization::conservative($txt, $offset);
+          push @tok_tokens_con, @$tok;
 
-          # ~ start: conservative tokenization ~
-
-
-          # '\p{Punct}' is equal to the character class '[-!"#%&'()*,./:;?@[\\\]_{}]'
-          while ( $txt =~ /([\p{Punct}]*)([^\p{Punct} \x{9}\n]+(?:([\p{Punct}]+)[^\p{Punct} \x{9}\n]+)*)?([\p{Punct}]*)(?:[ \x{9}\n])?/g ){
-
-            $m1 = $1; $m2 = $2; $m3 = $3; $m4 = $4;
-
-            if ( "$m1" ne "" ){ # special chars before token
-
-              $p1 = $-[1]; $p2 = $+[1];
-
-              #print STDERR "A1: ".$m1." -> from $p1 to $p2\n";
-
-              if ( $p2 == $p1+1 ){
-
-                if ( $p1 != 0 ){ $tmp = substr( $txt, $p1-1, 1 ); $pr = ( $tmp =~ /^[^A-Za-z0-9]/ ) } else { $pr = 0 };
-
-                if ( not $pr ){ $tmp = substr( $txt, $p2, 1 ); $pr = ( $tmp =~ /^[^A-Za-z0-9]/ ) };
-
-                if ( $pr ){ push @tok_tokens_con, $p1+$offset; push @tok_tokens_con, $p2+$offset }; # from and to
-
-              } else {
-
-                for ( $i = 0; $i < ( $p2-$p1 ); $i++ ){
-
-                  #print STDERR "A2: ".substr($m1,$i,1)." -> from $p1 to $p2\n";
-
-                  push @tok_tokens_con, $p1+$i+$offset; push @tok_tokens_con, $p1+$i+1+$offset; # from and to
-                }
-              }
-
-            } # fi: "$m1" ne ""
-
-            #print STDERR "B: "."$m2 -> from ".($-[2]+$offset)." to ".($+[2]+$offset)."\n" if defined $m2;   # token (wordform)
-
-            if ( defined $m2 ){ push @tok_tokens_con, $-[2]+$offset; push @tok_tokens_con, $+[2]+$offset }; # from and to
-
-            if ( defined $m3 ){
-
-              $p1 = $-[3]; $p2 = $+[3];
-
-              #print STDERR "C: ".$m3." -> from $p1 to $p2\n";
-
-              if ( $p2 == $p1+1 ){
-
-                $tmp = substr( $txt, $p2, 1); $pr = ( $tmp =~ /^$/ ); $pr = ( $tmp =~ /^[^A-Za-z0-9]/ ) if not $pr; # char after match
-
-                if ( not $pr ){ $tmp = substr( $txt, $p1-1, 1 ); $pr = ( $tmp =~ /^[^A-Za-z0-9]/ ) }; # char before match
-
-                if ( $pr ){ push @tok_tokens_con, $p1+$offset; push @tok_tokens_con, $p2+$offset }; # from and to
-
-              } else { # length($m3)>1 => print all chars
-
-                for ( $i = 0; $i < ( $p2-$p1 ); $i++ ){
-
-                  #$tmp=substr($m3,$i,1);
-                  #print STDERR "C2: $tmp -> from $p1 to $p2\n";
-
-                  push @tok_tokens_con, $p1+$i+$offset; push @tok_tokens_con, $p1+$i+1+$offset; # from and to
-                }
-
-              }
-
-            } # fi: defined $m3
-
-            if ( "$m4" ne "" ){ # special chars after token
-
-              $p1 = $-[4]; $p2 = $+[4];
-
-              #print STDERR "D1: ".$m4." -> from ".($p1+$offset)." to ".($p2+$offset)."\n";
-
-              if ( $p2 == $p1+1 ){
-
-                $tmp = substr( $txt, $p2, 1 ); $pr = ( $tmp =~ /^$/ ); $pr = ( $tmp =~ /^[^A-Za-z0-9]/ ) if not $pr; # char after match
-
-                if ( not $pr ){ $tmp = substr ( $txt, $p1-1, 1 ); $pr = ( $tmp =~ /^[^A-Za-z0-9]/ ) }; # char before match
-
-                if ( $pr ){ push @tok_tokens_con, $p1+$offset; push @tok_tokens_con, $p2+$offset } # from and to
-
-              }else{
-
-                for ( $i = 0; $i < ( $p2-$p1 ); $i++ ){
-
-                  #print STDERR "D2: ".substr($m4,$i,1)." -> from ".($p1+$i+$offset)." to ".($p1+$i+1+$offset)."\n";
-
-                  push @tok_tokens_con, $p1+$i+$offset; push @tok_tokens_con, $p1+$i+1+$offset; # from and to
-                }
-              }
-
-            }# fi: "$m4" ne ""
-
-          }# end: while
-
-
-          # ~ end: conservative tokenization ~
-
-
-          # ~ start: aggressive tokenization ~
-
-
-          while ( $txt =~ /([^\p{Punct} \x{9}\n]+)(?:([\p{Punct}])|(?:[ \x{9}\n])?)|([\p{Punct}])/g ){
-
-            if ( defined $1 ){
-
-              push @tok_tokens_agg, $-[1]+$offset; push @tok_tokens_agg, $+[1]+$offset; # from and to
-
-              if ( defined $2 ){ push @tok_tokens_agg, $-[2]+$offset; push @tok_tokens_agg, $+[2]+$offset } # from and to
-
-            }else{ # defined $3
-
-              push @tok_tokens_agg, $-[3]+$offset; push @tok_tokens_agg, $+[3]+$offset # from and to
-            }
-
-          } # end: while
-
-          # ~ end: aggressive tokenization ~
+          $tok = KorAP::XML::TEI::Tokenization::aggressive($txt, $offset);
+          push @tok_tokens_agg, @$tok;
 
           ##$offset = $dl+1;
commit	eac374d7e4d87f6d73f67b5658c9084dfead2d12	[log] [tgz]
author	Akron <nils@diewald-online.de>	Tue Jul 07 09:00:44 2020 +0200
committer	Akron <nils@diewald-online.de>	Tue Jul 07 09:43:11 2020 +0200
tree	d092ee684de76f63768e8df3af20aa13510a5a26
parent	7fab93bf003e4737d0b9b260e73948eac1368add [diff]