Separate dummy tokenization from main script with minimal changes
Note: this change also flips the default of $_GEN_TOK_DUMMY from 0 to 1.
Change-Id: I74ca0a47ad897ef639c90f8af564d08dd5050c63
diff --git a/script/tei2korapxml b/script/tei2korapxml
index d50dc24..305388b 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -50,6 +50,7 @@
};
use KorAP::XML::TEI;
+use KorAP::XML::TEI::Tokenization;
our $VERSION = '0.01';
@@ -107,7 +108,7 @@
##
## dummy tokenization (only for testing)
-my $_GEN_TOK_DUMMY = 0; # use dummy base tokenization for testing (base tokenization is normally done by external tools)
+my $_GEN_TOK_DUMMY = 1; # use dummy base tokenization for testing (base tokenization is normally done by external tools)
my $_tok_file_con = "tokens_conservative.xml";
my $_tok_file_agg = "tokens_aggressive.xml";
my ( @tok_tokens_con, @tok_tokens_agg, $m1, $m2, $m3, $m4, $tmp, $p1, $p2, $pr, $txt, $offset );
@@ -1092,126 +1093,14 @@
$txt = $e->[1];
+
if ( substr( $txt, 0, 1 ) ne ' ' || substr( $txt, 1, 1) ne ' ' ){ # $txt has at least 2 chars, if it's not empty or equal to ' '
+ my $tok = KorAP::XML::TEI::Tokenization::conservative($txt, $offset);
+ push @tok_tokens_con, @$tok;
- # ~ start: conservative tokenization ~
-
-
- # '\p{Punct}' is equal to the character class '[-!"#%&'()*,./:;?@[\\\]_{}]'
- while ( $txt =~ /([\p{Punct}]*)([^\p{Punct} \x{9}\n]+(?:([\p{Punct}]+)[^\p{Punct} \x{9}\n]+)*)?([\p{Punct}]*)(?:[ \x{9}\n])?/g ){
-
- $m1 = $1; $m2 = $2; $m3 = $3; $m4 = $4;
-
- if ( "$m1" ne "" ){ # special chars before token
-
- $p1 = $-[1]; $p2 = $+[1];
-
- #print STDERR "A1: ".$m1." -> from $p1 to $p2\n";
-
- if ( $p2 == $p1+1 ){
-
- if ( $p1 != 0 ){ $tmp = substr( $txt, $p1-1, 1 ); $pr = ( $tmp =~ /^[^A-Za-z0-9]/ ) } else { $pr = 0 };
-
- if ( not $pr ){ $tmp = substr( $txt, $p2, 1 ); $pr = ( $tmp =~ /^[^A-Za-z0-9]/ ) };
-
- if ( $pr ){ push @tok_tokens_con, $p1+$offset; push @tok_tokens_con, $p2+$offset }; # from and to
-
- } else {
-
- for ( $i = 0; $i < ( $p2-$p1 ); $i++ ){
-
- #print STDERR "A2: ".substr($m1,$i,1)." -> from $p1 to $p2\n";
-
- push @tok_tokens_con, $p1+$i+$offset; push @tok_tokens_con, $p1+$i+1+$offset; # from and to
- }
- }
-
- } # fi: "$m1" ne ""
-
- #print STDERR "B: "."$m2 -> from ".($-[2]+$offset)." to ".($+[2]+$offset)."\n" if defined $m2; # token (wordform)
-
- if ( defined $m2 ){ push @tok_tokens_con, $-[2]+$offset; push @tok_tokens_con, $+[2]+$offset }; # from and to
-
- if ( defined $m3 ){
-
- $p1 = $-[3]; $p2 = $+[3];
-
- #print STDERR "C: ".$m3." -> from $p1 to $p2\n";
-
- if ( $p2 == $p1+1 ){
-
- $tmp = substr( $txt, $p2, 1); $pr = ( $tmp =~ /^$/ ); $pr = ( $tmp =~ /^[^A-Za-z0-9]/ ) if not $pr; # char after match
-
- if ( not $pr ){ $tmp = substr( $txt, $p1-1, 1 ); $pr = ( $tmp =~ /^[^A-Za-z0-9]/ ) }; # char before match
-
- if ( $pr ){ push @tok_tokens_con, $p1+$offset; push @tok_tokens_con, $p2+$offset }; # from and to
-
- } else { # length($m3)>1 => print all chars
-
- for ( $i = 0; $i < ( $p2-$p1 ); $i++ ){
-
- #$tmp=substr($m3,$i,1);
- #print STDERR "C2: $tmp -> from $p1 to $p2\n";
-
- push @tok_tokens_con, $p1+$i+$offset; push @tok_tokens_con, $p1+$i+1+$offset; # from and to
- }
-
- }
-
- } # fi: defined $m3
-
- if ( "$m4" ne "" ){ # special chars after token
-
- $p1 = $-[4]; $p2 = $+[4];
-
- #print STDERR "D1: ".$m4." -> from ".($p1+$offset)." to ".($p2+$offset)."\n";
-
- if ( $p2 == $p1+1 ){
-
- $tmp = substr( $txt, $p2, 1 ); $pr = ( $tmp =~ /^$/ ); $pr = ( $tmp =~ /^[^A-Za-z0-9]/ ) if not $pr; # char after match
-
- if ( not $pr ){ $tmp = substr ( $txt, $p1-1, 1 ); $pr = ( $tmp =~ /^[^A-Za-z0-9]/ ) }; # char before match
-
- if ( $pr ){ push @tok_tokens_con, $p1+$offset; push @tok_tokens_con, $p2+$offset } # from and to
-
- }else{
-
- for ( $i = 0; $i < ( $p2-$p1 ); $i++ ){
-
- #print STDERR "D2: ".substr($m4,$i,1)." -> from ".($p1+$i+$offset)." to ".($p1+$i+1+$offset)."\n";
-
- push @tok_tokens_con, $p1+$i+$offset; push @tok_tokens_con, $p1+$i+1+$offset; # from and to
- }
- }
-
- }# fi: "$m4" ne ""
-
- }# end: while
-
-
- # ~ end: conservative tokenization ~
-
-
- # ~ start: aggressive tokenization ~
-
-
- while ( $txt =~ /([^\p{Punct} \x{9}\n]+)(?:([\p{Punct}])|(?:[ \x{9}\n])?)|([\p{Punct}])/g ){
-
- if ( defined $1 ){
-
- push @tok_tokens_agg, $-[1]+$offset; push @tok_tokens_agg, $+[1]+$offset; # from and to
-
- if ( defined $2 ){ push @tok_tokens_agg, $-[2]+$offset; push @tok_tokens_agg, $+[2]+$offset } # from and to
-
- }else{ # defined $3
-
- push @tok_tokens_agg, $-[3]+$offset; push @tok_tokens_agg, $+[3]+$offset # from and to
- }
-
- } # end: while
-
- # ~ end: aggressive tokenization ~
+ $tok = KorAP::XML::TEI::Tokenization::aggressive($txt, $offset);
+ push @tok_tokens_agg, @$tok;
##$offset = $dl+1;