Clean up internal tokenization: moved the internal tokenization out of retr_info(); removed the $offset parameter (no longer necessary). Change-Id: I063efdee193ab41c2705971a95341573884fd3a2

commit: b122717d9a9015ba486eb8503015cfefdb54f1e4 [log] [tgz]
author: Peter Harders <harders@ids-mannheim.de> Tue Jul 21 02:12:10 2020 +0200
committer: Akron <nils@diewald-online.de> Tue Jul 21 19:40:11 2020 +0200
tree: 4dda245d8f2cc516271220a5581aa54dc9f4d522
parent: 4c6ff5b08659b889aeff9cb7ed77eb968d94cab5 [diff]
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
index 4e623f9..75b11eb 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm

@@ -8,9 +8,7 @@
 # Tokenize string "aggressively" and return an array
 # with character boundaries.
 sub tokenize {
-  my ($self, $txt, $offset) = @_;
-
-  $offset //= 0;
+  my ($self, $txt) = @_;
 
   # Iterate over the whole string
   while ($txt =~ /([^\p{Punct} \x{9}\n]+)
@@ -19,17 +17,17 @@
 
     # Starts with a character sequence
     if (defined $1){
-      push @$self, $-[1]+$offset, $+[1]+$offset; # from and to
+      push @$self, $-[1], $+[1]; # from and to
 
       # Followed by a punctuation
       if ($2){
-        push @$self, $-[2]+$offset, $+[2]+$offset # from and to
+        push @$self, $-[2], $+[2] # from and to
       }
     }
 
     # Starts with a punctuation
     else {
-      push @$self, $-[3]+$offset, $+[3]+$offset # from and to
+      push @$self, $-[3], $+[3] # from and to
     };
   };
 

diff --git a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
index d3b793e..b3373f5 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm

@@ -8,8 +8,7 @@
 # Tokenize string "conservatively" and return an array
 # with character boundaries.
 sub tokenize {
-  my ($self, $txt, $offset) = @_;
-  $offset //= 0;
+  my ($self, $txt) = @_;
 
   # Iterate over the whole string
   while ($txt =~ /(\p{Punct}*)
@@ -18,16 +17,16 @@
                   (?:[ \x{9}\n])?/gx) {
 
     # Punctuation preceding a token
-    $self->_add_surroundings($txt, $offset, $-[1], $+[1], 1) if $1;
+    $self->_add_surroundings($txt, $-[1], $+[1], 1) if $1;
 
     # Token sequence
-    push @$self, ($-[2]+$offset, $+[2]+$offset) if $2; # from and to
+    push @$self, ($-[2], $+[2]) if $2; # from and to
 
     # Punctuation following a token
-    $self->_add_surroundings($txt, $offset, $-[3], $+[3]) if $3;
+    $self->_add_surroundings($txt, $-[3], $+[3]) if $3;
 
     # Special chars after token
-    $self->_add_surroundings($txt, $offset, $-[4], $+[4]) if $4;
+    $self->_add_surroundings($txt, $-[4], $+[4]) if $4;
   };
 
   return
@@ -36,7 +35,7 @@
 
 # Check if surrounding characters are token-worthy
 sub _add_surroundings {
-  my ($self, $txt, $offset, $p1, $p2, $preceding) = @_;
+  my ($self, $txt, $p1, $p2, $preceding) = @_;
 
   my $pr;
 
@@ -70,13 +69,13 @@
     };
 
     # Either before or after the char there is a token
-    push @$self, ($p1+$offset, $p2+$offset) if $pr;  # from and to
+    push @$self, ($p1, $p2) if $pr;  # from and to
     return;
   };
 
   # Iterate over all single punctuation symbols
   for (my $i = $p1; $i < $p2; $i++ ){
-    push @$self, $i+$offset, $i+1+$offset; # from and to
+    push @$self, $i, $i+1; # from and to
   };
 };
 

diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index 8cfa0cf..7b740a1 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm

@@ -54,7 +54,7 @@
 
 # Tokenize text in an external process
 sub tokenize {
-  my ($self, $txt, $offset) = @_;
+  my ($self, $txt) = @_;
   return unless $self->{pid};
   my $out = $self->{chld_in};
   print $out $txt . $self->{sep};

diff --git a/script/tei2korapxml b/script/tei2korapxml
index c94e3cb..3390ff6 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -93,7 +93,6 @@
   my $_tok_file_agg  = "tokens_aggressive.xml";
   my $aggr_tok       = KorAP::XML::TEI::Tokenizer::Aggressive->new;
   my $cons_tok       = KorAP::XML::TEI::Tokenizer::Conservative->new;
-  my ( $txt, $offset );
 ##
 
 my $_tok_dir         = "base";                       # name of directory for storing tokenization files
@@ -350,13 +349,19 @@
         $data =~ tr/\n\r/  /; # note: 2 blanks - otherwise offset data would become corrupt
         #
 
-        $data = encode_utf8( $data );
 
         if ( $_GEN_TOK_EXT ){
-          # TODO: $offset is only necessary for $cons_tok and $aggr_tok and as long as they're part of 'retr_info'
-          $ext_tok->tokenize($data, $offset);
+
+          $ext_tok->tokenize($data);
+
+        } elsif ( $_GEN_TOK_INT ){
+
+          $cons_tok->tokenize($data);
+          $aggr_tok->tokenize($data);
         }
 
+        $data = encode_utf8( $data );
+
         print STDERR "DEBUG ($0): main(): Writing (utf8-formatted) xml file $_root_dir$dir/$_data_file\n" if $_DEBUG;
 
 
@@ -377,34 +382,30 @@
 
         # ~ tokenization ~
 
-        if ( $_GEN_TOK_EXT || $_GEN_TOK_INT ){
+        if ( $_GEN_TOK_EXT ) {
 
-          if ( $_GEN_TOK_EXT ) {
+          $ext_tok->to_zip(
+            $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_ext"),
+            $text_id_esc
+          )
 
-            $ext_tok->to_zip(
-              $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_ext"),
-              $text_id_esc
-            );
+        } elsif ( $_GEN_TOK_INT ){
 
-          } elsif ( $_GEN_TOK_INT ){
-
-            # Output token streams to zip streams
-            $cons_tok->to_zip(
-              $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_con"),
-              $text_id_esc
-            );
-            $aggr_tok->to_zip(
-              $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_agg"),
-              $text_id_esc
-            );
-            $offset = 0;
-            $aggr_tok->reset;
-            $cons_tok->reset;
-          }
-
-          #print STDERR "$0: write_tokenization(): DONE\n";
+          # Output token streams to zip streams
+          $cons_tok->to_zip(
+            $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_con"),
+            $text_id_esc
+          );
+          $aggr_tok->to_zip(
+            $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_agg"),
+            $text_id_esc
+          );
+          $aggr_tok->reset;
+          $cons_tok->reset;
         }
 
+        #print STDERR "$0: write_tokenization(): DONE\n";
+
         $data_fl = 0; $buf_in = $data = $dir = ""; # reinit.
 
       } else { # $dir eq ""
@@ -1064,29 +1065,6 @@
       $dl += length( $e->[1] ); # update length of $data
 
 
-      if ( $_GEN_TOK_INT ){
-
-        #~~~~~
-        # from here: intern tokenization
-        #~~~~~
-
-
-        $txt = $e->[1];
-
-        # TODO: implement outside retr_info() (like $ext_tok) on whole $data, instead on every text-node (more efficient and $offset not needed anymore)
-        $cons_tok->tokenize($txt, $offset);
-        $aggr_tok->tokenize($txt, $offset);
-
-        $offset = $dl;
-
-
-        #~~~~~
-        # until here: intern tokenization
-        #~~~~~
-
-      }
-
-
       #~~~~~
       # until here: text- and whitespace-nodes
       #~~~~~

diff --git a/t/cmd/tokenizer.pl b/t/cmd/tokenizer.pl
index e484160..ad286df 100644
--- a/t/cmd/tokenizer.pl
+++ b/t/cmd/tokenizer.pl

@@ -7,8 +7,6 @@
 };
 use KorAP::XML::TEI::Tokenizer::Aggressive;
 
-use open qw(:std :utf8); # assume utf-8 encoding
-
 $| = 1;
 
 # Init tokenizer

diff --git a/t/script.t b/t/script.t
index 85c2cea..2bb00bb 100644
--- a/t/script.t
+++ b/t/script.t

@@ -183,14 +183,16 @@
 # Tokenize with external tokenizer
 my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
 
+my ($fh2, $outzip2) = tempfile("KorAP-XML-TEI_script_XXXXXXXXXX", SUFFIX => ".tmp", TMPDIR => 1, UNLINK => $_UNLINK);
+
 stderr_like(
-  sub { `cat '$file' | perl '$script' --tc='perl $cmd' > '$outzip'` },
+  sub { `cat '$file' | perl '$script' --tc='perl $cmd' > '$outzip2'` },
   qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
   'Processing'
 );
 
 # Uncompress GOE/AGA/00000/base/tokens.xml from zip file
-$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens.xml');
+$zip = IO::Uncompress::Unzip->new($outzip2, Name => 'GOE/AGA/00000/base/tokens.xml');
 
 # Read GOE/AGA/00000/base/tokens.xml
 $tokens_xml = '';
@@ -216,19 +218,22 @@
 # TODO: call $script with approp. parameter for internal tokenization (actual: '$_GEN_TOK_INT = 1' hardcoded)
 
 
+my ($fh3, $outzip3) = tempfile("KorAP-XML-TEI_script_XXXXXXXXXX", SUFFIX => ".tmp", TMPDIR => 1, UNLINK => $_UNLINK);
+
+
 # ~ test conservative tokenization ~
 
 $file = catfile($f, 'data', 'text_with_blanks.i5.xml');
 
 stderr_like(
-  sub { `cat '$file' | perl '$script' > '$outzip'` },
+  sub { `cat '$file' | perl '$script' > '$outzip3'` },
   qr!tei2korapxml: .*? text_id=CORP_DOC.00001!,
   'Processing'
 );
 
-ok(-e $outzip, "File $outzip exists");
+ok(-e $outzip3, "File $outzip3 exists");
 
-$zip = IO::Uncompress::Unzip->new($outzip, Name => 'CORP/DOC/00001/base/tokens_conservative.xml');
+$zip = IO::Uncompress::Unzip->new($outzip3, Name => 'CORP/DOC/00001/base/tokens_conservative.xml');
 
 ok($zip, 'Zip-File is created');
 
@@ -265,7 +270,7 @@
 
 # ~ test aggressive tokenization ~
 
-$zip = IO::Uncompress::Unzip->new($outzip, Name => 'CORP/DOC/00001/base/tokens_aggressive.xml');
+$zip = IO::Uncompress::Unzip->new($outzip3, Name => 'CORP/DOC/00001/base/tokens_aggressive.xml');
 
 ok($zip, 'Zip-File is created');
 

diff --git a/xt/benchmark.pl b/xt/benchmark.pl
index c1657a6..9d0489a 100644
--- a/xt/benchmark.pl
+++ b/xt/benchmark.pl

@@ -106,14 +106,14 @@
   Dumbbench::Instance::PerlSub->new(
     name => 'Tokenizer-conservative',
     code => sub {
-      $result = $cons_tok->reset->tokenize($t_data, 0);
+      $result = $cons_tok->reset->tokenize($t_data);
       $result = 0;
     }
   ),
   Dumbbench::Instance::PerlSub->new(
     name => 'Tokenizer-aggressive',
     code => sub {
-      $result = $aggr_tok->reset->tokenize($t_data, 0);
+      $result = $aggr_tok->reset->tokenize($t_data);
       $result = 0;
     }
   ),
commit	b122717d9a9015ba486eb8503015cfefdb54f1e4	[log] [tgz]
author	Peter Harders <harders@ids-mannheim.de>	Tue Jul 21 02:12:10 2020 +0200
committer	Akron <nils@diewald-online.de>	Tue Jul 21 19:40:11 2020 +0200
tree	4dda245d8f2cc516271220a5581aa54dc9f4d522
parent	4c6ff5b08659b889aeff9cb7ed77eb968d94cab5 [diff]