Establish tokenizer objects for aggressive and conservative base tokenization
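
The function-based KorAP::XML::TEI::Tokenization module is replaced by
a small class hierarchy: KorAP::XML::TEI::Tokenizer as a base class
that stores the collected character boundaries and handles the XML
serialization, with Aggressive and Conservative subclasses providing
the tokenize() rules.

A minimal usage sketch of the new API (the text ID below is a
placeholder; the sample string and boundaries match t/tokenization.t):

    use KorAP::XML::TEI::Tokenizer::Aggressive;

    my $tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
    $tok->tokenize("Der alte Mann");
    my @bounds = $tok->boundaries;              # (0,3, 4,8, 9,13)
    print $tok->to_string("corpus/doc/00001");  # KorAP span XML
    $tok->reset;                                # reuse for the next text
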
Change-Id: I702098185b0b6292c73217268d4516d55a2f95b5
diff --git a/.gitignore b/.gitignore
index 9071e73..b0955d2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@
*~
.*
!.gitignore
+/sandbox
\ No newline at end of file
diff --git a/lib/KorAP/XML/TEI/Tokenization.pm b/lib/KorAP/XML/TEI/Tokenization.pm
deleted file mode 100644
index a340471..0000000
--- a/lib/KorAP/XML/TEI/Tokenization.pm
+++ /dev/null
@@ -1,123 +0,0 @@
-package KorAP::XML::TEI::Tokenization;
-use strict;
-use warnings;
-
-# This tokenizer was originally written by cschnober.
-# '\p{Punct}' is equal to the character class '[-!"#%&'()*,./:;?@[\\\]_{}]'
-
-# Tokenize string "aggressively" and return an array
-# with character boundaries.
-sub aggressive {
- my ($txt, $offset) = @_;
-
- $offset //= 0;
- my @tokens;
-
- # Iterate over the whole string
- while ($txt =~ /([^\p{Punct} \x{9}\n]+)
- (?:(\p{Punct})|(?:[ \x{9}\n])?)|
- (\p{Punct})/gx){
-
- # Starts with a character sequence
- if (defined $1){
- push @tokens, $-[1]+$offset, $+[1]+$offset; # from and to
-
- # Followed by a punctuation
- if ($2){
- push @tokens, $-[2]+$offset, $+[2]+$offset # from and to
- }
- }
-
- # Starts with a punctuation
- else {
- push @tokens, $-[3]+$offset, $+[3]+$offset # from and to
- };
- };
-
- return \@tokens;
-};
-
-
-sub _check_surroundings {
- my ($txt, $offset, $p1, $p2, $preceeding) = @_;
-
- my $pr;
-
- if ($p2 == $p1+1) {
-
- # Variant for preceeding characters
- if ($preceeding) {
- # Character doesn't start and first position
- if ($p1 != 0) {
-
- # Check if the prefix is a character
- $pr = ( substr( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]$/ );
- };
-
- # There is no prefix
- unless ($pr){
-
- # Check, if the first character following the special char is a character?
- $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]$/ );
- };
- }
-
- else {
- # Check the char after the match
- $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]?$/ );
-
- # Check the char before the match
- unless ($pr) {
- $pr = ( substr ( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]/ );
- };
- };
-
- return () unless $pr;
-
- # Either before or after the char there is a token
- return ($p1+$offset, $p2+$offset); # from and to
- };
-
- my @list;
-
- # Iterate over all single punctuation symbols
- for (my $i = $p1; $i < $p2; $i++ ){
- push @list, $i+$offset, $i+1+$offset; # from and to
- };
-
- return @list;
-};
-
-
-# Tokenize string "conservatively" and return an array
-# with character boundaries.
-sub conservative {
- my ($txt, $offset) = @_;
- $offset //= 0;
-
- my @tokens;
-
- # Iterate over the whole string
- while ($txt =~ /(\p{Punct}*)
- ([^\p{Punct} \x{9}\n]+(?:(\p{Punct}+)[^\p{Punct} \x{9}\n]+)*)?
- (\p{Punct}*)
- (?:[ \x{9}\n])?/gx) {
-
- # Punctuation preceding a token
- push @tokens, _check_surroundings($txt, $offset, $-[1], $+[1], 1) if $1;
-
- # Token sequence
- push @tokens, ($-[2]+$offset, $+[2]+$offset) if $2; # from and to
-
- # Punctuation following a token
- push @tokens, _check_surroundings($txt, $offset, $-[3], $+[3]) if $3;
-
- # Special chars after token
- push @tokens, _check_surroundings($txt, $offset, $-[4], $+[4]) if $4;
- };
-
- return \@tokens
-};
-
-
-1;
diff --git a/lib/KorAP/XML/TEI/Tokenizer.pm b/lib/KorAP/XML/TEI/Tokenizer.pm
new file mode 100644
index 0000000..c4a310a
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Tokenizer.pm
@@ -0,0 +1,78 @@
+package KorAP::XML::TEI::Tokenizer;
+use strict;
+use warnings;
+
+# This is the base class for tokenizer objects.
+
+# Construct a new tokenizer
+sub new {
+ bless [], shift;
+};
+
+
+# Reset the inner state of the tokenizer
+# and return the tokenizer object.
+sub reset {
+ @{$_[0]} = ();
+ $_[0];
+};
+
+
+# Return boundaries
+sub boundaries {
+ @{$_[0]};
+};
+
+
+# Return data as a string
+sub to_string {
+ my ($self, $text_id) = @_;
+
+ unless ($text_id) {
+ warn 'Missing textID';
+ return;
+ };
+
+ my $output = $self->_header($text_id);
+
+ my $c = 0;
+ for (my $i = 0; $i < ($#$self + 1); $i += 2 ){
+ $output .= qq! <span id="t_$c" from="! . $self->[$i] . '" to="' .
+ $self->[$i+1] . qq!" />\n!;
+ $c++;
+ }
+
+ return $output . $self->_footer;
+};
+
+
+# Write data to zip stream
+sub to_zip {
+ my ($self, $zip, $text_id) = @_;
+ $zip->print($self->to_string($text_id));
+};
+
+
+# Header for XML output
+sub _header {
+ my ($self, $text_id) = @_;
+ return <<"HEADER";
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="span.rng"
+ type="application/xml"
+ schematypens="http://relaxng.org/ns/structure/1.0"?>
+<layer docid="$text_id"
+ xmlns="http://ids-mannheim.de/ns/KorAP"
+ version="KorAP-0.4">
+ <spanList>
+HEADER
+};
+
+
+# Footer for XML output
+sub _footer {
+ " </spanList>\n</layer>";
+};
+
+
+1;
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
new file mode 100644
index 0000000..4e623f9
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
@@ -0,0 +1,40 @@
+package KorAP::XML::TEI::Tokenizer::Aggressive;
+use base 'KorAP::XML::TEI::Tokenizer';
+use strict;
+use warnings;
+
+# This tokenizer was originally written by cschnober.
+
+# Tokenize string "aggressively" and return an array
+# with character boundaries.
+sub tokenize {
+ my ($self, $txt, $offset) = @_;
+
+ $offset //= 0;
+
+ # Iterate over the whole string
+ while ($txt =~ /([^\p{Punct} \x{9}\n]+)
+ (?:(\p{Punct})|(?:[ \x{9}\n])?)|
+ (\p{Punct})/gx){
+
+ # Starts with a character sequence
+ if (defined $1){
+ push @$self, $-[1]+$offset, $+[1]+$offset; # from and to
+
+      # Followed by a punctuation character
+ if ($2){
+ push @$self, $-[2]+$offset, $+[2]+$offset # from and to
+ }
+ }
+
+    # Starts with a punctuation character
+ else {
+ push @$self, $-[3]+$offset, $+[3]+$offset # from and to
+ };
+ };
+
+ return;
+};
+
+
+1;
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
new file mode 100644
index 0000000..d3b793e
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
@@ -0,0 +1,84 @@
+package KorAP::XML::TEI::Tokenizer::Conservative;
+use base 'KorAP::XML::TEI::Tokenizer';
+use strict;
+use warnings;
+
+# This tokenizer was originally written by cschnober.
+
+# Tokenize string "conservatively" and return an array
+# with character boundaries.
+sub tokenize {
+ my ($self, $txt, $offset) = @_;
+ $offset //= 0;
+
+ # Iterate over the whole string
+ while ($txt =~ /(\p{Punct}*)
+ ([^\p{Punct} \x{9}\n]+(?:(\p{Punct}+)[^\p{Punct} \x{9}\n]+)*)?
+ (\p{Punct}*)
+ (?:[ \x{9}\n])?/gx) {
+
+ # Punctuation preceding a token
+ $self->_add_surroundings($txt, $offset, $-[1], $+[1], 1) if $1;
+
+ # Token sequence
+ push @$self, ($-[2]+$offset, $+[2]+$offset) if $2; # from and to
+
+ # Punctuation following a token
+ $self->_add_surroundings($txt, $offset, $-[3], $+[3]) if $3;
+
+ # Special chars after token
+ $self->_add_surroundings($txt, $offset, $-[4], $+[4]) if $4;
+ };
+
+ return
+};
+
+
+# Check if surrounding characters are token-worthy
+sub _add_surroundings {
+ my ($self, $txt, $offset, $p1, $p2, $preceding) = @_;
+
+ my $pr;
+
+ if ($p2 == $p1+1) {
+
+ # Variant for preceding characters
+ if ($preceding) {
+      # Character doesn't start at the first position
+ if ($p1 != 0) {
+
+        # Check whether the character before the punctuation is non-alphanumeric
+ $pr = ( substr( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]$/ );
+ };
+
+      # The preceding character is alphanumeric (or the punctuation starts the string)
+ unless ($pr){
+
+        # Check whether the character following the punctuation is non-alphanumeric
+ $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]$/ );
+ };
+ }
+
+ else {
+ # Check the char after the match
+ $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]?$/ );
+
+ # Check the char before the match
+ unless ($pr) {
+ $pr = ( substr ( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]/ );
+ };
+ };
+
+  # A non-alphanumeric character borders the punctuation, so keep it as a token of its own
+ push @$self, ($p1+$offset, $p2+$offset) if $pr; # from and to
+ return;
+ };
+
+ # Iterate over all single punctuation symbols
+ for (my $i = $p1; $i < $p2; $i++ ){
+ push @$self, $i+$offset, $i+1+$offset; # from and to
+ };
+};
+
+
+1;
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 4f2035e..359fea3 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -22,7 +22,8 @@
};
use KorAP::XML::TEI;
-use KorAP::XML::TEI::Tokenization;
+use KorAP::XML::TEI::Tokenizer::Conservative;
+use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Zipper;
our $VERSION = '0.01';
@@ -84,7 +85,9 @@
my $_GEN_TOK_DUMMY = 1; # use dummy base tokenization for testing (base tokenization is normally done by external tools)
my $_tok_file_con = "tokens_conservative.xml";
my $_tok_file_agg = "tokens_aggressive.xml";
- my ( @tok_tokens_con, @tok_tokens_agg, $txt, $offset );
+ my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
+ my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
+ my ( $txt, $offset );
my $_base_tokenization_dir = "base"; # name of directory for storing files of dummy tokenization (only used in func. select_tokenization)
my $_DEBUG = 0; # set to 1 for minimal more debug output (no need to be parametrized)
@@ -362,7 +365,9 @@
select_tokenization();
if ( $_GEN_TOK_DUMMY ){
- $offset = 0; @tok_tokens_con=(); @tok_tokens_agg=();
+ $offset = 0;
+ $aggr_tok->reset;
+ $cons_tok->reset;
}
}
@@ -1005,13 +1010,8 @@
if ( substr( $txt, 0, 1 ) ne ' ' || substr( $txt, 1, 1) ne ' ' ){ # $txt has at least 2 chars, if it's not empty or equal to ' '
- my $tok = KorAP::XML::TEI::Tokenization::conservative($txt, $offset);
- push @tok_tokens_con, @$tok;
-
- $tok = KorAP::XML::TEI::Tokenization::aggressive($txt, $offset);
- push @tok_tokens_agg, @$tok;
-
- ##$offset = $dl+1;
+ $cons_tok->tokenize($txt, $offset);
+ $aggr_tok->tokenize($txt, $offset);
$offset = $dl;
@@ -1059,14 +1059,23 @@
}
##
}elsif( $_GEN_TOK_DUMMY ){
- write_tokenization("$_root_dir$dir/$_base_tokenization_dir/$_tok_file_con", $text_id_esc, \@tok_tokens_con);
- write_tokenization("$_root_dir$dir/$_base_tokenization_dir/$_tok_file_agg", $text_id_esc, \@tok_tokens_agg);
+
+ # Output token streams to zip streams
+ $cons_tok->to_zip(
+ $zipper->new_stream("$_root_dir$dir/$_base_tokenization_dir/$_tok_file_con"),
+ $text_id_esc
+ );
+ $aggr_tok->to_zip(
+ $zipper->new_stream("$_root_dir$dir/$_base_tokenization_dir/$_tok_file_agg"),
+ $text_id_esc
+ );
}
#print STDERR "$0: write_tokenization(): DONE\n";
} # end: select_tokenization
+
sub write_tokenization { # called from select_tokenization()
my ( $fname, $textid_esc, $bounds ) = @_;
diff --git a/t/tokenization.t b/t/tokenization.t
index a8f8935..932407b 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t
@@ -2,7 +2,6 @@
use warnings;
use Test::More;
use File::Basename 'dirname';
-use Data::Dumper;
use File::Spec::Functions qw/catfile/;
use File::Temp 'tempfile';
@@ -11,41 +10,44 @@
unshift @INC, "$FindBin::Bin/../lib";
};
-require_ok('KorAP::XML::TEI::Tokenization');
+require_ok('KorAP::XML::TEI::Tokenizer::Aggressive');
+require_ok('KorAP::XML::TEI::Tokenizer::Conservative');
# Test aggressive
-my $aggr = KorAP::XML::TEI::Tokenization::aggressive("Der alte Mann");
+my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
+$aggr->tokenize("Der alte Mann");
is_deeply($aggr, [0,3,4,8,9,13]);
-$aggr = KorAP::XML::TEI::Tokenization::aggressive("Der alte bzw. der grau-melierte Mann");
+$aggr->reset->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply($aggr, [0,3,4,8,9,12,12,13,14,17,18,22,22,23,23,31,32,36]);
# Test conservative
-my $cons = KorAP::XML::TEI::Tokenization::conservative("Der alte Mann");
+my $cons = KorAP::XML::TEI::Tokenizer::Conservative->new;
+$cons->tokenize("Der alte Mann");
is_deeply($cons, [0,3,4,8,9,13]);
-$cons = KorAP::XML::TEI::Tokenization::conservative("Der alte bzw. der grau-melierte Mann");
+$cons->reset->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply($cons, [0,3,4,8,9,12,12,13,14,17,18,31,32,36]);
-$cons = KorAP::XML::TEI::Tokenization::conservative(". Der");
+$cons->reset->tokenize(". Der");
is_deeply($cons, [0,1,2,5]);
-$cons = KorAP::XML::TEI::Tokenization::conservative(" . Der");
+$cons->reset->tokenize(" . Der");
is_deeply($cons, [1,2,3,6]);
-$cons = KorAP::XML::TEI::Tokenization::conservative(" . Der");
+$cons->reset->tokenize(" . Der");
is_deeply($cons, [3,4,5,8]);
-$cons = KorAP::XML::TEI::Tokenization::conservative("... Der");
+$cons->reset->tokenize("... Der");
is_deeply($cons, [0,1,1,2,2,3,4,7]);
-$cons = KorAP::XML::TEI::Tokenization::conservative(".Der");
+$cons->reset->tokenize(".Der");
is_deeply($cons, [1,4]);
-$cons = KorAP::XML::TEI::Tokenization::conservative(".Der.... ");
+$cons->reset->tokenize(".Der.... ");
is_deeply($cons, [1,4,4,5,5,6,6,7,7,8]);
-$cons = KorAP::XML::TEI::Tokenization::conservative("..Der.... ");
+$cons->reset->tokenize("..Der.... ");
is_deeply($cons, [0,1,1,2,2,5,5,6,6,7,7,8,8,9]);
# Test data
@@ -60,11 +62,11 @@
is(137166, length($data));
-$aggr = KorAP::XML::TEI::Tokenization::aggressive($data);
+$aggr->reset->tokenize($data);
is_deeply([@{$aggr}[0..7]], [1,7,8,12,14,18,19,22]);
is(47242, scalar(@$aggr));
-$cons = KorAP::XML::TEI::Tokenization::conservative($data);
+$cons->reset->tokenize($data);
is_deeply([@{$cons}[0..7]], [1,7,8,12,14,18,19,22]);
is(43068, scalar(@$cons));
diff --git a/xt/benchmark.pl b/xt/benchmark.pl
index 85effd8..b27acd6 100644
--- a/xt/benchmark.pl
+++ b/xt/benchmark.pl
@@ -13,7 +13,8 @@
};
use KorAP::XML::TEI;
-use KorAP::XML::TEI::Tokenization;
+use KorAP::XML::TEI::Tokenizer::Aggressive;
+use KorAP::XML::TEI::Tokenizer::Conservative;
my $columns = 0;
my $no_header = 0;
@@ -65,7 +66,10 @@
}
else {
die "Unable to load $t_dataf";
-}
+};
+
+my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
+my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
# Add benchmark instances
@@ -100,16 +104,16 @@
}
),
Dumbbench::Instance::PerlSub->new(
- name => 'Tokenization-conservative',
+ name => 'Tokenizer-conservative',
code => sub {
- $result = KorAP::XML::TEI::Tokenization::conservative($t_data, 0);
+ $result = $cons_tok->reset->tokenize($t_data, 0);
$result = 0;
}
),
Dumbbench::Instance::PerlSub->new(
- name => 'Tokenization-aggressive',
+ name => 'Tokenizer-aggressive',
code => sub {
- $result = KorAP::XML::TEI::Tokenization::aggressive($t_data, 0);
+ $result = $aggr_tok->reset->tokenize($t_data, 0);
$result = 0;
}
),