Establish tokenizer objects for aggressive and conservative base tokenization
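
The function-based KorAP::XML::TEI::Tokenization module is replaced by
a small class hierarchy: KorAP::XML::TEI::Tokenizer as a base class
that stores the collected character boundaries and handles the XML
serialization, with Aggressive and Conservative subclasses providing
the tokenize() rules.

A minimal usage sketch of the new API (the text ID below is a
placeholder; the sample string and boundaries match t/tokenization.t):

    use KorAP::XML::TEI::Tokenizer::Aggressive;

    my $tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
    $tok->tokenize("Der alte Mann");
    my @bounds = $tok->boundaries;              # (0,3, 4,8, 9,13)
    print $tok->to_string("corpus/doc/00001");  # KorAP span XML
    $tok->reset;                                # reuse for the next text
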
Change-Id: I702098185b0b6292c73217268d4516d55a2f95b5
diff --git a/.gitignore b/.gitignore
index 9071e73..b0955d2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@
*~
.*
!.gitignore
+/sandbox
\ No newline at end of file
diff --git a/lib/KorAP/XML/TEI/Tokenization.pm b/lib/KorAP/XML/TEI/Tokenization.pm
deleted file mode 100644
index a340471..0000000
--- a/lib/KorAP/XML/TEI/Tokenization.pm
+++ /dev/null
@@ -1,123 +0,0 @@
-package KorAP::XML::TEI::Tokenization;
-use strict;
-use warnings;
-
-# This tokenizer was originally written by cschnober.
-# '\p{Punct}' is equal to the character class '[-!"#%&'()*,./:;?@[\\\]_{}]'
-
-# Tokenize string "aggressively" and return an array
-# with character boundaries.
-sub aggressive {
- my ($txt, $offset) = @_;
-
- $offset //= 0;
- my @tokens;
-
- # Iterate over the whole string
- while ($txt =~ /([^\p{Punct} \x{9}\n]+)
- (?:(\p{Punct})|(?:[ \x{9}\n])?)|
- (\p{Punct})/gx){
-
- # Starts with a character sequence
- if (defined $1){
- push @tokens, $-[1]+$offset, $+[1]+$offset; # from and to
-
- # Followed by a punctuation
- if ($2){
- push @tokens, $-[2]+$offset, $+[2]+$offset # from and to
- }
- }
-
- # Starts with a punctuation
- else {
- push @tokens, $-[3]+$offset, $+[3]+$offset # from and to
- };
- };
-
- return \@tokens;
-};
-
-
-sub _check_surroundings {
- my ($txt, $offset, $p1, $p2, $preceeding) = @_;
-
- my $pr;
-
- if ($p2 == $p1+1) {
-
- # Variant for preceeding characters
- if ($preceeding) {
- # Character doesn't start and first position
- if ($p1 != 0) {
-
- # Check if the prefix is a character
- $pr = ( substr( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]$/ );
- };
-
- # There is no prefix
- unless ($pr){
-
- # Check, if the first character following the special char is a character?
- $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]$/ );
- };
- }
-
- else {
- # Check the char after the match
- $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]?$/ );
-
- # Check the char before the match
- unless ($pr) {
- $pr = ( substr ( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]/ );
- };
- };
-
- return () unless $pr;
-
- # Either before or after the char there is a token
- return ($p1+$offset, $p2+$offset); # from and to
- };
-
- my @list;
-
- # Iterate over all single punctuation symbols
- for (my $i = $p1; $i < $p2; $i++ ){
- push @list, $i+$offset, $i+1+$offset; # from and to
- };
-
- return @list;
-};
-
-
-# Tokenize string "conservatively" and return an array
-# with character boundaries.
-sub conservative {
- my ($txt, $offset) = @_;
- $offset //= 0;
-
- my @tokens;
-
- # Iterate over the whole string
- while ($txt =~ /(\p{Punct}*)
- ([^\p{Punct} \x{9}\n]+(?:(\p{Punct}+)[^\p{Punct} \x{9}\n]+)*)?
- (\p{Punct}*)
- (?:[ \x{9}\n])?/gx) {
-
- # Punctuation preceding a token
- push @tokens, _check_surroundings($txt, $offset, $-[1], $+[1], 1) if $1;
-
- # Token sequence
- push @tokens, ($-[2]+$offset, $+[2]+$offset) if $2; # from and to
-
- # Punctuation following a token
- push @tokens, _check_surroundings($txt, $offset, $-[3], $+[3]) if $3;
-
- # Special chars after token
- push @tokens, _check_surroundings($txt, $offset, $-[4], $+[4]) if $4;
- };
-
- return \@tokens
-};
-
-
-1;
diff --git a/lib/KorAP/XML/TEI/Tokenizer.pm b/lib/KorAP/XML/TEI/Tokenizer.pm
new file mode 100644
index 0000000..c4a310a
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Tokenizer.pm
@@ -0,0 +1,78 @@
+package KorAP::XML::TEI::Tokenizer;
+use strict;
+use warnings;
+
+# This is the base class for tokenizer objects.
+
+# Construct a new tokenizer
+sub new {
+ bless [], shift;
+};
+
+
+# Reset the inner state of the tokenizer
+# and return the tokenizer object.
+sub reset {
+ @{$_[0]} = ();
+ $_[0];
+};
+
+
+# Return boundaries
+sub boundaries {
+ @{$_[0]};
+};
+
+
+# Return data as a string
+sub to_string {
+ my ($self, $text_id) = @_;
+
+ unless ($text_id) {
+ warn 'Missing textID';
+ return;
+ };
+
+ my $output = $self->_header($text_id);
+
+ my $c = 0;
+ for (my $i = 0; $i < ($#$self + 1); $i += 2 ){
+ $output .= qq! <span id="t_$c" from="! . $self->[$i] . '" to="' .
+ $self->[$i+1] . qq!" />\n!;
+ $c++;
+ }
+
+ return $output . $self->_footer;
+};
+
+
+# Write data to zip stream
+sub to_zip {
+ my ($self, $zip, $text_id) = @_;
+ $zip->print($self->to_string($text_id));
+};
+
+
+# Header for XML output
+sub _header {
+ my ($self, $text_id) = @_;
+ return <<"HEADER";
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="span.rng"
+ type="application/xml"
+ schematypens="http://relaxng.org/ns/structure/1.0"?>
+<layer docid="$text_id"
+ xmlns="http://ids-mannheim.de/ns/KorAP"
+ version="KorAP-0.4">
+ <spanList>
+HEADER
+};
+
+
+# Footer for XML output
+sub _footer {
+ " </spanList>\n</layer>";
+};
+
+
+1;
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
new file mode 100644
index 0000000..4e623f9
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
@@ -0,0 +1,40 @@
+package KorAP::XML::TEI::Tokenizer::Aggressive;
+use base 'KorAP::XML::TEI::Tokenizer';
+use strict;
+use warnings;
+
+# This tokenizer was originally written by cschnober.
+
+# Tokenize string "aggressively" and return an array
+# with character boundaries.
+sub tokenize {
+ my ($self, $txt, $offset) = @_;
+
+ $offset //= 0;
+
+ # Iterate over the whole string
+ while ($txt =~ /([^\p{Punct} \x{9}\n]+)
+ (?:(\p{Punct})|(?:[ \x{9}\n])?)|
+ (\p{Punct})/gx){
+
+ # Starts with a character sequence
+ if (defined $1){
+ push @$self, $-[1]+$offset, $+[1]+$offset; # from and to
+
+      # Followed by a punctuation character
+ if ($2){
+ push @$self, $-[2]+$offset, $+[2]+$offset # from and to
+ }
+ }
+
+    # Starts with a punctuation character
+ else {
+ push @$self, $-[3]+$offset, $+[3]+$offset # from and to
+ };
+ };
+
+ return;
+};
+
+
+1;
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
new file mode 100644
index 0000000..d3b793e
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
@@ -0,0 +1,84 @@
+package KorAP::XML::TEI::Tokenizer::Conservative;
+use base 'KorAP::XML::TEI::Tokenizer';
+use strict;
+use warnings;
+
+# This tokenizer was originally written by cschnober.
+
+# Tokenize string "conservatively" and return an array
+# with character boundaries.
+sub tokenize {
+ my ($self, $txt, $offset) = @_;
+ $offset //= 0;
+
+ # Iterate over the whole string
+ while ($txt =~ /(\p{Punct}*)
+ ([^\p{Punct} \x{9}\n]+(?:(\p{Punct}+)[^\p{Punct} \x{9}\n]+)*)?
+ (\p{Punct}*)
+ (?:[ \x{9}\n])?/gx) {
+
+ # Punctuation preceding a token
+ $self->_add_surroundings($txt, $offset, $-[1], $+[1], 1) if $1;
+
+ # Token sequence
+ push @$self, ($-[2]+$offset, $+[2]+$offset) if $2; # from and to
+
+ # Punctuation following a token
+ $self->_add_surroundings($txt, $offset, $-[3], $+[3]) if $3;
+
+ # Special chars after token
+ $self->_add_surroundings($txt, $offset, $-[4], $+[4]) if $4;
+ };
+
+ return
+};
+
+
+# Check if surrounding characters are token-worthy
+sub _add_surroundings {
+ my ($self, $txt, $offset, $p1, $p2, $preceding) = @_;
+
+ my $pr;
+
+ if ($p2 == $p1+1) {
+
+ # Variant for preceding characters
+ if ($preceding) {
+      # Character doesn't start at the first position
+ if ($p1 != 0) {
+
+        # Check whether the character before the punctuation is non-alphanumeric
+ $pr = ( substr( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]$/ );
+ };
+
+      # The preceding character is alphanumeric (or the punctuation starts the string)
+ unless ($pr){
+
+        # Check whether the character following the punctuation is non-alphanumeric
+ $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]$/ );
+ };
+ }
+
+ else {
+ # Check the char after the match
+ $pr = ( substr( $txt, $p2, 1 ) =~ /^[^A-Za-z0-9]?$/ );
+
+ # Check the char before the match
+ unless ($pr) {
+ $pr = ( substr ( $txt, $p1-1, 1 ) =~ /^[^A-Za-z0-9]/ );
+ };
+ };
+
+  # A non-alphanumeric character borders the punctuation, so keep it as a token of its own
+ push @$self, ($p1+$offset, $p2+$offset) if $pr; # from and to
+ return;
+ };
+
+ # Iterate over all single punctuation symbols
+ for (my $i = $p1; $i < $p2; $i++ ){
+ push @$self, $i+$offset, $i+1+$offset; # from and to
+ };
+};
+
+
+1;
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 4f2035e..359fea3 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -22,7 +22,8 @@
};
use KorAP::XML::TEI;
-use KorAP::XML::TEI::Tokenization;
+use KorAP::XML::TEI::Tokenizer::Conservative;
+use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Zipper;
our $VERSION = '0.01';
@@ -84,7 +85,9 @@
my $_GEN_TOK_DUMMY = 1; # use dummy base tokenization for testing (base tokenization is normally done by external tools)
my $_tok_file_con = "tokens_conservative.xml";
my $_tok_file_agg = "tokens_aggressive.xml";
- my ( @tok_tokens_con, @tok_tokens_agg, $txt, $offset );
+ my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
+ my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
+ my ( $txt, $offset );
my $_base_tokenization_dir = "base"; # name of directory for storing files of dummy tokenization (only used in func. select_tokenization)
my $_DEBUG = 0; # set to 1 for minimal more debug output (no need to be parametrized)
@@ -362,7 +365,9 @@
select_tokenization();
if ( $_GEN_TOK_DUMMY ){
- $offset = 0; @tok_tokens_con=(); @tok_tokens_agg=();
+ $offset = 0;
+ $aggr_tok->reset;
+ $cons_tok->reset;
}
}
@@ -1005,13 +1010,8 @@
if ( substr( $txt, 0, 1 ) ne ' ' || substr( $txt, 1, 1) ne ' ' ){ # $txt has at least 2 chars, if it's not empty or equal to ' '
- my $tok = KorAP::XML::TEI::Tokenization::conservative($txt, $offset);
- push @tok_tokens_con, @$tok;
-
- $tok = KorAP::XML::TEI::Tokenization::aggressive($txt, $offset);
- push @tok_tokens_agg, @$tok;
-
- ##$offset = $dl+1;
+ $cons_tok->tokenize($txt, $offset);
+ $aggr_tok->tokenize($txt, $offset);
$offset = $dl;
@@ -1059,14 +1059,23 @@
}
##
}elsif( $_GEN_TOK_DUMMY ){
- write_tokenization("$_root_dir$dir/$_base_tokenization_dir/$_tok_file_con", $text_id_esc, \@tok_tokens_con);
- write_tokenization("$_root_dir$dir/$_base_tokenization_dir/$_tok_file_agg", $text_id_esc, \@tok_tokens_agg);
+
+ # Output token streams to zip streams
+ $cons_tok->to_zip(
+ $zipper->new_stream("$_root_dir$dir/$_base_tokenization_dir/$_tok_file_con"),
+ $text_id_esc
+ );
+ $aggr_tok->to_zip(
+ $zipper->new_stream("$_root_dir$dir/$_base_tokenization_dir/$_tok_file_agg"),
+ $text_id_esc
+ );
}
#print STDERR "$0: write_tokenization(): DONE\n";
} # end: select_tokenization
+
sub write_tokenization { # called from select_tokenization()
my ( $fname, $textid_esc, $bounds ) = @_;
diff --git a/t/tokenization.t b/t/tokenization.t
index a8f8935..932407b 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t
@@ -2,7 +2,6 @@
use warnings;
use Test::More;
use File::Basename 'dirname';
-use Data::Dumper;
use File::Spec::Functions qw/catfile/;
use File::Temp 'tempfile';
@@ -11,41 +10,44 @@
unshift @INC, "$FindBin::Bin/../lib";
};
-require_ok('KorAP::XML::TEI::Tokenization');
+require_ok('KorAP::XML::TEI::Tokenizer::Aggressive');
+require_ok('KorAP::XML::TEI::Tokenizer::Conservative');
# Test aggressive
-my $aggr = KorAP::XML::TEI::Tokenization::aggressive("Der alte Mann");
+my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
+$aggr->tokenize("Der alte Mann");
is_deeply($aggr, [0,3,4,8,9,13]);
-$aggr = KorAP::XML::TEI::Tokenization::aggressive("Der alte bzw. der grau-melierte Mann");
+$aggr->reset->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply($aggr, [0,3,4,8,9,12,12,13,14,17,18,22,22,23,23,31,32,36]);
# Test conservative
-my $cons = KorAP::XML::TEI::Tokenization::conservative("Der alte Mann");
+my $cons = KorAP::XML::TEI::Tokenizer::Conservative->new;
+$cons->tokenize("Der alte Mann");
is_deeply($cons, [0,3,4,8,9,13]);
-$cons = KorAP::XML::TEI::Tokenization::conservative("Der alte bzw. der grau-melierte Mann");
+$cons->reset->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply($cons, [0,3,4,8,9,12,12,13,14,17,18,31,32,36]);
-$cons = KorAP::XML::TEI::Tokenization::conservative(". Der");
+$cons->reset->tokenize(". Der");
is_deeply($cons, [0,1,2,5]);
-$cons = KorAP::XML::TEI::Tokenization::conservative(" . Der");
+$cons->reset->tokenize(" . Der");
is_deeply($cons, [1,2,3,6]);
-$cons = KorAP::XML::TEI::Tokenization::conservative(" . Der");
+$cons->reset->tokenize(" . Der");
is_deeply($cons, [3,4,5,8]);
-$cons = KorAP::XML::TEI::Tokenization::conservative("... Der");
+$cons->reset->tokenize("... Der");
is_deeply($cons, [0,1,1,2,2,3,4,7]);
-$cons = KorAP::XML::TEI::Tokenization::conservative(".Der");
+$cons->reset->tokenize(".Der");
is_deeply($cons, [1,4]);
-$cons = KorAP::XML::TEI::Tokenization::conservative(".Der.... ");
+$cons->reset->tokenize(".Der.... ");
is_deeply($cons, [1,4,4,5,5,6,6,7,7,8]);
-$cons = KorAP::XML::TEI::Tokenization::conservative("..Der.... ");
+$cons->reset->tokenize("..Der.... ");
is_deeply($cons, [0,1,1,2,2,5,5,6,6,7,7,8,8,9]);
# Test data
@@ -60,11 +62,11 @@
is(137166, length($data));
-$aggr = KorAP::XML::TEI::Tokenization::aggressive($data);
+$aggr->reset->tokenize($data);
is_deeply([@{$aggr}[0..7]], [1,7,8,12,14,18,19,22]);
is(47242, scalar(@$aggr));
-$cons = KorAP::XML::TEI::Tokenization::conservative($data);
+$cons->reset->tokenize($data);
is_deeply([@{$cons}[0..7]], [1,7,8,12,14,18,19,22]);
is(43068, scalar(@$cons));
diff --git a/xt/benchmark.pl b/xt/benchmark.pl
index 85effd8..b27acd6 100644
--- a/xt/benchmark.pl
+++ b/xt/benchmark.pl
@@ -13,7 +13,8 @@
};
use KorAP::XML::TEI;
-use KorAP::XML::TEI::Tokenization;
+use KorAP::XML::TEI::Tokenizer::Aggressive;
+use KorAP::XML::TEI::Tokenizer::Conservative;
my $columns = 0;
my $no_header = 0;
@@ -65,7 +66,10 @@
}
else {
die "Unable to load $t_dataf";
-}
+};
+
+my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
+my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
# Add benchmark instances
@@ -100,16 +104,16 @@
}
),
Dumbbench::Instance::PerlSub->new(
- name => 'Tokenization-conservative',
+ name => 'Tokenizer-conservative',
code => sub {
- $result = KorAP::XML::TEI::Tokenization::conservative($t_data, 0);
+ $result = $cons_tok->reset->tokenize($t_data, 0);
$result = 0;
}
),
Dumbbench::Instance::PerlSub->new(
- name => 'Tokenization-aggressive',
+ name => 'Tokenizer-aggressive',
code => sub {
- $result = KorAP::XML::TEI::Tokenization::aggressive($t_data, 0);
+ $result = $aggr_tok->reset->tokenize($t_data, 0);
$result = 0;
}
),