Clean up internal tokenization
- moved internal tokenization out of retr_info()
- removed $offset parameter (no longer necessary)
Change-Id: I063efdee193ab41c2705971a95341573884fd3a2
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
index 4e623f9..75b11eb 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
@@ -8,9 +8,7 @@
# Tokenize string "aggressively" and return an array
# with character boundaries.
sub tokenize {
- my ($self, $txt, $offset) = @_;
-
- $offset //= 0;
+ my ($self, $txt) = @_;
# Iterate over the whole string
while ($txt =~ /([^\p{Punct} \x{9}\n]+)
@@ -19,17 +17,17 @@
# Starts with a character sequence
if (defined $1){
- push @$self, $-[1]+$offset, $+[1]+$offset; # from and to
+ push @$self, $-[1], $+[1]; # from and to
# Followed by a punctuation
if ($2){
- push @$self, $-[2]+$offset, $+[2]+$offset # from and to
+ push @$self, $-[2], $+[2] # from and to
}
}
# Starts with a punctuation
else {
- push @$self, $-[3]+$offset, $+[3]+$offset # from and to
+ push @$self, $-[3], $+[3] # from and to
};
};
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
index d3b793e..b3373f5 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
@@ -8,8 +8,7 @@
# Tokenize string "conservatively" and return an array
# with character boundaries.
sub tokenize {
- my ($self, $txt, $offset) = @_;
- $offset //= 0;
+ my ($self, $txt) = @_;
# Iterate over the whole string
while ($txt =~ /(\p{Punct}*)
@@ -18,16 +17,16 @@
(?:[ \x{9}\n])?/gx) {
# Punctuation preceding a token
- $self->_add_surroundings($txt, $offset, $-[1], $+[1], 1) if $1;
+ $self->_add_surroundings($txt, $-[1], $+[1], 1) if $1;
# Token sequence
- push @$self, ($-[2]+$offset, $+[2]+$offset) if $2; # from and to
+ push @$self, ($-[2], $+[2]) if $2; # from and to
# Punctuation following a token
- $self->_add_surroundings($txt, $offset, $-[3], $+[3]) if $3;
+ $self->_add_surroundings($txt, $-[3], $+[3]) if $3;
# Special chars after token
- $self->_add_surroundings($txt, $offset, $-[4], $+[4]) if $4;
+ $self->_add_surroundings($txt, $-[4], $+[4]) if $4;
};
return
@@ -36,7 +35,7 @@
# Check if surrounding characters are token-worthy
sub _add_surroundings {
- my ($self, $txt, $offset, $p1, $p2, $preceding) = @_;
+ my ($self, $txt, $p1, $p2, $preceding) = @_;
my $pr;
@@ -70,13 +69,13 @@
};
# Either before or after the char there is a token
- push @$self, ($p1+$offset, $p2+$offset) if $pr; # from and to
+ push @$self, ($p1, $p2) if $pr; # from and to
return;
};
# Iterate over all single punctuation symbols
for (my $i = $p1; $i < $p2; $i++ ){
- push @$self, $i+$offset, $i+1+$offset; # from and to
+ push @$self, $i, $i+1; # from and to
};
};
diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index 8cfa0cf..7b740a1 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm
@@ -54,7 +54,7 @@
# Tokenize text in an external process
sub tokenize {
- my ($self, $txt, $offset) = @_;
+ my ($self, $txt) = @_;
return unless $self->{pid};
my $out = $self->{chld_in};
print $out $txt . $self->{sep};
diff --git a/script/tei2korapxml b/script/tei2korapxml
index c94e3cb..3390ff6 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -93,7 +93,6 @@
my $_tok_file_agg = "tokens_aggressive.xml";
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
- my ( $txt, $offset );
##
my $_tok_dir = "base"; # name of directory for storing tokenization files
@@ -350,13 +349,19 @@
$data =~ tr/\n\r/ /; # note: 2 blanks - otherwise offset data would become corrupt
#
- $data = encode_utf8( $data );
if ( $_GEN_TOK_EXT ){
- # TODO: $offset is only necessary for $cons_tok and $aggr_tok and as long as they're part of 'retr_info'
- $ext_tok->tokenize($data, $offset);
+
+ $ext_tok->tokenize($data);
+
+ } elsif ( $_GEN_TOK_INT ){
+
+ $cons_tok->tokenize($data);
+ $aggr_tok->tokenize($data);
}
+ $data = encode_utf8( $data );
+
print STDERR "DEBUG ($0): main(): Writing (utf8-formatted) xml file $_root_dir$dir/$_data_file\n" if $_DEBUG;
@@ -377,34 +382,30 @@
# ~ tokenization ~
- if ( $_GEN_TOK_EXT || $_GEN_TOK_INT ){
+ if ( $_GEN_TOK_EXT ) {
- if ( $_GEN_TOK_EXT ) {
+ $ext_tok->to_zip(
+ $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_ext"),
+ $text_id_esc
+ )
- $ext_tok->to_zip(
- $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_ext"),
- $text_id_esc
- );
+ } elsif ( $_GEN_TOK_INT ){
- } elsif ( $_GEN_TOK_INT ){
-
- # Output token streams to zip streams
- $cons_tok->to_zip(
- $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_con"),
- $text_id_esc
- );
- $aggr_tok->to_zip(
- $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_agg"),
- $text_id_esc
- );
- $offset = 0;
- $aggr_tok->reset;
- $cons_tok->reset;
- }
-
- #print STDERR "$0: write_tokenization(): DONE\n";
+ # Output token streams to zip streams
+ $cons_tok->to_zip(
+ $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_con"),
+ $text_id_esc
+ );
+ $aggr_tok->to_zip(
+ $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_agg"),
+ $text_id_esc
+ );
+ $aggr_tok->reset;
+ $cons_tok->reset;
}
+ #print STDERR "$0: write_tokenization(): DONE\n";
+
$data_fl = 0; $buf_in = $data = $dir = ""; # reinit.
} else { # $dir eq ""
@@ -1064,29 +1065,6 @@
$dl += length( $e->[1] ); # update length of $data
- if ( $_GEN_TOK_INT ){
-
- #~~~~~
- # from here: intern tokenization
- #~~~~~
-
-
- $txt = $e->[1];
-
- # TODO: implement outside retr_info() (like $ext_tok) on whole $data, instead on every text-node (more efficient and $offset not needed anymore)
- $cons_tok->tokenize($txt, $offset);
- $aggr_tok->tokenize($txt, $offset);
-
- $offset = $dl;
-
-
- #~~~~~
- # until here: intern tokenization
- #~~~~~
-
- }
-
-
#~~~~~
# until here: text- and whitespace-nodes
#~~~~~
diff --git a/t/cmd/tokenizer.pl b/t/cmd/tokenizer.pl
index e484160..ad286df 100644
--- a/t/cmd/tokenizer.pl
+++ b/t/cmd/tokenizer.pl
@@ -7,8 +7,6 @@
};
use KorAP::XML::TEI::Tokenizer::Aggressive;
-use open qw(:std :utf8); # assume utf-8 encoding
-
$| = 1;
# Init tokenizer
diff --git a/t/script.t b/t/script.t
index 85c2cea..2bb00bb 100644
--- a/t/script.t
+++ b/t/script.t
@@ -183,14 +183,16 @@
# Tokenize with external tokenizer
my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
+my ($fh2, $outzip2) = tempfile("KorAP-XML-TEI_script_XXXXXXXXXX", SUFFIX => ".tmp", TMPDIR => 1, UNLINK => $_UNLINK);
+
stderr_like(
- sub { `cat '$file' | perl '$script' --tc='perl $cmd' > '$outzip'` },
+ sub { `cat '$file' | perl '$script' --tc='perl $cmd' > '$outzip2'` },
qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
'Processing'
);
# Uncompress GOE/AGA/00000/base/tokens.xml from zip file
-$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens.xml');
+$zip = IO::Uncompress::Unzip->new($outzip2, Name => 'GOE/AGA/00000/base/tokens.xml');
# Read GOE/AGA/00000/base/tokens.xml
$tokens_xml = '';
@@ -216,19 +218,22 @@
# TODO: call $script with approp. parameter for internal tokenization (actual: '$_GEN_TOK_INT = 1' hardcoded)
+my ($fh3, $outzip3) = tempfile("KorAP-XML-TEI_script_XXXXXXXXXX", SUFFIX => ".tmp", TMPDIR => 1, UNLINK => $_UNLINK);
+
+
# ~ test conservative tokenization ~
$file = catfile($f, 'data', 'text_with_blanks.i5.xml');
stderr_like(
- sub { `cat '$file' | perl '$script' > '$outzip'` },
+ sub { `cat '$file' | perl '$script' > '$outzip3'` },
qr!tei2korapxml: .*? text_id=CORP_DOC.00001!,
'Processing'
);
-ok(-e $outzip, "File $outzip exists");
+ok(-e $outzip3, "File $outzip3 exists");
-$zip = IO::Uncompress::Unzip->new($outzip, Name => 'CORP/DOC/00001/base/tokens_conservative.xml');
+$zip = IO::Uncompress::Unzip->new($outzip3, Name => 'CORP/DOC/00001/base/tokens_conservative.xml');
ok($zip, 'Zip-File is created');
@@ -265,7 +270,7 @@
# ~ test aggressive tokenization ~
-$zip = IO::Uncompress::Unzip->new($outzip, Name => 'CORP/DOC/00001/base/tokens_aggressive.xml');
+$zip = IO::Uncompress::Unzip->new($outzip3, Name => 'CORP/DOC/00001/base/tokens_aggressive.xml');
ok($zip, 'Zip-File is created');
diff --git a/xt/benchmark.pl b/xt/benchmark.pl
index c1657a6..9d0489a 100644
--- a/xt/benchmark.pl
+++ b/xt/benchmark.pl
@@ -106,14 +106,14 @@
Dumbbench::Instance::PerlSub->new(
name => 'Tokenizer-conservative',
code => sub {
- $result = $cons_tok->reset->tokenize($t_data, 0);
+ $result = $cons_tok->reset->tokenize($t_data);
$result = 0;
}
),
Dumbbench::Instance::PerlSub->new(
name => 'Tokenizer-aggressive',
code => sub {
- $result = $aggr_tok->reset->tokenize($t_data, 0);
+ $result = $aggr_tok->reset->tokenize($t_data);
$result = 0;
}
),