Make tokenization chainable and remove unnecessary tokenization switch
Change-Id: Iaf47a0fcad225931c941fd7379c02e51a428be07
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
index 5099eeb..c36b605 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
@@ -37,7 +37,7 @@
};
};
- return;
+ return $self;
};
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
index 0efb648..237d87f 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
@@ -32,7 +32,7 @@
$self->_add_surroundings($txt, $-[3], $+[3]) if $3;
};
- return
+ return $self;
};
diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index 7b740a1..4fe3751 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm
@@ -58,6 +58,7 @@
return unless $self->{pid};
my $out = $self->{chld_in};
print $out $txt . $self->{sep};
+ return $self;
};
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 3875200..318051a 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -317,16 +317,32 @@
#
+ # ~ tokenization ~
+
if ( $_GEN_TOK_EXT ){
- $ext_tok->tokenize($data);
+ $ext_tok->tokenize($data)->to_zip(
+ $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_ext"),
+ $text_id_esc
+ );
}
if ( $_GEN_TOK_INT ){
- $cons_tok->tokenize($data);
- $aggr_tok->tokenize($data);
+ # Tokenize and output
+ $cons_tok->tokenize($data)->to_zip(
+ $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_con"),
+ $text_id_esc
+ );
+
+ $aggr_tok->tokenize($data)->to_zip(
+ $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_agg"),
+ $text_id_esc
+ );
+
+ $aggr_tok->reset;
+ $cons_tok->reset;
}
$data = encode_utf8( $data );
@@ -348,33 +364,6 @@
write_tokens() if $_TOKENS_PROC && @tokens;
-
- # ~ tokenization ~
-
- if ( $_GEN_TOK_EXT ) {
-
- $ext_tok->to_zip(
- $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_ext"),
- $text_id_esc
- )
-
- }
-
- if ( $_GEN_TOK_INT ){
-
- # Output token streams to zip streams
- $cons_tok->to_zip(
- $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_con"),
- $text_id_esc
- );
- $aggr_tok->to_zip(
- $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_agg"),
- $text_id_esc
- );
- $aggr_tok->reset;
- $cons_tok->reset;
- }
-
#print STDERR "$0: write_tokenization(): DONE\n";
$data_fl = 0; $buf_in = $data = $dir = ""; # reinit.
diff --git a/t/tokenization.t b/t/tokenization.t
index b132a63..92b7cc3 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t
@@ -24,6 +24,12 @@
$aggr->reset->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply($aggr, [0,3,4,8,9,12,12,13,14,17,18,22,22,23,23,31,32,36]);
+like(
+ $aggr->reset->tokenize("Der")->to_string('a'),
+ qr!id="t_0"!,
+ 'Chainable'
+);
+
# Test conservative
my $cons = KorAP::XML::TEI::Tokenizer::Conservative->new;
$cons->tokenize("Der alte Mann");