Make tokenization chainable and remove unnecessary tokenization switch
Change-Id: Iaf47a0fcad225931c941fd7379c02e51a428be07
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 3875200..318051a 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -317,16 +317,32 @@
#
+ # ~ tokenization ~
+
if ( $_GEN_TOK_EXT ){
- $ext_tok->tokenize($data);
+ $ext_tok->tokenize($data)->to_zip(
+ $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_ext"),
+ $text_id_esc
+ );
}
if ( $_GEN_TOK_INT ){
- $cons_tok->tokenize($data);
- $aggr_tok->tokenize($data);
+ # Tokenize and output
+ $cons_tok->tokenize($data)->to_zip(
+ $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_con"),
+ $text_id_esc
+ );
+
+ $aggr_tok->tokenize($data)->to_zip(
+ $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_agg"),
+ $text_id_esc
+ );
+
+ $aggr_tok->reset;
+ $cons_tok->reset;
}
$data = encode_utf8( $data );
@@ -348,33 +364,6 @@
write_tokens() if $_TOKENS_PROC && @tokens;
-
- # ~ tokenization ~
-
- if ( $_GEN_TOK_EXT ) {
-
- $ext_tok->to_zip(
- $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_ext"),
- $text_id_esc
- )
-
- }
-
- if ( $_GEN_TOK_INT ){
-
- # Output token streams to zip streams
- $cons_tok->to_zip(
- $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_con"),
- $text_id_esc
- );
- $aggr_tok->to_zip(
- $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_agg"),
- $text_id_esc
- );
- $aggr_tok->reset;
- $cons_tok->reset;
- }
-
#print STDERR "$0: write_tokenization(): DONE\n";
$data_fl = 0; $buf_in = $data = $dir = ""; # reinit.