Reduce indentation level by skipping texts with missing text IDs early
Change-Id: Ief41eaf446dd083abd3d63701a417279f5d9ec90
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 9aca1f6..8d15173 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -202,7 +202,7 @@
# Maybe not necessary
$data->reset;
-$dir = "";
+$dir = '';
if ( $input_fname ne '' ){
unless (open($input_fh, '<', $input_fname)) {
@@ -266,101 +266,100 @@
" contains additional information ... => Aborting (line=$_)");
};
- if ($dir ne "") {
+ if ($dir eq '') {
+ $log->warn("Maybe empty textSigle => skipping this text ...\ndata=" . substr($data->data, 0, 200));
+ next MAIN;
+ };
- my $reader = XML::LibXML::Reader->new(
- string => "<text>$buf_in</text>",
- huge => 1
+ my $reader = XML::LibXML::Reader->new(
+ string => "<text>$buf_in</text>",
+ huge => 1
+ );
+
+ # See notes on whitespace handling
+ my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;
+
+ # XCT_LINE_NUMBERS is only needed for debugging
+ # (see XML::CompactTree::XS)
+ $param |= XCT_LINE_NUMBERS if DEBUG;
+ my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, $param);
+
+ $structures->reset;
+
+ $tokens->reset if $_TOKENS_PROC;
+
+ # ~ whitespace related issue ~
+ $add_one = 0;
+ %ws = ();
+
+ # ~ recursion ~
+ retr_info(1, \$tree_data->[2] ); # parse input data
+
+ if (DEBUG) {
+ $log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml");
+ };
+
+ # ~ write data.xml ~
+ $data->to_zip(
+ $zipper->new_stream("$dir/${_data_file}.xml"),
+ $text_id_esc
+ );
+
+ # ~ tokenization ~
+ if ($_GEN_TOK_EXT) {
+
+ # Tokenize and output
+ $ext_tok->tokenize($data->data)->to_zip(
+ $zipper->new_stream("$dir/$_tok_dir/${_tok_file_ext}.xml"),
+ $text_id_esc
);
+ };
- # See notes on whitespace handling
- my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;
+ if ($_GEN_TOK_INT) {
- # XCT_LINE_NUMBERS is only needed for debugging
- # (see XML::CompactTree::XS)
- $param |= XCT_LINE_NUMBERS if DEBUG;
- my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, $param);
-
- $structures->reset;
-
- $tokens->reset if $_TOKENS_PROC;
-
- # ~ whitespace related issue ~
- $add_one = 0;
- %ws = ();
-
- # ~ recursion ~
- retr_info(1, \$tree_data->[2] ); # parse input data
-
- if (DEBUG) {
- $log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml");
- };
-
- # ~ write data.xml ~
- $data->to_zip(
- $zipper->new_stream("$dir/${_data_file}.xml"),
+ # Tokenize and output
+ $cons_tok->tokenize($data->data)->to_zip(
+ $zipper->new_stream("$dir/$_tok_dir/" . $cons_tok->name . '.xml'),
$text_id_esc
);
- # ~ tokenization ~
- if ($_GEN_TOK_EXT) {
+ $aggr_tok->tokenize($data->data)->to_zip(
+ $zipper->new_stream("$dir/$_tok_dir/" . $aggr_tok->name . '.xml'),
+ $text_id_esc
+ );
- # Tokenize and output
- $ext_tok->tokenize($data->data)->to_zip(
- $zipper->new_stream("$dir/$_tok_dir/${_tok_file_ext}.xml"),
- $text_id_esc
- );
- };
+ $aggr_tok->reset;
+ $cons_tok->reset;
+ };
- if ($_GEN_TOK_INT) {
-
- # Tokenize and output
- $cons_tok->tokenize($data->data)->to_zip(
- $zipper->new_stream("$dir/$_tok_dir/" . $cons_tok->name . '.xml'),
- $text_id_esc
- );
-
- $aggr_tok->tokenize($data->data)->to_zip(
- $zipper->new_stream("$dir/$_tok_dir/" . $aggr_tok->name . '.xml'),
- $text_id_esc
- );
-
- $aggr_tok->reset;
- $cons_tok->reset;
- };
-
- if ($use_tokenizer_sentence_splits) {
- $ext_tok->sentencize_from_previous_input($structures);
- }
-
- # ~ write structures ~
- if (!$structures->empty) {
- $structures->to_zip(
- $zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
- $text_id_esc,
- 2 # = structure serialization
- );
- };
-
- # ~ write tokens ~
- if ($_TOKENS_PROC && !$tokens->empty) {
- $tokens->to_zip(
- $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
- $text_id_esc,
- $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
- );
- };
-
- $dir = ""; # reinit.
-
- # Maybe not necessary
- $data->reset;
-
- } else { # $dir eq ""
-
- $log->warn("Maybe empty textSigle => skipping this text ...\ndata=$data");
+ if ($use_tokenizer_sentence_splits) {
+ $ext_tok->sentencize_from_previous_input($structures);
}
+ # ~ write structures ~
+ if (!$structures->empty) {
+ $structures->to_zip(
+ $zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
+ $text_id_esc,
+ 2 # = structure serialization
+ );
+ };
+
+ # ~ write tokens ~
+ if ($_TOKENS_PROC && !$tokens->empty) {
+ $tokens->to_zip(
+ $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
+ $text_id_esc,
+ $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
+ );
+ };
+
+ # Reset $dir so a following text without a text sigle is detected and skipped
+ $dir = '';
+
+ # Maybe not necessary
+ $data->reset;
+
next MAIN;
};