Remove unnecessary call to main()
Change-Id: Ib8be6ce7f812e7ef00acefcc694c3cae4b994b01
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 174c439..07d3a7d 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -199,280 +199,268 @@
# ~ read input and write output (text by text) ~
-main();
+my ( $pfx, $sfx );
-#
-# ~~~ subs ~~~
-#
+my $tl = 0; # text line (needed for whitespace handling)
+$input_fh = *STDIN; # input file handle (default: stdin)
-sub main {
+# Maybe not necessary
+$data->reset;
- my ( $pfx, $sfx );
+$dir = "";
- my $tl = 0; # text line (needed for whitespace handling)
+if ( $input_fname ne '' ){
+ unless (open($input_fh, '<', $input_fname)) {
+ die $log->fatal("File '$input_fname' could not be opened.");
+ };
+}
- $input_fh = *STDIN; # input file handle (default: stdin)
+# prevents segfaulting of 'XML::LibXML::Reader' inside 'main()' - see notes on 'PerlIO layers' in 'man XML::LibXML')
+# removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
+# see in perluniintro: You can switch encodings on an already opened stream by using "binmode()
+# see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.
+binmode $input_fh;
- # Maybe not necessary
- $data->reset;
+my $pos;
+my $l = length('</' . $_TEXT_BODY) + 1;
- $dir = "";
+# ~ loop (reading input document) ~
- if ( $input_fname ne '' ){
- unless (open($input_fh, '<', $input_fname)) {
- die $log->fatal("File '$input_fname' could not be opened.");
+MAIN: while ( <$input_fh> ){
+
+ $_ = remove_xml_comments( $input_fh, $_ ); # remove HTML (multi-line) comments (<!--...-->)
+
+ if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
+
+ # ~ start of text body ~
+
+ $pfx = $1;
+ $sfx = $2;
+
+ if ($pfx !~ /^\s*$/ || $sfx !~ /^\s*$/) {
+ die $log->fatal("input line number $.: " .
+ "line with opening text-body tag '${_TEXT_BODY}' " .
+ "contains additional information ... => Aborting (line=$_)");
};
- }
- # prevents segfaulting of 'XML::LibXML::Reader' inside 'main()' - see notes on 'PerlIO layers' in 'man XML::LibXML')
- # removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
- # see in perluniintro: You can switch encodings on an already opened stream by using "binmode()
- # see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.
- binmode $input_fh;
+ # text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
+ my $buf_in = '';
- my $pos;
- my $l = length('</' . $_TEXT_BODY) + 1;
+ # Iterate over all lines in the text body
+ while (<$input_fh>) {
- # ~ loop (reading input document) ~
+ $_ = remove_xml_comments( $input_fh, $_ );
- MAIN: while ( <$input_fh> ){
+ # ~ end of text body ~
+ if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
- $_ = remove_xml_comments( $input_fh, $_ ); # remove HTML (multi-line) comments (<!--...-->)
+ # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files (s.a.: $_tok_file_ext, $_tok_file_con, $_tok_file_agg)
+ if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
+ die $log->fatal("input line number $.: " .
+ "line with closing text-body tag '${_TEXT_BODY}'".
+ " contains additional information ... => Aborting (line=$_)");
+ };
- if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
+ if ($dir ne "") {
- # ~ start of text body ~
+ $reader = XML::LibXML::Reader->new( string => "<text>$buf_in</text>", huge => 1 );
- $pfx = $1;
- $sfx = $2;
+ # ~ whitespace handling ~
+ #
+ # Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
+ # (see function 'retr_info()').
+ #
+ # Definition of significant and insignificant whitespace
+ # (source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
+ #
+ # Significant whitespace is part of the document content and should be preserved.
+ # Insignificant whitespace is used when editing XML documents for readability.
+ # These whitespaces are typically not intended for inclusion in the delivery of the document.
+ #
- if ($pfx !~ /^\s*$/ || $sfx !~ /^\s*$/) {
- die $log->fatal("input line number $.: " .
- "line with opening text-body tag '${_TEXT_BODY}' " .
- "contains additional information ... => Aborting (line=$_)");
- };
+ my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;
- # text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
- my $buf_in = '';
+ # _XCT_LINE_NUMBERS is only for debugging
+ $param |= XCT_LINE_NUMBERS if $_XCT_LN;
+ $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, $param);
- # Iterate over all lines in the text body
- while (<$input_fh>) {
+ $structures->reset;
- $_ = remove_xml_comments( $input_fh, $_ );
+ $tokens->reset if $_TOKENS_PROC;
- # ~ end of text body ~
- if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
+ # ~ whitespace related issue ~
+ $add_one = 0;
+ %ws = ();
- # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files (s.a.: $_tok_file_ext, $_tok_file_con, $_tok_file_agg)
+ # ~ recursion ~
+ retr_info(1, \$tree_data->[2] ); # parse input data
- if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
- die $log->fatal("input line number $.: " .
- "line with closing text-body tag '${_TEXT_BODY}'".
- " contains additional information ... => Aborting (line=$_)");
+ if ($_DEBUG) {
+ $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
};
- if ( $dir ne "" ){
+ # ~ write data.xml ~
+ $data->to_zip(
+ $zipper->new_stream("$dir/${_data_file}"),
+ $text_id_esc
+ );
- $reader = XML::LibXML::Reader->new( string => "<text>$buf_in</text>", huge => 1 );
+ # ~ tokenization ~
+ if ($_GEN_TOK_EXT) {
- # ~ whitespace handling ~
- #
- # Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
- # (see function 'retr_info()').
- #
- # Definition of significant and insignificant whitespace
- # (source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
- #
- # Significant whitespace is part of the document content and should be preserved.
- # Insignificant whitespace is used when editing XML documents for readability.
- # These whitespaces are typically not intended for inclusion in the delivery of the document.
- #
+ # Tokenize and output
+ $ext_tok->tokenize($data->data)->to_zip(
+ $zipper->new_stream("$dir/$_tok_dir/$_tok_file_ext"),
+ $text_id_esc
+ );
+ };
- my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;
+ if ($_GEN_TOK_INT) {
- # _XCT_LINE_NUMBERS is only for debugging
- $param |= XCT_LINE_NUMBERS if $_XCT_LN;
- $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, $param);
-
- $structures->reset;
-
- $tokens->reset if $_TOKENS_PROC;
-
- # ~ whitespace related issue ~
- $add_one = 0;
- %ws = ();
-
-
- # ~ recursion ~
- retr_info(1, \$tree_data->[2] ); # parse input data
-
- if ($_DEBUG) {
- $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
- };
-
- # ~ write data.xml ~
- $data->to_zip(
- $zipper->new_stream("$dir/${_data_file}"),
+ # Tokenize and output
+ $cons_tok->tokenize($data->data)->to_zip(
+ $zipper->new_stream("$dir/$_tok_dir/$_tok_file_con"),
$text_id_esc
);
- # ~ tokenization ~
- if ($_GEN_TOK_EXT) {
+ $aggr_tok->tokenize($data->data)->to_zip(
+ $zipper->new_stream("$dir/$_tok_dir/$_tok_file_agg"),
+ $text_id_esc
+ );
- # Tokenize and output
- $ext_tok->tokenize($data->data)->to_zip(
- $zipper->new_stream("$dir/$_tok_dir/$_tok_file_ext"),
- $text_id_esc
- );
- };
+ $aggr_tok->reset;
+ $cons_tok->reset;
+ };
- if ($_GEN_TOK_INT) {
+ # ~ write structures ~
+ if (!$structures->empty) {
+ $structures->to_zip(
+ $zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
+ $text_id_esc,
+ 2 # = structure serialization
+ );
+ };
- # Tokenize and output
- $cons_tok->tokenize($data->data)->to_zip(
- $zipper->new_stream("$dir/$_tok_dir/$_tok_file_con"),
- $text_id_esc
- );
+ # ~ write tokens ~
+ if ($_TOKENS_PROC && !$tokens->empty) {
+ $tokens->to_zip(
+ $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
+ $text_id_esc,
+ $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
+ );
+ };
- $aggr_tok->tokenize($data->data)->to_zip(
- $zipper->new_stream("$dir/$_tok_dir/$_tok_file_agg"),
- $text_id_esc
- );
+ $dir = ""; # reinit.
- $aggr_tok->reset;
- $cons_tok->reset;
- };
+ # Maybe not necessary
+ $data->reset;
- # ~ write structures ~
- if (!$structures->empty) {
- $structures->to_zip(
- $zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
- $text_id_esc,
- 2 # = structure serialization
- );
- };
+ } else { # $dir eq ""
- # ~ write tokens ~
- if ($_TOKENS_PROC && !$tokens->empty) {
- $tokens->to_zip(
- $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
- $text_id_esc,
- $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
- );
- };
-
-
- $dir = ""; # reinit.
-
- # Maybe not necessary
- $data->reset;
-
- } else { # $dir eq ""
-
- $log->warn("Maybe empty textSigle => skipping this text ...\ndata=$data");
- }
-
- next MAIN;
- };
-
- # ~ inside text body ~
-
- # ~ whitespace handling ~
-
- # The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks where inserted
- # into the text with the addition that maybe (or not) whitespace before those linebreaks was unintenionally stripped.
- #
- # It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
- # example further down and notes on 'Input restrictions' in the manpage).
- #
- # Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
- #
- # TODO: Maybe it's best, to keep the stripping of whitespace and to just remove the if-clause and to insert a blank by default (with possibly
- # an option on how newlines in primary text should be handled (stripped or replaced by a whitespace)).
- #
- # Examples (how primary text with linebreaks would be converted by below code):
- #
- # '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
- # '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '<w>,<w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>'.
-
- s/^\s+//; s/\s+$//; # remove consecutive whitespace at beginning and end (mostly one newline)
-
- ### NOTE: this is only relevant, if a text consists of more than one line
- ### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
- ### do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)
- if (m/<[^>]+>[^<]/){ # line contains at least one tag with at least one character contents
-
- # NOTE: not stringent ('...' stands for text):
- #
- # beg1............................end1 => no blank before 'beg1'
- # beg2....<pb/>...................end2 => no blank before 'beg2'
- # beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
- # beg4....<test>ok</test>.........end4 => blank before 'beg4'
- #
- # => beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
- # ^
- # |_blank between 'end3' and 'beg4'
-
- $tl++; # counter for text lines
-
- s/^(.)/ $1/ if $tl > 1; # insert blank before 1st character (for 2nd line and consecutive lines)
+ $log->warn("Maybe empty textSigle => skipping this text ...\ndata=$data");
}
- ###
- # add line to buffer
- $buf_in .= $_;
+ next MAIN;
};
- } elsif (m#^(.*)(<(?:${_TEXT_HEADER_BEG}|${_DOC_HEADER_BEG}|${_CORP_HEADER_BEG}).*)$#) {
+ # ~ inside text body ~
- # ~ start of header ~
- $pfx = $1;
- my $content = "$2\n";
+ # ~ whitespace handling ~
- if ($pfx !~ /^\s*$/) {
- die $log->fatal("input line number $.: " .
- "line with opening header tag" .
- " is not in expected format ... => Aborting (line=$_)");
- };
+ # The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks where inserted
+ # into the text with the addition that maybe (or not) whitespace before those linebreaks was unintenionally stripped.
+ #
+ # It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
+ # example further down and notes on 'Input restrictions' in the manpage).
+ #
+ # Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
+ #
+ # TODO: Maybe it's best, to keep the stripping of whitespace and to just remove the if-clause and to insert a blank by default (with possibly
+ # an option on how newlines in primary text should be handled (stripped or replaced by a whitespace)).
+ #
+ # Examples (how primary text with linebreaks would be converted by below code):
+ #
+ # '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
+ # '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '<w>,<w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>'.
- # Parse header
- my $header = KorAP::XML::TEI::Header->new($content)->parse($input_fh);
+ s/^\s+//; s/\s+$//; # remove consecutive whitespace at beginning and end (mostly one newline)
- # Header was parseable
- if ($header) {
+ ### NOTE: this is only relevant, if a text consists of more than one line
+ ### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
+ ### do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)
+ if (m/<[^>]+>[^<]/) { # line contains at least one tag with at least one character contents
- # Write header to zip
- my $file = $header->dir . '/' . $_header_file;
+ # NOTE: not stringent ('...' stands for text):
+ #
+ # beg1............................end1 => no blank before 'beg1'
+ # beg2....<pb/>...................end2 => no blank before 'beg2'
+ # beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
+ # beg4....<test>ok</test>.........end4 => blank before 'beg4'
+ #
+ # => beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
+ # ^
+ # |_blank between 'end3' and 'beg4'
- $log->debug("Writing file $file") if $_DEBUG;
+ $tl++; # counter for text lines
- $header->to_zip($zipper->new_stream($file));
+ s/^(.)/ $1/ if $tl > 1; # insert blank before 1st character (for 2nd line and consecutive lines)
+ }
+ ###
- # Header is for text level
- if ($header->type eq 'text') {
+ # add line to buffer
+ $buf_in .= $_;
+ };
- # Remember dir and sigles
- $dir = $header->dir;
- $text_id = $header->id;
- $text_id_esc = $header->id_esc;
+ } elsif (m#^(.*)(<(?:${_TEXT_HEADER_BEG}|${_DOC_HEADER_BEG}|${_CORP_HEADER_BEG}).*)$#) {
- # log output for seeing progression
- $log->notice("$0: main(): text_id=".decode('UTF-8', $text_id));
+ # ~ start of header ~
+ $pfx = $1;
+ my $content = "$2\n";
- $tl = 0; # reset (needed for ~ whitespace handling ~)
- }
+ if ($pfx !~ /^\s*$/) {
+ die $log->fatal("input line number $.: " .
+ "line with opening header tag" .
+ " is not in expected format ... => Aborting (line=$_)");
+ };
+
+ # Parse header
+ my $header = KorAP::XML::TEI::Header->new($content)->parse($input_fh);
+
+ # Header was parseable
+ if ($header) {
+
+ # Write header to zip
+ my $file = $header->dir . '/' . $_header_file;
+
+ $log->debug("Writing file $file") if $_DEBUG;
+
+ $header->to_zip($zipper->new_stream($file));
+
+ # Header is for text level
+ if ($header->type eq 'text') {
+
+ # Remember dir and sigles
+ $dir = $header->dir;
+ $text_id = $header->id;
+ $text_id_esc = $header->id_esc;
+
+ # log output for seeing progression
+ $log->notice("$0: main(): text_id=".decode('UTF-8', $text_id));
+
+ $tl = 0; # reset (needed for ~ whitespace handling ~)
}
}
- } #end: while
+ }
+} #end: while
- $zipper->close;
+$zipper->close;
- $ext_tok->close if $_GEN_TOK_EXT;
+$ext_tok->close if $_GEN_TOK_EXT;
-} # end: sub main
+exit(0);
sub retr_info { # called from main()