Parse text in separate text body loop (similar to headers) Change-Id: Ieac55fd0c2c8e8a5af0491a5cabfc4320bef9691

commit: 598d1a70953b6cd33c3952b39c7778a8ce579586 [log] [tgz]
author: Akron <nils@diewald-online.de> Sun Aug 02 17:33:31 2020 +0200
committer: Peter Harders <harders@ids-mannheim.de> Thu Sep 24 11:16:39 2020 +0200
tree: 50bffcac093acbac39a04f92a4ce35c4ff7c367a
parent: 3bdc0a36820036911421e4d12e7a12955b6f75d5 [diff] [blame]
diff --git a/script/tei2korapxml b/script/tei2korapxml
index ce84a4b..0546658 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -142,15 +142,12 @@
 my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);
 my $input_fh;                                        # input file handle (default: stdin)
 
-my $buf_in;                                          # text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
 my $data;                                            # contains the primary text (created by func. 'retr_info' from $buf_in), which is written to '$data_file'
 
 my $dir;                                             # text     directory (below $_root_dir)
 
 my ( $text_id, $text_id_esc );                       # '$text_id_esc' = escaped version of $text_id (see %ent)
 
-my ( $data_fl );
-
 my ( $data_prfx1, $data_prfx2, $data_sfx );          # $data_* are written to $_data_file
 
 
@@ -229,9 +226,7 @@
 
   $input_fh = *STDIN;  # input file handle (default: stdin)
 
-  $data_fl = 0;
-
-  $buf_in = $data = $dir = "";
+  $data = $dir = "";
 
 
   if ( $input_fname ne '' ){
@@ -252,201 +247,199 @@
 
   # ~ loop (reading input document) ~
 
-  while ( <$input_fh> ){
+  MAIN: while ( <$input_fh> ){
 
-    # TODO: yet not tested fo big amounts of data
-    # must-have, otherwise comments in input could be fatal (e.g.: ...<!--\n<idsHeader...\n-->...)
-    $_ = remove_xml_comments( $input_fh, $_ ); # remove HTML comments (<!--...-->)
-
-    if ( $data_fl && ($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
-
-      # ~ end of text body ~
+    $_ = remove_xml_comments( $input_fh, $_ ); # remove HTML (multi-line) comments (<!--...-->)
 
 
-      # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files (s.a.: $_tok_file_ext, $_tok_file_con, $_tok_file_agg)
-
-      die "ERROR ($0): main(): input line number $.: line with closing text-body tag '${_TEXT_BODY}'"
-        ." contains additional information ... => Aborting\n\tline=$_"
-        if (substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/;
-
-      if ( $dir ne "" ){
-
-        $reader = XML::LibXML::Reader->new( string => "<text>$buf_in</text>", huge => 1 );
-
-        #  ~ whitespace handling ~
-        #
-        #  Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
-        #   (see function 'retr_info()').
-        #
-        #  Definition of significant and insignificant whitespace
-        #   (source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
-        #
-        #   Significant whitespace is part of the document content and should be preserved.
-        #   Insignificant whitespace is used when editing XML documents for readability.
-        #    These whitespaces are typically not intended for inclusion in the delivery of the document.
-        #
-        if ( $_XCT_LN ){ # _XCT_LINE_NUMBERS is only for debugging
-          $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY | XCT_LINE_NUMBERS );
-        } else {
-          $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY );
-        }
-
-        @structures = (); @oti = ();
-
-        if ( $_TOKENS_PROC ){
-          $tokens->reset;
-        }
-
-        $dl = 0;
-
-        # ~ whitespace related issue ~
-        $add_one = 0;
-        %ws = ();
-
-
-        # ~ recursion ~
-        retr_info(1, \$tree_data->[2] ); # parse input data
-
-
-        # ~ write data.xml ~
-
-        # TODO: should not be necessary, because whitespace at the end of every input line is removed: see 'whitespace handling' inside text body
-        #   (...elsif ( $data_fl )....)
-        $data =~ tr/\n\r/  /; # note: 2 blanks - otherwise offset data would become corrupt
-        #
-
-
-        # ~ tokenization ~
-
-        if ( $_GEN_TOK_EXT ){
-
-          # Tokenize and output
-          $ext_tok->tokenize($data)->to_zip(
-            $zipper->new_stream("$dir/$_tok_dir/$_tok_file_ext"),
-            $text_id_esc
-          );
-        };
-
-        if ( $_GEN_TOK_INT ){
-
-          # Tokenize and output
-          $cons_tok->tokenize($data)->to_zip(
-            $zipper->new_stream("$dir/$_tok_dir/$_tok_file_con"),
-            $text_id_esc
-          );
-
-          $aggr_tok->tokenize($data)->to_zip(
-            $zipper->new_stream("$dir/$_tok_dir/$_tok_file_agg"),
-            $text_id_esc
-          );
-
-          $aggr_tok->reset;
-          $cons_tok->reset;
-        };
-
-        # Encode and escape data
-        $data = escape_xml(encode( "UTF-8", $data ));
-        # note: the index still refers to the 'single character'-versions,
-        # which are counted as 1 (search for '&amp;' in data.xml and see
-        # corresponding indices in $_tokens_file)
-
-        if ($_DEBUG) {
-          $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
-        };
-
-        $zipper->new_stream("$dir/$_data_file")
-          ->print("$data_prfx1$text_id_esc$data_prfx2$data$data_sfx");
-
-        # ~ write structures ~
-
-        write_structures() if @structures;
-
-
-        # ~ write tokens ~
-
-        if ($_TOKENS_PROC && !$tokens->empty) {
-          $tokens->to_zip(
-            $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
-            $text_id_esc,
-            $_INLINE_ANNOT
-          );
-        };
-
-        #print STDERR "$0: write_tokenization(): DONE\n";
-
-        $data_fl = 0; $buf_in = $data = $dir = ""; # reinit.
-
-      } else { # $dir eq ""
-        $log->warn("Maybe empty textSigle => skipping this text ...\ndata=$data");
-      }
-
-    } elsif ( $data_fl ){
-
-
-      # ~ inside text body ~
-
-
-      #print STDERR "inside text body (\$data_fl set)\n";
-
-      # ~ whitespace handling ~
-
-      # The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks where inserted
-      #  into the text with the addition that maybe (or not) whitespace before those linebreaks was unintenionally stripped.
-      #
-      # It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
-      #  example further down and notes on 'Input restrictions' in the manpage).
-      #
-      # Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
-      #
-      # TODO: Maybe it's best, to keep the stripping of whitespace and to just remove the if-clause and to insert a blank by default (with possibly
-      #  an option on how newlines in primary text should be handled (stripped or replaced by a whitespace)).
-      #
-      # Examples (how primary text with linebreaks would be converted by below code):
-      #
-      #  '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
-      #  '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '<w>,<w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>'.
-
-      s/^\s+//; s/\s+$//; # remove consecutive whitespace at beginning and end (mostly one newline)
-
-      ### NOTE: this is only relevant, if a text consists of more than one line
-      ### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
-      ###  do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)
-      if ( m/<[^>]+>[^<]/ ){ # line contains at least one tag with at least one character contents
-
-        # NOTE: not stringent ('...' stands for text):
-        #
-        #   beg1............................end1  => no blank before 'beg1'
-        #   beg2....<pb/>...................end2  => no blank before 'beg2'
-        #   beg3....<info attr1="val1"/>....end3  => no blank before 'beg3'
-        #   beg4....<test>ok</test>.........end4  =>    blank before 'beg4'
-        #
-        #     =>  beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
-        #                                                                            ^
-        #                                                                            |_blank between 'end3' and 'beg4'
-
-        $tl++; # counter for text lines
-
-        s/^(.)/ $1/ if $tl > 1; # insert blank before 1st character (for 2nd line and consecutive lines)
-      }
-      ###
-
-      # add line to buffer
-      $buf_in .= $_;
-
-    } elsif ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
+    if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
 
       # ~ start of text body ~
 
-      #print STDERR "inside text body\n";
-
       $pfx = $1; $sfx = $2;
 
-      $data_fl = 1;
-
       die "ERROR ($0): main(): input line number $.: line with opening text-body tag '${_TEXT_BODY}'"
         ." contains additional information ... => Aborting\n\tline=$_"
           if $pfx !~ /^\s*$/ || $sfx !~ /^\s*$/;
 
+      # text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
+      my $buf_in = '';
+
+      # Iterate over all lines in the text body
+      while (<$input_fh>) {
+
+        $_ = remove_xml_comments( $input_fh, $_ );
+
+        # ~ end of text body ~
+        if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
+
+          # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files (s.a.: $_tok_file_ext, $_tok_file_con, $_tok_file_agg)
+
+          die "ERROR ($0): main(): input line number $.: line with closing text-body tag '${_TEXT_BODY}'"
+            ." contains additional information ... => Aborting\n\tline=$_"
+            if (substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/;
+
+          if ( $dir ne "" ){
+
+            $reader = XML::LibXML::Reader->new( string => "<text>$buf_in</text>", huge => 1 );
+
+            #  ~ whitespace handling ~
+            #
+            #  Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
+            #   (see function 'retr_info()').
+            #
+            #  Definition of significant and insignificant whitespace
+            #   (source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
+            #
+            #   Significant whitespace is part of the document content and should be preserved.
+            #   Insignificant whitespace is used when editing XML documents for readability.
+            #    These whitespaces are typically not intended for inclusion in the delivery of the document.
+            #
+            if ( $_XCT_LN ){ # _XCT_LINE_NUMBERS is only for debugging
+              $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY | XCT_LINE_NUMBERS );
+            } else {
+              $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY );
+            }
+
+            @structures = (); @oti = ();
+
+            if ( $_TOKENS_PROC ){
+              $tokens->reset;
+            }
+
+            $dl = 0;
+
+            # ~ whitespace related issue ~
+            $add_one = 0;
+            %ws = ();
+
+
+            # ~ recursion ~
+            retr_info(1, \$tree_data->[2] ); # parse input data
+
+
+            # ~ write data.xml ~
+
+            # TODO: should not be necessary, because whitespace at the end of every input line is removed: see 'whitespace handling' inside text body
+            $data =~ tr/\n\r/  /; # note: 2 blanks - otherwise offset data would become corrupt
+            #
+
+
+            # ~ tokenization ~
+
+            if ( $_GEN_TOK_EXT ){
+
+              # Tokenize and output
+              $ext_tok->tokenize($data)->to_zip(
+                $zipper->new_stream("$dir/$_tok_dir/$_tok_file_ext"),
+                $text_id_esc
+              );
+            };
+
+            if ( $_GEN_TOK_INT ){
+
+              # Tokenize and output
+              $cons_tok->tokenize($data)->to_zip(
+                $zipper->new_stream("$dir/$_tok_dir/$_tok_file_con"),
+                $text_id_esc
+              );
+
+              $aggr_tok->tokenize($data)->to_zip(
+                $zipper->new_stream("$dir/$_tok_dir/$_tok_file_agg"),
+                $text_id_esc
+              );
+
+              $aggr_tok->reset;
+              $cons_tok->reset;
+            };
+
+            # Encode and escape data
+            $data = escape_xml(encode( "UTF-8", $data ));
+            # note: the index still refers to the 'single character'-versions,
+            # which are counted as 1 (search for '&amp;' in data.xml and see
+            # corresponding indices in $_tokens_file)
+
+            if ($_DEBUG) {
+              $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
+            };
+
+            $zipper->new_stream("$dir/$_data_file")
+              ->print("$data_prfx1$text_id_esc$data_prfx2$data$data_sfx");
+
+            # ~ write structures ~
+
+            write_structures() if @structures;
+
+
+            # ~ write tokens ~
+
+            if ($_TOKENS_PROC && !$tokens->empty) {
+              $tokens->to_zip(
+                $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
+                $text_id_esc,
+                $_INLINE_ANNOT
+              );
+            };
+
+
+            $data = $dir = ""; # reinit.
+
+          } else { # $dir eq ""
+
+            $log->warn("Maybe empty textSigle => skipping this text ...\ndata=$data");
+          }
+
+          next MAIN;
+        };
+
+        # ~ inside text body ~
+
+        # ~ whitespace handling ~
+
+        # The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks where inserted
+        #  into the text with the addition that maybe (or not) whitespace before those linebreaks was unintenionally stripped.
+        #
+        # It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
+        #  example further down and notes on 'Input restrictions' in the manpage).
+        #
+        # Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
+        #
+        # TODO: Maybe it's best, to keep the stripping of whitespace and to just remove the if-clause and to insert a blank by default (with possibly
+        #  an option on how newlines in primary text should be handled (stripped or replaced by a whitespace)).
+        #
+        # Examples (how primary text with linebreaks would be converted by below code):
+        #
+        #  '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
+        #  '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '<w>,<w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>'.
+
+        s/^\s+//; s/\s+$//; # remove consecutive whitespace at beginning and end (mostly one newline)
+
+        ### NOTE: this is only relevant, if a text consists of more than one line
+        ### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
+        ###  do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)
+        if ( m/<[^>]+>[^<]/ ){ # line contains at least one tag with at least one character contents
+
+          # NOTE: not stringent ('...' stands for text):
+          #
+          #   beg1............................end1  => no blank before 'beg1'
+          #   beg2....<pb/>...................end2  => no blank before 'beg2'
+          #   beg3....<info attr1="val1"/>....end3  => no blank before 'beg3'
+          #   beg4....<test>ok</test>.........end4  =>    blank before 'beg4'
+          #
+          #     =>  beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
+          #                                                                            ^
+          #                                                                            |_blank between 'end3' and 'beg4'
+
+          $tl++; # counter for text lines
+
+          s/^(.)/ $1/ if $tl > 1; # insert blank before 1st character (for 2nd line and consecutive lines)
+        }
+        ###
+
+        # add line to buffer
+        $buf_in .= $_;
+      };
+
     } elsif ( m#^(.*)(<(?:${_TEXT_HEADER_BEG}|${_DOC_HEADER_BEG}|${_CORP_HEADER_BEG}).*)$# ){
 
       # ~ start of header ~
@@ -867,8 +860,6 @@
 
   # ~ write @structures ~
 
-  #print STDERR "$0: write_structures(): ...\n";
-
   if ( $dir eq "" ){
     $log->warn("write_structures(): empty textSigle => nothing to do ...");
     return;
@@ -940,8 +931,6 @@
   $zipper->new_stream("$dir/$_structure_dir/$_structure_file")
     ->print($output);
 
-  #print STDERR "$0: write_structures(): DONE\n";
-
 } # end: sub write_structures
commit	598d1a70953b6cd33c3952b39c7778a8ce579586	[log] [tgz]
author	Akron <nils@diewald-online.de>	Sun Aug 02 17:33:31 2020 +0200
committer	Peter Harders <harders@ids-mannheim.de>	Thu Sep 24 11:16:39 2020 +0200
tree	50bffcac093acbac39a04f92a4ce35c4ff7c367a
parent	3bdc0a36820036911421e4d12e7a12955b6f75d5 [diff] [blame]