First attempt to create a primary data collector

Change-Id: I6243512838a0cd33f8db182d93288bce45a3bbbc

commit: a10ad59196e8b22355a6cb7bc5d4ae98c6bd7b7d [log] [tgz]
author: Akron <nils@diewald-online.de> Mon Aug 03 11:20:23 2020 +0200
committer: Akron <nils@diewald-online.de> Mon Sep 28 11:59:24 2020 +0200
tree: 70a9ed052e2fa72cb62ee5159ce1d7659c0f667c
parent: 41021abd841594bbb18bc8094df6950620daf5df [diff] [blame]
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 8066b4c..0912d30 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -20,11 +20,12 @@
   unshift @INC, "$FindBin::Bin/../lib";
 };
 
-use KorAP::XML::TEI qw!remove_xml_comments escape_xml escape_xml_minimal!;
+use KorAP::XML::TEI qw!remove_xml_comments!;
 use KorAP::XML::TEI::Tokenizer::External;
 use KorAP::XML::TEI::Tokenizer::Conservative;
 use KorAP::XML::TEI::Tokenizer::Aggressive;
 use KorAP::XML::TEI::Annotations::Collector;
+use KorAP::XML::TEI::Data;
 use KorAP::XML::TEI::Zipper;
 use KorAP::XML::TEI::Header;
 
@@ -149,28 +150,24 @@
 my $structures = KorAP::XML::TEI::Annotations::Collector->new;
 
 
+# Initialize Data-Collector
+my $data = KorAP::XML::TEI::Data->new;
+
+
 # Initialize zipper
 my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);
 my $input_fh;                                        # input file handle (default: stdin)
 
-my $data;                                            # contains the primary text (created by func. 'retr_info' from $buf_in), which is written to '$data_file'
-
 my $dir;                                             # text     directory (below $_root_dir)
 
 my ( $text_id, $text_id_esc );                       # '$text_id_esc' = escaped version of $text_id (see %ent)
 
-my ( $data_prfx1, $data_prfx2, $data_sfx );          # $data_* are written to $_data_file
-
-my ( $ref, $idx, $att_idx );                         # needed in func. 'write_structures'
-
 my ( $reader,                                        # instance of 'XML::LibXML::Reader->new' (on input '$buf_in')
      $tree_data );                                   # instance of 'XML::CompactTree::XS::readSubtreeToPerl' (on input '$reader')
 
 # these are only used inside recursive function 'retr_info'
 my ( $_IDX,                                          # value is set dependent on $_XCT_LN - for extracting array of child elements from element in $tree_data
      $e,                                             # element from $tree_data
-     $dl,                                            # actual length of string $data
-                                                     #                            represents the actual processed element from @structures
      ## variables for handling ~ whitespace related issue ~ (it is sometimes necessary, to correct the from-values for some tags)
      $add_one,                                       # ...
      $fval,                                          # ...
@@ -190,8 +187,6 @@
 
 ($_XCT_LN)?($_IDX=5):($_IDX=4);
 
-$data_prfx1 = $data_prfx2 = $data_sfx = "";
-
 $fval = 0;
 
 # Normalize regex for header parsing
@@ -201,16 +196,6 @@
   s!^([^\s]+)(.*)$!$1\[\^>\]*$2!;
 };
 
-$data_prfx1   = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
-$data_prfx1  .= "<?xml-model href=\"text.rng\" type=\"application/xml\" schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n";
-$data_prfx1  .= "<raw_text docid=\"";
-$data_prfx2  .= "\" xmlns=\"http://ids-mannheim.de/ns/KorAP\">\n";
-## TODO: can 'metadata.xml' change or is it constant?
-$data_prfx2  .= "  <metadata file=\"metadata.xml\" />\n";
-##
-$data_prfx2  .= "  <text>";
-$data_sfx     = "</text>\n</raw_text>";
-
 
 # ~ read input and write output (text by text) ~
 main();
@@ -229,8 +214,10 @@
 
   $input_fh = *STDIN;  # input file handle (default: stdin)
 
-  $data = $dir = "";
+  # Maybe not necessary
+  $data->reset;
 
+  $dir = "";
 
   if ( $input_fname ne '' ){
 
@@ -250,7 +237,7 @@
 
   # ~ loop (reading input document) ~
 
-  MAIN: while ( <$input_fh> ){
+ MAIN: while ( <$input_fh> ){
 
     $_ = remove_xml_comments( $input_fh, $_ ); # remove HTML (multi-line) comments (<!--...-->)
 
@@ -263,7 +250,7 @@
 
       die "ERROR ($0): main(): input line number $.: line with opening text-body tag '${_TEXT_BODY}'"
         ." contains additional information ... => Aborting\n\tline=$_"
-          if $pfx !~ /^\s*$/ || $sfx !~ /^\s*$/;
+        if $pfx !~ /^\s*$/ || $sfx !~ /^\s*$/;
 
       # text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
       my $buf_in = '';
@@ -298,11 +285,12 @@
             #   Insignificant whitespace is used when editing XML documents for readability.
             #    These whitespaces are typically not intended for inclusion in the delivery of the document.
             #
-            if ( $_XCT_LN ){ # _XCT_LINE_NUMBERS is only for debugging
-              $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY | XCT_LINE_NUMBERS );
-            } else {
-              $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY );
-            }
+
+            my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;
+
+            # _XCT_LINE_NUMBERS is only for debugging
+            $param |= XCT_LINE_NUMBERS if $_XCT_LN;
+            $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, $param);
 
             $structures->reset;
 
@@ -310,8 +298,6 @@
               $tokens->reset;
             }
 
-            $dl = 0;
-
             # ~ whitespace related issue ~
             $add_one = 0;
             %ws = ();
@@ -320,34 +306,22 @@
             # ~ recursion ~
             retr_info(1, \$tree_data->[2] ); # parse input data
 
-
-            # ~ write data.xml ~
-
-            # TODO: should not be necessary, because whitespace at the end of every input line is removed: see 'whitespace handling' inside text body
-            $data =~ tr/\n\r/  /; # note: 2 blanks - otherwise offset data would become corrupt
-            #
-
-
-            # Encode and escape data
-            my $escaped_data = escape_xml_minimal(encode( "UTF-8", $data ));
-            # note: the index still refers to the 'single character'-versions,
-            # which are counted as 1 (search for '&amp;' in data.xml and see
-            # corresponding indices in $_tokens_file)
-
             if ($_DEBUG) {
               $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
             };
 
-            $zipper->new_stream("$dir/$_data_file")
-              ->print("$data_prfx1$text_id_esc$data_prfx2$escaped_data$data_sfx");
-
+            # ~ write data.xml ~
+            $data->to_zip(
+              $zipper->new_stream("$dir/${_data_file}"),
+              $text_id_esc
+            );
 
             # ~ tokenization ~
 
             if ( $_GEN_TOK_EXT ){
 
               # Tokenize and output
-              $ext_tok->tokenize($data)->to_zip(
+              $ext_tok->tokenize($data->data)->to_zip(
                 $zipper->new_stream("$dir/$_tok_dir/$_tok_file_ext"),
                 $text_id_esc
               );
@@ -356,12 +330,12 @@
             if ( $_GEN_TOK_INT ){
 
               # Tokenize and output
-              $cons_tok->tokenize($data)->to_zip(
+              $cons_tok->tokenize($data->data)->to_zip(
                 $zipper->new_stream("$dir/$_tok_dir/$_tok_file_con"),
                 $text_id_esc
               );
 
-              $aggr_tok->tokenize($data)->to_zip(
+              $aggr_tok->tokenize($data->data)->to_zip(
                 $zipper->new_stream("$dir/$_tok_dir/$_tok_file_agg"),
                 $text_id_esc
               );
@@ -389,7 +363,10 @@
             };
 
 
-            $data = $dir = ""; # reinit.
+            $dir = ""; # reinit.
+
+            # Maybe not necessary
+            $data->reset;
 
           } else { # $dir eq ""
 
@@ -593,7 +570,6 @@
 
   foreach $e ( @{${$_[0]}} ){ # iteration through all array elements ($_[0] is a reference to an array reference)
 
-
     if ( $e->[0] == XML_READER_TYPE_ELEMENT ){ # element-node (see 'NODE TYPES' in manpage of XML::LibXML::Reader)
 
 
@@ -633,7 +609,7 @@
       # ~ index 'from' ~
 
       # this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
-      $anno->set_from($dl + $add_one);
+      $anno->set_from($data->position + $add_one);
 
       #~~~~
       # until here: tag-node (opening)
@@ -652,6 +628,7 @@
       # from here: tag-node (closing)
       #~~~~~
 
+      my $pos = $data->position;
 
       # ~ handle structures and tokens ~
 
@@ -667,18 +644,18 @@
 
         # in case this fails, check input
         die "ERROR ($0, retr_info()): text_id='$text_id', processing of \@structures: from-value ($fval) is 2 or more greater"
-          ." than to-value ($dl) => please check. aborting ...\n"
-            if ( $fval - 1 ) > $dl;
+          ." than to-value ($pos) => please check. aborting ...\n"
+          if ( $fval - 1 ) > $pos;
 
         # TODO: find example for which this case applies
         #  maybe this is not necessary anymore, because the above recorrection of the from-value suffices
-        # TODO: check, if it's better to remove this line and change above check to 'if ( $fval - 1) >= $dl;
+        # TODO: check, if it's better to remove this line and change above check to 'if ( $fval - 1) >= $pos;
         #   do testing with bigger corpus excerpt (wikipedia?)
-        $anno->set_from($dl) if $fval == $dl + 1;
-        $anno->set_to($dl);
+        $anno->set_from($pos) if $fval == $pos + 1;
+        $anno->set_to($pos);
         $anno->set_level($rl);
 
-        # note: use $dl, because the offsets are _between_ the characters (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
+        # note: use $pos, because the offsets are _between_ the characters (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
       }
 
       # ~ whitespace related issue ~
@@ -725,7 +702,7 @@
       #  the last read 'non-tag'-node has to be corrected (see [1]),
       #
       # For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
-      #  additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $dl).
+      #  additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
       #
       # [1]
       # Now, what happens, when 2 text-nodes are _not_ seperated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
@@ -750,8 +727,9 @@
 
         $add_one = 0;
 
-        $ws{ $dl }++; # state, that this from-index belongs to a whitespace-node
-                      #  ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
+        # state, that this from-index belongs to a whitespace-node
+        #  ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
+        $ws{$data->position}++;
 
       }else{
 
@@ -761,12 +739,9 @@
       }
 
 
-      # ~ update $data and $dl ~
+      # ~ update $data ~
 
-      $data .= $e->[1];
-
-      $dl += length( $e->[1] ); # update length of $data
-
+      $data->append($e->[1]);
 
       #~~~~~
       # until here: text- and whitespace-nodes
commit	a10ad59196e8b22355a6cb7bc5d4ae98c6bd7b7d	[log] [tgz]
author	Akron <nils@diewald-online.de>	Mon Aug 03 11:20:23 2020 +0200
committer	Akron <nils@diewald-online.de>	Mon Sep 28 11:59:24 2020 +0200
tree	70a9ed052e2fa72cb62ee5159ce1d7659c0f667c
parent	41021abd841594bbb18bc8094df6950620daf5df [diff] [blame]