First attempt to create a primary data collector Change-Id: I6243512838a0cd33f8db182d93288bce45a3bbbc

commit: a10ad59196e8b22355a6cb7bc5d4ae98c6bd7b7d [log] [tgz]
author: Akron <nils@diewald-online.de> Mon Aug 03 11:20:23 2020 +0200
committer: Akron <nils@diewald-online.de> Mon Sep 28 11:59:24 2020 +0200
tree: 70a9ed052e2fa72cb62ee5159ce1d7659c0f667c
parent: 41021abd841594bbb18bc8094df6950620daf5df [diff]
diff --git a/lib/KorAP/XML/TEI/Data.pm b/lib/KorAP/XML/TEI/Data.pm
new file mode 100644
index 0000000..0de4aca
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Data.pm

@@ -0,0 +1,99 @@
+package KorAP::XML::TEI::Data;
+use strict;
+use warnings;
+use Log::Any qw($log);
+use Encode qw(encode decode);
+use KorAP::XML::TEI qw!escape_xml_minimal!;
+
+sub new {
+  bless \(my $data = ''), shift;
+};
+
+
+# Return data as a string
+sub to_string {
+  my ($self, $text_id) = @_;
+
+  unless ($text_id) {
+    $log->warn('Missing textID');
+    return;
+  };
+
+  my $out = $self->_header($text_id);
+  $out .= '  <text>' . escape_xml_minimal($$self) . "</text>\n";
+  return  $out . $self->_footer;
+};
+
+
+# Reset the inner state of the collector
+# and return the collector object.
+sub reset {
+  ${$_[0]} = '';
+  $_[0];
+};
+
+
+# Return serialized data
+sub data {
+  ${$_[0]};
+};
+
+
+# Append data to data stream
+sub append {
+  my $d = pop;
+  # TODO:
+  #   should not be necessary, because whitespace at the end of
+  #   every input line is removed: see 'whitespace handling' inside
+  #   text body
+  # note:
+  #   2 blanks - otherwise offset data would become corrupt
+  $d =~ tr/\n\r/  /;
+
+  ${$_[0]} .= $d;
+};
+
+
+# Return the current position in data stream
+sub position {
+  length(${$_[0]});
+};
+
+
+# Header for XML output
+sub _header {
+  my (undef, $text_id) = @_;
+
+  # TODO:
+  #   Can 'metadata.xml' change or is it constant?
+  return <<"HEADER";
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="text.rng"
+            type="application/xml"
+            schematypens="http://relaxng.org/ns/structure/1.0"?>
+<raw_text docid="$text_id"
+          xmlns="http://ids-mannheim.de/ns/KorAP">
+  <metadata file="metadata.xml" />
+HEADER
+};
+
+
+# Footer for XML output
+sub _footer {
+  return '</raw_text>';
+};
+
+
+# Write data to zip stream
+sub to_zip {
+  my ($self, $zip, $text_id) = @_;
+
+  # Encode and escape data
+  # note: the index still refers to the 'single character'-versions,
+  # which are counted as 1 (search for '&amp;' in data.xml and see
+  # corresponding indices in $_tokens_file)
+  $zip->print(encode('UTF-8', $self->to_string($text_id)));
+};
+
+
+1;

diff --git a/script/tei2korapxml b/script/tei2korapxml
index 8066b4c..0912d30 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -20,11 +20,12 @@
   unshift @INC, "$FindBin::Bin/../lib";
 };
 
-use KorAP::XML::TEI qw!remove_xml_comments escape_xml escape_xml_minimal!;
+use KorAP::XML::TEI qw!remove_xml_comments!;
 use KorAP::XML::TEI::Tokenizer::External;
 use KorAP::XML::TEI::Tokenizer::Conservative;
 use KorAP::XML::TEI::Tokenizer::Aggressive;
 use KorAP::XML::TEI::Annotations::Collector;
+use KorAP::XML::TEI::Data;
 use KorAP::XML::TEI::Zipper;
 use KorAP::XML::TEI::Header;
 
@@ -149,28 +150,24 @@
 my $structures = KorAP::XML::TEI::Annotations::Collector->new;
 
 
+# Initialize Data-Collector
+my $data = KorAP::XML::TEI::Data->new;
+
+
 # Initialize zipper
 my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);
 my $input_fh;                                        # input file handle (default: stdin)
 
-my $data;                                            # contains the primary text (created by func. 'retr_info' from $buf_in), which is written to '$data_file'
-
 my $dir;                                             # text     directory (below $_root_dir)
 
 my ( $text_id, $text_id_esc );                       # '$text_id_esc' = escaped version of $text_id (see %ent)
 
-my ( $data_prfx1, $data_prfx2, $data_sfx );          # $data_* are written to $_data_file
-
-my ( $ref, $idx, $att_idx );                         # needed in func. 'write_structures'
-
 my ( $reader,                                        # instance of 'XML::LibXML::Reader->new' (on input '$buf_in')
      $tree_data );                                   # instance of 'XML::CompactTree::XS::readSubtreeToPerl' (on input '$reader')
 
 # these are only used inside recursive function 'retr_info'
 my ( $_IDX,                                          # value is set dependent on $_XCT_LN - for extracting array of child elements from element in $tree_data
      $e,                                             # element from $tree_data
-     $dl,                                            # actual length of string $data
-                                                     #                            represents the actual processed element from @structures
      ## variables for handling ~ whitespace related issue ~ (it is sometimes necessary, to correct the from-values for some tags)
      $add_one,                                       # ...
      $fval,                                          # ...
@@ -190,8 +187,6 @@
 
 ($_XCT_LN)?($_IDX=5):($_IDX=4);
 
-$data_prfx1 = $data_prfx2 = $data_sfx = "";
-
 $fval = 0;
 
 # Normalize regex for header parsing
@@ -201,16 +196,6 @@
   s!^([^\s]+)(.*)$!$1\[\^>\]*$2!;
 };
 
-$data_prfx1   = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
-$data_prfx1  .= "<?xml-model href=\"text.rng\" type=\"application/xml\" schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n";
-$data_prfx1  .= "<raw_text docid=\"";
-$data_prfx2  .= "\" xmlns=\"http://ids-mannheim.de/ns/KorAP\">\n";
-## TODO: can 'metadata.xml' change or is it constant?
-$data_prfx2  .= "  <metadata file=\"metadata.xml\" />\n";
-##
-$data_prfx2  .= "  <text>";
-$data_sfx     = "</text>\n</raw_text>";
-
 
 # ~ read input and write output (text by text) ~
 main();
@@ -229,8 +214,10 @@
 
   $input_fh = *STDIN;  # input file handle (default: stdin)
 
-  $data = $dir = "";
+  # Maybe not necessary
+  $data->reset;
 
+  $dir = "";
 
   if ( $input_fname ne '' ){
 
@@ -250,7 +237,7 @@
 
   # ~ loop (reading input document) ~
 
-  MAIN: while ( <$input_fh> ){
+ MAIN: while ( <$input_fh> ){
 
     $_ = remove_xml_comments( $input_fh, $_ ); # remove HTML (multi-line) comments (<!--...-->)
 
@@ -263,7 +250,7 @@
 
       die "ERROR ($0): main(): input line number $.: line with opening text-body tag '${_TEXT_BODY}'"
         ." contains additional information ... => Aborting\n\tline=$_"
-          if $pfx !~ /^\s*$/ || $sfx !~ /^\s*$/;
+        if $pfx !~ /^\s*$/ || $sfx !~ /^\s*$/;
 
       # text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
       my $buf_in = '';
@@ -298,11 +285,12 @@
             #   Insignificant whitespace is used when editing XML documents for readability.
             #    These whitespaces are typically not intended for inclusion in the delivery of the document.
             #
-            if ( $_XCT_LN ){ # _XCT_LINE_NUMBERS is only for debugging
-              $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY | XCT_LINE_NUMBERS );
-            } else {
-              $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY );
-            }
+
+            my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;
+
+            # _XCT_LINE_NUMBERS is only for debugging
+            $param |= XCT_LINE_NUMBERS if $_XCT_LN;
+            $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, $param);
 
             $structures->reset;
 
@@ -310,8 +298,6 @@
               $tokens->reset;
             }
 
-            $dl = 0;
-
             # ~ whitespace related issue ~
             $add_one = 0;
             %ws = ();
@@ -320,34 +306,22 @@
             # ~ recursion ~
             retr_info(1, \$tree_data->[2] ); # parse input data
 
-
-            # ~ write data.xml ~
-
-            # TODO: should not be necessary, because whitespace at the end of every input line is removed: see 'whitespace handling' inside text body
-            $data =~ tr/\n\r/  /; # note: 2 blanks - otherwise offset data would become corrupt
-            #
-
-
-            # Encode and escape data
-            my $escaped_data = escape_xml_minimal(encode( "UTF-8", $data ));
-            # note: the index still refers to the 'single character'-versions,
-            # which are counted as 1 (search for '&amp;' in data.xml and see
-            # corresponding indices in $_tokens_file)
-
             if ($_DEBUG) {
               $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
             };
 
-            $zipper->new_stream("$dir/$_data_file")
-              ->print("$data_prfx1$text_id_esc$data_prfx2$escaped_data$data_sfx");
-
+            # ~ write data.xml ~
+            $data->to_zip(
+              $zipper->new_stream("$dir/${_data_file}"),
+              $text_id_esc
+            );
 
             # ~ tokenization ~
 
             if ( $_GEN_TOK_EXT ){
 
               # Tokenize and output
-              $ext_tok->tokenize($data)->to_zip(
+              $ext_tok->tokenize($data->data)->to_zip(
                 $zipper->new_stream("$dir/$_tok_dir/$_tok_file_ext"),
                 $text_id_esc
               );
@@ -356,12 +330,12 @@
             if ( $_GEN_TOK_INT ){
 
               # Tokenize and output
-              $cons_tok->tokenize($data)->to_zip(
+              $cons_tok->tokenize($data->data)->to_zip(
                 $zipper->new_stream("$dir/$_tok_dir/$_tok_file_con"),
                 $text_id_esc
               );
 
-              $aggr_tok->tokenize($data)->to_zip(
+              $aggr_tok->tokenize($data->data)->to_zip(
                 $zipper->new_stream("$dir/$_tok_dir/$_tok_file_agg"),
                 $text_id_esc
               );
@@ -389,7 +363,10 @@
             };
 
 
-            $data = $dir = ""; # reinit.
+            $dir = ""; # reinit.
+
+            # Maybe not necessary
+            $data->reset;
 
           } else { # $dir eq ""
 
@@ -593,7 +570,6 @@
 
   foreach $e ( @{${$_[0]}} ){ # iteration through all array elements ($_[0] is a reference to an array reference)
 
-
     if ( $e->[0] == XML_READER_TYPE_ELEMENT ){ # element-node (see 'NODE TYPES' in manpage of XML::LibXML::Reader)
 
 
@@ -633,7 +609,7 @@
       # ~ index 'from' ~
 
       # this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
-      $anno->set_from($dl + $add_one);
+      $anno->set_from($data->position + $add_one);
 
       #~~~~
       # until here: tag-node (opening)
@@ -652,6 +628,7 @@
       # from here: tag-node (closing)
       #~~~~~
 
+      my $pos = $data->position;
 
       # ~ handle structures and tokens ~
 
@@ -667,18 +644,18 @@
 
         # in case this fails, check input
         die "ERROR ($0, retr_info()): text_id='$text_id', processing of \@structures: from-value ($fval) is 2 or more greater"
-          ." than to-value ($dl) => please check. aborting ...\n"
-            if ( $fval - 1 ) > $dl;
+          ." than to-value ($pos) => please check. aborting ...\n"
+          if ( $fval - 1 ) > $pos;
 
         # TODO: find example for which this case applies
         #  maybe this is not necessary anymore, because the above recorrection of the from-value suffices
-        # TODO: check, if it's better to remove this line and change above check to 'if ( $fval - 1) >= $dl;
+        # TODO: check, if it's better to remove this line and change above check to 'if ( $fval - 1) >= $pos;
         #   do testing with bigger corpus excerpt (wikipedia?)
-        $anno->set_from($dl) if $fval == $dl + 1;
-        $anno->set_to($dl);
+        $anno->set_from($pos) if $fval == $pos + 1;
+        $anno->set_to($pos);
         $anno->set_level($rl);
 
-        # note: use $dl, because the offsets are _between_ the characters (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
+        # note: use $pos, because the offsets are _between_ the characters (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
       }
 
       # ~ whitespace related issue ~
@@ -725,7 +702,7 @@
       #  the last read 'non-tag'-node has to be corrected (see [1]),
       #
       # For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
-      #  additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $dl).
+      #  additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
       #
       # [1]
       # Now, what happens, when 2 text-nodes are _not_ seperated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
@@ -750,8 +727,9 @@
 
         $add_one = 0;
 
-        $ws{ $dl }++; # state, that this from-index belongs to a whitespace-node
-                      #  ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
+        # state, that this from-index belongs to a whitespace-node
+        #  ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
+        $ws{$data->position}++;
 
       }else{
 
@@ -761,12 +739,9 @@
       }
 
 
-      # ~ update $data and $dl ~
+      # ~ update $data ~
 
-      $data .= $e->[1];
-
-      $dl += length( $e->[1] ); # update length of $data
-
+      $data->append($e->[1]);
 
       #~~~~~
       # until here: text- and whitespace-nodes

diff --git a/t/data.t b/t/data.t
new file mode 100644
index 0000000..b33c641
--- /dev/null
+++ b/t/data.t

@@ -0,0 +1,29 @@
+use strict;
+use warnings;
+use Test::More;
+use Test::XML::Loy;
+
+use_ok('KorAP::XML::TEI::Data');
+
+my $d = KorAP::XML::TEI::Data->new;
+
+ok($d, 'Constructed');
+
+is($d->position, 0, 'Position');
+ok($d->append('aaa'), 'Add raw data');
+is($d->position, 3, 'Position');
+ok($d->reset, 'Reset');
+is($d->position, 0, 'Position');
+
+
+ok($d->append('  Dies ist '), 'Add raw data');
+is($d->position, 11, 'Position');
+ok($d->append("Ein Versuch\n"), 'Add raw data');
+is($d->position, 23, 'Position');
+
+my $loy = Test::XML::Loy->new($d->to_string('x'));
+
+$loy->attr_is('raw_text', 'docid', 'x')
+  ->text_is('raw_text text', '  Dies ist Ein Versuch ');
+
+done_testing;

diff --git a/xt/benchmark.pl b/xt/benchmark.pl
index 71dbe54..f618717 100644
--- a/xt/benchmark.pl
+++ b/xt/benchmark.pl

@@ -17,6 +17,7 @@
 use KorAP::XML::TEI 'remove_xml_comments';
 use KorAP::XML::TEI::Tokenizer::Aggressive;
 use KorAP::XML::TEI::Tokenizer::Conservative;
+use KorAP::XML::TEI::Data;
 
 my $columns = 0;
 my $no_header = 0;
@@ -73,10 +74,12 @@
 };
 
 my $t_data_utf_8 = decode('utf-8',$t_data);
+my @t_data_split = split(' ', $t_data);
 
 my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
 my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
 
+my $data = KorAP::XML::TEI::Data->new;
 
 # Add benchmark instances
 $bench->add_instances(
@@ -142,6 +145,13 @@
       $result = $aggr_tok->reset->tokenize($t_data_utf_8);
       $result = 0;
     }
+  ),
+  Dumbbench::Instance::PerlSub->new(
+    name => 'Data-Collect with serialization',
+    code => sub {
+      $data->reset->append($_) foreach @t_data_split;
+      $result = $data->to_string;
+    }
   )
 );
commit	a10ad59196e8b22355a6cb7bc5d4ae98c6bd7b7d	[log] [tgz]
author	Akron <nils@diewald-online.de>	Mon Aug 03 11:20:23 2020 +0200
committer	Akron <nils@diewald-online.de>	Mon Sep 28 11:59:24 2020 +0200
tree	70a9ed052e2fa72cb62ee5159ce1d7659c0f667c
parent	41021abd841594bbb18bc8094df6950620daf5df [diff]