First attempt to create a primary data collector
Change-Id: I6243512838a0cd33f8db182d93288bce45a3bbbc
diff --git a/lib/KorAP/XML/TEI/Data.pm b/lib/KorAP/XML/TEI/Data.pm
new file mode 100644
index 0000000..0de4aca
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Data.pm
@@ -0,0 +1,99 @@
+package KorAP::XML::TEI::Data;
+use strict;
+use warnings;
+use Log::Any qw($log);
+use Encode qw(encode decode);
+use KorAP::XML::TEI qw!escape_xml_minimal!;
+
+sub new {
+ bless \(my $data = ''), shift;
+};
+
+
+# Return data as a string
+sub to_string {
+ my ($self, $text_id) = @_;
+
+ unless ($text_id) {
+ $log->warn('Missing textID');
+ return;
+ };
+
+ my $out = $self->_header($text_id);
+ $out .= ' <text>' . escape_xml_minimal($$self) . "</text>\n";
+ return $out . $self->_footer;
+};
+
+
+# Reset the inner state of the collector
+# and return the collector object.
+sub reset {
+ ${$_[0]} = '';
+ $_[0];
+};
+
+
+# Return serialized data
+sub data {
+ ${$_[0]};
+};
+
+
+# Append data to data stream
+sub append {
+ my $d = pop;
+ # TODO:
+ # should not be necessary, because whitespace at the end of
+ # every input line is removed: see 'whitespace handling' inside
+ # text body
+ # note:
+ # 2 blanks - otherwise offset data would become corrupt
+ $d =~ tr/\n\r/ /;
+
+ ${$_[0]} .= $d;
+};
+
+
+# Return the current position in data stream
+sub position {
+ length(${$_[0]});
+};
+
+
+# Header for XML output
+sub _header {
+ my (undef, $text_id) = @_;
+
+ # TODO:
+ # Can 'metadata.xml' change or is it constant?
+ return <<"HEADER";
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="text.rng"
+ type="application/xml"
+ schematypens="http://relaxng.org/ns/structure/1.0"?>
+<raw_text docid="$text_id"
+ xmlns="http://ids-mannheim.de/ns/KorAP">
+ <metadata file="metadata.xml" />
+HEADER
+};
+
+
+# Footer for XML output
+sub _footer {
+ return '</raw_text>';
+};
+
+
+# Write data to zip stream
+sub to_zip {
+ my ($self, $zip, $text_id) = @_;
+
+ # Encode and escape data
+ # note: the index still refers to the 'single character'-versions,
+ # which are counted as 1 (search for '&' in data.xml and see
+ # corresponding indices in $_tokens_file)
+ $zip->print(encode('UTF-8', $self->to_string($text_id)));
+};
+
+
+1;
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 8066b4c..0912d30 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -20,11 +20,12 @@
unshift @INC, "$FindBin::Bin/../lib";
};
-use KorAP::XML::TEI qw!remove_xml_comments escape_xml escape_xml_minimal!;
+use KorAP::XML::TEI qw!remove_xml_comments!;
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Annotations::Collector;
+use KorAP::XML::TEI::Data;
use KorAP::XML::TEI::Zipper;
use KorAP::XML::TEI::Header;
@@ -149,28 +150,24 @@
my $structures = KorAP::XML::TEI::Annotations::Collector->new;
+# Initialize Data-Collector
+my $data = KorAP::XML::TEI::Data->new;
+
+
# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);
my $input_fh; # input file handle (default: stdin)
-my $data; # contains the primary text (created by func. 'retr_info' from $buf_in), which is written to '$data_file'
-
my $dir; # text directory (below $_root_dir)
my ( $text_id, $text_id_esc ); # '$text_id_esc' = escaped version of $text_id (see %ent)
-my ( $data_prfx1, $data_prfx2, $data_sfx ); # $data_* are written to $_data_file
-
-my ( $ref, $idx, $att_idx ); # needed in func. 'write_structures'
-
my ( $reader, # instance of 'XML::LibXML::Reader->new' (on input '$buf_in')
$tree_data ); # instance of 'XML::CompactTree::XS::readSubtreeToPerl' (on input '$reader')
# these are only used inside recursive function 'retr_info'
my ( $_IDX, # value is set dependent on $_XCT_LN - for extracting array of child elements from element in $tree_data
$e, # element from $tree_data
- $dl, # actual length of string $data
- # represents the actual processed element from @structures
## variables for handling ~ whitespace related issue ~ (it is sometimes necessary, to correct the from-values for some tags)
$add_one, # ...
$fval, # ...
@@ -190,8 +187,6 @@
($_XCT_LN)?($_IDX=5):($_IDX=4);
-$data_prfx1 = $data_prfx2 = $data_sfx = "";
-
$fval = 0;
# Normalize regex for header parsing
@@ -201,16 +196,6 @@
s!^([^\s]+)(.*)$!$1\[\^>\]*$2!;
};
-$data_prfx1 = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
-$data_prfx1 .= "<?xml-model href=\"text.rng\" type=\"application/xml\" schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n";
-$data_prfx1 .= "<raw_text docid=\"";
-$data_prfx2 .= "\" xmlns=\"http://ids-mannheim.de/ns/KorAP\">\n";
-## TODO: can 'metadata.xml' change or is it constant?
-$data_prfx2 .= " <metadata file=\"metadata.xml\" />\n";
-##
-$data_prfx2 .= " <text>";
-$data_sfx = "</text>\n</raw_text>";
-
# ~ read input and write output (text by text) ~
main();
@@ -229,8 +214,10 @@
$input_fh = *STDIN; # input file handle (default: stdin)
- $data = $dir = "";
+ # Maybe not necessary
+ $data->reset;
+ $dir = "";
if ( $input_fname ne '' ){
@@ -250,7 +237,7 @@
# ~ loop (reading input document) ~
- MAIN: while ( <$input_fh> ){
+ MAIN: while ( <$input_fh> ){
$_ = remove_xml_comments( $input_fh, $_ ); # remove HTML (multi-line) comments (<!--...-->)
@@ -263,7 +250,7 @@
die "ERROR ($0): main(): input line number $.: line with opening text-body tag '${_TEXT_BODY}'"
." contains additional information ... => Aborting\n\tline=$_"
- if $pfx !~ /^\s*$/ || $sfx !~ /^\s*$/;
+ if $pfx !~ /^\s*$/ || $sfx !~ /^\s*$/;
# text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
my $buf_in = '';
@@ -298,11 +285,12 @@
# Insignificant whitespace is used when editing XML documents for readability.
# These whitespaces are typically not intended for inclusion in the delivery of the document.
#
- if ( $_XCT_LN ){ # _XCT_LINE_NUMBERS is only for debugging
- $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY | XCT_LINE_NUMBERS );
- } else {
- $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY );
- }
+
+ my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;
+
+ # _XCT_LINE_NUMBERS is only for debugging
+ $param |= XCT_LINE_NUMBERS if $_XCT_LN;
+ $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, $param);
$structures->reset;
@@ -310,8 +298,6 @@
$tokens->reset;
}
- $dl = 0;
-
# ~ whitespace related issue ~
$add_one = 0;
%ws = ();
@@ -320,34 +306,22 @@
# ~ recursion ~
retr_info(1, \$tree_data->[2] ); # parse input data
-
- # ~ write data.xml ~
-
- # TODO: should not be necessary, because whitespace at the end of every input line is removed: see 'whitespace handling' inside text body
- $data =~ tr/\n\r/ /; # note: 2 blanks - otherwise offset data would become corrupt
- #
-
-
- # Encode and escape data
- my $escaped_data = escape_xml_minimal(encode( "UTF-8", $data ));
- # note: the index still refers to the 'single character'-versions,
- # which are counted as 1 (search for '&' in data.xml and see
- # corresponding indices in $_tokens_file)
-
if ($_DEBUG) {
$log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
};
- $zipper->new_stream("$dir/$_data_file")
- ->print("$data_prfx1$text_id_esc$data_prfx2$escaped_data$data_sfx");
-
+ # ~ write data.xml ~
+ $data->to_zip(
+ $zipper->new_stream("$dir/${_data_file}"),
+ $text_id_esc
+ );
# ~ tokenization ~
if ( $_GEN_TOK_EXT ){
# Tokenize and output
- $ext_tok->tokenize($data)->to_zip(
+ $ext_tok->tokenize($data->data)->to_zip(
$zipper->new_stream("$dir/$_tok_dir/$_tok_file_ext"),
$text_id_esc
);
@@ -356,12 +330,12 @@
if ( $_GEN_TOK_INT ){
# Tokenize and output
- $cons_tok->tokenize($data)->to_zip(
+ $cons_tok->tokenize($data->data)->to_zip(
$zipper->new_stream("$dir/$_tok_dir/$_tok_file_con"),
$text_id_esc
);
- $aggr_tok->tokenize($data)->to_zip(
+ $aggr_tok->tokenize($data->data)->to_zip(
$zipper->new_stream("$dir/$_tok_dir/$_tok_file_agg"),
$text_id_esc
);
@@ -389,7 +363,10 @@
};
- $data = $dir = ""; # reinit.
+ $dir = ""; # reinit.
+
+ # Maybe not necessary
+ $data->reset;
} else { # $dir eq ""
@@ -593,7 +570,6 @@
foreach $e ( @{${$_[0]}} ){ # iteration through all array elements ($_[0] is a reference to an array reference)
-
if ( $e->[0] == XML_READER_TYPE_ELEMENT ){ # element-node (see 'NODE TYPES' in manpage of XML::LibXML::Reader)
@@ -633,7 +609,7 @@
# ~ index 'from' ~
# this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
- $anno->set_from($dl + $add_one);
+ $anno->set_from($data->position + $add_one);
#~~~~
# until here: tag-node (opening)
@@ -652,6 +628,7 @@
# from here: tag-node (closing)
#~~~~~
+ my $pos = $data->position;
# ~ handle structures and tokens ~
@@ -667,18 +644,18 @@
# in case this fails, check input
die "ERROR ($0, retr_info()): text_id='$text_id', processing of \@structures: from-value ($fval) is 2 or more greater"
- ." than to-value ($dl) => please check. aborting ...\n"
- if ( $fval - 1 ) > $dl;
+ ." than to-value ($pos) => please check. aborting ...\n"
+ if ( $fval - 1 ) > $pos;
# TODO: find example for which this case applies
# maybe this is not necessary anymore, because the above recorrection of the from-value suffices
- # TODO: check, if it's better to remove this line and change above check to 'if ( $fval - 1) >= $dl;
+ # TODO: check, if it's better to remove this line and change above check to 'if ( $fval - 1) >= $pos;
# do testing with bigger corpus excerpt (wikipedia?)
- $anno->set_from($dl) if $fval == $dl + 1;
- $anno->set_to($dl);
+ $anno->set_from($pos) if $fval == $pos + 1;
+ $anno->set_to($pos);
$anno->set_level($rl);
- # note: use $dl, because the offsets are _between_ the characters (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
+ # note: use $pos, because the offsets are _between_ the characters (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
}
# ~ whitespace related issue ~
@@ -725,7 +702,7 @@
# the last read 'non-tag'-node has to be corrected (see [1]),
#
# For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
- # additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $dl).
+ # additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
#
# [1]
# Now, what happens, when 2 text-nodes are _not_ seperated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
@@ -750,8 +727,9 @@
$add_one = 0;
- $ws{ $dl }++; # state, that this from-index belongs to a whitespace-node
- # ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
+ # state, that this from-index belongs to a whitespace-node
+ # ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
+ $ws{$data->position}++;
}else{
@@ -761,12 +739,9 @@
}
- # ~ update $data and $dl ~
+ # ~ update $data ~
- $data .= $e->[1];
-
- $dl += length( $e->[1] ); # update length of $data
-
+ $data->append($e->[1]);
#~~~~~
# until here: text- and whitespace-nodes
diff --git a/t/data.t b/t/data.t
new file mode 100644
index 0000000..b33c641
--- /dev/null
+++ b/t/data.t
@@ -0,0 +1,29 @@
+use strict;
+use warnings;
+use Test::More;
+use Test::XML::Loy;
+
+use_ok('KorAP::XML::TEI::Data');
+
+my $d = KorAP::XML::TEI::Data->new;
+
+ok($d, 'Constructed');
+
+is($d->position, 0, 'Position');
+ok($d->append('aaa'), 'Add raw data');
+is($d->position, 3, 'Position');
+ok($d->reset, 'Reset');
+is($d->position, 0, 'Position');
+
+
+ok($d->append(' Dies ist '), 'Add raw data');
+is($d->position, 11, 'Position');
+ok($d->append("Ein Versuch\n"), 'Add raw data');
+is($d->position, 23, 'Position');
+
+my $loy = Test::XML::Loy->new($d->to_string('x'));
+
+$loy->attr_is('raw_text', 'docid', 'x')
+ ->text_is('raw_text text', ' Dies ist Ein Versuch ');
+
+done_testing;
diff --git a/xt/benchmark.pl b/xt/benchmark.pl
index 71dbe54..f618717 100644
--- a/xt/benchmark.pl
+++ b/xt/benchmark.pl
@@ -17,6 +17,7 @@
use KorAP::XML::TEI 'remove_xml_comments';
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Tokenizer::Conservative;
+use KorAP::XML::TEI::Data;
my $columns = 0;
my $no_header = 0;
@@ -73,10 +74,12 @@
};
my $t_data_utf_8 = decode('utf-8',$t_data);
+my @t_data_split = split(' ', $t_data);
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
+my $data = KorAP::XML::TEI::Data->new;
# Add benchmark instances
$bench->add_instances(
@@ -142,6 +145,13 @@
$result = $aggr_tok->reset->tokenize($t_data_utf_8);
$result = 0;
}
+ ),
+ Dumbbench::Instance::PerlSub->new(
+ name => 'Data-Collect with serialization',
+ code => sub {
+ $data->reset->append($_) foreach @t_data_split;
+ $result = $data->to_string;
+ }
)
);