First attempt to create a primary data collector
Change-Id: I6243512838a0cd33f8db182d93288bce45a3bbbc
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 8066b4c..0912d30 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -20,11 +20,12 @@
unshift @INC, "$FindBin::Bin/../lib";
};
-use KorAP::XML::TEI qw!remove_xml_comments escape_xml escape_xml_minimal!;
+use KorAP::XML::TEI qw!remove_xml_comments!;
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Annotations::Collector;
+use KorAP::XML::TEI::Data;
use KorAP::XML::TEI::Zipper;
use KorAP::XML::TEI::Header;
@@ -149,28 +150,24 @@
my $structures = KorAP::XML::TEI::Annotations::Collector->new;
+# Initialize Data-Collector
+my $data = KorAP::XML::TEI::Data->new;
+
+
# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);
my $input_fh; # input file handle (default: stdin)
-my $data; # contains the primary text (created by func. 'retr_info' from $buf_in), which is written to '$data_file'
-
my $dir; # text directory (below $_root_dir)
my ( $text_id, $text_id_esc ); # '$text_id_esc' = escaped version of $text_id (see %ent)
-my ( $data_prfx1, $data_prfx2, $data_sfx ); # $data_* are written to $_data_file
-
-my ( $ref, $idx, $att_idx ); # needed in func. 'write_structures'
-
my ( $reader, # instance of 'XML::LibXML::Reader->new' (on input '$buf_in')
$tree_data ); # instance of 'XML::CompactTree::XS::readSubtreeToPerl' (on input '$reader')
# these are only used inside recursive function 'retr_info'
my ( $_IDX, # value is set dependent on $_XCT_LN - for extracting array of child elements from element in $tree_data
$e, # element from $tree_data
- $dl, # actual length of string $data
- # represents the actual processed element from @structures
## variables for handling ~ whitespace related issue ~ (it is sometimes necessary, to correct the from-values for some tags)
$add_one, # ...
$fval, # ...
@@ -190,8 +187,6 @@
($_XCT_LN)?($_IDX=5):($_IDX=4);
-$data_prfx1 = $data_prfx2 = $data_sfx = "";
-
$fval = 0;
# Normalize regex for header parsing
@@ -201,16 +196,6 @@
s!^([^\s]+)(.*)$!$1\[\^>\]*$2!;
};
-$data_prfx1 = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
-$data_prfx1 .= "<?xml-model href=\"text.rng\" type=\"application/xml\" schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n";
-$data_prfx1 .= "<raw_text docid=\"";
-$data_prfx2 .= "\" xmlns=\"http://ids-mannheim.de/ns/KorAP\">\n";
-## TODO: can 'metadata.xml' change or is it constant?
-$data_prfx2 .= " <metadata file=\"metadata.xml\" />\n";
-##
-$data_prfx2 .= " <text>";
-$data_sfx = "</text>\n</raw_text>";
-
# ~ read input and write output (text by text) ~
main();
@@ -229,8 +214,10 @@
$input_fh = *STDIN; # input file handle (default: stdin)
- $data = $dir = "";
+ # Maybe not necessary
+ $data->reset;
+ $dir = "";
if ( $input_fname ne '' ){
@@ -250,7 +237,7 @@
# ~ loop (reading input document) ~
- MAIN: while ( <$input_fh> ){
+ MAIN: while ( <$input_fh> ){
$_ = remove_xml_comments( $input_fh, $_ ); # remove HTML (multi-line) comments (<!--...-->)
@@ -263,7 +250,7 @@
die "ERROR ($0): main(): input line number $.: line with opening text-body tag '${_TEXT_BODY}'"
." contains additional information ... => Aborting\n\tline=$_"
- if $pfx !~ /^\s*$/ || $sfx !~ /^\s*$/;
+ if $pfx !~ /^\s*$/ || $sfx !~ /^\s*$/;
# text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
my $buf_in = '';
@@ -298,11 +285,12 @@
# Insignificant whitespace is used when editing XML documents for readability.
# These whitespaces are typically not intended for inclusion in the delivery of the document.
#
- if ( $_XCT_LN ){ # _XCT_LINE_NUMBERS is only for debugging
- $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY | XCT_LINE_NUMBERS );
- } else {
- $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY );
- }
+
+ my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;
+
+ # _XCT_LINE_NUMBERS is only for debugging
+ $param |= XCT_LINE_NUMBERS if $_XCT_LN;
+ $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, $param);
$structures->reset;
@@ -310,8 +298,6 @@
$tokens->reset;
}
- $dl = 0;
-
# ~ whitespace related issue ~
$add_one = 0;
%ws = ();
@@ -320,34 +306,22 @@
# ~ recursion ~
retr_info(1, \$tree_data->[2] ); # parse input data
-
- # ~ write data.xml ~
-
- # TODO: should not be necessary, because whitespace at the end of every input line is removed: see 'whitespace handling' inside text body
- $data =~ tr/\n\r/ /; # note: 2 blanks - otherwise offset data would become corrupt
- #
-
-
- # Encode and escape data
- my $escaped_data = escape_xml_minimal(encode( "UTF-8", $data ));
- # note: the index still refers to the 'single character'-versions,
- # which are counted as 1 (search for '&' in data.xml and see
- # corresponding indices in $_tokens_file)
-
if ($_DEBUG) {
$log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
};
- $zipper->new_stream("$dir/$_data_file")
- ->print("$data_prfx1$text_id_esc$data_prfx2$escaped_data$data_sfx");
-
+ # ~ write data.xml ~
+ $data->to_zip(
+ $zipper->new_stream("$dir/${_data_file}"),
+ $text_id_esc
+ );
# ~ tokenization ~
if ( $_GEN_TOK_EXT ){
# Tokenize and output
- $ext_tok->tokenize($data)->to_zip(
+ $ext_tok->tokenize($data->data)->to_zip(
$zipper->new_stream("$dir/$_tok_dir/$_tok_file_ext"),
$text_id_esc
);
@@ -356,12 +330,12 @@
if ( $_GEN_TOK_INT ){
# Tokenize and output
- $cons_tok->tokenize($data)->to_zip(
+ $cons_tok->tokenize($data->data)->to_zip(
$zipper->new_stream("$dir/$_tok_dir/$_tok_file_con"),
$text_id_esc
);
- $aggr_tok->tokenize($data)->to_zip(
+ $aggr_tok->tokenize($data->data)->to_zip(
$zipper->new_stream("$dir/$_tok_dir/$_tok_file_agg"),
$text_id_esc
);
@@ -389,7 +363,10 @@
};
- $data = $dir = ""; # reinit.
+ $dir = ""; # reinit.
+
+ # Maybe not necessary
+ $data->reset;
} else { # $dir eq ""
@@ -593,7 +570,6 @@
foreach $e ( @{${$_[0]}} ){ # iteration through all array elements ($_[0] is a reference to an array reference)
-
if ( $e->[0] == XML_READER_TYPE_ELEMENT ){ # element-node (see 'NODE TYPES' in manpage of XML::LibXML::Reader)
@@ -633,7 +609,7 @@
# ~ index 'from' ~
# this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
- $anno->set_from($dl + $add_one);
+ $anno->set_from($data->position + $add_one);
#~~~~
# until here: tag-node (opening)
@@ -652,6 +628,7 @@
# from here: tag-node (closing)
#~~~~~
+ my $pos = $data->position;
# ~ handle structures and tokens ~
@@ -667,18 +644,18 @@
# in case this fails, check input
die "ERROR ($0, retr_info()): text_id='$text_id', processing of \@structures: from-value ($fval) is 2 or more greater"
- ." than to-value ($dl) => please check. aborting ...\n"
- if ( $fval - 1 ) > $dl;
+ ." than to-value ($pos) => please check. aborting ...\n"
+ if ( $fval - 1 ) > $pos;
# TODO: find example for which this case applies
# maybe this is not necessary anymore, because the above recorrection of the from-value suffices
- # TODO: check, if it's better to remove this line and change above check to 'if ( $fval - 1) >= $dl;
+ # TODO: check, if it's better to remove this line and change above check to 'if ( $fval - 1) >= $pos;
# do testing with bigger corpus excerpt (wikipedia?)
- $anno->set_from($dl) if $fval == $dl + 1;
- $anno->set_to($dl);
+ $anno->set_from($pos) if $fval == $pos + 1;
+ $anno->set_to($pos);
$anno->set_level($rl);
- # note: use $dl, because the offsets are _between_ the characters (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
+ # note: use $pos, because the offsets are _between_ the characters (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
}
# ~ whitespace related issue ~
@@ -725,7 +702,7 @@
# the last read 'non-tag'-node has to be corrected (see [1]),
#
# For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
- # additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $dl).
+ # additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
#
# [1]
# Now, what happens, when 2 text-nodes are _not_ seperated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
@@ -750,8 +727,9 @@
$add_one = 0;
- $ws{ $dl }++; # state, that this from-index belongs to a whitespace-node
- # ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
+ # state, that this from-index belongs to a whitespace-node
+ # ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
+ $ws{$data->position}++;
}else{
@@ -761,12 +739,9 @@
}
- # ~ update $data and $dl ~
+ # ~ update $data ~
- $data .= $e->[1];
-
- $dl += length( $e->[1] ); # update length of $data
-
+ $data->append($e->[1]);
#~~~~~
# until here: text- and whitespace-nodes