Merge "Remove deprecated last_token method from Annotations::Collector"
diff --git a/Changes b/Changes
new file mode 100644
index 0000000..49ccd01
--- /dev/null
+++ b/Changes
@@ -0,0 +1,2 @@
+0.01 2020-09-28
+ - Initial release to GitHub.
diff --git a/Readme.pod b/Readme.pod
new file mode 100644
index 0000000..a71f3c8
--- /dev/null
+++ b/Readme.pod
@@ -0,0 +1,142 @@
+=pod
+
+=encoding utf8
+
+=head1 NAME
+
+tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
+
+=head1 SYNOPSIS
+
+ cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip
+
+=head1 DESCRIPTION
+
+C<tei2korapxml> is a script to convert TEI P5 and
+L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
+based documents to the
+L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
+If no specific input is defined, data is
+read from C<STDIN>. If no specific output is defined, data is written
+to C<STDOUT>.
+
+This program is usually called from inside another script.
+
+=head1 FORMATS
+
+=head2 Input restrictions
+
+=over 2
+
+=item
+
+UTF-8 encoded
+
+=item
+
+TEI P5 formatted input with certain restrictions:
+
+=over 4
+
+=item
+
+B<mandatory>: text-header with integrated textsigle, text-body
+(see the sketch after this list)
+
+=item
+
+B<optional>: corp-header with integrated corpsigle,
+doc-header with integrated docsigle
+
+=back
+
+=item
+
+Tokens inside the primary text must not be separated by newlines,
+because newlines are removed (see L<KorAP::XML::TEI::Data>) and
+converting newlines into blanks between two tokens could introduce
+blanks where there should be none (e.g. punctuation characters like
+C<,> or C<.> should not be separated from their preceding token);
+see also the code section C<~ whitespace handling ~>.
+
+=back
+
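+A minimal input skeleton that satisfies these restrictions might look
+roughly like this (the nesting inside the header is abbreviated and only
+illustrative; see the I5 text model for the exact element structure):
+
+  <idsText>
+    <idsHeader type="text">
+      <!-- ... metadata including the textSigle ... -->
+    </idsHeader>
+    <text>All primary text tokens, ideally on a single line.</text>
+  </idsText>
+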
+=head2 Notes on the output
+
+=over 2
+
+=item
+
+ZIP file output (written to C<STDOUT> by default) with UTF-8 encoded
+entries that together form the KorAP-XML format (see the example below)
+
+=back
+
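+Per converted text, the resulting zip typically contains entries along
+these lines (directory names are derived from the sigles; the listing is
+illustrative, the exact file names are defined in the script):
+
+  <corpus-sigle>/<doc-sigle>/<text-sigle>/header.xml
+  <corpus-sigle>/<doc-sigle>/<text-sigle>/data.xml
+  <corpus-sigle>/<doc-sigle>/<text-sigle>/struct/structure.xml
+  <corpus-sigle>/<doc-sigle>/<text-sigle>/base/tokens.xml
+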
+=head1 INSTALLATION
+
+C<tei2korapxml> requires the C<libxml2> development headers (e.g. the
+C<libxml2-dev> package) to build. Once these are available, the
+preferred way to install the script is to use L<cpanm|App::cpanminus>.
+
+ $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
+
+If everything went well, the C<tei2korapxml> tool will be
+available on your command line immediately.
+
+Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
+
+=head1 OPTIONS
+
+=over 2
+
+=item B<--root|-r>
+
+The root directory for output. Defaults to C<.>.
+
+=item B<--help|-h>
+
+Print help information.
+
+=item B<--version|-v>
+
+Print version information.
+
+=item B<--tokenizer-call|-tc>
+
+Call an external tokenizer process that reads single lines of text
+from C<STDIN> and outputs one token per line, as in the example below.
+
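+For example (C<my_tokenizer> is a placeholder for any program that
+reads text from C<STDIN> and writes one token per line to C<STDOUT>):
+
+  cat corpus.i5.xml | tei2korapxml -tc 'my_tokenizer' > corpus.korapxml.zip
+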
+=item B<--tokenizer-korap|-tk>
+
+Use the standard KorAP/DeReKo tokenizer.
+
+=item B<--tokenizer-internal|-ti>
+
+Tokenize the data using two embedded tokenizers that take an
+I<aggressive> and a I<conservative> approach; the resulting files
+are listed below.
+
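+With this option, the zip will contain two additional token files per
+text (file names as defined in the script):
+
+  <text-directory>/base/tokens_conservative.xml
+  <text-directory>/base/tokens_aggressive.xml
+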
+=item B<--log|-l>
+
+Log level for L<Log::Any>. Defaults to C<notice>.
+
+=back
+
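+A typical invocation combining these options might look like this
+(an illustrative example):
+
+  cat corpus.i5.xml | tei2korapxml --tokenizer-korap --log debug > corpus.korapxml.zip
+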
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (C) 2020, L<IDS Mannheim|https://www.ids-mannheim.de/>
+
+Author: Peter Harders
+
+Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
+
+L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
+Corpus Analysis Platform at the
+L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
+member of the
+L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
+
+This program is free software published under the
+L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-TEI/master/LICENSE>.
+
+=cut
diff --git a/lib/KorAP/XML/TEI.pm b/lib/KorAP/XML/TEI.pm
index 8f1678d..27ef6f9 100644
--- a/lib/KorAP/XML/TEI.pm
+++ b/lib/KorAP/XML/TEI.pm
@@ -22,8 +22,10 @@
sub remove_xml_comments {
my ($fh, $html) = @_;
- # the source code part where $tc is used, leads to the situation, that comments can produce an additional blank, which
- # sometimes is not desirable (e.g.: '...<!-- comment -->\n<w>token</w>...' would lead to '... <w>token</w>...' in $buf_in).
+  # The code section in which $tc is used can lead to comments
+  # producing an additional blank, which is sometimes not desirable
+  # (e.g. '...<!-- comment -->\n<w>token</w>...' would lead to
+  # '... <w>token</w>...' in $buf_in).
# removing comments before processing the line, prevents this situation.
my $pfx = '';
@@ -58,8 +60,9 @@
}
}
- if ( $html =~ /^\s*$/ ){ # get next line and feed it also to this sub, if actual line is empty or only contains whitespace
-
+ if ($html =~ /^\s*$/) {
+    # if the current line is empty or contains only whitespace,
+    # get the next line and feed it to this sub as well
$html = <$fh>;
goto CHECK;
};
diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index 9a09ec7..b7d4c87 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm
@@ -52,7 +52,7 @@
my ($self, $txt) = @_;
return unless $self->{pid};
my $out = $self->{chld_in};
- print $out encode( "UTF-8", $txt ) . $self->{sep};
+ print $out encode('UTF-8', $txt) . $self->{sep};
return $self;
};
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 9ce2c8e..ed6b488 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -45,7 +45,7 @@
"input|i=s" => \(my $input_fname = ''), # input file (yet only TEI I5 Format accepted)
'tokenizer-call|tc=s' => \(my $tokenizer_call), # Temporary argument for testing purposes
'tokenizer-korap|tk' => \(my $tokenizer_korap), # use KorAP-tokenizer
- 'use-intern-tokenization|ti' => \(my $tokenizer_intern), # use intern tokenization (default = no)
+ 'tokenizer-internal|ti' => \(my $tokenizer_intern), # use intern tokenization (default = no)
'log|l=s' => \(my $log_level = 'notice'),
'help|h' => sub {
pod2usage(
@@ -70,42 +70,42 @@
# ~~~ parameter (mandatory) ~~~
#
my $_TEXT_BODY = "text"; # tag (without attributes), which contains the primary text
- # optional
+# optional
my $_CORP_HEADER_BEG = "idsHeader type=\"corpus\""; # just keep the correct order of the attributes and evtl. add an '.*' between them
- # optional
+# optional
my $_DOC_HEADER_BEG = "idsHeader type=\"document\""; # analog
- # mandatory
+# mandatory
my $_TEXT_HEADER_BEG = "idsHeader type=\"text\""; # analog
#
# ~~~ constants ~~~
#
+
## extern tokenization
my $_GEN_TOK_EXT = $tokenizer_call || $tokenizer_korap ? 1 : 0;
- # TODO:
- # Read tokenizer call from configuration file.
- # was 'java -cp '. join(":", ".", glob(&dirname(__FILE__)."/../target/*.jar")). " de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl";
- my $ext_tok;
- if ($tokenizer_call) {
- $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
- }
+my $ext_tok;
+if ($tokenizer_call) {
+ $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
+}
- elsif ($tokenizer_korap) {
- $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new;
- };
- my $_tok_file_ext = "tokens.xml";
+elsif ($tokenizer_korap) {
+ $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new;
+};
+my $_tok_file_ext = "tokens.xml";
##
+
## intern tokenization
my $_GEN_TOK_INT = $tokenizer_intern; # simple tokenization (recommended for testing)
- my $_tok_file_con = "tokens_conservative.xml";
- my $_tok_file_agg = "tokens_aggressive.xml";
- my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
- my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
+my $_tok_file_con = "tokens_conservative.xml";
+my $_tok_file_agg = "tokens_aggressive.xml";
+my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
+my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
##
+
my $_tok_dir = "base"; # name of directory for storing tokenization files
my $_DEBUG = 0; # set to 1 for minimal more debug output (no need to be parametrized)
@@ -128,18 +128,6 @@
# handling inline annotations (inside $_TOKENS_TAG)
my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE}?1:0; # on/off: set to 1 if inline annotations are present and should be processed (default: 0)
-# TODO:
-# These parameters are now defunct and moved to Token.pm
-my $_INLINE_LEM_RD = "lemma"; # from which attribute to read LEMMA information
-my $_INLINE_ATT_RD = "ana"; # from which attribute to read POS information (and evtl. additional MSD - Morphosyntactic Descriptions)
- # TODO: The format for the POS and MSD information has to suffice the regular expression ([^ ]+)( (.+))?
- # - which means, that the POS information can be followed by an optional blank with additional
- # MSD information; unlike the MSD part, the POS part may not contain any blanks.
-my $_INLINE_POS_WR = "pos"; # name (inside $_tokens_file) referring to POS information
-my $_INLINE_MSD_WR = "msd"; # name (inside $_tokens_file) referring to MSD information
-my $_INLINE_LEM_WR = "lemma"; # name (inside $_tokens_file) referring to LEMMA information
-##
-
#
# ~~~ variables ~~~
@@ -160,7 +148,8 @@
my $dir; # text directory (below $_root_dir)
-my ( $text_id, $text_id_esc ); # '$text_id_esc' = escaped version of $text_id
+my ( $text_id,
+ $text_id_esc ); # '$text_id_esc' = escaped version of $text_id
my ( $reader, # instance of 'XML::LibXML::Reader->new' (on input '$buf_in')
$tree_data ); # instance of 'XML::CompactTree::XS::readSubtreeToPerl' (on input '$reader')
@@ -198,277 +187,268 @@
# ~ read input and write output (text by text) ~
-main();
+my ( $pfx, $sfx );
-#
-# ~~~ subs ~~~
-#
+my $tl = 0; # text line (needed for whitespace handling)
+$input_fh = *STDIN; # input file handle (default: stdin)
-sub main {
+# Maybe not necessary
+$data->reset;
- my ( $pfx, $sfx );
+$dir = "";
- my $tl = 0; # text line (needed for whitespace handling)
+if ( $input_fname ne '' ){
+ unless (open($input_fh, '<', $input_fname)) {
+ die $log->fatal("File '$input_fname' could not be opened.");
+ };
+}
- $input_fh = *STDIN; # input file handle (default: stdin)
+# prevents segfaulting of 'XML::LibXML::Reader' - see the notes on 'PerlIO layers' in 'man XML::LibXML'
+# removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on the input is more granular
+# see perluniintro: "You can switch encodings on an already opened stream by using binmode()"
+# see perlfunc: "If LAYER is omitted or specified as ':raw' the filehandle is made suitable for passing binary data."
+binmode $input_fh;
- # Maybe not necessary
- $data->reset;
+my $pos;
+my $l = length('</' . $_TEXT_BODY) + 1;
- $dir = "";
+# ~ loop (reading input document) ~
- if ( $input_fname ne '' ){
+MAIN: while ( <$input_fh> ){
- open ( $input_fh, "<", "$input_fname") || die "File \'$input_fname\' could not be opened.\n";
+ $_ = remove_xml_comments( $input_fh, $_ ); # remove HTML (multi-line) comments (<!--...-->)
- }
+ if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
+ # ~ start of text body ~
- # prevents segfaulting of 'XML::LibXML::Reader' inside 'main()' - see notes on 'PerlIO layers' in 'man XML::LibXML')
- # removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
- # see in perluniintro: You can switch encodings on an already opened stream by using "binmode()
- # see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.
- binmode $input_fh;
+ $pfx = $1;
+ $sfx = $2;
- my $pos;
- my $l = length('</' . $_TEXT_BODY) + 1;
+ if ($pfx !~ /^\s*$/ || $sfx !~ /^\s*$/) {
+ die $log->fatal("input line number $.: " .
+ "line with opening text-body tag '${_TEXT_BODY}' " .
+ "contains additional information ... => Aborting (line=$_)");
+ };
- # ~ loop (reading input document) ~
+ # text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
+ my $buf_in = '';
- MAIN: while ( <$input_fh> ){
+ # Iterate over all lines in the text body
+ while (<$input_fh>) {
- $_ = remove_xml_comments( $input_fh, $_ ); # remove HTML (multi-line) comments (<!--...-->)
+ $_ = remove_xml_comments( $input_fh, $_ );
+ # ~ end of text body ~
+ if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
- if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
+      # write data.xml, structure.xml and possibly morpho.xml and/or tokenization files (see also: $_tok_file_ext, $_tok_file_con, $_tok_file_agg)
- # ~ start of text body ~
+ if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
+ die $log->fatal("input line number $.: " .
+ "line with closing text-body tag '${_TEXT_BODY}'".
+ " contains additional information ... => Aborting (line=$_)");
+ };
- $pfx = $1; $sfx = $2;
+ if ($dir ne "") {
- die "ERROR ($0): main(): input line number $.: line with opening text-body tag '${_TEXT_BODY}'"
- ." contains additional information ... => Aborting\n\tline=$_"
- if $pfx !~ /^\s*$/ || $sfx !~ /^\s*$/;
+ $reader = XML::LibXML::Reader->new( string => "<text>$buf_in</text>", huge => 1 );
- # text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
- my $buf_in = '';
+ # ~ whitespace handling ~
+ #
+ # Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
+ # (see function 'retr_info()').
+ #
+ # Definition of significant and insignificant whitespace
+ # (source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
+ #
+ # Significant whitespace is part of the document content and should be preserved.
+ # Insignificant whitespace is used when editing XML documents for readability.
+ # These whitespaces are typically not intended for inclusion in the delivery of the document.
+ #
- # Iterate over all lines in the text body
- while (<$input_fh>) {
+ my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;
- $_ = remove_xml_comments( $input_fh, $_ );
+ # _XCT_LINE_NUMBERS is only for debugging
+ $param |= XCT_LINE_NUMBERS if $_XCT_LN;
+ $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, $param);
- # ~ end of text body ~
- if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
+ $structures->reset;
- # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files (s.a.: $_tok_file_ext, $_tok_file_con, $_tok_file_agg)
+ $tokens->reset if $_TOKENS_PROC;
- die "ERROR ($0): main(): input line number $.: line with closing text-body tag '${_TEXT_BODY}'"
- ." contains additional information ... => Aborting\n\tline=$_"
- if (substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/;
+ # ~ whitespace related issue ~
+ $add_one = 0;
+ %ws = ();
- if ( $dir ne "" ){
+ # ~ recursion ~
+ retr_info(1, \$tree_data->[2] ); # parse input data
- $reader = XML::LibXML::Reader->new( string => "<text>$buf_in</text>", huge => 1 );
+ if ($_DEBUG) {
+ $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
+ };
- # ~ whitespace handling ~
- #
- # Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
- # (see function 'retr_info()').
- #
- # Definition of significant and insignificant whitespace
- # (source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
- #
- # Significant whitespace is part of the document content and should be preserved.
- # Insignificant whitespace is used when editing XML documents for readability.
- # These whitespaces are typically not intended for inclusion in the delivery of the document.
- #
+ # ~ write data.xml ~
+ $data->to_zip(
+ $zipper->new_stream("$dir/${_data_file}"),
+ $text_id_esc
+ );
- my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;
+ # ~ tokenization ~
+ if ($_GEN_TOK_EXT) {
- # _XCT_LINE_NUMBERS is only for debugging
- $param |= XCT_LINE_NUMBERS if $_XCT_LN;
- $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, $param);
+ # Tokenize and output
+ $ext_tok->tokenize($data->data)->to_zip(
+ $zipper->new_stream("$dir/$_tok_dir/$_tok_file_ext"),
+ $text_id_esc
+ );
+ };
- $structures->reset;
+ if ($_GEN_TOK_INT) {
- if ( $_TOKENS_PROC ){
- $tokens->reset;
- }
-
- # ~ whitespace related issue ~
- $add_one = 0;
- %ws = ();
-
-
- # ~ recursion ~
- retr_info(1, \$tree_data->[2] ); # parse input data
-
- if ($_DEBUG) {
- $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
- };
-
- # ~ write data.xml ~
- $data->to_zip(
- $zipper->new_stream("$dir/${_data_file}"),
+ # Tokenize and output
+ $cons_tok->tokenize($data->data)->to_zip(
+ $zipper->new_stream("$dir/$_tok_dir/$_tok_file_con"),
$text_id_esc
);
- # ~ tokenization ~
+ $aggr_tok->tokenize($data->data)->to_zip(
+ $zipper->new_stream("$dir/$_tok_dir/$_tok_file_agg"),
+ $text_id_esc
+ );
- if ( $_GEN_TOK_EXT ){
+ $aggr_tok->reset;
+ $cons_tok->reset;
+ };
- # Tokenize and output
- $ext_tok->tokenize($data->data)->to_zip(
- $zipper->new_stream("$dir/$_tok_dir/$_tok_file_ext"),
- $text_id_esc
- );
- };
+ # ~ write structures ~
+ if (!$structures->empty) {
+ $structures->to_zip(
+ $zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
+ $text_id_esc,
+ 2 # = structure serialization
+ );
+ };
- if ( $_GEN_TOK_INT ){
+ # ~ write tokens ~
+ if ($_TOKENS_PROC && !$tokens->empty) {
+ $tokens->to_zip(
+ $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
+ $text_id_esc,
+ $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
+ );
+ };
- # Tokenize and output
- $cons_tok->tokenize($data->data)->to_zip(
- $zipper->new_stream("$dir/$_tok_dir/$_tok_file_con"),
- $text_id_esc
- );
+ $dir = ""; # reinit.
- $aggr_tok->tokenize($data->data)->to_zip(
- $zipper->new_stream("$dir/$_tok_dir/$_tok_file_agg"),
- $text_id_esc
- );
+ # Maybe not necessary
+ $data->reset;
- $aggr_tok->reset;
- $cons_tok->reset;
- };
+ } else { # $dir eq ""
- # ~ write structures ~
- if (!$structures->empty) {
- $structures->to_zip(
- $zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
- $text_id_esc,
- 2 # = structure serialization
- );
- };
-
- # ~ write tokens ~
- if ($_TOKENS_PROC && !$tokens->empty) {
- $tokens->to_zip(
- $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
- $text_id_esc,
- $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
- );
- };
-
-
- $dir = ""; # reinit.
-
- # Maybe not necessary
- $data->reset;
-
- } else { # $dir eq ""
-
- $log->warn("Maybe empty textSigle => skipping this text ...\ndata=$data");
- }
-
- next MAIN;
- };
-
- # ~ inside text body ~
-
- # ~ whitespace handling ~
-
- # The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks where inserted
- # into the text with the addition that maybe (or not) whitespace before those linebreaks was unintenionally stripped.
- #
- # It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
- # example further down and notes on 'Input restrictions' in the manpage).
- #
- # Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
- #
- # TODO: Maybe it's best, to keep the stripping of whitespace and to just remove the if-clause and to insert a blank by default (with possibly
- # an option on how newlines in primary text should be handled (stripped or replaced by a whitespace)).
- #
- # Examples (how primary text with linebreaks would be converted by below code):
- #
- # '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
- # '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '<w>,<w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>'.
-
- s/^\s+//; s/\s+$//; # remove consecutive whitespace at beginning and end (mostly one newline)
-
- ### NOTE: this is only relevant, if a text consists of more than one line
- ### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
- ### do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)
- if ( m/<[^>]+>[^<]/ ){ # line contains at least one tag with at least one character contents
-
- # NOTE: not stringent ('...' stands for text):
- #
- # beg1............................end1 => no blank before 'beg1'
- # beg2....<pb/>...................end2 => no blank before 'beg2'
- # beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
- # beg4....<test>ok</test>.........end4 => blank before 'beg4'
- #
- # => beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
- # ^
- # |_blank between 'end3' and 'beg4'
-
- $tl++; # counter for text lines
-
- s/^(.)/ $1/ if $tl > 1; # insert blank before 1st character (for 2nd line and consecutive lines)
+ $log->warn("Maybe empty textSigle => skipping this text ...\ndata=$data");
}
- ###
- # add line to buffer
- $buf_in .= $_;
+ next MAIN;
};
- } elsif ( m#^(.*)(<(?:${_TEXT_HEADER_BEG}|${_DOC_HEADER_BEG}|${_CORP_HEADER_BEG}).*)$# ){
+ # ~ inside text body ~
- # ~ start of header ~
- $pfx = $1;
- my $content = "$2\n";
+ # ~ whitespace handling ~
- die "ERROR ($0): main(): input line number $.: line with opening header tag"
- ." is not in expected format ... => Aborting\n\tline=$_"
- if $pfx !~ /^\s*$/;
+  # The idea for the code fragment below was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
+  # into the text and where whitespace before those linebreaks may (or may not) have been unintentionally stripped.
+  #
+  # It soon turned out that the best suggestion is to simply avoid linebreaks and to put all primary text tokens into one line (see the
+  # example further down and the notes on 'Input restrictions' in the manpage).
+  #
+  # Somehow an early, very rough approach remained; it is not stringent, but it also doesn't affect one-line texts.
+ #
+  # TODO: Maybe it's best to keep the stripping of whitespace and to just remove the if-clause and to insert a blank by default (with possibly
+ # an option on how newlines in primary text should be handled (stripped or replaced by a whitespace)).
+ #
+ # Examples (how primary text with linebreaks would be converted by below code):
+ #
+ # '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
+ # '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '<w>,<w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>'.
- # Parse header
- my $header = KorAP::XML::TEI::Header->new($content)->parse($input_fh);
+ s/^\s+//; s/\s+$//; # remove consecutive whitespace at beginning and end (mostly one newline)
- # Header was parseable
- if ($header) {
+ ### NOTE: this is only relevant, if a text consists of more than one line
+ ### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
+ ### do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)
+ if (m/<[^>]+>[^<]/) { # line contains at least one tag with at least one character contents
- # Write header to zip
- my $file = $header->dir . '/' . $_header_file;
+ # NOTE: not stringent ('...' stands for text):
+ #
+ # beg1............................end1 => no blank before 'beg1'
+ # beg2....<pb/>...................end2 => no blank before 'beg2'
+ # beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
+ # beg4....<test>ok</test>.........end4 => blank before 'beg4'
+ #
+ # => beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
+ # ^
+ # |_blank between 'end3' and 'beg4'
- $log->debug("Writing file $file") if $_DEBUG;
+ $tl++; # counter for text lines
- $header->to_zip($zipper->new_stream($file));
+ s/^(.)/ $1/ if $tl > 1; # insert blank before 1st character (for 2nd line and consecutive lines)
+ }
+ ###
- # Header is for text level
- if ($header->type eq 'text') {
+ # add line to buffer
+ $buf_in .= $_;
+ };
- # Remember dir and sigles
- $dir = $header->dir;
- $text_id = $header->id;
- $text_id_esc = $header->id_esc;
+ } elsif (m#^(.*)(<(?:${_TEXT_HEADER_BEG}|${_DOC_HEADER_BEG}|${_CORP_HEADER_BEG}).*)$#) {
- # log output for seeing progression
- $log->notice("$0: main(): text_id=".decode('UTF-8', $text_id ));
+ # ~ start of header ~
+ $pfx = $1;
+ my $content = "$2\n";
- $tl = 0; # reset (needed for ~ whitespace handling ~)
- };
+ if ($pfx !~ /^\s*$/) {
+ die $log->fatal("input line number $.: " .
+ "line with opening header tag" .
+ " is not in expected format ... => Aborting (line=$_)");
+ };
+
+ # Parse header
+ my $header = KorAP::XML::TEI::Header->new($content)->parse($input_fh);
+
+ # Header was parseable
+ if ($header) {
+
+ # Write header to zip
+ my $file = $header->dir . '/' . $_header_file;
+
+ $log->debug("Writing file $file") if $_DEBUG;
+
+ $header->to_zip($zipper->new_stream($file));
+
+ # Header is for text level
+ if ($header->type eq 'text') {
+
+ # Remember dir and sigles
+ $dir = $header->dir;
+ $text_id = $header->id;
+ $text_id_esc = $header->id_esc;
+
+ # log output for seeing progression
+      $log->notice("$0: text_id=".decode('UTF-8', $text_id));
+
+ $tl = 0; # reset (needed for ~ whitespace handling ~)
}
}
- } #end: while
+ }
+} #end: while
- $zipper->close;
+$zipper->close;
- $ext_tok->close if $_GEN_TOK_EXT;
+$ext_tok->close if $_GEN_TOK_EXT;
-} # end: sub main
+exit(0);
sub retr_info { # called from main()
@@ -561,23 +541,20 @@
# ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
# $data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
# $data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
- #
+ #
#
# retr_info() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
# Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
# ${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
+ foreach $e (@{${$_[0]}}) { # iteration through all array elements ($_[0] is a reference to an array reference)
- foreach $e ( @{${$_[0]}} ){ # iteration through all array elements ($_[0] is a reference to an array reference)
-
- if ( $e->[0] == XML_READER_TYPE_ELEMENT ){ # element-node (see 'NODE TYPES' in manpage of XML::LibXML::Reader)
-
+ if ($e->[0] == XML_READER_TYPE_ELEMENT) { # element-node (see 'NODE TYPES' in manpage of XML::LibXML::Reader)
#~~~~
# from here: tag-node (opening)
#~~~~
-
# ~ handle structures ~
# $e->[1] represents the tag name
@@ -592,9 +569,9 @@
# ~ handle attributes ~
- if ( defined $e->[3] ){ # only if attributes exist
+ if (defined $e->[3]) { # only if attributes exist
- for ( $c = 0; $c < @{$e->[3]}; $c += 2 ){ # with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
+ for ($c = 0; $c < @{$e->[3]}; $c += 2) { # with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
# [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
# note: arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
@@ -618,7 +595,7 @@
# ~~ RECURSION ~~
- if ( defined $e->[$_IDX] ){ # do no recursion, if $e->[$_IDX] is not defined (because we have no array of child-nodes, e.g.: <back/>)
+ if (defined $e->[$_IDX]) { # do no recursion, if $e->[$_IDX] is not defined (because we have no array of child-nodes, e.g.: <back/>)
retr_info($rl+1, \$e->[$_IDX]); # recursion with array of child-nodes
}
@@ -635,7 +612,7 @@
{
$fval = $anno->from;
- if ( $fval > 0 && not exists $ws{ $fval - 1 } ){ # ~ whitespace related issue ~
+ if ($fval > 0 && not exists $ws{$fval - 1}) { # ~ whitespace related issue ~
# ~ previous node was a text-node ~
@@ -643,12 +620,16 @@
}
# in case this fails, check input
- die "ERROR ($0, retr_info()): text_id='$text_id', processing of \@structures: from-value ($fval) is 2 or more greater"
- ." than to-value ($pos) => please check. aborting ...\n"
- if ( $fval - 1 ) > $pos;
+ if (($fval - 1) > $pos) {
+ die $log->fatal("text_id='$text_id', " .
+ "processing of structures: " .
+ "from-value ($fval) is 2 or more greater " .
+ "than to-value ($pos) => please check. Aborting");
+ };
# TODO: find example for which this case applies
# maybe this is not necessary anymore, because the above recorrection of the from-value suffices
+ #
# TODO: check, if it's better to remove this line and change above check to 'if ( $fval - 1) >= $pos;
# do testing with bigger corpus excerpt (wikipedia?)
$anno->set_from($pos) if $fval == $pos + 1;
@@ -660,7 +641,7 @@
# ~ whitespace related issue ~
# clean up
- delete $ws{ $fval - 1 } if $fval > 0 && exists $ws{ $fval - 1 };
+ delete $ws{$fval - 1} if $fval > 0 && exists $ws{$fval - 1};
#~~~~
@@ -668,20 +649,20 @@
#~~~~
- #~~~~~
- # from here: text- and whitespace-nodes
- #~~~~~
+ #~~~~~
+ # from here: text- and whitespace-nodes
+ #~~~~~
- # The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
- # 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
- #
- # When modifiying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
- # '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and it's type is '14'
- # (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
- #
- # echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
+ # The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
+ # 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
+ #
+      # When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
+      # '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
+ # (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
+ #
+ # echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
- } elsif ( $e->[0] == XML_READER_TYPE_TEXT || $e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE ){
+ } elsif ($e->[0] == XML_READER_TYPE_TEXT || $e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE){
# Notes on ~ whitespace related issue ~ (referred to the code fragment below)
#
@@ -689,7 +670,7 @@
#
# Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
# 'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
- #
+ #
# The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
# it's from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
#
@@ -719,7 +700,7 @@
# Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' are the from-
# and to-indizes for the tags 'a' and 'b' both 12, which is the start-index of the token 'tok3'.
- if( $e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE ){
+ if ($e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {
# ~ whitespace-node ~
@@ -731,12 +712,12 @@
# ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
$ws{$data->position}++;
- }else{
+ } else {
# ~ text-node ~
$add_one = 1;
- }
+ };
# ~ update $data ~
@@ -748,13 +729,13 @@
#~~~~~
- #elsif ( $e->[0] == XML_READER_TYPE_ATTRIBUTE ) # attribute-node
- # note: attributes cannot be processed like this ( => use 'XCT_ATTRIBUTE_ARRAY' - see above )
+ # elsif ( $e->[0] == XML_READER_TYPE_ATTRIBUTE ) # attribute-node
+ # note: attributes cannot be processed like this ( => use 'XCT_ATTRIBUTE_ARRAY' - see above )
- }else{ # not yet handled type
+ } else { # not yet handled type
- die "ERROR ($0): Not yet handled type (\$e->[0]=".$e->[0].") ... => Aborting\n";
+ die $log->fatal('Not yet handled type ($e->[0]=' . $e->[0] . ') ... => Aborting');
}
} # end: foreach iteration
@@ -816,9 +797,9 @@
=item
-all tokens inside the primary text (inside $data) may not be
+All tokens inside the primary text may not be
newline seperated, because newlines are removed
-(see code section C<~ inside text body ~>) and a conversion of newlines
+(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
into blanks between 2 tokens could lead to additional blanks,
where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be seperated from their predecessor token).
@@ -875,7 +856,7 @@
Use the standard KorAP/DeReKo tokenizer.
-=item B<--use-intern-tokenization|-ti>
+=item B<--tokenizer-internal|-ti>
Tokenize the data using two embedded tokenizers,
that will take an I<Aggressive> and a I<conservative>
@@ -893,7 +874,7 @@
Author: Peter Harders
-Contributors: Marc Kupietz, Carsten Schnober, Nils Diewald
+Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Corpus Analysis Platform at the