Merge "Fix and extend documentation"

commit: 6d07f0e8b112b251127e847cd7566c51b9e4faa2 [log] [tgz]
author: Akron <diewald@ids-mannheim.de> Mon Jul 27 22:32:56 2020 +0200
committer: Gerrit Code Review <gerrit2@korap.ids-mannheim.de> Mon Jul 27 22:32:56 2020 +0200
tree: 84ebefc6514376465d25ba52f829b80a52c8ca3a
parent: edee6e5115ef54f850ad0fe7f9a9eb0bf8b3a418 [diff]
parent: 4e603a5be37a5cd4cf0da8ed3fa8ae5d76ef58d0 [diff]
diff --git a/lib/KorAP/XML/TEI/Header.pm b/lib/KorAP/XML/TEI/Header.pm
new file mode 100644
index 0000000..3d9c06d
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Header.pm

@@ -0,0 +1,181 @@
+package KorAP::XML::TEI::Header;
+use strict;
+use warnings;
+use Encode qw(encode_utf8);
+
+# Parsing of i5 header files
+
+# Warning:
+# Opening and closing tags (without attributes) have to be in one line
+
+# Name of the header element - TODO: IDS-specific
+my $_HEADER_TAG = 'idsHeader';
+# Indices of the fields in the array reference that represents a header
+use constant {
+  TEXT      => 0,
+  HEADTYPE  => 1,
+  SIGLE     => 2
+};
+
+# convert '"', '&', '<' and '>' into their corresponding sgml-entities
+our %ent = (
+  '"' => '&quot;',
+  '&' => '&amp;',
+  '<' => '&lt;',
+  '>' => '&gt;'
+);
+
+# convert header type to the name of the tag that holds its sigle
+our %sig = (
+  corpus   => 'korpusSigle',
+  document => 'dokumentSigle',
+  text     => 'textSigle'
+);
+
+
+# Create new header object
+#
+# Takes the opening header tag (e.g. '<idsHeader type="text">') as its
+# only argument and dies if no type attribute can be read from it.
+#
+# The object is an array reference indexed by the constants TEXT (header
+# text collected so far), HEADTYPE (value of the type attribute) and
+# SIGLE (empty until parse() reads it from the header body).
+sub new {
+  my $class = shift;
+  my $text = shift;
+
+  # Check header types to distinguish between siglen types;
+  # a header init without a type attribute is unexpected
+  $text =~ m!^<${_HEADER_TAG} [^<]*type="([^"]+)"!
+    or die "ERROR ($0): Unable to parse header init '$text'";
+
+  # The captured type is all that is known besides the opening tag
+  return bless [$text, $1, ''], $class;
+};
+
+
+# Parse header object from filehandle
+#
+# Reads lines from $fh up to the closing header tag, appends them to the
+# header text and stores the sigle that matches the header type.
+# Returns the object, or nothing if the sigle is empty or the input ends
+# before the closing tag. Dies on malformed sigle or closing-tag lines.
+sub parse {
+  my ($self, $fh) = @_;
+
+  my $sig_type = $sig{$self->[HEADTYPE]} // 'textSigle';
+
+  # Iterate over file handle
+  while (<$fh>) {
+    # Change:
+    #   This version keeps comments in header files
+
+    # End of header found - finish parsing
+    if ( m!^(.*</${_HEADER_TAG}>)(.*)$! ){
+      # Add to text
+      $self->[TEXT] .= $1;
+
+      die "ERROR ($0): main(): input line number $.: line with closing header tag '${_HEADER_TAG}'"
+        ." contains additional information ... => Aborting\n\tline=$_"
+        if $2 !~ /^\s*$/;
+
+      if ($self->dir eq '') {
+        print STDERR "WARNING ($0): main(): input line number $.: empty " . $sig_type .
+          " in header => nothing to do ...\n header=" . $self->[TEXT] . "\n";
+        return;
+      };
+      return $self;
+    };
+
+    # Check for sigle in line
+    if ( m!^(.*)<$sig_type(?: [^>]*)?>([^<]*)(.*)$! ){
+      my $pfx = $1;
+      my $sig = $2;
+      my $sfx = $3;
+
+      die "ERROR ($0): main(): input line number $.: line with sigle-tag is not in expected format ... => Aborting\n\tline=$_"
+        if $pfx !~ /^\s*$/  || $sfx !~ m!^</$sig_type>\s*$! || $sig =~ /^\s*$/;
+
+      $self->[SIGLE] = encode_utf8($sig);
+      # Escape the decoded sigle instead of using sigle_esc: inserting the
+      # encoded variant would encode it a second time in to_zip
+      my $sig_esc = $sig =~ s/("|&|<|>)/$ent{$1}/gr;
+      # replace sigle in header, if there's an escaped version that differs
+      s!(<$sig_type(?: [^>]*)?>)[^<]+</$sig_type>!$1$sig_esc</$sig_type>! if $sig_esc ne $sig;
+    };
+
+    # Add line to header text
+    $self->[TEXT] .= $_;
+  };
+  # Input ended before the closing header tag was found
+  return;
+};
+
+# Accessor: header type as read from the type attribute of the opening tag
+sub type {
+  return $_[0][HEADTYPE];
+};
+
+
+# Directory path of the header file: the sigle with each '.' replaced by '/'
+sub dir {
+  return $_[0][SIGLE] =~ tr/\./\//r;
+};
+
+
+# Accessor: corpus/doc/text sigle as stored by parse (empty string before)
+sub sigle {
+  return $_[0][SIGLE];
+};
+
+
+# corpus/doc/text id: the sigle with each '/' replaced by '_'
+sub id {
+  return $_[0][SIGLE] =~ tr/\//_/r;
+};
+
+
+# Sigle with '"', '&', '<' and '>' replaced by their entities from %ent
+sub sigle_esc {
+  return $_[0][SIGLE] =~ s/("|&|<|>)/$ent{$1}/gr;
+};
+
+
+# Id ('/' replaced by '_') with '"', '&', '<' and '>' replaced by entities
+sub id_esc {
+  return $_[0][SIGLE] =~ tr/\//_/r =~ s/("|&|<|>)/$ent{$1}/gr;
+};
+
+
+# Serialize the header: the XML prolog from _header followed by the text
+sub to_string {
+  my ($self) = @_;
+  return join '', $self->_header, $self->[TEXT];
+};
+
+
+# XML prolog (XML declaration, xml-model instruction and DOCTYPE) for output
+sub _header {
+  # TODO: IDS-specific
+  # The invocant is not needed, as the prolog is a constant string
+  return <<'HEADER';
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="header.rng"
+            type="application/xml"
+            schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN"
+          "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+HEADER
+};
+
+
+# Print the serialized header, encoded as UTF-8, to the given zip stream
+sub to_zip {
+  my ($self, $stream) = @_;
+  return $stream->print(encode_utf8($self->to_string));
+};
+
+
+1;
+

diff --git a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
index 5099eeb..c36b605 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm

@@ -37,7 +37,7 @@
     };
   };
 
-  return;
+  return $self;
 };
 
 

diff --git a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
index 0efb648..237d87f 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm

@@ -32,7 +32,7 @@
     $self->_add_surroundings($txt, $-[3], $+[3]) if $3;
   };
 
-  return
+  return $self;
 };
 
 

diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index 7b740a1..4fe3751 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm

@@ -58,6 +58,7 @@
   return unless $self->{pid};
   my $out = $self->{chld_in};
   print $out $txt . $self->{sep};
+  return $self;
 };
 
 

diff --git a/script/tei2korapxml b/script/tei2korapxml
index fe815fb..f0e04c3 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -23,6 +23,7 @@
 use KorAP::XML::TEI::Tokenizer::Conservative;
 use KorAP::XML::TEI::Tokenizer::Aggressive;
 use KorAP::XML::TEI::Zipper;
+use KorAP::XML::TEI::Header;
 
 our $VERSION = '0.01';
 
@@ -55,15 +56,6 @@
 #
 # ~~~ parameter (mandatory) ~~~
 #
-
- # optional
-my $_CORP_SIGLE       = "korpusSigle";                 # opening and closing tags (without attributes) have to be in one line
-                                                       #  (e.g.: <korpusSigle>GOE</korpusSigle>)
- # optional
-my $_DOC_SIGLE        = "dokumentSigle";               # analog
- # mandatory
-my $_TEXT_SIGLE       = "textSigle";                   # analog
- # mandatory
 my $_TEXT_BODY        = "text";                        # tag (without attributes), which contains the primary text
  # optional
 my $_CORP_HEADER_BEG  = "idsHeader type=\"corpus\"";   # just keep the correct order of the attributes and evtl. add an '.*' between them
@@ -140,8 +132,6 @@
 my $data;                                            # contains the primary text (created by func. 'retr_info' from $buf_in), which is written to '$data_file'
 
 my $dir;                                             # text     directory (below $_root_dir)
-my $dir_crp;                                         # corpus   directory (below $_root_dir)
-my $dir_doc;                                         # document directory (below $_root_dir)
 
 my ( $text_id, $text_id_esc );                       # '$text_id_esc' = escaped version of $text_id (see %ent)
 
@@ -150,15 +140,10 @@
                                                      # note: the index still refers to the 'single character'-versions, which are counted as 1
                                                      #  (search for '&amp;' in data.xml and see corresponding indices in $_tokens_file)
 
-my $header_txt;                                      # raw text     header (written to '$_root_dir$dir/$_header_file')
-my $header_doc;                                      # raw document header (written to '$_root_dir$dir_doc/$_header_file')
-my $header_crp;                                      # raw corpus   header (written to '$_root_dir$dir_crp/$_header_file')
+my ( $data_fl );
 
-my ( $header_fl_crp, $header_fl_doc,                 # flags for tracking where we are in the input document
-     $header_fl_txt, $data_fl );
+my ( $data_prfx1, $data_prfx2, $data_sfx );          # $data_* are written to $_data_file
 
-my ( $header_prfx, $data_prfx1,                      # $header_prfx is written to $_header_file, $data_* are written to $_data_file
-     $data_prfx2, $data_sfx );
 
 my @structures;                                      # list of arrays, where each array represents a TEI I5 tag (except $_TOKENS_TAG) from the input document
                                                      #  - the input of this array is written in func. 'write_structures' into the file '$_structure_file'
@@ -193,8 +178,6 @@
 
 my ( $i, $c );                                       # index variables used in loops
 
-my ( $_CORP_HEADER_END, $_DOC_HEADER_END, $_TEXT_HEADER_END );
-
 
 #
 # ~~~ main ~~~
@@ -204,9 +187,7 @@
 
 ($_XCT_LN)?($_IDX=5):($_IDX=4);
 
-$header_prfx = $data_prfx1 = $data_prfx2 = $data_sfx = "";
-
-$header_fl_txt = $header_fl_doc = $header_fl_crp = 0;
+$data_prfx1 = $data_prfx2 = $data_sfx = "";
 
 $inside_tokens_tag = -1;
 
@@ -215,15 +196,9 @@
 $_root_dir .= '/'; # base dir must always end with a slash
 $_root_dir =~ s/^\.?\///; # remove leading / (only relative paths allowed in IO::Compress::Zip) and redundant ./
 
-$_CORP_HEADER_BEG =~ s#^([^\s]+)(.*)$#$1\[\^>\]*$2#; $_CORP_HEADER_END  = $1;
-$_DOC_HEADER_BEG  =~ s#^([^\s]+)(.*)$#$1\[\^>\]*$2#; $_DOC_HEADER_END   = $1;
-$_TEXT_HEADER_BEG =~ s#^([^\s]+)(.*)$#$1\[\^>\]*$2#; $_TEXT_HEADER_END  = $1;
-
-## TODO: remove this, because it's IDS-specific
-$header_prfx  = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
-$header_prfx .= "<?xml-model href=\"header.rng\" type=\"application/xml\" schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n";
-$header_prfx .= "<!DOCTYPE idsCorpus PUBLIC \"-//IDS//DTD IDS-XCES 1.0//EN\" \"http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd\">\n";
-##
+$_CORP_HEADER_BEG =~ s#^([^\s]+)(.*)$#$1\[\^>\]*$2#;
+$_DOC_HEADER_BEG  =~ s#^([^\s]+)(.*)$#$1\[\^>\]*$2#;
+$_TEXT_HEADER_BEG =~ s#^([^\s]+)(.*)$#$1\[\^>\]*$2#;
 
 $data_prfx1   = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
 $data_prfx1  .= "<?xml-model href=\"text.rng\" type=\"application/xml\" schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n";
@@ -249,20 +224,13 @@
 
   my ( $pfx, $sfx );
 
-  # TODO:
-  #   Replace all calls of $lc with $. or $input_fh->input_line_number,
-  #   because otherwise remove_html_comments will
-  #   move the lines forward without incrementing.
-  my $lc = 0; # line counter (only for error handling and debugging)
-
   my $tl = 0; # text line (needed for whitespace handling)
 
   $input_fh = *STDIN;  # input file handle (default: stdin)
 
   $data_fl = 0;
 
-  $buf_in = $data = $dir = $dir_doc = $dir_crp = "";
-  $header_txt = $header_doc = $header_crp = "";
+  $buf_in = $data = $dir = "";
 
 
   if ( $input_fname ne '' ){
@@ -281,8 +249,6 @@
 
   while ( <$input_fh> ){
 
-    $lc++; # line counter
-
     # TODO: yet not tested fo big amounts of data
     # must-have, otherwise comments in input could be fatal (e.g.: ...<!--\n<idsHeader...\n-->...)
     remove_xml_comments( $input_fh, $_ ); # remove HTML comments (<!--...-->)
@@ -297,7 +263,7 @@
 
       $pfx = $1; $sfx = $2;
 
-      die "ERROR ($0): main(): input line number $lc: line with closing text-body tag '${_TEXT_BODY}'"
+      die "ERROR ($0): main(): input line number $.: line with closing text-body tag '${_TEXT_BODY}'"
         ." contains additional information ... => Aborting\n\tline=$_"
           if $pfx !~ /^\s*$/ || $sfx !~ /^\s*$/;
 
@@ -351,14 +317,32 @@
         #
 
 
+        # ~ tokenization ~
+
         if ( $_GEN_TOK_EXT ){
 
-          $ext_tok->tokenize($data);
+          $ext_tok->tokenize($data)->to_zip(
+            $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_ext"),
+            $text_id_esc
+          );
 
-        } elsif ( $_GEN_TOK_INT ){
+        }
 
-          $cons_tok->tokenize($data);
-          $aggr_tok->tokenize($data);
+        if ( $_GEN_TOK_INT ){
+
+          # Tokenize and output
+          $cons_tok->tokenize($data)->to_zip(
+            $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_con"),
+            $text_id_esc
+          );
+
+          $aggr_tok->tokenize($data)->to_zip(
+            $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_agg"),
+            $text_id_esc
+          );
+
+          $aggr_tok->reset;
+          $cons_tok->reset;
         }
 
         $data = encode_utf8( $data );
@@ -380,31 +364,6 @@
 
         write_tokens() if $_TOKENS_PROC && @tokens;
 
-
-        # ~ tokenization ~
-
-        if ( $_GEN_TOK_EXT ) {
-
-          $ext_tok->to_zip(
-            $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_ext"),
-            $text_id_esc
-          )
-
-        } elsif ( $_GEN_TOK_INT ){
-
-          # Output token streams to zip streams
-          $cons_tok->to_zip(
-            $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_con"),
-            $text_id_esc
-          );
-          $aggr_tok->to_zip(
-            $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_agg"),
-            $text_id_esc
-          );
-          $aggr_tok->reset;
-          $cons_tok->reset;
-        }
-
         #print STDERR "$0: write_tokenization(): DONE\n";
 
         $data_fl = 0; $buf_in = $data = $dir = ""; # reinit.
@@ -412,7 +371,7 @@
       } else { # $dir eq ""
 
         print STDERR "WARNING ($0): main(): maybe empty textSigle => skipping this text ...\n";
-        print STDERR "WARNING ($0): main(): text header=$header_txt\n";
+        # print STDERR "WARNING ($0): main(): text header=$header_txt\n";
         print STDERR "WARNING ($0): main(): data=$data\n";
       }
 
@@ -469,248 +428,58 @@
       # add line to buffer
       $buf_in .= $_;
 
-    } elsif ( $header_fl_txt && m#^(.*</${_TEXT_HEADER_END}>)(.*)$# ){
-
-
-      # ~ end of text header ~
-
-
-      #print STDERR "end of text header\n";
-
-      # write it to header.xml
-
-      $sfx = $2;
-
-      $header_txt .= $1; $header_fl_txt = 0;
-
-
-      die "ERROR ($0): main(): input line number $lc: line with closing text-header tag '${_TEXT_HEADER_END}'"
-       ." contains additional information ... => Aborting\n\tline=$_"
-         if $sfx !~ /^\s*$/;
-
-      if ( $dir eq "" ){
-
-        print STDERR "WARNING ($0): main(): input line number $lc: empty textSigle in text header => nothing to do ...\ntext header=$header_txt\n";
-
-      } else {
-
-        print STDERR "DEBUG ($0): Writing file $_root_dir$dir/$_header_file\n" if $_DEBUG;
-
-        $header_txt = encode_utf8( $header_txt );
-
-        $zipper->new_stream("$_root_dir$dir/$_header_file")
-          ->print("$header_prfx$header_txt");
-
-        $header_txt = "";
-      }
-
-    } elsif ( $header_fl_txt ){
-
-      # ~ inside text header ~
-
-
-      #print STDERR "inside text header\n";
-
-      if( m#^(.*)<${_TEXT_SIGLE}(?: [^>]*)?>([^<]*)(.*)$# ){
-
-        $pfx = $1; $sfx = $3;
-
-        $dir = $2; $text_id = $dir;
-
-        $text_id =~ tr/\//_/; $dir =~ s/("|&|<|>)/$ent{$1}/g;
-
-        $text_id = encode_utf8( $text_id );
-
-        die "ERROR ($0): main(): input line number $lc: line with text-sigle tag '$_TEXT_SIGLE' is not in expected format ... => Aborting\n\tline=$_"
-          if $pfx !~ /^\s*$/  || $sfx !~ m#^</${_TEXT_SIGLE}>\s*$# || $dir =~ /^\s*$/;
-
-        # log output for seeing progression
-        print STDERR "$0: main(): text_id=".decode_utf8( $text_id )."\n";
-
-        $text_id_esc = $text_id;
-
-        s#(<${_TEXT_SIGLE}(?: [^>]*)?>)[^<]+(</${_TEXT_SIGLE}>)#$1$dir$2# # to be consistent with escaping, escape also textSigle in text-header
-          if $text_id_esc =~ s/("|&|<|>)/$ent{$1}/g;
-
-        $dir =~ tr/\./\//;
-      }
-
-      $header_txt .= $_;
-
-    } elsif ( $header_fl_doc && m#^(.*</${_DOC_HEADER_END}>)(.*)$# ){
-
-
-      # ~ end of document header ~
-
-      #print STDERR "end of doc header\n";
-
-      # write it to header.xml
-
-      $sfx = $2;
-
-      $header_doc .= $1; $header_fl_doc = 0;
-
-      die "ERROR ($0): main(): input line number $lc: line with closing document-header tag '${_DOC_HEADER_END}'"
-       ." contains additional information ... => Aborting\n\tline=$_"
-         if $sfx !~ /^\s*$/;
-
-      if( $dir_doc eq "" ){
-
-        print STDERR "WARNING ($0): main(): input line number $lc: empty document sigle in document header"
-          ." => nothing to do ...\ndocument header=$header_doc\n";
-
-      } else {
-
-        print STDERR "DEBUG ($0): Writing file $_root_dir$dir_doc/$_header_file\n" if $_DEBUG;
-
-        $header_doc = encode_utf8( $header_doc );
-
-        $zipper->new_stream("$_root_dir$dir_doc/$_header_file")
-          ->print("$header_prfx$header_doc");
-
-        $header_doc = $dir_doc = "";
-      }
-
-    } elsif ( $header_fl_doc ){
-
-
-      # ~ inside document header ~
-
-
-      #print STDERR "inside doc header\n";
-
-      if ( m#^(.*)<${_DOC_SIGLE}(?: [^>]*)?>([^<]*)(.*)$# ){
-
-        $pfx = $1; $sfx = $3;
-
-        $dir_doc = $2;
-
-        die "ERROR ($0): main(): input line number $lc: line with document-sigle tag '$_DOC_SIGLE' is not in expected format ... => Aborting\n\tline=$_"
-          if $pfx !~ /^\s*$/  || $sfx !~ m#^</${_DOC_SIGLE}>\s*$# || $dir_doc =~ /^\s*$/;
-
-        s#(<${_DOC_SIGLE}(?: [^>]*)?>)[^<]+(</${_DOC_SIGLE}>)#$1$dir_doc$2# # to be consistent with escaping, escape also textSigle in Document-Header
-          if $dir_doc =~ s/("|&|<|>)/$ent{$1}/g;
-      }
-
-      $header_doc .= $_;
-
-    } elsif ( m#^(.*)(<${_TEXT_HEADER_BEG}.*)$# ){
-
-      # ~ start of text header ~
-
-
-      #print STDERR "begin of text header\n";
-
-      $header_txt = $_; $header_fl_txt = 1; $pfx = $1;
-
-      $tl = 0; # reset (needed for ~ whitespace handling ~)
-
-      die "ERROR ($0): main(): input line number $lc: line with opening text-header tag '${_TEXT_HEADER_BEG}'"
-        ." is not in expected format ... => Aborting\n\tline=$_"
-          if $pfx !~ /^\s*$/;
-
     } elsif ( m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
 
-
       # ~ start of text body ~
 
-
       #print STDERR "inside text body\n";
 
       $pfx = $1; $sfx = $2;
 
       $data_fl = 1;
 
-      die "ERROR ($0): main(): input line number $lc: line with opening text-body tag '${_TEXT_BODY}'"
+      die "ERROR ($0): main(): input line number $.: line with opening text-body tag '${_TEXT_BODY}'"
         ." contains additional information ... => Aborting\n\tline=$_"
           if $pfx !~ /^\s*$/ || $sfx !~ /^\s*$/;
 
-    } elsif ( m#^(.*)(<${_DOC_HEADER_BEG}.*)$# ){
+    } elsif ( m#^(.*)(<(?:${_TEXT_HEADER_BEG}|${_DOC_HEADER_BEG}|${_CORP_HEADER_BEG}).*)$# ){
 
+      # ~ start of header ~
+      $pfx = $1;
+      my $content = "$2\n";
 
-      # ~ start of document header ~
-
-
-      #print STDERR "begin of doc header\n";
-
-      $header_doc = "$2\n"; $header_fl_doc = 1; $pfx = $1;
-
-      die "ERROR ($0): main(): input line number $lc: line with opening document-header tag '${_DOC_HEADER_BEG}'"
-        ."is not in expected format ... => Aborting\n\tline=$_"
-          if $pfx !~ /^\s*$/;
-
-    } elsif ( $header_fl_crp && m#^(.*</${_CORP_HEADER_END}>)(.*)$# ){
-
-
-      # ~ end of corpus header ~
-
-
-      #print STDERR "end of corp header\n";
-
-      $sfx = $2;
-
-      $header_crp .= $1; $header_fl_crp = 0;
-
-      die "ERROR ($0): main(): input line number $lc: line with closing corpus-header tag '${_CORP_HEADER_END}'"
-        ." contains additional information ... => Aborting\n\tline=$_"
-          if $sfx !~ /^\s*$/;
-
-      if ( $dir_crp eq "" ){
-
-        print STDERR "WARNING ($0): main(): input line number $lc: empty corpus sigle in corpus header => nothing to do ...\ncorpus header=$header_crp\n";
-
-      } else {
-
-        print STDERR "DEBUG ($0): Writing file $_root_dir$dir_crp/$_header_file\n" if $_DEBUG;
-
-        $header_crp = encode_utf8( $header_crp );
-
-        $zipper->new_stream("$_root_dir$dir_crp/$_header_file")
-          ->print("$header_prfx$header_crp");
-
-        $header_crp = $dir_crp = "";
-      }
-
-    } elsif ( $header_fl_crp ){
-
-
-      # ~ inside corpus header ~
-
-
-      #print STDERR "inside corp header\n";
-
-      if ( m#^(.*)<${_CORP_SIGLE}(?: [^>]*)?>([^<]*)(.*)$# ){
-
-        $pfx = $1; $sfx = $3;
-
-        $dir_crp = $2;
-
-        die "ERROR ($0): main(): input line number $lc: line with korpusSigle-tag is not in expected format ... => Aborting\n\tline=$_"
-          if $pfx !~ /^\s*$/  || $sfx !~ m#^</${_CORP_SIGLE}>\s*$# || $dir_crp =~ /^\s*$/;
-
-        if ( $dir_crp =~ s/("|&|<|>)/$ent{$1}/g ){
-
-          s#(<${_CORP_SIGLE}(?: [^>]*)?>)[^<]+(</${_CORP_SIGLE}>)#$1$dir_crp$2# # to be consistent with escaping, escape also textSigle in Corpus-Header
-        }
-      }
-
-      $header_crp .= $_;
-
-    } elsif ( m#^(.*)(<${_CORP_HEADER_BEG}.*)$# ){
-
-
-      # ~ start of corpus header ~
-
-
-      #print STDERR "begin of corp header\n";
-
-      $header_crp = $2; $header_fl_crp = 1; $pfx = $1;
-
-      die "ERROR ($0): main(): input line number $lc: line with opening corpus-header tag '${_CORP_HEADER_BEG}'"
+      die "ERROR ($0): main(): input line number $.: line with opening header tag"
         ." is not in expected format ... => Aborting\n\tline=$_"
           if $pfx !~ /^\s*$/;
-    }
 
+      # Parse header
+      my $header = KorAP::XML::TEI::Header->new($content)->parse($input_fh);
+
+      # Header was parseable
+      if ($header) {
+
+        # Write header to zip
+        my $file = $_root_dir . $header->dir . '/' . $_header_file;
+
+        print STDERR "DEBUG ($0): Writing file $file\n" if $_DEBUG;
+
+        $header->to_zip($zipper->new_stream($file));
+
+        # Header is for text level
+        if ($header->type eq 'text') {
+
+          # Remember dir and sigles
+          $dir         = $header->dir;
+          $text_id     = $header->id;
+          $text_id_esc = $header->id_esc;
+
+          # log output for seeing progression
+          print STDERR "$0: main(): text_id=".decode_utf8( $text_id )."\n";
+
+          $tl = 0; # reset (needed for ~ whitespace handling ~)
+        };
+      }
+    }
   } #end: while
 
   $zipper->close;

diff --git a/t/header.t b/t/header.t
new file mode 100644
index 0000000..0012e75
--- /dev/null
+++ b/t/header.t

@@ -0,0 +1,143 @@
+use strict;
+use warnings;
+
+use FindBin;
+BEGIN {
+  unshift @INC, "$FindBin::Bin/../lib";
+};
+
+use Test::More;
+use Test::KorAP::XML::TEI qw!korap_tempfile!;
+
+require_ok('KorAP::XML::TEI::Header');
+
+my $h;
+# new() dies on a header tag without type attribute, so $h stays undefined
+eval { $h = KorAP::XML::TEI::Header->new('<idsHeader>') };
+
+ok(!$h, 'Header invalid');
+
+subtest 'Corpus Header' => sub {
+  $h = KorAP::XML::TEI::Header->new('<idsHeader type="corpus">');
+  ok($h, 'Header valid');
+  # Before parsing, the sigle is empty and the text ends with the opening tag
+  is($h->sigle, '', 'Check sigle');
+  is($h->sigle_esc, '', 'Check sigle escaped');
+  is($h->dir, '', 'Check dir');
+  is($h->type, 'corpus', 'Check type');
+  like($h->to_string, qr!^<\?xml version!, 'String');
+  like($h->to_string, qr!<idsHeader type=\"corpus\">$!, 'String');
+
+  my ($fh, $filename) = korap_tempfile('header_1');
+
+  print $fh <<'HTML';
+<-- mehrzeiliger
+Kommentar
+  -->  <fileDesc>
+   <titleStmt>
+    <korpusSigle>GOE</korpusSigle>
+    <c.title>Goethe-Korpus</c.title>
+   </titleStmt>
+</idsHeader>
+Test
+HTML
+  # Rewind so that parse reads the header body from the start
+  seek($fh, 0, 0);
+
+  ok($h->parse($fh), 'Parsing');
+  # The text now runs up to the closing tag; the trailing line is not read
+  like($h->to_string, qr!^<\?xml version!, 'String');
+  like($h->to_string, qr!<idsHeader type=\"corpus\">!, 'String');
+  like($h->to_string, qr!<-- mehrzeiliger!, 'String');
+  like($h->to_string, qr!titleStmt!, 'String');
+  like($h->to_string, qr!</idsHeader>$!, 'String');
+
+  is($h->sigle, 'GOE', 'Check sigle');
+  is($h->sigle_esc, 'GOE', 'Check sigle escaped');
+  is($h->id, 'GOE', 'Check id');
+  is($h->id_esc, 'GOE', 'Check id escaped');
+  is($h->dir, 'GOE', 'Check dir');
+  is($h->type, 'corpus', 'Check type');
+};
+
+subtest 'Document Header' => sub {
+  $h = KorAP::XML::TEI::Header->new('<idsHeader type="document">');
+  ok($h, 'Header valid');
+  # Before parsing, the sigle is empty and the text ends with the opening tag
+  is($h->sigle, '', 'Check sigle');
+  is($h->sigle_esc, '', 'Check sigle escaped');
+  is($h->dir, '', 'Check dir');
+  like($h->to_string, qr!^<\?xml version!, 'String');
+  like($h->to_string, qr!<idsHeader type=\"document\">$!, 'String');
+
+  my ($fh, $filename) = korap_tempfile('header_2');
+
+  print $fh <<'HTML';
+  <fileDesc>
+   <titleStmt>
+    <dokumentSigle>GOE/"AAA"</dokumentSigle>
+   </titleStmt>
+</idsHeader>
+Test
+HTML
+  # Rewind so that parse reads the header body from the start
+  seek($fh, 0, 0);
+
+  ok($h->parse($fh), 'Parsing');
+
+  like($h->to_string, qr!^<\?xml version!, 'String');
+  like($h->to_string, qr!<idsHeader type=\"document\">!, 'String');
+  like($h->to_string, qr!titleStmt!, 'String');
+  like($h->to_string, qr!</idsHeader>$!, 'String');
+  # The quotes in the sigle are only escaped by the *_esc accessors
+  is($h->sigle, 'GOE/"AAA"', 'Check sigle');
+  is($h->sigle_esc, 'GOE/&quot;AAA&quot;', 'Check sigle escaped');
+  is($h->id, 'GOE_"AAA"', 'Check id');
+  is($h->id_esc, 'GOE_&quot;AAA&quot;', 'Check id escaped');
+  is($h->dir, 'GOE/"AAA"', 'Check dir');
+  is($h->type, 'document', 'Check type');
+};
+
+
+subtest 'Text Header' => sub {
+  $h = KorAP::XML::TEI::Header->new('<idsHeader foo="bar" type="text">');
+  ok($h, 'Header valid');
+  # Before parsing, the sigle is empty and the text ends with the opening tag
+  is($h->sigle, '', 'Check sigle');
+  is($h->sigle_esc, '', 'Check sigle escaped');
+  is($h->dir, '', 'Check dir');
+  like($h->to_string, qr!^<\?xml version!, 'String');
+  like($h->to_string, qr!<idsHeader foo="bar" type=\"text\">$!, 'String');
+
+  my ($fh, $filename) = korap_tempfile('header_3');
+
+  print $fh <<'HTML';
+  <fileDesc>
+   <titleStmt>
+    <textSigle>GOE/"AAA".00003</textSigle>
+   </titleStmt>
+</idsHeader>
+Test
+HTML
+  # Rewind so that parse reads the header body from the start
+  seek($fh, 0, 0);
+
+  ok($h->parse($fh), 'Parsing');
+
+  like($h->to_string, qr!^<\?xml version!, 'String');
+  like($h->to_string, qr!<idsHeader foo="bar" type=\"text\">!, 'String');
+  like($h->to_string, qr!titleStmt!, 'String');
+  like($h->to_string, qr!</idsHeader>$!, 'String');
+  # The sigle inside the header text is replaced by its escaped version
+  like($h->to_string, qr!GOE/&quot;AAA&quot;\.00003!, 'String');
+
+  is($h->sigle, 'GOE/"AAA".00003', 'Check sigle');
+  is($h->sigle_esc, 'GOE/&quot;AAA&quot;.00003', 'Check sigle escaped');
+  is($h->id, 'GOE_"AAA".00003', 'Check id');
+  is($h->id_esc, 'GOE_&quot;AAA&quot;.00003', 'Check id escaped');
+  is($h->dir, 'GOE/"AAA"/00003', 'Check dir');
+  is($h->type, 'text', 'Check type');
+};
+
+
+done_testing;

diff --git a/t/script.t b/t/script.t
index 8d17be6..f7f9468 100644
--- a/t/script.t
+++ b/t/script.t

@@ -7,6 +7,11 @@
 use Test::More;
 use Test::Output;
 use Test::XML::Loy;
+
+use FindBin;
+BEGIN {
+  unshift @INC, "$FindBin::Bin/../lib";
+};
 use Test::KorAP::XML::TEI qw!korap_tempfile!;
 
 my $f = dirname(__FILE__);
@@ -129,6 +134,8 @@
 $t = Test::XML::Loy->new($struct_xml);
 $t->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content');
 
+$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens.xml');
+ok(!$zip, 'External not generated');
 
 # Uncompress GOE/AGA/00000/base/tokens_aggressive.xml from zip file
 $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_aggressive.xml');
@@ -174,19 +181,22 @@
 
 $t->element_count_is('spanList span', 227);
 
+
 # Tokenize with external tokenizer
 my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
 
 my ($fh2, $outzip2) = korap_tempfile('script_out2');
 
 stderr_like(
-  sub { `cat '$file' | perl '$script' --tc='perl $cmd' > '$outzip2'` },
+  sub { `cat '$file' | perl '$script' -tc='perl $cmd' > '$outzip2'` },
   qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
   'Processing'
 );
 
 # Uncompress GOE/AGA/00000/base/tokens.xml from zip file
 $zip = IO::Uncompress::Unzip->new($outzip2, Name => 'GOE/AGA/00000/base/tokens.xml');
+ok($zip, 'Found');
+ok(!$zip->eof, 'Readable');
 
 # Read GOE/AGA/00000/base/tokens.xml
 $tokens_xml = '';
@@ -208,10 +218,6 @@
 $t->element_count_is('spanList span', 227);
 
 
-
-# TODO: call $script with approp. parameter for internal tokenization (actual: '$_GEN_TOK_INT = 1' hardcoded)
-
-
 my ($fh3, $outzip3) = korap_tempfile('script_out3');
 
 
@@ -306,4 +312,32 @@
 $t->element_count_is('spanList span', 22);
 
 
+subtest 'Check Tokenization Flags' => sub {
+  # Passing -ti and -tc together is expected to yield all three token files
+  # Get external tokenizer
+  my $f = dirname(__FILE__);
+  my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
+
+  # Load example file
+  my $file = catfile($f, 'data', 'goe_sample.i5.xml');
+
+  my ($fh, $outzip) = korap_tempfile('script_tokflags');
+
+  # Generate zip file (unportable!)
+  stderr_like(
+    sub { `cat '$file' | perl '$script' -ti -tc 'perl $cmd' > '$outzip'` },
+    qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
+    'Processing'
+  );
+
+  ok(-e $outzip, "File $outzip exists");
+  # $zip is not declared in this subtest; presumably a file-level variable
+  $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_aggressive.xml');
+  ok($zip, 'Aggressive generated');
+  $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_conservative.xml');
+  ok($zip, 'Conservative generated');
+  $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens.xml');
+  ok($zip, 'External generated');
+};
+
 done_testing;

diff --git a/t/tei.t b/t/tei.t
index 53f372e..6dca05c 100644
--- a/t/tei.t
+++ b/t/tei.t

@@ -1,13 +1,14 @@
 use strict;
 use warnings;
 use Test::More;
-use Test::KorAP::XML::TEI qw!korap_tempfile!;
 
 use FindBin;
 BEGIN {
   unshift @INC, "$FindBin::Bin/../lib";
 };
 
+use Test::KorAP::XML::TEI qw!korap_tempfile!;
+
 use_ok('KorAP::XML::TEI', 'remove_xml_comments');
 
 my ($fh, $filename) = korap_tempfile('tei');

diff --git a/t/tokenization.t b/t/tokenization.t
index 1d75e5f..92b7cc3 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t

@@ -3,6 +3,7 @@
 use Test::More;
 use File::Basename 'dirname';
 use File::Spec::Functions qw/catfile/;
+use IO::Uncompress::Unzip;
 use open qw(:std :utf8); # assume utf-8 encoding
 
 use FindBin;
@@ -10,8 +11,10 @@
   unshift @INC, "$FindBin::Bin/../lib";
 };
 
+use_ok('Test::KorAP::XML::TEI','korap_tempfile');
 require_ok('KorAP::XML::TEI::Tokenizer::Aggressive');
 require_ok('KorAP::XML::TEI::Tokenizer::Conservative');
+require_ok('KorAP::XML::TEI::Zipper');
 
 # Test aggressive
 my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
@@ -21,6 +24,12 @@
 $aggr->reset->tokenize("Der alte bzw. der grau-melierte Mann");
 is_deeply($aggr, [0,3,4,8,9,12,12,13,14,17,18,22,22,23,23,31,32,36]);
 
+like(
+  $aggr->reset->tokenize("Der")->to_string('a'),
+  qr!id="t_0"!,
+  'Chainable'
+);
+
 # Test conservative
 my $cons = KorAP::XML::TEI::Tokenizer::Conservative->new;
 $cons->tokenize("Der alte Mann");
@@ -111,4 +120,25 @@
 is(302, scalar(@$cons));
 
 
+subtest 'Test Zipper' => sub {
+  # Create a Zipper for the path of a temporary file
+  my ($fh, $outzip) = korap_tempfile('tokenize_zipper');
+  my $zip = KorAP::XML::TEI::Zipper->new($outzip);
+  $fh->close;
+  # Tokenize a short text and write the result to a stream of the zipper
+  my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
+  $aggr->tokenize("Der alte Mann");
+  ok($aggr->to_zip(
+    $zip->new_stream('tokens.xml'),
+    'fun'
+  ), 'Written successfully');
+
+  $zip->close;
+  # After closing, the entry has to be readable from the zip file
+  ok(-e $outzip, 'Zip exists');
+  my $unzip = IO::Uncompress::Unzip->new($outzip, Name => 'tokens.xml');
+  ok(!$unzip->eof, 'Unzip successful');
+};
+
+
 done_testing;

diff --git a/t/zipper.t b/t/zipper.t
index 2ee7fab..86aa52a 100644
--- a/t/zipper.t
+++ b/t/zipper.t

@@ -2,7 +2,6 @@
 use warnings;
 use Test::More;
 use File::Spec::Functions qw/catfile/;
-use Test::KorAP::XML::TEI qw!korap_tempfile!;
 use IO::Uncompress::Unzip;
 
 use FindBin;
@@ -10,6 +9,8 @@
   unshift @INC, "$FindBin::Bin/../lib";
 };
 
+use Test::KorAP::XML::TEI qw!korap_tempfile!;
+
 require_ok('KorAP::XML::TEI::Zipper');
 
 my $data;
@@ -36,7 +37,7 @@
 is($data, 'hello', 'Data correct');
 
 
-# Uncompress GOE/header.xml from zip file
+# Uncompress data/file2.txt from zip file
 $unzip = IO::Uncompress::Unzip->new($outzip, Name => 'data/file2.txt');
 
 $data = '';

diff --git a/xt/benchmark.pl b/xt/benchmark.pl
index 3407451..163b85b 100644
--- a/xt/benchmark.pl
+++ b/xt/benchmark.pl

@@ -13,6 +13,7 @@
   unshift @INC, "$FindBin::Bin/../lib";
 };
 
+use Test::KorAP::XML::TEI qw!korap_tempfile!;
 use KorAP::XML::TEI 'remove_xml_comments';
 use KorAP::XML::TEI::Tokenizer::Aggressive;
 use KorAP::XML::TEI::Tokenizer::Conservative;
@@ -46,7 +47,7 @@
 my $result;
 
 # Data for delHTMLcom-long
-my ($fh, $filename) = tempfile();
+my ($fh, $filename) = korap_tempfile('benchmark');
 
 print $fh <<'HTML';
 mehrzeiliger
commit	6d07f0e8b112b251127e847cd7566c51b9e4faa2	[log] [tgz]
author	Akron <diewald@ids-mannheim.de>	Mon Jul 27 22:32:56 2020 +0200
committer	Gerrit Code Review <gerrit2@korap.ids-mannheim.de>	Mon Jul 27 22:32:56 2020 +0200
tree	84ebefc6514376465d25ba52f829b80a52c8ca3a
parent	edee6e5115ef54f850ad0fe7f9a9eb0bf8b3a418 [diff]
parent	4e603a5be37a5cd4cf0da8ed3fa8ae5d76ef58d0 [diff]