Merge changes Id3fbb94a,Ib43733cf,I992fe374

* changes:
  Zip data.xml before tokens.xml
  Do not escape double quotes inside raw_text elements
  Add -tk option to use the standard KorAP tokenizer

commit: 41021abd841594bbb18bc8094df6950620daf5df [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Mon Sep 28 10:08:37 2020 +0200
committer: Gerrit Code Review <gerrit2@korap.ids-mannheim.de> Mon Sep 28 10:08:37 2020 +0200
tree: 9b69f7066f2987ab6dca4264d8fb96903cc588dd
parent: 7501ca0ecfa651f0add20fc8e8d959bb35c1fc54 [diff]
parent: 74ed7f349be99f68d36402fa94480c56a447467a [diff]
diff --git a/lib/KorAP/XML/TEI/Tokenizer.pm b/lib/KorAP/XML/TEI/Annotations.pm
similarity index 80%
rename from lib/KorAP/XML/TEI/Tokenizer.pm
rename to lib/KorAP/XML/TEI/Annotations.pm
index 90dd075..99a8880 100644
--- a/lib/KorAP/XML/TEI/Tokenizer.pm
+++ b/lib/KorAP/XML/TEI/Annotations.pm

@@ -1,35 +1,36 @@
-package KorAP::XML::TEI::Tokenizer;
+package KorAP::XML::TEI::Annotations;
 use strict;
 use warnings;
 use Log::Any qw($log);
 
-# This is the base class for tokenizer objects.
+# This is the base class for Annotation objects.
 
-# Construct a new tokenizer
+# Construct a new annotation collector
 sub new {
   bless [], shift;
 };
 
 
-# Reset the inner state of the tokenizer
-# and return the tokenizer object.
+# Reset the inner state of the annotation collection
+# and return the annotation object.
 sub reset {
   @{$_[0]} = ();
   $_[0];
 };
 
 
-# Return boundaries
+# Return boundaries of annotations
 sub boundaries {
   @{$_[0]};
 };
 
 
-# Check if no tokens are stored
+# Check if no annotations are stored
 sub empty {
   return @{$_[0]} > 0 ? 0 : 1
 };
 
+
 # Return data as a string
 sub to_string {
   my ($self, $text_id) = @_;

diff --git a/lib/KorAP/XML/TEI/Tokenizer/Token.pm b/lib/KorAP/XML/TEI/Annotations/Annotation.pm
similarity index 67%
rename from lib/KorAP/XML/TEI/Tokenizer/Token.pm
rename to lib/KorAP/XML/TEI/Annotations/Annotation.pm
index 619eb14..03f583f 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Token.pm
+++ b/lib/KorAP/XML/TEI/Annotations/Annotation.pm

@@ -1,6 +1,7 @@
-package KorAP::XML::TEI::Tokenizer::Token;
+package KorAP::XML::TEI::Annotations::Annotation;
 use strict;
 use warnings;
+use Log::Any '$log';
 use KorAP::XML::TEI 'escape_xml';
 
 # TODO:
@@ -21,7 +22,7 @@
 my $_INLINE_MSD_WR   = "msd";
 my $_INLINE_LEM_WR   = "lemma";
 
-# A token is represented as an array reference of information
+# An annotation is represented as an array reference of information
 # with variable length.
 
 use constant {
@@ -33,7 +34,7 @@
 };
 
 
-# Create a new token object
+# Create a new annotation object
 sub new {
   my $class = shift;
   my $self = bless [@_], $class;
@@ -70,6 +71,8 @@
 
 # Set level
 sub set_level {
+  # Insert information about depth of element in XML-tree
+  # (top element = level 1)
   $_[0]->[LEVEL] = $_[1];
 };
 
@@ -86,19 +89,25 @@
 };
 
 
-# Serialize header
-sub _header {
+# Serialize span information in header
+sub _header_span {
   my ($self, $id) = @_;
 
-  # l (level): insert information about depth of element in XML-tree (top element = level 1)
-
   # Start with indentation
   return '    ' .
     '<span id="s' . $id .
     '" from="' . ($self->[FROM] // '?') .
     '" to="' . ($self->[TO] // '?') .
     '" l="' . ($self->[LEVEL] // 0) . '">' .
-    "\n" .
+    "\n";
+};
+
+
+# Serialize header for lexemes
+sub _header_lex {
+
+  # Start with indentation
+  return _header_span(@_) .
     '      ' .
     '<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">' .
     "\n" .
@@ -107,9 +116,22 @@
 };
 
 
-# Serialize footer
+# Serialize header for structures
+sub _header_struct {
+
+  # Start with indentation
+  return _header_span(@_) .
+    '      ' .
+    '<fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">' .
+    "\n" .
+    '        ' .
+    '<f name="name">' . $_[0]->[TAG] . "</f>\n";
+};
+
+
+# Serialize footer for lex and struct
 sub _footer {
-  "        </f>\n      </fs>\n    </span>\n";
+  return "      </fs>\n        </span>\n";
 };
 
 
@@ -127,7 +149,7 @@
 sub to_string {
   my ($self, $id) = @_;
 
-  my $out = $self->_header($id);
+  my $out = $self->_header_lex($id);
 
   # Check if attributes exist
   if ($self->[ATTR_OFFSET]) {
@@ -144,7 +166,7 @@
     $out .= "          </fs>\n";
   };
 
-  return $out . $self->_footer;
+  return $out . "        </f>\n" . $self->_footer;
 };
 
 
@@ -152,7 +174,7 @@
 sub to_string_with_inline_annotations {
   my ($self, $id) = @_;
 
-  my $out = $self->_header($id);
+  my $out = $self->_header_lex($id);
 
   # if ( $idx > 2 ){ # attributes
   if ($self->[ATTR_OFFSET]) {
@@ -171,8 +193,8 @@
         # The POS attribute is defined
         if ($_INLINE_POS_WR) {
           unless (defined($1)) {
-            die 'ERROR (write_tokens()): unexpected format! => Aborting ... ' .
-              '(att: ' . $self->[ $att_idx + 1 ] . ")\n";
+            die $log->fatal('Unexpected format! => Aborting ... ' .
+                              '(att: ' . $self->[ $att_idx + 1 ] . ")");
           };
           $out .= _att($_INLINE_POS_WR, $1);
         };
@@ -180,8 +202,8 @@
         # The MSD attribute is defined
         if ($_INLINE_MSD_WR) {
           unless (defined($2)) {
-            die 'ERROR (write_tokens()): unexpected format! => Aborting ... ' .
-              '(att: ' . $self->[ $att_idx + 1 ] . ")\n";
+            die $log->fatal('Unexpected format! => Aborting ... ' .
+                              '(att: ' . $self->[ $att_idx + 1 ] . ")");
           };
           $out .= _att($_INLINE_MSD_WR, $2);
         };
@@ -205,6 +227,30 @@
     $out .= "          </fs>\n";
   };
 
+  return $out . "        </f>\n" . $self->_footer;
+};
+
+
+# Stringify as a struct annotation
+sub to_string_as_struct  {
+  my ($self, $id) = @_;
+
+  my $out = $self->_header_struct($id);
+
+  # Check if attributes exist
+  if ($self->[ATTR_OFFSET]) {
+
+    $out .= '        <f name="attr">' . "\n" .
+      '          <fs type="attr">' . "\n";
+    # Iterate over all attributes
+    for (my $att_idx = ATTR_OFFSET; $att_idx < @{$self}; $att_idx += 2) {
+      # Set attribute
+      $out .= _att($self->[$att_idx], $self->[$att_idx + 1]);
+    };
+    $out .= "          </fs>\n" .
+      "        </f>\n";
+  };
+
   return $out . $self->_footer;
 };
 

diff --git a/lib/KorAP/XML/TEI/Annotations/Collector.pm b/lib/KorAP/XML/TEI/Annotations/Collector.pm
new file mode 100644
index 0000000..2b13081
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Annotations/Collector.pm

@@ -0,0 +1,107 @@
+package KorAP::XML::TEI::Annotations::Collector;
+use base 'KorAP::XML::TEI::Annotations';
+use KorAP::XML::TEI::Annotations::Annotation;
+use Encode qw(encode decode);
+use strict;
+use warnings;
+
+use constant {
+  WITH_INLINE => 1,
+  STRUCTURE   => 2
+};
+
+
+# Add new annotation to annotation list
+sub add_new_annotation {
+  my $self = shift;
+  my $token = KorAP::XML::TEI::Annotations::Annotation->new(@_);
+  push @$self, $token;
+  return $token;
+};
+
+
+# Add existing annotation to annotation list
+sub add_annotation {
+  push @{$_[0]}, $_[1];
+};
+
+
+# Get last token added to the tokens list
+sub last_token {
+  # DEPRECATED
+  $_[0]->[$#{$_[0]}];
+};
+
+
+# Stringify all tokens
+sub to_string {
+  my ($self, $text_id, $param) = @_;
+
+  unless ($text_id) {
+    warn 'Missing textID';
+    return;
+  };
+
+  my $output = $self->_header($text_id);
+
+  # Iterate
+  my $c = 0;
+
+  # Correct tokens
+  # TODO:
+  #   Check if this is also necessary for structures
+  if ($param != STRUCTURE) {
+    # correct last from-value (if the 'second to last'
+    # from-value refers to an s-tag, then the last from-value
+    # is one too big - see retr_info())
+    my $last_token = $self->last_token;
+    if ($last_token->from == $last_token->to + 1) {
+      # TODO:
+      #   check
+      $last_token->set_from($last_token->to);
+    };
+  };
+
+
+  # Serialize tokens with respect to inline annotations
+  if ($param == WITH_INLINE) {
+    # Iterate over all tokens
+    foreach (@$self) {
+      $output .= $_->to_string_with_inline_annotations($c++);
+    };
+  }
+
+  # Serialize structures
+  elsif ($param == STRUCTURE) {
+    # Iterate over all structures
+    foreach (@$self) {
+      $output .= $_->to_string_as_struct($c++);
+    };
+  }
+
+  # Serialize tokens without respect to inline annotations
+  else {
+    # Iterate over all tokens
+    foreach (@$self) {
+      $output .= $_->to_string($c++);
+    };
+  };
+
+  return $output . $self->_footer;
+};
+
+
+# Overwrite non-applicable boundary method
+sub boundaries {
+  warn 'Not supported';
+};
+
+
+# Write data to zip stream (as utf8)
+sub to_zip {
+  my ($self, $zip, $text_id, $param) = @_;
+  $zip->print(encode('UTF-8', $self->to_string($text_id, $param)));
+};
+
+
+1;

diff --git a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
index c36b605..3044045 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm

@@ -1,5 +1,5 @@
 package KorAP::XML::TEI::Tokenizer::Aggressive;
-use base 'KorAP::XML::TEI::Tokenizer';
+use base 'KorAP::XML::TEI::Annotations';
 use strict;
 use warnings;
 

diff --git a/lib/KorAP/XML/TEI/Tokenizer/Collector.pm b/lib/KorAP/XML/TEI/Tokenizer/Collector.pm
deleted file mode 100644
index 3c67295..0000000
--- a/lib/KorAP/XML/TEI/Tokenizer/Collector.pm
+++ /dev/null

@@ -1,83 +0,0 @@
-package KorAP::XML::TEI::Tokenizer::Collector;
-use base 'KorAP::XML::TEI::Tokenizer';
-use KorAP::XML::TEI::Tokenizer::Token;
-use Encode qw(encode decode);
-use strict;
-use warnings;
-
-
-# Add token to tokens list
-sub add_token {
-  my $self = shift;
-  my $token = KorAP::XML::TEI::Tokenizer::Token->new(@_);
-  push @$self, $token;
-  return $token;
-};
-
-
-# Get last token added to the tokens list
-sub last_token {
-  $_[0]->[$#{$_[0]}];
-};
-
-
-# Stringify all tokens
-sub to_string {
-  my ($self, $text_id, $with_inline_annotations) = @_;
-
-  unless ($text_id) {
-    warn 'Missing textID';
-    return;
-  };
-
-  my $output = $self->_header($text_id);
-
-  # Iterate
-  my $c = 0;
-
-
-  # correct last from-value (if the 'second to last'
-  # from-value refers to an s-tag, then the last from-value
-  # is one to big - see retr_info())
-  my $last_token = $self->last_token;
-  if ($last_token->from == $last_token->to + 1) {
-    # TODO:
-    #   check
-    $last_token->set_from($last_token->to);
-  };
-
-
-  # Serialize with respect to inline annotations
-  if ($with_inline_annotations) {
-    # Iterate over all tokens
-    foreach (@$self) {
-      $output .= $_->to_string_with_inline_annotations($c++);
-    };
-  }
-
-  # Serialize without respect to inline annotations
-  else {
-    # Iterate over all tokens
-    foreach (@$self) {
-      $output .= $_->to_string($c++);
-    };
-  };
-
-  return $output . $self->_footer;
-};
-
-
-# Overwrite non-applicable boundary method
-sub boundaries {
-  warn 'Not supported';
-};
-
-
-# Write data to zip stream (as utf8)
-sub to_zip {
-  my ($self, $zip, $text_id, $with_inline_annotations) = @_;
-  $zip->print(encode('UTF-8', $self->to_string($text_id, $with_inline_annotations)));
-};
-
-
-1;

diff --git a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
index 237d87f..2b68786 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm

@@ -1,5 +1,5 @@
 package KorAP::XML::TEI::Tokenizer::Conservative;
-use base 'KorAP::XML::TEI::Tokenizer';
+use base 'KorAP::XML::TEI::Annotations';
 use strict;
 use warnings;
 

diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index fb9c972..9a09ec7 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm

@@ -1,5 +1,5 @@
 package KorAP::XML::TEI::Tokenizer::External;
-use base 'KorAP::XML::TEI::Tokenizer';
+use base 'KorAP::XML::TEI::Annotations';
 use strict;
 use warnings;
 use Log::Any qw($log);

diff --git a/script/tei2korapxml b/script/tei2korapxml
index 0e72931..8066b4c 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -24,7 +24,7 @@
 use KorAP::XML::TEI::Tokenizer::External;
 use KorAP::XML::TEI::Tokenizer::Conservative;
 use KorAP::XML::TEI::Tokenizer::Aggressive;
-use KorAP::XML::TEI::Tokenizer::Collector;
+use KorAP::XML::TEI::Annotations::Collector;
 use KorAP::XML::TEI::Zipper;
 use KorAP::XML::TEI::Header;
 
@@ -144,8 +144,9 @@
 # ~~~ variables ~~~
 #
 
-# Initialize Token-Collector
-my $tokens = KorAP::XML::TEI::Tokenizer::Collector->new;
+# Initialize Token- and Structure-Collector
+my $tokens = KorAP::XML::TEI::Annotations::Collector->new;
+my $structures = KorAP::XML::TEI::Annotations::Collector->new;
 
 
 # Initialize zipper
@@ -160,10 +161,6 @@
 
 my ( $data_prfx1, $data_prfx2, $data_sfx );          # $data_* are written to $_data_file
 
-
-my @structures;                                      # list of arrays, where each array represents a TEI I5 tag (except $_TOKENS_TAG) from the input document
-                                                     #  - the input of this array is written in func. 'write_structures' into the file '$_structure_file'
-
 my ( $ref, $idx, $att_idx );                         # needed in func. 'write_structures'
 
 my ( $reader,                                        # instance of 'XML::LibXML::Reader->new' (on input '$buf_in')
@@ -172,21 +169,17 @@
 # these are only used inside recursive function 'retr_info'
 my ( $_IDX,                                          # value is set dependent on $_XCT_LN - for extracting array of child elements from element in $tree_data
      $e,                                             # element from $tree_data
-     $n,                                             # tag name of actual processed element $e
      $dl,                                            # actual length of string $data
-     @oti,                                           # oti='open tags indizes' - a stack of indizes into @structures, where the top index in @oti
                                                      #                            represents the actual processed element from @structures
      ## variables for handling ~ whitespace related issue ~ (it is sometimes necessary, to correct the from-values for some tags)
      $add_one,                                       # ...
-     $fval, $fval2,                                  # ...
+     $fval,                                          # ...
      %ws);                                           # hash for indices of whitespace-nodes (needed to recorrect from-values)
                                                      # idea: when closing element, check if it's from-index minus 1 refers to a whitespace-node
                                                      #  (means: 'from-index - 1' is a key in %ws).
                                                      # if this is _not_ the case, then the from-value is one to high => correct it by substracting 1
 
-my $output;                                          # temporary variable needed in 'write_*'-functions for writing output to zip-stream $zip)
-
-my ( $i, $c );                                       # index variables used in loops
+my $c;                                               # index variable used in loops
 
 
 #
@@ -199,7 +192,7 @@
 
 $data_prfx1 = $data_prfx2 = $data_sfx = "";
 
-$fval = $fval2 = 0;
+$fval = 0;
 
 # Normalize regex for header parsing
 for ($_CORP_HEADER_BEG,
@@ -311,7 +304,7 @@
               $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY );
             }
 
-            @structures = (); @oti = ();
+            $structures->reset;
 
             if ( $_TOKENS_PROC ){
               $tokens->reset;
@@ -378,17 +371,20 @@
             };
 
             # ~ write structures ~
-
-            write_structures() if @structures;
-
+            if (!$structures->empty) {
+              $structures->to_zip(
+                $zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
+                $text_id_esc,
+                2 # = structure serialization
+              );
+            };
 
             # ~ write tokens ~
-
             if ($_TOKENS_PROC && !$tokens->empty) {
               $tokens->to_zip(
                 $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
                 $text_id_esc,
-                $_INLINE_ANNOT
+                $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
               );
             };
 
@@ -606,34 +602,17 @@
       #~~~~
 
 
-      # insert new array (for new tag) into @structures with tag-name and tag-attributes (if present)
-      # update @oti (open tags indizes) with @structures highest index (= $#structures); e.g.: @a=(1,2,3) => $#a = 2
-
-      # ~ tag name ~
-
-      $n = $e->[1];
-
-
       # ~ handle structures ~
 
-      my @array;
-      push @array, $n;
-      push @structures, \@array;
-      push @oti, $#structures; # add highest index of @structures to @oti
-
+      # $e->[1] represents the tag name
+      my $anno = $structures->add_new_annotation($e->[1]);
 
       # ~ handle tokens ~
 
-      # Wether to push entry also into tokens
-      my $inside_tokens_tag = 1 if $_TOKENS_PROC && $n eq $_TOKENS_TAG;
-
-      my $current_token;
-
-      # Add element to token list
-      if ($inside_tokens_tag) {
-        $current_token = $tokens->add_token($n); # TODO: adding $n is of no use (redundant)
-      }
-
+      # Add element also to token list
+      if ($_TOKENS_PROC && $e->[1] eq $_TOKENS_TAG) {
+        $tokens->add_annotation($anno);
+      };
 
       # ~ handle attributes ~
 
@@ -644,15 +623,9 @@
                                                   # note: arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
 
           # '$c' references the 'key' and '$c+1' the 'value'
-          push @{$structures[$#structures]}, ${$e->[3]}[$c], ${$e->[3]}[$c+1];
-
-          if ($inside_tokens_tag) {
-
-            # Add attributes to current token
-            $current_token->add_attribute(
-              @{$e->[3]}[$c, $c + 1]
-            );
-          }
+          $anno->add_attribute(
+            @{$e->[3]}[$c, $c + 1]
+          );
         }
       }
 
@@ -660,15 +633,7 @@
       # ~ index 'from' ~
 
       # this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
-
-      push @{$structures[$#structures]}, ( $dl + $add_one ); # see below (text- and whitespace-nodes) for explanation on '$add_one'
-
-      if ($inside_tokens_tag) {
-
-        # Set from value to tokens
-        $current_token->set_from($dl + $add_one);
-      };
-
+      $anno->set_from($dl + $add_one);
 
       #~~~~
       # until here: tag-node (opening)
@@ -688,20 +653,16 @@
       #~~~~~
 
 
-      # ~ handle structures ~
+      # ~ handle structures and tokens ~
 
       {
-        my $ix  = pop @oti; # index of just closed tag
-
-        my $aix = $#{$structures[$ix]}; # determine highest index from 'array referring to last closed tag' ...
-
-        $fval = ${$structures[$ix]}[ $aix ]; # ... and get it's from-value
+        $fval = $anno->from;
 
         if ( $fval > 0 && not exists $ws{ $fval - 1 } ){ # ~ whitespace related issue ~
 
           # ~ previous node was a text-node ~
 
-          ${$structures[$ix]}[ $aix ] = $fval - 1; # recorrect from-value (see below: Notes on ~ whitespace related issue ~)
+          $anno->set_from($fval - 1);
         }
 
         # in case this fails, check input
@@ -713,57 +674,16 @@
         #  maybe this is not necessary anymore, because the above recorrection of the from-value suffices
         # TODO: check, if it's better to remove this line and change above check to 'if ( $fval - 1) >= $dl;
         #   do testing with bigger corpus excerpt (wikipedia?)
-        ${$structures[$ix]}[ $aix ] = $dl if $fval == $dl + 1; # correct from-value (same as ... if $fval-1 == $dl)
-
-        push @{$structures[$ix]}, $dl, $rl; # to-value and recursion-level
+        $anno->set_from($dl) if $fval == $dl + 1;
+        $anno->set_to($dl);
+        $anno->set_level($rl);
 
         # note: use $dl, because the offsets are _between_ the characters (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
       }
 
-
-      # ~ handle tokens ~
-
-
-      if ($inside_tokens_tag) {
-
-        # Check last added token
-        my $last_token = $tokens->last_token;
-
-        # Get from-value from last added token
-        my $fval2 = $last_token->from;
-
-        if( $fval2 > 0 && not exists $ws{ $fval2 - 1 } ){ # ~ whitespace related issue ~
-
-          # ~ previous node was a text-node ~
-
-          # recorrect from-value
-          # (see below: Notes on ~ whitespace related issue ~)
-          $last_token->set_from($fval2 - 1);
-        }
-
-        # in case this fails, check input
-        die "ERROR ($0, retr_info()): text_id='$text_id', processing of tokens: from-value ($fval2) is 2 or more greater"
-          ." than to-value ($dl) => please check. aborting ...\n"
-            if ( $fval2 - 1 ) > $dl;
-
-        # TODO:
-        #   find example for which this case applies
-        #    maybe this is not necessary anymore, because the above recorrection of the from-value suffices
-        #
-        # TODO:
-        #   check, if it's better to remove this line and change above check to 'if ( $fval2 - 1) >= $dl;
-        #   do testing with bigger corpus excerpt (wikipedia?)
-
-        # Correct from-value (same as ... if $fval-1 == $dl)
-        $last_token->set_from($dl) if $fval2 == $dl + 1;
-        $last_token->set_to($dl); # Here from == to?
-        $last_token->set_level($rl);
-      }
-
       # ~ whitespace related issue ~
       # clean up
       delete $ws{ $fval  - 1 } if $fval > 0 && exists $ws{ $fval - 1 };
-      delete $ws{ $fval2 - 1 } if $_TOKENS_PROC && $fval2 > 0 && exists $ws{ $fval2 - 1 };
 
 
       #~~~~
@@ -866,85 +786,6 @@
 
 } # end: sub retr_info
 
-
-sub write_structures { # called from main()
-
-  # ~ write @structures ~
-
-  if ( $dir eq "" ){
-    $log->warn("write_structures(): empty textSigle => nothing to do ...");
-    return;
-  }
-
-  $output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
-           ." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\""
-           .decode( "UTF-8", $text_id_esc )."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n  <spanList>\n"; # convert binary string to text string
-
-  $c = 0;
-
-  foreach $ref ( @structures ){
-
-    ( @{$ref} == 4 )?( $idx = 1 ):( $idx = @{$ref}-3 ); # if array '@{$ref}' doesn't contain attributes, then the number of elements in this array is 4
-                                                        #  (name, from, to, rec_level), otherwise >4
-
-    # correct last from-value ( if the 'second to last' from-value refers to an s-tag, then the last from-value is one to big - see retr_info )
-
-    if( $#structures == $c && ${$ref}[ $idx ] == ${$ref}[ $idx+1 ] + 1 ){
-
-      ${$ref}[$idx] = ${$ref}[ $idx+1 ];
-    }
-
-    # this consistency check is already done in 'retr_info()'
-    #elsif( ${$ref}[$idx] > ${$ref}[$idx+1] ){ # consistency check: abort, if this doesn't hold
-    #  die "ERROR ($0: write_structures(): \$text_id=$text_id, \$c=$c, tag-name=${$ref}[0]):"
-    #     ." 'from-index=${$ref}[$idx]' > 'to-index=${$ref}[$idx+1]' => please check! aborting ...\n";
-    #  die "ERROR ($0: write_structures(): \$text_id=$text_id, \$c=$c, tag-name=${$ref}[0]):"
-    #     ." 'from-index=${$ref}[$idx]' > 'to-index=${$ref}[$idx+1]' => please check! aborting ...\n\n$output" }
-
-    # at least 'POS' should always be there => remove constraint '$_TOKENS_PROC'
-    #if( $_TOKENS_PROC && ${$ref}[0] ne $_TOKENS_TAG )
-
-    if( ${$ref}[0] ne $_TOKENS_TAG ){ # $_TOKENS_TAG is already written in 'write_tokens'
-
-      # l (level): insert information about depth of element in XML-tree (top element = level 1)
-      $output .=  "    <span id=\"s$c\" from=\"${$ref}[ $idx ]\" to=\"${$ref}[ $idx+1 ]\" l=\"${$ref}[ $idx+2 ]\">\n"
-                 ."      <fs type=\"struct\" xmlns=\"http://www.tei-c.org/ns/1.0\">\n"
-                 ."        <f name=\"name\">${$ref}[ 0 ]</f>\n";
-
-      if ( $idx > 2 ) # attributes
-      {
-        $output .= "        <f name=\"attr\">\n          <fs type=\"attr\">\n";
-
-        for ( $att_idx = 1; $att_idx < $idx; $att_idx += 2 ){
-
-          # see explanation in func. 'write_tokens'
-          ${$ref}[ $att_idx+1 ] = escape_xml(${$ref}[ $att_idx+1 ]);
-
-          # attribute (at index $att_idx) with value (at index $att_idx+1)
-          $output .= "            <f name=\"${$ref}[ $att_idx ]\">${$ref}[ $att_idx+1 ]</f>\n";
-        }
-
-        $output .= "          </fs>\n        </f>\n";
-      }
-
-      $output .= "      </fs>\n    </span>\n";
-
-    } # fi: ... ne $_TOKENS_TAG
-
-    $c++;
-
-  } # end: foreach
-
-  $output .= "  </spanList>\n</layer>";
-
-  $output = encode( "UTF-8", $output ); # convert text string to binary string
-
-  $zipper->new_stream("$dir/$_structure_dir/$_structure_file")
-    ->print($output);
-
-} # end: sub write_structures
-
-
 __END__
 
 =pod

diff --git a/t/token.t b/t/annotation.t
similarity index 92%
rename from t/token.t
rename to t/annotation.t
index 85bab65..1176c98 100644
--- a/t/token.t
+++ b/t/annotation.t

@@ -8,10 +8,10 @@
   unshift @INC, "$FindBin::Bin/../lib";
 };
 
-use_ok('KorAP::XML::TEI::Tokenizer::Token');
+use_ok('KorAP::XML::TEI::Annotations::Annotation');
 
 subtest 'Initialization' => sub {
-  my $t = KorAP::XML::TEI::Tokenizer::Token->new;
+  my $t = KorAP::XML::TEI::Annotations::Annotation->new;
 
   ok(!defined($t->from), 'Undefined from');
   ok(!defined($t->to), 'Undefined to');
@@ -52,7 +52,7 @@
 
 
 subtest 'Test inline annotations' => sub {
-  my $t = KorAP::XML::TEI::Tokenizer::Token->new('x1', 0, 6);
+  my $t = KorAP::XML::TEI::Annotations::Annotation->new('x1', 0, 6);
   $t->add_attribute('ana' => 'DET @PREMOD');
   $t->add_attribute('lemma' => 'C & A');
 

diff --git a/t/annotations-collect.t b/t/annotations-collect.t
new file mode 100644
index 0000000..1a8c45f
--- /dev/null
+++ b/t/annotations-collect.t

@@ -0,0 +1,60 @@
+use strict;
+use warnings;
+use Test::More;
+use Test::XML::Loy;
+
+use FindBin;
+BEGIN {
+  unshift @INC, "$FindBin::Bin/../lib";
+};
+
+use_ok('KorAP::XML::TEI::Annotations::Collector');
+use_ok('KorAP::XML::TEI::Annotations::Annotation');
+
+my $t = KorAP::XML::TEI::Annotations::Collector->new;
+
+$t->add_new_annotation('x1',0,8);
+my $token = $t->add_new_annotation('x2',9,14,2);
+$t->add_new_annotation('x3',15,20);
+
+my $loy = Test::XML::Loy->new($token->to_string(2));
+
+$loy->attr_is('span', 'id', 's2')
+  ->attr_is('span', 'from', 9)
+  ->attr_is('span', 'to', 14)
+  ->attr_is('span', 'l', 2)
+  ->attr_is('span fs f', 'name', 'lex')
+  ;
+
+$loy = Test::XML::Loy->new($t->last_token->to_string(3));
+
+$loy->attr_is('span', 'id', 's3')
+  ->attr_is('span', 'from', 15)
+  ->attr_is('span', 'to', 20)
+  ->attr_is('span fs f', 'name', 'lex')
+;
+
+$loy = Test::XML::Loy->new($t->to_string('text', 0))
+  ->attr_is('layer', 'docid', 'text')
+  ->attr_is('span#s0', 'to', '8')
+  ->attr_is('span#s1', 'to', '14')
+  ->attr_is('span#s1', 'l', '2')
+  ->attr_is('span#s2', 'to', '20')
+;
+
+my $anno = KorAP::XML::TEI::Annotations::Annotation->new('x4', 20 => 21);
+
+$t->add_annotation($anno);
+
+$loy = Test::XML::Loy->new($t->to_string('text',0))
+  ->attr_is('layer', 'docid', 'text')
+  ->attr_is('span#s0', 'to', '8')
+  ->attr_is('span#s1', 'to', '14')
+  ->attr_is('span#s1', 'l', '2')
+  ->attr_is('span#s2', 'to', '20')
+  ->attr_is('span#s3', 'from', '20')
+  ->attr_is('span#s3', 'to', '21')
+;
+
+done_testing;
+

diff --git a/t/tokenization-collect.t b/t/tokenization-collect.t
deleted file mode 100644
index 952c4b7..0000000
--- a/t/tokenization-collect.t
+++ /dev/null

@@ -1,46 +0,0 @@
-use strict;
-use warnings;
-use Test::More;
-use Test::XML::Loy;
-
-use FindBin;
-BEGIN {
-  unshift @INC, "$FindBin::Bin/../lib";
-};
-
-use_ok('KorAP::XML::TEI::Tokenizer::Collector');
-
-my $t = KorAP::XML::TEI::Tokenizer::Collector->new;
-
-$t->add_token('x1',0,8);
-my $token = $t->add_token('x2',9,14,2);
-$t->add_token('x3',15,20);
-
-my $loy = Test::XML::Loy->new($token->to_string(2));
-
-$loy->attr_is('span', 'id', 's2')
-  ->attr_is('span', 'from', 9)
-  ->attr_is('span', 'to', 14)
-  ->attr_is('span', 'l', 2)
-  ->attr_is('span fs f', 'name', 'lex')
-  ;
-
-$loy = Test::XML::Loy->new($t->last_token->to_string(3));
-
-$loy->attr_is('span', 'id', 's3')
-  ->attr_is('span', 'from', 15)
-  ->attr_is('span', 'to', 20)
-  ->attr_is('span fs f', 'name', 'lex')
-;
-
-$loy = Test::XML::Loy->new($t->to_string('text', 0))
-  ->attr_is('layer', 'docid', 'text')
-  ->attr_is('span#s0', 'to', '8')
-  ->attr_is('span#s1', 'to', '14')
-  ->attr_is('span#s1', 'l', '2')
-  ->attr_is('span#s2', 'to', '20')
-;
-
-
-done_testing;
-
commit	41021abd841594bbb18bc8094df6950620daf5df	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Mon Sep 28 10:08:37 2020 +0200
committer	Gerrit Code Review <gerrit2@korap.ids-mannheim.de>	Mon Sep 28 10:08:37 2020 +0200
tree	9b69f7066f2987ab6dca4264d8fb96903cc588dd
parent	7501ca0ecfa651f0add20fc8e8d959bb35c1fc54 [diff]
parent	74ed7f349be99f68d36402fa94480c56a447467a [diff]