Establish collection object for token annotations Change-Id: I03f9ff1f28301135b24dc111b7ef85c3af86a8e6

commit: 09e0b2c7f4ce5f2f7e1c1b95ac12776f9ad48063 [log] [tgz]
author: Akron <nils@diewald-online.de> Tue Jul 28 15:57:01 2020 +0200
committer: Akron <nils@diewald-online.de> Thu Jul 30 13:13:08 2020 +0200
tree: 26424b77df3a906a7d6966e7a22e47db1c88b7ac
parent: e68ec0c24d75a301d9f6e41cab9e66ea6337bceb [diff]
diff --git a/lib/KorAP/XML/TEI/Tokenizer.pm b/lib/KorAP/XML/TEI/Tokenizer.pm
index c4a310a..331c8e4 100644
--- a/lib/KorAP/XML/TEI/Tokenizer.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer.pm

@@ -24,6 +24,11 @@
 };
 
 
+# Check if no tokens are stored.
+# Returns 1 if the object holds no elements, otherwise 0.
+sub empty {
+  my $self = shift;
+  return 0 if @{$self};
+  return 1;
+};
+
 # Return data as a string
 sub to_string {
   my ($self, $text_id) = @_;

diff --git a/lib/KorAP/XML/TEI/Tokenizer/Collector.pm b/lib/KorAP/XML/TEI/Tokenizer/Collector.pm
new file mode 100644
index 0000000..3c67295
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Tokenizer/Collector.pm

@@ -0,0 +1,83 @@
+package KorAP::XML::TEI::Tokenizer::Collector;
+use base 'KorAP::XML::TEI::Tokenizer';
+use KorAP::XML::TEI::Tokenizer::Token;
+use Encode qw(encode decode);
+use strict;
+use warnings;
+
+
+# Create a new token from the passed arguments, append it
+# to the tokens list and return the created token object.
+sub add_token {
+  my ($self, @token_data) = @_;
+
+  # The construction is delegated to the token class
+  my $new_token = KorAP::XML::TEI::Tokenizer::Token->new(@token_data);
+  push @{$self}, $new_token;
+
+  return $new_token;
+};
+
+
+# Get last token added to the tokens list
+# (undefined, if the list is empty)
+sub last_token {
+  my $self = shift;
+  return $self->[-1];
+};
+
+
+# Stringify all tokens
+#
+# Parameters:
+#   $text_id                 - ID of the text, passed to the inherited
+#                              header serialization (mandatory)
+#   $with_inline_annotations - if true, the tokens are serialized using
+#                              to_string_with_inline_annotations(),
+#                              otherwise using to_string()
+#
+# Returns the header, all serialized tokens and the footer as one
+# string - or nothing (with a warning), if the text ID is missing.
+# As a side effect, the from-value of the last token may be corrected.
+sub to_string {
+  my ($self, $text_id, $with_inline_annotations) = @_;
+
+  unless ($text_id) {
+    warn 'Missing textID';
+    return;
+  };
+
+  my $output = $self->_header($text_id);
+
+  # Correct last from-value (if the 'second to last'
+  # from-value refers to an s-tag, then the last from-value
+  # is one too big - see retr_info()).
+  # An empty collection has no last token and tokens without
+  # offsets can't be compared, so nothing is corrected then.
+  my $last_token = $self->last_token;
+  if ($last_token &&
+        defined $last_token->from &&
+        defined $last_token->to &&
+        $last_token->from == $last_token->to + 1) {
+    # TODO:
+    #   check
+    $last_token->set_from($last_token->to);
+  };
+
+  # Choose the serialization method once instead of
+  # repeating the iteration for both variants
+  my $serialize = $with_inline_annotations ?
+    'to_string_with_inline_annotations' : 'to_string';
+
+  # Iterate over all tokens and number them consecutively
+  my $c = 0;
+  foreach my $token (@$self) {
+    $output .= $token->$serialize($c++);
+  };
+
+  return $output . $self->_footer;
+};
+
+
+# Overwrite non-applicable boundary method:
+# the call only emits a warning.
+sub boundaries {
+  my $message = 'Not supported';
+  return warn $message;
+};
+
+
+# Write data to zip stream (as utf8)
+#
+# The collection is stringified for the given text ID (with or
+# without respect to inline annotations), encoded as UTF-8 and
+# passed to the print method of the stream object.
+sub to_zip {
+  my ($self, $zip, $text_id, $with_inline_annotations) = @_;
+
+  my $serialized = $self->to_string($text_id, $with_inline_annotations);
+
+  return $zip->print(encode('UTF-8', $serialized));
+};
+
+
+1;

diff --git a/lib/KorAP/XML/TEI/Tokenizer/Token.pm b/lib/KorAP/XML/TEI/Tokenizer/Token.pm
new file mode 100644
index 0000000..619eb14
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Tokenizer/Token.pm

@@ -0,0 +1,212 @@
+package KorAP::XML::TEI::Tokenizer::Token;
+use strict;
+use warnings;
+use KorAP::XML::TEI 'escape_xml';
+
+# TODO:
+#   Make these parameters passable from the script
+#
+# handling inline annotations (inside $_TOKENS_TAG)
+# from which attribute to read LEMMA or ANA information
+my $_INLINE_LEM_RD   = "lemma";
+my $_INLINE_ATT_RD   = "ana";
+
+# TODO:
+#   The format for the POS and MSD information has to suffice
+#   the regular expression ([^ ]+)( (.+))? - which means, that
+#   the POS information can be followed by an optional blank with
+#   additional MSD information; unlike the MSD part, the POS part
+#   may not contain any blanks.
+my $_INLINE_POS_WR   = "pos";
+my $_INLINE_MSD_WR   = "msd";
+my $_INLINE_LEM_WR   = "lemma";
+
+# A token is represented as an array reference of information
+# with variable length.
+
+use constant {
+  TAG         => 0,
+  FROM        => 1,
+  TO          => 2,
+  LEVEL       => 3,
+  ATTR_OFFSET => 4
+};
+
+
+# Create a new token object
+#
+# The passed values initialize the token array in the order
+# tag, from, to, level - optionally followed by attributes
+# as pairs of name and value.
+sub new {
+  my $class = shift;
+  my $self = bless [@_], $class;
+
+  # Ensure minimum length for pushing attributes, so that the
+  # first attribute follows the level at index 3 - but never
+  # shrink the array, as assigning to $#$self unconditionally
+  # would drop all values passed beyond the level.
+  $#$self = 3 if $#$self < 3;
+  return $self;
+};
+
+
+# Set 'from' (returns the assigned value)
+sub set_from {
+  my ($self, $from) = @_;
+  return $self->[FROM] = $from;
+};
+
+
+# Get 'from' (undefined, if not set)
+sub from {
+  my $self = shift;
+  return $self->[FROM];
+};
+
+
+# Set 'to' (returns the assigned value)
+sub set_to {
+  my ($self, $to) = @_;
+  return $self->[TO] = $to;
+};
+
+
+# Get 'to' (undefined, if not set)
+sub to {
+  my $self = shift;
+  return $self->[TO];
+};
+
+
+# Set level (returns the assigned value)
+sub set_level {
+  my ($self, $level) = @_;
+  return $self->[LEVEL] = $level;
+};
+
+
+# Get level (undefined, if not set)
+sub level {
+  my $self = shift;
+  return $self->[LEVEL];
+};
+
+
+# Add attributes
+#
+# All passed values (pairs of attribute name and value) are
+# appended to the token array; returns the result of push,
+# i.e. the new number of elements.
+sub add_attribute {
+  my ($self, @attributes) = @_;
+  return push @{$self}, @attributes;
+};
+
+
+# Serialize header
+#
+# Opens the span element (numbered by $id) together with the
+# nested 'lex' feature structure. Missing offsets are written
+# as '?' and a missing level is written as 0.
+sub _header {
+  my ($self, $id) = @_;
+
+  my $from  = $self->[FROM]  // '?';
+  my $to    = $self->[TO]    // '?';
+
+  # l (level): information about depth of element in XML-tree
+  # (top element = level 1)
+  my $level = $self->[LEVEL] // 0;
+
+  # Every line starts with its indentation and ends with a newline
+  return join(
+    "\n",
+    qq{    <span id="s$id" from="$from" to="$to" l="$level">},
+    '      <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">',
+    '        <f name="lex">',
+    ''
+  );
+};
+
+
+# Serialize footer
+#
+# Closes the elements opened by the header (f, fs and span),
+# each on its own line.
+sub _footer {
+  return join("\n", '        </f>', '      </fs>', '    </span>', '');
+};
+
+
+# Serialize attribute
+#
+# Takes the name and the value of the attribute and returns
+# an indented f element; an undefined value is treated as
+# an empty string.
+sub _att {
+  my ($name, $value) = @_;
+
+  # XML escape the attribute value
+  # ... <w lemma="&gt;" ana="PUNCTUATION">&gt;</w> ...
+  # the '&gt;' is translated to '>' and without escaping
+  # the result would be '<f name="lemma">></f>'
+  my $escaped = escape_xml($value // '');
+
+  return qq{            <f name="$name">$escaped</f>\n};
+}
+
+
+# Stringify without inline annotations
+#
+# Returns the span for the token (numbered by $id); all
+# attributes are written unchanged as pairs of name and
+# value into a nested feature structure.
+sub to_string {
+  my ($self, $id) = @_;
+
+  my $header = $self->_header($id);
+  my @chunks = ($header);
+
+  # Attributes are only serialized if at least one exists
+  if ($self->[ATTR_OFFSET]) {
+    push @chunks, "          <fs>\n";
+
+    # Walk through the attributes in steps of name and value
+    my $name_idx = ATTR_OFFSET;
+    while ($name_idx < @{$self}) {
+      push @chunks, _att($self->[$name_idx], $self->[$name_idx + 1]);
+      $name_idx += 2;
+    };
+
+    push @chunks, "          </fs>\n";
+  };
+
+  return join('', @chunks) . $self->_footer;
+};
+
+
+# Stringify with inline annotations
+#
+# Serializes the token like to_string(), but treats two
+# attributes in a special way:
+#   - the value of the attribute named $_INLINE_ATT_RD is split
+#     at the first blank into a POS and an MSD part, that are
+#     written as $_INLINE_POS_WR and $_INLINE_MSD_WR
+#   - the attribute named $_INLINE_LEM_RD is written
+#     as $_INLINE_LEM_WR
+# All other attributes are written unchanged.
+#
+# Parameters:
+#   $id - number of the token, passed to the header serialization
+#
+# Dies, if the value of the $_INLINE_ATT_RD attribute does not
+# provide the expected POS or MSD information.
+sub to_string_with_inline_annotations {
+  my ($self, $id) = @_;
+
+  my $out = $self->_header($id);
+
+  # Check if attributes exist
+  if ($self->[ATTR_OFFSET]) {
+
+    $out .= "          <fs>\n";
+
+    # Iterate over all attributes (pairs of name and value)
+    for (my $att_idx = ATTR_OFFSET; $att_idx < @{$self}; $att_idx += 2) {
+      my ($name, $value) = @{$self}[$att_idx, $att_idx + 1];
+
+      # The inline attribute is 'ana' (or something along the lines)
+      if ($name eq $_INLINE_ATT_RD) {
+
+        # Split the value into POS and MSD. The captures are taken
+        # from the list returned by the match instead of $1 and $2,
+        # as these variables keep the values of an earlier
+        # successful match in case this match fails - an
+        # unexpected format would then not be detected.
+        my ($pos, $msd) = ($value // '') =~ /^([^ ]+)(?: (.+))?$/;
+
+        # The POS attribute is defined
+        if ($_INLINE_POS_WR) {
+          unless (defined $pos) {
+            die 'ERROR (write_tokens()): unexpected format! => Aborting ... ' .
+              '(att: ' . ($value // '') . ")\n";
+          };
+          $out .= _att($_INLINE_POS_WR, $pos);
+        };
+
+        # The MSD attribute is defined
+        if ($_INLINE_MSD_WR) {
+          unless (defined $msd) {
+            die 'ERROR (write_tokens()): unexpected format! => Aborting ... ' .
+              '(att: ' . ($value // '') . ")\n";
+          };
+          $out .= _att($_INLINE_MSD_WR, $msd);
+        };
+
+      }
+
+      # Inline lemmata are expected
+      # TODO:
+      #   As $_INLINE_LEM_RD == $_INLINE_LEM_WR this
+      #   currently does nothing special.
+      elsif ($_INLINE_LEM_RD && $name eq $_INLINE_LEM_RD) {
+        $out .= _att($_INLINE_LEM_WR, $value);
+      }
+
+      # Add all other attributes
+      else {
+        $out .= _att($name, $value);
+      };
+    };
+
+    $out .= "          </fs>\n";
+  };
+
+  return $out . $self->_footer;
+};
+
+
+1;

diff --git a/script/tei2korapxml b/script/tei2korapxml
index d1bb176..590daad 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -22,6 +22,7 @@
 use KorAP::XML::TEI::Tokenizer::External;
 use KorAP::XML::TEI::Tokenizer::Conservative;
 use KorAP::XML::TEI::Tokenizer::Aggressive;
+use KorAP::XML::TEI::Tokenizer::Collector;
 use KorAP::XML::TEI::Zipper;
 use KorAP::XML::TEI::Header;
 
@@ -64,7 +65,7 @@
 my $_DOC_HEADER_BEG   = "idsHeader type=\"document\""; # analog
  # mandatory
 my $_TEXT_HEADER_BEG  = "idsHeader type=\"text\"";     # analog
-  
+
 #
 # ~~~ constants ~~~
 #
@@ -110,6 +111,9 @@
 ## TODO: optional
 # handling inline annotations (inside $_TOKENS_TAG)
 my $_INLINE_ANNOT    = $ENV{KORAPXMLTEI_INLINE}?1:0; # on/off: set to 1 if inline annotations are present and should be processed (default: 0)
+
+# TODO:
+#   These parameters are now defunct and moved to Token.pm
 my $_INLINE_LEM_RD   = "lemma";                      # from which attribute to read LEMMA information
 my $_INLINE_ATT_RD   = "ana";                        # from which attribute to read POS information (and evtl. additional MSD - Morphosyntactic Descriptions)
                                                      # TODO: The format for the POS and MSD information has to suffice the regular expression ([^ ]+)( (.+))?
@@ -125,6 +129,10 @@
 # ~~~ variables ~~~
 #
 
+# Initialize Token-Collector
+my $tokens = KorAP::XML::TEI::Tokenizer::Collector->new;
+
+
 # Initialize zipper
 my $zipper = KorAP::XML::TEI::Zipper->new;
 my $input_fh;                                        # input file handle (default: stdin)
@@ -144,10 +152,7 @@
 my @structures;                                      # list of arrays, where each array represents a TEI I5 tag (except $_TOKENS_TAG) from the input document
                                                      #  - the input of this array is written in func. 'write_structures' into the file '$_structure_file'
 
-my @tokens;                                          # list of arrays, where each array represents a $_TOKENS_TAG from the input document
-                                                     #  - the input of this array is written in func. 'write_tokens' into the file '$_tokens_file'
-
-my ( $ref, $idx, $att_idx );                         # needed in func. 'write_structures' and 'write_tokens'
+my ( $ref, $idx, $att_idx );                         # needed in func. 'write_structures'
 
 my ( $reader,                                        # instance of 'XML::LibXML::Reader->new' (on input '$buf_in')
      $tree_data );                                   # instance of 'XML::CompactTree::XS::readSubtreeToPerl' (on input '$reader')
@@ -160,7 +165,6 @@
      $dl,                                            # actual length of string $data
      @oti,                                           # oti='open tags indizes' - a stack of indizes into @structures, where the top index in @oti
                                                      #                            represents the actual processed element from @structures
-     @oti2,                                          # analogously to @oti, but with reference to array @tokens
      $inside_tokens_tag,                             # flag is set, when inside $_TOKENS_TAG
      ## variables for handling ~ whitespace related issue ~ (it is sometimes necessary, to correct the from-values for some tags)
      $add_one,                                       # ...
@@ -290,7 +294,7 @@
         @structures = (); @oti = ();
 
         if ( $_TOKENS_PROC ){
-          @tokens = (); @oti2 = ()
+          $tokens->reset;
         }
 
         $dl = $rl = 0;
@@ -319,12 +323,12 @@
 
         if ( $_GEN_TOK_EXT ){
 
+          # Tokenize and output
           $ext_tok->tokenize($data)->to_zip(
             $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_ext"),
             $text_id_esc
           );
-
-        }
+        };
 
         if ( $_GEN_TOK_INT ){
 
@@ -341,7 +345,7 @@
 
           $aggr_tok->reset;
           $cons_tok->reset;
-        }
+        };
 
         # Encode and escape data
         $data = escape_xml(encode( "UTF-8", $data ));
@@ -361,7 +365,13 @@
 
         # ~ write tokens ~
 
-        write_tokens() if $_TOKENS_PROC && @tokens;
+        if ($_TOKENS_PROC && !$tokens->empty) {
+          $tokens->to_zip(
+            $zipper->new_stream("$_root_dir$dir/$_tokens_dir/${_tokens_file}"),
+            $text_id_esc,
+            $_INLINE_ANNOT
+          );
+        };
 
         #print STDERR "$0: write_tokenization(): DONE\n";
 
@@ -483,9 +493,7 @@
 
   $zipper->close;
 
-  if( $_GEN_TOK_EXT ){
-    $ext_tok->close;
-  }
+  $ext_tok->close if $_GEN_TOK_EXT;
 
 } # end: sub main
 
@@ -616,14 +624,14 @@
 
       # ~ handle tokens ~
 
-      $inside_tokens_tag = $rl if $_TOKENS_PROC && $n eq $_TOKENS_TAG; # wether to push entry also into @tokens array
+      # Whether to push entry also into tokens
+      $inside_tokens_tag = $rl if $_TOKENS_PROC && $n eq $_TOKENS_TAG;
 
+      my $current_token;
+
+      # Add element to token list
       if ( $_TOKENS_PROC && $inside_tokens_tag == $rl ){
-
-        my @array2;
-        push @array2, $n;
-        push @tokens, \@array2;
-        push @oti2, $#tokens;
+        $current_token = $tokens->add_token($n); # TODO: adding $n is of no use (redundant)
       }
 
 
@@ -640,9 +648,11 @@
 
           if ( $_TOKENS_PROC && $inside_tokens_tag == $rl ){
 
-            push @{$tokens[$#tokens]}, ${$e->[3]}[$c], ${$e->[3]}[$c+1];
+            # Add attributes to current token
+            $current_token->add_attribute(
+              @{$e->[3]}[$c, $c + 1]
+            );
           }
-
         }
       }
 
@@ -655,8 +665,9 @@
 
       if ( $_TOKENS_PROC && $inside_tokens_tag == $rl ){
 
-        push @{$tokens[$#tokens]}, ( $dl + $add_one );
-      }
+        # Set from value to tokens
+        $current_token->set_from($dl + $add_one);
+      };
 
 
       #~~~~
@@ -717,31 +728,38 @@
 
       if ( $_TOKENS_PROC && $inside_tokens_tag == $rl ){
 
-        my $ix  = pop @oti2;
+        # Check last added token
+        my $last_token = $tokens->last_token;
 
-        my $aix = $#{$tokens[$ix]};
-
-        $fval2  = ${$tokens[$ix]}[ $aix ]; # from-value
+        # Get from-value from last added token
+        my $fval2 = $last_token->from;
 
         if( $fval2 > 0 && not exists $ws{ $fval2 - 1 } ){ # ~ whitespace related issue ~
 
           # ~ previous node was a text-node ~
 
-          ${$tokens[$ix]}[ $aix ] = $fval2 - 1; # recorrect from-value (see below: Notes on ~ whitespace related issue ~)
+          # recorrect from-value
+          # (see below: Notes on ~ whitespace related issue ~)
+          $last_token->set_from($fval2 - 1);
         }
 
         # in case this fails, check input
-        die "ERROR ($0, retr_info()): text_id='$text_id', processing of \@tokens: from-value ($fval2) is 2 or more greater"
+        die "ERROR ($0, retr_info()): text_id='$text_id', processing of tokens: from-value ($fval2) is 2 or more greater"
           ." than to-value ($dl) => please check. aborting ...\n"
             if ( $fval2 - 1 ) > $dl;
 
-        # TODO: find example for which this case applies
-        #  maybe this is not necessary anymore, because the above recorrection of the from-value suffices
-        # TODO: check, if it's better to remove this line and change above check to 'if ( $fval2 - 1) >= $dl;
+        # TODO:
+        #   find example for which this case applies
+        #    maybe this is not necessary anymore, because the above recorrection of the from-value suffices
+        #
+        # TODO:
+        #   check, if it's better to remove this line and change above check to 'if ( $fval2 - 1) >= $dl;
         #   do testing with bigger corpus excerpt (wikipedia?)
-        ${$tokens[$ix]}[ $aix ] = $dl if $fval2 == $dl + 1; # correct from-value (same as ... if $fval-1 == $dl)
 
-        push @{$tokens[$ix]}, $dl, $rl; # to-value and recursion-level
+        # Correct from-value (same as ... if $fval-1 == $dl)
+        $last_token->set_from($dl) if $fval2 == $dl + 1;
+        $last_token->set_to($dl); # Here from == to?
+        $last_token->set_level($rl);
 
         $inside_tokens_tag = -1; # reset
       }
@@ -936,103 +954,6 @@
 } # end: sub write_structures
 
 
-sub write_tokens { # called from main()
-
-  # ~ write @tokens ~
-
-  #print STDERR "$0: write_tokens(): ...\n";
-
-  if( $dir eq "" ){
-
-    print STDERR "WARNING ($0): write_tokens(): empty textSigle => nothing to do ...\n";
-    return;
-  }
-
-  $output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
-           ." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\""
-           .decode( "UTF-8", $text_id_esc )."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n  <spanList>\n"; # convert binary string to text string
-
-  $c = 0;
-
-  foreach $ref ( @tokens ){
-
-    # if array '@{$ref}' doesn't contain attributes, then the number of elements in this array is 4 (name, from, to, rec_level), otherwise >4
-    ( @{$ref} == 4 )?( $idx = 1 ):( $idx = @{$ref}-3 );
-
-    # correct last from-value (if the 'second to last' from-value refers to an s-tag, then the last from-value is one to big - see retr_info())
-    if ( $#tokens == $c && ${$ref}[ $idx ] == ${$ref}[ $idx+1 ] + 1 ){
-
-      ${$ref}[ $idx ] = ${$ref}[ $idx+1 ]; # TODO: check
-    }
-
-    # l (level): insert information about depth of element in XML-tree (top element = level 1)
-    $output .= "    <span id=\"s$c\" from=\"${$ref}[ $idx ]\" to=\"${$ref}[ $idx+1 ]\" l=\"${$ref}[ $idx+2 ]\">\n"
-              ."      <fs type=\"lex\" xmlns=\"http://www.tei-c.org/ns/1.0\">\n"
-              ."        <f name=\"lex\">\n";
-
-    if ( $idx > 2 ){ # attributes
-
-      $output .= "          <fs>\n";
-
-      for ( $att_idx = 1; $att_idx < $idx; $att_idx += 2 ){
-
-        ${$ref}[ $att_idx+1 ] = escape_xml(${$ref}[ $att_idx+1 ]); # ... <w lemma="&gt;" ana="PUNCTUATION">&gt;</w> ...
-                                                       # the '&gt;' is translated to '>' and hence the result would be '<f name="lemma">></f>'
-
-        if ( $_INLINE_ANNOT && ${$ref}[ $att_idx ] eq "$_INLINE_ATT_RD" ){
-
-          ${$ref}[ $att_idx+1 ] =~ /^([^ ]+)(?: (.+))?$/;
-
-          die "ERROR (write_tokens()): unexpected format! => Aborting ... (att: ${$ref}[ $att_idx+1 ])\n"
-            if ( $_INLINE_POS_WR && not defined $1 ) || ( $_INLINE_MSD_WR && not defined $2 );
-
-          if ( "$_INLINE_POS_WR" ){
-
-            $output .= "            <f name=\"$_INLINE_POS_WR\">";
-            $output .= "$1" if defined $1;
-            $output .= "</f>\n";
-          }
-
-          if ( "$_INLINE_MSD_WR" ){
-
-            $output .= "            <f name=\"$_INLINE_MSD_WR\">";
-            $output .= "$2" if defined $2;
-            $output .= "</f>\n";
-          }
-
-        } elsif ( $_INLINE_ANNOT && "$_INLINE_LEM_RD" && ${$ref}[ $att_idx ] eq "$_INLINE_LEM_RD" ){
-
-          $output .= "            <f name=\"$_INLINE_LEM_WR\">${$ref}[ $att_idx+1 ]</f>\n";
-
-        } else { # all other attributes
-
-          $output .= "            <f name=\"${$ref}[$att_idx]\">${$ref}[ $att_idx+1 ]</f>\n"; # attribute (at index $att_idx) with value (at index $att_idx+1)
-        }
-
-      } # end: for
-
-      $output .= "          </fs>\n";
-
-    } # fi: attributes
-
-    $output .= "        </f>\n      </fs>\n    </span>\n";
-
-    $c++;
-
-  } # end: foreach
-
-  $output .= "  </spanList>\n</layer>";
-
-  $output = encode( "UTF-8", $output ); # convert text string to binary string
-
-  $zipper->new_stream("$_root_dir$dir/$_tokens_dir/$_tokens_file")
-    ->print($output);
-
-  #print STDERR "$0: write_tokens(): DONE\n";
-
-} # end: sub write_tokens
-
-
 __END__
 
 =pod

diff --git a/t/script.t b/t/script.t
index 6af3095..6ceadd9 100644
--- a/t/script.t
+++ b/t/script.t

@@ -459,5 +459,41 @@
     ;
 };
 
+# Run the script with KORAPXMLTEI_INLINE=1 on the sample file and
+# check the messages on STDERR as well as the content of the
+# generated zip archive.
+subtest 'Check Inline annotations with untagged file' => sub {
+
+  # Load example file
+  my $file = catfile($f, 'data', 'goe_sample.i5.xml');
+
+  my ($fh, $outzip) = korap_tempfile('script_untagged');
+
+  # Generate zip file (unportable, as the script is
+  # called using a shell pipeline with 'cat')
+  stderr_like(
+    sub { `cat '$file' | KORAPXMLTEI_INLINE=1 perl '$script' > '$outzip'` },
+    qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
+    'Processing 1'
+  );
+
+  # The same call is repeated to ensure that no message
+  # regarding an undefined value is written to STDERR
+  # TODO: there should be a better way to test this
+  stderr_unlike(
+    sub { `cat '$file' | KORAPXMLTEI_INLINE=1 perl '$script' > '$outzip'` },
+    qr!.*undefined value.*!,
+    'Processing 2'
+  );
+  #
+
+  ok(-e $outzip, "File $outzip exists");
+
+  # The archive is expected to contain no morpho.xml ...
+  my $zip = IO::Uncompress::Unzip->new(
+    $outzip,
+    Name => 'GOE/AGA/00000/tokens/morpho.xml'
+  );
+  ok((not $zip), 'missing morpho.xml');
+
+  # ... but it is expected to contain the structure.xml
+  $zip = IO::Uncompress::Unzip->new(
+    $outzip,
+    Name => 'GOE/AGA/00000/struct/structure.xml'
+  );
+  ok($zip, 'found structure.xml');
+};
 
 done_testing;

diff --git a/t/token.t b/t/token.t
new file mode 100644
index 0000000..85bab65
--- /dev/null
+++ b/t/token.t

@@ -0,0 +1,83 @@
+use strict;
+use warnings;
+use Test::More;
+use Test::XML::Loy;
+
+use FindBin;
+BEGIN {
+  unshift @INC, "$FindBin::Bin/../lib";
+};
+
+use_ok('KorAP::XML::TEI::Tokenizer::Token');
+
+# Check the accessors and the plain serialization of a token
+# that was created without any initial values.
+subtest 'Initialization' => sub {
+  my $t = KorAP::XML::TEI::Tokenizer::Token->new;
+
+  # Without arguments, neither offsets nor level are defined
+  ok(!defined($t->from), 'Undefined from');
+  ok(!defined($t->to), 'Undefined to');
+  ok(!defined($t->level), 'Undefined level');
+
+  # Add two attributes and set the offsets; 'from' is set
+  # twice, so the last value (4) is the expected one
+  $t->add_attribute('foo' => 'bar');
+  $t->add_attribute('x' => 'y');
+  $t->set_from(7);
+  $t->set_to(5);
+  $t->set_from(4);
+
+  # Serialize the token with the number 3
+  my $loy = Test::XML::Loy->new($t->to_string(3));
+
+  $loy->attr_is('span', 'id', 's3')
+    ->attr_is('span', 'from', 4)
+    ->attr_is('span', 'to', 5)
+    ->attr_is('span fs f', 'name', 'lex')
+    ->attr_is('span fs f fs f:nth-of-type(1)', 'name', 'foo')
+    ->text_is('span fs f fs f:nth-of-type(1)', 'bar')
+    ->attr_is('span fs f fs f:nth-of-type(2)', 'name', 'x')
+    ->text_is('span fs f fs f:nth-of-type(2)', 'y')
+    ;
+
+  # The level is still undefined until it is set explicitly
+  is($t->from,4);
+  is($t->to,5);
+  is($t->level,undef);
+  $t->set_level(19);
+  is($t->level,19);
+
+  # The level set above is expected in the 'l' attribute
+  $loy = Test::XML::Loy->new($t->to_string(3));
+
+  $loy->attr_is('span', 'id', 's3')
+    ->attr_is('span', 'from', 4)
+    ->attr_is('span', 'to', 5)
+    ->attr_is('span', 'l', 19)
+    ;
+};
+
+
+# Compare both serializations of a token with
+# the attributes 'ana' and 'lemma'.
+subtest 'Test inline annotations' => sub {
+  my $t = KorAP::XML::TEI::Tokenizer::Token->new('x1', 0, 6);
+  $t->add_attribute('ana' => 'DET @PREMOD');
+  $t->add_attribute('lemma' => 'C & A');
+
+  # Without inline annotations, both attributes
+  # are expected with their original names and values
+  my $loy = Test::XML::Loy->new($t->to_string(1));
+
+  $loy->attr_is('span', 'id', 's1')
+    ->attr_is('span', 'to', 6)
+    ->attr_is('span > fs > f > fs f:nth-of-type(1)', 'name', 'ana')
+    ->text_is('span > fs > f > fs f:nth-of-type(1)', 'DET @PREMOD')
+    ->attr_is('span > fs > f > fs f:nth-of-type(2)', 'name', 'lemma')
+    ->text_is('span > fs > f > fs f:nth-of-type(2)', 'C & A')
+    ;
+
+  # With inline annotations, the value of 'ana' is expected
+  # to be split into 'pos' and 'msd', followed by the 'lemma'
+  $loy = Test::XML::Loy->new($t->to_string_with_inline_annotations(1));
+
+  $loy->attr_is('span', 'id', 's1')
+    ->attr_is('span', 'to', 6)
+    ->attr_is('span > fs > f > fs f:nth-of-type(1)', 'name', 'pos')
+    ->text_is('span > fs > f > fs f:nth-of-type(1)', 'DET')
+    ->attr_is('span > fs > f > fs f:nth-of-type(2)', 'name', 'msd')
+    ->text_is('span > fs > f > fs f:nth-of-type(2)', '@PREMOD')
+    ->attr_is('span > fs > f > fs f:nth-of-type(3)', 'name', 'lemma')
+    ->text_is('span > fs > f > fs f:nth-of-type(3)', 'C & A')
+};
+
+
+done_testing;
+

diff --git a/t/tokenization-collect.t b/t/tokenization-collect.t
new file mode 100644
index 0000000..952c4b7
--- /dev/null
+++ b/t/tokenization-collect.t

@@ -0,0 +1,46 @@
+use strict;
+use warnings;
+use Test::More;
+use Test::XML::Loy;
+
+use FindBin;
+BEGIN {
+  unshift @INC, "$FindBin::Bin/../lib";
+};
+
+use_ok('KorAP::XML::TEI::Tokenizer::Collector');
+
+# Create a collector and add three tokens;
+# only the second token is created with a level
+my $t = KorAP::XML::TEI::Tokenizer::Collector->new;
+
+$t->add_token('x1',0,8);
+my $token = $t->add_token('x2',9,14,2);
+$t->add_token('x3',15,20);
+
+# The token returned by add_token can be serialized on its own
+my $loy = Test::XML::Loy->new($token->to_string(2));
+
+$loy->attr_is('span', 'id', 's2')
+  ->attr_is('span', 'from', 9)
+  ->attr_is('span', 'to', 14)
+  ->attr_is('span', 'l', 2)
+  ->attr_is('span fs f', 'name', 'lex')
+  ;
+
+# The last token is the one that was added last
+$loy = Test::XML::Loy->new($t->last_token->to_string(3));
+
+$loy->attr_is('span', 'id', 's3')
+  ->attr_is('span', 'from', 15)
+  ->attr_is('span', 'to', 20)
+  ->attr_is('span fs f', 'name', 'lex')
+;
+
+# Serialize the whole collection without inline annotations;
+# the spans are expected to be numbered starting with 0
+$loy = Test::XML::Loy->new($t->to_string('text', 0))
+  ->attr_is('layer', 'docid', 'text')
+  ->attr_is('span#s0', 'to', '8')
+  ->attr_is('span#s1', 'to', '14')
+  ->attr_is('span#s1', 'l', '2')
+  ->attr_is('span#s2', 'to', '20')
+;
+
+
+done_testing;
+

diff --git a/t/tokenization.t b/t/tokenization.t
index 92b7cc3..9d986a0 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t

@@ -18,10 +18,15 @@
 
 # Test aggressive
 my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
+ok($aggr->empty, 'Empty');
 $aggr->tokenize("Der alte Mann");
+ok(!$aggr->empty, 'Not empty');
 is_deeply($aggr, [0,3,4,8,9,13]);
 
-$aggr->reset->tokenize("Der alte bzw. der grau-melierte Mann");
+$aggr->reset;
+ok($aggr->empty, 'Empty');
+
+$aggr->tokenize("Der alte bzw. der grau-melierte Mann");
 is_deeply($aggr, [0,3,4,8,9,12,12,13,14,17,18,22,22,23,23,31,32,36]);
 
 like(
commit	09e0b2c7f4ce5f2f7e1c1b95ac12776f9ad48063	[log] [tgz]
author	Akron <nils@diewald-online.de>	Tue Jul 28 15:57:01 2020 +0200
committer	Akron <nils@diewald-online.de>	Thu Jul 30 13:13:08 2020 +0200
tree	26424b77df3a906a7d6966e7a22e47db1c88b7ac
parent	e68ec0c24d75a301d9f6e41cab9e66ea6337bceb [diff]