lib/KorAP/Tokenizer.pm - KorAP/KorAP-XML-Krill - Gitiles

 package KorAP::Tokenizer;
 use Mojo::Base -base;
 use Mojo::ByteStream 'b';
 use Mojo::Loader;
 use XML::Fast;
 use Try::Tiny;
 use Carp qw/croak/;
 use KorAP::Tokenizer::Range;
 use KorAP::Tokenizer::Match;
 use KorAP::Tokenizer::Spans;
 use KorAP::Tokenizer::Tokens;
 use KorAP::Field::MultiTermTokenStream;
 use List::MoreUtils 'uniq';
 use JSON::XS;
 use Log::Log4perl;

 # TODO 1:
 # When indexing authors, make sure that "etc." is indexed as well

 # TODO 2:
 # Add punctuations to the index
 # [Er sagte: "Hallo - na?"] becomes
 # [s:Er|tt/l:er|_1#0-2]
 # [s:sagte|tt/l:sagen|_2#3-8|.>::#8-9$1|.>tt/l:PUNCT#8-9$1|.>:"#10-11$2|.>tt/l:PUNCT#10-11$2]
 # [s:Hallo|tt/l:hallo|_3#11-16|.<::#8-9$2|.<tt/l:PUNCT#8-9$2|.<:"#10-11$1|.<tt/l:PUNCT#10-11$1|.>:-#17-18$1|.>tt/l:PUNCT#17-18$1]
 # [s:na|tt/l:na|_4#19-21|.<:-#17-18$1|.<tt/l:PUNCT#17-18$1|.>:?#21-22$1|.>tt/l:PUNCT#21-22$1|.>:"#22-23$2|.>tt/l:PUNCT#22-23$2]

 # TODO 3:
 # Meta-Data:
 # mysql> select * from textMeta limit 5;
 # +---------------+------------+------------+-------+--------+------+---------+-------------------------------------------+--------+-------+-------+-----------------------+-------------+-------+-----------+---------+----+------+------+----+----+-----+------+------+-----------+---------+-------+---------+-------------+------------------------+
 # | sigle         | date       | fname      | fpos  | length | tlen | country | topic1                                    | topic2 | tpc1v | tpc2v | supertopic1           | supertopic2 | words | stopwords | numbers | s  | pars | tags | oo | no | ngc | sscr | cs   | has       | ressort | genre | type    | articletype | md5                    |
 # +---------------+------------+------------+-------+--------+------+---------+-------------------------------------------+--------+-------+-------+-----------------------+-------------+-------+-----------+---------+----+------+------+----+----+-----+------+------+-----------+---------+-------+---------+-------------+------------------------+
 # | A00/JAN.00001 | 2000-01-01 | a00.i5.xml | 12667 |   3131 |  641 | CH      | Freizeit_Unterhaltung:Reisen              | NULL   |  0.99 |     0 | Freizeit_Unterhaltung | NULL        |    34 |        24 |       0 |  6 |    1 |    5 |  0 |  0 |   0 |    0 |    0 | headlines | NULL    | NULL  | Zeitung | Bericht     | dk6ORWB5uTH33eiakNcJAA |
 # | A00/JAN.00002 | 2000-01-01 | a00.i5.xml | 15798 |  11853 | 9267 | CH      | Staat_Gesellschaft:Biographien_Interviews | NULL   |     1 |     0 | Staat_Gesellschaft    | NULL        |   652 |       445 |      12 | 77 |   18 |   14 |  0 |  0 |   0 |    0 |    0 | headlines | NULL    | NULL  | Zeitung | Bericht     | m7phXF1Ds+aPq3GfLRddCw |
 # | A00/JAN.00003 | 2000-01-01 | a00.i5.xml | 27651 |   4768 | 2234 | CH      | Politik:Kommunalpolitik                   | NULL   |     1 |     0 | Politik               | NULL        |   135 |        74 |       2 | 16 |    3 |    9 |  0 |  0 |   0 |    0 |    0 | headlines | NULL    | NULL  | Zeitung | Bericht     | XwsIDMEIT7ht52DnMTkSHw |
 # | A00/JAN.00004 | 2000-01-01 | a00.i5.xml | 32419 |  11096 | 8619 | CH      | Politik:Kommunalpolitik                   | NULL   |     1 |     0 | Politik               | NULL        |   521 |       368 |       7 | 57 |    8 |   22 |  0 |  0 |   0 |    0 |    0 | headlines | NULL    | NULL  | Zeitung | Bericht     | TSx4mDVLU6XibNHUUT+ubA |
 # | A00/JAN.00005 | 2000-01-01 | a00.i5.xml | 43515 |   5421 | 2875 | CH      | Politik:Kommunalpolitik                   | NULL   |     1 |     0 | Politik               | NULL        |   177 |        94 |       7 | 19 |    3 |   11 |  0 |  0 |   0 |    0 |    0 | headlines | NULL    | NULL  | Zeitung | Bericht     | 6OGZ7MqyjSOb9AtvFLWodA |
 # +---------------+------------+------------+-------+--------+------+---------+-------------------------------------------+--------+-------+-------+-----------------------+-------------+-------+-----------+---------+----+------+------+----+----+-----+------+------+-----------+---------+-------+---------+-------------+------------------------+


 has [qw/path foundry doc stream should have name/];
 has layer => 'Tokens';

 # Lazily built logger attribute.
 # Returns the Log::Log4perl logger for this package if Log4perl has been
 # initialized, otherwise a KorAP::Log fallback object. Both objects are
 # kept in state variables, so they are created only once per process.
 has log => sub {
   if (Log::Log4perl->initialized()) {
     state $log4perl = Log::Log4perl->get_logger(__PACKAGE__);

     # Return here - otherwise the fallback below would always win
     return $log4perl;
   };

   # NOTE(review): KorAP::Log is not loaded by this file - presumably it
   # is loaded by the caller; confirm.
   state $fallback = KorAP::Log->new;
   return $fallback;
 };

 # Parse tokens of the document
 # Parse the token layer of the document and build the token stream.
 #
 # Reads "<path><foundry>/<layer>.xml" (foundry and layer lowercased),
 # converts it with xml2hash and iterates over all spans of the span list.
 # For every span containing at least one word character, a multi term
 # token with surface, case insensitive and position terms is added to a
 # new KorAP::Field::MultiTermTokenStream.
 #
 # On success the attributes stream, should and have as well as the
 # internal range and match objects are set and the object is returned.
 # Returns nothing if the XML could not be parsed (or a warning was raised
 # while parsing). Returns the object without setting the stream, if the
 # file contains no spans.
 sub parse {
   my $self = shift;

   # Create new token stream
   my $mtts = KorAP::Field::MultiTermTokenStream->new;
   my $path = $self->path . lc($self->foundry) . '/' . lc($self->layer) . '.xml';
   my $file = b($path)->slurp;
 #  my $tokens = Mojo::DOM->new($file);
 #  $tokens->xml(1);

   my $doc = $self->doc;

   # $should counts all spans with resolvable primary data,
   # $have counts the spans that are actually added to the stream
   my ($should, $have) = (0, 0);

   # Create range and match objects
   my $range = KorAP::Tokenizer::Range->new;
   my $match = KorAP::Tokenizer::Match->new;

   # Character offset following the end of the last added token
   my $old = 0;

   $self->log->trace('Tokenize data ' . $self->foundry . ':' . $self->layer);

   # TODO: Reuse the following code from Spans.pm and tokens.pm
   my ($tokens, $error);
   try {
       # Treat every warning raised while parsing as an error
       local $SIG{__WARN__} = sub {
 	  $error = 1;
       };
       $tokens = xml2hash($file, text => '#text', array => ['span'], attr => '-')->{layer}->{spanList};
   }
   catch  {
       $self->log->warn('Token error in ' . $path . ($_ ? ': ' . $_ : ''));
       $error = 1;
   };

   # Parsing failed - return nothing
   return if $error;

   if (ref $tokens && $tokens->{span}) {
     $tokens = $tokens->{span};
   }
   else {
       # No spans available - stream, should and have remain unset
       return $self;
   };

   # Make sure the spans are iterable, even if there is only a single one
   $tokens = [$tokens] if ref $tokens ne 'ARRAY';

   # Iterate over all tokens
   # $tokens->find('span')->each(
   #    sub {
   # my $span = $_;
   foreach my $span (@$tokens) {
       # With attr => '-' the attributes are prefixed with a dash
       my $from = $span->{'-from'};
       my $to = $span->{'-to'};
       my $token = $doc->primary->data($from, $to);

       # warn 'Has ' . $from . '->' . $to . "($old)";

       unless (defined $token) {
 	  $self->log->error("Unable to find substring [$from-$to] in $path");
 	  next;
       };

       $should++;

       # Ignore non-word tokens
       # (they are counted in $should, but not in $have)
       next if $token !~ /[\w\d]/;

       my $mtt = $mtts->add;

       # Add gap for later finding matching positions before or after
       $range->gap($old, $from, $have) unless $old >= $from;

       # Add surface term
       # That's always the first term!
       $mtt->add('s:' . $token);

       # Add case insensitive term
       $mtt->add('i:' . lc $token);

       # Add offset information
       $mtt->o_start($from);
       $mtt->o_end($to);

       # Store offset information for position matching
       $range->set($from, $to, $have);
       $match->set($from, $to, $have);

       # The next gap starts one character after this token
       $old = $to + 1;

       # Add position term
       $mtt->add('_' . $have . '#' . $mtt->o_start . '-' . $mtt->o_end);

       $have++;
   };

   # Add token count
   $mtts->add_meta('tokens', '<i>' . $have);

   # Add a final gap from the last token to the end of the primary data
   # NOTE(review): if no token was added, $have-1 is -1 here - confirm
   # that KorAP::Tokenizer::Range::gap copes with that.
   $range->gap($old, $doc->primary->data_length + 1, $have-1) if $doc->primary->data_length >= ($old - 1);

   # Add info
   $self->stream($mtts);
   $self->{range} = $range;
   $self->{match} = $match;
   $self->should($should);
   $self->have($have);

   # NOTE(review): $should may be 0 here if the span list was empty or no
   # span had resolvable primary data - confirm that _perc copes with a
   # zero denominator.
   $self->log->debug('With a non-word quota of ' . _perc($self->should, $self->should - $self->have) . ' %');

   return $self;
 };

 # Add sub token terms to all tokens of the stream.
 #
 # The lowercased surface form of each token is reduced to a "shape"
 # (letters become 'a', digits '0', punctuation '#'). Every run of
 # letters is added as an 'i^1:' term, every run of digits as an 'i^2:'
 # term and every single punctuation character as an 'i^3:' term, each
 # with character offsets relative to the primary data.
 # Parts spanning the whole token are not added.
 #
 # Returns the object, or nothing if there is no token stream yet.
 sub add_subtokens {
   my $self = shift;
   my $mtts = $self->stream or return;

   # Term prefix and shape pattern per character class.
   # The first capture group marks the sub token.
   my @classes = (
     ['i^1:', qr/(a+)[^a]/],
     ['i^2:', qr/(0+)[^0]/],
     ['i^3:', qr/(#)/]
   );

   foreach my $mtt (@{$mtts->multi_term_tokens}) {
     my $o_start = $mtt->o_start;
     my $token_length = $mtt->o_end - $o_start;

     my $surface = $mtt->lc_surface;
     my $shape = $surface;

     # Algorithm based on aggressive tokenization in
     # tokenize.pl from Carsten Schnober
     # All replacements are one character for one character,
     # so offsets in the shape equal offsets in the surface form.
     $shape =~ s/[[:alpha:]]/a/g;
     $shape =~ s/[[:digit:]]/0/g;
     $shape =~ s/\p{Punct}/#/g;
     $shape =~ y/~/A/;

     # Sentinel, so runs at the end of the token are terminated as well
     $shape .= 'E';

     foreach my $class (@classes) {
       my ($prefix, $pattern) = @$class;

       while ($shape =~ /$pattern/g) {
         my ($from, $to) = ($-[1], $+[1]);

         # Ignore parts that cover the whole token
         next if $to - $from == $token_length;

         # The third parameter of substr is a length, not an end offset
         $mtt->add(
           term    => $prefix . substr($surface, $from, $to - $from),
           o_start => $from + $o_start,
           o_end   => $to + $o_start
         );
       };
     };
   };

   return $self;
 };


 # Get span positions through character offsets
 # Return the range object created while parsing.
 # If there is none yet, a fresh KorAP::Tokenizer::Range is
 # returned (it is not stored in the object).
 sub range {
   my $self = shift;
   return $self->{range} if defined $self->{range};
   return KorAP::Tokenizer::Range->new;
 };


 # Get token positions through character offsets
 # Return the match object created while parsing.
 # If there is none yet, a fresh KorAP::Tokenizer::Match is
 # returned (it is not stored in the object).
 sub match {
   my $self = shift;
   return $self->{match} if defined $self->{match};
   return KorAP::Tokenizer::Match->new;
 };


 # Add information of spans to the tokens
 # Add span annotations of a foundry layer to the token stream.
 #
 # Expects named parameters: foundry and layer (used for logging and
 # passed on to KorAP::Tokenizer::Spans), an optional callback cb and an
 # optional flag skip. All further parameters are passed on to
 # KorAP::Tokenizer::Spans as well.
 #
 # Dies if there is no token stream yet. Returns nothing if skip is set
 # or the spans can't be parsed. With a callback, it is called for each
 # parsed span with the stream, the span and the spans object, and 1 is
 # returned. Without a callback the spans object is returned.
 sub add_spandata {
   my ($self, %param) = @_;

   $self->stream or croak 'No token data available';

   my $action = $param{skip} ? 'Skip' : 'Add';
   $self->log->trace(
     $action.' span data '.$param{foundry}.':'.$param{layer}
   );

   return if $param{skip};

   # The callback is not meant for the spans object
   my $cb = delete $param{cb};

   $param{primary} = $self->doc->primary;

   my $spans = KorAP::Tokenizer::Spans->new(
     path  => $self->path,
     range => $self->range,
     match => $self->match,
     %param
   );

   my $spanarray = $spans->parse;
   return unless $spanarray;

   # Report how many of the spans could be aligned
   if ($spans->should != $spans->have) {
     $self->log->debug('With an alignment quota of ' . _perc($spans->should, $spans->have) . ' %');
   }
   else {
     $self->log->trace('With perfect alignment!');
   };

   # Without a callback the caller deals with the spans
   return $spans unless $cb;

   $cb->($self->stream, $_, $spans) foreach @$spanarray;
   return 1;
 };

 # Add information to the tokens
 # Add token annotations of a foundry layer to the token stream.
 #
 # Expects named parameters: foundry and layer (used for logging and
 # passed on to KorAP::Tokenizer::Tokens), an optional callback cb and an
 # optional flag skip. All further parameters are passed on to
 # KorAP::Tokenizer::Tokens as well.
 #
 # Dies if there is no token stream yet. Returns nothing if skip is set
 # or the tokens can't be parsed. With a callback, it is called for each
 # parsed token with the stream, the token and the tokens object, and 1
 # is returned. Without a callback the tokens object is returned.
 sub add_tokendata {
   my ($self, %param) = @_;

   $self->stream or croak 'No token data available';

   my $action = $param{skip} ? 'Skip' : 'Add';
   $self->log->trace(
     $action.' token data '.$param{foundry}.':'.$param{layer}
   );
   return if $param{skip};

   # The callback is not meant for the tokens object
   my $cb = delete $param{cb};

   $param{primary} = $self->doc->primary;

   my $tokens = KorAP::Tokenizer::Tokens->new(
     path  => $self->path,
     range => $self->range,
     match => $self->match,
     %param
   );

   my $tokenarray = $tokens->parse;
   return unless $tokenarray;

   # Report how many of the tokens could be aligned
   if ($tokens->should != $tokens->have) {
     my $quota = _perc(
       $tokens->should, $tokens->have, $self->should, $self->should - $self->have
     );
     $self->log->debug('With an alignment quota of ' . $quota);
   }
   else {
     $self->log->trace('With perfect alignment!');
   };

   # Without a callback the caller deals with the tokens
   return $tokens unless $cb;

   $cb->($self->stream, $_, $tokens) foreach @$tokenarray;
   return 1;
 };


 # Add the annotations of a foundry layer using the indexer module
 # KorAP::Index::<foundry>::<layer>.
 #
 # Expects the foundry name and the layer name. All further parameters
 # are passed to the parse method of the indexer module.
 #
 # The module is loaded on demand. If the indexer returns a true value,
 # the layer is registered via support(), the layer information of the
 # module is added via layer_info() and the value is returned.
 # Returns nothing if foundry or layer are missing, if the module can't
 # be loaded or if the indexer returns a false value.
 sub add {
   my $self = shift;
   my $foundry = shift;
   my $layer = shift;

   unless ($foundry && $layer) {
     warn 'Unable to add specific module - not enough information given!';
     return;
   };

   my $mod = 'KorAP::Index::' . $foundry . '::' . $layer;

   # The name is built from caller supplied values - accept nothing
   # but a plain package name before trying to load anything
   unless ($mod =~ /\A\w+(?:::\w+)*\z/) {
     $self->log->error('Unable to load ' . $mod . '(invalid module name)');
     return;
   };

   # Load by file name in a block eval instead of evaluating a string
   (my $file = $mod . '.pm') =~ s{::}{/}g;

   if ($mod->can('new') || eval { require $file; 1 }) {
     if (my $retval = $mod->new($self)->parse(@_)) {

       # This layer is supported
       $self->support($foundry => $layer, @_);

       # Get layerinfo
       $self->layer_info($mod->layer_info);
       return $retval;
     };
   }
   else {
     $self->log->error('Unable to load '.$mod . '(' . $@ . ')');
   };

   return;
 };


 # Calculate percentage strings for log messages.
 #
 # With two parameters (should, have) the quota of have in should is
 # returned with two decimal places, without a percent sign.
 # With four parameters (a_should, a_have, b_should, b_have) the quota of
 # the first pair is returned followed by '%'. If the sum of both quotas
 # does not exceed 100, the sum is appended in square brackets.
 # A should value of 0 results in a quota of 0 instead of a
 # division by zero.
 sub _perc {
   if (@_ == 2) {
     my ($should, $have) = @_;
     return sprintf("%.2f", $should ? ($have * 100) / $should : 0);
   };

   my ($a_should, $a_have, $b_should, $b_have) = @_;

   my $a_quota = $a_should ? ($a_have * 100) / $a_should : 0;
   my $b_quota = $b_should ? ($b_have * 100) / $b_should : 0;

   my $string = sprintf("%.2f", $a_quota) . '%';

   # Only show the sum if it is meaningful
   if (($a_quota + $b_quota) <= 100) {
     $string .= ' [' . sprintf("%.2f", $a_quota + $b_quota) . '%]';
   };

   return $string;
 };


 # Get or set the supported foundries and layers.
 #
 # Without parameters, a lowercased, sorted and space separated string of
 # all supported foundries, "foundry/layer" pairs and - if further
 # information was given - "foundry/layer/info" values is returned.
 # With a foundry name only, the array reference of all layers registered
 # for that foundry is returned (empty, if there are none).
 # With a foundry name, a layer name and optional further information,
 # the layer is registered as supported. Foundry and layer names are
 # stored lowercased.
 sub support {
   my $self = shift;

   # No setting - just getting
   unless ($_[0]) {
     my @supports;
     my $support = $self->{support} // {};

     # Get all foundries
     foreach my $foundry (keys %$support) {
       push(@supports, $foundry);

       # Get all layers
       foreach my $layer (@{$support->{$foundry}}) {
         my @layers = @$layer;
         push(@supports, $foundry . '/' . $layers[0]);

         # More information
         if ($layers[1]) {
           push(@supports, $foundry . '/' . join('/', @layers));
         };
       };
     };
     return lc ( join ' ', sort {$a cmp $b } @supports );
   }

   # Get the layers of a single foundry.
   # Foundries are stored lowercased, so the lookup is lowercased as well.
   elsif (!$_[1]) {
     return $self->{support}->{ lc($_[0]) } // []
   };

   # Register a layer
   my $f = lc shift;
   my $l = lc shift;
   my @info = @_;
   $self->{support} //= {};
   $self->{support}->{$f} //= [];
   push(@{$self->{support}->{$f}}, [$l, @info]);
 };


 # Get or add layer information.
 #
 # With an array reference as parameter, its values are appended to the
 # stored layer information. Without a parameter, the stored values are
 # returned as a sorted, space separated string without duplicates.
 sub layer_info {
     my ($self, $info) = @_;
     my $collected = ($self->{layer_info} //= []);

     # Add further information
     return push(@$collected, @$info) if $info;

     # Serialize the collected information
     return join(' ', sort { $a cmp $b } uniq(@$collected));
 };


 # Serialize the document and the token stream to an XML like string.
 #
 # The optional first parameter indicates if the primary data should be
 # included in a <text> element (defaults to true). The string contains
 # the document meta data, the tokenization source, the support and layer
 # information (if available) and the serialized token stream.
 sub to_string {
   my ($self, $with_primary) = @_;
   $with_primary //= 1;

   my @parts = ("<meta>\n", scalar($self->doc->to_string), "</meta>\n");

   # Primary data is optional
   if ($with_primary) {
     push @parts, "<text>\n", $self->doc->primary->data . "\n", "</text>\n";
   };

   push @parts, '<field name="' . $self->name . "\">\n";
   push @parts, "<info>\n";
   push @parts, 'tokenization = ' . $self->foundry . '#' . $self->layer . "\n";

   # There is support info
   if (my $support = $self->support) {
     push @parts, 'support = ' . $support . "\n";
   };

   # There is layer info
   if (my $layer_info = $self->layer_info) {
     push @parts, 'layer_info = ' . $layer_info . "\n";
   };

   push @parts, "</info>\n";
   push @parts, scalar($self->stream->to_string);
   push @parts, "</field>";

   return join('', @parts);
 };

 # Convert the document and the token stream to a data structure.
 #
 # The optional first parameter indicates if the primary data should be
 # included (defaults to true), the optional second parameter switches
 # to the legacy format (defaults to false).
 # Returns a hash reference based on the hash representation of the
 # document. In the legacy format all token information is stored in the
 # list "fields", otherwise it is stored on the top level together with
 # a version number.
 sub to_data {
   my ($self, $with_primary, $legacy) = @_;
   $with_primary //= 1;
   $legacy //= 0;

   my %data = %{$self->doc->to_hash};

   # Current format
   unless ($legacy) {
     $data{primaryData} = $self->doc->primary->data if $with_primary;
     $data{tokenName}   = $self->name;
     $data{data}        = $self->stream->to_array;
     $data{tokenSource} = lc($self->foundry) . '#' . lc($self->layer);
     $data{foundries}   = $self->support;
     $data{layerInfos}  = $self->layer_info;
     $data{version}     = '0.02';
     return \%data;
   };

   # Legacy format
   my @fields;
   push(@fields, { primaryData => $self->doc->primary->data }) if $with_primary;

   push(@fields, {
     name => $self->name,
     data => $self->stream->to_array,
     tokenization => lc($self->foundry) . '#' . lc($self->layer),
     foundries => $self->support,
     layerInfo => $self->layer_info
   });

   $data{fields} = \@fields;

   return \%data;
 };


 # Serialize the token data to JSON in the legacy format.
 # The optional parameter indicates if primary data should be included.
 sub to_json_legacy {
   my ($self, $primary) = @_;
   return encode_json($self->to_data($primary, 1));
 };

 # Serialize the token data to JSON in the current format.
 # The optional parameter indicates if primary data should be included.
 sub to_json {
   my ($self, $primary) = @_;
   return encode_json($self->to_data($primary, 0));
 };


 # Serialize the token data to pretty printed JSON in the current format.
 # The optional parameter indicates if primary data should be included.
 sub to_pretty_json {
   my ($self, $primary) = @_;
   return JSON::XS->new->pretty->encode($self->to_data($primary));
 };


 1;


 __END__

 =pod

 =head1 NAME

 KorAP::Tokenizer

 =head1 SYNOPSIS

   my $tokens = KorAP::Tokenizer->new(
     path    => '../examples/00003',
     doc     => KorAP::Document->new( ... ),
     foundry => 'opennlp',
     layer   => 'tokens'
   );

   $tokens->parse;

 =head1 DESCRIPTION

 Convert token information from the KorAP XML
 format into Lucene Index compatible token streams.

 =head1 ATTRIBUTES

 =head2 path

   print $tokens->path;

 The path of the document.


 =head2 foundry

   print $tokens->foundry;

 The name of the foundry.

 =head2 should

 Number of tokens that exist at all.

 =head2 have

 Number of tokens effectively stored in the token stream (e.g., no punctuations).

 =head2 layer

   print $tokens->layer;

 The name of the tokens layer.


 =head2 doc

   print $tokens->doc->corpus_id;

 The L<KorAP::Document> object.


 =head2 stream

   $tokens->stream->add_meta('adjCount', '<i>45');

 The L<KorAP::Field::MultiTermTokenStream> object


 =head2 range

   $tokens->range->lookup(45);

 The L<KorAP::Tokenizer::Range> object for converting span offsets to positions.

 =head2 match

   $tokens->match->lookup(45);

 The L<KorAP::Tokenizer::Match> object for converting token offsets to positions.


 =head1 METHODS

 =head2 parse

   $tokens->parse;

 Start the tokenization process.


 =head2 to_json_legacy

   print $tokens->to_json_legacy;
   print $tokens->to_json_legacy(1);

 Return the token data in legacy JSON format.
 An optional boolean parameter indicates
 whether primary data should be included.

 =head2 to_json

   print $tokens->to_json;
   print $tokens->to_json(1);

 Return the token data in JSON format.
 An optional boolean parameter indicates
 whether primary data should be included.


 =head2 add_subtokens

   $tokens->add_subtokens;

 Add sub token information to the index.
 This is based on the C<aggressive> tokenization, written by Carsten Schnober.


 =head2 add_spandata

   $tokens->add_spandata(
     foundry => 'base',
     layer => 'sentences',
     cb => sub {
       my ($stream, $span) = @_;
       my $mtt = $stream->pos($span->p_start);
       $mtt->add(
 	term    => '<>:s',
 	o_start => $span->o_start,
 	o_end   => $span->o_end,
 	p_end   => $span->p_end
       );
     }
   );

 Add span information to the parsed token stream.
 Expects a C<foundry> name, a C<layer> name and a
 callback parameter, that will be called after each parsed
 span. The L<KorAP::Field::MultiTermTokenStream> object will be passed,
 as well as the current L<KorAP::Tokenizer::Span>.

 An optional parameter C<encoding> may indicate that the span offsets
 are referring to either C<bytes> or C<utf-8> offsets.

 An optional parameter C<skip> allows for skipping the process.


 =head2 add_tokendata

   $tokens->add_tokendata(
     foundry => 'connexor',
     layer => 'syntax',
     cb => sub {
       my ($stream, $token) = @_;
       my $mtt = $stream->pos($token->pos);
       my $content = $token->content;

       # syntax
       if ((my $found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
 	$mtt->add(
 	  term => 'cnx_syn:' . $found
 	);
       };
     });

 Add token information to the parsed token stream.
 Expects a C<foundry> name, a C<layer> name and a
 callback parameter, that will be called after each parsed
 token. The L<KorAP::Field::MultiTermTokenStream> object will be passed,
 as well as the current L<KorAP::Tokenizer::Span>.

 An optional parameter C<encoding> may indicate that the token offsets
 are referring to either C<bytes> or C<utf-8> offsets.

 An optional parameter C<skip> allows for skipping the process.

 =cut
	package KorAP::Tokenizer;
	use Mojo::Base -base;
	use Mojo::ByteStream 'b';
	use Mojo::Loader;
	use XML::Fast;
	use Try::Tiny;
	use Carp qw/croak/;
	use KorAP::Tokenizer::Range;
	use KorAP::Tokenizer::Match;
	use KorAP::Tokenizer::Spans;
	use KorAP::Tokenizer::Tokens;
	use KorAP::Field::MultiTermTokenStream;
	use List::MoreUtils 'uniq';
	use JSON::XS;
	use Log::Log4perl;

	# TODO 1:
	# Bei den Autoren im Index darauf achten, dass auch "etc." indiziert wird

	# TODO 2:
	# Add punktuations to the index
	# [Er sagte: "Hallo - na?"] becomes
	# [s:Er|tt/l:er|_1#0-2]
	# [s:sagte|tt/l:sagen|_2#3-8|.>::#8-9$1|.>tt/l:PUNCT#8-9$1|.>:"#10-11$2|.>tt/l:PUNCT#10-11$2]
	# [s:Hallo|tt/l:hallo|_3#11-16|.<::#8-9$2|.<tt/l:PUNCT#8-9$2|.<:"#10-11$1|.<tt/l:PUNCT#10-11$1|.>:-#17-18$1|.>tt/l:PUNCT#17-18$1]
	# [s:na|tt/l:na|_4#19-21|.<:-#17-18$1|.<tt/l:PUNCT#17-18$1|.>:?#21-22$1|.>tt/l:PUNCT#21-22$1|.>:"#22-23$2|.>tt/l:PUNCT#22-23$2]

	# TODO 3:
	# Meta-Data:
	# mysql> select * from textMeta limit 5;
	# +---------------+------------+------------+-------+--------+------+---------+-------------------------------------------+--------+-------+-------+-----------------------+-------------+-------+-----------+---------+----+------+------+----+----+-----+------+------+-----------+---------+-------+---------+-------------+------------------------+
	# \| sigle \| date \| fname \| fpos \| length \| tlen \| country \| topic1 \| topic2 \| tpc1v \| tpc2v \| supertopic1 \| supertopic2 \| words \| stopwords \| numbers \| s \| pars \| tags \| oo \| no \| ngc \| sscr \| cs \| has \| ressort \| genre \| type \| articletype \| md5 \|
	# +---------------+------------+------------+-------+--------+------+---------+-------------------------------------------+--------+-------+-------+-----------------------+-------------+-------+-----------+---------+----+------+------+----+----+-----+------+------+-----------+---------+-------+---------+-------------+------------------------+
	# \| A00/JAN.00001 \| 2000-01-01 \| a00.i5.xml \| 12667 \| 3131 \| 641 \| CH \| Freizeit_Unterhaltung:Reisen \| NULL \| 0.99 \| 0 \| Freizeit_Unterhaltung \| NULL \| 34 \| 24 \| 0 \| 6 \| 1 \| 5 \| 0 \| 0 \| 0 \| 0 \| 0 \| headlines \| NULL \| NULL \| Zeitung \| Bericht \| dk6ORWB5uTH33eiakNcJAA \|
	# \| A00/JAN.00002 \| 2000-01-01 \| a00.i5.xml \| 15798 \| 11853 \| 9267 \| CH \| Staat_Gesellschaft:Biographien_Interviews \| NULL \| 1 \| 0 \| Staat_Gesellschaft \| NULL \| 652 \| 445 \| 12 \| 77 \| 18 \| 14 \| 0 \| 0 \| 0 \| 0 \| 0 \| headlines \| NULL \| NULL \| Zeitung \| Bericht \| m7phXF1Ds+aPq3GfLRddCw \|
	# \| A00/JAN.00003 \| 2000-01-01 \| a00.i5.xml \| 27651 \| 4768 \| 2234 \| CH \| Politik:Kommunalpolitik \| NULL \| 1 \| 0 \| Politik \| NULL \| 135 \| 74 \| 2 \| 16 \| 3 \| 9 \| 0 \| 0 \| 0 \| 0 \| 0 \| headlines \| NULL \| NULL \| Zeitung \| Bericht \| XwsIDMEIT7ht52DnMTkSHw \|
	# \| A00/JAN.00004 \| 2000-01-01 \| a00.i5.xml \| 32419 \| 11096 \| 8619 \| CH \| Politik:Kommunalpolitik \| NULL \| 1 \| 0 \| Politik \| NULL \| 521 \| 368 \| 7 \| 57 \| 8 \| 22 \| 0 \| 0 \| 0 \| 0 \| 0 \| headlines \| NULL \| NULL \| Zeitung \| Bericht \| TSx4mDVLU6XibNHUUT+ubA \|
	# \| A00/JAN.00005 \| 2000-01-01 \| a00.i5.xml \| 43515 \| 5421 \| 2875 \| CH \| Politik:Kommunalpolitik \| NULL \| 1 \| 0 \| Politik \| NULL \| 177 \| 94 \| 7 \| 19 \| 3 \| 11 \| 0 \| 0 \| 0 \| 0 \| 0 \| headlines \| NULL \| NULL \| Zeitung \| Bericht \| 6OGZ7MqyjSOb9AtvFLWodA \|
	# +---------------+------------+------------+-------+--------+------+---------+-------------------------------------------+--------+-------+-------+-----------------------+-------------+-------+-----------+---------+----+------+------+----+----+-----+------+------+-----------+---------+-------+---------+-------------+------------------------+



	has [qw/path foundry doc stream should have name/];
	has layer => 'Tokens';

	has log => sub {
	if(Log::Log4perl->initialized()) {
	state $log = Log::Log4perl->get_logger(__PACKAGE__);
	};
	state $log = KorAP::Log->new;
	return $log;
	};

	# Parse tokens of the document
	sub parse {
	my $self = shift;

	# Create new token stream
	my $mtts = KorAP::Field::MultiTermTokenStream->new;
	my $path = $self->path . lc($self->foundry) . '/' . lc($self->layer) . '.xml';
	my $file = b($path)->slurp;
	# my $tokens = Mojo::DOM->new($file);
	# $tokens->xml(1);

	my $doc = $self->doc;

	my ($should, $have) = (0, 0);

	# Create range and match objects
	my $range = KorAP::Tokenizer::Range->new;
	my $match = KorAP::Tokenizer::Match->new;

	my $old = 0;

	$self->log->trace('Tokenize data ' . $self->foundry . ':' . $self->layer);

	# TODO: Reuse the following code from Spans.pm and tokens.pm
	my ($tokens, $error);
	try {
	local $SIG{__WARN__} = sub {
	$error = 1;
	};
	$tokens = xml2hash($file, text => '#text', array => ['span'], attr => '-')->{layer}->{spanList};
	}
	catch {
	$self->log->warn('Token error in ' . $path . ($_ ? ': ' . $_ : ''));
	$error = 1;
	};

	return if $error;

	if (ref $tokens && $tokens->{span}) {
	$tokens = $tokens->{span};
	}
	else {
	return $self;
	};

	$tokens = [$tokens] if ref $tokens ne 'ARRAY';

	# Iterate over all tokens
	# $tokens->find('span')->each(
	# sub {
	# my $span = $_;
	foreach my $span (@$tokens) {
	my $from = $span->{'-from'};
	my $to = $span->{'-to'};
	my $token = $doc->primary->data($from, $to);

	# warn 'Has ' . $from . '->' . $to . "($old)";

	unless (defined $token) {
	$self->log->error("Unable to find substring [$from-$to] in $path");
	next;
	};

	$should++;

	# Ignore non-word tokens
	next if $token !~ /[\w\d]/;

	my $mtt = $mtts->add;

	# Add gap for later finding matching positions before or after
	$range->gap($old, $from, $have) unless $old >= $from;

	# Add surface term
	# That's always the first term!
	$mtt->add('s:' . $token);

	# Add case insensitive term
	$mtt->add('i:' . lc $token);

	# Add offset information
	$mtt->o_start($from);
	$mtt->o_end($to);

	# Store offset information for position matching
	$range->set($from, $to, $have);
	$match->set($from, $to, $have);

	$old = $to + 1;

	# Add position term
	$mtt->add('_' . $have . '#' . $mtt->o_start . '-' . $mtt->o_end);

	$have++;
	};

	# Add token count
	$mtts->add_meta('tokens', '<i>' . $have);

	$range->gap($old, $doc->primary->data_length + 1, $have-1) if $doc->primary->data_length >= ($old - 1);

	# Add info
	$self->stream($mtts);
	$self->{range} = $range;
	$self->{match} = $match;
	$self->should($should);
	$self->have($have);

	$self->log->debug('With a non-word quota of ' . _perc($self->should, $self->should - $self->have) . ' %');

	return $self;
	};

	sub add_subtokens {
	my $self = shift;
	my $mtts = $self->stream or return;

	foreach my $mtt (@{$mtts->multi_term_tokens}) {
	my $o_start = $mtt->o_start;
	my $o_end = $mtt->o_end;
	my $l = $o_end - $o_start;

	my $os = my $s = $mtt->lc_surface;

	# Algorithm based on aggressive tokenization in
	# tokenize.pl from Carsten Schnober
	$s =~ s/[[:alpha:]]/a/g;
	$s =~ s/[[:digit:]]/0/g;
	$s =~ s/\p{Punct}/#/g;
	$s =~ y/~/A/;
	$s .= 'E';

	while ($s =~ /(a+)[^a]/g) {
	my $from = $-[1];
	my $to = $+[1];
	$mtt->add(
	term => 'i^1:' . substr($os, $from, $from + $to),
	o_start => $from + $o_start,
	o_end => $to + $o_start
	) unless $to - $from == $l;
	};
	while ($s =~ /(0+)[^0]/g) {
	my $from = $-[1];
	my $to = $+[1];
	$mtt->add(
	term => 'i^2:' . substr($os, $from, $from + $to),
	o_start => $from + $o_start,
	o_end => $to + $o_start
	) unless $to - $from == $l;
	};
	while ($s =~ /(#)/g) {
	my $from = $-[1];
	my $to = $+[1];
	$mtt->add(
	term => 'i^3:' . substr($os, $from, $from + $to),
	o_start => $from + $o_start,
	o_end => $to + $o_start
	) unless $to - $from == $l;
	};
	};

	return $self;
	};


	# Get span positions through character offsets
	sub range {
	return shift->{range} // KorAP::Tokenizer::Range->new;
	};


	# Get token positions through character offsets
	sub match {
	return shift->{match} // KorAP::Tokenizer::Match->new;
	};


	# Add information of spans to the tokens
	sub add_spandata {
	my $self = shift;
	my %param = @_;

	croak 'No token data available' unless $self->stream;

	$self->log->trace(
	($param{skip} ? 'Skip' : 'Add').' span data '.$param{foundry}.':'.$param{layer}
	);

	return if $param{skip};

	my $cb = delete $param{cb};

	$param{primary} = $self->doc->primary;

	my $spans = KorAP::Tokenizer::Spans->new(
	path => $self->path,
	range => $self->range,
	match => $self->match,
	%param
	);

	my $spanarray = $spans->parse or return;

	if ($spans->should == $spans->have) {
	$self->log->trace('With perfect alignment!');
	}
	else {
	$self->log->debug('With an alignment quota of ' . _perc($spans->should, $spans->have) . ' %');
	};

	if ($cb) {
	foreach (@$spanarray) {
	$cb->($self->stream, $_, $spans);
	};
	return 1;
	};
	return $spans;
	};

	# Add information to the tokens
	sub add_tokendata {
	my $self = shift;
	my %param = @_;

	croak 'No token data available' unless $self->stream;

	$self->log->trace(
	($param{skip} ? 'Skip' : 'Add').' token data '.$param{foundry}.':'.$param{layer}
	);
	return if $param{skip};

	my $cb = delete $param{cb};

	$param{primary} = $self->doc->primary;

	my $tokens = KorAP::Tokenizer::Tokens->new(
	path => $self->path,
	range => $self->range,
	match => $self->match,
	%param
	);

	my $tokenarray = $tokens->parse or return;

	if ($tokens->should == $tokens->have) {
	$self->log->trace('With perfect alignment!');
	}
	else {
	my $perc = _perc(
	$tokens->should, $tokens->have, $self->should, $self->should - $self->have
	);
	$self->log->debug('With an alignment quota of ' . $perc);
	};

	if ($cb) {
	foreach (@$tokenarray) {
	$cb->($self->stream, $_, $tokens);
	};
	return 1;
	};
	return $tokens;
	};


	sub add {
	my $self = shift;
	my $loader = Mojo::Loader->new;
	my $foundry = shift;
	my $layer = shift;

	unless ($foundry && $layer) {
	warn 'Unable to add specific module - not enough information given!';
	return;
	};

	my $mod = 'KorAP::Index::' . $foundry . '::' . $layer;

	if ($mod->can('new') || eval("require $mod; 1;")) {
	if (my $retval = $mod->new($self)->parse(@_)) {

	# This layer is supported
	$self->support($foundry => $layer, @_);

	# Get layerinfo
	$self->layer_info($mod->layer_info);
	return $retval;
	};
	}
	else {
	$self->log->error('Unable to load '.$mod . '(' . $@ . ')');
	};

	return;
	};


	sub _perc {
	if (@_ == 2) {
	# '[' . $_[0] . '/' . $_[1] . ']' .
	return sprintf("%.2f", ($_[1] * 100) / $_[0]);
	}

	my $a_should = shift;
	my $a_have = shift;
	my $b_should = shift;
	my $b_have = shift;
	my $a_quota = ($a_have * 100) / $a_should;
	my $b_quota = ($b_have * 100) / $b_should;
	return sprintf("%.2f", $a_quota) . '%' .
	((($a_quota + $b_quota) <= 100) ?
	' [' . sprintf("%.2f", $a_quota + $b_quota) . '%]' : '');
	};


	sub support {
	my $self = shift;

	# No setting - just getting
	unless ($_[0]) {
	my @supports;

	# Get all foundries
	foreach my $foundry (keys %{$self->{support}}) {
	push(@supports, $foundry);

	# Get all layers
	foreach my $layer (@{$self->{support}->{$foundry}}) {
	my @layers = @$layer;
	push(@supports, $foundry . '/' . $layers[0]);

	# More information
	if ($layers[1]) {
	push(@supports, $foundry . '/' . join('/', @layers));
	};
	};
	};
	return lc ( join ' ', sort {$a cmp $b } @supports );
	}
	elsif (!$_[1]) {
	return $self->{support}->{$_[0]} // []
	};
	my $f = lc shift;
	my $l = lc shift;
	my @info = @_;
	$self->{support} //= {};
	$self->{support}->{$f} //= [];
	push(@{$self->{support}->{$f}}, [$l, @info]);
	};


	sub layer_info {
	my $self = shift;
	$self->{layer_info} //= [];
	if ($_[0]) {
	push(@{$self->{layer_info}}, @{$_[0]});
	}
	else {
	return join ' ', sort {$a cmp $b } uniq @{$self->{layer_info}};
	};
	};


	sub to_string {
	my $self = shift;
	my $primary = defined $_[0] ? $_[0] : 1;
	my $string = "<meta>\n";
	$string .= $self->doc->to_string;
	$string .= "</meta>\n";
	if ($primary) {
	$string .= "<text>\n";
	$string .= $self->doc->primary->data . "\n";
	$string .= "</text>\n";
	};
	$string .= '<field name="' . $self->name . "\">\n";
	$string .= "<info>\n";
	$string .= 'tokenization = ' . $self->foundry . '#' . $self->layer . "\n";

	# There is support info
	if (my $support = $self->support) {
	$string .= 'support = ' . $support . "\n";
	};
	if (my $layer_info = $self->layer_info) {
	$string .= 'layer_info = ' . $layer_info . "\n";
	};

	$string .= "</info>\n";
	$string .= $self->stream->to_string;
	$string .= "</field>";
	return $string;
	};

	sub to_data {
	my $self = shift;
	my $primary = defined $_[0] ? $_[0] : 1;
	my $legacy = defined $_[1] ? $_[1] : 0;

	my %data = %{$self->doc->to_hash};
	my @fields;

	if ($legacy) {
	push(@fields, { primaryData => $self->doc->primary->data }) if $primary;

	push(@fields, {
	name => $self->name,
	data => $self->stream->to_array,
	tokenization => lc($self->foundry) . '#' . lc($self->layer),
	foundries => $self->support,
	layerInfo => $self->layer_info
	});

	$data{fields} = \@fields;
	}

	else {
	$data{primaryData} = $self->doc->primary->data if $primary;
	$data{tokenName} = $self->name;
	$data{data} = $self->stream->to_array;
	$data{tokenSource} = lc($self->foundry) . '#' . lc($self->layer);
	$data{foundries} = $self->support;
	$data{layerInfos} = $self->layer_info;
	$data{version} = '0.02';
	};

	\%data;
	};


	sub to_json_legacy {
	encode_json($_[0]->to_data($_[1], 1));
	};

	sub to_json {
	encode_json($_[0]->to_data($_[1], 0));
	};


	sub to_pretty_json {
	JSON::XS->new->pretty->encode($_[0]->to_data($_[1]));
	};


	1;


	__END__

	=pod

	=head1 NAME

	KorAP::Tokenizer

	=head1 SYNOPSIS

	my $tokens = KorAP::Tokenizer->new(
	path => '../examples/00003',
	doc => KorAP::Document->new( ... ),
	foundry => 'opennlp',
	layer => 'tokens'
	);

	$tokens->parse;

	=head1 DESCRIPTION

	Convert token information from the KorAP XML
	format into Lucene Index compatible token streams.

	=head1 ATTRIBUTES

	=head2 path

	print $tokens->path;

	The path of the document.


	=head2 foundry

	print $tokens->foundry;

	The name of the foundry.

	=head2 should

	Number of tokens that exist at all.

	=head2 have

	Number of tokens effectively stored in the token stream (e.g., no punctuations).

	=head2 layer

	print $tokens->layer;

	The name of the tokens layer.


	=head2 doc

	print $tokens->doc->corpus_id;

	The L<KorAP::Document> object.


	=head2 stream

	$tokens->stream->add_meta('adjCount', '<i>45');

	The L<KorAP::Field::MultiTermTokenStream> object


	=head2 range

	$tokens->range->lookup(45);

	The L<KorAP::Tokenizer::Range> object for converting span offsets to positions.

	=head2 match

	$tokens->match->lookup(45);

	The L<KorAP::Tokenizer::Match> object for converting token offsets to positions.


	=head1 METHODS

	=head2 parse

	$tokens->parse;

	Start the tokenization process.


	=head2 to_json_legacy

	print $tokens->to_json_legacy;
	print $tokens->to_json_legacy(1);

	Return the token data in legacy JSON format.
	An optional parsed boolean parameter indicates,
	if primary data should be included.

	=head2 to_json

	print $tokens->to_json;
	print $tokens->to_json(1);

	Return the token data in JSON format
	An optional parsed boolean parameter indicates,
	if primary data should be included.


	=head2 add_subtokens

	$tokens->split_tokens;
	$tokens->split_tokens(
	sub {
	...
	}
	);

	Add sub token information to the index.
	This is based on the C<aggressive> tokenization, written by Carsten Schnober.


	=head2 add_spandata

	$tokens->add_spandata(
	foundry => 'base',
	layer => 'sentences',
	cb => sub {
	my ($stream, $span) = @_;
	my $mtt = $stream->pos($span->p_start);
	$mtt->add(
	term => '<>:s',
	o_start => $span->o_start,
	o_end => $span->o_end,
	p_end => $span->p_end
	);
	}
	);

	Add span information to the parsed token stream.
	Expects a C<foundry> name, a C<layer> name and a
	callback parameter, that will be called after each parsed
	span. The L<KorAP::Field::MultiTermTokenStream> object will be passed,
	as well as the current L<KorAP::Tokenizer::Span>.

	An optional parameter C<encoding> may indicate that the span offsets
	are either refering to C<bytes> or C<utf-8> offsets.

	An optional parameter C<skip> allows for skipping the process.


	=head2 add_tokendata

	$tokens->add_tokendata(
	foundry => 'connexor',
	layer => 'syntax',
	cb => sub {
	my ($stream, $token) = @_;
	my $mtt = $stream->pos($token->pos);
	my $content = $token->content;

	# syntax
	if ((my $found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
	$mtt->add(
	term => 'cnx_syn:' . $found
	);
	};
	});

	Add token information to the parsed token stream.
	Expects a C<foundry> name, a C<layer> name and a
	callback parameter, that will be called after each parsed
	token. The L<KorAP::Field::MultiTermTokenStream> object will be passed,
	as well as the current L<KorAP::Tokenizer::Span>.

	An optional parameter C<encoding> may indicate that the token offsets
	are either refering to C<bytes> or C<utf-8> offsets.

	An optional parameter C<skip> allows for skipping the process.

	=cut