blob: d0a8a0f80c9cf6b391a85dcfa23fa0aed81eff54 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001package KorAP::Tokenizer;
Nils Diewald2db9ad02013-10-29 19:26:43 +00002use Mojo::Base -base;
3use Mojo::ByteStream 'b';
Nils Diewald7364d1f2013-11-05 19:26:35 +00004use Mojo::Loader;
Nils Diewald3cf08c72013-12-16 20:31:10 +00005use XML::Fast;
6use Try::Tiny;
Nils Diewald7364d1f2013-11-05 19:26:35 +00007use Carp qw/croak/;
Nils Diewald2db9ad02013-10-29 19:26:43 +00008use KorAP::Tokenizer::Range;
9use KorAP::Tokenizer::Match;
10use KorAP::Tokenizer::Spans;
11use KorAP::Tokenizer::Tokens;
Nils Diewald7364d1f2013-11-05 19:26:35 +000012use KorAP::Field::MultiTermTokenStream;
Nils Diewald3cf08c72013-12-16 20:31:10 +000013use List::MoreUtils 'uniq';
Nils Diewald7364d1f2013-11-05 19:26:35 +000014use JSON::XS;
Nils Diewald2db9ad02013-10-29 19:26:43 +000015use Log::Log4perl;
16
# Plain attributes without default values
has [qw(path foundry doc stream should have name)];

# Name of the tokenization layer, defaults to 'Tokens'
has layer => 'Tokens';
Nils Diewald2db9ad02013-10-29 19:26:43 +000019
# Default value of the 'log' attribute.
# Returns the Log::Log4perl logger of this package in case Log4perl
# was initialized, otherwise a KorAP::Log object is used as a fallback.
# Both loggers are created only once (state variables).
has log => sub {
  if (Log::Log4perl->initialized()) {
    state $log4perl = Log::Log4perl->get_logger(__PACKAGE__);

    # Return right here - without this the fallback logger below
    # would always be returned and the Log4perl logger never be used
    return $log4perl;
  };

  state $fallback = KorAP::Log->new;
  return $fallback;
};
27
# Parse tokens of the document.
# Reads the token layer file below the document path, creates one
# multi term token per word token and remembers the character offsets
# in range and match objects for later position lookups.
# Returns the tokenizer object (also for layers without any span) and
# nothing in case the token file could not be parsed.
sub parse {
  my $self = shift;

  # Create new token stream
  my $mtts = KorAP::Field::MultiTermTokenStream->new;

  # Load the token layer file
  my $path = $self->path . lc($self->foundry) . '/' . lc($self->layer) . '.xml';
  my $file = b($path)->slurp;

  my $doc = $self->doc;

  # Number of tokens found and number of tokens added to the stream
  my ($should, $have) = (0, 0);

  # Create range and match objects
  my $range = KorAP::Tokenizer::Range->new;
  my $match = KorAP::Tokenizer::Match->new;

  # Character offset following the last added token
  my $old = 0;

  $self->log->trace('Tokenize data ' . $self->foundry . ':' . $self->layer);

  # TODO: Reuse the following code from Spans.pm and tokens.pm
  my ($tokens, $error);
  try {

    # A warning during parsing is treated as an error
    local $SIG{__WARN__} = sub {
      $error = 1;
    };

    $tokens = xml2hash(
      $file,
      text  => '#text',
      array => ['span'],
      attr  => '-'
    )->{layer}->{spanList};
  }
  catch {
    $self->log->warn('Token error in ' . $path . ($_ ? ': ' . $_ : ''));
    $error = 1;
  };

  return if $error;

  # There are no spans in the layer
  return $self unless ref $tokens && $tokens->{span};

  $tokens = $tokens->{span};
  $tokens = [$tokens] if ref $tokens ne 'ARRAY';

  # Iterate over all tokens
  foreach my $span (@$tokens) {
    my $from  = $span->{'-from'};
    my $to    = $span->{'-to'};
    my $token = $doc->primary->data($from, $to);

    # The offsets do not point into the primary data
    unless (defined $token) {
      $self->log->error("Unable to find substring [$from-$to] in $path");
      next;
    };

    $should++;

    # Ignore non-word tokens
    next unless $token =~ /[\w\d]/;

    my $mtt = $mtts->add;

    # Add gap for later finding matching positions before or after
    unless ($old >= $from) {
      $range->gap($old, $from, $have);
    };

    # Add surface term
    # That's always the first term!
    $mtt->add('s:' . $token);

    # Add case insensitive term
    $mtt->add('i:' . lc $token);

    # Add offset information
    $mtt->o_start($from);
    $mtt->o_end($to);

    # Store offset information for position matching
    $range->set($from, $to, $have);
    $match->set($from, $to, $have);

    $old = $to + 1;

    # Add position term
    $mtt->add('_' . $have . '#' . $mtt->o_start . '-' . $mtt->o_end);

    $have++;
  };

  # Add token count
  $mtts->add_meta('tokens', '<i>' . $have);

  # Add a final gap up to the end of the primary data
  if ($doc->primary->data_length >= ($old - 1)) {
    $range->gap($old, $doc->primary->data_length + 1, $have - 1);
  };

  # Add info
  $self->stream($mtts);
  $self->{range} = $range;
  $self->{match} = $match;
  $self->should($should);
  $self->have($have);

  $self->log->debug(
    'With a non-word quota of ' .
      _perc($self->should, $self->should - $self->have) .
      ' %'
  );

  return $self;
};
140
# Add sub token terms to all tokens of the token stream.
# Each token surface is split into sequences of alphabetic characters
# (term prefix 'i^1:'), sequences of digits ('i^2:') and single
# punctuation characters ('i^3:'). Sub tokens spanning the whole token
# are not added.
# Returns the tokenizer object, or nothing if there is no token stream.
sub add_subtokens {
  my $self = shift;
  my $mtts = $self->stream or return;

  # Pattern on the character class string and prefix of the term to add
  my @rules = (
    [qr/(a+)[^a]/, 'i^1:'],
    [qr/(0+)[^0]/, 'i^2:'],
    [qr/(#)/,      'i^3:']
  );

  foreach my $mtt (@{$mtts->multi_term_tokens}) {
    my $o_start = $mtt->o_start;
    my $o_end   = $mtt->o_end;
    my $length  = $o_end - $o_start;

    # Strip the first two characters
    # (presumably the 'i:' prefix of the case insensitive term)
    my $surface = substr($mtt->lc_surface, 2);

    # Algorithm based on aggressive tokenization in
    # tokenize.pl from Carsten Schnober
    # Map each character to a symbol for its character class,
    # so all offsets in $shape are also offsets in $surface
    my $shape = $surface;
    $shape =~ s/[[:alpha:]]/a/g;
    $shape =~ s/[[:digit:]]/0/g;
    $shape =~ s/\p{Punct}/#/g;
    $shape =~ y/~/A/;

    # End marker, so sequences at the end of the token are terminated
    $shape .= 'E';

    foreach my $rule (@rules) {
      my ($pattern, $prefix) = @$rule;

      while ($shape =~ /$pattern/g) {

        # Offsets of the captured sequence
        my ($from, $to) = ($-[1], $+[1]);

        # The sub token is identical to the token
        next if $to - $from == $length;

        # The third parameter of substr is a length, not an end offset
        $mtt->add(
          term    => $prefix . substr($surface, $from, $to - $from),
          o_start => $from + $o_start,
          o_end   => $to + $o_start
        );
      };
    };
  };

  return $self;
};
192
Nils Diewald2db9ad02013-10-29 19:26:43 +0000193
# Get span positions through character offsets.
# Returns the stored range object or a new (empty) one,
# in case no range object was stored yet.
sub range {
  my $self = shift;
  return $self->{range} // KorAP::Tokenizer::Range->new;
};
198
199
# Get token positions through character offsets.
# Returns the stored match object or a new (empty) one,
# in case no match object was stored yet.
sub match {
  my $self = shift;
  return $self->{match} // KorAP::Tokenizer::Match->new;
};
204
205
# Add information of spans to the tokens.
# Expects named parameters, that are passed to the spans parser
# (e.g. foundry and layer), as well as
#   skip - do nothing but logging, if true
#   cb   - callback called with the stream, the span and the
#          spans object for each parsed span
# Dies if there is no token stream.
# Returns 1 if a callback was given, the spans object otherwise,
# and nothing if skipped or if the spans could not be parsed.
sub add_spandata {
  my ($self, %param) = @_;

  croak 'No token data available' unless $self->stream;

  my $action = $param{skip} ? 'Skip' : 'Add';
  $self->log->trace(
    $action . ' span data ' . $param{foundry} . ':' . $param{layer}
  );

  return if $param{skip};

  # The callback is not meant for the spans object
  my $cb = delete $param{cb};

  $param{primary} = $self->doc->primary;

  my $spans = KorAP::Tokenizer::Spans->new(
    path  => $self->path,
    range => $self->range,
    match => $self->match,
    %param
  );

  my $spanarray = $spans->parse or return;

  # Report on the alignment of spans and tokens
  if ($spans->should == $spans->have) {
    $self->log->trace('With perfect alignment!');
  }
  else {
    my $quota = _perc($spans->should, $spans->have);
    $self->log->debug('With an alignment quota of ' . $quota . ' %');
  };

  return $spans unless $cb;

  # Pass each span to the callback
  $cb->($self->stream, $_, $spans) foreach @$spanarray;
  return 1;
};
247
# Add information to the tokens.
# Expects named parameters, that are passed to the tokens parser
# (e.g. foundry and layer), as well as
#   skip - do nothing but logging, if true
#   cb   - callback called with the stream, the token and the
#          tokens object for each parsed token
# Dies if there is no token stream.
# Returns 1 if a callback was given, the tokens object otherwise,
# and nothing if skipped or if the tokens could not be parsed.
sub add_tokendata {
  my ($self, %param) = @_;

  croak 'No token data available' unless $self->stream;

  my $action = $param{skip} ? 'Skip' : 'Add';
  $self->log->trace(
    $action . ' token data ' . $param{foundry} . ':' . $param{layer}
  );

  return if $param{skip};

  # The callback is not meant for the tokens object
  my $cb = delete $param{cb};

  $param{primary} = $self->doc->primary;

  my $tokens = KorAP::Tokenizer::Tokens->new(
    path  => $self->path,
    range => $self->range,
    match => $self->match,
    %param
  );

  my $tokenarray = $tokens->parse or return;

  # Report on the alignment with the base tokenization
  if ($tokens->should == $tokens->have) {
    $self->log->trace('With perfect alignment!');
  }
  else {
    my $perc = _perc(
      $tokens->should,
      $tokens->have,
      $self->should,
      $self->should - $self->have
    );
    $self->log->debug('With an alignment quota of ' . $perc);
  };

  return $tokens unless $cb;

  # Pass each token to the callback
  $cb->($self->stream, $_, $tokens) foreach @$tokenarray;
  return 1;
};
291
292
# Add the annotations of a foundry layer to the token stream.
# Expects the names of the foundry and the layer, that are mapped to
# the module KorAP::Index::<foundry>::<layer>. The module is loaded on
# demand, and all further parameters are passed to its parse method.
# On success the layer is registered as supported and the return value
# of the parse method is returned, otherwise nothing is returned.
sub add {
  my $self    = shift;
  my $foundry = shift;
  my $layer   = shift;

  unless ($foundry && $layer) {
    warn 'Unable to add specific module - not enough information given!';
    return;
  };

  my $mod = 'KorAP::Index::' . $foundry . '::' . $layer;

  # Only load well-formed package names, so the given names
  # can neither inject code nor point to arbitrary files
  unless ($mod =~ /\A\w+(?:::\w+)*\z/) {
    $self->log->error('Unable to load ' . $mod . '(invalid module name)');
    return;
  };

  # Use the module if it is already available, otherwise require it
  my $loaded = $mod->can('new') || eval {
    (my $file = $mod . '.pm') =~ s{::}{/}g;
    require $file;
    1;
  };

  unless ($loaded) {
    $self->log->error('Unable to load '.$mod . '(' . $@ . ')');
    return;
  };

  if (my $retval = $mod->new($self)->parse(@_)) {

    # This layer is supported
    $self->support($foundry => $layer, @_);

    # Get layerinfo
    $self->layer_info($mod->layer_info);
    return $retval;
  };

  return;
};
323
324
# Calculate percentages for log messages.
# With two parameters (should, have) the percentage of have in should
# is returned with two decimal places (without a percent sign).
# With four parameters (a_should, a_have, b_should, b_have) the quota
# of a is returned with a percent sign, followed by the sum of both
# quotas in brackets, as long as this sum does not exceed 100.
# A quota based on a should value of 0 is treated as 0
# instead of dying with a division by zero.
sub _perc {
  if (@_ == 2) {
    my ($should, $have) = @_;
    return sprintf("%.2f", $should ? ($have * 100) / $should : 0);
  }

  my ($a_should, $a_have, $b_should, $b_have) = @_;
  my $a_quota = $a_should ? ($a_have * 100) / $a_should : 0;
  my $b_quota = $b_should ? ($b_have * 100) / $b_should : 0;

  my $perc = sprintf("%.2f", $a_quota) . '%';

  # Add the combined quota only if it is meaningful
  if (($a_quota + $b_quota) <= 100) {
    $perc .= ' [' . sprintf("%.2f", $a_quota + $b_quota) . '%]';
  };

  return $perc;
};
341
342
# Get or set information on supported foundries and layers.
# Without parameters a lowercased, sorted and space separated string
# of all supported foundries and layers is returned
# (in the form 'foundry', 'foundry/layer' and 'foundry/layer/info').
# With a foundry name only, the array reference of layer entries of
# this foundry is returned. Foundry names are stored lowercased, so
# the name is lowercased for the lookup as well.
# With a foundry, a layer and optional further information, the layer
# is registered for the foundry.
sub support {
  my $self = shift;

  # No setting - just getting
  unless ($_[0]) {
    my @supports;

    # Get all foundries (without creating the support hash)
    my $support = $self->{support} // {};
    foreach my $foundry (keys %$support) {
      push(@supports, $foundry);

      # Get all layers
      foreach my $layer (@{$support->{$foundry}}) {
        my @layers = @$layer;
        push(@supports, $foundry . '/' . $layers[0]);

        # More information
        if ($layers[1]) {
          push(@supports, $foundry . '/' . join('/', @layers));
        };
      };
    };
    return lc ( join ' ', sort {$a cmp $b } @supports );
  }

  # Get all layers of a foundry
  elsif (!$_[1]) {
    return $self->{support}->{lc $_[0]} // []
  };

  # Register the layer
  my $f = lc shift;
  my $l = lc shift;
  my @info = @_;
  $self->{support} //= {};
  $self->{support}->{$f} //= [];
  push(@{$self->{support}->{$f}}, [$l, @info]);
};
377
Nils Diewaldff6d0782014-06-10 18:26:36 +0000378
# Get or add layer information.
# With an array reference all of its entries are added to the list
# of layer information. Without a parameter the unique entries are
# returned as a sorted and space separated string.
sub layer_info {
  my ($self, $info) = @_;
  my $stored = ($self->{layer_info} //= []);

  # No setting - just getting
  unless ($info) {
    return join ' ', sort { $a cmp $b } uniq @$stored;
  };

  push(@$stored, @$info);
};
389
Nils Diewald7364d1f2013-11-05 19:26:35 +0000390
# Serialize the document and the token stream to a string.
# Expects an optional boolean parameter, indicating whether the
# primary data should be included (defaults to true).
sub to_string {
  my ($self, $primary) = @_;
  $primary //= 1;

  # Meta data of the document
  my @parts = ("<meta>\n" . $self->doc->to_string . "</meta>\n");

  # Primary data
  if ($primary) {
    push @parts, "<text>\n" . $self->doc->primary->data . "\n</text>\n";
  };

  # Information on the field
  push @parts, '<field name="' . $self->name . "\">\n<info>\n";
  push @parts,
    'tokenization = ' . $self->foundry . '#' . $self->layer . "\n";

  # There is support info
  my $support = $self->support;
  push @parts, 'support = ' . $support . "\n" if $support;

  # There is layer info
  my $layer_info = $self->layer_info;
  push @parts, 'layer_info = ' . $layer_info . "\n" if $layer_info;

  # The token stream
  push @parts, "</info>\n" . $self->stream->to_string . '</field>';

  return join '', @parts;
};
419
# Convert the document and the token stream to a data structure.
# Expects an optional boolean parameter, indicating whether the
# primary data should be included (defaults to true).
# Returns a hash reference with a copy of the document hash and
# the list of fields in 'fields'.
sub to_data {
  my ($self, $primary) = @_;
  $primary //= 1;

  my %data = %{$self->doc->to_hash};

  # The primary data is the first field, if requested
  my @fields;
  if ($primary) {
    push @fields, { primaryData => $self->doc->primary->data };
  };

  # The token stream with information on the annotations
  my %field = (
    name         => $self->name,
    data         => $self->stream->to_array,
    tokenization => lc($self->foundry) . '#' . lc($self->layer),
    foundries    => $self->support,
    layerInfo    => $self->layer_info
  );
  push @fields, \%field;

  $data{fields} = \@fields;
  return \%data;
};
439
Nils Diewaldd9c16612013-11-18 17:55:22 +0000440
# Serialize the data structure of to_data as a compact JSON string.
# The optional parameter is passed to to_data.
sub to_json {
  my ($self, $primary) = @_;
  encode_json($self->to_data($primary));
};
444
445
# Serialize the data structure of to_data as a pretty printed
# JSON string. The optional parameter is passed to to_data.
sub to_pretty_json {
  my ($self, $primary) = @_;
  my $json = JSON::XS->new->pretty;
  $json->encode($self->to_data($primary));
};
449
450
Nils Diewald2db9ad02013-10-29 19:26:43 +00004511;
452
453
454__END__
455
456=pod
457
458=head1 NAME
459
460KorAP::Tokenizer
461
462=head1 SYNOPSIS
463
464 my $tokens = KorAP::Tokenizer->new(
465 path => '../examples/00003',
466 doc => KorAP::Document->new( ... ),
467 foundry => 'opennlp',
468 layer => 'tokens'
469 );
470
471 $tokens->parse;
472
473=head1 DESCRIPTION
474
475Convert token information from the KorAP XML
476format into Lucene Index compatible token streams.
477
478=head1 ATTRIBUTES
479
480=head2 path
481
482 print $tokens->path;
483
484The path of the document.
485
486
487=head2 foundry
488
489 print $tokens->foundry;
490
491The name of the foundry.
492
Nils Diewald7b847222014-04-23 11:14:00 +0000493=head2 should
494
495Number of tokens that exist at all.
496
497=head2 have
498
499Number of tokens effectively stored in the token stream (e.g., no punctuations).
Nils Diewald2db9ad02013-10-29 19:26:43 +0000500
501=head2 layer
502
503 print $tokens->layer;
504
505The name of the tokens layer.
506
507
508=head2 doc
509
510 print $tokens->doc->corpus_id;
511
512The L<KorAP::Document> object.
513
514
515=head2 stream
516
517 $tokens->stream->add_meta('adjCount', '<i>45');
518
Nils Diewald7364d1f2013-11-05 19:26:35 +0000519The L<KorAP::Field::MultiTermTokenStream> object
Nils Diewald2db9ad02013-10-29 19:26:43 +0000520
521
522=head2 range
523
524 $tokens->range->lookup(45);
525
526The L<KorAP::Tokenizer::Range> object for converting span offsets to positions.
527
528=head2 match
529
530 $tokens->match->lookup(45);
531
532The L<KorAP::Tokenizer::Match> object for converting token offsets to positions.
533
534
535=head1 METHODS
536
537=head2 parse
538
539 $tokens->parse;
540
541Start the tokenization process.
542
543
Nils Diewaldf03c6802014-07-21 16:39:44 +0000544=head2 add_subtokens
545
  $tokens->add_subtokens;
552
553Add sub token information to the index.
554This is based on the C<aggressive> tokenization, written by Carsten Schnober.
555
556
Nils Diewald2db9ad02013-10-29 19:26:43 +0000557=head2 add_spandata
558
559 $tokens->add_spandata(
560 foundry => 'base',
561 layer => 'sentences',
562 cb => sub {
563 my ($stream, $span) = @_;
564 my $mtt = $stream->pos($span->p_start);
565 $mtt->add(
566 term => '<>:s',
567 o_start => $span->o_start,
568 o_end => $span->o_end,
569 p_end => $span->p_end
570 );
571 }
572 );
573
574Add span information to the parsed token stream.
575Expects a C<foundry> name, a C<layer> name and a
576callback parameter, that will be called after each parsed
Nils Diewald7364d1f2013-11-05 19:26:35 +0000577span. The L<KorAP::Field::MultiTermTokenStream> object will be passed,
Nils Diewald2db9ad02013-10-29 19:26:43 +0000578as well as the current L<KorAP::Tokenizer::Span>.
579
An optional parameter C<encoding> may indicate that the span offsets
are either referring to C<bytes> or C<utf-8> offsets.
582
583An optional parameter C<skip> allows for skipping the process.
584
585
586=head2 add_tokendata
587
588 $tokens->add_tokendata(
589 foundry => 'connexor',
590 layer => 'syntax',
591 cb => sub {
592 my ($stream, $token) = @_;
593 my $mtt = $stream->pos($token->pos);
594 my $content = $token->content;
595
596 # syntax
597 if ((my $found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
598 $mtt->add(
599 term => 'cnx_syn:' . $found
600 );
601 };
602 });
603
604Add token information to the parsed token stream.
605Expects a C<foundry> name, a C<layer> name and a
606callback parameter, that will be called after each parsed
Nils Diewald7364d1f2013-11-05 19:26:35 +0000607token. The L<KorAP::Field::MultiTermTokenStream> object will be passed,
Nils Diewald2db9ad02013-10-29 19:26:43 +0000608as well as the current L<KorAP::Tokenizer::Span>.
609
An optional parameter C<encoding> may indicate that the token offsets
are either referring to C<bytes> or C<utf-8> offsets.
612
613An optional parameter C<skip> allows for skipping the process.
614
615=cut