blob: 65cdd4cb28281e2d7552f493758c12b200e457e1 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001package KorAP::Tokenizer;
Nils Diewald2db9ad02013-10-29 19:26:43 +00002use Mojo::Base -base;
3use Mojo::ByteStream 'b';
Nils Diewald7364d1f2013-11-05 19:26:35 +00004use Mojo::Loader;
Nils Diewald3cf08c72013-12-16 20:31:10 +00005use XML::Fast;
6use Try::Tiny;
Nils Diewald7364d1f2013-11-05 19:26:35 +00007use Carp qw/croak/;
Nils Diewald2db9ad02013-10-29 19:26:43 +00008use KorAP::Tokenizer::Range;
9use KorAP::Tokenizer::Match;
10use KorAP::Tokenizer::Spans;
11use KorAP::Tokenizer::Tokens;
Nils Diewald7364d1f2013-11-05 19:26:35 +000012use KorAP::Field::MultiTermTokenStream;
Nils Diewald3cf08c72013-12-16 20:31:10 +000013use List::MoreUtils 'uniq';
Nils Diewald7364d1f2013-11-05 19:26:35 +000014use JSON::XS;
Nils Diewald2db9ad02013-10-29 19:26:43 +000015use Log::Log4perl;
16
Nils Diewald840c9242014-10-28 19:51:26 +000017# TODO 1:
18# Bei den Autoren im Index darauf achten, dass auch "etc." indiziert wird
19
20# TODO 2:
Nils Diewalda96de622014-10-31 17:29:23 +000021# That's now implemented but deactivated
22
Nils Diewald6dab7ea2014-09-10 17:37:19 +000023# Add punktuations to the index
24# [Er sagte: "Hallo - na?"] becomes
25# [s:Er|tt/l:er|_1#0-2]
26# [s:sagte|tt/l:sagen|_2#3-8|.>::#8-9$1|.>tt/l:PUNCT#8-9$1|.>:"#10-11$2|.>tt/l:PUNCT#10-11$2]
Nils Diewalda96de622014-10-31 17:29:23 +000027# [s:Hallo|tt/l:hallo|_3#11-16|.<::#8-9$2|.<tt/l:PUNCT#8-9$2|.<:"#10-11$1|.<:tt/l:PUNCT#10-11$1|.>:-#17-18$1|.>tt/l:PUNCT#17-18$1]
Nils Diewald6dab7ea2014-09-10 17:37:19 +000028# [s:na|tt/l:na|_4#19-21|.<:-#17-18$1|.<tt/l:PUNCT#17-18$1|.>:?#21-22$1|.>tt/l:PUNCT#21-22$1|.>:"#22-23$2|.>tt/l:PUNCT#22-23$2]
29
Nils Diewald840c9242014-10-28 19:51:26 +000030
Nils Diewald7364d1f2013-11-05 19:26:35 +000031has [qw/path foundry doc stream should have name/];
32has layer => 'Tokens';
Nils Diewald2db9ad02013-10-29 19:26:43 +000033
Nils Diewald7b847222014-04-23 11:14:00 +000034has log => sub {
35 if(Log::Log4perl->initialized()) {
36 state $log = Log::Log4perl->get_logger(__PACKAGE__);
Nils Diewald7b847222014-04-23 11:14:00 +000037 };
38 state $log = KorAP::Log->new;
39 return $log;
Nils Diewald2db9ad02013-10-29 19:26:43 +000040};
41
42# Parse tokens of the document
43sub parse {
44 my $self = shift;
45
46 # Create new token stream
Nils Diewald7364d1f2013-11-05 19:26:35 +000047 my $mtts = KorAP::Field::MultiTermTokenStream->new;
Nils Diewald092178e2013-11-26 16:18:48 +000048 my $path = $self->path . lc($self->foundry) . '/' . lc($self->layer) . '.xml';
49 my $file = b($path)->slurp;
Nils Diewald3cf08c72013-12-16 20:31:10 +000050# my $tokens = Mojo::DOM->new($file);
51# $tokens->xml(1);
Nils Diewald2db9ad02013-10-29 19:26:43 +000052
53 my $doc = $self->doc;
54
55 my ($should, $have) = (0, 0);
56
57 # Create range and match objects
58 my $range = KorAP::Tokenizer::Range->new;
59 my $match = KorAP::Tokenizer::Match->new;
60
61 my $old = 0;
62
63 $self->log->trace('Tokenize data ' . $self->foundry . ':' . $self->layer);
64
Nils Diewald3cf08c72013-12-16 20:31:10 +000065 # TODO: Reuse the following code from Spans.pm and tokens.pm
66 my ($tokens, $error);
67 try {
68 local $SIG{__WARN__} = sub {
69 $error = 1;
70 };
Nils Diewald7b847222014-04-23 11:14:00 +000071 $tokens = xml2hash($file, text => '#text', array => ['span'], attr => '-')->{layer}->{spanList};
Nils Diewald3cf08c72013-12-16 20:31:10 +000072 }
73 catch {
74 $self->log->warn('Token error in ' . $path . ($_ ? ': ' . $_ : ''));
75 $error = 1;
76 };
77
78 return if $error;
79
80 if (ref $tokens && $tokens->{span}) {
Nils Diewald7b847222014-04-23 11:14:00 +000081 $tokens = $tokens->{span};
Nils Diewald3cf08c72013-12-16 20:31:10 +000082 }
83 else {
Nils Diewald21a3e1a2014-04-28 18:48:16 +000084 return $self;
Nils Diewald3cf08c72013-12-16 20:31:10 +000085 };
86
87 $tokens = [$tokens] if ref $tokens ne 'ARRAY';
88
Nils Diewald2db9ad02013-10-29 19:26:43 +000089 # Iterate over all tokens
Nils Diewald3cf08c72013-12-16 20:31:10 +000090 # $tokens->find('span')->each(
91 # sub {
92 # my $span = $_;
Nils Diewalda96de622014-10-31 17:29:23 +000093 my $mtt;
94 my $distance = 0;
95 my (@non_word_tokens);
Nils Diewald3cf08c72013-12-16 20:31:10 +000096 foreach my $span (@$tokens) {
97 my $from = $span->{'-from'};
98 my $to = $span->{'-to'};
Nils Diewald2db9ad02013-10-29 19:26:43 +000099 my $token = $doc->primary->data($from, $to);
100
Nils Diewald3ece6302013-12-02 18:38:16 +0000101 # warn 'Has ' . $from . '->' . $to . "($old)";
102
Nils Diewaldaba47102013-11-27 15:02:47 +0000103 unless (defined $token) {
Nils Diewald092178e2013-11-26 16:18:48 +0000104 $self->log->error("Unable to find substring [$from-$to] in $path");
Nils Diewald3cf08c72013-12-16 20:31:10 +0000105 next;
Nils Diewald092178e2013-11-26 16:18:48 +0000106 };
107
Nils Diewald2db9ad02013-10-29 19:26:43 +0000108 $should++;
109
110 # Ignore non-word tokens
Nils Diewalda96de622014-10-31 17:29:23 +0000111 if ($token !~ /[\w\d]/) {
112# if ($mtt) {
113# my $term = [$token, $from, $to];
114# $mtt->add(
115# term => '.>:'.$token,
116# payload => '<i>'.$from . '<i>' . $to . '<b>' . $distance++
117# );
118# push(@non_word_tokens, $term);
119# }
120 next;
121 };
Nils Diewald3cf08c72013-12-16 20:31:10 +0000122
Nils Diewalda96de622014-10-31 17:29:23 +0000123 $mtt = $mtts->add;
124
125# while (scalar @non_word_tokens) {
126# local $_ = shift @non_word_tokens;
127# $mtt->add(
128# term => '.<:' . $_->[0],
129# payload => '<i>' . $_->[1] . '<i>' . $_->[2] . '<b>' . --$distance
130# );
131# $distance = 0;
132# };
Nils Diewald2db9ad02013-10-29 19:26:43 +0000133
134 # Add gap for later finding matching positions before or after
135 $range->gap($old, $from, $have) unless $old >= $from;
136
137 # Add surface term
Nils Diewaldf03c6802014-07-21 16:39:44 +0000138 # That's always the first term!
Nils Diewald2db9ad02013-10-29 19:26:43 +0000139 $mtt->add('s:' . $token);
140
141 # Add case insensitive term
142 $mtt->add('i:' . lc $token);
143
144 # Add offset information
145 $mtt->o_start($from);
146 $mtt->o_end($to);
147
148 # Store offset information for position matching
149 $range->set($from, $to, $have);
150 $match->set($from, $to, $have);
151
152 $old = $to + 1;
153
154 # Add position term
155 $mtt->add('_' . $have . '#' . $mtt->o_start . '-' . $mtt->o_end);
156
157 $have++;
Nils Diewald3cf08c72013-12-16 20:31:10 +0000158 };
Nils Diewald2db9ad02013-10-29 19:26:43 +0000159
160 # Add token count
Nils Diewald092178e2013-11-26 16:18:48 +0000161 $mtts->add_meta('tokens', '<i>' . $have);
Nils Diewald2db9ad02013-10-29 19:26:43 +0000162
Nils Diewalda96de622014-10-31 17:29:23 +0000163 if ($doc->primary->data_length >= ($old - 1)) {
164 $range->gap($old, $doc->primary->data_length + 1, $have-1)
165 };
Nils Diewald2db9ad02013-10-29 19:26:43 +0000166
167 # Add info
168 $self->stream($mtts);
169 $self->{range} = $range;
170 $self->{match} = $match;
171 $self->should($should);
172 $self->have($have);
173
Nils Diewald7b847222014-04-23 11:14:00 +0000174 $self->log->debug('With a non-word quota of ' . _perc($self->should, $self->should - $self->have) . ' %');
Nils Diewald21a3e1a2014-04-28 18:48:16 +0000175
176 return $self;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000177};
178
Nils Diewalda96de622014-10-31 17:29:23 +0000179
180# This is now done by glemm
Nils Diewaldf03c6802014-07-21 16:39:44 +0000181sub add_subtokens {
182 my $self = shift;
183 my $mtts = $self->stream or return;
184
185 foreach my $mtt (@{$mtts->multi_term_tokens}) {
186 my $o_start = $mtt->o_start;
187 my $o_end = $mtt->o_end;
188 my $l = $o_end - $o_start;
189
Nils Diewald7658e0a2014-07-21 18:47:11 +0000190 my $os = my $s = $mtt->lc_surface;
Nils Diewaldf03c6802014-07-21 16:39:44 +0000191
192 # Algorithm based on aggressive tokenization in
193 # tokenize.pl from Carsten Schnober
194 $s =~ s/[[:alpha:]]/a/g;
195 $s =~ s/[[:digit:]]/0/g;
196 $s =~ s/\p{Punct}/#/g;
197 $s =~ y/~/A/;
198 $s .= 'E';
199
200 while ($s =~ /(a+)[^a]/g) {
201 my $from = $-[1];
202 my $to = $+[1];
203 $mtt->add(
204 term => 'i^1:' . substr($os, $from, $from + $to),
205 o_start => $from + $o_start,
206 o_end => $to + $o_start
207 ) unless $to - $from == $l;
208 };
209 while ($s =~ /(0+)[^0]/g) {
210 my $from = $-[1];
211 my $to = $+[1];
212 $mtt->add(
213 term => 'i^2:' . substr($os, $from, $from + $to),
214 o_start => $from + $o_start,
215 o_end => $to + $o_start
216 ) unless $to - $from == $l;
217 };
218 while ($s =~ /(#)/g) {
219 my $from = $-[1];
220 my $to = $+[1];
221 $mtt->add(
222 term => 'i^3:' . substr($os, $from, $from + $to),
223 o_start => $from + $o_start,
224 o_end => $to + $o_start
225 ) unless $to - $from == $l;
226 };
227 };
228
Nils Diewald032e31d2014-07-21 18:39:12 +0000229 return $self;
Nils Diewaldf03c6802014-07-21 16:39:44 +0000230};
231
Nils Diewald2db9ad02013-10-29 19:26:43 +0000232
233# Get span positions through character offsets
234sub range {
235 return shift->{range} // KorAP::Tokenizer::Range->new;
236};
237
238
239# Get token positions through character offsets
240sub match {
241 return shift->{match} // KorAP::Tokenizer::Match->new;
242};
243
244
245# Add information of spans to the tokens
246sub add_spandata {
247 my $self = shift;
248 my %param = @_;
249
250 croak 'No token data available' unless $self->stream;
251
252 $self->log->trace(
253 ($param{skip} ? 'Skip' : 'Add').' span data '.$param{foundry}.':'.$param{layer}
254 );
255
256 return if $param{skip};
257
258 my $cb = delete $param{cb};
259
Nils Diewald7364d1f2013-11-05 19:26:35 +0000260 $param{primary} = $self->doc->primary;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000261
262 my $spans = KorAP::Tokenizer::Spans->new(
263 path => $self->path,
264 range => $self->range,
Nils Diewald7364d1f2013-11-05 19:26:35 +0000265 match => $self->match,
Nils Diewald2db9ad02013-10-29 19:26:43 +0000266 %param
267 );
268
Nils Diewaldaba47102013-11-27 15:02:47 +0000269 my $spanarray = $spans->parse or return;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000270
271 if ($spans->should == $spans->have) {
272 $self->log->trace('With perfect alignment!');
273 }
274 else {
275 $self->log->debug('With an alignment quota of ' . _perc($spans->should, $spans->have) . ' %');
276 };
277
Nils Diewald2db9ad02013-10-29 19:26:43 +0000278 if ($cb) {
279 foreach (@$spanarray) {
Nils Diewald7364d1f2013-11-05 19:26:35 +0000280 $cb->($self->stream, $_, $spans);
Nils Diewald2db9ad02013-10-29 19:26:43 +0000281 };
Nils Diewald7364d1f2013-11-05 19:26:35 +0000282 return 1;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000283 };
284 return $spans;
285};
286
Nils Diewald2db9ad02013-10-29 19:26:43 +0000287# Add information to the tokens
288sub add_tokendata {
289 my $self = shift;
290 my %param = @_;
291
292 croak 'No token data available' unless $self->stream;
293
294 $self->log->trace(
295 ($param{skip} ? 'Skip' : 'Add').' token data '.$param{foundry}.':'.$param{layer}
296 );
297 return if $param{skip};
298
299 my $cb = delete $param{cb};
300
Nils Diewald7364d1f2013-11-05 19:26:35 +0000301 $param{primary} = $self->doc->primary;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000302
303 my $tokens = KorAP::Tokenizer::Tokens->new(
304 path => $self->path,
Nils Diewald7364d1f2013-11-05 19:26:35 +0000305 range => $self->range,
Nils Diewald2db9ad02013-10-29 19:26:43 +0000306 match => $self->match,
307 %param
308 );
309
Nils Diewaldaba47102013-11-27 15:02:47 +0000310 my $tokenarray = $tokens->parse or return;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000311
312 if ($tokens->should == $tokens->have) {
313 $self->log->trace('With perfect alignment!');
314 }
315 else {
316 my $perc = _perc(
317 $tokens->should, $tokens->have, $self->should, $self->should - $self->have
318 );
319 $self->log->debug('With an alignment quota of ' . $perc);
320 };
321
322 if ($cb) {
323 foreach (@$tokenarray) {
Nils Diewald7364d1f2013-11-05 19:26:35 +0000324 $cb->($self->stream, $_, $tokens);
Nils Diewald2db9ad02013-10-29 19:26:43 +0000325 };
Nils Diewald7364d1f2013-11-05 19:26:35 +0000326 return 1;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000327 };
328 return $tokens;
329};
330
331
Nils Diewald7364d1f2013-11-05 19:26:35 +0000332sub add {
333 my $self = shift;
334 my $loader = Mojo::Loader->new;
335 my $foundry = shift;
336 my $layer = shift;
Nils Diewaldff6d0782014-06-10 18:26:36 +0000337
338 unless ($foundry && $layer) {
339 warn 'Unable to add specific module - not enough information given!';
340 return;
341 };
342
Nils Diewald7364d1f2013-11-05 19:26:35 +0000343 my $mod = 'KorAP::Index::' . $foundry . '::' . $layer;
344
345 if ($mod->can('new') || eval("require $mod; 1;")) {
346 if (my $retval = $mod->new($self)->parse(@_)) {
Nils Diewald3cf08c72013-12-16 20:31:10 +0000347
348 # This layer is supported
Nils Diewald7364d1f2013-11-05 19:26:35 +0000349 $self->support($foundry => $layer, @_);
Nils Diewald3cf08c72013-12-16 20:31:10 +0000350
351 # Get layerinfo
352 $self->layer_info($mod->layer_info);
Nils Diewald7364d1f2013-11-05 19:26:35 +0000353 return $retval;
354 };
355 }
356 else {
357 $self->log->error('Unable to load '.$mod . '(' . $@ . ')');
358 };
359
360 return;
361};
362
363
Nils Diewald2db9ad02013-10-29 19:26:43 +0000364sub _perc {
365 if (@_ == 2) {
366 # '[' . $_[0] . '/' . $_[1] . ']' .
367 return sprintf("%.2f", ($_[1] * 100) / $_[0]);
368 }
369
370 my $a_should = shift;
371 my $a_have = shift;
372 my $b_should = shift;
373 my $b_have = shift;
374 my $a_quota = ($a_have * 100) / $a_should;
375 my $b_quota = ($b_have * 100) / $b_should;
376 return sprintf("%.2f", $a_quota) . '%' .
377 ((($a_quota + $b_quota) <= 100) ?
378 ' [' . sprintf("%.2f", $a_quota + $b_quota) . '%]' : '');
379};
380
381
Nils Diewald7364d1f2013-11-05 19:26:35 +0000382sub support {
383 my $self = shift;
Nils Diewaldff6d0782014-06-10 18:26:36 +0000384
385 # No setting - just getting
Nils Diewald7364d1f2013-11-05 19:26:35 +0000386 unless ($_[0]) {
Nils Diewaldd9c16612013-11-18 17:55:22 +0000387 my @supports;
Nils Diewaldff6d0782014-06-10 18:26:36 +0000388
389 # Get all foundries
Nils Diewaldd9c16612013-11-18 17:55:22 +0000390 foreach my $foundry (keys %{$self->{support}}) {
391 push(@supports, $foundry);
Nils Diewaldff6d0782014-06-10 18:26:36 +0000392
393 # Get all layers
Nils Diewaldd9c16612013-11-18 17:55:22 +0000394 foreach my $layer (@{$self->{support}->{$foundry}}) {
395 my @layers = @$layer;
Nils Diewald37e5b572013-11-20 20:26:03 +0000396 push(@supports, $foundry . '/' . $layers[0]);
Nils Diewaldff6d0782014-06-10 18:26:36 +0000397
398 # More information
Nils Diewaldd9c16612013-11-18 17:55:22 +0000399 if ($layers[1]) {
Nils Diewald37e5b572013-11-20 20:26:03 +0000400 push(@supports, $foundry . '/' . join('/', @layers));
Nils Diewaldd9c16612013-11-18 17:55:22 +0000401 };
402 };
403 };
Nils Diewald7b847222014-04-23 11:14:00 +0000404 return lc ( join ' ', sort {$a cmp $b } @supports );
Nils Diewald7364d1f2013-11-05 19:26:35 +0000405 }
406 elsif (!$_[1]) {
407 return $self->{support}->{$_[0]} // []
408 };
409 my $f = lc shift;
410 my $l = lc shift;
411 my @info = @_;
412 $self->{support} //= {};
413 $self->{support}->{$f} //= [];
414 push(@{$self->{support}->{$f}}, [$l, @info]);
415};
416
Nils Diewaldff6d0782014-06-10 18:26:36 +0000417
Nils Diewald3cf08c72013-12-16 20:31:10 +0000418sub layer_info {
419 my $self = shift;
420 $self->{layer_info} //= [];
421 if ($_[0]) {
422 push(@{$self->{layer_info}}, @{$_[0]});
423 }
424 else {
Nils Diewald7b847222014-04-23 11:14:00 +0000425 return join ' ', sort {$a cmp $b } uniq @{$self->{layer_info}};
Nils Diewald3cf08c72013-12-16 20:31:10 +0000426 };
427};
428
Nils Diewald7364d1f2013-11-05 19:26:35 +0000429
430sub to_string {
431 my $self = shift;
432 my $primary = defined $_[0] ? $_[0] : 1;
433 my $string = "<meta>\n";
434 $string .= $self->doc->to_string;
435 $string .= "</meta>\n";
436 if ($primary) {
437 $string .= "<text>\n";
438 $string .= $self->doc->primary->data . "\n";
439 $string .= "</text>\n";
440 };
441 $string .= '<field name="' . $self->name . "\">\n";
442 $string .= "<info>\n";
443 $string .= 'tokenization = ' . $self->foundry . '#' . $self->layer . "\n";
Nils Diewaldff6d0782014-06-10 18:26:36 +0000444
445 # There is support info
446 if (my $support = $self->support) {
447 $string .= 'support = ' . $support . "\n";
Nils Diewald7364d1f2013-11-05 19:26:35 +0000448 };
Nils Diewaldff6d0782014-06-10 18:26:36 +0000449 if (my $layer_info = $self->layer_info) {
450 $string .= 'layer_info = ' . $layer_info . "\n";
Nils Diewald3cf08c72013-12-16 20:31:10 +0000451 };
452
Nils Diewald7364d1f2013-11-05 19:26:35 +0000453 $string .= "</info>\n";
454 $string .= $self->stream->to_string;
455 $string .= "</field>";
456 return $string;
457};
458
459sub to_data {
460 my $self = shift;
461 my $primary = defined $_[0] ? $_[0] : 1;
Nils Diewald32e30f02014-10-30 00:52:36 +0000462 my $legacy = defined $_[1] ? $_[1] : 0;
463
Nils Diewald044c41d2013-11-11 21:45:09 +0000464 my %data = %{$self->doc->to_hash};
Nils Diewald044c41d2013-11-11 21:45:09 +0000465 my @fields;
Nils Diewald044c41d2013-11-11 21:45:09 +0000466
Nils Diewald32e30f02014-10-30 00:52:36 +0000467 if ($legacy) {
468 push(@fields, { primaryData => $self->doc->primary->data }) if $primary;
Nils Diewald044c41d2013-11-11 21:45:09 +0000469
Nils Diewald32e30f02014-10-30 00:52:36 +0000470 push(@fields, {
471 name => $self->name,
472 data => $self->stream->to_array,
473 tokenization => lc($self->foundry) . '#' . lc($self->layer),
474 foundries => $self->support,
475 layerInfo => $self->layer_info
476 });
477
478 $data{fields} = \@fields;
479 }
480
481 else {
482 $data{primaryData} = $self->doc->primary->data if $primary;
483 $data{tokenName} = $self->name;
484 $data{data} = $self->stream->to_array;
485 $data{tokenSource} = lc($self->foundry) . '#' . lc($self->layer);
486 $data{foundries} = $self->support;
487 $data{layerInfos} = $self->layer_info;
488 $data{version} = '0.02';
489 };
490
Nils Diewald7364d1f2013-11-05 19:26:35 +0000491 \%data;
492};
493
Nils Diewaldd9c16612013-11-18 17:55:22 +0000494
Nils Diewald32e30f02014-10-30 00:52:36 +0000495sub to_json_legacy {
496 encode_json($_[0]->to_data($_[1], 1));
497};
498
Nils Diewald7364d1f2013-11-05 19:26:35 +0000499sub to_json {
Nils Diewald32e30f02014-10-30 00:52:36 +0000500 encode_json($_[0]->to_data($_[1], 0));
Nils Diewald7364d1f2013-11-05 19:26:35 +0000501};
502
503
504sub to_pretty_json {
505 JSON::XS->new->pretty->encode($_[0]->to_data($_[1]));
506};
507
508
Nils Diewald2db9ad02013-10-29 19:26:43 +00005091;
510
511
512__END__
513
514=pod
515
516=head1 NAME
517
518KorAP::Tokenizer
519
520=head1 SYNOPSIS
521
522 my $tokens = KorAP::Tokenizer->new(
523 path => '../examples/00003',
524 doc => KorAP::Document->new( ... ),
525 foundry => 'opennlp',
526 layer => 'tokens'
527 );
528
529 $tokens->parse;
530
531=head1 DESCRIPTION
532
533Convert token information from the KorAP XML
534format into Lucene Index compatible token streams.
535
536=head1 ATTRIBUTES
537
538=head2 path
539
540 print $tokens->path;
541
542The path of the document.
543
544
545=head2 foundry
546
547 print $tokens->foundry;
548
549The name of the foundry.
550
Nils Diewald7b847222014-04-23 11:14:00 +0000551=head2 should
552
553Number of tokens that exist at all.
554
555=head2 have
556
557Number of tokens effectively stored in the token stream (e.g., no punctuations).
Nils Diewald2db9ad02013-10-29 19:26:43 +0000558
559=head2 layer
560
561 print $tokens->layer;
562
563The name of the tokens layer.
564
565
566=head2 doc
567
568 print $tokens->doc->corpus_id;
569
570The L<KorAP::Document> object.
571
572
573=head2 stream
574
575 $tokens->stream->add_meta('adjCount', '<i>45');
576
Nils Diewald7364d1f2013-11-05 19:26:35 +0000577The L<KorAP::Field::MultiTermTokenStream> object
Nils Diewald2db9ad02013-10-29 19:26:43 +0000578
579
580=head2 range
581
582 $tokens->range->lookup(45);
583
584The L<KorAP::Tokenizer::Range> object for converting span offsets to positions.
585
586=head2 match
587
588 $tokens->match->lookup(45);
589
590The L<KorAP::Tokenizer::Match> object for converting token offsets to positions.
591
592
593=head1 METHODS
594
595=head2 parse
596
597 $tokens->parse;
598
599Start the tokenization process.
600
601
Nils Diewald32e30f02014-10-30 00:52:36 +0000602=head2 to_json_legacy
603
604 print $tokens->to_json_legacy;
605 print $tokens->to_json_legacy(1);
606
607Return the token data in legacy JSON format.
608An optional parsed boolean parameter indicates,
609if primary data should be included.
610
611=head2 to_json
612
613 print $tokens->to_json;
614 print $tokens->to_json(1);
615
616Return the token data in JSON format
617An optional parsed boolean parameter indicates,
618if primary data should be included.
619
620
Nils Diewaldf03c6802014-07-21 16:39:44 +0000621=head2 add_subtokens
622
623 $tokens->split_tokens;
624 $tokens->split_tokens(
625 sub {
626 ...
627 }
628 );
629
630Add sub token information to the index.
631This is based on the C<aggressive> tokenization, written by Carsten Schnober.
632
633
Nils Diewald2db9ad02013-10-29 19:26:43 +0000634=head2 add_spandata
635
636 $tokens->add_spandata(
637 foundry => 'base',
638 layer => 'sentences',
639 cb => sub {
640 my ($stream, $span) = @_;
641 my $mtt = $stream->pos($span->p_start);
642 $mtt->add(
643 term => '<>:s',
644 o_start => $span->o_start,
645 o_end => $span->o_end,
646 p_end => $span->p_end
647 );
648 }
649 );
650
651Add span information to the parsed token stream.
652Expects a C<foundry> name, a C<layer> name and a
653callback parameter, that will be called after each parsed
Nils Diewald7364d1f2013-11-05 19:26:35 +0000654span. The L<KorAP::Field::MultiTermTokenStream> object will be passed,
Nils Diewald2db9ad02013-10-29 19:26:43 +0000655as well as the current L<KorAP::Tokenizer::Span>.
656
657An optional parameter C<encoding> may indicate that the span offsets
658are either refering to C<bytes> or C<utf-8> offsets.
659
660An optional parameter C<skip> allows for skipping the process.
661
662
663=head2 add_tokendata
664
665 $tokens->add_tokendata(
666 foundry => 'connexor',
667 layer => 'syntax',
668 cb => sub {
669 my ($stream, $token) = @_;
670 my $mtt = $stream->pos($token->pos);
671 my $content = $token->content;
672
673 # syntax
674 if ((my $found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
675 $mtt->add(
676 term => 'cnx_syn:' . $found
677 );
678 };
679 });
680
681Add token information to the parsed token stream.
682Expects a C<foundry> name, a C<layer> name and a
683callback parameter, that will be called after each parsed
Nils Diewald7364d1f2013-11-05 19:26:35 +0000684token. The L<KorAP::Field::MultiTermTokenStream> object will be passed,
Nils Diewald2db9ad02013-10-29 19:26:43 +0000685as well as the current L<KorAP::Tokenizer::Span>.
686
687An optional parameter C<encoding> may indicate that the token offsets
688are either refering to C<bytes> or C<utf-8> offsets.
689
690An optional parameter C<skip> allows for skipping the process.
691
692=cut