blob: 3b83b5faee27abca3d9a7d42f5b0f8f3fc996a86 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001package KorAP::Tokenizer;
Nils Diewald2db9ad02013-10-29 19:26:43 +00002use Mojo::Base -base;
3use Mojo::ByteStream 'b';
Nils Diewald7364d1f2013-11-05 19:26:35 +00004use Mojo::Loader;
5use Carp qw/croak/;
Nils Diewald2db9ad02013-10-29 19:26:43 +00006use KorAP::Tokenizer::Range;
7use KorAP::Tokenizer::Match;
8use KorAP::Tokenizer::Spans;
9use KorAP::Tokenizer::Tokens;
Nils Diewald7364d1f2013-11-05 19:26:35 +000010use KorAP::Field::MultiTermTokenStream;
11use JSON::XS;
Nils Diewald2db9ad02013-10-29 19:26:43 +000012use Log::Log4perl;
13
Nils Diewald7364d1f2013-11-05 19:26:35 +000014has [qw/path foundry doc stream should have name/];
15has layer => 'Tokens';
Nils Diewald2db9ad02013-10-29 19:26:43 +000016
17has 'log' => sub {
18 Log::Log4perl->get_logger(__PACKAGE__)
19};
20
21# Parse tokens of the document
22sub parse {
23 my $self = shift;
24
25 # Create new token stream
Nils Diewald7364d1f2013-11-05 19:26:35 +000026 my $mtts = KorAP::Field::MultiTermTokenStream->new;
Nils Diewald092178e2013-11-26 16:18:48 +000027 my $path = $self->path . lc($self->foundry) . '/' . lc($self->layer) . '.xml';
28 my $file = b($path)->slurp;
Nils Diewald2db9ad02013-10-29 19:26:43 +000029 my $tokens = Mojo::DOM->new($file);
30 $tokens->xml(1);
31
32 my $doc = $self->doc;
33
34 my ($should, $have) = (0, 0);
35
36 # Create range and match objects
37 my $range = KorAP::Tokenizer::Range->new;
38 my $match = KorAP::Tokenizer::Match->new;
39
40 my $old = 0;
41
42 $self->log->trace('Tokenize data ' . $self->foundry . ':' . $self->layer);
43
44 # Iterate over all tokens
45 $tokens->find('span')->each(
46 sub {
47 my $span = $_;
48 my $from = $span->attr('from');
49 my $to = $span->attr('to');
50 my $token = $doc->primary->data($from, $to);
51
Nils Diewaldaba47102013-11-27 15:02:47 +000052 unless (defined $token) {
Nils Diewald092178e2013-11-26 16:18:48 +000053 $self->log->error("Unable to find substring [$from-$to] in $path");
54 return;
55 };
56
Nils Diewald2db9ad02013-10-29 19:26:43 +000057 $should++;
58
59 # Ignore non-word tokens
60 return if $token !~ /[\w\d]/;
61
62 my $mtt = $mtts->add;
63
64 # Add gap for later finding matching positions before or after
65 $range->gap($old, $from, $have) unless $old >= $from;
66
67 # Add surface term
68 $mtt->add('s:' . $token);
69
70 # Add case insensitive term
71 $mtt->add('i:' . lc $token);
72
73 # Add offset information
74 $mtt->o_start($from);
75 $mtt->o_end($to);
76
77 # Store offset information for position matching
78 $range->set($from, $to, $have);
79 $match->set($from, $to, $have);
80
81 $old = $to + 1;
82
83 # Add position term
84 $mtt->add('_' . $have . '#' . $mtt->o_start . '-' . $mtt->o_end);
85
86 $have++;
87 });
88
89 # Add token count
Nils Diewald092178e2013-11-26 16:18:48 +000090 $mtts->add_meta('tokens', '<i>' . $have);
Nils Diewald2db9ad02013-10-29 19:26:43 +000091
92 $range->gap($old, $doc->primary->data_length, $have-1) if $doc->primary->data_length >= $old;
93
94 # Add info
95 $self->stream($mtts);
96 $self->{range} = $range;
97 $self->{match} = $match;
98 $self->should($should);
99 $self->have($have);
100
101 $self->log->debug('With a non-word quota of ' . _perc($self->should, $self->should - $self->have) . ' %');
102};
103
104
105# Get span positions through character offsets
106sub range {
107 return shift->{range} // KorAP::Tokenizer::Range->new;
108};
109
110
111# Get token positions through character offsets
112sub match {
113 return shift->{match} // KorAP::Tokenizer::Match->new;
114};
115
116
117# Add information of spans to the tokens
118sub add_spandata {
119 my $self = shift;
120 my %param = @_;
121
122 croak 'No token data available' unless $self->stream;
123
124 $self->log->trace(
125 ($param{skip} ? 'Skip' : 'Add').' span data '.$param{foundry}.':'.$param{layer}
126 );
127
128 return if $param{skip};
129
130 my $cb = delete $param{cb};
131
Nils Diewald7364d1f2013-11-05 19:26:35 +0000132 $param{primary} = $self->doc->primary;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000133
134 my $spans = KorAP::Tokenizer::Spans->new(
135 path => $self->path,
136 range => $self->range,
Nils Diewald7364d1f2013-11-05 19:26:35 +0000137 match => $self->match,
Nils Diewald2db9ad02013-10-29 19:26:43 +0000138 %param
139 );
140
Nils Diewaldaba47102013-11-27 15:02:47 +0000141 my $spanarray = $spans->parse or return;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000142
143 if ($spans->should == $spans->have) {
144 $self->log->trace('With perfect alignment!');
145 }
146 else {
147 $self->log->debug('With an alignment quota of ' . _perc($spans->should, $spans->have) . ' %');
148 };
149
Nils Diewald2db9ad02013-10-29 19:26:43 +0000150 if ($cb) {
151 foreach (@$spanarray) {
Nils Diewald7364d1f2013-11-05 19:26:35 +0000152 $cb->($self->stream, $_, $spans);
Nils Diewald2db9ad02013-10-29 19:26:43 +0000153 };
Nils Diewald7364d1f2013-11-05 19:26:35 +0000154 return 1;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000155 };
156 return $spans;
157};
158
159
160# Add information to the tokens
161sub add_tokendata {
162 my $self = shift;
163 my %param = @_;
164
165 croak 'No token data available' unless $self->stream;
166
167 $self->log->trace(
168 ($param{skip} ? 'Skip' : 'Add').' token data '.$param{foundry}.':'.$param{layer}
169 );
170 return if $param{skip};
171
172 my $cb = delete $param{cb};
173
Nils Diewald7364d1f2013-11-05 19:26:35 +0000174 $param{primary} = $self->doc->primary;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000175
176 my $tokens = KorAP::Tokenizer::Tokens->new(
177 path => $self->path,
Nils Diewald7364d1f2013-11-05 19:26:35 +0000178 range => $self->range,
Nils Diewald2db9ad02013-10-29 19:26:43 +0000179 match => $self->match,
180 %param
181 );
182
Nils Diewaldaba47102013-11-27 15:02:47 +0000183 my $tokenarray = $tokens->parse or return;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000184
185 if ($tokens->should == $tokens->have) {
186 $self->log->trace('With perfect alignment!');
187 }
188 else {
189 my $perc = _perc(
190 $tokens->should, $tokens->have, $self->should, $self->should - $self->have
191 );
192 $self->log->debug('With an alignment quota of ' . $perc);
193 };
194
195 if ($cb) {
196 foreach (@$tokenarray) {
Nils Diewald7364d1f2013-11-05 19:26:35 +0000197 $cb->($self->stream, $_, $tokens);
Nils Diewald2db9ad02013-10-29 19:26:43 +0000198 };
Nils Diewald7364d1f2013-11-05 19:26:35 +0000199 return 1;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000200 };
201 return $tokens;
202};
203
204
Nils Diewald7364d1f2013-11-05 19:26:35 +0000205sub add {
206 my $self = shift;
207 my $loader = Mojo::Loader->new;
208 my $foundry = shift;
209 my $layer = shift;
210 my $mod = 'KorAP::Index::' . $foundry . '::' . $layer;
211
212 if ($mod->can('new') || eval("require $mod; 1;")) {
213 if (my $retval = $mod->new($self)->parse(@_)) {
214 $self->support($foundry => $layer, @_);
215 return $retval;
216 };
217 }
218 else {
219 $self->log->error('Unable to load '.$mod . '(' . $@ . ')');
220 };
221
222 return;
223};
224
225
Nils Diewald2db9ad02013-10-29 19:26:43 +0000226sub _perc {
227 if (@_ == 2) {
228 # '[' . $_[0] . '/' . $_[1] . ']' .
229 return sprintf("%.2f", ($_[1] * 100) / $_[0]);
230 }
231
232 my $a_should = shift;
233 my $a_have = shift;
234 my $b_should = shift;
235 my $b_have = shift;
236 my $a_quota = ($a_have * 100) / $a_should;
237 my $b_quota = ($b_have * 100) / $b_should;
238 return sprintf("%.2f", $a_quota) . '%' .
239 ((($a_quota + $b_quota) <= 100) ?
240 ' [' . sprintf("%.2f", $a_quota + $b_quota) . '%]' : '');
241};
242
243
Nils Diewald7364d1f2013-11-05 19:26:35 +0000244sub support {
245 my $self = shift;
246 unless ($_[0]) {
Nils Diewaldd9c16612013-11-18 17:55:22 +0000247 my @supports;
248 foreach my $foundry (keys %{$self->{support}}) {
249 push(@supports, $foundry);
250 foreach my $layer (@{$self->{support}->{$foundry}}) {
251 my @layers = @$layer;
Nils Diewald37e5b572013-11-20 20:26:03 +0000252 push(@supports, $foundry . '/' . $layers[0]);
Nils Diewaldd9c16612013-11-18 17:55:22 +0000253 if ($layers[1]) {
Nils Diewald37e5b572013-11-20 20:26:03 +0000254 push(@supports, $foundry . '/' . join('/', @layers));
Nils Diewaldd9c16612013-11-18 17:55:22 +0000255 };
256 };
257 };
258 return lc ( join ' ', @supports );
Nils Diewald7364d1f2013-11-05 19:26:35 +0000259 }
260 elsif (!$_[1]) {
261 return $self->{support}->{$_[0]} // []
262 };
263 my $f = lc shift;
264 my $l = lc shift;
265 my @info = @_;
266 $self->{support} //= {};
267 $self->{support}->{$f} //= [];
268 push(@{$self->{support}->{$f}}, [$l, @info]);
269};
270
271
272sub to_string {
273 my $self = shift;
274 my $primary = defined $_[0] ? $_[0] : 1;
275 my $string = "<meta>\n";
276 $string .= $self->doc->to_string;
277 $string .= "</meta>\n";
278 if ($primary) {
279 $string .= "<text>\n";
280 $string .= $self->doc->primary->data . "\n";
281 $string .= "</text>\n";
282 };
283 $string .= '<field name="' . $self->name . "\">\n";
284 $string .= "<info>\n";
285 $string .= 'tokenization = ' . $self->foundry . '#' . $self->layer . "\n";
286 foreach my $foundry (keys %{$self->support}) {
287 foreach (@{$self->support($foundry)}) {
288 $string .= 'support = ' . $foundry . '#' . join(',', @{$_}) . "\n";
289 };
290 };
291 $string .= "</info>\n";
292 $string .= $self->stream->to_string;
293 $string .= "</field>";
294 return $string;
295};
296
297sub to_data {
298 my $self = shift;
299 my $primary = defined $_[0] ? $_[0] : 1;
Nils Diewald044c41d2013-11-11 21:45:09 +0000300 my %data = %{$self->doc->to_hash};
301
302 my @fields;
303 push(@fields, { primaryData => $self->doc->primary->data }) if $primary;
304
305 push(@fields, {
Nils Diewald7364d1f2013-11-05 19:26:35 +0000306 name => $self->name,
307 data => $self->stream->to_array,
Nils Diewaldd9c16612013-11-18 17:55:22 +0000308 tokenization => lc($self->foundry) . '#' . lc($self->layer),
309 foundries => $self->support
Nils Diewald044c41d2013-11-11 21:45:09 +0000310 });
311
312 $data{fields} = \@fields;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000313 \%data;
314};
315
Nils Diewaldd9c16612013-11-18 17:55:22 +0000316
Nils Diewald7364d1f2013-11-05 19:26:35 +0000317sub to_json {
318 encode_json($_[0]->to_data($_[1]));
319};
320
321
322sub to_pretty_json {
323 JSON::XS->new->pretty->encode($_[0]->to_data($_[1]));
324};
325
326
Nils Diewald2db9ad02013-10-29 19:26:43 +00003271;
328
329
330__END__
331
332=pod
333
334=head1 NAME
335
336KorAP::Tokenizer
337
338=head1 SYNOPSIS
339
340 my $tokens = KorAP::Tokenizer->new(
341 path => '../examples/00003',
342 doc => KorAP::Document->new( ... ),
343 foundry => 'opennlp',
344 layer => 'tokens'
345 );
346
347 $tokens->parse;
348
349=head1 DESCRIPTION
350
351Convert token information from the KorAP XML
352format into Lucene Index compatible token streams.
353
354=head1 ATTRIBUTES
355
356=head2 path
357
358 print $tokens->path;
359
360The path of the document.
361
362
363=head2 foundry
364
365 print $tokens->foundry;
366
367The name of the foundry.
368
369
370=head2 layer
371
372 print $tokens->layer;
373
374The name of the tokens layer.
375
376
377=head2 doc
378
379 print $tokens->doc->corpus_id;
380
381The L<KorAP::Document> object.
382
383
384=head2 stream
385
386 $tokens->stream->add_meta('adjCount', '<i>45');
387
Nils Diewald7364d1f2013-11-05 19:26:35 +0000388The L<KorAP::Field::MultiTermTokenStream> object
Nils Diewald2db9ad02013-10-29 19:26:43 +0000389
390
391=head2 range
392
393 $tokens->range->lookup(45);
394
395The L<KorAP::Tokenizer::Range> object for converting span offsets to positions.
396
397=head2 match
398
399 $tokens->match->lookup(45);
400
401The L<KorAP::Tokenizer::Match> object for converting token offsets to positions.
402
403
404=head1 METHODS
405
406=head2 parse
407
408 $tokens->parse;
409
410Start the tokenization process.
411
412
413=head2 add_spandata
414
415 $tokens->add_spandata(
416 foundry => 'base',
417 layer => 'sentences',
418 cb => sub {
419 my ($stream, $span) = @_;
420 my $mtt = $stream->pos($span->p_start);
421 $mtt->add(
422 term => '<>:s',
423 o_start => $span->o_start,
424 o_end => $span->o_end,
425 p_end => $span->p_end
426 );
427 }
428 );
429
430Add span information to the parsed token stream.
431Expects a C<foundry> name, a C<layer> name and a
432callback parameter, that will be called after each parsed
Nils Diewald7364d1f2013-11-05 19:26:35 +0000433span. The L<KorAP::Field::MultiTermTokenStream> object will be passed,
Nils Diewald2db9ad02013-10-29 19:26:43 +0000434as well as the current L<KorAP::Tokenizer::Span>.
435
436An optional parameter C<encoding> may indicate that the span offsets
437are either refering to C<bytes> or C<utf-8> offsets.
438
439An optional parameter C<skip> allows for skipping the process.
440
441
442=head2 add_tokendata
443
444 $tokens->add_tokendata(
445 foundry => 'connexor',
446 layer => 'syntax',
447 cb => sub {
448 my ($stream, $token) = @_;
449 my $mtt = $stream->pos($token->pos);
450 my $content = $token->content;
451
452 # syntax
453 if ((my $found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
454 $mtt->add(
455 term => 'cnx_syn:' . $found
456 );
457 };
458 });
459
460Add token information to the parsed token stream.
461Expects a C<foundry> name, a C<layer> name and a
462callback parameter, that will be called after each parsed
Nils Diewald7364d1f2013-11-05 19:26:35 +0000463token. The L<KorAP::Field::MultiTermTokenStream> object will be passed,
Nils Diewald2db9ad02013-10-29 19:26:43 +0000464as well as the current L<KorAP::Tokenizer::Span>.
465
466An optional parameter C<encoding> may indicate that the token offsets
467are either refering to C<bytes> or C<utf-8> offsets.
468
469An optional parameter C<skip> allows for skipping the process.
470
471=cut