package KorAP::Tokenizer;
use Mojo::Base -base;
use Mojo::ByteStream 'b';
use Mojo::DOM;
use Mojo::Loader;
use Carp qw/croak/;
use KorAP::Tokenizer::Range;
use KorAP::Tokenizer::Match;
use KorAP::Tokenizer::Spans;
use KorAP::Tokenizer::Tokens;
use KorAP::Field::MultiTermTokenStream;
use JSON::XS;
use Log::Log4perl;

has [qw/path foundry doc stream should have name/];
has layer => 'Tokens';

has 'log' => sub {
  Log::Log4perl->get_logger(__PACKAGE__)
};

# Parse tokens of the document
sub parse {
  my $self = shift;

  # Create new token stream
  my $mtts = KorAP::Field::MultiTermTokenStream->new;
  my $path = $self->path . lc($self->foundry) . '/' . lc($self->layer) . '.xml';
  my $file = b($path)->slurp;
  my $tokens = Mojo::DOM->new($file);
  $tokens->xml(1);

  my $doc = $self->doc;

  my ($should, $have) = (0, 0);

  # Create range and match objects
  my $range = KorAP::Tokenizer::Range->new;
  my $match = KorAP::Tokenizer::Match->new;

  my $old = 0;

  $self->log->trace('Tokenize data ' . $self->foundry . ':' . $self->layer);

  # Iterate over all tokens
  $tokens->find('span')->each(
    sub {
      my $span  = $_;
      my $from  = $span->attr('from');
      my $to    = $span->attr('to');
      my $token = $doc->primary->data($from, $to);

      unless (defined $token) {
        $self->log->error("Unable to find substring [$from-$to] in $path");
        return;
      };

      $should++;

      # Ignore non-word tokens
      return if $token !~ /[\w\d]/;

      my $mtt = $mtts->add;

      # Add gap for later finding matching positions before or after
      $range->gap($old, $from, $have) unless $old >= $from;

      # Add surface term
      $mtt->add('s:' . $token);

      # Add case insensitive term
      $mtt->add('i:' . lc $token);

      # Add offset information
      $mtt->o_start($from);
      $mtt->o_end($to);

      # Store offset information for position matching
      $range->set($from, $to, $have);
      $match->set($from, $to, $have);

      $old = $to + 1;

      # Add position term
      $mtt->add('_' . $have . '#' . $mtt->o_start . '-' . $mtt->o_end);

      $have++;
    });

  # Add token count
  $mtts->add_meta('tokens', '<i>' . $have);

  $range->gap($old, $doc->primary->data_length, $have - 1)
    if $doc->primary->data_length >= $old;

  # Add info
  $self->stream($mtts);
  $self->{range} = $range;
  $self->{match} = $match;
  $self->should($should);
  $self->have($have);

  $self->log->debug('With a non-word quota of ' . _perc($self->should, $self->should - $self->have) . ' %');
};


# Get span positions through character offsets
sub range {
  return shift->{range} // KorAP::Tokenizer::Range->new;
};


# Get token positions through character offsets
sub match {
  return shift->{match} // KorAP::Tokenizer::Match->new;
};


# Add information of spans to the tokens
sub add_spandata {
  my $self = shift;
  my %param = @_;

  croak 'No token data available' unless $self->stream;

  $self->log->trace(
    ($param{skip} ? 'Skip' : 'Add') . ' span data ' . $param{foundry} . ':' . $param{layer}
  );

  return if $param{skip};

  my $cb = delete $param{cb};

  $param{primary} = $self->doc->primary;

  my $spans = KorAP::Tokenizer::Spans->new(
    path  => $self->path,
    range => $self->range,
    match => $self->match,
    %param
  );

  my $spanarray = $spans->parse or return;

  if ($spans->should == $spans->have) {
    $self->log->trace('With perfect alignment!');
  }
  else {
    $self->log->debug('With an alignment quota of ' . _perc($spans->should, $spans->have) . ' %');
  };

  if ($cb) {
    foreach (@$spanarray) {
      $cb->($self->stream, $_, $spans);
    };
    return 1;
  };
  return $spans;
};


# Add information to the tokens
sub add_tokendata {
  my $self = shift;
  my %param = @_;

  croak 'No token data available' unless $self->stream;

  $self->log->trace(
    ($param{skip} ? 'Skip' : 'Add') . ' token data ' . $param{foundry} . ':' . $param{layer}
  );
  return if $param{skip};

  my $cb = delete $param{cb};

  $param{primary} = $self->doc->primary;

  my $tokens = KorAP::Tokenizer::Tokens->new(
    path  => $self->path,
    range => $self->range,
    match => $self->match,
    %param
  );

  my $tokenarray = $tokens->parse or return;

  if ($tokens->should == $tokens->have) {
    $self->log->trace('With perfect alignment!');
  }
  else {
    my $perc = _perc(
      $tokens->should, $tokens->have, $self->should, $self->should - $self->have
    );
    $self->log->debug('With an alignment quota of ' . $perc);
  };

  if ($cb) {
    foreach (@$tokenarray) {
      $cb->($self->stream, $_, $tokens);
    };
    return 1;
  };
  return $tokens;
};


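# Add annotation data of a given foundry and layer by delegating
# to a corresponding KorAP::Index::<Foundry>::<Layer> module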
sub add {
  my $self = shift;
  my $loader = Mojo::Loader->new;
  my $foundry = shift;
  my $layer = shift;
  my $mod = 'KorAP::Index::' . $foundry . '::' . $layer;

  if ($mod->can('new') || eval("require $mod; 1;")) {
    if (my $retval = $mod->new($self)->parse(@_)) {
      $self->support($foundry => $layer, @_);
      return $retval;
    };
  }
  else {
    $self->log->error('Unable to load ' . $mod . ' (' . $@ . ')');
  };

  return;
};


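# Calculate percentages for quota reporting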
sub _perc {
  if (@_ == 2) {
    # '[' . $_[0] . '/' . $_[1] . ']' .
    return sprintf("%.2f", ($_[1] * 100) / $_[0]);
  }

  my $a_should = shift;
  my $a_have   = shift;
  my $b_should = shift;
  my $b_have   = shift;
  my $a_quota = ($a_have * 100) / $a_should;
  my $b_quota = ($b_have * 100) / $b_should;
  return sprintf("%.2f", $a_quota) . '%' .
    ((($a_quota + $b_quota) <= 100) ?
       ' [' . sprintf("%.2f", $a_quota + $b_quota) . '%]' : '');
};


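# Get or set information on supported foundries and layers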
sub support {
  my $self = shift;
  unless ($_[0]) {
    my @supports;
    foreach my $foundry (keys %{$self->{support}}) {
      push(@supports, $foundry);
      foreach my $layer (@{$self->{support}->{$foundry}}) {
        my @layers = @$layer;
        push(@supports, $foundry . '/' . $layers[0]);
        if ($layers[1]) {
          push(@supports, $foundry . '/' . join('/', @layers));
        };
      };
    };
    return lc ( join ' ', @supports );
  }
  elsif (!$_[1]) {
    return $self->{support}->{$_[0]} // [];
  };
  my $f = lc shift;
  my $l = lc shift;
  my @info = @_;
  $self->{support} //= {};
  $self->{support}->{$f} //= [];
  push(@{$self->{support}->{$f}}, [$l, @info]);
};


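# Serialize as a plain text representation
# (document metadata, optional primary text, and the token stream)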
sub to_string {
  my $self = shift;
  my $primary = defined $_[0] ? $_[0] : 1;
  my $string = "<meta>\n";
  $string .= $self->doc->to_string;
  $string .= "</meta>\n";
  if ($primary) {
    $string .= "<text>\n";
    $string .= $self->doc->primary->data . "\n";
    $string .= "</text>\n";
  };
  $string .= '<field name="' . $self->name . "\">\n";
  $string .= "<info>\n";
  $string .= 'tokenization = ' . $self->foundry . '#' . $self->layer . "\n";

  # Iterate over the raw support structure, as support()
  # without arguments returns a flattened string
  foreach my $foundry (keys %{$self->{support} // {}}) {
    foreach (@{$self->support($foundry)}) {
      $string .= 'support = ' . $foundry . '#' . join(',', @{$_}) . "\n";
    };
  };
  $string .= "</info>\n";
  $string .= $self->stream->to_string;
  $string .= "</field>";
  return $string;
};

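# Serialize as a Perl data structure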
sub to_data {
  my $self = shift;
  my $primary = defined $_[0] ? $_[0] : 1;
  my %data = %{$self->doc->to_hash};

  my @fields;
  push(@fields, { primaryData => $self->doc->primary->data }) if $primary;

  push(@fields, {
    name => $self->name,
    data => $self->stream->to_array,
    tokenization => lc($self->foundry) . '#' . lc($self->layer),
    foundries => $self->support
  });

  $data{fields} = \@fields;
  \%data;
};


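# Serialize as JSON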
sub to_json {
  encode_json($_[0]->to_data($_[1]));
};


sub to_pretty_json {
  JSON::XS->new->pretty->encode($_[0]->to_data($_[1]));
};


1;


__END__

=pod

=head1 NAME

KorAP::Tokenizer

=head1 SYNOPSIS

  my $tokens = KorAP::Tokenizer->new(
    path    => '../examples/00003',
    doc     => KorAP::Document->new( ... ),
    foundry => 'opennlp',
    layer   => 'tokens'
  );

  $tokens->parse;

=head1 DESCRIPTION

Convert token information from the KorAP XML format
into token streams compatible with a Lucene index.
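
For example, after parsing, the token stream can be serialized for
indexing. A minimal sketch, reusing the object from the synopsis
(a false argument excludes the primary text from the output):

  # Serialize without primary data
  print $tokens->to_json(0);

  # ... or as a human readable dump
  print $tokens->to_pretty_json;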

=head1 ATTRIBUTES

=head2 path

  print $tokens->path;

The path of the document.


=head2 foundry

  print $tokens->foundry;

The name of the foundry.


=head2 layer

  print $tokens->layer;

The name of the tokens layer.


=head2 doc

  print $tokens->doc->corpus_id;

The L<KorAP::Document> object.


=head2 stream

  $tokens->stream->add_meta('adjCount', '<i>45');

The L<KorAP::Field::MultiTermTokenStream> object.


=head2 range

  $tokens->range->lookup(45);

The L<KorAP::Tokenizer::Range> object for converting span offsets to positions.

=head2 match

  $tokens->match->lookup(45);

The L<KorAP::Tokenizer::Match> object for converting token offsets to positions.


=head1 METHODS

=head2 parse

  $tokens->parse;

Start the tokenization process.
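
After a run, the number of tokens found in the tokenization layer and
the number of word tokens actually added to the stream are available
via the C<should> and C<have> attributes. A minimal sketch:

  $tokens->parse;

  printf "Indexed %d of %d tokens\n", $tokens->have, $tokens->should;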


=head2 add_spandata

  $tokens->add_spandata(
    foundry => 'base',
    layer => 'sentences',
    cb => sub {
      my ($stream, $span) = @_;
      my $mtt = $stream->pos($span->p_start);
      $mtt->add(
        term => '<>:s',
        o_start => $span->o_start,
        o_end => $span->o_end,
        p_end => $span->p_end
      );
    }
  );

Add span information to the parsed token stream.
Expects a C<foundry> name, a C<layer> name and a C<cb> callback
parameter that is called for each parsed span. The callback receives
the L<KorAP::Field::MultiTermTokenStream> object as well as the
current L<KorAP::Tokenizer::Span>.

An optional parameter C<encoding> may indicate whether the span
offsets refer to C<bytes> or C<utf-8> offsets.

An optional parameter C<skip> allows for skipping the process.
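
For instance, byte offsets can be declared and the process can be
skipped conditionally. This is a sketch only; the C<$skip_base> flag
is hypothetical and the callback body is elided:

  $tokens->add_spandata(
    foundry  => 'base',
    layer    => 'sentences',
    encoding => 'bytes',
    skip     => $skip_base,
    cb       => sub {
      my ($stream, $span) = @_;
      # ...
    }
  );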


=head2 add_tokendata

  $tokens->add_tokendata(
    foundry => 'connexor',
    layer => 'syntax',
    cb => sub {
      my ($stream, $token) = @_;
      my $mtt = $stream->pos($token->pos);
      my $content = $token->content;

      # syntax
      if ((my $found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
        $mtt->add(
          term => 'cnx_syn:' . $found
        );
      };
    });

Add token information to the parsed token stream.
Expects a C<foundry> name, a C<layer> name and a C<cb> callback
parameter that is called for each parsed token. The callback receives
the L<KorAP::Field::MultiTermTokenStream> object as well as the
current token object.

An optional parameter C<encoding> may indicate whether the token
offsets refer to C<bytes> or C<utf-8> offsets.

An optional parameter C<skip> allows for skipping the process.
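
If no C<cb> callback is passed, the parsed L<KorAP::Tokenizer::Tokens>
object is returned instead, so alignment information can be inspected
directly. A minimal sketch, reusing the foundry and layer from the
example above:

  my $tokendata = $tokens->add_tokendata(
    foundry => 'connexor',
    layer   => 'syntax'
  );

  # Number of expected vs. indexed annotations
  print $tokendata->should, ' / ', $tokendata->have, "\n";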

=cut