Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 1 | package KorAP::Tokenizer; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 2 | use Mojo::Base -base; |
| 3 | use Mojo::ByteStream 'b'; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 4 | use Mojo::Loader; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 5 | use XML::Fast; |
| 6 | use Try::Tiny; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 7 | use Carp qw/croak/; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 8 | use KorAP::Tokenizer::Range; |
| 9 | use KorAP::Tokenizer::Match; |
| 10 | use KorAP::Tokenizer::Spans; |
| 11 | use KorAP::Tokenizer::Tokens; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 12 | use KorAP::Field::MultiTermTokenStream; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 13 | use List::MoreUtils 'uniq'; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 14 | use JSON::XS; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 15 | use Log::Log4perl; |
| 16 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 17 | has [qw/path foundry doc stream should have name/]; |
| 18 | has layer => 'Tokens'; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 19 | |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 20 | has log => sub { |
| 21 | if(Log::Log4perl->initialized()) { |
| 22 | state $log = Log::Log4perl->get_logger(__PACKAGE__); |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 23 | }; |
| 24 | state $log = KorAP::Log->new; |
| 25 | return $log; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 26 | }; |
| 27 | |
| 28 | # Parse tokens of the document |
| 29 | sub parse { |
| 30 | my $self = shift; |
| 31 | |
| 32 | # Create new token stream |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 33 | my $mtts = KorAP::Field::MultiTermTokenStream->new; |
Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 34 | my $path = $self->path . lc($self->foundry) . '/' . lc($self->layer) . '.xml'; |
| 35 | my $file = b($path)->slurp; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 36 | # my $tokens = Mojo::DOM->new($file); |
| 37 | # $tokens->xml(1); |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 38 | |
| 39 | my $doc = $self->doc; |
| 40 | |
| 41 | my ($should, $have) = (0, 0); |
| 42 | |
| 43 | # Create range and match objects |
| 44 | my $range = KorAP::Tokenizer::Range->new; |
| 45 | my $match = KorAP::Tokenizer::Match->new; |
| 46 | |
| 47 | my $old = 0; |
| 48 | |
| 49 | $self->log->trace('Tokenize data ' . $self->foundry . ':' . $self->layer); |
| 50 | |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 51 | # TODO: Reuse the following code from Spans.pm and tokens.pm |
| 52 | my ($tokens, $error); |
| 53 | try { |
| 54 | local $SIG{__WARN__} = sub { |
| 55 | $error = 1; |
| 56 | }; |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 57 | $tokens = xml2hash($file, text => '#text', array => ['span'], attr => '-')->{layer}->{spanList}; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 58 | } |
| 59 | catch { |
| 60 | $self->log->warn('Token error in ' . $path . ($_ ? ': ' . $_ : '')); |
| 61 | $error = 1; |
| 62 | }; |
| 63 | |
| 64 | return if $error; |
| 65 | |
| 66 | if (ref $tokens && $tokens->{span}) { |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 67 | $tokens = $tokens->{span}; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 68 | } |
| 69 | else { |
Nils Diewald | 21a3e1a | 2014-04-28 18:48:16 +0000 | [diff] [blame] | 70 | return $self; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 71 | }; |
| 72 | |
| 73 | $tokens = [$tokens] if ref $tokens ne 'ARRAY'; |
| 74 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 75 | # Iterate over all tokens |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 76 | # $tokens->find('span')->each( |
| 77 | # sub { |
| 78 | # my $span = $_; |
| 79 | foreach my $span (@$tokens) { |
| 80 | my $from = $span->{'-from'}; |
| 81 | my $to = $span->{'-to'}; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 82 | my $token = $doc->primary->data($from, $to); |
| 83 | |
Nils Diewald | 3ece630 | 2013-12-02 18:38:16 +0000 | [diff] [blame] | 84 | # warn 'Has ' . $from . '->' . $to . "($old)"; |
| 85 | |
Nils Diewald | aba4710 | 2013-11-27 15:02:47 +0000 | [diff] [blame] | 86 | unless (defined $token) { |
Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 87 | $self->log->error("Unable to find substring [$from-$to] in $path"); |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 88 | next; |
Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 89 | }; |
| 90 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 91 | $should++; |
| 92 | |
| 93 | # Ignore non-word tokens |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 94 | next if $token !~ /[\w\d]/; |
| 95 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 96 | my $mtt = $mtts->add; |
| 97 | |
| 98 | # Add gap for later finding matching positions before or after |
| 99 | $range->gap($old, $from, $have) unless $old >= $from; |
| 100 | |
| 101 | # Add surface term |
Nils Diewald | f03c680 | 2014-07-21 16:39:44 +0000 | [diff] [blame] | 102 | # That's always the first term! |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 103 | $mtt->add('s:' . $token); |
| 104 | |
| 105 | # Add case insensitive term |
| 106 | $mtt->add('i:' . lc $token); |
| 107 | |
| 108 | # Add offset information |
| 109 | $mtt->o_start($from); |
| 110 | $mtt->o_end($to); |
| 111 | |
| 112 | # Store offset information for position matching |
| 113 | $range->set($from, $to, $have); |
| 114 | $match->set($from, $to, $have); |
| 115 | |
| 116 | $old = $to + 1; |
| 117 | |
| 118 | # Add position term |
| 119 | $mtt->add('_' . $have . '#' . $mtt->o_start . '-' . $mtt->o_end); |
| 120 | |
| 121 | $have++; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 122 | }; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 123 | |
| 124 | # Add token count |
Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 125 | $mtts->add_meta('tokens', '<i>' . $have); |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 126 | |
Nils Diewald | 3ece630 | 2013-12-02 18:38:16 +0000 | [diff] [blame] | 127 | $range->gap($old, $doc->primary->data_length + 1, $have-1) if $doc->primary->data_length >= ($old - 1); |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 128 | |
| 129 | # Add info |
| 130 | $self->stream($mtts); |
| 131 | $self->{range} = $range; |
| 132 | $self->{match} = $match; |
| 133 | $self->should($should); |
| 134 | $self->have($have); |
| 135 | |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 136 | $self->log->debug('With a non-word quota of ' . _perc($self->should, $self->should - $self->have) . ' %'); |
Nils Diewald | 21a3e1a | 2014-04-28 18:48:16 +0000 | [diff] [blame] | 137 | |
| 138 | return $self; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 139 | }; |
| 140 | |
Nils Diewald | f03c680 | 2014-07-21 16:39:44 +0000 | [diff] [blame] | 141 | sub add_subtokens { |
| 142 | my $self = shift; |
| 143 | my $mtts = $self->stream or return; |
| 144 | |
| 145 | foreach my $mtt (@{$mtts->multi_term_tokens}) { |
| 146 | my $o_start = $mtt->o_start; |
| 147 | my $o_end = $mtt->o_end; |
| 148 | my $l = $o_end - $o_start; |
| 149 | |
| 150 | my $s = substr($mtt->lc_surface,2); |
Nils Diewald | f03c680 | 2014-07-21 16:39:44 +0000 | [diff] [blame] | 151 | my $os = $s; |
| 152 | |
| 153 | # Algorithm based on aggressive tokenization in |
| 154 | # tokenize.pl from Carsten Schnober |
| 155 | $s =~ s/[[:alpha:]]/a/g; |
| 156 | $s =~ s/[[:digit:]]/0/g; |
| 157 | $s =~ s/\p{Punct}/#/g; |
| 158 | $s =~ y/~/A/; |
| 159 | $s .= 'E'; |
| 160 | |
| 161 | while ($s =~ /(a+)[^a]/g) { |
| 162 | my $from = $-[1]; |
| 163 | my $to = $+[1]; |
| 164 | $mtt->add( |
| 165 | term => 'i^1:' . substr($os, $from, $from + $to), |
| 166 | o_start => $from + $o_start, |
| 167 | o_end => $to + $o_start |
| 168 | ) unless $to - $from == $l; |
| 169 | }; |
| 170 | while ($s =~ /(0+)[^0]/g) { |
| 171 | my $from = $-[1]; |
| 172 | my $to = $+[1]; |
| 173 | $mtt->add( |
| 174 | term => 'i^2:' . substr($os, $from, $from + $to), |
| 175 | o_start => $from + $o_start, |
| 176 | o_end => $to + $o_start |
| 177 | ) unless $to - $from == $l; |
| 178 | }; |
| 179 | while ($s =~ /(#)/g) { |
| 180 | my $from = $-[1]; |
| 181 | my $to = $+[1]; |
| 182 | $mtt->add( |
| 183 | term => 'i^3:' . substr($os, $from, $from + $to), |
| 184 | o_start => $from + $o_start, |
| 185 | o_end => $to + $o_start |
| 186 | ) unless $to - $from == $l; |
| 187 | }; |
| 188 | }; |
| 189 | |
Nils Diewald | 032e31d | 2014-07-21 18:39:12 +0000 | [diff] [blame^] | 190 | return $self; |
Nils Diewald | f03c680 | 2014-07-21 16:39:44 +0000 | [diff] [blame] | 191 | }; |
| 192 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 193 | |
| 194 | # Get span positions through character offsets |
| 195 | sub range { |
| 196 | return shift->{range} // KorAP::Tokenizer::Range->new; |
| 197 | }; |
| 198 | |
| 199 | |
| 200 | # Get token positions through character offsets |
| 201 | sub match { |
| 202 | return shift->{match} // KorAP::Tokenizer::Match->new; |
| 203 | }; |
| 204 | |
| 205 | |
| 206 | # Add information of spans to the tokens |
| 207 | sub add_spandata { |
| 208 | my $self = shift; |
| 209 | my %param = @_; |
| 210 | |
| 211 | croak 'No token data available' unless $self->stream; |
| 212 | |
| 213 | $self->log->trace( |
| 214 | ($param{skip} ? 'Skip' : 'Add').' span data '.$param{foundry}.':'.$param{layer} |
| 215 | ); |
| 216 | |
| 217 | return if $param{skip}; |
| 218 | |
| 219 | my $cb = delete $param{cb}; |
| 220 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 221 | $param{primary} = $self->doc->primary; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 222 | |
| 223 | my $spans = KorAP::Tokenizer::Spans->new( |
| 224 | path => $self->path, |
| 225 | range => $self->range, |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 226 | match => $self->match, |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 227 | %param |
| 228 | ); |
| 229 | |
Nils Diewald | aba4710 | 2013-11-27 15:02:47 +0000 | [diff] [blame] | 230 | my $spanarray = $spans->parse or return; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 231 | |
| 232 | if ($spans->should == $spans->have) { |
| 233 | $self->log->trace('With perfect alignment!'); |
| 234 | } |
| 235 | else { |
| 236 | $self->log->debug('With an alignment quota of ' . _perc($spans->should, $spans->have) . ' %'); |
| 237 | }; |
| 238 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 239 | if ($cb) { |
| 240 | foreach (@$spanarray) { |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 241 | $cb->($self->stream, $_, $spans); |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 242 | }; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 243 | return 1; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 244 | }; |
| 245 | return $spans; |
| 246 | }; |
| 247 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 248 | # Add information to the tokens |
| 249 | sub add_tokendata { |
| 250 | my $self = shift; |
| 251 | my %param = @_; |
| 252 | |
| 253 | croak 'No token data available' unless $self->stream; |
| 254 | |
| 255 | $self->log->trace( |
| 256 | ($param{skip} ? 'Skip' : 'Add').' token data '.$param{foundry}.':'.$param{layer} |
| 257 | ); |
| 258 | return if $param{skip}; |
| 259 | |
| 260 | my $cb = delete $param{cb}; |
| 261 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 262 | $param{primary} = $self->doc->primary; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 263 | |
| 264 | my $tokens = KorAP::Tokenizer::Tokens->new( |
| 265 | path => $self->path, |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 266 | range => $self->range, |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 267 | match => $self->match, |
| 268 | %param |
| 269 | ); |
| 270 | |
Nils Diewald | aba4710 | 2013-11-27 15:02:47 +0000 | [diff] [blame] | 271 | my $tokenarray = $tokens->parse or return; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 272 | |
| 273 | if ($tokens->should == $tokens->have) { |
| 274 | $self->log->trace('With perfect alignment!'); |
| 275 | } |
| 276 | else { |
| 277 | my $perc = _perc( |
| 278 | $tokens->should, $tokens->have, $self->should, $self->should - $self->have |
| 279 | ); |
| 280 | $self->log->debug('With an alignment quota of ' . $perc); |
| 281 | }; |
| 282 | |
| 283 | if ($cb) { |
| 284 | foreach (@$tokenarray) { |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 285 | $cb->($self->stream, $_, $tokens); |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 286 | }; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 287 | return 1; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 288 | }; |
| 289 | return $tokens; |
| 290 | }; |
| 291 | |
| 292 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 293 | sub add { |
| 294 | my $self = shift; |
| 295 | my $loader = Mojo::Loader->new; |
| 296 | my $foundry = shift; |
| 297 | my $layer = shift; |
Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 298 | |
| 299 | unless ($foundry && $layer) { |
| 300 | warn 'Unable to add specific module - not enough information given!'; |
| 301 | return; |
| 302 | }; |
| 303 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 304 | my $mod = 'KorAP::Index::' . $foundry . '::' . $layer; |
| 305 | |
| 306 | if ($mod->can('new') || eval("require $mod; 1;")) { |
| 307 | if (my $retval = $mod->new($self)->parse(@_)) { |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 308 | |
| 309 | # This layer is supported |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 310 | $self->support($foundry => $layer, @_); |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 311 | |
| 312 | # Get layerinfo |
| 313 | $self->layer_info($mod->layer_info); |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 314 | return $retval; |
| 315 | }; |
| 316 | } |
| 317 | else { |
| 318 | $self->log->error('Unable to load '.$mod . '(' . $@ . ')'); |
| 319 | }; |
| 320 | |
| 321 | return; |
| 322 | }; |
| 323 | |
| 324 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 325 | sub _perc { |
| 326 | if (@_ == 2) { |
| 327 | # '[' . $_[0] . '/' . $_[1] . ']' . |
| 328 | return sprintf("%.2f", ($_[1] * 100) / $_[0]); |
| 329 | } |
| 330 | |
| 331 | my $a_should = shift; |
| 332 | my $a_have = shift; |
| 333 | my $b_should = shift; |
| 334 | my $b_have = shift; |
| 335 | my $a_quota = ($a_have * 100) / $a_should; |
| 336 | my $b_quota = ($b_have * 100) / $b_should; |
| 337 | return sprintf("%.2f", $a_quota) . '%' . |
| 338 | ((($a_quota + $b_quota) <= 100) ? |
| 339 | ' [' . sprintf("%.2f", $a_quota + $b_quota) . '%]' : ''); |
| 340 | }; |
| 341 | |
| 342 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 343 | sub support { |
| 344 | my $self = shift; |
Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 345 | |
| 346 | # No setting - just getting |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 347 | unless ($_[0]) { |
Nils Diewald | d9c1661 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 348 | my @supports; |
Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 349 | |
| 350 | # Get all foundries |
Nils Diewald | d9c1661 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 351 | foreach my $foundry (keys %{$self->{support}}) { |
| 352 | push(@supports, $foundry); |
Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 353 | |
| 354 | # Get all layers |
Nils Diewald | d9c1661 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 355 | foreach my $layer (@{$self->{support}->{$foundry}}) { |
| 356 | my @layers = @$layer; |
Nils Diewald | 37e5b57 | 2013-11-20 20:26:03 +0000 | [diff] [blame] | 357 | push(@supports, $foundry . '/' . $layers[0]); |
Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 358 | |
| 359 | # More information |
Nils Diewald | d9c1661 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 360 | if ($layers[1]) { |
Nils Diewald | 37e5b57 | 2013-11-20 20:26:03 +0000 | [diff] [blame] | 361 | push(@supports, $foundry . '/' . join('/', @layers)); |
Nils Diewald | d9c1661 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 362 | }; |
| 363 | }; |
| 364 | }; |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 365 | return lc ( join ' ', sort {$a cmp $b } @supports ); |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 366 | } |
| 367 | elsif (!$_[1]) { |
| 368 | return $self->{support}->{$_[0]} // [] |
| 369 | }; |
| 370 | my $f = lc shift; |
| 371 | my $l = lc shift; |
| 372 | my @info = @_; |
| 373 | $self->{support} //= {}; |
| 374 | $self->{support}->{$f} //= []; |
| 375 | push(@{$self->{support}->{$f}}, [$l, @info]); |
| 376 | }; |
| 377 | |
Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 378 | |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 379 | sub layer_info { |
| 380 | my $self = shift; |
| 381 | $self->{layer_info} //= []; |
| 382 | if ($_[0]) { |
| 383 | push(@{$self->{layer_info}}, @{$_[0]}); |
| 384 | } |
| 385 | else { |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 386 | return join ' ', sort {$a cmp $b } uniq @{$self->{layer_info}}; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 387 | }; |
| 388 | }; |
| 389 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 390 | |
| 391 | sub to_string { |
| 392 | my $self = shift; |
| 393 | my $primary = defined $_[0] ? $_[0] : 1; |
| 394 | my $string = "<meta>\n"; |
| 395 | $string .= $self->doc->to_string; |
| 396 | $string .= "</meta>\n"; |
| 397 | if ($primary) { |
| 398 | $string .= "<text>\n"; |
| 399 | $string .= $self->doc->primary->data . "\n"; |
| 400 | $string .= "</text>\n"; |
| 401 | }; |
| 402 | $string .= '<field name="' . $self->name . "\">\n"; |
| 403 | $string .= "<info>\n"; |
| 404 | $string .= 'tokenization = ' . $self->foundry . '#' . $self->layer . "\n"; |
Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 405 | |
| 406 | # There is support info |
| 407 | if (my $support = $self->support) { |
| 408 | $string .= 'support = ' . $support . "\n"; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 409 | }; |
Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 410 | if (my $layer_info = $self->layer_info) { |
| 411 | $string .= 'layer_info = ' . $layer_info . "\n"; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 412 | }; |
| 413 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 414 | $string .= "</info>\n"; |
| 415 | $string .= $self->stream->to_string; |
| 416 | $string .= "</field>"; |
| 417 | return $string; |
| 418 | }; |
| 419 | |
| 420 | sub to_data { |
| 421 | my $self = shift; |
| 422 | my $primary = defined $_[0] ? $_[0] : 1; |
Nils Diewald | 044c41d | 2013-11-11 21:45:09 +0000 | [diff] [blame] | 423 | my %data = %{$self->doc->to_hash}; |
| 424 | |
| 425 | my @fields; |
| 426 | push(@fields, { primaryData => $self->doc->primary->data }) if $primary; |
| 427 | |
| 428 | push(@fields, { |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 429 | name => $self->name, |
| 430 | data => $self->stream->to_array, |
Nils Diewald | d9c1661 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 431 | tokenization => lc($self->foundry) . '#' . lc($self->layer), |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 432 | foundries => $self->support, |
| 433 | layerInfo => $self->layer_info |
Nils Diewald | 044c41d | 2013-11-11 21:45:09 +0000 | [diff] [blame] | 434 | }); |
| 435 | |
| 436 | $data{fields} = \@fields; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 437 | \%data; |
| 438 | }; |
| 439 | |
Nils Diewald | d9c1661 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 440 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 441 | sub to_json { |
| 442 | encode_json($_[0]->to_data($_[1])); |
| 443 | }; |
| 444 | |
| 445 | |
| 446 | sub to_pretty_json { |
| 447 | JSON::XS->new->pretty->encode($_[0]->to_data($_[1])); |
| 448 | }; |
| 449 | |
| 450 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 451 | 1; |
| 452 | |
| 453 | |
| 454 | __END__ |
| 455 | |
| 456 | =pod |
| 457 | |
| 458 | =head1 NAME |
| 459 | |
| 460 | KorAP::Tokenizer |
| 461 | |
| 462 | =head1 SYNOPSIS |
| 463 | |
| 464 | my $tokens = KorAP::Tokenizer->new( |
| 465 | path => '../examples/00003', |
| 466 | doc => KorAP::Document->new( ... ), |
| 467 | foundry => 'opennlp', |
| 468 | layer => 'tokens' |
| 469 | ); |
| 470 | |
| 471 | $tokens->parse; |
| 472 | |
| 473 | =head1 DESCRIPTION |
| 474 | |
| 475 | Convert token information from the KorAP XML |
| 476 | format into Lucene Index compatible token streams. |
| 477 | |
| 478 | =head1 ATTRIBUTES |
| 479 | |
| 480 | =head2 path |
| 481 | |
| 482 | print $tokens->path; |
| 483 | |
| 484 | The path of the document. |
| 485 | |
| 486 | |
| 487 | =head2 foundry |
| 488 | |
| 489 | print $tokens->foundry; |
| 490 | |
| 491 | The name of the foundry. |
| 492 | |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 493 | =head2 should |
| 494 | |
| 495 | Number of tokens that exist at all. |
| 496 | |
| 497 | =head2 have |
| 498 | |
| 499 | Number of tokens effectively stored in the token stream (e.g., no punctuations). |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 500 | |
| 501 | =head2 layer |
| 502 | |
| 503 | print $tokens->layer; |
| 504 | |
| 505 | The name of the tokens layer. |
| 506 | |
| 507 | |
| 508 | =head2 doc |
| 509 | |
| 510 | print $tokens->doc->corpus_id; |
| 511 | |
| 512 | The L<KorAP::Document> object. |
| 513 | |
| 514 | |
| 515 | =head2 stream |
| 516 | |
| 517 | $tokens->stream->add_meta('adjCount', '<i>45'); |
| 518 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 519 | The L<KorAP::Field::MultiTermTokenStream> object |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 520 | |
| 521 | |
| 522 | =head2 range |
| 523 | |
| 524 | $tokens->range->lookup(45); |
| 525 | |
| 526 | The L<KorAP::Tokenizer::Range> object for converting span offsets to positions. |
| 527 | |
| 528 | =head2 match |
| 529 | |
| 530 | $tokens->match->lookup(45); |
| 531 | |
| 532 | The L<KorAP::Tokenizer::Match> object for converting token offsets to positions. |
| 533 | |
| 534 | |
| 535 | =head1 METHODS |
| 536 | |
| 537 | =head2 parse |
| 538 | |
| 539 | $tokens->parse; |
| 540 | |
| 541 | Start the tokenization process. |
| 542 | |
| 543 | |
Nils Diewald | f03c680 | 2014-07-21 16:39:44 +0000 | [diff] [blame] | 544 | =head2 add_subtokens |
| 545 | |
| 546 | $tokens->split_tokens; |
| 547 | $tokens->split_tokens( |
| 548 | sub { |
| 549 | ... |
| 550 | } |
| 551 | ); |
| 552 | |
| 553 | Add sub token information to the index. |
| 554 | This is based on the C<aggressive> tokenization, written by Carsten Schnober. |
| 555 | |
| 556 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 557 | =head2 add_spandata |
| 558 | |
| 559 | $tokens->add_spandata( |
| 560 | foundry => 'base', |
| 561 | layer => 'sentences', |
| 562 | cb => sub { |
| 563 | my ($stream, $span) = @_; |
| 564 | my $mtt = $stream->pos($span->p_start); |
| 565 | $mtt->add( |
| 566 | term => '<>:s', |
| 567 | o_start => $span->o_start, |
| 568 | o_end => $span->o_end, |
| 569 | p_end => $span->p_end |
| 570 | ); |
| 571 | } |
| 572 | ); |
| 573 | |
| 574 | Add span information to the parsed token stream. |
| 575 | Expects a C<foundry> name, a C<layer> name and a |
| 576 | callback parameter, that will be called after each parsed |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 577 | span. The L<KorAP::Field::MultiTermTokenStream> object will be passed, |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 578 | as well as the current L<KorAP::Tokenizer::Span>. |
| 579 | |
| 580 | An optional parameter C<encoding> may indicate that the span offsets |
| 581 | are either refering to C<bytes> or C<utf-8> offsets. |
| 582 | |
| 583 | An optional parameter C<skip> allows for skipping the process. |
| 584 | |
| 585 | |
| 586 | =head2 add_tokendata |
| 587 | |
| 588 | $tokens->add_tokendata( |
| 589 | foundry => 'connexor', |
| 590 | layer => 'syntax', |
| 591 | cb => sub { |
| 592 | my ($stream, $token) = @_; |
| 593 | my $mtt = $stream->pos($token->pos); |
| 594 | my $content = $token->content; |
| 595 | |
| 596 | # syntax |
| 597 | if ((my $found = $content->at('f[name="pos"]')) && ($found = $found->text)) { |
| 598 | $mtt->add( |
| 599 | term => 'cnx_syn:' . $found |
| 600 | ); |
| 601 | }; |
| 602 | }); |
| 603 | |
| 604 | Add token information to the parsed token stream. |
| 605 | Expects a C<foundry> name, a C<layer> name and a |
| 606 | callback parameter, that will be called after each parsed |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 607 | token. The L<KorAP::Field::MultiTermTokenStream> object will be passed, |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 608 | as well as the current L<KorAP::Tokenizer::Span>. |
| 609 | |
| 610 | An optional parameter C<encoding> may indicate that the token offsets |
| 611 | are either refering to C<bytes> or C<utf-8> offsets. |
| 612 | |
| 613 | An optional parameter C<skip> allows for skipping the process. |
| 614 | |
| 615 | =cut |