Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 1 | package KorAP::Tokenizer; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 2 | use Mojo::Base -base; |
| 3 | use Mojo::ByteStream 'b'; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 4 | use Mojo::Loader; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 5 | use XML::Fast; |
| 6 | use Try::Tiny; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 7 | use Carp qw/croak/; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 8 | use KorAP::Tokenizer::Range; |
| 9 | use KorAP::Tokenizer::Match; |
| 10 | use KorAP::Tokenizer::Spans; |
| 11 | use KorAP::Tokenizer::Tokens; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 12 | use KorAP::Field::MultiTermTokenStream; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 13 | use List::MoreUtils 'uniq'; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 14 | use JSON::XS; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 15 | use Log::Log4perl; |
| 16 | |
Nils Diewald | 840c924 | 2014-10-28 19:51:26 +0000 | [diff] [blame] | 17 | # TODO 1: |
| 18 | # Bei den Autoren im Index darauf achten, dass auch "etc." indiziert wird |
| 19 | |
| 20 | # TODO 2: |
Nils Diewald | a96de62 | 2014-10-31 17:29:23 +0000 | [diff] [blame] | 21 | # That's now implemented but deactivated |
| 22 | |
Nils Diewald | 6dab7ea | 2014-09-10 17:37:19 +0000 | [diff] [blame] | 23 | # Add punktuations to the index |
| 24 | # [Er sagte: "Hallo - na?"] becomes |
| 25 | # [s:Er|tt/l:er|_1#0-2] |
| 26 | # [s:sagte|tt/l:sagen|_2#3-8|.>::#8-9$1|.>tt/l:PUNCT#8-9$1|.>:"#10-11$2|.>tt/l:PUNCT#10-11$2] |
Nils Diewald | a96de62 | 2014-10-31 17:29:23 +0000 | [diff] [blame] | 27 | # [s:Hallo|tt/l:hallo|_3#11-16|.<::#8-9$2|.<tt/l:PUNCT#8-9$2|.<:"#10-11$1|.<:tt/l:PUNCT#10-11$1|.>:-#17-18$1|.>tt/l:PUNCT#17-18$1] |
Nils Diewald | 6dab7ea | 2014-09-10 17:37:19 +0000 | [diff] [blame] | 28 | # [s:na|tt/l:na|_4#19-21|.<:-#17-18$1|.<tt/l:PUNCT#17-18$1|.>:?#21-22$1|.>tt/l:PUNCT#21-22$1|.>:"#22-23$2|.>tt/l:PUNCT#22-23$2] |
| 29 | |
Nils Diewald | 840c924 | 2014-10-28 19:51:26 +0000 | [diff] [blame] | 30 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 31 | has [qw/path foundry doc stream should have name/]; |
| 32 | has layer => 'Tokens'; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 33 | |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 34 | has log => sub { |
| 35 | if(Log::Log4perl->initialized()) { |
| 36 | state $log = Log::Log4perl->get_logger(__PACKAGE__); |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 37 | }; |
| 38 | state $log = KorAP::Log->new; |
| 39 | return $log; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 40 | }; |
| 41 | |
| 42 | # Parse tokens of the document |
| 43 | sub parse { |
| 44 | my $self = shift; |
| 45 | |
| 46 | # Create new token stream |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 47 | my $mtts = KorAP::Field::MultiTermTokenStream->new; |
Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 48 | my $path = $self->path . lc($self->foundry) . '/' . lc($self->layer) . '.xml'; |
| 49 | my $file = b($path)->slurp; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 50 | # my $tokens = Mojo::DOM->new($file); |
| 51 | # $tokens->xml(1); |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 52 | |
| 53 | my $doc = $self->doc; |
| 54 | |
| 55 | my ($should, $have) = (0, 0); |
| 56 | |
| 57 | # Create range and match objects |
| 58 | my $range = KorAP::Tokenizer::Range->new; |
| 59 | my $match = KorAP::Tokenizer::Match->new; |
| 60 | |
| 61 | my $old = 0; |
| 62 | |
| 63 | $self->log->trace('Tokenize data ' . $self->foundry . ':' . $self->layer); |
| 64 | |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 65 | # TODO: Reuse the following code from Spans.pm and tokens.pm |
| 66 | my ($tokens, $error); |
| 67 | try { |
| 68 | local $SIG{__WARN__} = sub { |
| 69 | $error = 1; |
| 70 | }; |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 71 | $tokens = xml2hash($file, text => '#text', array => ['span'], attr => '-')->{layer}->{spanList}; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 72 | } |
| 73 | catch { |
| 74 | $self->log->warn('Token error in ' . $path . ($_ ? ': ' . $_ : '')); |
| 75 | $error = 1; |
| 76 | }; |
| 77 | |
| 78 | return if $error; |
| 79 | |
| 80 | if (ref $tokens && $tokens->{span}) { |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 81 | $tokens = $tokens->{span}; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 82 | } |
| 83 | else { |
Nils Diewald | 21a3e1a | 2014-04-28 18:48:16 +0000 | [diff] [blame] | 84 | return $self; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 85 | }; |
| 86 | |
| 87 | $tokens = [$tokens] if ref $tokens ne 'ARRAY'; |
| 88 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 89 | # Iterate over all tokens |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 90 | # $tokens->find('span')->each( |
| 91 | # sub { |
| 92 | # my $span = $_; |
Nils Diewald | a96de62 | 2014-10-31 17:29:23 +0000 | [diff] [blame] | 93 | my $mtt; |
| 94 | my $distance = 0; |
| 95 | my (@non_word_tokens); |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 96 | foreach my $span (@$tokens) { |
| 97 | my $from = $span->{'-from'}; |
| 98 | my $to = $span->{'-to'}; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 99 | my $token = $doc->primary->data($from, $to); |
| 100 | |
Nils Diewald | 3ece630 | 2013-12-02 18:38:16 +0000 | [diff] [blame] | 101 | # warn 'Has ' . $from . '->' . $to . "($old)"; |
| 102 | |
Nils Diewald | aba4710 | 2013-11-27 15:02:47 +0000 | [diff] [blame] | 103 | unless (defined $token) { |
Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 104 | $self->log->error("Unable to find substring [$from-$to] in $path"); |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 105 | next; |
Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 106 | }; |
| 107 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 108 | $should++; |
| 109 | |
| 110 | # Ignore non-word tokens |
Nils Diewald | a96de62 | 2014-10-31 17:29:23 +0000 | [diff] [blame] | 111 | if ($token !~ /[\w\d]/) { |
| 112 | # if ($mtt) { |
| 113 | # my $term = [$token, $from, $to]; |
| 114 | # $mtt->add( |
| 115 | # term => '.>:'.$token, |
| 116 | # payload => '<i>'.$from . '<i>' . $to . '<b>' . $distance++ |
| 117 | # ); |
| 118 | # push(@non_word_tokens, $term); |
| 119 | # } |
| 120 | next; |
| 121 | }; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 122 | |
Nils Diewald | a96de62 | 2014-10-31 17:29:23 +0000 | [diff] [blame] | 123 | $mtt = $mtts->add; |
| 124 | |
| 125 | # while (scalar @non_word_tokens) { |
| 126 | # local $_ = shift @non_word_tokens; |
| 127 | # $mtt->add( |
| 128 | # term => '.<:' . $_->[0], |
| 129 | # payload => '<i>' . $_->[1] . '<i>' . $_->[2] . '<b>' . --$distance |
| 130 | # ); |
| 131 | # $distance = 0; |
| 132 | # }; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 133 | |
| 134 | # Add gap for later finding matching positions before or after |
| 135 | $range->gap($old, $from, $have) unless $old >= $from; |
| 136 | |
| 137 | # Add surface term |
Nils Diewald | f03c680 | 2014-07-21 16:39:44 +0000 | [diff] [blame] | 138 | # That's always the first term! |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 139 | $mtt->add('s:' . $token); |
| 140 | |
| 141 | # Add case insensitive term |
| 142 | $mtt->add('i:' . lc $token); |
| 143 | |
| 144 | # Add offset information |
| 145 | $mtt->o_start($from); |
| 146 | $mtt->o_end($to); |
| 147 | |
| 148 | # Store offset information for position matching |
| 149 | $range->set($from, $to, $have); |
| 150 | $match->set($from, $to, $have); |
| 151 | |
| 152 | $old = $to + 1; |
| 153 | |
| 154 | # Add position term |
| 155 | $mtt->add('_' . $have . '#' . $mtt->o_start . '-' . $mtt->o_end); |
| 156 | |
| 157 | $have++; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 158 | }; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 159 | |
| 160 | # Add token count |
Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 161 | $mtts->add_meta('tokens', '<i>' . $have); |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 162 | |
Nils Diewald | a96de62 | 2014-10-31 17:29:23 +0000 | [diff] [blame] | 163 | if ($doc->primary->data_length >= ($old - 1)) { |
| 164 | $range->gap($old, $doc->primary->data_length + 1, $have-1) |
| 165 | }; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 166 | |
| 167 | # Add info |
| 168 | $self->stream($mtts); |
| 169 | $self->{range} = $range; |
| 170 | $self->{match} = $match; |
| 171 | $self->should($should); |
| 172 | $self->have($have); |
| 173 | |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 174 | $self->log->debug('With a non-word quota of ' . _perc($self->should, $self->should - $self->have) . ' %'); |
Nils Diewald | 21a3e1a | 2014-04-28 18:48:16 +0000 | [diff] [blame] | 175 | |
| 176 | return $self; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 177 | }; |
| 178 | |
Nils Diewald | a96de62 | 2014-10-31 17:29:23 +0000 | [diff] [blame] | 179 | |
| 180 | # This is now done by glemm |
Nils Diewald | f03c680 | 2014-07-21 16:39:44 +0000 | [diff] [blame] | 181 | sub add_subtokens { |
| 182 | my $self = shift; |
| 183 | my $mtts = $self->stream or return; |
| 184 | |
| 185 | foreach my $mtt (@{$mtts->multi_term_tokens}) { |
| 186 | my $o_start = $mtt->o_start; |
| 187 | my $o_end = $mtt->o_end; |
| 188 | my $l = $o_end - $o_start; |
| 189 | |
Nils Diewald | 7658e0a | 2014-07-21 18:47:11 +0000 | [diff] [blame] | 190 | my $os = my $s = $mtt->lc_surface; |
Nils Diewald | f03c680 | 2014-07-21 16:39:44 +0000 | [diff] [blame] | 191 | |
| 192 | # Algorithm based on aggressive tokenization in |
| 193 | # tokenize.pl from Carsten Schnober |
| 194 | $s =~ s/[[:alpha:]]/a/g; |
| 195 | $s =~ s/[[:digit:]]/0/g; |
| 196 | $s =~ s/\p{Punct}/#/g; |
| 197 | $s =~ y/~/A/; |
| 198 | $s .= 'E'; |
| 199 | |
| 200 | while ($s =~ /(a+)[^a]/g) { |
| 201 | my $from = $-[1]; |
| 202 | my $to = $+[1]; |
| 203 | $mtt->add( |
| 204 | term => 'i^1:' . substr($os, $from, $from + $to), |
| 205 | o_start => $from + $o_start, |
| 206 | o_end => $to + $o_start |
| 207 | ) unless $to - $from == $l; |
| 208 | }; |
| 209 | while ($s =~ /(0+)[^0]/g) { |
| 210 | my $from = $-[1]; |
| 211 | my $to = $+[1]; |
| 212 | $mtt->add( |
| 213 | term => 'i^2:' . substr($os, $from, $from + $to), |
| 214 | o_start => $from + $o_start, |
| 215 | o_end => $to + $o_start |
| 216 | ) unless $to - $from == $l; |
| 217 | }; |
| 218 | while ($s =~ /(#)/g) { |
| 219 | my $from = $-[1]; |
| 220 | my $to = $+[1]; |
| 221 | $mtt->add( |
| 222 | term => 'i^3:' . substr($os, $from, $from + $to), |
| 223 | o_start => $from + $o_start, |
| 224 | o_end => $to + $o_start |
| 225 | ) unless $to - $from == $l; |
| 226 | }; |
| 227 | }; |
| 228 | |
Nils Diewald | 032e31d | 2014-07-21 18:39:12 +0000 | [diff] [blame] | 229 | return $self; |
Nils Diewald | f03c680 | 2014-07-21 16:39:44 +0000 | [diff] [blame] | 230 | }; |
| 231 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 232 | |
| 233 | # Get span positions through character offsets |
| 234 | sub range { |
| 235 | return shift->{range} // KorAP::Tokenizer::Range->new; |
| 236 | }; |
| 237 | |
| 238 | |
| 239 | # Get token positions through character offsets |
| 240 | sub match { |
| 241 | return shift->{match} // KorAP::Tokenizer::Match->new; |
| 242 | }; |
| 243 | |
| 244 | |
| 245 | # Add information of spans to the tokens |
| 246 | sub add_spandata { |
| 247 | my $self = shift; |
| 248 | my %param = @_; |
| 249 | |
| 250 | croak 'No token data available' unless $self->stream; |
| 251 | |
| 252 | $self->log->trace( |
| 253 | ($param{skip} ? 'Skip' : 'Add').' span data '.$param{foundry}.':'.$param{layer} |
| 254 | ); |
| 255 | |
| 256 | return if $param{skip}; |
| 257 | |
| 258 | my $cb = delete $param{cb}; |
| 259 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 260 | $param{primary} = $self->doc->primary; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 261 | |
| 262 | my $spans = KorAP::Tokenizer::Spans->new( |
| 263 | path => $self->path, |
| 264 | range => $self->range, |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 265 | match => $self->match, |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 266 | %param |
| 267 | ); |
| 268 | |
Nils Diewald | aba4710 | 2013-11-27 15:02:47 +0000 | [diff] [blame] | 269 | my $spanarray = $spans->parse or return; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 270 | |
| 271 | if ($spans->should == $spans->have) { |
| 272 | $self->log->trace('With perfect alignment!'); |
| 273 | } |
| 274 | else { |
| 275 | $self->log->debug('With an alignment quota of ' . _perc($spans->should, $spans->have) . ' %'); |
| 276 | }; |
| 277 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 278 | if ($cb) { |
| 279 | foreach (@$spanarray) { |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 280 | $cb->($self->stream, $_, $spans); |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 281 | }; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 282 | return 1; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 283 | }; |
| 284 | return $spans; |
| 285 | }; |
| 286 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 287 | # Add information to the tokens |
| 288 | sub add_tokendata { |
| 289 | my $self = shift; |
| 290 | my %param = @_; |
| 291 | |
| 292 | croak 'No token data available' unless $self->stream; |
| 293 | |
| 294 | $self->log->trace( |
| 295 | ($param{skip} ? 'Skip' : 'Add').' token data '.$param{foundry}.':'.$param{layer} |
| 296 | ); |
| 297 | return if $param{skip}; |
| 298 | |
| 299 | my $cb = delete $param{cb}; |
| 300 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 301 | $param{primary} = $self->doc->primary; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 302 | |
| 303 | my $tokens = KorAP::Tokenizer::Tokens->new( |
| 304 | path => $self->path, |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 305 | range => $self->range, |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 306 | match => $self->match, |
| 307 | %param |
| 308 | ); |
| 309 | |
Nils Diewald | aba4710 | 2013-11-27 15:02:47 +0000 | [diff] [blame] | 310 | my $tokenarray = $tokens->parse or return; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 311 | |
| 312 | if ($tokens->should == $tokens->have) { |
| 313 | $self->log->trace('With perfect alignment!'); |
| 314 | } |
| 315 | else { |
| 316 | my $perc = _perc( |
| 317 | $tokens->should, $tokens->have, $self->should, $self->should - $self->have |
| 318 | ); |
| 319 | $self->log->debug('With an alignment quota of ' . $perc); |
| 320 | }; |
| 321 | |
| 322 | if ($cb) { |
| 323 | foreach (@$tokenarray) { |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 324 | $cb->($self->stream, $_, $tokens); |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 325 | }; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 326 | return 1; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 327 | }; |
| 328 | return $tokens; |
| 329 | }; |
| 330 | |
| 331 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 332 | sub add { |
| 333 | my $self = shift; |
| 334 | my $loader = Mojo::Loader->new; |
| 335 | my $foundry = shift; |
| 336 | my $layer = shift; |
Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 337 | |
| 338 | unless ($foundry && $layer) { |
| 339 | warn 'Unable to add specific module - not enough information given!'; |
| 340 | return; |
| 341 | }; |
| 342 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 343 | my $mod = 'KorAP::Index::' . $foundry . '::' . $layer; |
| 344 | |
| 345 | if ($mod->can('new') || eval("require $mod; 1;")) { |
| 346 | if (my $retval = $mod->new($self)->parse(@_)) { |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 347 | |
| 348 | # This layer is supported |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 349 | $self->support($foundry => $layer, @_); |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 350 | |
| 351 | # Get layerinfo |
| 352 | $self->layer_info($mod->layer_info); |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 353 | return $retval; |
| 354 | }; |
| 355 | } |
| 356 | else { |
| 357 | $self->log->error('Unable to load '.$mod . '(' . $@ . ')'); |
| 358 | }; |
| 359 | |
| 360 | return; |
| 361 | }; |
| 362 | |
| 363 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 364 | sub _perc { |
| 365 | if (@_ == 2) { |
| 366 | # '[' . $_[0] . '/' . $_[1] . ']' . |
| 367 | return sprintf("%.2f", ($_[1] * 100) / $_[0]); |
| 368 | } |
| 369 | |
| 370 | my $a_should = shift; |
| 371 | my $a_have = shift; |
| 372 | my $b_should = shift; |
| 373 | my $b_have = shift; |
| 374 | my $a_quota = ($a_have * 100) / $a_should; |
| 375 | my $b_quota = ($b_have * 100) / $b_should; |
| 376 | return sprintf("%.2f", $a_quota) . '%' . |
| 377 | ((($a_quota + $b_quota) <= 100) ? |
| 378 | ' [' . sprintf("%.2f", $a_quota + $b_quota) . '%]' : ''); |
| 379 | }; |
| 380 | |
| 381 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 382 | sub support { |
| 383 | my $self = shift; |
Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 384 | |
| 385 | # No setting - just getting |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 386 | unless ($_[0]) { |
Nils Diewald | d9c1661 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 387 | my @supports; |
Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 388 | |
| 389 | # Get all foundries |
Nils Diewald | d9c1661 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 390 | foreach my $foundry (keys %{$self->{support}}) { |
| 391 | push(@supports, $foundry); |
Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 392 | |
| 393 | # Get all layers |
Nils Diewald | d9c1661 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 394 | foreach my $layer (@{$self->{support}->{$foundry}}) { |
| 395 | my @layers = @$layer; |
Nils Diewald | 37e5b57 | 2013-11-20 20:26:03 +0000 | [diff] [blame] | 396 | push(@supports, $foundry . '/' . $layers[0]); |
Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 397 | |
| 398 | # More information |
Nils Diewald | d9c1661 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 399 | if ($layers[1]) { |
Nils Diewald | 37e5b57 | 2013-11-20 20:26:03 +0000 | [diff] [blame] | 400 | push(@supports, $foundry . '/' . join('/', @layers)); |
Nils Diewald | d9c1661 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 401 | }; |
| 402 | }; |
| 403 | }; |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 404 | return lc ( join ' ', sort {$a cmp $b } @supports ); |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 405 | } |
| 406 | elsif (!$_[1]) { |
| 407 | return $self->{support}->{$_[0]} // [] |
| 408 | }; |
| 409 | my $f = lc shift; |
| 410 | my $l = lc shift; |
| 411 | my @info = @_; |
| 412 | $self->{support} //= {}; |
| 413 | $self->{support}->{$f} //= []; |
| 414 | push(@{$self->{support}->{$f}}, [$l, @info]); |
| 415 | }; |
| 416 | |
Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 417 | |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 418 | sub layer_info { |
| 419 | my $self = shift; |
| 420 | $self->{layer_info} //= []; |
| 421 | if ($_[0]) { |
| 422 | push(@{$self->{layer_info}}, @{$_[0]}); |
| 423 | } |
| 424 | else { |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 425 | return join ' ', sort {$a cmp $b } uniq @{$self->{layer_info}}; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 426 | }; |
| 427 | }; |
| 428 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 429 | |
| 430 | sub to_string { |
| 431 | my $self = shift; |
| 432 | my $primary = defined $_[0] ? $_[0] : 1; |
| 433 | my $string = "<meta>\n"; |
| 434 | $string .= $self->doc->to_string; |
| 435 | $string .= "</meta>\n"; |
| 436 | if ($primary) { |
| 437 | $string .= "<text>\n"; |
| 438 | $string .= $self->doc->primary->data . "\n"; |
| 439 | $string .= "</text>\n"; |
| 440 | }; |
| 441 | $string .= '<field name="' . $self->name . "\">\n"; |
| 442 | $string .= "<info>\n"; |
| 443 | $string .= 'tokenization = ' . $self->foundry . '#' . $self->layer . "\n"; |
Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 444 | |
| 445 | # There is support info |
| 446 | if (my $support = $self->support) { |
| 447 | $string .= 'support = ' . $support . "\n"; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 448 | }; |
Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 449 | if (my $layer_info = $self->layer_info) { |
| 450 | $string .= 'layer_info = ' . $layer_info . "\n"; |
Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 451 | }; |
| 452 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 453 | $string .= "</info>\n"; |
| 454 | $string .= $self->stream->to_string; |
| 455 | $string .= "</field>"; |
| 456 | return $string; |
| 457 | }; |
| 458 | |
| 459 | sub to_data { |
| 460 | my $self = shift; |
| 461 | my $primary = defined $_[0] ? $_[0] : 1; |
Nils Diewald | 32e30f0 | 2014-10-30 00:52:36 +0000 | [diff] [blame] | 462 | my $legacy = defined $_[1] ? $_[1] : 0; |
| 463 | |
Nils Diewald | 044c41d | 2013-11-11 21:45:09 +0000 | [diff] [blame] | 464 | my %data = %{$self->doc->to_hash}; |
Nils Diewald | 044c41d | 2013-11-11 21:45:09 +0000 | [diff] [blame] | 465 | my @fields; |
Nils Diewald | 044c41d | 2013-11-11 21:45:09 +0000 | [diff] [blame] | 466 | |
Nils Diewald | 32e30f0 | 2014-10-30 00:52:36 +0000 | [diff] [blame] | 467 | if ($legacy) { |
| 468 | push(@fields, { primaryData => $self->doc->primary->data }) if $primary; |
Nils Diewald | 044c41d | 2013-11-11 21:45:09 +0000 | [diff] [blame] | 469 | |
Nils Diewald | 32e30f0 | 2014-10-30 00:52:36 +0000 | [diff] [blame] | 470 | push(@fields, { |
| 471 | name => $self->name, |
| 472 | data => $self->stream->to_array, |
| 473 | tokenization => lc($self->foundry) . '#' . lc($self->layer), |
| 474 | foundries => $self->support, |
| 475 | layerInfo => $self->layer_info |
| 476 | }); |
| 477 | |
| 478 | $data{fields} = \@fields; |
| 479 | } |
| 480 | |
| 481 | else { |
| 482 | $data{primaryData} = $self->doc->primary->data if $primary; |
| 483 | $data{tokenName} = $self->name; |
| 484 | $data{data} = $self->stream->to_array; |
| 485 | $data{tokenSource} = lc($self->foundry) . '#' . lc($self->layer); |
| 486 | $data{foundries} = $self->support; |
| 487 | $data{layerInfos} = $self->layer_info; |
| 488 | $data{version} = '0.02'; |
| 489 | }; |
| 490 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 491 | \%data; |
| 492 | }; |
| 493 | |
Nils Diewald | d9c1661 | 2013-11-18 17:55:22 +0000 | [diff] [blame] | 494 | |
Nils Diewald | 32e30f0 | 2014-10-30 00:52:36 +0000 | [diff] [blame] | 495 | sub to_json_legacy { |
| 496 | encode_json($_[0]->to_data($_[1], 1)); |
| 497 | }; |
| 498 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 499 | sub to_json { |
Nils Diewald | 32e30f0 | 2014-10-30 00:52:36 +0000 | [diff] [blame] | 500 | encode_json($_[0]->to_data($_[1], 0)); |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 501 | }; |
| 502 | |
| 503 | |
| 504 | sub to_pretty_json { |
| 505 | JSON::XS->new->pretty->encode($_[0]->to_data($_[1])); |
| 506 | }; |
| 507 | |
| 508 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 509 | 1; |
| 510 | |
| 511 | |
| 512 | __END__ |
| 513 | |
| 514 | =pod |
| 515 | |
| 516 | =head1 NAME |
| 517 | |
| 518 | KorAP::Tokenizer |
| 519 | |
| 520 | =head1 SYNOPSIS |
| 521 | |
| 522 | my $tokens = KorAP::Tokenizer->new( |
| 523 | path => '../examples/00003', |
| 524 | doc => KorAP::Document->new( ... ), |
| 525 | foundry => 'opennlp', |
| 526 | layer => 'tokens' |
| 527 | ); |
| 528 | |
| 529 | $tokens->parse; |
| 530 | |
| 531 | =head1 DESCRIPTION |
| 532 | |
| 533 | Convert token information from the KorAP XML |
| 534 | format into Lucene Index compatible token streams. |
| 535 | |
| 536 | =head1 ATTRIBUTES |
| 537 | |
| 538 | =head2 path |
| 539 | |
| 540 | print $tokens->path; |
| 541 | |
| 542 | The path of the document. |
| 543 | |
| 544 | |
| 545 | =head2 foundry |
| 546 | |
| 547 | print $tokens->foundry; |
| 548 | |
| 549 | The name of the foundry. |
| 550 | |
Nils Diewald | 7b84722 | 2014-04-23 11:14:00 +0000 | [diff] [blame] | 551 | =head2 should |
| 552 | |
| 553 | Number of tokens that exist at all. |
| 554 | |
| 555 | =head2 have |
| 556 | |
| 557 | Number of tokens effectively stored in the token stream (e.g., no punctuations). |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 558 | |
| 559 | =head2 layer |
| 560 | |
| 561 | print $tokens->layer; |
| 562 | |
| 563 | The name of the tokens layer. |
| 564 | |
| 565 | |
| 566 | =head2 doc |
| 567 | |
| 568 | print $tokens->doc->corpus_id; |
| 569 | |
| 570 | The L<KorAP::Document> object. |
| 571 | |
| 572 | |
| 573 | =head2 stream |
| 574 | |
| 575 | $tokens->stream->add_meta('adjCount', '<i>45'); |
| 576 | |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 577 | The L<KorAP::Field::MultiTermTokenStream> object |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 578 | |
| 579 | |
| 580 | =head2 range |
| 581 | |
| 582 | $tokens->range->lookup(45); |
| 583 | |
| 584 | The L<KorAP::Tokenizer::Range> object for converting span offsets to positions. |
| 585 | |
| 586 | =head2 match |
| 587 | |
| 588 | $tokens->match->lookup(45); |
| 589 | |
| 590 | The L<KorAP::Tokenizer::Match> object for converting token offsets to positions. |
| 591 | |
| 592 | |
| 593 | =head1 METHODS |
| 594 | |
| 595 | =head2 parse |
| 596 | |
| 597 | $tokens->parse; |
| 598 | |
| 599 | Start the tokenization process. |
| 600 | |
| 601 | |
Nils Diewald | 32e30f0 | 2014-10-30 00:52:36 +0000 | [diff] [blame] | 602 | =head2 to_json_legacy |
| 603 | |
| 604 | print $tokens->to_json_legacy; |
| 605 | print $tokens->to_json_legacy(1); |
| 606 | |
| 607 | Return the token data in legacy JSON format. |
| 608 | An optional parsed boolean parameter indicates, |
| 609 | if primary data should be included. |
| 610 | |
| 611 | =head2 to_json |
| 612 | |
| 613 | print $tokens->to_json; |
| 614 | print $tokens->to_json(1); |
| 615 | |
| 616 | Return the token data in JSON format |
| 617 | An optional parsed boolean parameter indicates, |
| 618 | if primary data should be included. |
| 619 | |
| 620 | |
Nils Diewald | f03c680 | 2014-07-21 16:39:44 +0000 | [diff] [blame] | 621 | =head2 add_subtokens |
| 622 | |
| 623 | $tokens->split_tokens; |
| 624 | $tokens->split_tokens( |
| 625 | sub { |
| 626 | ... |
| 627 | } |
| 628 | ); |
| 629 | |
| 630 | Add sub token information to the index. |
| 631 | This is based on the C<aggressive> tokenization, written by Carsten Schnober. |
| 632 | |
| 633 | |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 634 | =head2 add_spandata |
| 635 | |
| 636 | $tokens->add_spandata( |
| 637 | foundry => 'base', |
| 638 | layer => 'sentences', |
| 639 | cb => sub { |
| 640 | my ($stream, $span) = @_; |
| 641 | my $mtt = $stream->pos($span->p_start); |
| 642 | $mtt->add( |
| 643 | term => '<>:s', |
| 644 | o_start => $span->o_start, |
| 645 | o_end => $span->o_end, |
| 646 | p_end => $span->p_end |
| 647 | ); |
| 648 | } |
| 649 | ); |
| 650 | |
| 651 | Add span information to the parsed token stream. |
| 652 | Expects a C<foundry> name, a C<layer> name and a |
| 653 | callback parameter, that will be called after each parsed |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 654 | span. The L<KorAP::Field::MultiTermTokenStream> object will be passed, |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 655 | as well as the current L<KorAP::Tokenizer::Span>. |
| 656 | |
| 657 | An optional parameter C<encoding> may indicate that the span offsets |
| 658 | are either refering to C<bytes> or C<utf-8> offsets. |
| 659 | |
| 660 | An optional parameter C<skip> allows for skipping the process. |
| 661 | |
| 662 | |
| 663 | =head2 add_tokendata |
| 664 | |
| 665 | $tokens->add_tokendata( |
| 666 | foundry => 'connexor', |
| 667 | layer => 'syntax', |
| 668 | cb => sub { |
| 669 | my ($stream, $token) = @_; |
| 670 | my $mtt = $stream->pos($token->pos); |
| 671 | my $content = $token->content; |
| 672 | |
| 673 | # syntax |
| 674 | if ((my $found = $content->at('f[name="pos"]')) && ($found = $found->text)) { |
| 675 | $mtt->add( |
| 676 | term => 'cnx_syn:' . $found |
| 677 | ); |
| 678 | }; |
| 679 | }); |
| 680 | |
| 681 | Add token information to the parsed token stream. |
| 682 | Expects a C<foundry> name, a C<layer> name and a |
| 683 | callback parameter, that will be called after each parsed |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 684 | token. The L<KorAP::Field::MultiTermTokenStream> object will be passed, |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 685 | as well as the current L<KorAP::Tokenizer::Span>. |
| 686 | |
| 687 | An optional parameter C<encoding> may indicate that the token offsets |
| 688 | are either refering to C<bytes> or C<utf-8> offsets. |
| 689 | |
| 690 | An optional parameter C<skip> allows for skipping the process. |
| 691 | |
| 692 | =cut |