blob: f0dc39891db767fad8438fe6b7d711a890697a26 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
4use v5.16;
5use lib 'lib', '../lib';
6use Set::Scalar;
7use Mojo::DOM;
8use Mojo::Util qw/encode decode/;
9use Mojo::ByteStream 'b';
10
11use Log::Log4perl;
12Log::Log4perl->init("script/log4perl.conf");
13
14use KorAP::Document;
15use KorAP::Tokenizer;
16
17
# Usage: perl script/prepare_index.pl WPD/AAA/00001
19
20sub parse_doc {
21 my $doc = KorAP::Document->new(
22 path => shift . '/'
23 );
24
25 $doc->parse;
26
27 my $tokens = KorAP::Tokenizer->new(
28 path => $doc->path,
29 doc => $doc,
30 foundry => 'connexor',
31 layer => 'tokens'
32 );
33
34 $tokens->parse;
35
36 my $i = 0;
37 $tokens->add_spandata(
38 foundry => 'connexor',
39 layer => 'sentences',
40 #skip => 1,
41 cb => sub {
42 my ($stream, $span) = @_;
43 my $mtt = $stream->pos($span->p_start);
44 $mtt->add(
45 term => '<>:s',
46 o_start => $span->o_start,
47 o_end => $span->o_end,
48 p_end => $span->p_end
49 );
50 $i++;
51 }
52 );
53
54 $tokens->stream->add_meta('s', '<i>' . $i);
55
56 $i = 0;
57 $tokens->add_spandata(
58 foundry => 'base',
59 layer => 'paragraph',
60 #skip => 1,
61 cb => sub {
62 my ($stream, $span) = @_;
63 my $mtt = $stream->pos($span->p_start);
64 $mtt->add(
65 term => '<>:p',
66 o_start => $span->o_start,
67 o_end => $span->o_end,
68 p_end => $span->p_end
69 );
70 $i++;
71 }
72 );
73 $tokens->stream->add_meta('p', '<i>' . $i);
74
75 $tokens->add_tokendata(
76 foundry => 'opennlp',
77 layer => 'morpho',
78 #skip => 1,
79 cb => sub {
80 my ($stream, $token) = @_;
81 my $mtt = $stream->pos($token->pos);
82 my $content = $token->content;
83
84 my $found;
85
86 # syntax
87 if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
88 $mtt->add(
89 term => 'opennlp_p:' . $found
90 );
91 };
92 });
93
94
95 my $model = 'ne_dewac_175m_600';
96 $tokens->add_tokendata(
97 foundry => 'corenlp',
98 #skip => 1,
99 layer => $model,
100 cb => sub {
101 my ($stream, $token) = @_;
102 my $mtt = $stream->pos($token->pos);
103 my $content = $token->content;
104
105 my $found;
106
107 if (($found = $content->at('f[name=ne] f[name=ent]')) && ($found = $found->text)) {
108 $mtt->add(
109 term => 'corenlp_' . $model . ':' . $found
110 );
111 };
112 });
113
114 $model = 'ne_hgc_175m_600';
115 $tokens->add_tokendata(
116 foundry => 'corenlp',
117 #skip => 1,
118 layer => $model,
119 cb => sub {
120 my ($stream, $token) = @_;
121 my $mtt = $stream->pos($token->pos);
122 my $content = $token->content;
123
124 my $found;
125
126 if (($found = $content->at('f[name=ne] f[name=ent]')) && ($found = $found->text)) {
127 $mtt->add(
128 term => 'corenlp_' . $model . ':' . $found
129 );
130 };
131 });
132
133 $tokens->add_tokendata(
134 foundry => 'connexor',
135 layer => 'morpho',
136 #skip => 1,
137 cb => sub {
138 my ($stream, $token) = @_;
139 my $mtt = $stream->pos($token->pos);
140 my $content = $token->content;
141
142 my $found;
143
144 # Lemma
145 if (($found = $content->at('f[name="lemma"]')) && ($found = $found->text)) {
146 if (index($found, "\N{U+00a0}") >= 0) {
147 $found = b($found)->decode;
148 foreach (split(/\x{00A0}/, $found)) {
149 $mtt->add(
150 term => 'cnx_l:' . b($_)->encode
151 );
152 }
153 }
154 else {
155 $mtt->add(
156 term => 'cnx_l:' . $found # b($found)->encode
157 );
158 };
159 };
160
161 # POS
162 if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
163 $mtt->add(
164 term => 'cnx_p:' . $found
165 );
166 };
167
168 # MSD
169 # Todo: Look in the description!
170 if (($found = $content->at('f[name="msd"]')) && ($found = $found->text)) {
171 foreach (split(':', $found)) {
172 $mtt->add(
173 term => 'cnx_m:' . $_
174 );
175 };
176 };
177 }
178 );
179
180 $tokens->add_tokendata(
181 foundry => 'connexor',
182 layer => 'syntax',
183 #skip => 1,
184 cb => sub {
185 my ($stream, $token) = @_;
186 my $mtt = $stream->pos($token->pos);
187 my $content = $token->content;
188
189 my $found;
190
191 # syntax
192 if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
193 $mtt->add(
194 term => 'cnx_syn:' . $found
195 );
196 };
197 });
198
199 $tokens->add_spandata(
200 foundry => 'connexor',
201 layer => 'phrase',
202 #skip => 1,
203 cb => sub {
204 my ($stream, $span) = @_;
205
206 my $type = $span->content->at('f[name=pos]');
207 if ($type && ($type = $type->text)) {
208 my $mtt = $stream->pos($span->p_start);
209 $mtt->add(
210 term => '<>:cnx_const:' . $type,
211 o_start => $span->o_start,
212 o_end => $span->o_end,
213 p_end => $span->p_end
214 );
215 };
216 }
217 );
218
219 $tokens->add_tokendata(
220 foundry => 'tree_tagger',
221 #skip => 1,
222 layer => 'morpho',
223 cb => sub {
224 my ($stream, $token) = @_;
225 my $mtt = $stream->pos($token->pos);
226 my $content = $token->content;
227
228 my $found;
229
230 # lemma
231 if (($found = $content->at('f[name="lemma"]')) &&
232 ($found = $found->text) && $found ne 'UNKNOWN') {
233 $mtt->add(
234 term => 'tt_l:' . $found
235 );
236 };
237
238 # pos
239 if (($found = $content->at('f[name="ctag"]')) && ($found = $found->text)) {
240 $mtt->add(
241 term => 'tt_p:' . $found
242 );
243 };
244 });
245
246 $tokens->add_tokendata(
247 foundry => 'mate',
248 layer => 'morpho',
249 cb => sub {
250 my ($stream, $token) = @_;
251 my $mtt = $stream->pos($token->pos);
252 my $content = $token->content;
253
254 my $found;
255
256 my $capital = 0;
257
258 # pos
259 if (($found = $content->at('f[name="pos"]')) &&
260 ($found = $found->text)) {
261 $mtt->add(term => 'mate_p:' . $found
262 );
263 };
264
265 # lemma
266 if (($found = $content->at('f[name="lemma"]'))
267 && ($found = $found->text)
268 && $found ne '--') {
269 $mtt->add(term => 'mate_l:' . b($found)->decode('latin-1')->encode->to_string);
270 };
271
272 # MSD
273 if (($found = $content->at('f[name="msd"]')) &&
274 ($found = $found->text) &&
275 ($found ne '_')) {
276 foreach (split '\|', $found) {
277 my ($x, $y) = split "=", $_;
278 # case, tense, number, mood, person, degree, gender
279 $mtt->add(term => 'mate_m:' . $x . ':' . $y);
280 };
281 };
282 });
283
284
285 $tokens->add_tokendata(
286 foundry => 'xip',
287 #skip => 1,
288 layer => 'morpho',
289 encoding => 'bytes',
290 cb => sub {
291 my ($stream, $token) = @_;
292 my $mtt = $stream->pos($token->pos);
293 my $content = $token->content;
294
295 my $found;
296
297 my $capital = 0;
298 # pos
299 if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
300 $mtt->add(
301 term => 'xip_p:' . $found
302 );
303
304 $capital = 1 if $found eq 'NOUN';
305 };
306
307 # lemma
308 if (($found = $content->at('f[name="lemma"]')) && ($found = $found->text)) {
309 my (@token) = split('#', $found);
310
311 my $full = '';
312 foreach (@token) {
313 $full .= $_;
314 $_ =~ s{/\w+$}{};
315 $mtt->add(term => 'xip_l:' . $_);
316 };
317 if (@token > 1) {
318 $full =~ s{/}{}g;
319 $full = lc $full;
320 $full = $capital ? ucfirst($full) : $full;
321 $mtt->add(term => 'xip_l:' . $full);
322 };
323 };
324 });
325
326
327 # Collect all spans and check for roots
328 my %xip_const;
329 my $xip_const_root = Set::Scalar->new;
330 my $xip_const_noroot = Set::Scalar->new;
331
332 # First run:
333 $tokens->add_spandata(
334 foundry => 'xip',
335 layer => 'constituency',
336 encoding => 'bytes',
337 #skip => 1,
338 cb => sub {
339 my ($stream, $span) = @_;
340
341 $xip_const{$span->id} = $span;
342 $xip_const_root->insert($span->id);
343
344 $span->content->find('rel[label=dominates][target]')->each(
345 sub {
346 my $rel = shift;
347 $xip_const_noroot->insert($rel->attr('target'));
348 }
349 );
350 }
351 );
352
353 my $stream = $tokens->stream;
354
355 my $add_const = sub {
356 my $span = shift;
357 my $level = shift;
358 my $mtt = $stream->pos($span->p_start);
359
360 my $content = $span->content;
361 my $type = $content->at('f[name=const]');
362 if ($type && ($type = $type->text)) {
363 # $type is now NPA, NP, NUM
364 my %term = (
365 term => '<>:xip_const:' . $type,
366 o_start => $span->o_start,
367 o_end => $span->o_end,
368 p_end => $span->p_end
369 );
370
371 $term{payload} = '<s>' . $level if $level;
372
373 $mtt->add(%term);
374
375 my $this = __SUB__;
376
377 $content->find('rel[label=dominates][target]')->each(
378 sub {
379 my $subspan = delete $xip_const{$_[0]->attr('target')} or return;
380 $this->($subspan, $level + 1);
381 }
382 );
383 };
384 };
385
386 my $diff = $xip_const_root->difference($xip_const_noroot);
387 foreach ($diff->members) {
388 my $obj = delete $xip_const{$_} or next;
389 $add_const->($obj, 0);
390 };
391
392 # Todo: Add mate-morpho
393 # Todo: Add mate-dependency
394 # Todo: Add xip-dependency
395
396 print $tokens->stream->to_string;
397};
398
399if ($ARGV[0]) {
400 parse_doc($ARGV[0]);
401};
402
403
404
405__END__