blob: 08bd56c9edde64090084b8256797e7f8b19122ed [file] [log] [blame]
Akrone4c2e412016-01-28 15:10:50 +01001package KorAP::XML::Krill;
Nils Diewald2db9ad02013-10-29 19:26:43 +00002use Mojo::Base -base;
Nils Diewald2db9ad02013-10-29 19:26:43 +00003use Mojo::ByteStream 'b';
Nils Diewald7b847222014-04-23 11:14:00 +00004use Mojo::Util qw/encode/;
Akron3ec0a1c2017-01-18 14:41:55 +01005use Mojo::File;
Akron14ca9f02016-01-29 19:38:18 +01006use Scalar::Util qw/weaken/;
Nils Diewald3cf08c72013-12-16 20:31:10 +00007use XML::Fast;
8use Try::Tiny;
Akron7d4cdd82016-08-17 21:39:45 +02009use Carp qw/croak carp/;
Akrone4c2e412016-01-28 15:10:50 +010010use KorAP::XML::Document::Primary;
Akron941c1a62016-02-23 17:41:41 +010011use KorAP::XML::Tokenizer;
Nils Diewald7b847222014-04-23 11:14:00 +000012use Log::Log4perl;
Akrone4c2e412016-01-28 15:10:50 +010013use KorAP::XML::Log;
Akron11c80302016-03-18 19:44:43 +010014use Cache::FastMmap;
Nils Diewald7b847222014-04-23 11:14:00 +000015use Mojo::DOM;
16use Data::Dumper;
Akronaf670ae2016-10-24 20:14:32 +020017use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000018
Akron38873012017-02-06 20:27:37 +010019our $VERSION = '0.25';
Nils Diewald90410c22014-11-03 21:04:05 +000020
# Attributes without a default value:
# the document path, the metadata cache and the three sigles
has [qw/path cache/];
has [qw/corpus_sigle doc_sigle text_sigle/];

# Suffix of the metadata class below KorAP::XML::Meta
has meta_type => 'I5';
Nils Diewald7364d1f2013-11-05 19:26:35 +000025
# Default value of the 'log' attribute.
#
# Returns the Log::Log4perl logger of this package if Log4perl was
# initialized, otherwise a KorAP::XML::Log object is used as a fallback.
# Both loggers are created only once and shared by all documents.
has log => sub {
  if (Log::Log4perl->initialized()) {
    state $log4perl = Log::Log4perl->get_logger(__PACKAGE__);

    # Return right here - without this the Log4perl logger went out of
    # scope unused and the fallback logger was always returned
    return $log4perl;
  };

  state $fallback = KorAP::XML::Log->new;
  return $fallback;
};
33
# Constructor
#
# Accepts a list of key/value pairs that become the fields of the
# object. If a 'path' key is passed, its value is turned into an
# absolute path that always ends with a slash.
sub new {
  my ($class, @param) = @_;
  my $doc = bless { @param }, $class;

  # Nothing to normalize without a path
  return $doc unless exists $doc->{path};

  # Make the path absolute and ensure the trailing slash
  my $abs = rel2abs($doc->{path});
  $abs .= '/' unless $abs =~ m!\/$!;
  $doc->{path} = $abs;

  return $doc;
};
Nils Diewald2db9ad02013-10-29 19:26:43 +000048
Akron35db6e32016-03-17 22:42:22 +010049
# Parse document (primary data and metadata)
#
# Reads 'data.xml' below the document path, derives the text, document
# and corpus sigles from the docid of the raw text and associates the
# primary data with the object. Afterwards the 'header.xml' files on
# corpus, document and text level are handed to a metadata object of
# the class KorAP::XML::Meta::<meta_type>.
#
# Returns the document object on success. On failure a warning is
# logged and nothing is returned.
sub parse {
  my $self = shift;
  my $meta_data_type = $self->meta_type;

  # Matches the encoding declaration at the start of an XML file,
  # the name of the encoding is captured in the second group
  state $ENC_RE = qr/^[^>]+encoding\s*=\s*(["'])([^\1]+?)\1/o;

  # Path to primary
  my $data_xml = $self->path . 'data.xml';
  my ($rt, $error);

  my $unable = 'Unable to parse document ' . $self->path;

  # No primary data found
  unless (-e $data_xml) {
    $self->log->warn($unable . ' - no data.xml found');
    return;
  };

  # Load file
  my $file = b(Mojo::File->new($data_xml)->slurp);

  try {
    # Every warning raised while parsing marks the document as broken
    local $SIG{__WARN__} = sub {
      $error = 1;
    };

    $rt = xml2hash($file, text => '#text', attr => '-')->{raw_text};
  }
  catch {
    $self->log->warn($unable);
    $error = 1;
  };

  return if $error;

  $self->log->debug('Parse document ' . $self->path);

  # Get document id and corpus id
  unless ($rt && $rt->{'-docid'}) {
    $self->log->warn($unable . ': No raw_text found or no ID');
    return;
  };

  # Copy the captures to lexicals immediately, so they can't be
  # clobbered by a match in one of the following method calls
  my ($corpus_part, $doc_part, $text_part) =
    $rt->{'-docid'} =~ /^([^_]+)_([^\._]+?)\.(.+?)$/;

  unless (defined $text_part) {
    $self->log->warn($unable . ': ID not parseable');
    return;
  };

  $self->text_sigle(join('/', $corpus_part, $doc_part, $text_part));
  $self->doc_sigle(join('/', $corpus_part, $doc_part));
  $self->corpus_sigle($corpus_part);

  # Get primary data
  my $pd = $rt->{text};

  unless ($pd) {
    $self->log->warn($unable . ': No primary data found');
    return;
  };

  # Associate primary data
  $self->{pd} = KorAP::XML::Document::Primary->new($pd);

  # Drop empty path segments only - testing for truth would also
  # drop a directory named '0'
  my @path = grep { length } splitdir($self->path);
  my @header;

  # Collect the header files of the text, the doc and the corpus
  # by ascending the directory tree. As the files are unshifted,
  # the list is ordered corpus, doc, text.
  foreach (0..2) {
    # Re-add the starting '/' that was removed with the empty segment
    my $header = ($^O =~ /^mswin/i ? '' : '/');
    $header .= catfile(@path, 'header.xml');
    unshift @header, $header;
    pop @path;
  };

  # Get metadata class and create an object
  my $meta_class = 'KorAP::XML::Meta::' . $meta_data_type;
  my $meta;

  # NOTE(review): meta_type is interpolated into a string eval -
  # it must never be taken from untrusted input
  if ($meta_class->can('new') || eval("require $meta_class; 1;")) {
    $meta = $meta_class->new(
      log          => $self->log,
      corpus_sigle => $self->corpus_sigle,
      doc_sigle    => $self->doc_sigle,
      text_sigle   => $self->text_sigle,
      cache        => $self->cache
    );

    # Associate meta object
    $self->{meta} = $meta;
  };

  # Without a metadata object the headers can't be parsed - stop here
  # instead of dying on a method call on an undefined value below
  unless ($meta) {
    $self->log->warn(
      "Metadata object for $meta_data_type not initializable"
    );
    return;
  };

  my @type = qw/corpus doc text/;
  foreach my $header_file (@header) {
    # Get corpus, doc and text meta data
    my $type = shift(@type);

    # Check for cache
    next if $meta->is_cached($type);

    next unless -e $header_file;

    # Slurp data and probably decode
    my $slurp = b(Mojo::File->new($header_file)->slurp);

    # Capture variables are only reset by a successful match, so the
    # group is only trusted if the declaration was really found.
    # Otherwise a stale capture of an earlier match would be used
    # as the encoding.
    my $encoding = ($slurp =~ $ENC_RE) ? $2 : 'UTF-8';
    my $content = $slurp->decode($encoding);

    # Get DOM
    my $dom = Mojo::DOM->new($content);

    # Parse object based on DOM
    $meta->parse($dom, $type);
    $meta->do_cache($type);
  };

  return $self;
};
179
180
# Tokenize the document
#
# Creates a KorAP::XML::Tokenizer for the given foundry and layer
# (defaulting to 'OpenNLP' and 'Tokens') and lets it parse the tokens.
# On success the tokenizer is stored in the object, otherwise a
# warning is logged. The document object is returned in both cases.
sub tokenize {
  my ($self, $token_foundry, $token_layer) = @_;

  $token_foundry = 'OpenNLP' unless defined $token_foundry;
  $token_layer   = 'Tokens'  unless defined $token_layer;

  # Create tokenizer
  my $tokens = KorAP::XML::Tokenizer->new(
    path    => $self->path,
    doc     => $self,
    foundry => $token_foundry,
    layer   => $token_layer,
    name    => 'tokens'
  );

  # Parse tokens
  if ($tokens->parse) {
    # NOTE(review): this weakens the local copy of the reference only,
    # the 'doc' reference passed to the tokenizer above is not
    # affected - confirm that the tokenizer weakens its own copy
    weaken $self;
    $self->{tokenizer} = $tokens;
    return $self;
  };

  # Tokens are not parseable
  $self->log->warn(
    join(
      '',
      'Unable to tokenize ', $self->path,
      ' with ', $token_foundry, '#', $token_layer
    )
  );

  return $self;
};
212
213
# Add annotation
#
# Passes all arguments to the add() method of the tokenizer.
# If the document has no tokenizer, a warning is logged instead.
# Returns the document object.
sub annotate {
  my ($self, @annotation) = @_;
  my $tokenizer = $self->{tokenizer};

  if ($tokenizer) {
    $tokenizer->add(@annotation);
  }
  else {
    $self->log->warn('No tokenizer defined');
  };

  return $self;
};
226
227
# Store arbitrary data
#
#   $doc->store;                 # Get the whole store
#   $doc->store('key');          # Get a single value
#   $doc->store(key => 'value'); # Set a single value
sub store {
  my ($self, @args) = @_;
  my $count = @args;

  # Getter for the whole store
  return $self->{store} if $count == 0;

  # Getter for a single value
  return $self->{store}->{$args[0]} if $count == 1;

  # Setter - the assigned value is returned
  return $self->{store}->{$args[0]} = $args[1];
};
235
236
# Primary data
#
# Returns the object stored in the 'pd' field.
sub primary {
  my $self = shift;
  return $self->{pd};
};
241
# Metadata
#
# Returns the object stored in the 'meta' field.
sub meta {
  my $self = shift;
  return $self->{meta};
};
245
# Serialize the metadata and the sigles as a hash reference
#
# The document is parsed first, if no text sigle is set yet.
# The keys of the metadata are converted by _k().
sub to_hash {
  my $self = shift;

  # Parse on demand
  $self->parse unless $self->text_sigle;

  my %hash;

  # Get meta object
  my $meta = $self->meta;

  foreach my $field ($meta->keys) {
    my $value = $meta->{$field};

    # References are serialized by the keywords method of the meta object
    if (ref $value) {
      $hash{_k($field)} = $meta->keywords($field);
      next;
    };

    # Plain values: replace newlines and squeeze whitespace
    $value =~ s/\n/ /g;
    $value =~ s/\s\s+/ /g;
    $hash{_k($field)} = $value;
  };

  # Add corpusSigle, docSigle and textSigle
  foreach my $level (qw/corpus doc text/) {
    $hash{$level . 'Sigle'} = $self->{$level . '_sigle'};
  };

  return \%hash;
};
274
Nils Diewald840c9242014-10-28 19:51:26 +0000275
# Convert a field name to a serialization key:
# The character following an underscore is uppercased (the underscore
# is removed) and a trailing 'id' is written as 'ID'.
sub _k {
  my ($key) = @_;

  # Work on the copy, so the passed value stays untouched
  for ($key) {
    s/_(\w)/\U$1\E/g;
    s/id$/ID/gi;
  };

  return $key;
};
282
283
# Serialize the document
#
# Returns the result of the to_json method of the tokenizer.
# If the document has no tokenizer, a warning is logged and
# nothing is returned.
sub to_json {
  my $self = shift;
  my $tokenizer = $self->{tokenizer};

  return $tokenizer->to_json if $tokenizer;

  $self->log->warn('No tokenizer defined');
  return;
};
293
294
2951;
296
297
298__END__
Nils Diewaldfeccbb12015-06-18 20:06:45 +0000299
# Serialize the attributes as 'name = value' lines
#
# NOTE: This code is located after the __END__ marker and is therefore
# never compiled. @ATTR is not declared anywhere in the visible code.
sub to_string {
  my $self = shift;

  my $string;

  foreach my $name (@ATTR) {
    # Skip attributes without a true value
    my $att = $self->$name or next;

    # Replace newlines and squeeze whitespace
    $att =~ s/\n/ /g;
    $att =~ s/\s\s+/ /g;
    $string .= $name . ' = ' . $att . "\n";
  };

  $string .= 'text_class = ' . $self->text_class_string . "\n";
  $string .= 'keywords = ' . $self->keywords_string . "\n";

  return $string;
};
318
# Todo: Make this a KoralQuery serializer
#
# NOTE: This code is located after the __END__ marker and is therefore
# never compiled. The hash is built, but not returned yet.
sub to_koral_query {
  my $self = shift;
  my $hash = {
    '@context' => 'http://korap.ids-mannheim.de/ns/koral/0.4/context.jsonld'
  };
#  $hash->{'text'} = $self->primary->data;
#  my $hash = $self->to_hash;

  # The value of the last assignment is the return value
  return $hash->{'@type'} = 'koral:corpus';
};
Nils Diewald7b847222014-04-23 11:14:00 +0000328
Akron941c1a62016-02-23 17:41:41 +0100329
Nils Diewald2db9ad02013-10-29 19:26:43 +00003301;
331
332
333__END__
334
335=pod
336
Akron31d788e2016-02-05 20:49:03 +0100337=encoding utf8
338
=head1 NAME

KorAP::XML::Krill - Preprocess KorAP XML documents for Krill
Nils Diewald2db9ad02013-10-29 19:26:43 +0000342
343
=head1 SYNOPSIS

  # Create converter object
  my $doc = KorAP::XML::Krill->new(
    path => 'mydoc-1/'
  );

  # Convert to Krill JSON
  print $doc->parse->tokenize->annotate('Mate', 'Morpho')->to_json;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000353
354
=head1 DESCRIPTION

Parse the primary data and the metadata of a KorAP-XML document.
Nils Diewald2db9ad02013-10-29 19:26:43 +0000358
359
Akron31d788e2016-02-05 20:49:03 +0100360=head1 ATTRIBUTES
Nils Diewald2db9ad02013-10-29 19:26:43 +0000361
Akron31d788e2016-02-05 20:49:03 +0100362=head2 log
Nils Diewald2db9ad02013-10-29 19:26:43 +0000363
Akron31d788e2016-02-05 20:49:03 +0100364L<Log::Log4perl> object for logging.
Nils Diewald2db9ad02013-10-29 19:26:43 +0000365
366=head2 path
367
368 $doc->path("example-004/");
369 print $doc->path;
370
371The path of the document.
372
373
Nils Diewald2db9ad02013-10-29 19:26:43 +0000374=head2 primary
375
376 print $doc->primary->data(0,20);
377
Akrone4c2e412016-01-28 15:10:50 +0100378The L<KorAP::XML::Document::Primary> object containing the primary data.
Nils Diewald2db9ad02013-10-29 19:26:43 +0000379
380
Nils Diewald2db9ad02013-10-29 19:26:43 +0000381=head1 METHODS
382
Akron31d788e2016-02-05 20:49:03 +0100383=head2 annotate
384
Akrona5920b12016-06-29 18:51:21 +0200385 $doc->annotate('Mate', 'Morpho');
Akron31d788e2016-02-05 20:49:03 +0100386
387Add annotation layer to conversion process.
388
389
=head2 parse

  $doc = $doc->parse;

Parse the primary data and the metadata of the document.
Returns the document object, or nothing if the document
could not be parsed.
Nils Diewald2db9ad02013-10-29 19:26:43 +0000395
396
=head2 tokenize

  $doc = $doc->tokenize('OpenNLP', 'Tokens');

Tokenize the document based on a given foundry and a given layer.
The foundry defaults to C<OpenNLP> and the layer to C<Tokens>.
402
403
404=head1 AVAILABILITY
405
406 https://github.com/KorAP/KorAP-XML-Krill
407
408
409=head1 COPYRIGHT AND LICENSE
410
Akron3ec0a1c2017-01-18 14:41:55 +0100411Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akron31d788e2016-02-05 20:49:03 +0100412Author: L<Nils Diewald|http://nils-diewald.de/>
413
414KorAP::XML::Krill is developed as part of the
415L<KorAP|http://korap.ids-mannheim.de/>
416Corpus Analysis Platform at the
417L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
418member of the
419L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>
420and supported by the L<KobRA|http://www.kobra.tu-dortmund.de> project,
421funded by the
422L<Federal Ministry of Education and Research (BMBF)|http://www.bmbf.de/en/>.
423
424KorAP::XML::Krill is free software published under the
425L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
426
Nils Diewald2db9ad02013-10-29 19:26:43 +0000427=cut