blob: a8aa5d46d16cca1a3d0a5c4415e4eb1d19922c57 [file] [log] [blame]
Akrone4c2e412016-01-28 15:10:50 +01001package KorAP::XML::Krill;
Nils Diewald2db9ad02013-10-29 19:26:43 +00002use Mojo::Base -base;
Nils Diewald2db9ad02013-10-29 19:26:43 +00003use Mojo::ByteStream 'b';
Akron918ce422017-06-16 20:28:43 +02004use Mojo::Util qw/encode html_unescape/;
Akron3ec0a1c2017-01-18 14:41:55 +01005use Mojo::File;
Akron14ca9f02016-01-29 19:38:18 +01006use Scalar::Util qw/weaken/;
Nils Diewald3cf08c72013-12-16 20:31:10 +00007use XML::Fast;
8use Try::Tiny;
Akron7d4cdd82016-08-17 21:39:45 +02009use Carp qw/croak carp/;
Akrone4c2e412016-01-28 15:10:50 +010010use KorAP::XML::Document::Primary;
Akron941c1a62016-02-23 17:41:41 +010011use KorAP::XML::Tokenizer;
Nils Diewald7b847222014-04-23 11:14:00 +000012use Log::Log4perl;
Akrone4c2e412016-01-28 15:10:50 +010013use KorAP::XML::Log;
Akron11c80302016-03-18 19:44:43 +010014use Cache::FastMmap;
Nils Diewald7b847222014-04-23 11:14:00 +000015use Mojo::DOM;
16use Data::Dumper;
Akronaf670ae2016-10-24 20:14:32 +020017use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000018
Akron9a062ce2017-07-04 19:12:05 +020019our $VERSION = '0.32';
Nils Diewald90410c22014-11-03 21:04:05 +000020
# Plain attributes without default values:
# the document path, the metadata cache and the three sigles
has [qw/path cache text_sigle doc_sigle corpus_sigle/];

# Metadata type - used by parse to build the name
# of the KorAP::XML::Meta::* class
has meta_type => 'I5';
Nils Diewald7364d1f2013-11-05 19:26:35 +000025
# Logger attribute.
# Returns the Log::Log4perl logger of this package if Log4perl
# was initialized, otherwise a KorAP::XML::Log object is returned.
# Both loggers are created once and shared between all objects.
has log => sub {
  if (Log::Log4perl->initialized()) {
    state $log4perl = Log::Log4perl->get_logger(__PACKAGE__);

    # Return immediately - without this return the fallback
    # logger below was always used, even with Log4perl initialized
    return $log4perl;
  };

  state $fallback = KorAP::XML::Log->new;
  return $fallback;
};
33
# Constructor
# Accepts a list of key/value pairs that become the fields of the object.
# A passed 'path' is made absolute and is guaranteed to end with a slash.
sub new {
  my $class = shift;
  my $self = bless { @_ }, $class;

  # No path passed - nothing to normalize
  return $self unless exists $self->{path};

  # Make the path absolute and append the trailing slash if missing
  my $path = rel2abs($self->{path});
  $path .= '/' unless $path =~ m!\/$!;
  $self->{path} = $path;

  return $self;
};
Nils Diewald2db9ad02013-10-29 19:26:43 +000048
Akron35db6e32016-03-17 22:42:22 +010049
# Parse document (primary data and metadata)
#
# Reads 'data.xml' below the document path, derives the corpus, doc
# and text sigles from the 'docid' attribute of the raw_text element,
# associates the primary data found between '<text>' and '</text>'
# and parses the header.xml files of the corpus, the doc and the
# text level with a KorAP::XML::Meta::* object.
#
# Returns the document object on success and nothing on failure.
sub parse {
  my $self = shift;
  my $meta_data_type = $self->meta_type;

  # Matches the encoding declaration at the start of an XML file,
  # the name of the encoding is captured in $2
  state $ENC_RE = qr/^[^>]+encoding\s*=\s*(["'])([^\1]+?)\1/;

  # Path to primary
  my $data_xml = $self->path . 'data.xml';
  my ($rt, $error);

  my $unable = 'Unable to parse document ' . $self->path;

  # No primary data found
  unless (-e $data_xml) {
    $self->log->warn($unable . ' - no data.xml found');
    return;
  };

  # Load file
  my $file = b(Mojo::File->new($data_xml)->slurp);

  try {

    # Warnings raised while parsing are treated as errors
    local $SIG{__WARN__} = sub {
      $error = 1;
    };

    $rt = xml2hash($file, text => '#text', attr => '-')->{raw_text};
  }
  catch {
    $self->log->warn($unable);
    $error = 1;
  };

  return if $error;

  $self->log->debug('Parse document ' . $self->path);

  # Get document id and corpus id
  if ($rt && $rt->{'-docid'}) {
    if ($rt->{'-docid'} =~ /^([^_]+)_([^\._]+?)\.(.+?)$/) {
      $self->text_sigle(join('/', $1, $2, $3));
      $self->doc_sigle(join('/', $1, $2));
      $self->corpus_sigle($1);
    }
    else {
      $self->log->warn($unable . ': ID not parseable');
      return;
    };
  }
  else {
    $self->log->warn($unable . ': No raw_text found or no ID');
    return;
  };

  # Get primary data (was my "$pd = $rt->{text};" before)
  # Unfortunately xml2hash removes spaces at the start and at
  # the end of a text node, making it impossible to deal with cmc data.

  # The capture is only valid if the match succeeded - otherwise
  # $2 may still hold the value of an earlier successful match
  my $encoding = ($file =~ $ENC_RE) ? $2 : 'UTF-8';
  $file = $file->decode($encoding);

  # index() returns -1 if a tag is missing, so both
  # positions need to be checked before extracting the text
  my $start = index($file, '<text>');
  my $end   = index($file, '</text>');

  my $pd;
  if ($start >= 0 && $end >= $start + 6) {
    $start += 6;
    $pd = html_unescape substr($file, $start, $end - $start);
  };

  # Check for the length instead of the truth,
  # so a primary text like '0' is accepted
  unless (defined $pd && length $pd) {
    $self->log->warn($unable . ': No primary data found');
    return;
  };

  # Associate primary data
  $self->{pd} = KorAP::XML::Document::Primary->new($pd);

  my @path = grep { $_ } splitdir($self->path);
  my @header;

  # Collect the header files of the text, the doc and
  # the corpus level by walking up the directories
  foreach (0..2) {
    # Removed starting '/'
    my $header = ($^O =~ /^mswin/i ? '' : '/');
    $header .= catfile(@path, 'header.xml');
    unshift @header, $header;
    pop @path;
  };

  # Get metadata class and create an object
  my $meta_class = 'KorAP::XML::Meta::' . ($meta_data_type // '');
  my $meta;

  # The class name is interpolated in a string eval, therefore
  # only accept types that are valid parts of a package name
  my $valid_type = ($meta_data_type // '') =~ /\A\w+(?:::\w+)*\z/;

  if ($valid_type &&
        ($meta_class->can('new') || eval("require $meta_class; 1;"))) {
    $meta = $meta_class->new(
      log          => $self->log,
      corpus_sigle => $self->corpus_sigle,
      doc_sigle    => $self->doc_sigle,
      text_sigle   => $self->text_sigle,
      cache        => $self->cache
    );

    # Associate meta object
    $self->{meta} = $meta;
  };

  # Without a meta object the headers can't be parsed -
  # fail like all other errors instead of dying on undef below
  unless ($meta) {
    $self->log->warn(
      'Metadata object for ' . ($meta_data_type // '') . ' not initializable'
    );
    return;
  };

  # Parse the corpus file, the doc file,
  # and the text file for meta information
  my @type = qw/corpus doc text/;
  foreach my $header_file (@header) {

    # Get corpus, doc and text meta data
    my $type = shift(@type);

    # Check for cache
    next if $meta->is_cached($type);

    next unless -e $header_file;

    # Slurp data and decode based on the declared encoding
    # (again, $2 is only used after a successful match)
    my $slurp = b(Mojo::File->new($header_file)->slurp);
    my $header_encoding = ($slurp =~ $ENC_RE) ? $2 : 'UTF-8';
    my $decoded = $slurp->decode($header_encoding);

    # Get DOM
    my $dom = Mojo::DOM->new($decoded);

    # Parse object based on DOM
    $meta->parse($dom, $type);
    $meta->do_cache($type);
  };

  return $self;
};
185
186
# Tokenize the document
# Creates a KorAP::XML::Tokenizer for the given foundry (defaults to
# 'OpenNLP') and the given layer (defaults to 'Tokens').
# If the tokens can be parsed, the tokenizer is associated with the
# document, otherwise a warning is logged.
# Returns the document object in both cases.
sub tokenize {
  my $self = shift;
  my ($foundry, $layer) = @_;

  $foundry //= 'OpenNLP';
  $layer   //= 'Tokens';

  # Create tokenizer
  my $tokenizer = KorAP::XML::Tokenizer->new(
    path    => $self->path,
    doc     => $self,
    foundry => $foundry,
    layer   => $layer,
    name    => 'tokens'
  );

  # Parse tokens
  if ($tokenizer->parse) {

    # NOTE(review): this weakens the local copy of the reference only,
    # presumably meant to avoid a doc <-> tokenizer cycle - confirm
    weaken $self;
    $self->{tokenizer} = $tokenizer;
  }
  else {
    $self->log->warn(
      join('', 'Unable to tokenize ', $self->path, ' with ', $foundry, '#', $layer)
    );
  };

  return $self;
};
218
219
# Add annotation
# Passes all arguments to the add method of the tokenizer.
# Logs a warning if no tokenizer is associated with the document.
# Returns the document object in both cases.
sub annotate {
  my $self = shift;

  if ($self->{tokenizer}) {
    $self->{tokenizer}->add(@_);
  }
  else {
    $self->log->warn('No tokenizer defined');
  };

  return $self;
};
232
233
# Store arbitrary data
#   store()             - returns the storage hash reference (if any)
#   store($key)         - returns the value stored for the key
#   store($key, $value) - stores the value for the key and returns it
sub store {
  my $self = shift;
  my $argc = scalar @_;

  if ($argc == 0) {
    return $self->{store};
  }
  elsif ($argc == 1) {
    return $self->{store}->{$_[0]};
  };

  return $self->{store}->{$_[0]} = $_[1];
};
241
242
# Primary data
# Returns the value of the 'pd' field of the document,
# which is set to a primary data object by parse.
sub primary {
  my $self = shift;
  return $self->{pd};
};
247
# Metadata
# Returns the value of the 'meta' field of the document,
# which is set to a meta object by parse.
sub meta {
  my $self = shift;
  return $self->{meta};
};
251
# Serialize the metadata of the document as a hash reference.
#
# Parses the document first if no text sigle is set yet.
# The keys of the meta object are converted with _k(); for reference
# values the result of the keywords method of the meta object is
# used, in plain values newlines and runs of whitespace are replaced
# by a single space. The corpus, doc and text sigles are added as
# 'corpusSigle', 'docSigle' and 'textSigle'.
#
# Returns nothing (after logging a warning) if no meta object
# is available, e.g. because the document was not parseable.
sub to_hash {
  my $self = shift;

  $self->parse unless $self->text_sigle;

  # Get meta object
  my $meta = $self->meta;

  # Parsing may have failed - don't call methods on an undefined
  # meta object (same handling as for a missing tokenizer in to_json)
  unless ($meta) {
    $self->log->warn('No metadata defined');
    return;
  };

  my %hash;

  foreach my $key ($meta->keys) {
    my $v = $meta->{$key};

    if (ref $v) {
      $hash{_k($key)} = $meta->keywords($key);
    }
    else {
      # Normalize whitespace
      $v =~ s/\n/ /g;
      $v =~ s/\s\s+/ /g;
      $hash{_k($key)} = $v;
    };
  };

  foreach (qw/corpus doc text/) {
    $hash{$_ . 'Sigle'} = $self->{$_ . '_sigle'};
  };

  return \%hash;
};
280
Nils Diewald840c9242014-10-28 19:51:26 +0000281
# Convert a key with underscores to camel case,
# e.g. 'text_sigle' becomes 'textSigle'.
# An 'id' at the end of the key (in any case) is written as 'ID'.
sub _k {
  (my $key = $_[0]) =~ s/_(\w)/\U$1\E/g;
  $key =~ s/id$/ID/gi;
  return $key;
};
288
289
# Serialize the document as JSON
# Returns the result of the to_json method of the tokenizer.
# Logs a warning and returns nothing if no tokenizer
# is associated with the document.
sub to_json {
  my $self = shift;

  my $tokenizer = $self->{tokenizer};
  return $tokenizer->to_json if $tokenizer;

  $self->log->warn('No tokenizer defined');
  return;
};
299
300
3011;
302
303
304__END__
Nils Diewaldfeccbb12015-06-18 20:06:45 +0000305
# Serialize the document as a string with one 'name = value' line
# for each attribute listed in @ATTR that has a true value, followed
# by the text class and the keywords lines.
# Note: This code is located after __END__ and therefore not compiled.
sub to_string {
  my $self = shift;

  my $string;

  foreach my $name (@ATTR) {
    my $att = $self->$name or next;

    # Normalize whitespace
    $att =~ s/\n/ /g;
    $att =~ s/\s\s+/ /g;
    $string .= $name . ' = ' . $att . "\n";
  };

  $string .= 'text_class = ' . $self->text_class_string . "\n";
  $string .= 'keywords = ' . $self->keywords_string . "\n";

  return $string;
};
324
# Todo: Make this a KoralQuery serializer
# Creates the skeleton of a KoralQuery corpus object and
# returns it as a hash reference.
# Note: This code is located after __END__ and therefore not compiled.
sub to_koral_query {
  my $self = shift;

  my $hash = {
    '@context' => 'http://korap.ids-mannheim.de/ns/koral/0.4/context.jsonld',
    '@type'    => 'koral:corpus'
  };

#  $hash->{'text'} = $self->primary->data;
#  my $hash = $self->to_hash;

  # Return the hash explicitly - before the value of the
  # last assignment (the type string) was returned implicitly
  return $hash;
};
Nils Diewald7b847222014-04-23 11:14:00 +0000334
Akron941c1a62016-02-23 17:41:41 +0100335
Nils Diewald2db9ad02013-10-29 19:26:43 +00003361;
337
338
339__END__
340
341=pod
342
Akron31d788e2016-02-05 20:49:03 +0100343=encoding utf8
344
Nils Diewald2db9ad02013-10-29 19:26:43 +0000345=head1 NAME
346
Akron31d788e2016-02-05 20:49:03 +0100347KorAP::XML::Krill - Preprocess KorAP XML documents for Krill
Nils Diewald2db9ad02013-10-29 19:26:43 +0000348
349
350=head1 SYNOPSIS
351
Akron31d788e2016-02-05 20:49:03 +0100352 # Create Converter Object
Akrone4c2e412016-01-28 15:10:50 +0100353 my $doc = KorAP::XML::Krill->new(
Nils Diewald2db9ad02013-10-29 19:26:43 +0000354 path => 'mydoc-1/'
355 );
356
Akron31d788e2016-02-05 20:49:03 +0100357 # Convert to krill json
358 print $doc->parse->tokenize->annotate('Mate', 'Morpho')->to_json;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000359
360
361=head1 DESCRIPTION
362
Akron31d788e2016-02-05 20:49:03 +0100363Parse the primary and meta data of a KorAP-XML document.
Nils Diewald2db9ad02013-10-29 19:26:43 +0000364
365
Akron31d788e2016-02-05 20:49:03 +0100366=head1 ATTRIBUTES
Nils Diewald2db9ad02013-10-29 19:26:43 +0000367
Akron31d788e2016-02-05 20:49:03 +0100368=head2 log
Nils Diewald2db9ad02013-10-29 19:26:43 +0000369
Akron31d788e2016-02-05 20:49:03 +0100370L<Log::Log4perl> object for logging.
Nils Diewald2db9ad02013-10-29 19:26:43 +0000371
372=head2 path
373
374 $doc->path("example-004/");
375 print $doc->path;
376
377The path of the document.
378
379
Nils Diewald2db9ad02013-10-29 19:26:43 +0000380=head2 primary
381
382 print $doc->primary->data(0,20);
383
Akrone4c2e412016-01-28 15:10:50 +0100384The L<KorAP::XML::Document::Primary> object containing the primary data.
Nils Diewald2db9ad02013-10-29 19:26:43 +0000385
386
Nils Diewald2db9ad02013-10-29 19:26:43 +0000387=head1 METHODS
388
Akron31d788e2016-02-05 20:49:03 +0100389=head2 annotate
390
Akrona5920b12016-06-29 18:51:21 +0200391 $doc->annotate('Mate', 'Morpho');
Akron31d788e2016-02-05 20:49:03 +0100392
393Add annotation layer to conversion process.
394
395
Nils Diewald2db9ad02013-10-29 19:26:43 +0000396=head2 parse
397
Akron31d788e2016-02-05 20:49:03 +0100398 $doc = $doc->parse;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000399
Parse the primary data and the metadata of the document.
Returns the document object on success and nothing on failure.
Nils Diewald2db9ad02013-10-29 19:26:43 +0000401
402
Akron31d788e2016-02-05 20:49:03 +0100403=head2 tokenize
404
405 $doc = $doc->tokenize('OpenNLP', 'Tokens');
406
Tokenize the document based on a given foundry and a given layer.
The foundry defaults to C<OpenNLP> and the layer defaults to C<Tokens>.
408
409
410=head1 AVAILABILITY
411
412 https://github.com/KorAP/KorAP-XML-Krill
413
414
415=head1 COPYRIGHT AND LICENSE
416
Akron3ec0a1c2017-01-18 14:41:55 +0100417Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akron31d788e2016-02-05 20:49:03 +0100418Author: L<Nils Diewald|http://nils-diewald.de/>
419
420KorAP::XML::Krill is developed as part of the
421L<KorAP|http://korap.ids-mannheim.de/>
422Corpus Analysis Platform at the
423L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
424member of the
425L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>
426and supported by the L<KobRA|http://www.kobra.tu-dortmund.de> project,
427funded by the
428L<Federal Ministry of Education and Research (BMBF)|http://www.bmbf.de/en/>.
429
430KorAP::XML::Krill is free software published under the
431L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
432
Nils Diewald2db9ad02013-10-29 19:26:43 +0000433=cut