blob: 564cc1443c3339cb0a46f64121f2556e1c30be91 [file] [log] [blame]
Akrone4c2e412016-01-28 15:10:50 +01001package KorAP::XML::Krill;
Nils Diewald2db9ad02013-10-29 19:26:43 +00002use Mojo::Base -base;
Nils Diewald2db9ad02013-10-29 19:26:43 +00003use Mojo::ByteStream 'b';
Akron918ce422017-06-16 20:28:43 +02004use Mojo::Util qw/encode html_unescape/;
Akron3ec0a1c2017-01-18 14:41:55 +01005use Mojo::File;
Akron14ca9f02016-01-29 19:38:18 +01006use Scalar::Util qw/weaken/;
Nils Diewald3cf08c72013-12-16 20:31:10 +00007use XML::Fast;
8use Try::Tiny;
Akrone4c2e412016-01-28 15:10:50 +01009use KorAP::XML::Document::Primary;
Akron941c1a62016-02-23 17:41:41 +010010use KorAP::XML::Tokenizer;
Akronb9c33812020-10-21 16:19:35 +020011use Log::Any qw($log);
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Nils Diewald7b847222014-04-23 11:14:00 +000013use Mojo::DOM;
Akronaf670ae2016-10-24 20:14:32 +020014use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
Akronc4ec0932020-08-06 09:19:22 +020015use Exporter 'import';
16
Akron41127e32020-08-07 12:46:19 +020017our @EXPORT_OK = qw(get_file_name get_file_name_from_glob);
Akronc4ec0932020-08-06 09:19:22 +020018
Akron64f7fae2022-07-27 12:45:33 +020019our $VERSION = '0.47';
Nils Diewald90410c22014-11-03 21:04:05 +000020
Nils Diewald7364d1f2013-11-05 19:26:35 +000021has 'path';
Akron35db6e32016-03-17 22:42:22 +010022has [qw/text_sigle doc_sigle corpus_sigle/];
23has 'meta_type' => 'I5';
Akron11c80302016-03-18 19:44:43 +010024has 'cache';
Akron64f7fae2022-07-27 12:45:33 +020025has 'lang';
Nils Diewald7364d1f2013-11-05 19:26:35 +000026
Nils Diewald7b847222014-04-23 11:14:00 +000027has log => sub {
Nils Diewald7b847222014-04-23 11:14:00 +000028 return $log;
29};
30
Akron6396c302016-03-18 16:05:39 +010031# Constructor
Nils Diewald7b847222014-04-23 11:14:00 +000032sub new {
33 my $class = shift;
34 my $self = bless { @_ }, $class;
Akron6396c302016-03-18 16:05:39 +010035
36 # Path is defined
Nils Diewaldd681eab2014-11-01 01:18:25 +000037 if (exists $self->{path}) {
38 $self->{path} = rel2abs($self->{path});
39 if ($self->{path} !~ m!\/$!) {
40 $self->{path} .= '/';
41 };
Nils Diewald7b847222014-04-23 11:14:00 +000042 };
43 return $self;
44};
Nils Diewald2db9ad02013-10-29 19:26:43 +000045
Akron35db6e32016-03-17 22:42:22 +010046
47# Parse document (primary data and metadata)
Nils Diewald2db9ad02013-10-29 19:26:43 +000048sub parse {
49 my $self = shift;
Akron35db6e32016-03-17 22:42:22 +010050 my $meta_data_type = $self->meta_type;
Nils Diewald7b847222014-04-23 11:14:00 +000051
Akron6396c302016-03-18 16:05:39 +010052 state $ENC_RE = qr/^[^>]+encoding\s*=\s*(["'])([^\1]+?)\1/o;
Nils Diewald2db9ad02013-10-29 19:26:43 +000053
Akron6396c302016-03-18 16:05:39 +010054 # Path to primary
55 my $data_xml = $self->path . 'data.xml';
Nils Diewald98767bb2014-04-25 20:31:19 +000056 my ($rt, $error, $file);
57
58 my $unable = 'Unable to parse document ' . $self->path;
59
Akron35db6e32016-03-17 22:42:22 +010060 # No primary data found
Nils Diewald98767bb2014-04-25 20:31:19 +000061 unless (-e $data_xml) {
Akronb9c33812020-10-21 16:19:35 +020062 $log->warn($unable . ' - no data.xml found');
Nils Diewald98767bb2014-04-25 20:31:19 +000063 $error = 1;
64 }
65
66 else {
Akron35db6e32016-03-17 22:42:22 +010067 # Load file
Akron3ec0a1c2017-01-18 14:41:55 +010068 $file = b(Mojo::File->new($data_xml)->slurp);
Akrona7d0e9f2017-02-03 14:36:21 +010069
Nils Diewald98767bb2014-04-25 20:31:19 +000070 try {
Nils Diewald3cf08c72013-12-16 20:31:10 +000071 local $SIG{__WARN__} = sub {
Akron7d4cdd82016-08-17 21:39:45 +020072 $error = 1;
Nils Diewald3cf08c72013-12-16 20:31:10 +000073 };
Akron7d4cdd82016-08-17 21:39:45 +020074
Nils Diewald3cf08c72013-12-16 20:31:10 +000075 $rt = xml2hash($file, text => '#text', attr => '-')->{raw_text};
Akrona7d0e9f2017-02-03 14:36:21 +010076
Akrona8665782016-01-27 21:47:57 +010077 } catch {
Akronb9c33812020-10-21 16:19:35 +020078 $log->warn($unable);
Akrona8665782016-01-27 21:47:57 +010079 $error = 1;
80 };
Nils Diewald3cf08c72013-12-16 20:31:10 +000081 };
82
83 return if $error;
Nils Diewald2db9ad02013-10-29 19:26:43 +000084
Akronb9c33812020-10-21 16:19:35 +020085 $log->debug('Parse document ' . $self->path);
Nils Diewald2db9ad02013-10-29 19:26:43 +000086
Nils Diewald2db9ad02013-10-29 19:26:43 +000087 # Get document id and corpus id
Nils Diewald3cf08c72013-12-16 20:31:10 +000088 if ($rt && $rt->{'-docid'}) {
Akron1cd5b872016-03-22 00:23:46 +010089 if ($rt->{'-docid'} =~ /^([^_]+)_([^\._]+?)\.(.+?)$/) {
90 $self->text_sigle(join('/', $1, $2, $3));
91 $self->doc_sigle(join('/', $1, $2));
92 $self->corpus_sigle($1);
Nils Diewald2db9ad02013-10-29 19:26:43 +000093 }
94 else {
Akronb9c33812020-10-21 16:19:35 +020095 $log->warn($unable . ': ID not parseable: ' . $rt->{'-docid'});
Akron7d4cdd82016-08-17 21:39:45 +020096 return;
Nils Diewald2db9ad02013-10-29 19:26:43 +000097 };
98 }
99 else {
Akronb9c33812020-10-21 16:19:35 +0200100 $log->warn($unable . ': No raw_text found or no ID');
Akron7d4cdd82016-08-17 21:39:45 +0200101 return;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000102 };
103
Akron918ce422017-06-16 20:28:43 +0200104 # Get primary data (was my "$pd = $rt->{text};" before)
105 # Unfortunately xml2hash removes spaces at the start and at
106 # the end of a text node, making it impossible to deal with cmc data.
107 $file =~ $ENC_RE;
108 $file = $file->decode($2 // 'UTF-8');
109 my $start = index($file, '<text>') + 6;
110 my $end = index($file, '</text>');
111 my $pd = html_unescape substr($file, $start, $end - $start);
Akron6396c302016-03-18 16:05:39 +0100112
Akron7d4cdd82016-08-17 21:39:45 +0200113 unless ($pd) {
Akronb9c33812020-10-21 16:19:35 +0200114 $log->warn($unable . ': No primary data found');
Akron7d4cdd82016-08-17 21:39:45 +0200115 return;
116 };
Akron087d5db2016-10-24 18:14:22 +0200117
Akron6396c302016-03-18 16:05:39 +0100118 # Associate primary data
119 $self->{pd} = KorAP::XML::Document::Primary->new($pd);
Nils Diewald2db9ad02013-10-29 19:26:43 +0000120
Nils Diewaldd681eab2014-11-01 01:18:25 +0000121 my @path = grep { $_ } splitdir($self->path);
Nils Diewald840c9242014-10-28 19:51:26 +0000122 my @header;
123
Akron35db6e32016-03-17 22:42:22 +0100124 # Parse the corpus file, the doc file,
125 # and the text file for meta information
Nils Diewald840c9242014-10-28 19:51:26 +0000126 foreach (0..2) {
Nils Diewald0e489772016-10-24 15:16:52 +0200127 # Removed starting '/'
Akronaf670ae2016-10-24 20:14:32 +0200128 my $header = ($^O =~ /^mswin/i ? '' : '/');
129 $header .= catfile(@path, 'header.xml');
130 unshift @header, $header;
Nils Diewald840c9242014-10-28 19:51:26 +0000131 pop @path;
132 };
Akronb2636cf2016-01-26 18:42:44 +0100133
Akron6396c302016-03-18 16:05:39 +0100134 # Get metadata class and create an object
Akron35db6e32016-03-17 22:42:22 +0100135 my $meta_class = 'KorAP::XML::Meta::' . $meta_data_type;
136 my $meta;
137
138 if ($meta_class->can('new') || eval("require $meta_class; 1;")) {
139 $meta = $meta_class->new(
Akronb9c33812020-10-21 16:19:35 +0200140 log => $log,
Akron35db6e32016-03-17 22:42:22 +0100141 corpus_sigle => $self->corpus_sigle,
142 doc_sigle => $self->doc_sigle,
Akron11c80302016-03-18 19:44:43 +0100143 text_sigle => $self->text_sigle,
Akron64f7fae2022-07-27 12:45:33 +0200144 cache => $self->cache,
145 lang => $self->lang
Akron35db6e32016-03-17 22:42:22 +0100146 );
147
Akron6396c302016-03-18 16:05:39 +0100148 # Associate meta object
Akron35db6e32016-03-17 22:42:22 +0100149 $self->{meta} = $meta;
150 };
151
Akron6396c302016-03-18 16:05:39 +0100152 unless ($meta) {
Akronb9c33812020-10-21 16:19:35 +0200153 $log->warn(
Akron6396c302016-03-18 16:05:39 +0100154 "Metadata object for $meta_data_type not initializable"
155 );
156 };
Akron35db6e32016-03-17 22:42:22 +0100157
Nils Diewald840c9242014-10-28 19:51:26 +0000158 my @type = qw/corpus doc text/;
159 foreach (@header) {
160 # Get corpus, doc and text meta data
161 my $type = shift(@type);
Akrona8665782016-01-27 21:47:57 +0100162
Akron11c80302016-03-18 19:44:43 +0100163 # Check for cache
164 next if $meta->is_cached($type);
165
Akrona8665782016-01-27 21:47:57 +0100166 next unless -e $_;
167
Akron35db6e32016-03-17 22:42:22 +0100168 # Slurp data and probably decode
Akron3ec0a1c2017-01-18 14:41:55 +0100169 my $slurp = b(Mojo::File->new($_)->slurp);
Akron6396c302016-03-18 16:05:39 +0100170 $slurp =~ $ENC_RE;
Akrona8665782016-01-27 21:47:57 +0100171 my $file = $slurp->decode($2 // 'UTF-8');
172
173 # Get DOM
174 my $dom = Mojo::DOM->new($file);
175
Akron6396c302016-03-18 16:05:39 +0100176 # Parse object based on DOM
Akron35db6e32016-03-17 22:42:22 +0100177 $meta->parse($dom, $type);
Akron11c80302016-03-18 19:44:43 +0100178 $meta->do_cache($type);
Nils Diewald840c9242014-10-28 19:51:26 +0000179 };
180
Akron14ca9f02016-01-29 19:38:18 +0100181 return $self;
182};
183
184
Akron47426f02020-08-06 13:28:53 +0200185# Start token parsing
Akron14ca9f02016-01-29 19:38:18 +0100186sub tokenize {
187 my $self = shift;
188 my ($token_foundry, $token_layer) = @_;
189
190 $token_foundry //= 'OpenNLP';
191 $token_layer //= 'Tokens';
192
Akron941c1a62016-02-23 17:41:41 +0100193 # Create tokenizer
194 my $tokens = KorAP::XML::Tokenizer->new(
Akron14ca9f02016-01-29 19:38:18 +0100195 path => $self->path,
196 doc => $self,
197 foundry => $token_foundry,
198 layer => $token_layer,
199 name => 'tokens'
200 );
201
Akron941c1a62016-02-23 17:41:41 +0100202 # Parse tokens
Akron14ca9f02016-01-29 19:38:18 +0100203 unless ($tokens->parse) {
Akronb9c33812020-10-21 16:19:35 +0200204 $log->warn(
Akron14ca9f02016-01-29 19:38:18 +0100205 'Unable to tokenize ' . $self->path .
Akron4701d092020-08-04 15:20:19 +0200206 ' with ' . $token_foundry . '#'
207 . $token_layer
208 );
Akron14ca9f02016-01-29 19:38:18 +0100209 }
210 else {
211 weaken $self;
212 $self->{tokenizer} = $tokens;
213 };
214
215 return $self;
216};
217
218
219# Add annotation
220sub annotate {
221 my $self = shift;
222 unless ($self->{tokenizer}) {
Akronb9c33812020-10-21 16:19:35 +0200223 $log->warn('No tokenizer defined')
Akron14ca9f02016-01-29 19:38:18 +0100224 }
225 else {
226 $self->{tokenizer}->add(@_);
227 };
228
229 $self;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000230};
231
232
Akrona8665782016-01-27 21:47:57 +0100233# Store arbitrary data
234sub store {
235 my $self = shift;
236 return $self->{store} unless @_;
237 return $self->{store}->{$_[0]} if @_ == 1;
238 $self->{store}->{$_[0]} = $_[1];
239};
240
241
Nils Diewald2db9ad02013-10-29 19:26:43 +0000242# Primary data
243sub primary {
244 $_[0]->{pd};
245};
246
Akron47426f02020-08-06 13:28:53 +0200247
248# Get meta object
Akron35db6e32016-03-17 22:42:22 +0100249sub meta {
250 return $_[0]->{meta};
Nils Diewald2db9ad02013-10-29 19:26:43 +0000251};
252
Akron47426f02020-08-06 13:28:53 +0200253
254# Serialize to hash
Akron35db6e32016-03-17 22:42:22 +0100255sub to_hash {
Nils Diewald840c9242014-10-28 19:51:26 +0000256 my $self = shift;
Nils Diewald840c9242014-10-28 19:51:26 +0000257
Akron35db6e32016-03-17 22:42:22 +0100258 $self->parse unless $self->text_sigle;
Nils Diewaldfeccbb12015-06-18 20:06:45 +0000259
Akron35db6e32016-03-17 22:42:22 +0100260 my %hash;
Nils Diewaldfeccbb12015-06-18 20:06:45 +0000261
Akron35db6e32016-03-17 22:42:22 +0100262 # Get meta object
263 my $meta = $self->meta;
Akron11c80302016-03-18 19:44:43 +0100264 foreach ($meta->keys) {
Nils Diewald840c9242014-10-28 19:51:26 +0000265
Akron35db6e32016-03-17 22:42:22 +0100266 my $v = $meta->{$_};
267 if (ref $v) {
268 $hash{_k($_)} = $meta->keywords($_);
Nils Diewald90410c22014-11-03 21:04:05 +0000269 }
Akron35db6e32016-03-17 22:42:22 +0100270 else {
Akron4701d092020-08-04 15:20:19 +0200271 $v =~ tr/\n/ /;
Akron35db6e32016-03-17 22:42:22 +0100272 $v =~ s/\s\s+/ /g;
273 $hash{_k($_)} = $v;
Nils Diewald840c9242014-10-28 19:51:26 +0000274 };
Nils Diewald90410c22014-11-03 21:04:05 +0000275 };
Nils Diewald840c9242014-10-28 19:51:26 +0000276
Akron35db6e32016-03-17 22:42:22 +0100277 foreach (qw/corpus doc text/) {
278 $hash{$_ . 'Sigle'} = $self->{$_ . '_sigle'};
Nils Diewald8e323ee2014-04-23 17:28:14 +0000279 };
280
Akron35db6e32016-03-17 22:42:22 +0100281 return \%hash;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000282};
283
Nils Diewald840c9242014-10-28 19:51:26 +0000284
Akron35db6e32016-03-17 22:42:22 +0100285sub _k {
Akron4701d092020-08-04 15:20:19 +0200286 substr($_[0], 2) =~ s/_(\w)/\U$1\E/gr =~ s/id$/ID/gir;
Akron35db6e32016-03-17 22:42:22 +0100287};
288
289
290sub to_json {
291 my $self = shift;
292 unless ($self->{tokenizer}) {
Akronb9c33812020-10-21 16:19:35 +0200293 $log->warn('No tokenizer defined');
Akron35db6e32016-03-17 22:42:22 +0100294 return;
295 };
296
297 return $self->{tokenizer}->to_json;
298};
299
Akronc4ec0932020-08-06 09:19:22 +0200300# Functions
301
302sub get_file_name_from_glob ($) {
303 my $glob = shift;
304 $glob =~ s![\\\/},]!-!g; # Transform paths
305 $glob =~ s/[\*\?]//g; # Remove arbitrary fills
306 $glob =~ s/[\{\}\[\]]/-/g; # Remove class and multiple brackets
307 $glob =~ s/\-\-+/-/g; # Remove sequences of binding characters
308 $glob =~ s/^-//; # Clean beginning
309 $glob =~ s/\.zip$//; # Remove file extension
310 $glob =~ s/-$//; # Clean end
311 return $glob;
312};
313
Akron35db6e32016-03-17 22:42:22 +0100314
Akron41127e32020-08-07 12:46:19 +0200315# Get file name based on path information
316sub get_file_name ($$) {
317 my $i = shift;
318
319 # Check if the base dir is a directory
320 if (-d $i) {
321
322 # Remove following slashes
323 $i =~ s![^\/]+$!!;
324 };
325 my $file = shift;
326
327 # Remove temp dir fragments
328 $file =~ s!^/?tmp/[^/]+!!;
329 $file =~ s/^?\/?$i//;
330 $file =~ tr/\//-/;
331 $file =~ s{^-+}{};
332 $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/; # shorten
333 return $file;
334};
335
336
Akron35db6e32016-03-17 22:42:22 +01003371;
338
339
340__END__
Nils Diewaldfeccbb12015-06-18 20:06:45 +0000341
Nils Diewald2db9ad02013-10-29 19:26:43 +0000342=pod
343
Akron31d788e2016-02-05 20:49:03 +0100344=encoding utf8
345
Nils Diewald2db9ad02013-10-29 19:26:43 +0000346=head1 NAME
347
Akron31d788e2016-02-05 20:49:03 +0100348KorAP::XML::Krill - Preprocess KorAP XML documents for Krill
Nils Diewald2db9ad02013-10-29 19:26:43 +0000349
350
351=head1 SYNOPSIS
352
Akron31d788e2016-02-05 20:49:03 +0100353 # Create Converter Object
Akrone4c2e412016-01-28 15:10:50 +0100354 my $doc = KorAP::XML::Krill->new(
Nils Diewald2db9ad02013-10-29 19:26:43 +0000355 path => 'mydoc-1/'
356 );
357
Akron31d788e2016-02-05 20:49:03 +0100358 # Convert to krill json
359 print $doc->parse->tokenize->annotate('Mate', 'Morpho')->to_json;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000360
361
362=head1 DESCRIPTION
363
Akron31d788e2016-02-05 20:49:03 +0100364Parse the primary and meta data of a KorAP-XML document.
Nils Diewald2db9ad02013-10-29 19:26:43 +0000365
366
Akron31d788e2016-02-05 20:49:03 +0100367=head1 ATTRIBUTES
Nils Diewald2db9ad02013-10-29 19:26:43 +0000368
Akron31d788e2016-02-05 20:49:03 +0100369=head2 log
Nils Diewald2db9ad02013-10-29 19:26:43 +0000370
Akronb9c33812020-10-21 16:19:35 +0200371L<Log::Any> object for logging.
Nils Diewald2db9ad02013-10-29 19:26:43 +0000372
373=head2 path
374
375 $doc->path("example-004/");
376 print $doc->path;
377
378The path of the document.
379
380
Nils Diewald2db9ad02013-10-29 19:26:43 +0000381=head2 primary
382
383 print $doc->primary->data(0,20);
384
Akrone4c2e412016-01-28 15:10:50 +0100385The L<KorAP::XML::Document::Primary> object containing the primary data.
Nils Diewald2db9ad02013-10-29 19:26:43 +0000386
387
Nils Diewald2db9ad02013-10-29 19:26:43 +0000388=head1 METHODS
389
Akron31d788e2016-02-05 20:49:03 +0100390=head2 annotate
391
Akrona5920b12016-06-29 18:51:21 +0200392 $doc->annotate('Mate', 'Morpho');
Akron31d788e2016-02-05 20:49:03 +0100393
394Add annotation layer to conversion process.
395
396
Nils Diewald2db9ad02013-10-29 19:26:43 +0000397=head2 parse
398
Akron31d788e2016-02-05 20:49:03 +0100399 $doc = $doc->parse;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000400
Akron31d788e2016-02-05 20:49:03 +0100401Run the meta parsing process of the document.
Nils Diewald2db9ad02013-10-29 19:26:43 +0000402
403
Akron31d788e2016-02-05 20:49:03 +0100404=head2 tokenize
405
406 $doc = $doc->tokenize('OpenNLP', 'Tokens');
407
408Accept the tokenization based on a given foundry and a given layer.
409
410
411=head1 AVAILABILITY
412
413 https://github.com/KorAP/KorAP-XML-Krill
414
415
416=head1 COPYRIGHT AND LICENSE
417
Akron6882d7d2021-02-08 09:43:57 +0100418Copyright (C) 2015-2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
419Author: L<Nils Diewald|https://www.nils-diewald.de/>
Akron31d788e2016-02-05 20:49:03 +0100420
421KorAP::XML::Krill is developed as part of the
422L<KorAP|http://korap.ids-mannheim.de/>
423Corpus Analysis Platform at the
Akrond4c5c102020-02-11 11:47:59 +0100424L<Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akron31d788e2016-02-05 20:49:03 +0100425member of the
Akrond4c5c102020-02-11 11:47:59 +0100426L<Leibniz-Gemeinschaft|https://www.leibniz-gemeinschaft.de/en/>
Akron31d788e2016-02-05 20:49:03 +0100427and supported by the L<KobRA|http://www.kobra.tu-dortmund.de> project,
428funded by the
429L<Federal Ministry of Education and Research (BMBF)|http://www.bmbf.de/en/>.
430
431KorAP::XML::Krill is free software published under the
Akron6882d7d2021-02-08 09:43:57 +0100432L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron31d788e2016-02-05 20:49:03 +0100433
Nils Diewald2db9ad02013-10-29 19:26:43 +0000434=cut