blob: 18a12b1207162eda0ca0cb840c415d0424422926 [file] [log] [blame]
Akrone4c2e412016-01-28 15:10:50 +01001package KorAP::XML::Krill;
Nils Diewald2db9ad02013-10-29 19:26:43 +00002use Mojo::Base -base;
Nils Diewald2db9ad02013-10-29 19:26:43 +00003use Mojo::ByteStream 'b';
Nils Diewald7b847222014-04-23 11:14:00 +00004use Mojo::Util qw/encode/;
Akron14ca9f02016-01-29 19:38:18 +01005use Scalar::Util qw/weaken/;
Nils Diewald3cf08c72013-12-16 20:31:10 +00006use XML::Fast;
7use Try::Tiny;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Carp qw/croak/;
Akrone4c2e412016-01-28 15:10:50 +01009use KorAP::XML::Document::Primary;
Akron941c1a62016-02-23 17:41:41 +010010use KorAP::XML::Tokenizer;
Nils Diewald7b847222014-04-23 11:14:00 +000011use Log::Log4perl;
Akrone4c2e412016-01-28 15:10:50 +010012use KorAP::XML::Log;
Nils Diewald7b847222014-04-23 11:14:00 +000013use Mojo::DOM;
14use Data::Dumper;
Nils Diewaldd681eab2014-11-01 01:18:25 +000015use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000016
Nils Diewald1448c262015-10-01 17:25:33 +000017# TODO: Currently metadata is processed multiple times - that's horrible!
18# Due to the kind of processing, processed metadata may be stored in
19# a multiprocess cache instead.
20
Akrone10ad322016-02-27 10:54:26 +010021our $VERSION = '0.12';
Akrone4c2e412016-01-28 15:10:50 +010022
Nils Diewald840c9242014-10-28 19:51:26 +000023our @ATTR = qw/text_sigle
24 doc_sigle
25 corpus_sigle
Nils Diewald8e323ee2014-04-23 17:28:14 +000026 title
Nils Diewaldfeccbb12015-06-18 20:06:45 +000027 pub_date
Nils Diewald8e323ee2014-04-23 17:28:14 +000028 sub_title
Nils Diewalda96de622014-10-31 17:29:23 +000029 pub_place
30 author/;
Nils Diewald8e323ee2014-04-23 17:28:14 +000031
32our @ADVANCED_ATTR = qw/publisher
33 editor
34 text_type
35 text_type_art
Nils Diewald840c9242014-10-28 19:51:26 +000036 text_type_ref
37 text_column
38 text_domain
Nils Diewald8e323ee2014-04-23 17:28:14 +000039 creation_date
Nils Diewald840c9242014-10-28 19:51:26 +000040 license
Nils Diewald840c9242014-10-28 19:51:26 +000041 pages
Nils Diewald840c9242014-10-28 19:51:26 +000042 file_edition_statement
43 bibl_edition_statement
Nils Diewald840c9242014-10-28 19:51:26 +000044 reference
Nils Diewald840c9242014-10-28 19:51:26 +000045 language
Nils Diewald90410c22014-11-03 21:04:05 +000046
47 doc_title
48 doc_sub_title
49 doc_editor
50 doc_author
51
52 corpus_author
Nils Diewald840c9242014-10-28 19:51:26 +000053 corpus_title
54 corpus_sub_title
Nils Diewald90410c22014-11-03 21:04:05 +000055 corpus_editor
Akrona8665782016-01-27 21:47:57 +010056
57 availability
58 pub_place_key
Nils Diewald8e323ee2014-04-23 17:28:14 +000059 /;
Nils Diewaldfeccbb12015-06-18 20:06:45 +000060# Separate: text_class, keywords
Nils Diewald8e323ee2014-04-23 17:28:14 +000061
Nils Diewald90410c22014-11-03 21:04:05 +000062# Removed: coll_title, coll_sub_title, coll_author, coll_editor
63# Introduced: doc_title, doc_sub_title, corpus_editor, doc_editor, corpus_author, doc_author
64
65
Nils Diewald7364d1f2013-11-05 19:26:35 +000066has 'path';
Nils Diewald8e323ee2014-04-23 17:28:14 +000067has [@ATTR, @ADVANCED_ATTR];
Nils Diewald7364d1f2013-11-05 19:26:35 +000068
Nils Diewald7b847222014-04-23 11:14:00 +000069has log => sub {
70 if(Log::Log4perl->initialized()) {
71 state $log = Log::Log4perl->get_logger(__PACKAGE__);
Nils Diewald7b847222014-04-23 11:14:00 +000072 };
Akrone4c2e412016-01-28 15:10:50 +010073 state $log = KorAP::XML::Log->new;
Nils Diewald7b847222014-04-23 11:14:00 +000074 return $log;
75};
76
Akrona8665782016-01-27 21:47:57 +010077
Nils Diewald7b847222014-04-23 11:14:00 +000078sub new {
79 my $class = shift;
80 my $self = bless { @_ }, $class;
Nils Diewaldd681eab2014-11-01 01:18:25 +000081 if (exists $self->{path}) {
82 $self->{path} = rel2abs($self->{path});
83 if ($self->{path} !~ m!\/$!) {
84 $self->{path} .= '/';
85 };
Nils Diewald7b847222014-04-23 11:14:00 +000086 };
87 return $self;
88};
Nils Diewald2db9ad02013-10-29 19:26:43 +000089
90# parse document
91sub parse {
92 my $self = shift;
Nils Diewald7b847222014-04-23 11:14:00 +000093
Nils Diewald98767bb2014-04-25 20:31:19 +000094 my $data_xml = $self->path . 'data.xml';
Nils Diewald2db9ad02013-10-29 19:26:43 +000095
Nils Diewald98767bb2014-04-25 20:31:19 +000096 my ($rt, $error, $file);
97
98 my $unable = 'Unable to parse document ' . $self->path;
99
100 unless (-e $data_xml) {
101 $self->log->warn($unable . ' - no data.xml found');
102 $error = 1;
103 }
104
105 else {
Akrona8665782016-01-27 21:47:57 +0100106
Nils Diewald98767bb2014-04-25 20:31:19 +0000107 $file = b($data_xml)->slurp;
108
109 try {
Nils Diewald3cf08c72013-12-16 20:31:10 +0000110 local $SIG{__WARN__} = sub {
Nils Diewald98767bb2014-04-25 20:31:19 +0000111 $error = 1;
Nils Diewald3cf08c72013-12-16 20:31:10 +0000112 };
113 $rt = xml2hash($file, text => '#text', attr => '-')->{raw_text};
Akrona8665782016-01-27 21:47:57 +0100114 } catch {
115 $self->log->warn($unable);
116 $error = 1;
117 };
Nils Diewald3cf08c72013-12-16 20:31:10 +0000118 };
119
120 return if $error;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000121
Nils Diewald3ece6302013-12-02 18:38:16 +0000122 $self->log->debug('Parse document ' . $self->path);
Nils Diewald2db9ad02013-10-29 19:26:43 +0000123
Nils Diewald2db9ad02013-10-29 19:26:43 +0000124 # Get document id and corpus id
Nils Diewald3cf08c72013-12-16 20:31:10 +0000125 if ($rt && $rt->{'-docid'}) {
Nils Diewald840c9242014-10-28 19:51:26 +0000126 $self->text_sigle($rt->{'-docid'});
Akron194be542016-01-21 12:52:43 +0100127 if ($self->text_sigle =~ /^(([^_]+)_[^\._]+?)\..+?$/) {
Nils Diewald840c9242014-10-28 19:51:26 +0000128 $self->corpus_sigle($2);
129 $self->doc_sigle($1);
Nils Diewald2db9ad02013-10-29 19:26:43 +0000130 }
131 else {
Nils Diewald3ece6302013-12-02 18:38:16 +0000132 croak $unable . ': ID not parseable';
Nils Diewald2db9ad02013-10-29 19:26:43 +0000133 };
134 }
135 else {
Nils Diewald3ece6302013-12-02 18:38:16 +0000136 croak $unable . ': No raw_text found or no ID';
Nils Diewald2db9ad02013-10-29 19:26:43 +0000137 };
138
139 # Get primary data
Nils Diewald3cf08c72013-12-16 20:31:10 +0000140 my $pd = $rt->{text};
Nils Diewald2db9ad02013-10-29 19:26:43 +0000141 if ($pd) {
Akrone4c2e412016-01-28 15:10:50 +0100142 $self->{pd} = KorAP::XML::Document::Primary->new($pd);
Nils Diewald2db9ad02013-10-29 19:26:43 +0000143 }
144 else {
145 croak $unable;
146 };
147
Nils Diewaldd681eab2014-11-01 01:18:25 +0000148 my @path = grep { $_ } splitdir($self->path);
Nils Diewald840c9242014-10-28 19:51:26 +0000149 my @header;
150
Nils Diewaldfeccbb12015-06-18 20:06:45 +0000151 # Parse the corpus file, the doc file, and the text file for meta information
Nils Diewald840c9242014-10-28 19:51:26 +0000152 foreach (0..2) {
Nils Diewaldd681eab2014-11-01 01:18:25 +0000153 unshift @header, '/' . catfile(@path, 'header.xml');
Nils Diewald840c9242014-10-28 19:51:26 +0000154 pop @path;
155 };
Akronb2636cf2016-01-26 18:42:44 +0100156
Nils Diewald840c9242014-10-28 19:51:26 +0000157 my @type = qw/corpus doc text/;
158 foreach (@header) {
159 # Get corpus, doc and text meta data
160 my $type = shift(@type);
Akrona8665782016-01-27 21:47:57 +0100161
162 next unless -e $_;
163
164 my $slurp = b($_)->slurp;
165 $slurp =~ /^[^>]+encoding\s*=\s*(["'])([^\1]+?)\1/;
166 my $file = $slurp->decode($2 // 'UTF-8');
167
168 # Get DOM
169 my $dom = Mojo::DOM->new($file);
170
171 if ($dom->at('idsHeader') || $dom->at('idsheader')) {
172 $self->_parse_meta_i5($dom, $type);
173 }
174 else {
175 $self->_parse_meta_tei($dom, $type);
176 };
Nils Diewald840c9242014-10-28 19:51:26 +0000177 };
178
Akron14ca9f02016-01-29 19:38:18 +0100179 return $self;
180};
181
182
183sub tokenize {
184 my $self = shift;
185 my ($token_foundry, $token_layer) = @_;
186
187 $token_foundry //= 'OpenNLP';
188 $token_layer //= 'Tokens';
189
Akron941c1a62016-02-23 17:41:41 +0100190 # Create tokenizer
191 my $tokens = KorAP::XML::Tokenizer->new(
Akron14ca9f02016-01-29 19:38:18 +0100192 path => $self->path,
193 doc => $self,
194 foundry => $token_foundry,
195 layer => $token_layer,
196 name => 'tokens'
197 );
198
Akron941c1a62016-02-23 17:41:41 +0100199 # Parse tokens
Akron14ca9f02016-01-29 19:38:18 +0100200 unless ($tokens->parse) {
201 $self->log->warn(
202 'Unable to tokenize ' . $self->path .
203 ' with ' . $token_foundry . '#'
204 . $token_layer
205 );
206 }
207 else {
208 weaken $self;
209 $self->{tokenizer} = $tokens;
210 };
211
212 return $self;
213};
214
215
216# Add annotation
217sub annotate {
218 my $self = shift;
219 unless ($self->{tokenizer}) {
220 $self->log->warn('No tokenizer defined')
221 }
222 else {
223 $self->{tokenizer}->add(@_);
224 };
225
226 $self;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000227};
228
229
Akrona8665782016-01-27 21:47:57 +0100230# Store arbitrary data
231sub store {
232 my $self = shift;
233 return $self->{store} unless @_;
234 return $self->{store}->{$_[0]} if @_ == 1;
235 $self->{store}->{$_[0]} = $_[1];
236};
237
238
Nils Diewald2db9ad02013-10-29 19:26:43 +0000239# Primary data
240sub primary {
241 $_[0]->{pd};
242};
243
Nils Diewalda96de622014-10-31 17:29:23 +0000244#sub author {
245# my $self = shift;
246#
247# # Set authors
248# if ($_[0]) {
249# return $self->{authors} = [
250# grep { $_ !~ m{^\s*u\.a\.\s*$} } split(/;\s+/, shift())
251# ];
252# }
253# return ($self->{authors} // []);
254#};
Nils Diewald2db9ad02013-10-29 19:26:43 +0000255
256sub text_class {
257 my $self = shift;
258 if ($_[0]) {
259 return $self->{topics} = [ @_ ];
260 };
Akron15e6fa72016-02-04 15:02:20 +0100261 return ($self->{topics} //= []);
Nils Diewald2db9ad02013-10-29 19:26:43 +0000262};
263
Nils Diewaldfeccbb12015-06-18 20:06:45 +0000264sub text_class_string {
265 return join ' ', @{shift->text_class};
266}
267
Nils Diewald840c9242014-10-28 19:51:26 +0000268sub keywords {
269 my $self = shift;
270 if ($_[0]) {
271 return $self->{keywords} = [ @_ ];
272 };
Akron15e6fa72016-02-04 15:02:20 +0100273 return ($self->{keywords} //= []);
Nils Diewald840c9242014-10-28 19:51:26 +0000274};
275
Nils Diewaldfeccbb12015-06-18 20:06:45 +0000276sub keywords_string {
277 return join ' ', @{shift->keywords};
278}
279
280sub _remove_prefix {
Akrona8665782016-01-27 21:47:57 +0100281# return $_[0];
Nils Diewaldfeccbb12015-06-18 20:06:45 +0000282
283 # This may render some titles wrong, e.g. 'VDI nachrichten 2014' ...
284 my $title = shift;
285 my $prefix = shift;
286 $prefix =~ tr!_!/!;
287 if (index($title, $prefix) == 0) {
288 $title = substr($title, length($prefix));
289 $title =~ s/^\s+//;
290 $title =~ s/\s+$//;
291 };
292 return $title;
293};
294
Nils Diewald840c9242014-10-28 19:51:26 +0000295
Akrona8665782016-01-27 21:47:57 +0100296sub _parse_meta_tei {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000297 my $self = shift;
Akrona8665782016-01-27 21:47:57 +0100298 my $dom = shift;
Nils Diewald840c9242014-10-28 19:51:26 +0000299 my $type = shift;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000300
Akrona8665782016-01-27 21:47:57 +0100301 my $stmt;
Akronfd0707e2016-02-11 22:13:36 +0100302 if ($type eq 'text') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000303
Akronfd0707e2016-02-11 22:13:36 +0100304 # Publisher
Akrona8665782016-01-27 21:47:57 +0100305 try {
Akronfd0707e2016-02-11 22:13:36 +0100306 $self->publisher($dom->at('publisher')->all_text);
Akrona8665782016-01-27 21:47:57 +0100307 };
308
Akronfd0707e2016-02-11 22:13:36 +0100309 # Date of publication
Akrona8665782016-01-27 21:47:57 +0100310 try {
Akronfd0707e2016-02-11 22:13:36 +0100311 my $date = $dom->at('date')->all_text;
Akron8c84aa52016-02-13 21:26:54 +0100312 $self->store(sgbrDate => $date);
313 if ($date =~ s!^\s*(\d{4})-(\d{2})-(\d{2}).*$!$1$2$3!) {
Akronfd0707e2016-02-11 22:13:36 +0100314 $self->pub_date($date);
315 }
316 else {
317 $self->log->warn('"' . $date . '" is not a compatible pubDate');
Akron8c84aa52016-02-13 21:26:54 +0100318 };
Akronfd0707e2016-02-11 22:13:36 +0100319 };
Akron15e6fa72016-02-04 15:02:20 +0100320
Akronfd0707e2016-02-11 22:13:36 +0100321 # Publication place
322 try {
Akron8c84aa52016-02-13 21:26:54 +0100323 my $pp = $dom->at('pubPlace');
324 if ($pp) {
325 $self->pub_place($pp->all_text) if $pp->all_text;
326 };
327 if ($pp->attr('ref')) {
328 $self->reference($pp->attr('ref'));
329 };
Akronfd0707e2016-02-11 22:13:36 +0100330 };
Akron15e6fa72016-02-04 15:02:20 +0100331
Akronfd0707e2016-02-11 22:13:36 +0100332 if ($stmt = $dom->at('titleStmt')) {
333 # Title
334 try {
335 $stmt->find('title')->each(
336 sub {
337 my $type = $_->attr('type') || 'main';
338 $self->title($_->all_text) if $type eq 'main';
Akron15e6fa72016-02-04 15:02:20 +0100339
Akronfd0707e2016-02-11 22:13:36 +0100340 # Only support the first subtitle
341 $self->sub_title($_->all_text) if $type eq 'sub' && !$self->sub_title;
342 }
343 );
344 };
Akron15e6fa72016-02-04 15:02:20 +0100345
Akronfd0707e2016-02-11 22:13:36 +0100346 # Author
347 try {
348 my $author = $stmt->at('author')->attr('ref');
349
350 $author = $self->{ref_author}->{$author};
351
352 if ($author) {
353 my $array = $self->keywords;
354 $self->author($author->{name} // $author->{id});
355
356 if ($author->{age}) {
357 $self->store('sgbrAuthorAgeClass' => $author->{age});
358 push @$array, 'sgbrAuthorAgeClass:' . $author->{age};
359 };
360 if ($author->{sex}) {
361 $self->store('sgbrAuthorSex' => $author->{sex});
362 push @$array, 'sgbrAuthorSex:' . $author->{sex};
363 };
Akron15e6fa72016-02-04 15:02:20 +0100364 };
Akrona8665782016-01-27 21:47:57 +0100365 };
366 };
367
368 try {
369 my $kodex = $dom->at('item[rend]')->attr('rend');
Akron15e6fa72016-02-04 15:02:20 +0100370 if ($kodex) {
371 my $array = $self->keywords;
372 $self->store('sgbrKodex' => $kodex);
373 push @$array, 'sgbrKodex:' . $kodex;
374 };
Akrona8665782016-01-27 21:47:57 +0100375 };
376 }
377
378 elsif ($type eq 'doc') {
379 try {
380 $dom->find('particDesc person')->each(
381 sub {
Akron15e6fa72016-02-04 15:02:20 +0100382
Akronfd0707e2016-02-11 22:13:36 +0100383 my $hash = $self->{ref_author}->{'#' . $_->attr('xml:id')} = {
Akrona8665782016-01-27 21:47:57 +0100384 age => $_->attr('age'),
385 sex => $_->attr('sex'),
386 id => $_->attr('xml:id')
Akronfd0707e2016-02-11 22:13:36 +0100387 };
388
389 # Get name
390 if ($_->at('persName')) {
391 $hash->{name} = $_->at('persName')->all_text;
392 };
Akrona8665782016-01-27 21:47:57 +0100393 });
394 };
395
396 try {
397 my $lang = $dom->at('language[ident]')->attr('ident');
398 $self->language($lang);
399 };
400
401 try {
Akronfd0707e2016-02-11 22:13:36 +0100402 $self->store('funder', $dom->at('funder > orgName')->all_text);
403 };
404
405 try {
406 $stmt = $dom->find('fileDesc > titleStmt > title')->each(
Akrona8665782016-01-27 21:47:57 +0100407 sub {
408 my $type = $_->attr('type') || 'main';
409 $self->doc_title($_->all_text) if $type eq 'main';
Akronfd0707e2016-02-11 22:13:36 +0100410 if ($type eq 'sub') {
411 my $sub_title = $self->doc_sub_title;
412 $self->doc_sub_title(
413 ($sub_title ? $sub_title . ', ' : '') . $_->all_text
414 );
415 };
Akrona8665782016-01-27 21:47:57 +0100416 }
417 );
418 };
419 };
420 return;
421};
422
423
424
425sub _parse_meta_i5 {
426 my $self = shift;
427 my $dom = shift;
428 my $type = shift;
Nils Diewald840c9242014-10-28 19:51:26 +0000429
Akron9c0488f2016-01-28 14:17:15 +0100430 my $analytic = $dom->at('analytic') || $dom->at('monogr');
Nils Diewald2db9ad02013-10-29 19:26:43 +0000431
Nils Diewald840c9242014-10-28 19:51:26 +0000432 # There is an analytic element
Nils Diewald8e323ee2014-04-23 17:28:14 +0000433 if ($analytic) {
Nils Diewald840c9242014-10-28 19:51:26 +0000434
Nils Diewald90410c22014-11-03 21:04:05 +0000435 # Get title, subtitle, author, editor
436 my $title = $analytic->at('h\.title[type=main]');
Nils Diewald8e323ee2014-04-23 17:28:14 +0000437 my $sub_title = $analytic->at('h\.title[type=sub]');
Nils Diewald90410c22014-11-03 21:04:05 +0000438 my $author = $analytic->at('h\.author');
439 my $editor = $analytic->at('editor');
Nils Diewald2db9ad02013-10-29 19:26:43 +0000440
Nils Diewald90410c22014-11-03 21:04:05 +0000441 $title = $title ? $title->all_text : undef;
442 $sub_title = $sub_title ? $sub_title->all_text : undef;
443 $author = $author ? $author->all_text : undef;
444 $editor = $editor ? $editor->all_text : undef;
Nils Diewald840c9242014-10-28 19:51:26 +0000445
446 if ($type eq 'text') {
Nils Diewaldfeccbb12015-06-18 20:06:45 +0000447 $self->title(_remove_prefix($title, $self->text_sigle)) if $title;
Nils Diewald90410c22014-11-03 21:04:05 +0000448 $self->sub_title($sub_title) if $sub_title;
449 $self->editor($editor) if $editor;
450 $self->author($author) if $author;
451 }
452 elsif ($type eq 'doc') {
Nils Diewaldfeccbb12015-06-18 20:06:45 +0000453 $self->doc_title(_remove_prefix($title, $self->doc_sigle)) if $title;
Nils Diewald90410c22014-11-03 21:04:05 +0000454 $self->doc_sub_title($sub_title) if $sub_title;
455 $self->doc_author($author) if $author;
456 $self->doc_editor($editor) if $editor;
Nils Diewald840c9242014-10-28 19:51:26 +0000457 }
458 elsif ($type eq 'corpus') {
Nils Diewaldfeccbb12015-06-18 20:06:45 +0000459 $self->corpus_title(_remove_prefix($title, $self->corpus_sigle)) if $title;
Nils Diewald90410c22014-11-03 21:04:05 +0000460 $self->corpus_sub_title($sub_title) if $sub_title;
461 $self->corpus_author($author) if $author;
462 $self->corpus_editor($editor) if $editor;
Nils Diewald840c9242014-10-28 19:51:26 +0000463 };
Nils Diewald90410c22014-11-03 21:04:05 +0000464 };
Nils Diewald840c9242014-10-28 19:51:26 +0000465
Nils Diewald90410c22014-11-03 21:04:05 +0000466 # Not in analytic
467 if ($type eq 'corpus') {
Nils Diewaldfeccbb12015-06-18 20:06:45 +0000468 unless ($self->corpus_title) {
469 if (my $title = $dom->at('fileDesc > titleStmt > c\.title')) {
Akrona8665782016-01-27 21:47:57 +0100470 $self->corpus_title(_remove_prefix($title->all_text, $self->corpus_sigle))
471 if $title->all_text;
Nils Diewaldfeccbb12015-06-18 20:06:45 +0000472 };
Nils Diewald90410c22014-11-03 21:04:05 +0000473 };
474 }
475
476 # doc title
477 elsif ($type eq 'doc') {
Nils Diewaldfeccbb12015-06-18 20:06:45 +0000478 unless ($self->doc_title) {
479 if (my $title = $dom->at('fileDesc > titleStmt > d\.title')) {
Akrona8665782016-01-27 21:47:57 +0100480 $self->doc_title(_remove_prefix($title->all_text, $self->doc_sigle))
481 if $title->all_text;
Nils Diewaldfeccbb12015-06-18 20:06:45 +0000482 };
Nils Diewald90410c22014-11-03 21:04:05 +0000483 };
484 }
485
486 # text title
487 elsif ($type eq 'text') {
488 unless ($self->title) {
489 if (my $title = $dom->at('fileDesc > titleStmt > t\.title')) {
Akrona8665782016-01-27 21:47:57 +0100490 $self->title(_remove_prefix($title->all_text, $self->text_sigle))
491 if $title->all_text;
492 }
Nils Diewald90410c22014-11-03 21:04:05 +0000493 };
Nils Diewald8e323ee2014-04-23 17:28:14 +0000494 };
495
496 # Get PubPlace
Nils Diewald90410c22014-11-03 21:04:05 +0000497 if (my $place = $dom->at('pubPlace')) {
498 $self->pub_place($place->all_text) if $place->all_text;
Akrona8665782016-01-27 21:47:57 +0100499 $self->pub_place_key($place->attr('key')) if $place->attr('key');
Nils Diewald90410c22014-11-03 21:04:05 +0000500 };
Nils Diewald8e323ee2014-04-23 17:28:14 +0000501
502 # Get Publisher
Nils Diewald90410c22014-11-03 21:04:05 +0000503 if (my $publisher = $dom->at('imprint publisher')) {
504 $self->publisher($publisher->all_text) if $publisher->all_text;
505 };
Nils Diewald8e323ee2014-04-23 17:28:14 +0000506
Nils Diewald840c9242014-10-28 19:51:26 +0000507 # Get text type
508 my $text_desc = $dom->at('textDesc');
509
510 if ($text_desc) {
Nils Diewald90410c22014-11-03 21:04:05 +0000511 if (my $text_type = $text_desc->at('textType')) {
512 $self->text_type($text_type->all_text) if $text_type->all_text;
513 };
Nils Diewald840c9242014-10-28 19:51:26 +0000514
515 # Get text domain
Nils Diewald90410c22014-11-03 21:04:05 +0000516 if (my $text_domain = $text_desc->at('textDomain')) {
517 $self->text_domain($text_domain->all_text) if $text_domain->all_text;
518 };
Nils Diewald840c9242014-10-28 19:51:26 +0000519
520 # Get text type art
Nils Diewald90410c22014-11-03 21:04:05 +0000521 if (my $text_type_art = $text_desc->at('textTypeArt')) {
522 $self->text_type_art($text_type_art->all_text) if $text_type_art->all_text;
523 };
Nils Diewald840c9242014-10-28 19:51:26 +0000524
525 # Get text type art
Nils Diewald90410c22014-11-03 21:04:05 +0000526 if (my $text_type_ref = $text_desc->at('textTypeRef')) {
527 $self->text_type_ref($text_type_ref->all_text) if $text_type_ref->all_text;
528 };
Nils Diewald840c9242014-10-28 19:51:26 +0000529 };
530
Akrona8665782016-01-27 21:47:57 +0100531 # Availability
532 try {
533 $self->availability(
534 $dom->at('availability')->all_text
535 );
536 };
537
Nils Diewald840c9242014-10-28 19:51:26 +0000538 # Get pubDate
539 my $pub_date = $dom->find('pubDate[type=year]');
540 $pub_date->each(
Nils Diewald2db9ad02013-10-29 19:26:43 +0000541 sub {
Nils Diewald840c9242014-10-28 19:51:26 +0000542 my $x = shift->parent;
543 my $year = $x->at("pubDate[type=year]");
544 return unless $year;
545
546 $year = $year ? $year->text : 0;
547 my $month = $x->at("pubDate[type=month]");
548 $month = $month ? $month->text : 0;
549 my $day = $x->at("pubDate[type=day]");
550 $day = $day ? $day->text : 0;
551
Nils Diewald90410c22014-11-03 21:04:05 +0000552 $year = 0 if $year !~ /^\d+$/;
Nils Diewald840c9242014-10-28 19:51:26 +0000553 $month = 0 if $month !~ /^\d+$/;
Nils Diewald90410c22014-11-03 21:04:05 +0000554 $day = 0 if $day !~ /^\d+$/;
Nils Diewald840c9242014-10-28 19:51:26 +0000555
556 my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
557 $date .= length($month) == 1 ? '0' . $month : $month;
558 $date .= length($day) == 1 ? '0' . $day : $day;
559 $self->pub_date($date);
560 });
561
562 # creatDate
563 my $create_date = $dom->at('creatDate');
Nils Diewald90410c22014-11-03 21:04:05 +0000564 if ($create_date && $create_date->text) {
Nils Diewald840c9242014-10-28 19:51:26 +0000565 $create_date = $create_date->all_text;
566 if (index($create_date, '-') > -1) {
567 $self->log->warn("Creation date ranges are not supported");
568 ($create_date) = split /\s*-\s*/, $create_date;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000569 }
Nils Diewald840c9242014-10-28 19:51:26 +0000570
571 $create_date =~ s{^(\d{4})$}{$1\.00};
572 $create_date =~ s{^(\d{4})\.(\d{2})$}{$1\.$2\.00};
573 if ($create_date =~ /^\d{4}\.\d{2}\.\d{2}$/) {
574 $create_date =~ tr/\.//d;
575 $self->creation_date($create_date);
576 };
577 };
578
579 my $text_class = $dom->at('textClass');
580 if ($text_class) {
581 # Get textClasses
582 my @topic;
583
584 $text_class->find("catRef")->each(
585 sub {
586 my ($ign, @ttopic) = split('\.', $_->attr('target'));
587 push(@topic, @ttopic);
588 }
589 );
590 $self->text_class(@topic) if @topic > 0;
591
Akron15e6fa72016-02-04 15:02:20 +0100592 my $kws = $self->keywords;
Nils Diewald840c9242014-10-28 19:51:26 +0000593 my @keywords = $text_class->find("h\.keywords > keyTerm")->each;
Akron15e6fa72016-02-04 15:02:20 +0100594 push(@$kws, @keywords) if @keywords > 0;
Nils Diewald840c9242014-10-28 19:51:26 +0000595 };
596
597 if (my $edition_statement = $dom->at('biblFull editionStmt')) {
Nils Diewald90410c22014-11-03 21:04:05 +0000598 $self->bibl_edition_statement($edition_statement->all_text)
599 if $edition_statement->text;
Nils Diewald840c9242014-10-28 19:51:26 +0000600 };
601
602 if (my $edition_statement = $dom->at('fileDescl editionStmt')) {
Nils Diewald90410c22014-11-03 21:04:05 +0000603 $self->file_edition_statement($edition_statement->all_text)
604 if $edition_statement->text;
Nils Diewald840c9242014-10-28 19:51:26 +0000605 };
606
607 if (my $file_desc = $dom->at('fileDesc')) {
608 if (my $availability = $file_desc->at('publicationStmt > availability')) {
609 $self->license($availability->all_text);
610 };
611 };
612
Nils Diewald90410c22014-11-03 21:04:05 +0000613 # Some meta data only available in the corpus
Nils Diewald840c9242014-10-28 19:51:26 +0000614 if ($type eq 'corpus') {
615 if (my $language = $dom->at('profileDesc > langUsage > language[id]')) {
616 $self->language($language->attr('id'));
617 };
618 }
619
Nils Diewald90410c22014-11-03 21:04:05 +0000620 # Some meta data only reevant from the text
Nils Diewald840c9242014-10-28 19:51:26 +0000621 elsif ($type eq 'text') {
622
623 if (my $reference = $dom->at('sourceDesc reference[type=complete]')) {
624 if (my $ref_text = $reference->all_text) {
Nils Diewald90410c22014-11-03 21:04:05 +0000625 $ref_text =~ s!^[a-zA-Z0-9]+\/[a-zA-Z0-9]+\.\d+[\s:]\s*!!;
626 $self->reference($ref_text);
627 };
Nils Diewald840c9242014-10-28 19:51:26 +0000628 };
629
630 my $column = $dom->at('textDesc > column');
631 $self->text_column($column->all_text) if $column;
632
633 if (my $pages = $dom->at('biblStruct biblScope[type="pp"]')) {
634 $pages = $pages->all_text;
635 if ($pages && $pages =~ m/(\d+)\s*-\s*(\d+)/) {
636 $self->pages($1 . '-' . $2);
637 };
638 };
Nils Diewald90410c22014-11-03 21:04:05 +0000639 };
Nils Diewald2db9ad02013-10-29 19:26:43 +0000640};
641
Nils Diewald840c9242014-10-28 19:51:26 +0000642
Nils Diewaldfeccbb12015-06-18 20:06:45 +0000643
Nils Diewald7364d1f2013-11-05 19:26:35 +0000644sub to_string {
645 my $self = shift;
646
647 my $string;
648
649 foreach (@ATTR) {
650 if (my $att = $self->$_) {
651 $att =~ s/\n/ /g;
652 $att =~ s/\s\s+/ /g;
653 $string .= $_ . ' = ' . $att . "\n";
654 };
655 };
656
Akron31d788e2016-02-05 20:49:03 +0100657 $string .= 'text_class = ' . $self->text_class_string . "\n";
658 $string .= 'keywords = ' . $self->keywords_string . "\n";
Nils Diewald7364d1f2013-11-05 19:26:35 +0000659
660 return $string;
661};
662
Nils Diewald044c41d2013-11-11 21:45:09 +0000663sub _k {
664 my $x = $_[0];
665 $x =~ s/_(\w)/\U$1\E/g;
666 $x =~ s/id$/ID/gi;
667 return $x;
668};
669
Nils Diewald7364d1f2013-11-05 19:26:35 +0000670
671sub to_hash {
672 my $self = shift;
673
Nils Diewald840c9242014-10-28 19:51:26 +0000674 $self->parse unless $self->text_sigle;
Nils Diewald7b847222014-04-23 11:14:00 +0000675
Nils Diewald7364d1f2013-11-05 19:26:35 +0000676 my %hash;
677
Akron12ef05b2016-01-31 23:58:53 +0100678 foreach (@ATTR, @ADVANCED_ATTR, 'store') {
Nils Diewald7364d1f2013-11-05 19:26:35 +0000679 if (my $att = $self->$_) {
680 $att =~ s/\n/ /g;
681 $att =~ s/\s\s+/ /g;
Nils Diewald044c41d2013-11-11 21:45:09 +0000682 $hash{_k($_)} = $att;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000683 };
684 };
685
Nils Diewald840c9242014-10-28 19:51:26 +0000686 for (qw/text_class keywords/) {
687 my @array = @{ $self->$_ };
688 next unless @array;
689 $hash{_k($_)} = join(' ', @array);
Nils Diewald37e5b572013-11-20 20:26:03 +0000690 };
691
Nils Diewald7364d1f2013-11-05 19:26:35 +0000692 return \%hash;
693};
694
Akron14ca9f02016-01-29 19:38:18 +0100695# Todo: Make this a KoralQuery serializer
696sub to_koral_query {
697 my $self = shift;
Akron941c1a62016-02-23 17:41:41 +0100698 my $hash = {};
699 $hash->{'@context'} = 'http://korap.ids-mannheim.de/ns/koral/0.4/context.jsonld';
700 $hash->{'@type'} = 'koral:corpus';
701# $hash->{'text'} = $self->primary->data;
702# my $hash = $self->to_hash;
Akron14ca9f02016-01-29 19:38:18 +0100703};
Nils Diewald7b847222014-04-23 11:14:00 +0000704
Akron941c1a62016-02-23 17:41:41 +0100705
Akron31d788e2016-02-05 20:49:03 +0100706sub to_json {
707 my $self = shift;
708 unless ($self->{tokenizer}) {
709 $self->log->warn('No tokenizer defined');
710 return;
711 };
712
713 return $self->{tokenizer}->to_json;
714};
715
716
Nils Diewald2db9ad02013-10-29 19:26:43 +00007171;
718
719
720__END__
721
722=pod
723
Akron31d788e2016-02-05 20:49:03 +0100724=encoding utf8
725
Nils Diewald2db9ad02013-10-29 19:26:43 +0000726=head1 NAME
727
Akron31d788e2016-02-05 20:49:03 +0100728KorAP::XML::Krill - Preprocess KorAP XML documents for Krill
Nils Diewald2db9ad02013-10-29 19:26:43 +0000729
730
731=head1 SYNOPSIS
732
Akron31d788e2016-02-05 20:49:03 +0100733 # Create Converter Object
Akrone4c2e412016-01-28 15:10:50 +0100734 my $doc = KorAP::XML::Krill->new(
Nils Diewald2db9ad02013-10-29 19:26:43 +0000735 path => 'mydoc-1/'
736 );
737
Akron31d788e2016-02-05 20:49:03 +0100738 # Convert to krill json
739 print $doc->parse->tokenize->annotate('Mate', 'Morpho')->to_json;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000740
741
742=head1 DESCRIPTION
743
Akron31d788e2016-02-05 20:49:03 +0100744Parse the primary and meta data of a KorAP-XML document.
Nils Diewald2db9ad02013-10-29 19:26:43 +0000745
746
Akron31d788e2016-02-05 20:49:03 +0100747=head1 ATTRIBUTES
Nils Diewald2db9ad02013-10-29 19:26:43 +0000748
Akron31d788e2016-02-05 20:49:03 +0100749=head2 log
Nils Diewald2db9ad02013-10-29 19:26:43 +0000750
Akron31d788e2016-02-05 20:49:03 +0100751L<Log::Log4perl> object for logging.
Nils Diewald2db9ad02013-10-29 19:26:43 +0000752
753=head2 path
754
755 $doc->path("example-004/");
756 print $doc->path;
757
758The path of the document.
759
760
Nils Diewald2db9ad02013-10-29 19:26:43 +0000761=head2 primary
762
763 print $doc->primary->data(0,20);
764
Akrone4c2e412016-01-28 15:10:50 +0100765The L<KorAP::XML::Document::Primary> object containing the primary data.
Nils Diewald2db9ad02013-10-29 19:26:43 +0000766
767
Nils Diewald2db9ad02013-10-29 19:26:43 +0000768=head1 METHODS
769
Akron31d788e2016-02-05 20:49:03 +0100770=head2 annotate
771
772 $doc->add('Mate', 'Morpho');
773
774Add annotation layer to conversion process.
775
776
Nils Diewald2db9ad02013-10-29 19:26:43 +0000777=head2 parse
778
Akron31d788e2016-02-05 20:49:03 +0100779 $doc = $doc->parse;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000780
Akron31d788e2016-02-05 20:49:03 +0100781Run the meta parsing process of the document.
Nils Diewald2db9ad02013-10-29 19:26:43 +0000782
783
Akron31d788e2016-02-05 20:49:03 +0100784=head2 tokenize
785
786 $doc = $doc->tokenize('OpenNLP', 'Tokens');
787
788Accept the tokenization based on a given foundry and a given layer.
789
790
791=head1 AVAILABILITY
792
793 https://github.com/KorAP/KorAP-XML-Krill
794
795
796=head1 COPYRIGHT AND LICENSE
797
798Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
799Author: L<Nils Diewald|http://nils-diewald.de/>
800
801KorAP::XML::Krill is developed as part of the
802L<KorAP|http://korap.ids-mannheim.de/>
803Corpus Analysis Platform at the
804L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
805member of the
806L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>
807and supported by the L<KobRA|http://www.kobra.tu-dortmund.de> project,
808funded by the
809L<Federal Ministry of Education and Research (BMBF)|http://www.bmbf.de/en/>.
810
811KorAP::XML::Krill is free software published under the
812L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
813
Nils Diewald2db9ad02013-10-29 19:26:43 +0000814=cut