blob: b4ab7def989d4c2d650c1955f266b558aefa1935 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001package KorAP::Document;
2use Mojo::Base -base;
3use v5.16;
Nils Diewald2db9ad02013-10-29 19:26:43 +00004use Mojo::ByteStream 'b';
Nils Diewald7b847222014-04-23 11:14:00 +00005use Mojo::Util qw/encode/;
Nils Diewald3cf08c72013-12-16 20:31:10 +00006use XML::Fast;
7use Try::Tiny;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Carp qw/croak/;
Nils Diewald2db9ad02013-10-29 19:26:43 +00009use KorAP::Document::Primary;
Nils Diewald7b847222014-04-23 11:14:00 +000010use Log::Log4perl;
11use KorAP::Log;
12use Mojo::DOM;
13use Data::Dumper;
Nils Diewald2db9ad02013-10-29 19:26:43 +000014
Nils Diewald8e323ee2014-04-23 17:28:14 +000015our @ATTR = qw/id
16 corpus_id
17 pub_date
18 title
19 sub_title
20 pub_place/;
21
22our @ADVANCED_ATTR = qw/publisher
23 editor
24 text_type
25 text_type_art
26 creation_date
27 coll_title
28 coll_sub_title
29 coll_author
30 coll_editor
31 /;
32
Nils Diewald7364d1f2013-11-05 19:26:35 +000033has 'path';
Nils Diewald8e323ee2014-04-23 17:28:14 +000034has [@ATTR, @ADVANCED_ATTR];
Nils Diewald7364d1f2013-11-05 19:26:35 +000035
Nils Diewald7b847222014-04-23 11:14:00 +000036has log => sub {
37 if(Log::Log4perl->initialized()) {
38 state $log = Log::Log4perl->get_logger(__PACKAGE__);
Nils Diewald7b847222014-04-23 11:14:00 +000039 };
40 state $log = KorAP::Log->new;
41 return $log;
42};
43
44sub new {
45 my $class = shift;
46 my $self = bless { @_ }, $class;
47 if (exists $self->{path} && $self->{path} !~ m!\/$!) {
48 $self->{path} .= '/';
49 };
50 return $self;
51};
Nils Diewald2db9ad02013-10-29 19:26:43 +000052
53# parse document
54sub parse {
55 my $self = shift;
Nils Diewald7b847222014-04-23 11:14:00 +000056
Nils Diewald98767bb2014-04-25 20:31:19 +000057 my $data_xml = $self->path . 'data.xml';
Nils Diewald2db9ad02013-10-29 19:26:43 +000058
Nils Diewald98767bb2014-04-25 20:31:19 +000059 my ($rt, $error, $file);
60
61 my $unable = 'Unable to parse document ' . $self->path;
62
63 unless (-e $data_xml) {
64 $self->log->warn($unable . ' - no data.xml found');
65 $error = 1;
66 }
67
68 else {
69 $file = b($data_xml)->slurp;
70
71 try {
Nils Diewald3cf08c72013-12-16 20:31:10 +000072 local $SIG{__WARN__} = sub {
Nils Diewald98767bb2014-04-25 20:31:19 +000073 $error = 1;
Nils Diewald3cf08c72013-12-16 20:31:10 +000074 };
75 $rt = xml2hash($file, text => '#text', attr => '-')->{raw_text};
Nils Diewald98767bb2014-04-25 20:31:19 +000076 }
77 catch {
78 $self->log->warn($unable);
79 $error = 1;
80 };
Nils Diewald3cf08c72013-12-16 20:31:10 +000081 };
82
83 return if $error;
Nils Diewald2db9ad02013-10-29 19:26:43 +000084
Nils Diewald3ece6302013-12-02 18:38:16 +000085 $self->log->debug('Parse document ' . $self->path);
Nils Diewald2db9ad02013-10-29 19:26:43 +000086
Nils Diewald2db9ad02013-10-29 19:26:43 +000087 # Get document id and corpus id
Nils Diewald3cf08c72013-12-16 20:31:10 +000088 if ($rt && $rt->{'-docid'}) {
89 $self->id($rt->{'-docid'});
Nils Diewald2db9ad02013-10-29 19:26:43 +000090 if ($self->id =~ /^([^_]+)_/) {
91 $self->corpus_id($1);
92 }
93 else {
Nils Diewald3ece6302013-12-02 18:38:16 +000094 croak $unable . ': ID not parseable';
Nils Diewald2db9ad02013-10-29 19:26:43 +000095 };
96 }
97 else {
Nils Diewald3ece6302013-12-02 18:38:16 +000098 croak $unable . ': No raw_text found or no ID';
Nils Diewald2db9ad02013-10-29 19:26:43 +000099 };
100
101 # Get primary data
Nils Diewald3cf08c72013-12-16 20:31:10 +0000102 my $pd = $rt->{text};
Nils Diewald2db9ad02013-10-29 19:26:43 +0000103 if ($pd) {
Nils Diewald3cf08c72013-12-16 20:31:10 +0000104 $self->{pd} = KorAP::Document::Primary->new($pd);
Nils Diewald2db9ad02013-10-29 19:26:43 +0000105 }
106 else {
107 croak $unable;
108 };
109
110 # Get meta data
111 $self->_parse_meta;
112 return 1;
113};
114
115
116# Primary data
117sub primary {
118 $_[0]->{pd};
119};
120
121sub author {
122 my $self = shift;
123
124 # Set authors
125 if ($_[0]) {
126 return $self->{authors} = [
127 grep { $_ !~ m{^\s*u\.a\.\s*$} } split(/;\s+/, shift())
128 ];
129 }
130 return ($self->{authors} // []);
131};
132
133sub text_class {
134 my $self = shift;
135 if ($_[0]) {
136 return $self->{topics} = [ @_ ];
137 };
138 return ($self->{topics} // []);
139};
140
Nils Diewald2db9ad02013-10-29 19:26:43 +0000141sub _parse_meta {
142 my $self = shift;
143
Nils Diewald7364d1f2013-11-05 19:26:35 +0000144 my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1');
Nils Diewald2db9ad02013-10-29 19:26:43 +0000145
146 my $dom = Mojo::DOM->new($file);
Nils Diewald682feb02013-11-29 22:48:40 +0000147 my $analytic = $dom->at('analytic');
Nils Diewald2db9ad02013-10-29 19:26:43 +0000148
Nils Diewald8e323ee2014-04-23 17:28:14 +0000149 if ($analytic) {
150 # Get title
151 my $title = $analytic->at('h\.title[type=main]');
152 $self->title($title->text) if $title;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000153
Nils Diewald8e323ee2014-04-23 17:28:14 +0000154 # Get Subtitle
155 my $sub_title = $analytic->at('h\.title[type=sub]');
156 $self->sub_title($sub_title->text) if $sub_title;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000157
Nils Diewald8e323ee2014-04-23 17:28:14 +0000158 # Get Author
159 my $author = $analytic->at('h\.author');
160 $self->author($author->all_text) if $author;
161
162 # Get Editor
163 my $editor = $analytic->at('editor');
164 $self->editor($editor->all_text) if $editor;
165 };
166
167 # Get PubPlace
168 my $place = $dom->at('pubPlace');
169 $self->pub_place($place->all_text) if $place;
170
171 # Get Publisher
172 my $publisher = $dom->at('publisher');
173 $self->publisher($publisher->all_text) if $publisher;
174
175 my $mono = $dom->at('monogr');
176 if ($mono) {
177 # Get title
178 my $title = $mono->at('h\.title[type=main]');
179
180 # It's a monograph
181 if (!$self->title) {
182 $self->title($title->text) if $title;
183
184 # Get Subtitle
185 my $sub_title = $mono->at('h\.title[type=sub]');
186 $self->sub_title($sub_title->text) if $sub_title;
187
188 }
189 else {
190 $self->coll_title($title->text) if $title;
191
192 # Get Subtitle
193 my $sub_title = $mono->at('h\.title[type=sub]');
194 $self->coll_sub_title($sub_title->text) if $sub_title;
195 };
196
197 # Get Author
198 my $author = $mono->at('h\.author');
199 $self->coll_author($author->all_text) if $author;
200
201 # Get editor
202 my $editor = $mono->at('editor');
203 $self->coll_editor($editor->all_text) if $editor;
204 };
205
206 # Get text type
207 my $text_type = $dom->at('textDesc textType');
208 $self->text_type($text_type->all_text) if $text_type;
209
210 # Get text type
211 my $text_type_art = $dom->at('textDesc textTypeArt');
212 $self->text_type_art($text_type_art->all_text) if $text_type_art;
213
Nils Diewald2db9ad02013-10-29 19:26:43 +0000214
215 # Get pubDate
216 my $year = $dom->at("pubDate[type=year]");
217 $year = $year ? $year->text : 0;
218 my $month = $dom->at("pubDate[type=month]");
219 $month = $month ? $month->text : 0;
220 my $day = $dom->at("pubDate[type=day]");
221 $day = $day ? $day->text : 0;
222
Nils Diewald092178e2013-11-26 16:18:48 +0000223 $year = 0 if $year !~ /^\d+$/;
224 $month = 0 if $month !~ /^\d+$/;
225 $day = 0 if $day !~ /^\d+$/;
226
Nils Diewald2db9ad02013-10-29 19:26:43 +0000227 my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
228 $date .= length($month) == 1 ? '0' . $month : $month;
229 $date .= length($day) == 1 ? '0' . $day : $day;
230
231 $self->pub_date($date);
232
Nils Diewald8e323ee2014-04-23 17:28:14 +0000233 # creatDate
234 my $createdate = $dom->at('creatDate');
235 if ($createdate) {
236 $createdate = $createdate->all_text;
237 if (index($createdate, '-') > -1) {
238 $self->log->warn("Creation date ranges are not supported yet");
239 }
240 else {
241 $createdate =~ s{^(\d{4})$}{$1\.00};
242 $createdate =~ s{^(\d{4})\.(\d{2})$}{$1\.$2\.00};
243 if ($createdate =~ /^\d{4}\.\d{2}\.\d{2}$/) {
244 $createdate =~ tr/\.//d;
245 $self->creation_date($createdate);
246 };
247 };
248 };
249
Nils Diewald2db9ad02013-10-29 19:26:43 +0000250 # Get textClasses
251 my @topic;
252 $dom->find("textClass catRef")->each(
253 sub {
254 my ($ign, @ttopic) = split('\.', $_->attr('target'));
255 push(@topic, @ttopic);
256 }
257 );
258 $self->text_class(@topic);
259};
260
Nils Diewald7364d1f2013-11-05 19:26:35 +0000261sub to_string {
262 my $self = shift;
263
264 my $string;
265
266 foreach (@ATTR) {
267 if (my $att = $self->$_) {
268 $att =~ s/\n/ /g;
269 $att =~ s/\s\s+/ /g;
270 $string .= $_ . ' = ' . $att . "\n";
271 };
272 };
273
274 if ($self->author) {
275 foreach (@{$self->author}) {
276 $_ =~ s/\n/ /g;
277 $_ =~ s/\s\s+/ /g;
278 $string .= 'author = ' . $_ . "\n";
279 };
280 };
281
282 if ($self->text_class) {
283 foreach (@{$self->text_class}) {
284 $string .= 'text_class = ' . $_ . "\n";
285 };
286 };
287
288 return $string;
289};
290
Nils Diewald044c41d2013-11-11 21:45:09 +0000291sub _k {
292 my $x = $_[0];
293 $x =~ s/_(\w)/\U$1\E/g;
294 $x =~ s/id$/ID/gi;
295 return $x;
296};
297
Nils Diewald7364d1f2013-11-05 19:26:35 +0000298
299sub to_hash {
300 my $self = shift;
301
Nils Diewald7b847222014-04-23 11:14:00 +0000302 $self->parse unless $self->id;
303
Nils Diewald7364d1f2013-11-05 19:26:35 +0000304 my %hash;
305
Nils Diewald044c41d2013-11-11 21:45:09 +0000306 foreach (@ATTR) {
Nils Diewald7364d1f2013-11-05 19:26:35 +0000307 if (my $att = $self->$_) {
308 $att =~ s/\n/ /g;
309 $att =~ s/\s\s+/ /g;
Nils Diewald044c41d2013-11-11 21:45:09 +0000310 $hash{_k($_)} = $att;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000311 };
312 };
313
Nils Diewald37e5b572013-11-20 20:26:03 +0000314 for ('author') {
Nils Diewald044c41d2013-11-11 21:45:09 +0000315 $hash{_k($_)} = join(',', @{ $self->$_ });
316 };
317
Nils Diewald37e5b572013-11-20 20:26:03 +0000318 for ('text_class') {
319 $hash{_k($_)} = join(' ', @{ $self->$_ });
320 };
321
Nils Diewald7364d1f2013-11-05 19:26:35 +0000322 return \%hash;
323};
324
325
Nils Diewald7b847222014-04-23 11:14:00 +0000326
327sub _parse_meta_fast {
328 my $self = shift;
329
330 # my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1');
331 my $file = b($self->path . 'header.xml')->slurp;
332
333 my ($meta, $error);
Nils Diewald98767bb2014-04-25 20:31:19 +0000334 my $unable = 'Unable to parse document ' . $self->path;
Nils Diewald7b847222014-04-23 11:14:00 +0000335
336 try {
337 local $SIG{__WARN__} = sub {
338 $error = 1;
339 };
340 $meta = xml2hash($file, text => '#text', attr => '-', array => ['h.title', 'imprint', 'catRef', 'h.author'])->{idsHeader};
341 }
342 catch {
343 $self->log->warn($unable);
344 $error = 1;
345 };
346
347 return if $error;
348
349 my $bibl_struct = $meta->{fileDesc}->{sourceDesc}->{biblStruct};
350 my $analytic = $bibl_struct->{analytic};
351
352 my $titles = $analytic->{'h.title'};
353 foreach (@$titles) {
354 if ($_->{'-type'} eq 'main') {
355 $self->title($_->{'#text'});
356 }
357 elsif ($_->{'-type'} eq 'sub') {
358 $self->sub_title($_->{'#text'});
359 };
360 };
361
362 # Get Author
363 if (my $author = $analytic->{'h.author'}) {
364 $self->author($author->[0]);
365 };
366
367 # Get pubDate
368 my $date = $bibl_struct->{monogr}->{imprint};
369 my ($year, $month, $day) = (0,0,0);
370 foreach (@$date) {
371 warn $date;
372 if ($date->{-type} eq 'year') {
373 $year = $date->{'#text'};
374 }
375 elsif ($date->{-type} eq 'month') {
376 $month = $date->{'#text'};
377 }
378 elsif ($date->{-type} eq 'day') {
379 $day = $date->{'#text'};
380 };
381 };
382
383 $year = 0 if $year !~ /^\d+$/;
384 $month = 0 if $month !~ /^\d+$/;
385 $day = 0 if $day !~ /^\d+$/;
386
387 $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
388 $date .= length($month) == 1 ? '0' . $month : $month;
389 $date .= length($day) == 1 ? '0' . $day : $day;
390
391 $self->pub_date($date);
392
393 # Get textClasses
394 my @topic;
395 my $textClass = $meta->{profileDesc}->{textClass}->{catRef};
396 foreach (@$textClass) {
397 my ($ign, @ttopic) = split('\.', $_->{'-target'});
398 push(@topic, @ttopic);
399 };
400 $self->text_class(@topic);
401};
402
403
404
Nils Diewald2db9ad02013-10-29 19:26:43 +00004051;
406
407
408__END__
409
410=pod
411
412=head1 NAME
413
414KorAP::Document
415
416
417=head1 SYNOPSIS
418
419 my $doc = KorAP::Document->new(
420 path => 'mydoc-1/'
421 );
422
423 $doc->parse;
424
425 print $doc->title;
426
427
428=head1 DESCRIPTION
429
430Parse the primary and meta data of a document.
431
432
433=head2 ATTRIBUTES
434
435=head2 id
436
437 $doc->id(75476);
438 print $doc->id;
439
440The unique identifier of the document.
441
442
443=head2 corpus_id
444
445 $doc->corpus_id(4);
446 print $doc->corpus_id;
447
448The unique identifier of the corpus.
449
450
451=head2 path
452
453 $doc->path("example-004/");
454 print $doc->path;
455
456The path of the document.
457
458
459=head2 title
460
461 $doc->title("Der Name der Rose");
462 print $doc->title;
463
464The title of the document.
465
466
467=head2 sub_title
468
469 $doc->sub_title("Natürlich eine Handschrift");
470 print $doc->sub_title;
471
472The title of the document.
473
474
475=head2 pub_place
476
477 $doc->pub_place("Rom");
478 print $doc->pub_place;
479
480The publication place of the document.
481
482
483=head2 pub_date
484
485 $doc->pub_place("19800404");
486 print $doc->pub_place;
487
488The publication date of the document,
489in the format "YYYYMMDD".
490
491
492=head2 primary
493
494 print $doc->primary->data(0,20);
495
496The L<KorAP::Document::Primary> object containing the primary data.
497
498
499=head2 author
500
501 $doc->author('Binks, Jar Jar; Luke Skywalker');
502 print $doc->author->[0];
503
504Set the author value as semikolon separated list of names or
505get an array reference of author names.
506
507=head2 text_class
508
509 $doc->text_class(qw/news sports/);
510 print $doc->text_class->[0];
511
512Set the text class as an array or get an array
513reference of text classes.
514
515
516=head1 METHODS
517
518=head2 parse
519
520 $doc->parse;
521
522Run the parsing process of the document
523
524
525=cut