blob: b9bdfe307a1bed3229b081b00c2fa08cb4ea8dcf [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001package KorAP::Document;
2use Mojo::Base -base;
Nils Diewald2db9ad02013-10-29 19:26:43 +00003use Mojo::ByteStream 'b';
Nils Diewald7b847222014-04-23 11:14:00 +00004use Mojo::Util qw/encode/;
Nils Diewald3cf08c72013-12-16 20:31:10 +00005use XML::Fast;
6use Try::Tiny;
Nils Diewald7364d1f2013-11-05 19:26:35 +00007use Carp qw/croak/;
Nils Diewald2db9ad02013-10-29 19:26:43 +00008use KorAP::Document::Primary;
Nils Diewald7b847222014-04-23 11:14:00 +00009use Log::Log4perl;
10use KorAP::Log;
11use Mojo::DOM;
12use Data::Dumper;
Nils Diewaldd681eab2014-11-01 01:18:25 +000013use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000014
Nils Diewald840c9242014-10-28 19:51:26 +000015our @ATTR = qw/text_sigle
16 doc_sigle
17 corpus_sigle
Nils Diewald90410c22014-11-03 21:04:05 +000018
Nils Diewald8e323ee2014-04-23 17:28:14 +000019 pub_date
20 title
21 sub_title
Nils Diewalda96de622014-10-31 17:29:23 +000022 pub_place
23 author/;
Nils Diewald8e323ee2014-04-23 17:28:14 +000024
25our @ADVANCED_ATTR = qw/publisher
26 editor
27 text_type
28 text_type_art
Nils Diewald840c9242014-10-28 19:51:26 +000029 text_type_ref
30 text_column
31 text_domain
Nils Diewald8e323ee2014-04-23 17:28:14 +000032 creation_date
Nils Diewald840c9242014-10-28 19:51:26 +000033 license
Nils Diewald840c9242014-10-28 19:51:26 +000034 pages
Nils Diewald840c9242014-10-28 19:51:26 +000035 file_edition_statement
36 bibl_edition_statement
Nils Diewald840c9242014-10-28 19:51:26 +000037 reference
Nils Diewald840c9242014-10-28 19:51:26 +000038 language
Nils Diewald90410c22014-11-03 21:04:05 +000039
40 doc_title
41 doc_sub_title
42 doc_editor
43 doc_author
44
45 corpus_author
Nils Diewald840c9242014-10-28 19:51:26 +000046 corpus_title
47 corpus_sub_title
Nils Diewald90410c22014-11-03 21:04:05 +000048 corpus_editor
Nils Diewald8e323ee2014-04-23 17:28:14 +000049 /;
50
Nils Diewald90410c22014-11-03 21:04:05 +000051# Removed: coll_title, coll_sub_title, coll_author, coll_editor
52# Introduced: doc_title, doc_sub_title, corpus_editor, doc_editor, corpus_author, doc_author
53
54
Nils Diewald7364d1f2013-11-05 19:26:35 +000055has 'path';
Nils Diewald8e323ee2014-04-23 17:28:14 +000056has [@ATTR, @ADVANCED_ATTR];
Nils Diewald7364d1f2013-11-05 19:26:35 +000057
Nils Diewald7b847222014-04-23 11:14:00 +000058has log => sub {
59 if(Log::Log4perl->initialized()) {
60 state $log = Log::Log4perl->get_logger(__PACKAGE__);
Nils Diewald7b847222014-04-23 11:14:00 +000061 };
62 state $log = KorAP::Log->new;
63 return $log;
64};
65
66sub new {
67 my $class = shift;
68 my $self = bless { @_ }, $class;
Nils Diewaldd681eab2014-11-01 01:18:25 +000069 if (exists $self->{path}) {
70 $self->{path} = rel2abs($self->{path});
71 if ($self->{path} !~ m!\/$!) {
72 $self->{path} .= '/';
73 };
Nils Diewald7b847222014-04-23 11:14:00 +000074 };
75 return $self;
76};
Nils Diewald2db9ad02013-10-29 19:26:43 +000077
78# parse document
79sub parse {
80 my $self = shift;
Nils Diewald7b847222014-04-23 11:14:00 +000081
Nils Diewald98767bb2014-04-25 20:31:19 +000082 my $data_xml = $self->path . 'data.xml';
Nils Diewald2db9ad02013-10-29 19:26:43 +000083
Nils Diewald98767bb2014-04-25 20:31:19 +000084 my ($rt, $error, $file);
85
86 my $unable = 'Unable to parse document ' . $self->path;
87
88 unless (-e $data_xml) {
89 $self->log->warn($unable . ' - no data.xml found');
90 $error = 1;
91 }
92
93 else {
94 $file = b($data_xml)->slurp;
95
96 try {
Nils Diewald3cf08c72013-12-16 20:31:10 +000097 local $SIG{__WARN__} = sub {
Nils Diewald98767bb2014-04-25 20:31:19 +000098 $error = 1;
Nils Diewald3cf08c72013-12-16 20:31:10 +000099 };
100 $rt = xml2hash($file, text => '#text', attr => '-')->{raw_text};
Nils Diewald98767bb2014-04-25 20:31:19 +0000101 }
102 catch {
103 $self->log->warn($unable);
104 $error = 1;
105 };
Nils Diewald3cf08c72013-12-16 20:31:10 +0000106 };
107
108 return if $error;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000109
Nils Diewald3ece6302013-12-02 18:38:16 +0000110 $self->log->debug('Parse document ' . $self->path);
Nils Diewald2db9ad02013-10-29 19:26:43 +0000111
Nils Diewald2db9ad02013-10-29 19:26:43 +0000112 # Get document id and corpus id
Nils Diewald3cf08c72013-12-16 20:31:10 +0000113 if ($rt && $rt->{'-docid'}) {
Nils Diewald840c9242014-10-28 19:51:26 +0000114 $self->text_sigle($rt->{'-docid'});
115 if ($self->text_sigle =~ /^(([^_]+)_[^\._]+?)\.\d+$/) {
116 $self->corpus_sigle($2);
117 $self->doc_sigle($1);
Nils Diewald2db9ad02013-10-29 19:26:43 +0000118 }
119 else {
Nils Diewald3ece6302013-12-02 18:38:16 +0000120 croak $unable . ': ID not parseable';
Nils Diewald2db9ad02013-10-29 19:26:43 +0000121 };
122 }
123 else {
Nils Diewald3ece6302013-12-02 18:38:16 +0000124 croak $unable . ': No raw_text found or no ID';
Nils Diewald2db9ad02013-10-29 19:26:43 +0000125 };
126
127 # Get primary data
Nils Diewald3cf08c72013-12-16 20:31:10 +0000128 my $pd = $rt->{text};
Nils Diewald2db9ad02013-10-29 19:26:43 +0000129 if ($pd) {
Nils Diewald3cf08c72013-12-16 20:31:10 +0000130 $self->{pd} = KorAP::Document::Primary->new($pd);
Nils Diewald2db9ad02013-10-29 19:26:43 +0000131 }
132 else {
133 croak $unable;
134 };
135
Nils Diewaldd681eab2014-11-01 01:18:25 +0000136 my @path = grep { $_ } splitdir($self->path);
Nils Diewald840c9242014-10-28 19:51:26 +0000137 my @header;
138
139 foreach (0..2) {
Nils Diewaldd681eab2014-11-01 01:18:25 +0000140 unshift @header, '/' . catfile(@path, 'header.xml');
Nils Diewald840c9242014-10-28 19:51:26 +0000141 pop @path;
142 };
143 my @type = qw/corpus doc text/;
144 foreach (@header) {
145 # Get corpus, doc and text meta data
146 my $type = shift(@type);
147 $self->_parse_meta($_, $type) if -e $_;
148 };
149
Nils Diewald2db9ad02013-10-29 19:26:43 +0000150 return 1;
151};
152
153
154# Primary data
155sub primary {
156 $_[0]->{pd};
157};
158
Nils Diewalda96de622014-10-31 17:29:23 +0000159#sub author {
160# my $self = shift;
161#
162# # Set authors
163# if ($_[0]) {
164# return $self->{authors} = [
165# grep { $_ !~ m{^\s*u\.a\.\s*$} } split(/;\s+/, shift())
166# ];
167# }
168# return ($self->{authors} // []);
169#};
Nils Diewald2db9ad02013-10-29 19:26:43 +0000170
171sub text_class {
172 my $self = shift;
173 if ($_[0]) {
174 return $self->{topics} = [ @_ ];
175 };
176 return ($self->{topics} // []);
177};
178
Nils Diewald840c9242014-10-28 19:51:26 +0000179sub keywords {
180 my $self = shift;
181 if ($_[0]) {
182 return $self->{keywords} = [ @_ ];
183 };
184 return ($self->{keywords} // []);
185};
186
187
Nils Diewald2db9ad02013-10-29 19:26:43 +0000188sub _parse_meta {
189 my $self = shift;
Nils Diewald840c9242014-10-28 19:51:26 +0000190 my $header_xml = shift;
191 my $type = shift;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000192
Nils Diewald840c9242014-10-28 19:51:26 +0000193 my $file = b($header_xml)->slurp->decode('iso-8859-1');
Nils Diewald2db9ad02013-10-29 19:26:43 +0000194
195 my $dom = Mojo::DOM->new($file);
Nils Diewald840c9242014-10-28 19:51:26 +0000196
Nils Diewald682feb02013-11-29 22:48:40 +0000197 my $analytic = $dom->at('analytic');
Nils Diewald2db9ad02013-10-29 19:26:43 +0000198
Nils Diewald840c9242014-10-28 19:51:26 +0000199 # There is an analytic element
Nils Diewald8e323ee2014-04-23 17:28:14 +0000200 if ($analytic) {
Nils Diewald840c9242014-10-28 19:51:26 +0000201
Nils Diewald90410c22014-11-03 21:04:05 +0000202 # Get title, subtitle, author, editor
203 my $title = $analytic->at('h\.title[type=main]');
Nils Diewald8e323ee2014-04-23 17:28:14 +0000204 my $sub_title = $analytic->at('h\.title[type=sub]');
Nils Diewald90410c22014-11-03 21:04:05 +0000205 my $author = $analytic->at('h\.author');
206 my $editor = $analytic->at('editor');
Nils Diewald2db9ad02013-10-29 19:26:43 +0000207
Nils Diewald90410c22014-11-03 21:04:05 +0000208 $title = $title ? $title->all_text : undef;
209 $sub_title = $sub_title ? $sub_title->all_text : undef;
210 $author = $author ? $author->all_text : undef;
211 $editor = $editor ? $editor->all_text : undef;
Nils Diewald840c9242014-10-28 19:51:26 +0000212
213 if ($type eq 'text') {
Nils Diewald90410c22014-11-03 21:04:05 +0000214 $self->title($title) if $title;
215 $self->sub_title($sub_title) if $sub_title;
216 $self->editor($editor) if $editor;
217 $self->author($author) if $author;
218 }
219 elsif ($type eq 'doc') {
220 $self->doc_title($title) if $title;
221 $self->doc_sub_title($sub_title) if $sub_title;
222 $self->doc_author($author) if $author;
223 $self->doc_editor($editor) if $editor;
Nils Diewald840c9242014-10-28 19:51:26 +0000224 }
225 elsif ($type eq 'corpus') {
Nils Diewald90410c22014-11-03 21:04:05 +0000226 $self->corpus_title($title) if $title;
227 $self->corpus_sub_title($sub_title) if $sub_title;
228 $self->corpus_author($author) if $author;
229 $self->corpus_editor($editor) if $editor;
Nils Diewald840c9242014-10-28 19:51:26 +0000230 };
Nils Diewald90410c22014-11-03 21:04:05 +0000231 };
Nils Diewald840c9242014-10-28 19:51:26 +0000232
Nils Diewald90410c22014-11-03 21:04:05 +0000233 # Not in analytic
234 if ($type eq 'corpus') {
235 if (my $title = $dom->at('fileDesc > titleStmt > c\.title')) {
236 $self->corpus_title($title->all_text) if $title->all_text;
237 };
238 }
239
240 # doc title
241 elsif ($type eq 'doc') {
242 if (my $title = $dom->at('fileDesc > titleStmt > d\.title')) {
243 $self->doc_title($title->all_text) if $title->all_text;
244 };
245 }
246
247 # text title
248 elsif ($type eq 'text') {
249 unless ($self->title) {
250 if (my $title = $dom->at('fileDesc > titleStmt > t\.title')) {
251 $self->title($title->all_text) if $title->all_text;
252 };
253 };
Nils Diewald8e323ee2014-04-23 17:28:14 +0000254 };
255
256 # Get PubPlace
Nils Diewald90410c22014-11-03 21:04:05 +0000257 if (my $place = $dom->at('pubPlace')) {
258 $self->pub_place($place->all_text) if $place->all_text;
259 };
Nils Diewald8e323ee2014-04-23 17:28:14 +0000260
261 # Get Publisher
Nils Diewald90410c22014-11-03 21:04:05 +0000262 if (my $publisher = $dom->at('imprint publisher')) {
263 $self->publisher($publisher->all_text) if $publisher->all_text;
264 };
Nils Diewald8e323ee2014-04-23 17:28:14 +0000265
266 my $mono = $dom->at('monogr');
267 if ($mono) {
Nils Diewald8e323ee2014-04-23 17:28:14 +0000268
Nils Diewald90410c22014-11-03 21:04:05 +0000269 # Get title, subtitle, author, editor
270 my $title = $mono->at('h\.title[type=main]');
Nils Diewald840c9242014-10-28 19:51:26 +0000271 my $sub_title = $mono->at('h\.title[type=sub]');
Nils Diewald90410c22014-11-03 21:04:05 +0000272 my $author = $mono->at('h\.author');
273 my $editor = $mono->at('editor');
Nils Diewald8e323ee2014-04-23 17:28:14 +0000274
Nils Diewald90410c22014-11-03 21:04:05 +0000275 $title = $title ? $title->all_text : undef;
276 $sub_title = $sub_title ? $sub_title->all_text : undef;
277 $author = $author ? $author->all_text : undef;
278 $editor = $editor ? $editor->all_text : undef;
279
280 if ($type eq 'text') {
281 $self->title($title) if $title;
282 $self->sub_title($sub_title) if $sub_title;
283 $self->editor($editor) if $editor;
284 $self->author($author) if $author;
Nils Diewald8e323ee2014-04-23 17:28:14 +0000285 }
Nils Diewald90410c22014-11-03 21:04:05 +0000286 elsif ($type eq 'doc') {
287 $self->doc_title($title) if $title;
288 $self->doc_sub_title($sub_title) if $sub_title;
289 $self->doc_author($author) if $author;
290 $self->doc_editor($editor) if $editor;
291 }
292 elsif ($type eq 'corpus') {
293 $self->corpus_title($title) if $title;
294 $self->corpus_sub_title($sub_title) if $sub_title;
295 $self->corpus_author($author) if $author;
296 $self->corpus_editor($editor) if $editor;
Nils Diewald8e323ee2014-04-23 17:28:14 +0000297 };
298 };
299
Nils Diewald840c9242014-10-28 19:51:26 +0000300 # Get text type
301 my $text_desc = $dom->at('textDesc');
302
303 if ($text_desc) {
Nils Diewald90410c22014-11-03 21:04:05 +0000304 if (my $text_type = $text_desc->at('textType')) {
305 $self->text_type($text_type->all_text) if $text_type->all_text;
306 };
Nils Diewald840c9242014-10-28 19:51:26 +0000307
308 # Get text domain
Nils Diewald90410c22014-11-03 21:04:05 +0000309 if (my $text_domain = $text_desc->at('textDomain')) {
310 $self->text_domain($text_domain->all_text) if $text_domain->all_text;
311 };
Nils Diewald840c9242014-10-28 19:51:26 +0000312
313 # Get text type art
Nils Diewald90410c22014-11-03 21:04:05 +0000314 if (my $text_type_art = $text_desc->at('textTypeArt')) {
315 $self->text_type_art($text_type_art->all_text) if $text_type_art->all_text;
316 };
Nils Diewald840c9242014-10-28 19:51:26 +0000317
318 # Get text type art
Nils Diewald90410c22014-11-03 21:04:05 +0000319 if (my $text_type_ref = $text_desc->at('textTypeRef')) {
320 $self->text_type_ref($text_type_ref->all_text) if $text_type_ref->all_text;
321 };
Nils Diewald840c9242014-10-28 19:51:26 +0000322 };
323
324 # Get pubDate
325 my $pub_date = $dom->find('pubDate[type=year]');
326 $pub_date->each(
Nils Diewald2db9ad02013-10-29 19:26:43 +0000327 sub {
Nils Diewald840c9242014-10-28 19:51:26 +0000328 my $x = shift->parent;
329 my $year = $x->at("pubDate[type=year]");
330 return unless $year;
331
332 $year = $year ? $year->text : 0;
333 my $month = $x->at("pubDate[type=month]");
334 $month = $month ? $month->text : 0;
335 my $day = $x->at("pubDate[type=day]");
336 $day = $day ? $day->text : 0;
337
Nils Diewald90410c22014-11-03 21:04:05 +0000338 $year = 0 if $year !~ /^\d+$/;
Nils Diewald840c9242014-10-28 19:51:26 +0000339 $month = 0 if $month !~ /^\d+$/;
Nils Diewald90410c22014-11-03 21:04:05 +0000340 $day = 0 if $day !~ /^\d+$/;
Nils Diewald840c9242014-10-28 19:51:26 +0000341
342 my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
343 $date .= length($month) == 1 ? '0' . $month : $month;
344 $date .= length($day) == 1 ? '0' . $day : $day;
345 $self->pub_date($date);
346 });
347
348 # creatDate
349 my $create_date = $dom->at('creatDate');
Nils Diewald90410c22014-11-03 21:04:05 +0000350 if ($create_date && $create_date->text) {
Nils Diewald840c9242014-10-28 19:51:26 +0000351 $create_date = $create_date->all_text;
352 if (index($create_date, '-') > -1) {
353 $self->log->warn("Creation date ranges are not supported");
354 ($create_date) = split /\s*-\s*/, $create_date;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000355 }
Nils Diewald840c9242014-10-28 19:51:26 +0000356
357 $create_date =~ s{^(\d{4})$}{$1\.00};
358 $create_date =~ s{^(\d{4})\.(\d{2})$}{$1\.$2\.00};
359 if ($create_date =~ /^\d{4}\.\d{2}\.\d{2}$/) {
360 $create_date =~ tr/\.//d;
361 $self->creation_date($create_date);
362 };
363 };
364
365 my $text_class = $dom->at('textClass');
366 if ($text_class) {
367 # Get textClasses
368 my @topic;
369
370 $text_class->find("catRef")->each(
371 sub {
372 my ($ign, @ttopic) = split('\.', $_->attr('target'));
373 push(@topic, @ttopic);
374 }
375 );
376 $self->text_class(@topic) if @topic > 0;
377
378 my @keywords = $text_class->find("h\.keywords > keyTerm")->each;
379 $self->keywords(@keywords) if @keywords > 0;
380 };
381
382 if (my $edition_statement = $dom->at('biblFull editionStmt')) {
Nils Diewald90410c22014-11-03 21:04:05 +0000383 $self->bibl_edition_statement($edition_statement->all_text)
384 if $edition_statement->text;
Nils Diewald840c9242014-10-28 19:51:26 +0000385 };
386
387 if (my $edition_statement = $dom->at('fileDescl editionStmt')) {
Nils Diewald90410c22014-11-03 21:04:05 +0000388 $self->file_edition_statement($edition_statement->all_text)
389 if $edition_statement->text;
Nils Diewald840c9242014-10-28 19:51:26 +0000390 };
391
392 if (my $file_desc = $dom->at('fileDesc')) {
393 if (my $availability = $file_desc->at('publicationStmt > availability')) {
394 $self->license($availability->all_text);
395 };
396 };
397
Nils Diewald90410c22014-11-03 21:04:05 +0000398 # Some meta data only available in the corpus
Nils Diewald840c9242014-10-28 19:51:26 +0000399 if ($type eq 'corpus') {
400 if (my $language = $dom->at('profileDesc > langUsage > language[id]')) {
401 $self->language($language->attr('id'));
402 };
403 }
404
Nils Diewald90410c22014-11-03 21:04:05 +0000405 # Some meta data only reevant from the text
Nils Diewald840c9242014-10-28 19:51:26 +0000406 elsif ($type eq 'text') {
407
408 if (my $reference = $dom->at('sourceDesc reference[type=complete]')) {
409 if (my $ref_text = $reference->all_text) {
Nils Diewald90410c22014-11-03 21:04:05 +0000410 $ref_text =~ s!^[a-zA-Z0-9]+\/[a-zA-Z0-9]+\.\d+[\s:]\s*!!;
411 $self->reference($ref_text);
412 };
Nils Diewald840c9242014-10-28 19:51:26 +0000413 };
414
415 my $column = $dom->at('textDesc > column');
416 $self->text_column($column->all_text) if $column;
417
418 if (my $pages = $dom->at('biblStruct biblScope[type="pp"]')) {
419 $pages = $pages->all_text;
420 if ($pages && $pages =~ m/(\d+)\s*-\s*(\d+)/) {
421 $self->pages($1 . '-' . $2);
422 };
423 };
Nils Diewald90410c22014-11-03 21:04:05 +0000424 };
Nils Diewald2db9ad02013-10-29 19:26:43 +0000425};
426
Nils Diewald840c9242014-10-28 19:51:26 +0000427
Nils Diewald7364d1f2013-11-05 19:26:35 +0000428sub to_string {
429 my $self = shift;
430
431 my $string;
432
433 foreach (@ATTR) {
434 if (my $att = $self->$_) {
435 $att =~ s/\n/ /g;
436 $att =~ s/\s\s+/ /g;
437 $string .= $_ . ' = ' . $att . "\n";
438 };
439 };
440
Nils Diewalda96de622014-10-31 17:29:23 +0000441# if ($self->author) {
442# foreach (@{$self->author}) {
443# $_ =~ s/\n/ /g;
444# $_ =~ s/\s\s+/ /g;
445# $string .= 'author = ' . $_ . "\n";
446# };
447# };
Nils Diewald7364d1f2013-11-05 19:26:35 +0000448
449 if ($self->text_class) {
450 foreach (@{$self->text_class}) {
451 $string .= 'text_class = ' . $_ . "\n";
452 };
453 };
454
455 return $string;
456};
457
Nils Diewald044c41d2013-11-11 21:45:09 +0000458sub _k {
459 my $x = $_[0];
460 $x =~ s/_(\w)/\U$1\E/g;
461 $x =~ s/id$/ID/gi;
462 return $x;
463};
464
Nils Diewald7364d1f2013-11-05 19:26:35 +0000465
466sub to_hash {
467 my $self = shift;
468
Nils Diewald840c9242014-10-28 19:51:26 +0000469 $self->parse unless $self->text_sigle;
Nils Diewald7b847222014-04-23 11:14:00 +0000470
Nils Diewald7364d1f2013-11-05 19:26:35 +0000471 my %hash;
472
Nils Diewald840c9242014-10-28 19:51:26 +0000473 foreach (@ATTR, @ADVANCED_ATTR) {
Nils Diewald7364d1f2013-11-05 19:26:35 +0000474 if (my $att = $self->$_) {
475 $att =~ s/\n/ /g;
476 $att =~ s/\s\s+/ /g;
Nils Diewald044c41d2013-11-11 21:45:09 +0000477 $hash{_k($_)} = $att;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000478 };
479 };
480
Nils Diewald840c9242014-10-28 19:51:26 +0000481 for (qw/text_class keywords/) {
482 my @array = @{ $self->$_ };
483 next unless @array;
484 $hash{_k($_)} = join(' ', @array);
Nils Diewald37e5b572013-11-20 20:26:03 +0000485 };
486
Nils Diewald7364d1f2013-11-05 19:26:35 +0000487 return \%hash;
488};
489
490
Nils Diewald840c9242014-10-28 19:51:26 +0000491# Don't work that well
Nils Diewald7b847222014-04-23 11:14:00 +0000492sub _parse_meta_fast {
493 my $self = shift;
494
495 # my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1');
496 my $file = b($self->path . 'header.xml')->slurp;
497
498 my ($meta, $error);
Nils Diewald98767bb2014-04-25 20:31:19 +0000499 my $unable = 'Unable to parse document ' . $self->path;
Nils Diewald7b847222014-04-23 11:14:00 +0000500
501 try {
502 local $SIG{__WARN__} = sub {
503 $error = 1;
504 };
Nils Diewald6a2a14b2015-06-17 20:34:24 +0000505 $meta = xml2hash(
506 $file,
507 text => '#text',
508 attr => '-',
509 array => ['h.title', 'imprint', 'catRef', 'h.author']
510 )->{idsHeader};
Nils Diewald7b847222014-04-23 11:14:00 +0000511 }
512 catch {
513 $self->log->warn($unable);
514 $error = 1;
515 };
516
517 return if $error;
518
519 my $bibl_struct = $meta->{fileDesc}->{sourceDesc}->{biblStruct};
520 my $analytic = $bibl_struct->{analytic};
521
522 my $titles = $analytic->{'h.title'};
523 foreach (@$titles) {
524 if ($_->{'-type'} eq 'main') {
525 $self->title($_->{'#text'});
526 }
527 elsif ($_->{'-type'} eq 'sub') {
528 $self->sub_title($_->{'#text'});
529 };
530 };
531
532 # Get Author
533 if (my $author = $analytic->{'h.author'}) {
534 $self->author($author->[0]);
535 };
536
537 # Get pubDate
538 my $date = $bibl_struct->{monogr}->{imprint};
539 my ($year, $month, $day) = (0,0,0);
540 foreach (@$date) {
Nils Diewald7b847222014-04-23 11:14:00 +0000541 if ($date->{-type} eq 'year') {
542 $year = $date->{'#text'};
543 }
544 elsif ($date->{-type} eq 'month') {
545 $month = $date->{'#text'};
546 }
547 elsif ($date->{-type} eq 'day') {
548 $day = $date->{'#text'};
549 };
550 };
551
552 $year = 0 if $year !~ /^\d+$/;
553 $month = 0 if $month !~ /^\d+$/;
554 $day = 0 if $day !~ /^\d+$/;
555
556 $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
557 $date .= length($month) == 1 ? '0' . $month : $month;
558 $date .= length($day) == 1 ? '0' . $day : $day;
559
560 $self->pub_date($date);
561
562 # Get textClasses
563 my @topic;
564 my $textClass = $meta->{profileDesc}->{textClass}->{catRef};
565 foreach (@$textClass) {
566 my ($ign, @ttopic) = split('\.', $_->{'-target'});
567 push(@topic, @ttopic);
568 };
569 $self->text_class(@topic);
570};
571
572
573
Nils Diewald2db9ad02013-10-29 19:26:43 +00005741;
575
576
577__END__
578
579=pod
580
581=head1 NAME
582
583KorAP::Document
584
585
586=head1 SYNOPSIS
587
588 my $doc = KorAP::Document->new(
589 path => 'mydoc-1/'
590 );
591
592 $doc->parse;
593
594 print $doc->title;
595
596
597=head1 DESCRIPTION
598
599Parse the primary and meta data of a document.
600
601
602=head2 ATTRIBUTES
603
Nils Diewald840c9242014-10-28 19:51:26 +0000604=head2 text_sigle
Nils Diewald2db9ad02013-10-29 19:26:43 +0000605
Nils Diewald840c9242014-10-28 19:51:26 +0000606 $doc->text_sigle(75476);
607 print $doc->text_sigle;
608
609The unique identifier of the text.
610
611
612=head2 doc_sigle
613
614 $doc->doc_sigle(75476);
615 print $doc->doc_sigle;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000616
617The unique identifier of the document.
618
619
Nils Diewald840c9242014-10-28 19:51:26 +0000620=head2 corpus_sigle
Nils Diewald2db9ad02013-10-29 19:26:43 +0000621
Nils Diewald840c9242014-10-28 19:51:26 +0000622 $doc->corpus_sigle(4);
623 print $doc->corpus_sigle;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000624
625The unique identifier of the corpus.
626
627
628=head2 path
629
630 $doc->path("example-004/");
631 print $doc->path;
632
633The path of the document.
634
635
636=head2 title
637
638 $doc->title("Der Name der Rose");
639 print $doc->title;
640
641The title of the document.
642
643
644=head2 sub_title
645
646 $doc->sub_title("Natürlich eine Handschrift");
647 print $doc->sub_title;
648
649The title of the document.
650
651
652=head2 pub_place
653
654 $doc->pub_place("Rom");
655 print $doc->pub_place;
656
657The publication place of the document.
658
659
660=head2 pub_date
661
662 $doc->pub_place("19800404");
663 print $doc->pub_place;
664
665The publication date of the document,
666in the format "YYYYMMDD".
667
668
669=head2 primary
670
671 print $doc->primary->data(0,20);
672
673The L<KorAP::Document::Primary> object containing the primary data.
674
675
676=head2 author
677
678 $doc->author('Binks, Jar Jar; Luke Skywalker');
679 print $doc->author->[0];
680
681Set the author value as semikolon separated list of names or
682get an array reference of author names.
683
684=head2 text_class
685
686 $doc->text_class(qw/news sports/);
687 print $doc->text_class->[0];
688
689Set the text class as an array or get an array
690reference of text classes.
691
692
693=head1 METHODS
694
695=head2 parse
696
697 $doc->parse;
698
699Run the parsing process of the document
700
701
702=cut
Nils Diewald62d1a972014-07-31 21:49:11 +0000703
704
705Deal with:
706 <attribute name="info">
707 <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">kind of
708 information expressed by the given layer of annotation (there may, and often will, be
709 more than one)</documentation>
710 <list>
711 <oneOrMore>
712 <choice>
713 <value type="NCName">pos</value>
714 <value type="NCName">lemma</value>
715 <value type="NCName">msd</value>
716 <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'msd' is
717 the traditional abbreviation for "morphosyntactic description", listing info on
718 e.g. tense, person, case, etc.</documentation>
719 <value type="NCName">dep</value>
720 <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'dep' is
721 information about types of relations, used in dependency-style annotations; it is
722 an indication for the visualiser that word-to-word relationships should be
723 displayed</documentation>
724 <value type="NCName">lbl</value>
725 <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'lbl'
726 indicates the presence of labels over dependency relations</documentation>
727 <value type="NCName">const</value>
728 <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'const'
729 stands for 'constituency' or hierarchical, tree-based annotations; it is an
730 indication for the visualiser that it should display syntactic
731 trees</documentation>
732 <value type="NCName">cat</value>
733 <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'cat' is
734 used for syntactic categories, as separate from pos; note that these sets need not
735 be disjoint (at the lexical level, they usually overlap), but the frontend prefers
736 to keep them separate. 'cat' will be found in the context of chunking or
737 hierarchical parsing and will characterise nodes; it may also be found in
738 dependency annotations, to indicate labels on nodes, as opposed to labels on arcs
739 (the latter are signalled by 'lbl')</documentation>
740 <value type="NCName">struct</value>
741 <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">all
742 non-linguistic information (headers, highlights, etc.)</documentation>
743 <value type="NCName">frag</value>
744 <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0"
745 >non-exhaustive coverage (when spanList/@fragmented="true")</documentation>
746 <value type="NCName">ne</value>
747 <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">named
748 entities</documentation>
749 </choice>
750 </oneOrMore>
751 </list>
752 </attribute>