# blob: 81f8e5f95af19b4cda9743b1a25f9e183fb90967 [file] [log] [blame]
Akron35db6e32016-03-17 22:42:22 +01001package KorAP::XML::Meta::I5;
2use KorAP::XML::Meta::Base;
Akronfbf66382016-07-12 19:44:01 +02003
# Pattern for sigles. A sigle has a corpus part, optionally followed
# by '_' or '/' and a document part, optionally followed by '.' and a
# text part. The three parts are captured in that order.
our $SIGLE_RE = qr{
  ^
  ([^_/]+)          # corpus part
  (?:
    [_/]
    ([^._/]+?)      # document part
    (?:
      \.
      (.+?)         # text part
    )?
  )?
  $
}x;
Akron35db6e32016-03-17 22:42:22 +01005
# Normalize whitespace in place and return the result.
#
# Runs of two or more whitespace characters become a single blank,
# leading and trailing whitespace is dropped, and a value that then
# consists of hyphens only is emptied. The argument is modified
# through its @_ alias, so pass a variable or a temporary value.
# The ($) prototype lets callers use it like a unary operator.
sub _squish ($) {
  $_[0] =~ s!\s\s+! !g;
  $_[0] =~ s!^\s*!!;
  $_[0] =~ s!\s*$!!;
  $_[0] =~ s!^\-+$!!g;
  return $_[0];
}
15
# Parse the meta data of an I5 header and store it in the object.
#
# Parameters:
#   $dom  - header DOM; it is queried with at() and find() and the
#           returned nodes with at(), attr(), text(), all_text()
#           and parent()
#   $type - level the header belongs to: 'text', 'doc' or 'corpus'
#
# Stores sigles, titles, authors, editors, translator, publication
# place, publisher, text type information, publication and creation
# date, text classes, keywords, edition statements and availability
# in $self. For 'corpus' the language is stored as well, for 'text'
# the reference, the column and the page range.
#
# Always returns 1.
sub parse {
  my ($self, $dom, $type) = @_;

  # Sigles: a text sigle also yields the document and corpus sigle,
  # a document sigle also yields the corpus sigle
  if ($type eq 'text' && !$self->text_sigle) {
    if (my $v = $dom->at('textSigle')) {
      $self->{_text_sigle} = _squish($v->text);
      if ($self->{_text_sigle} =~ $SIGLE_RE) {
        my ($corpus, $doc, $text) = ($1, $2, $3);

        # Optional groups that did not take part in the match are
        # undefined - join the matched parts only
        $self->{_text_sigle}   = join('/', grep { defined $_ } ($corpus, $doc, $text));
        $self->{_doc_sigle}    = join('/', grep { defined $_ } ($corpus, $doc));
        $self->{_corpus_sigle} = $corpus;
      }
    }
  }
  elsif ($type eq 'doc' && !$self->doc_sigle) {
    if (my $v = $dom->at('dokumentSigle')) {
      $self->{_doc_sigle} = _squish($v->text);
      if ($self->{_doc_sigle} =~ $SIGLE_RE) {
        my ($corpus, $doc) = ($1, $2);
        $self->{_doc_sigle}    = join('/', grep { defined $_ } ($corpus, $doc));
        $self->{_corpus_sigle} = $corpus;
      }
    }
  }
  elsif ($type eq 'corpus' && !$self->corpus_sigle) {
    if (my $v = $dom->at('korpusSigle')) {
      $self->{_corpus_sigle} = _squish($v->text);
    }
  }

  # TODO: May have analytic AND monogr
  foreach my $bibl ($dom->at('analytic'), $dom->at('monogr')) {
    next unless $bibl;

    # Get title, subtitle, author, editor
    my $title     = $bibl->at('h\.title[type=main]');
    my $sub_title = $bibl->at('h\.title[type=sub]');
    my $author    = $bibl->at('h\.author');
    my $editor    = $bibl->at('editor');

    if ($editor) {
      my $role = $editor->attr('role');
      my $name = _squish($editor->all_text);

      if ($role && $role eq 'translator') {
        # The editor element holds a translator. The translator is
        # stored in one field only, whatever the level
        $self->{translator} = $name if $name;
        $editor = undef;
      }
      else {
        $editor = $name;
      }
    }

    $title     = $title     ? _squish($title->all_text)     : undef;
    $sub_title = $sub_title ? _squish($sub_title->all_text) : undef;
    $author    = $author    ? _squish($author->all_text)    : undef;

    # Text meta data
    if ($type eq 'text') {
      unless ($self->{title} || $self->{sub_title}) {
        $self->{title} = _remove_prefix($title, $self->text_sigle) if $title;
        $self->{sub_title} = $sub_title if $sub_title;
      }
      $self->{editor} //= $editor if $editor;
      $self->{author} //= $author if $author;
    }

    # Doc meta data
    elsif ($type eq 'doc') {
      unless ($self->{doc_title} || $self->{doc_sub_title}) {
        $self->{doc_title} //= _remove_prefix($title, $self->doc_sigle) if $title;
        $self->{doc_sub_title} //= $sub_title if $sub_title;
      }
      $self->{doc_author} //= $author if $author;
      $self->{doc_editor} //= $editor if $editor;
    }

    # Corpus meta data
    elsif ($type eq 'corpus') {
      unless ($self->{corpus_title} || $self->{corpus_sub_title}) {
        $self->{corpus_title} //= _remove_prefix($title, $self->corpus_sigle) if $title;
        $self->{corpus_sub_title} //= $sub_title if $sub_title;
      }
      $self->{corpus_author} //= $author if $author;
      $self->{corpus_editor} //= $editor if $editor;
    }
  }

  # Title not found in analytic or monogr: fall back to the title
  # statement. Per level: field to fill, title element, sigle accessor
  my %title_source = (
    corpus => ['corpus_title', 'c\.title', 'corpus_sigle'],
    doc    => ['doc_title',    'd\.title', 'doc_sigle'],
    text   => ['title',        't\.title', 'text_sigle'],
  );

  if (my $source = $title_source{$type}) {
    my ($field, $tag, $sigle) = @$source;

    unless ($self->{$field}) {
      if (my $node = $dom->at("fileDesc > titleStmt > $tag")) {
        my $title = _squish($node->all_text);
        $self->{$field} = _remove_prefix($title, $self->$sigle) if $title;
      }
    }
  }

  my $temp;

  # Get PubPlace
  if ($temp = $dom->at('pubPlace')) {
    my $place_attr = $temp->attr('key');
    $self->{pub_place_key} = $place_attr if $place_attr;
    $temp = _squish($temp->all_text);
    $self->{pub_place} = $temp if $temp;
  }

  # Get Publisher
  if ($temp = $dom->at('imprint publisher')) {
    $temp = _squish($temp->all_text);
    $self->{publisher} = $temp if $temp;
  }

  # Get text type, text domain, text type art and text type ref
  if (my $text_desc = $dom->at('textDesc')) {
    my @fields = (
      [textType    => 'text_type'],
      [textDomain  => 'text_domain'],
      [textTypeArt => 'text_type_art'],
      [textTypeRef => 'text_type_ref'],
    );

    foreach my $pair (@fields) {
      my ($tag, $key) = @$pair;
      my $node = $text_desc->at($tag) or next;
      my $value = _squish($node->all_text);
      $self->{$key} = $value if $value;
    }
  }

  # Compiled once and kept across calls
  state $NR_RE = qr/^\d+$/;
  state $REF_RE = qr!^[a-zA-Z0-9]+\/[a-zA-Z0-9]+\.\d+[\s:]\s*!;

  # Get pubDate. Month and day are looked up next to each year
  # element; when several year elements exist, the last one wins
  $dom->find('pubDate[type=year]')->each(
    sub {
      my $parent = shift->parent;
      my $year_node  = $parent->at('pubDate[type=year]') or return;
      my $month_node = $parent->at('pubDate[type=month]');
      my $day_node   = $parent->at('pubDate[type=day]');

      my $year  = $year_node->text;
      my $month = $month_node ? $month_node->text : 0;
      my $day   = $day_node   ? $day_node->text   : 0;

      # Everything that is not a plain number counts as unknown
      $year  = 0 if $year  !~ $NR_RE;
      $month = 0 if $month !~ $NR_RE;
      $day   = 0 if $day   !~ $NR_RE;

      my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
      $date .= length($month) == 1 ? '0' . $month : $month;
      $date .= length($day) == 1 ? '0' . $day : $day;
      $self->{pub_date} = $date;
    }
  );

  # creatDate
  my $create_date = $dom->at('creatDate');
  if ($create_date && $create_date->text) {
    $create_date = _squish($create_date->all_text);

    if (index($create_date, '-') > -1) {
      $self->log->warn("Creation date ranges are not supported");

      # Keep the start of the range only. split() returns nothing
      # for a value made up of separators, so fall back to ''
      ($create_date) = split /\s*-\s*/, $create_date;
      $create_date //= '';
    }

    # Pad a year or a year and month to the full yyyy.mm.dd form
    $create_date =~ s{^(\d{4})$}{$1\.00\.00}
      or $create_date =~ s{^(\d{4})\.(\d{2})$}{$1\.$2\.00};

    if ($create_date =~ /^\d{4}(?:\.\d{2}(?:\.\d{2})?)?$/) {
      $create_date =~ tr/.//d;
      $self->{creation_date} = $create_date;
    }
  }

  if (my $text_class = $dom->at('textClass')) {

    # Get textClasses: the target attribute is split at dots and
    # the first non-empty part is ignored
    my @topic;
    $text_class->find('catRef')->each(
      sub {
        my $cat_ref = shift;
        my ($ign, @ttopic) =
          grep { $_ }
          map { _squish($_) }
          split(/\./, $cat_ref->attr('target') // '');
        push(@topic, @ttopic);
      }
    );
    $self->{text_class} = \@topic if @topic;

    # Get keywords. The selector is single quoted to keep the
    # backslash: in a double quoted string '\.' turns into '.',
    # which selects a class instead of the h.keywords element
    my @keywords =
      grep { $_ }
      map { _squish($_->all_text) }
      $text_class->find('h\.keywords > keyTerm')->each;

    # Append to the stored list, creating it if it is missing
    push(@{ $self->{keywords} //= [] }, @keywords) if @keywords;
  }

  if ($temp = $dom->at('biblFull editionStmt')) {
    $temp = _squish($temp->all_text);
    $self->{bibl_edition_statement} = $temp if $temp;
  }

  # NOTE(review): 'fileDescl' looks like a typo for 'fileDesc' -
  # confirm the intended selector before changing it
  if ($temp = $dom->at('fileDescl editionStmt')) {
    $temp = _squish($temp->all_text);
    $self->{file_edition_statement} = $temp if $temp;
  }

  if ($temp = $dom->at('fileDesc')) {
    if (my $availability = $temp->at('publicationStmt > availability')) {
      $temp = _squish($availability->all_text);
      $self->{availability} = $temp if $temp;
    }
  }

  # Some meta data only available in the corpus
  if ($type eq 'corpus') {
    if ($temp = $dom->at('profileDesc > langUsage > language[id]')) {
      $self->{language} = $temp->attr('id') if $temp->attr('id');
    }
  }

  # Some meta data only relevant from the text
  elsif ($type eq 'text') {

    if ($temp = $dom->at('sourceDesc reference[type=complete]')) {
      if (my $ref_text = _squish($temp->all_text)) {

        # Drop a leading sigle-like prefix from the reference
        $ref_text =~ s!$REF_RE!!;
        $self->{reference} = $ref_text;
      }
    }

    $temp = $dom->at('textDesc > column');
    if ($temp && ($temp = _squish($temp->all_text))) {
      $self->{text_column} = $temp;
    }

    if ($temp = $dom->at('biblStruct biblScope[type=pp]')) {
      $temp = _squish($temp->all_text);
      if ($temp && $temp =~ m/(\d+)\s*-\s*(\d+)/) {
        $self->{src_pages} = $1 . '-' . $2;
      }
    }
  }

  return 1;
}
306
307
# Strip a leading sigle from a title.
#
# Parameters:
#   $title  - the title
#   $prefix - the sigle; its second '/' is turned into '.' before
#             the comparison (A/B/1 is looked up as A/B.1)
#
# Without a prefix the title is returned as it is. Otherwise the
# prefix and one following separator (- ; : ,) are removed when the
# title starts with the prefix, and the squished title is returned.
# This may render some titles wrong, e.g. 'VDI nachrichten 2014' ...
sub _remove_prefix {
  my ($title, $sigle) = @_;
  return $title unless $sigle;

  # $sigle =~ tr!_!/!;
  (my $lead = $sigle) =~ s!^([^/]+?/[^/]+?)/!$1\.!;

  # Does the title start with the sigle?
  if (rindex($title, $lead, 0) == 0) {
    $title = substr($title, length($lead));
    $title =~ s!^\s*[-;:,]\s*!!;
  }

  return _squish($title);
}
322
323
Akron35db6e32016-03-17 22:42:22 +01003241;