package KorAP::XML::Meta::I5;
use KorAP::XML::Meta::Base;
use Mojo::Util qw/url_escape/;

# Splits a sigle into up to three captured parts:
#   $1 - first part (used as the corpus sigle), anything up to the
#        first '_' or '/'
#   $2 - optional second part (used for the document sigle), following
#        the '_' or '/' separator
#   $3 - optional third part (used for the text sigle), following a '.'
our $SIGLE_RE = qr/^([^_\/]+)(?:[_\/]([^\._\/]+?)(?:\.(.+?))?)?$/;

# STRING:
#   "pubPlace",
#   "textSigle",
#   "docSigle",
#   "corpusSigle",
#   "textType",
#   "textTypeArt",
#   "textTypeRef",
#   "textColumn",
#   "textDomain",
#   "availability",
#   "language",
#   "corpusID", // Deprecated!
#   "ID" // Deprecated!
#
# TEXT:
#   "author",
#   "title",
#   "subTitle",
#   "corpusTitle",
#   "corpusSubTitle",
#   "corpusAuthor",
#   "docTitle",
#   "docSubTitle",
#   "docAuthor"
#
# KEYWORDS:
#   "textClass",
#   "foundries",
#   "keywords"
#
# STORE:
#   "docEditor",
#   "tokenSource",
#   "layerInfos",
#   "publisher",
#   "editor",
#   "fileEditionStatement",
#   "biblEditionStatement",
#   "reference",
#   "corpusEditor",
#   "distributor",
#   "internalLink",
#   "externalLink"
#
# DATE:
#   "pubDate",
#   "creationDate"


# Normalize a metadata string and return the normalized copy.
#
# - Runs of two or more whitespace characters become a single space
#   (a single whitespace character inside the text is left untouched).
# - Leading and trailing whitespace is removed.
# - A value consisting only of dashes becomes the empty string.
#
# The argument itself is never modified, so read-only values (such as
# string literals) are accepted; undef is treated as the empty string.
# The ($) prototype is kept on purpose: callers in this file use the
# sub like a unary operator (e.g. "_squish $node->all_text").
sub _squish ($) {
  my $string = $_[0] // '';

  $string =~ s!\s\s+! !g;
  $string =~ s!^\s*!!;
  $string =~ s!\s*$!!;
  $string =~ s!^\-+$!!g;

  return $string;
};

# Parse meta data from one I5 header level and store the values on $self.
#
# Arguments:
#   $dom  - DOM node of the header; this sub only uses at() and find() on it
#           and text/all_text/attr/parent on the elements returned
#   $type - level of the header: 'corpus', 'doc' or 'text'
#
# Values are written into hash fields of $self whose keys carry a prefix
# (T_, S_, A_, K_, D_ and _ for sigles) -- presumably the metadata category
# described in the POD below; confirm against the base class.
#
# Always returns 1.
#
# This will normally be parsed in the order corpus, doc, text
sub parse {
  my ($self, $dom, $type) = @_;

  my $lang = $self->lang;

  # Parse text sigle
  if ($type eq 'text' && !$self->text_sigle) {
    if (my $node = $dom->at('textSigle')) {
      $self->{_text_sigle} = _squish($node->text);
      if ($self->{_text_sigle} =~ $SIGLE_RE) {
        # NOTE(review): $2 and $3 are optional in $SIGLE_RE; a sigle without
        # all three parts yields empty path segments here.
        $self->{_text_sigle}   = join('/', $1, $2, $3);
        $self->{_doc_sigle}    = join('/', $1, $2);
        $self->{_corpus_sigle} = $1;
      }
    }
  }

  # Parse document sigle
  elsif ($type eq 'doc' && !$self->doc_sigle) {
    if (my $node = $dom->at('dokumentSigle')) {
      # Squished like the text sigle, so surrounding whitespace
      # does not end up in the sigle parts
      $self->{_doc_sigle} = _squish($node->text);
      if ($self->{_doc_sigle} =~ $SIGLE_RE) {
        $self->{_doc_sigle}    = join('/', $1, $2);
        $self->{_corpus_sigle} = $1;
      }
    }
  }

  # Parse corpus sigle
  elsif ($type eq 'corpus' && !$self->corpus_sigle) {
    if (my $node = $dom->at('korpusSigle')) {
      $self->{_corpus_sigle} = _squish($node->text);
    }
  }

  # TODO: May have analytic AND monogr
  foreach my $analytic ($dom->at('analytic'), $dom->at('monogr')) {
    next unless $analytic;

    # Get title, subtitle, author, editor.
    # Only the first author and the first editor element are considered.
    my $title     = _first_for_lang($analytic->find('h\.title[type=main]'), $lang);
    my $sub_title = _first_for_lang($analytic->find('h\.title[type=sub]'), $lang);
    my $author    = $analytic->at('h\.author');
    my $editor    = $analytic->at('editor');

    # Editor contains translator
    if ($editor && $editor->attr('role') && $editor->attr('role') eq 'translator') {
      # Translator is only supported on the text level currently
      my $translator = _squish($editor->all_text);
      $self->{A_translator} = $translator if $translator;
      $editor = undef;
    }
    else {
      $editor = $editor ? _squish($editor->all_text) : undef;
    }

    $title     = $title     ? _squish($title->all_text)     : undef;
    $sub_title = $sub_title ? _squish($sub_title->all_text) : undef;
    $author    = $author    ? _squish($author->all_text)    : undef;

    # Links given as bibliographic notes; the "rend" attribute
    # serves as link title, falling back to the URL itself
    for my $link (['url', 'external'], ['url.ids', 'internal']) {
      my ($n, $kind) = @$link;
      my $note = $analytic->at(qq{biblNote[n="$n"]}) or next;
      my $url = _squish($note->all_text);
      my $link_title = $note->attr('rend') || $url;
      $self->{"A_${type}_${kind}_link"} =
        $self->korap_data_uri($url, title => $link_title);
    }

    # Text meta data
    if ($type eq 'text') {
      unless ($self->{T_title} || $self->{T_sub_title}) {
        $self->{T_title} = _remove_prefix($title, $self->text_sigle) if $title;
        $self->{T_sub_title} = $sub_title if $sub_title;
      }
      $self->{A_editor} //= $editor if $editor;
      $self->{T_author} //= $author if $author;
    }

    # Doc meta data
    elsif ($type eq 'doc') {
      unless ($self->{T_doc_title} || $self->{T_doc_sub_title}) {
        $self->{T_doc_title} //= _remove_prefix($title, $self->doc_sigle) if $title;
        $self->{T_doc_sub_title} //= $sub_title if $sub_title;
      }
      $self->{T_doc_author} //= $author if $author;
      $self->{A_doc_editor} //= $editor if $editor;
    }

    # Corpus meta data
    elsif ($type eq 'corpus') {
      unless ($self->{T_corpus_title} || $self->{T_corpus_sub_title}) {
        $self->{T_corpus_title} //= _remove_prefix($title, $self->corpus_sigle) if $title;
        $self->{T_corpus_sub_title} //= $sub_title if $sub_title;
      }
      $self->{T_corpus_author} //= $author if $author;
      $self->{A_corpus_editor} //= $editor if $editor;
    }
  }

  # Title not found in analytic/monogr: fall back to the title statement.
  # Per level: target field, selector, and accessor of the sigle
  # that may prefix the title.
  my %title_fallback = (
    corpus => ['T_corpus_title', 'fileDesc > titleStmt > c\.title', 'corpus_sigle'],
    doc    => ['T_doc_title', 'fileDesc > titleStmt > d\.title', 'doc_sigle'],
    text   => [
      'T_title',
      'fileDesc > titleStmt > t\.title, fileDesc > titleStmt > title',
      'text_sigle'
    ],
  );

  if (my $fallback = $title_fallback{$type}) {
    my ($field, $selector, $sigle_method) = @$fallback;
    unless ($self->{$field}) {
      if (my $node = _first_for_lang($dom->find($selector), $lang)) {
        my $title = _squish($node->all_text);
        $self->{$field} = _remove_prefix($title, $self->$sigle_method) if $title;
      }
    }
  }

  # Get PubPlace
  if (my $pub_place = $dom->at('pubPlace')) {
    my $place_key = $pub_place->attr('key');
    $self->{S_pub_place_key} = $place_key if $place_key;
    my $place = _squish($pub_place->all_text);
    $self->{S_pub_place} = $place if $place;
  }

  # Get Publisher
  if (my $publisher = $dom->at('imprint publisher')) {
    $publisher = _squish($publisher->all_text);
    $self->{A_publisher} = $publisher if $publisher;
  }

  # Get text type, text domain, text type art and text type ref
  if (my $text_desc = $dom->at('textDesc')) {
    for my $spec (
      ['textType',    'S_text_type'],
      ['textDomain',  'S_text_domain'],
      ['textTypeArt', 'S_text_type_art'],
      ['textTypeRef', 'S_text_type_ref'],
    ) {
      my ($selector, $field) = @$spec;
      my $node = $text_desc->at($selector) or next;
      my $value = _squish($node->all_text);
      $self->{$field} = $value if $value;
    }
  }

  state $NR_RE = qr/^\d+$/;
  state $REF_RE = qr!^[a-zA-Z0-9]+\/[a-zA-Z0-9]+\.\d+[\s:]\s*!;

  # Get pubDate. Year, month and day are siblings below the same parent.
  # If several year elements exist, the last one processed wins.
  for my $year_el ($dom->find('pubDate[type=year]')->each) {
    my $scope = $year_el->parent;

    my $year_node = $scope->at('pubDate[type=year]') or next;
    my $year = $year_node->text;

    my $month = $scope->at('pubDate[type=month]');
    $month = $month ? $month->text : 0;

    my $day = $scope->at('pubDate[type=day]');
    $day = $day ? $day->text : 0;

    # Non-numeric parts count as unknown
    $year  = 0 if $year  !~ $NR_RE;
    $month = 0 if $month !~ $NR_RE;
    $day   = 0 if $day   !~ $NR_RE;

    my $date = '0000';
    if ($year) {
      if (length($year) <= 2) {
        # TEMP: This may change in the future!
        # Two-digit years below 50 are read as 20xx, all others as 19xx
        if ($year < 50) {
          $date = '20' . (length($year) == 1 ? '0' : '') . $year;
        }
        else {
          $date = '19' . $year;
        }
      }
      else {
        $date = $year;
      }
    }

    # Pad single-digit month and day (including the unknown value 0)
    $date .= length($month) == 1 ? '0' . $month : $month;
    $date .= length($day) == 1 ? '0' . $day : $day;
    $self->{D_pub_date} = $date;
  }

  # creatDate
  my $create_date = $dom->at('creatDate');
  if ($create_date && $create_date->text) {
    $create_date = _squish($create_date->all_text);

    if (index($create_date, '-') > -1) {
      $self->log->warn("Creation date ranges are not supported");
      # Only the start of the range is used; split may return an
      # empty list (e.g. for "- -"), so fall back to the empty string
      ($create_date) = split /\s*-\s*/, $create_date;
      $create_date //= '';
    }

    # Expand YYYY and YYYY.MM to the full YYYY.MM.DD form
    $create_date =~ s{^(\d{4})$}{$1\.00\.00}
      or $create_date =~ s{^(\d{4})\.(\d{2})$}{$1\.$2\.00};

    if ($create_date =~ /^\d{4}(?:\.\d{2}(?:\.\d{2})?)?$/) {
      $create_date =~ tr/\.//d;
      $self->{D_creation_date} = $create_date;
    }
  }

  if (my $text_class = $dom->at('textClass')) {
    # Get textClasses
    my @topics;
    for my $cat_ref ($text_class->find('catRef')->each) {
      my $target = $cat_ref->attr('target') or next;

      # The first non-empty part of the dotted target is dropped
      my ($ignored, @parts) =
        grep { $_ } map { _squish($_) } split(/\./, $target);
      push(@topics, @parts);
    }
    $self->{K_text_class} = [@topics] if @topics > 0;

    # Get keywords.
    # The selector has to be single-quoted: in a double-quoted string the
    # backslash is dropped and "h.keywords" would select tag "h" with class
    # "keywords" instead of the element named "h.keywords".
    my @keywords = $text_class->find('h\.keywords > keyTerm')
      ->map(sub { _squish($_->all_text) })
      ->grep(sub { $_ })
      ->each;

    # Append to the field itself, creating it if necessary
    push(@{ $self->{K_keywords} //= [] }, @keywords) if @keywords > 0;
  }

  if (my $edition = $dom->at('biblFull editionStmt')) {
    $edition = _squish($edition->all_text);
    $self->{A_bibl_edition_statement} = $edition if $edition;
  }

  if (my $file_desc = $dom->at('fileDesc')) {
    for my $spec (
      ['editionStmt',                    'A_file_edition_statement'],
      ['publicationStmt > availability', 'S_availability'],
      ['publicationStmt > distributor',  'A_distributor'],
    ) {
      my ($selector, $field) = @$spec;
      my $node = $file_desc->at($selector) or next;
      my $value = _squish($node->all_text);
      $self->{$field} = $value if $value;
    }
  }

  if (my $language = $dom->at('profileDesc > langUsage > language[id]')) {
    my $id = $language->attr('id');
    $self->{S_language} = $id if $id;
  }

  # Some meta data only relevant from the text
  if ($type eq 'text') {

    if (my $reference = $dom->at('sourceDesc reference[type=complete]')) {
      if (my $ref_text = _squish($reference->all_text)) {
        # Strip a leading sigle-like prefix from the reference
        $ref_text =~ s!$REF_RE!!;
        $self->{A_reference} = $ref_text;

        # In case of Wikipedia texts, take the URL (http or https)
        if ($ref_text =~ /URL:(https?:.+?):\s+Wikipedia,\s+\d+\s*$/) {
          $self->{A_externalLink} = $self->korap_data_uri($1, title => 'Wikipedia');
        }
      }
    }

    my $column = $dom->at('textDesc > column');
    if ($column && ($column = _squish($column->all_text))) {
      $self->{S_text_column} = $column;
    }

    if (my $pages = $dom->at('biblStruct biblScope[type=pp]')) {
      $pages = _squish($pages->all_text);
      if ($pages && $pages =~ m/(\d+)\s*-\s*(\d+)/) {
        $self->{A_src_pages} = $1 . '-' . $2;
      }
    }

    # DGD treatment: link to the transcript named by the title,
    # unless an external link was already set
    if ($self->{T_title}
          && !$self->{A_externalLink}
          && ($self->{_corpus_sigle} // '') =~ /^(?:[AD]GD|FOLK)$/) {
      my $transcript = $self->{T_title};
      $transcript =~ s/_DF_\d+$//i;
      $self->{A_externalLink} = $self->korap_data_uri(
        'https://dgd.ids-mannheim.de/DGD2Web/ExternalAccessServlet?command=displayData&id=' .
          url_escape($transcript), title => 'DGD');
    }
  }

  return 1;
};


# Pick one element from a collection of candidates: the first one whose
# xml:lang attribute equals $lang (compared case-insensitively), or the
# first element of the collection if $lang is not set or nothing matches.
# Returns undef for an empty collection.
sub _first_for_lang {
  my ($elements, $lang) = @_;

  my $match;
  if ($lang) {
    $match = $elements->first(
      sub { $_->attr('xml:lang') && lc($_->attr('xml:lang')) eq lc($lang) }
    );
  }

  return $match // $elements->first;
};


# Remove a leading sigle from a title.
#
# Arguments:
#   $title  - the title string
#   $prefix - the sigle that may prefix the title; if it is false,
#             the title is returned unchanged
#
# The second '/' of a three-part sigle is turned into '.' before the
# comparison ("A/B/C" is compared as "A/B.C"). If the title starts with
# that prefix, the prefix and one following separator out of "-;:," (with
# surrounding whitespace) are removed. The result is passed through _squish.
#
# This may render some titles wrong, e.g. 'VDI nachrichten 2014' ...
sub _remove_prefix {
  my ($title, $prefix) = @_;

  return $title unless $prefix;

  $prefix =~ s!^([^/]+?/[^/]+?)/!$1\.!;

  if (index($title, $prefix) == 0) {
    $title = substr($title, length($prefix));
    $title =~ s!^\s*[-;:,]\s*!!;
  }

  # Called with parentheses so this does not depend on the
  # prototype of _squish being known at compile time
  return _squish($title);
};


1;


__END__

=pod

=encoding utf8

=head1 NAME

KorAP::XML::Meta::I5 - Parses I5 meta data of a KorAP-XML document

=head1 DESCRIPTION

Parses I5 meta data of a KorAP-XML document.

Following the data model, all 3 levels of metadata are parsed, while not all
metadata levels contain the same information. The precedence is that metadata
defined on the text level will override metadata on the document level. And
metadata on the document level will override metadata on the corpus level.

=head2 Metadata categories

Krill currently supports the following types of metadata to be indexed.
They differ especially in the way they can be used to construct a virtual corpus.

=over 2

=item B<String>

A simple string representation of a meta data field. Useful for fixed values,
such as I<corpusSigle> or I<language>.

=item B<Text>

A string representation that will be indexed as a text, so fulltext search
(like phrase search) is supported. Useful for values where partial matches are
useful, like I<title> or I<author>.

=item B<Keywords>

Multiple string representations. Identical to string, but supports multiple
values in the same field. Useful for multiple given values such as I<textClass>.

=item B<Attachement>

Values that can't be used for the construction of virtual corpora, but are stored
per document and can be retrieved. Useful for static data to be retrieved such as
I<reference> or I<externalLink>.

=item B<Date>

A representation of a date, that can later be used for date range queries to construct
virtual corpora. Useful for all date related information, such as I<pubDate> or I<creationDate>.

=back

=head2 Metadata fields

Currently L<KorAP::XML::Meta::I5> recognizes and transfers the following fields, given as
a SCSS selector rule (plus C<@> for attribute values) followed by the field name and
the metadata category.
The order may indicate a field to be overwritten.

=over 2

=item B<On all levels>

  (analytic, monogr) editor[role=translator]    translator            ATTACHEMENT
  pubPlace@key                                  pubPlaceKey           STRING
  pubPlace                                      pubPlace              STRING
  imprint publisher                             publisher             ATTACHEMENT
  textDesc textType                             textType              STRING
  textDesc textDomain                           textDomain            STRING
  textDesc textTypeArt                          textTypeArt           STRING
  textDesc textTypeRef                          textTypeRef           STRING
  pubDate[type=year]
    & pubDate[type=month]
    & pubDate[type=day]                         pubDate               DATE
  creatDate                                     creationDate          DATE
  textClass catRef@target                       textClass             KEYWORDS
  textClass h\.keywords > keyTerm               keywords              KEYWORDS
  biblFull editionStmt                          biblEditionStatement  ATTACHEMENT
  fileDesc editionStmt                          fileEditionStatement  ATTACHEMENT
  fileDesc publicationStmt > availability       availability          STRING
  fileDesc publicationStmt > distributor        distributor           ATTACHEMENT
  profileDesc > langUsage > language[id]@id     language              STRING

=item B<On text level>

  textSigle                                     textSigle             STRING
  fileDesc > titleStmt > t\.title               title                 TEXT
  (analytic, monogr) h\.title[type=main]        title                 TEXT
  (analytic, monogr) h\.title[type=sub]         subTitle              TEXT
  (analytic, monogr) h\.author                  author                TEXT
  (analytic, monogr) editor[role!=translator]   editor                ATTACHEMENT
  sourceDesc reference[type=complete]           reference             ATTACHEMENT
  textDesc > column                             textColumn            STRING
  biblStruct biblScope[type=pp]                 srcPages              ATTACHEMENT
  biblNote[n=url]                               textExternalLink
    & @rend                                                           ATTACHEMENT
  biblNote[n="url.ids"]                         textInternalLink
    & @rend                                                           ATTACHEMENT

=item B<On document level>

  dokumentSigle                                 docSigle              STRING
  fileDesc > titleStmt > d\.title               docTitle              TEXT
  (analytic, monogr) h\.title[type=main]        docTitle              TEXT
  (analytic, monogr) h\.title[type=sub]         docSubTitle           TEXT
  (analytic, monogr) h\.author                  docAuthor             TEXT
  (analytic, monogr) editor[role!=translator]   docEditor             ATTACHEMENT
  biblNote[n=url]                               docExternalLink
    & @rend                                                           ATTACHEMENT
  biblNote[n="url.ids"]                         docInternalLink
    & @rend                                                           ATTACHEMENT

=item B<On corpus level>

  korpusSigle                                   corpusSigle           STRING
  fileDesc > titleStmt > c\.title               corpusTitle           TEXT
  (analytic, monogr) h\.title[type=main]        corpusTitle           TEXT
  (analytic, monogr) h\.title[type=sub]         corpusSubTitle        TEXT
  (analytic, monogr) h\.author                  corpusAuthor          TEXT
  (analytic, monogr) editor[role!=translator]   corpusEditor          ATTACHEMENT
  biblNote[n=url]                               corpusExternalLink
    & @rend                                                           ATTACHEMENT
  biblNote[n="url.ids"]                         corpusInternalLink
    & @rend                                                           ATTACHEMENT

=back

Some fields are specially formatted, like C<srcPages> or dates.
In case of Wikipedia texts, C<sourceDesc reference[type=complete]> will be
turned into an C<externalLink>. In case of DGD/AGD documents, an external link
to the DGD will be created as C<externalLink>.


=head1 AVAILABILITY

  https://github.com/KorAP/KorAP-XML-Krill


=head1 COPYRIGHT AND LICENSE

Copyright (C) 2015-2020, L<IDS Mannheim|https://www.ids-mannheim.de/>
Author: L<Nils Diewald|https://nils-diewald.de/>

KorAP::XML::Krill is developed as part of the
L<KorAP|https://korap.ids-mannheim.de/>
Corpus Analysis Platform at the
L<Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
member of the
L<Leibniz-Gemeinschaft|https://www.leibniz-gemeinschaft.de/en/>
and supported by the L<KobRA|http://www.kobra.tu-dortmund.de> project,
funded by the
L<Federal Ministry of Education and Research (BMBF)|http://www.bmbf.de/en/>.

KorAP::XML::Krill is free software published under the
L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.

=cut
638=cut