blob: a675e4f71f01fbcd0000bac1331f96d6424b2444 [file] [log] [blame]
Akron35db6e32016-03-17 22:42:22 +01001package KorAP::XML::Meta::I5;
2use KorAP::XML::Meta::Base;
Akron20294552019-11-29 16:15:35 +01003use Mojo::Util qw/url_escape/;
Akronfbf66382016-07-12 19:44:01 +02004
5our $SIGLE_RE = qr/^([^_\/]+)(?:[_\/]([^\._\/]+?)(?:\.(.+?))?)?$/;
Akron35db6e32016-03-17 22:42:22 +01006
Akron5eb3aa02019-01-25 18:30:47 +01007# STRING:
8# "pubPlace",
9# "textSigle",
10# "docSigle",
11# "corpusSigle",
12# "textType",
13# "textTypeArt",
14# "textTypeRef",
15# "textColumn",
16# "textDomain",
17# "availability",
18# "language",
19# "corpusID", // Deprecated!
20# "ID" // Deprecated!
21#
22# TEXT:
23# "author",
24# "title",
25# "subTitle",
26# "corpusTitle",
27# "corpusSubTitle",
28# "corpusAuthor",
29# "docTitle",
30# "docSubTitle",
31# "docAuthor"
32#
33# KEYWORDS:
34# "textClass",
35# "foundries",
36# "keywords"
37#
38# STORE:
39# "docEditor",
40# "tokenSource",
41# "layerInfos",
42# "publisher",
43# "editor",
44# "fileEditionStatement",
45# "biblEditionStatement",
46# "reference",
#   "corpusEditor",
#   "distributor",
#   "internalLink",
#   "externalLink"
Akron5eb3aa02019-01-25 18:30:47 +010051#
52# DATE:
53# "pubDate",
54# "creationDate"
55
56
# Normalize a metadata string and return the result.
#
# - every run of two or more whitespace characters becomes one space
#   (a single whitespace character is left as it is),
# - leading and trailing whitespace is removed,
# - a value consisting only of dashes is treated as a placeholder and
#   becomes the empty string.
#
# The argument is copied first: the caller's variable is never modified
# and read-only values (string literals, constants) are accepted.
# An undefined argument yields the empty string.
#
# The ($) prototype is kept on purpose, as the call sites in this file use
# the function like a unary operator (`_squish $node->all_text`).
sub _squish ($) {
  my $text = shift // '';

  for ($text) {
    s!\s\s+! !g;
    s!^\s*!!;
    s!\s*$!!;
    s!^\-+$!!g;
  };

  return $text;
};
66
# Parse the I5 header of one metadata level and store all recognized
# fields in the object.
#
# Arguments:
#   $dom  - DOM of the header; it is queried with at() and find() using
#           CSS selectors
#   $type - metadata level, one of 'corpus', 'doc' or 'text'
#
# The levels are normally parsed in the order corpus, doc, text.
# Field keys carry a category prefix (presumably S_ string, T_ text,
# K_ keywords, A_ attachment/store, D_ date, matching the field list at
# the top of this file - confirm against KorAP::XML::Meta::Base).
#
# Always returns 1.
sub parse {
  my ($self, $dom, $type) = @_;

  my $lang = $self->lang;

  # Choose the element whose xml:lang equals the preferred language
  # (compared case-insensitively). Without a preferred language or without
  # a match, the first element of the collection is used.
  my $pick_by_lang = sub {
    my $candidates = shift;
    my $match;
    if ($lang) {
      $match = $candidates->first(
        sub { $_->attr('xml:lang') && lc($_->attr('xml:lang')) eq lc($lang) }
      );
    };
    return $match || $candidates->first;
  };

  # Parse text sigle (doc and corpus sigle are derived from it)
  if ($type eq 'text' && !$self->text_sigle) {
    my $v = $dom->at('textSigle');
    if ($v) {
      $self->{_text_sigle} = _squish($v->text);
      if ($self->{_text_sigle} =~ $SIGLE_RE) {
        $self->{_text_sigle}   = join('/', $1, $2, $3);
        $self->{_doc_sigle}    = join('/', $1, $2);
        $self->{_corpus_sigle} = $1;
      };
    };
  }

  # Parse document sigle (corpus sigle is derived from it)
  elsif ($type eq 'doc' && !$self->doc_sigle) {
    my $v = $dom->at('dokumentSigle');
    if ($v) {
      $self->{_doc_sigle} = $v->text;
      if ($self->{_doc_sigle} =~ $SIGLE_RE) {
        $self->{_doc_sigle}    = join('/', $1, $2);
        $self->{_corpus_sigle} = $1;
      };
    };
  }

  # Parse corpus sigle
  elsif ($type eq 'corpus' && !$self->corpus_sigle) {
    my $v = $dom->at('korpusSigle');
    $self->{_corpus_sigle} = $v->text if $v;
  };

  # TODO: May have analytic AND monogr
  foreach my $analytic ($dom->at('analytic'), $dom->at('monogr')) {
    next unless $analytic;

    # Get title, subtitle, author, editor
    my $title     = $pick_by_lang->($analytic->find('h\.title[type=main]'));
    my $sub_title = $pick_by_lang->($analytic->find('h\.title[type=sub]'));
    my $author    = $analytic->at('h\.author');
    my $editor    = $analytic->at('editor');

    # An editor with the role "translator" is stored as translator and
    # not as editor.
    # Translator is only supported on the text level currently
    if ($editor && ($editor->attr('role') // '') eq 'translator') {
      my $translator = _squish($editor->all_text);
      $self->{A_translator} = $translator if $translator;
      $editor = undef;
    }
    else {
      $editor = $editor ? _squish($editor->all_text) : undef;
    };

    $title     = $title     ? _squish($title->all_text)     : undef;
    $sub_title = $sub_title ? _squish($sub_title->all_text) : undef;
    $author    = $author    ? _squish($author->all_text)    : undef;

    # External link; the rend attribute serves as link title,
    # with the URL itself as fallback
    if (my $note = $analytic->at('biblNote[n="url"]')) {
      my $url = _squish($note->all_text);
      my $link_title = $note->attr('rend') || $url;
      $self->{"A_${type}_external_link"} =
        $self->korap_data_uri($url, title => $link_title);
    };

    # Internal link, treated like the external link
    if (my $note = $analytic->at('biblNote[n="url.ids"]')) {
      my $url = _squish($note->all_text);
      my $link_title = $note->attr('rend') || $url;
      $self->{"A_${type}_internal_link"} =
        $self->korap_data_uri($url, title => $link_title);
    };

    # Text meta data
    if ($type eq 'text') {

      # Title and subtitle are only taken as long as neither is set
      unless ($self->{T_title} || $self->{T_sub_title}) {
        $self->{T_title} = _remove_prefix($title, $self->text_sigle) if $title;
        $self->{T_sub_title} = $sub_title if $sub_title;
      };
      $self->{A_editor} //= $editor if $editor;
      $self->{T_author} //= $author if $author;
    }

    # Doc meta data
    elsif ($type eq 'doc') {
      unless ($self->{T_doc_title} || $self->{T_doc_sub_title}) {
        $self->{T_doc_title} //= _remove_prefix($title, $self->doc_sigle) if $title;
        $self->{T_doc_sub_title} //= $sub_title if $sub_title;
      };
      $self->{T_doc_author} //= $author if $author;
      $self->{A_doc_editor} //= $editor if $editor;
    }

    # Corpus meta data
    elsif ($type eq 'corpus') {
      unless ($self->{T_corpus_title} || $self->{T_corpus_sub_title}) {
        $self->{T_corpus_title} //= _remove_prefix($title, $self->corpus_sigle) if $title;
        $self->{T_corpus_sub_title} //= $sub_title if $sub_title;
      };
      $self->{T_corpus_author} //= $author if $author;
      $self->{A_corpus_editor} //= $editor if $editor;
    };
  };

  # Title not in analytic/monogr: fall back to the title statement.
  # Per level: title element, target field, accessor of the sigle that is
  # removed from the beginning of the title.
  my %title_rule = (
    corpus => ['c\.title', 'T_corpus_title', 'corpus_sigle'],
    doc    => ['d\.title', 'T_doc_title',    'doc_sigle'],
    text   => ['t\.title', 'T_title',        'text_sigle'],
  );

  if (my $rule = $title_rule{$type}) {
    my ($element, $field, $sigle_method) = @$rule;

    # Title not yet given
    unless ($self->{$field}) {
      my $title = $pick_by_lang->(
        $dom->find("fileDesc > titleStmt > $element")
      );

      if ($title) {
        $title = _squish($title->all_text);
        $self->{$field} = _remove_prefix($title, $self->$sigle_method) if $title;
      };
    };
  };

  # Get PubPlace
  if (my $pub_place = $dom->at('pubPlace')) {
    my $place_key = $pub_place->attr('key');
    $self->{S_pub_place_key} = $place_key if $place_key;

    my $place = _squish($pub_place->all_text);
    $self->{S_pub_place} = $place if $place;
  };

  # Get Publisher
  if (my $publisher = $dom->at('imprint publisher')) {
    $publisher = _squish($publisher->all_text);
    $self->{A_publisher} = $publisher if $publisher;
  };

  # Get text type, text domain, text type art and text type ref
  if (my $text_desc = $dom->at('textDesc')) {
    foreach my $rule (
      [textType    => 'S_text_type'],
      [textDomain  => 'S_text_domain'],
      [textTypeArt => 'S_text_type_art'],
      [textTypeRef => 'S_text_type_ref']
    ) {
      my ($element, $field) = @$rule;
      my $node = $text_desc->at($element) or next;
      my $value = _squish($node->all_text);
      $self->{$field} = $value if $value;
    };
  };

  state $NR_RE = qr/^\d+$/;
  state $REF_RE = qr!^[a-zA-Z0-9]+\/[a-zA-Z0-9]+\.\d+[\s:]\s*!;

  # Get pubDate
  # Year, month and day are siblings below the same parent. Missing or
  # non-numeric parts are written as zeros. If there are several year
  # elements, the last one determines the stored date.
  $dom->find('pubDate[type=year]')->each(
    sub {
      my $x = shift->parent;
      my $year = $x->at('pubDate[type=year]') or return;
      $year = $year->text;
      my $month = $x->at('pubDate[type=month]');
      $month = $month ? $month->text : 0;
      my $day = $x->at('pubDate[type=day]');
      $day = $day ? $day->text : 0;

      $year  = 0 if $year  !~ $NR_RE;
      $month = 0 if $month !~ $NR_RE;
      $day   = 0 if $day   !~ $NR_RE;

      # Years below 100 get the prefix "20"
      my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
      $date .= length($month) == 1 ? '0' . $month : $month;
      $date .= length($day) == 1 ? '0' . $day : $day;
      $self->{D_pub_date} = $date;
    });

  # creatDate
  my $create_date = $dom->at('creatDate');
  if ($create_date && $create_date->text) {
    $create_date = _squish($create_date->all_text);

    # Of a date range only the start is taken
    if (index($create_date, '-') > -1) {
      $self->log->warn("Creation date ranges are not supported");
      ($create_date) = split /\s*-\s*/, $create_date;

      # split returns nothing for a value like "-"
      $create_date //= '';
    };

    # Fill up "YYYY" and "YYYY.MM" to "YYYY.MM.DD"
    unless ($create_date =~ s{^(\d{4})$}{$1\.00\.00}) {
      $create_date =~ s{^(\d{4})\.(\d{2})$}{$1\.$2\.00};
    };

    # Everything that is not a well-formed date is ignored
    if ($create_date =~ /^\d{4}(?:\.\d{2}(?:\.\d{2})?)?$/) {
      $create_date =~ tr/\.//d;
      $self->{D_creation_date} = $create_date;
    };
  };

  if (my $text_class = $dom->at('textClass')) {

    # Get textClasses
    # The target attribute is split at dots and the first non-empty
    # part is dropped
    my @topic;
    $text_class->find('catRef')->each(
      sub {
        my $target = $_->attr('target') or return;
        my (undef, @parts) =
          grep { $_ } map { _squish($_) } split(/\./, $target);
        push(@topic, @parts);
      }
    );
    $self->{K_text_class} = [@topic] if @topic > 0;

    # Get keywords
    # The selector has to be single-quoted: in a double-quoted string
    # Perl drops the backslash and "h.keywords" would select an element
    # "h" with the class "keywords". The text of the element is squished
    # (not the element itself, which would stringify to markup) and the
    # values are pushed to the field directly, so they are also kept when
    # the field was not initialized before.
    my @keywords =
      grep { $_ }
      map { _squish($_->all_text) }
      $text_class->find('h\.keywords > keyTerm')->each;
    push(@{ $self->{K_keywords} }, @keywords) if @keywords > 0;
  };

  if (my $edition = $dom->at('biblFull editionStmt')) {
    $edition = _squish($edition->all_text);
    $self->{A_bibl_edition_statement} = $edition if $edition;
  };

  if (my $file_desc = $dom->at('fileDesc')) {
    if (my $edition = $file_desc->at('editionStmt')) {
      $edition = _squish($edition->all_text);
      $self->{A_file_edition_statement} = $edition if $edition;
    };

    if (my $availability = $file_desc->at('publicationStmt > availability')) {
      $availability = _squish($availability->all_text);
      $self->{S_availability} = $availability if $availability;
    };

    if (my $distributor = $file_desc->at('publicationStmt > distributor')) {
      $distributor = _squish($distributor->all_text);
      $self->{A_distributor} = $distributor if $distributor;
    };
  };

  if (my $language = $dom->at('profileDesc > langUsage > language[id]')) {
    $self->{S_language} = $language->attr('id') if $language->attr('id');
  };

  # Some meta data only relevant from the text
  if ($type eq 'text') {

    if (my $reference = $dom->at('sourceDesc reference[type=complete]')) {
      if (my $ref_text = _squish($reference->all_text)) {

        # Remove the leading sigle-like reference (see $REF_RE)
        $ref_text =~ s!$REF_RE!!;
        $self->{A_reference} = $ref_text;

        # In case of Wikipedia texts, take the URL
        if ($ref_text =~ /URL:(http:.+?):\s+Wikipedia,\s+\d+\s*$/) {
          $self->{A_externalLink} = $self->korap_data_uri($1, title => 'Wikipedia');
        };
      };
    };

    my $column = $dom->at('textDesc > column');
    if ($column && ($column = _squish($column->all_text))) {
      $self->{S_text_column} = $column;
    };

    # Page range, normalized to "from-to"
    if (my $pages = $dom->at('biblStruct biblScope[type=pp]')) {
      $pages = _squish($pages->all_text);
      if ($pages && $pages =~ m/(\d+)\s*-\s*(\d+)/) {
        $self->{A_src_pages} = $1 . '-' . $2;
      };
    };

    # DGD treatment
    # The title (without a trailing "_DF_<number>") is used as the id of
    # the transcript. The corpus sigle may be unset, so guard the match.
    if ($self->{T_title} &&
          !$self->{A_externalLink} &&
          ($self->{_corpus_sigle} // '') =~ /^(?:[AD]GD|FOLK)$/) {
      my $transcript = $self->{T_title};
      $transcript =~ s/_DF_\d+$//i;
      $self->{A_externalLink} = $self->korap_data_uri(
        'https://dgd.ids-mannheim.de/DGD2Web/ExternalAccessServlet?command=displayData&id=' .
          url_escape($transcript), title => 'DGD');
    };
  };

  return 1;
};
444
445
# Strip a leading sigle from a title and return the squished remainder.
#
# Arguments: the title and the sigle of the same level.
# Without a sigle the title is returned untouched (and unsquished).
# Otherwise the sigle is rewritten from "A/B/C" to "A/B.C" before it is
# compared with the start of the title; a separator (one of - ; : ,) that
# directly follows the removed sigle is dropped as well.
#
# This may render some titles wrong, e.g. 'VDI nachrichten 2014' ...
sub _remove_prefix {
  my ($title, $sigle) = @_;
  return $title unless $sigle;

  # Only the slash after the second sigle part is replaced by a dot
  (my $prefix = $sigle) =~ s!^([^/]+?/[^/]+?)/!$1\.!;

  my $prefix_length = length($prefix);
  if (substr($title, 0, $prefix_length) eq $prefix) {
    $title = substr($title, $prefix_length);
    $title =~ s!^\s*[-;:,]\s*!!;
  };

  return _squish($title);
};
460
461
Akron35db6e32016-03-17 22:42:22 +01004621;
Akron57799fc2020-02-11 11:42:33 +0100463
Akrond4c5c102020-02-11 11:47:59 +0100464
465__END__
466
467=pod
468
469=encoding utf8
470
471=head1 NAME
472
473KorAP::XML::Meta::I5 - Parses I5 meta data of a KorAP-XML document
474
475=head1 DESCRIPTION
476
477Parses I5 meta data of a KorAP-XML document.
478
479Following the data model, all 3 levels of metadata are parsed, while not all
480metadata levels contain the same information. The precedence is that metadata
481defined on the text level will override metadata on the document level. And
482metadata on the document level will override metadata on the corpus level.
483
484=head2 Metadata categories
485
486Krill currently supports the following types of metadata to be indexed.
487They differ especially in the way they can be used to construct a virtual corpus.
488
489=over 2
490
491=item B<String>
492
493A simple string representation of a meta data field. Useful for fixed values,
494such as I<corpusSigle> or I<language>.
495
496=item B<Text>
497
498A string representation that will be indexed as a text, so fulltext search
499(like phrase search) is supported. Useful for values where partial matches are
500useful, like I<title> or I<author>.
501
502=item B<Keywords>
503
504Multiple string representations. Identical to string, but supports multiple
505values in the same field. Useful for multiple given values such as I<textClass>.
506
507=item B<Attachement>
508
509Values that can't be used for the construction of virtual corpora, but are stored
510per document and can be retrieved. Useful for static data to be retrieved such as
511I<reference> or I<externalLink>.
512
513=item B<Date>
514
A representation of a date, that can later be used for date range queries to construct
virtual corpora. Useful for all date related information, such as I<pubDate> or I<creationDate>.
517
518=back
519
520=head2 Metadata fields
521
522Currently L<KorAP::XML::Meta::I5> recognizes and transfers the following fields, given as
523a SCSS selector rule (plus C<@> for attribute values) followed by the field name and
524the metadata category.
525The order may indicate a field to be overwritten.
526
527=over 2
528
529=item B<On all levels>
530
531 (analytic, monogr) editor[role=translator] translator ATTACHEMENT
532 pubPlace@key pubPlaceKey STRING
533 pubPlace pubPlace STRING
534 imprint publisher publisher ATTACHEMENT
535 textDesc textType textType STRING
536 textDesc textDomain textDomain STRING
537 textDesc textTypeArt textTypeArt STRING
538 textDesc textTypeRef textTypeRef STRING
539 pubDate[type=year]
540 & pubDate[type=month]
541 & pubDate[type=day] pubDate DATE
542 creatDate creationDate DATE
543 textClass catRef@target textClass KEYWORDS
Akron0a187b92020-03-16 12:49:58 +0100544 textClass h\.keywords > keyTerm keywords KEYWORDS
Akrond4c5c102020-02-11 11:47:59 +0100545 biblFull editionStmt biblEditionStatement ATTACHEMENT
546 fileDesc editionStmt fileEditionStatement ATTACHEMENT
547 fileDesc publicationStmt > availability availability STRING
548 fileDesc publicationStmt > distributor distributor ATTACHEMENT
549 profileDesc > langUsage > language[id]@id language STRING
550
551=item B<On text level>
552
553 textSigle textSigle STRING
Akron0a187b92020-03-16 12:49:58 +0100554 fileDesc > titleStmt > t\.title title TEXT
555 (analytic, monogr) h\.title[type=main] title TEXT
556 (analytic, monogr) h\.title[type=sub] subTitle TEXT
557 (analytic, monogr) h\.author author TEXT
Akrond4c5c102020-02-11 11:47:59 +0100558 (analytic, monogr) editor[role!=translator] editor ATTACHEMENT
559 sourceDesc reference[type=complete] reference ATTACHEMENT
560 textDesc > column textColumn STRING
561 biblStruct biblScope[type=pp] srcPages ATTACHEMENT
Akron26e77892022-05-16 17:59:29 +0200562 biblNote[n=url] textExternalLink
563 & @rend ATTACHEMENT
564 biblNote[n="url.ids"] textInternalLink
565 & @rend ATTACHEMENT
Akrond4c5c102020-02-11 11:47:59 +0100566
567=item B<On document level>
568
569 dokumentSigle docSigle STRING
Akron0a187b92020-03-16 12:49:58 +0100570 fileDesc > titleStmt > d\.title docTitle TEXT
571 (analytic, monogr) h\.title[type=main] docTitle TEXT
572 (analytic, monogr) h\.title[type=sub] docSubTitle TEXT
573 (analytic, monogr) h\.author docAuthor TEXT
Akrond4c5c102020-02-11 11:47:59 +0100574 (analytic, monogr) editor[role!=translator] docEditor ATTACHEMENT
Akron26e77892022-05-16 17:59:29 +0200575 biblNote[n=url] docExternalLink
576 & @rend ATTACHEMENT
577 biblNote[n="url.ids"] docInternalLink
578 & @rend ATTACHEMENT
Akrond4c5c102020-02-11 11:47:59 +0100579
580=item B<On corpus level>
581
582 korpusSigle corpusSigle STRING
Akron0a187b92020-03-16 12:49:58 +0100583 fileDesc > titleStmt > c\.title corpusTitle TEXT
584 (analytic, monogr) h\.title[type=main] corpusTitle TEXT
585 (analytic, monogr) h\.title[type=sub] corpusSubTitle TEXT
586 (analytic, monogr) h\.author corpusAuthor TEXT
Akrond4c5c102020-02-11 11:47:59 +0100587 (analytic, monogr) editor[role!=translator] corpusEditor ATTACHEMENT
Akron26e77892022-05-16 17:59:29 +0200588 biblNote[n=url] corpusExternalLink
589 & @rend ATTACHEMENT
  biblNote[n="url.ids"]                          corpusInternalLink
591 & @rend ATTACHEMENT
Akrond4c5c102020-02-11 11:47:59 +0100592
593=back
594
Some fields are specially formatted, like C<srcPages> or dates.
596In case of Wikipedia texts, C<sourceDesc reference[type=complete]> will be
597turned into an C<externalLink>. In case of DGD/AGD documents, an external link
598to the DGD will be created as C<externalLink>.
599
600
601=head1 AVAILABILITY
602
603 https://github.com/KorAP/KorAP-XML-Krill
604
605
606=head1 COPYRIGHT AND LICENSE
607
608Copyright (C) 2015-2020, L<IDS Mannheim|https://www.ids-mannheim.de/>
609Author: L<Nils Diewald|https://nils-diewald.de/>
610
611KorAP::XML::Krill is developed as part of the
612L<KorAP|https://korap.ids-mannheim.de/>
613Corpus Analysis Platform at the
614L<Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
615member of the
616L<Leibniz-Gemeinschaft|https://www.leibniz-gemeinschaft.de/en/>
617and supported by the L<KobRA|http://www.kobra.tu-dortmund.de> project,
618funded by the
619L<Federal Ministry of Education and Research (BMBF)|http://www.bmbf.de/en/>.
620
621KorAP::XML::Krill is free software published under the
622L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
623
624=cut