| package KorAP::XML::Meta::I5; |
| use KorAP::XML::Meta::Base; |
| use Mojo::Util qw/url_escape/; |
| |
| our $SIGLE_RE = qr/^([^_\/]+)(?:[_\/]([^\._\/]+?)(?:\.(.+?))?)?$/; |
| our $KORAP_LINK_PREF = 'data:application/x.korap-link;'; |
| |
| # STRING: |
| # "pubPlace", |
| # "textSigle", |
| # "docSigle", |
| # "corpusSigle", |
| # "textType", |
| # "textTypeArt", |
| # "textTypeRef", |
| # "textColumn", |
| # "textDomain", |
| # "availability", |
| # "language", |
| # "corpusID", // Deprecated! |
| # "ID" // Deprecated! |
| # |
| # TEXT: |
| # "author", |
| # "title", |
| # "subTitle", |
| # "corpusTitle", |
| # "corpusSubTitle", |
| # "corpusAuthor", |
| # "docTitle", |
| # "docSubTitle", |
| # "docAuthor" |
| # |
| # KEYWORDS: |
| # "textClass", |
| # "foundries", |
| # "keywords" |
| # |
| # STORE: |
| # "docEditor", |
| # "tokenSource", |
| # "layerInfos", |
| # "publisher", |
| # "editor", |
| # "fileEditionStatement", |
| # "biblEditionStatement", |
| # "reference", |
| # "corpusEditor" |
| # "distributor" |
| # |
| # DATE: |
| # "pubDate", |
| # "creationDate" |
| |
| |
| sub _squish ($) { |
| for ($_[0]) { |
| s!\s\s+! !g; |
| s!^\s*!!; |
| s!\s*$!!; |
| s!^\-+$!!g; |
| }; |
| $_[0]; |
| }; |
| |
| # Parse meta data |
| # This will normally be parsed in the order corpus, doc, text |
| sub parse { |
| my ($self, $dom, $type) = @_; |
| |
| # Parse text sigle |
| if ($type eq 'text' && !$self->text_sigle) { |
| my $v = $dom->at('textSigle'); |
| if ($v) { |
| $self->{_text_sigle} = _squish $v->text; |
| if ($self->{_text_sigle} =~ $SIGLE_RE) { |
| $self->{_text_sigle} = join('/', $1, $2, $3); |
| $self->{_doc_sigle} = join('/', $1, $2); |
| $self->{_corpus_sigle} = $1; |
| }; |
| } |
| } |
| |
| # Parse document sigle |
| elsif ($type eq 'doc' && !$self->doc_sigle) { |
| my $v = $dom->at('dokumentSigle'); |
| if ($v) { |
| $self->{_doc_sigle} = $v->text; |
| if ($self->{_doc_sigle} =~ $SIGLE_RE) { |
| $self->{_doc_sigle} = join('/', $1, $2); |
| $self->{_corpus_sigle} = $1; |
| }; |
| } |
| } |
| |
| # Parse corpus sigle |
| elsif ($type eq 'corpus' && !$self->corpus_sigle) { |
| my $v = $dom->at('korpusSigle'); |
| $self->{_corpus_sigle} = $v->text if $v; |
| }; |
| |
| # TODO: May have analytic AND monogr |
| foreach my $analytic ($dom->at('analytic'), $dom->at('monogr')) { |
| next unless $analytic; |
| # There is an analytic element |
| |
| # Get title, subtitle, author, editor |
| my $title = $analytic->at('h\.title[type=main]'); |
| my $sub_title = $analytic->at('h\.title[type=sub]'); |
| my $author = $analytic->at('h\.author'); |
| my $editor = $analytic->at('editor'); |
| |
| #if ($analytic->find('editor')->size > 1) { |
| # warn 'Mehr als ein Editor!'; |
| # warn $analytic->find('editor')->join("\n"); |
| #}; |
| |
| #if ($analytic->find('author')->size > 1) { |
| # warn 'Mehr als ein Autor!'; |
| # warn $analytic->find('author')->join("\n"); |
| #}; |
| |
| # Editor contains translator |
| my $translator; |
| if ($editor && $editor->attr('role') && $editor->attr('role') eq 'translator') { |
| # Translator is only supported on the text level currently |
| $translator = _squish $editor->all_text; |
| $self->{A_translator} = $translator if $translator; |
| $editor = undef; |
| } |
| else { |
| $editor = $editor ? _squish $editor->all_text : undef; |
| }; |
| |
| $title = $title ? _squish $title->all_text : undef; |
| $sub_title = $sub_title ? _squish $sub_title->all_text : undef; |
| $author = $author ? _squish $author->all_text : undef; |
| |
| # Text meta data |
| if ($type eq 'text') { |
| unless ($self->{T_title} || $self->{T_sub_title}) { |
| $self->{T_title} = _remove_prefix($title, $self->text_sigle) if $title; |
| $self->{T_sub_title} = $sub_title if $sub_title; |
| }; |
| $self->{A_editor} //= $editor if $editor; |
| $self->{T_author} //= $author if $author; |
| } |
| |
| # Doc meta data |
| elsif ($type eq 'doc') { |
| unless ($self->{T_doc_title} || $self->{T_doc_sub_title}) { |
| $self->{T_doc_title} //= _remove_prefix($title, $self->doc_sigle) if $title; |
| $self->{T_doc_sub_title} //= $sub_title if $sub_title; |
| }; |
| $self->{T_doc_author} //= $author if $author; |
| $self->{A_doc_editor} //= $editor if $editor; |
| } |
| |
| # Corpus meta data |
| elsif ($type eq 'corpus') { |
| unless ($self->{T_corpus_title} || $self->{T_corpus_sub_title}) { |
| $self->{T_corpus_title} //= _remove_prefix($title, $self->corpus_sigle) if $title; |
| $self->{T_corpus_sub_title} //= $sub_title if $sub_title; |
| }; |
| $self->{T_corpus_author} //= $author if $author; |
| $self->{A_corpus_editor} //= $editor if $editor; |
| }; |
| }; |
| |
| # Not in analytic |
| my $title; |
| if ($type eq 'corpus') { |
| |
| # Corpus title not yet given |
| unless ($self->{T_corpus_title}) { |
| if ($title = $dom->at('fileDesc > titleStmt > c\.title')) { |
| $title = _squish($title->all_text); |
| |
| if ($title) { |
| $self->{T_corpus_title} = _remove_prefix($title, $self->corpus_sigle); |
| }; |
| }; |
| }; |
| } |
| |
| # doc title |
| elsif ($type eq 'doc') { |
| unless ($self->{T_doc_title}) { |
| if ($title = $dom->at('fileDesc > titleStmt > d\.title')) { |
| $title = _squish($title->all_text); |
| |
| if ($title) { |
| $self->{T_doc_title} = _remove_prefix($title, $self->doc_sigle); |
| }; |
| }; |
| }; |
| } |
| |
| # text title |
| elsif ($type eq 'text') { |
| unless ($self->{T_title}) { |
| if ($title = $dom->at('fileDesc > titleStmt > t\.title')) { |
| $title = _squish($title->all_text); |
| if ($title) { |
| $self->{T_title} = _remove_prefix($title, $self->text_sigle); |
| }; |
| } |
| }; |
| }; |
| |
| my $temp; |
| |
| # Get PubPlace |
| if ($temp = $dom->at('pubPlace')) { |
| my $place_attr = $temp->attr('key'); |
| $self->{S_pub_place_key} = $place_attr if $place_attr; |
| $temp = _squish $temp->all_text; |
| $self->{S_pub_place} = $temp if $temp; |
| }; |
| |
| # Get Publisher |
| if ($temp = $dom->at('imprint publisher')) { |
| $temp = _squish $temp->all_text; |
| $self->{A_publisher} = $temp if $temp; |
| }; |
| |
| # Get text type |
| $temp = $dom->at('textDesc'); |
| my $temp_2; |
| |
| if ($temp) { |
| if ($temp_2 = $temp->at('textType')) { |
| $temp_2 = _squish $temp_2->all_text; |
| $self->{S_text_type} = $temp_2 if $temp_2; |
| }; |
| |
| # Get text domain |
| if ($temp_2 = $temp->at('textDomain')) { |
| $temp_2 = _squish $temp_2->all_text; |
| $self->{S_text_domain} = $temp_2 if $temp_2; |
| }; |
| |
| # Get text type art |
| if ($temp_2 = $temp->at('textTypeArt')) { |
| $temp_2 = _squish $temp_2->all_text; |
| $self->{S_text_type_art} = $temp_2 if $temp_2; |
| }; |
| |
| # Get text type ref |
| if ($temp_2 = $temp->at('textTypeRef')) { |
| $temp_2 = _squish $temp_2->all_text; |
| $self->{S_text_type_ref} = $temp_2 if $temp_2; |
| }; |
| }; |
| |
| state $NR_RE = qr/^\d+$/; |
| state $REF_RE = qr!^[a-zA-Z0-9]+\/[a-zA-Z0-9]+\.\d+[\s:]\s*!; |
| |
| # Get pubDate |
| my $pub_date = $dom->find('pubDate[type=year]'); |
| $pub_date->each( |
| sub { |
| my $x = shift->parent; |
| my $year = $x->at('pubDate[type=year]') or return; |
| $year = $year ? $year->text : 0; |
| my $month = $x->at('pubDate[type=month]'); |
| $month = $month ? $month->text : 0; |
| my $day = $x->at('pubDate[type=day]'); |
| $day = $day ? $day->text : 0; |
| |
| $year = 0 if $year !~ $NR_RE; |
| $month = 0 if $month !~ $NR_RE; |
| $day = 0 if $day !~ $NR_RE; |
| |
| my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000'; |
| $date .= length($month) == 1 ? '0' . $month : $month; |
| $date .= length($day) == 1 ? '0' . $day : $day; |
| $self->{D_pub_date} = $date; |
| }); |
| |
| # creatDate |
| my $create_date = $dom->at('creatDate'); |
| if ($create_date && $create_date->text) { |
| $create_date = _squish $create_date->all_text; |
| if (index($create_date, '-') > -1) { |
| $self->log->warn("Creation date ranges are not supported"); |
| ($create_date) = split /\s*-\s*/, $create_date; |
| }; |
| unless ($create_date =~ s{^(\d{4})$}{$1\.00\.00}) { |
| unless ($create_date =~ s{^(\d{4})\.(\d{2})$}{$1\.$2\.00}) { |
| $create_date =~ /^\d{4}\.\d{2}\.\d{2}$/; |
| }; |
| }; |
| if ($create_date =~ /^\d{4}(?:\.\d{2}(?:\.\d{2})?)?$/) { |
| $create_date =~ tr/\.//d; |
| $self->{D_creation_date} = $create_date; |
| }; |
| }; |
| |
| |
| $temp = $dom->at('textClass'); |
| if ($temp) { |
| # Get textClasses |
| my @topic; |
| |
| $temp->find("catRef")->each( |
| sub { |
| my ($ign, @ttopic) = grep { $_ } map { _squish($_) } split('\.', $_->attr('target')); |
| push(@topic, @ttopic); |
| } |
| ); |
| $self->{K_text_class} = [@topic] if @topic > 0; |
| |
| my $kws = $self->{K_keywords}; |
| my @keywords = $temp->find("h\.keywords > keyTerm")->map(sub {_squish($_) })->grep(sub { $_ })->each; |
| push(@$kws, @keywords) if @keywords > 0; |
| }; |
| |
| if ($temp = $dom->at('biblFull editionStmt')) { |
| $temp = _squish $temp->all_text; |
| $self->{A_bibl_edition_statement} = $temp if $temp; |
| }; |
| |
| if ($temp = $dom->at('fileDesc')) { |
| my $temp2; |
| |
| if (my $editionStmt = $temp->at('editionStmt')) { |
| $temp2 = _squish $editionStmt->all_text; |
| $self->{A_file_edition_statement} = $temp2 if $temp2; |
| }; |
| |
| if (my $availability = $temp->at('publicationStmt > availability')) { |
| $temp2 = _squish $availability->all_text; |
| $self->{S_availability} = $temp2 if $temp2; |
| }; |
| |
| if (my $distributor = $temp->at('publicationStmt > distributor')) { |
| $temp2 = _squish $distributor->all_text; |
| $self->{A_distributor} = $temp2 if $temp2; |
| } |
| }; |
| |
| if ($temp = $dom->at('profileDesc > langUsage > language[id]')) { |
| $self->{S_language} = $temp->attr('id') if $temp->attr('id'); |
| }; |
| |
| |
| # Some meta data only available in the corpus |
| #if ($type eq 'corpus') { |
| #} |
| |
| # Some meta data only reevant from the text |
| if ($type eq 'text') { |
| |
| if ($temp = $dom->at('sourceDesc reference[type=complete]')) { |
| if (my $ref_text = _squish $temp->all_text) { |
| $ref_text =~ s!$REF_RE!!; |
| $self->{A_reference} = $ref_text; |
| |
| # In case of Wikipedia texts, take the URL |
| if ($ref_text =~ /URL:(http:.+?):\s+Wikipedia,\s+\d+\s*$/) { |
| $self->{A_externalLink} = $KORAP_LINK_PREF . 'title=Wikipedia,' . $1; |
| }; |
| }; |
| }; |
| |
| $temp = $dom->at('textDesc > column'); |
| if ($temp && ($temp = _squish $temp->all_text)) { |
| $self->{S_text_column} = $temp; |
| }; |
| |
| if ($temp = $dom->at('biblStruct biblScope[type=pp]')) { |
| $temp = _squish $temp->all_text; |
| if ($temp && $temp =~ m/(\d+)\s*-\s*(\d+)/) { |
| $self->{A_src_pages} = $1 . '-' . $2; |
| }; |
| }; |
| |
| # DGD treatment |
| if ($self->{T_title} && !$self->{A_externalLink} && $self->{_corpus_sigle} =~ /^(?:[AD]GD|FOLK)$/) { |
| my $transcript = $self->{T_title}; |
| $transcript =~ s/_DF_\d+$//i; |
| $self->{A_externalLink} = $KORAP_LINK_PREF . 'title=DGD,' . |
| 'https://dgd.ids-mannheim.de/DGD2Web/ExternalAccessServlet?command=displayData&id=' . |
| url_escape($transcript); |
| } |
| }; |
| |
| return 1; |
| }; |
| |
| |
| sub _remove_prefix { |
| # This may render some titles wrong, e.g. 'VDI nachrichten 2014' ... |
| return $_[0] unless $_[1]; |
| |
| my ($title, $prefix) = @_; |
| # $prefix =~ tr!_!/!; |
| $prefix =~ s!^([^/]+?/[^/]+?)/!$1\.!; |
| if (index($title, $prefix) == 0) { |
| $title = substr($title, length($prefix)); |
| $title =~ s!^\s*[-;:,]\s*!!; |
| }; |
| |
| return _squish $title; |
| }; |
| |
| |
| 1; |
| |
| |
| __END__ |
| |
| =pod |
| |
| =encoding utf8 |
| |
| =head1 NAME |
| |
| KorAP::XML::Meta::I5 - Parses I5 meta data of a KorAP-XML document |
| |
| =head1 DESCRIPTION |
| |
| Parses I5 meta data of a KorAP-XML document. |
| |
| Following the data model, all 3 levels of metadata are parsed, while not all |
| metadata levels contain the same information. The precedence is that metadata |
| defined on the text level will override metadata on the document level. And |
| metadata on the document level will override metadata on the corpus level. |
| |
| =head2 Metadata categories |
| |
| Krill currently supports the following types of metadata to be indexed. |
| They differ especially in the way they can be used to construct a virtual corpus. |
| |
| =over 2 |
| |
| =item B<String> |
| |
| A simple string representation of a meta data field. Useful for fixed values, |
| such as I<corpusSigle> or I<language>. |
| |
| =item B<Text> |
| |
| A string representation that will be indexed as a text, so fulltext search |
| (like phrase search) is supported. Useful for values where partial matches are |
| useful, like I<title> or I<author>. |
| |
| =item B<Keywords> |
| |
| Multiple string representations. Identical to string, but supports multiple |
| values in the same field. Useful for multiple given values such as I<textClass>. |
| |
| =item B<Attachement> |
| |
| Values that can't be used for the construction of virtual corpora, but are stored |
| per document and can be retrieved. Useful for static data to be retrieved such as |
| I<reference> or I<externalLink>. |
| |
| =item B<Date> |
| |
| A representation of a date, that can later be used for date range queries to construct |
| virtual corpora. Useful for all date related information, such as I<pubDate> or I<createDate>. |
| |
| =back |
| |
| =head2 Metadata fields |
| |
| Currently L<KorAP::XML::Meta::I5> recognizes and transfers the following fields, given as |
| a SCSS selector rule (plus C<@> for attribute values) followed by the field name and |
| the metadata category. |
| The order may indicate a field to be overwritten. |
| |
| =over 2 |
| |
| =item B<On all levels> |
| |
| (analytic, monogr) editor[role=translator] translator ATTACHEMENT |
| pubPlace@key pubPlaceKey STRING |
| pubPlace pubPlace STRING |
| imprint publisher publisher ATTACHEMENT |
| textDesc textType textType STRING |
| textDesc textDomain textDomain STRING |
| textDesc textTypeArt textTypeArt STRING |
| textDesc textTypeRef textTypeRef STRING |
| pubDate[type=year] |
| & pubDate[type=month] |
| & pubDate[type=day] pubDate DATE |
| creatDate creationDate DATE |
| textClass catRef@target textClass KEYWORDS |
| textClass h\.keywords > keyTerm keywords KEYWORDS |
| biblFull editionStmt biblEditionStatement ATTACHEMENT |
| fileDesc editionStmt fileEditionStatement ATTACHEMENT |
| fileDesc publicationStmt > availability availability STRING |
| fileDesc publicationStmt > distributor distributor ATTACHEMENT |
| profileDesc > langUsage > language[id]@id language STRING |
| |
| =item B<On text level> |
| |
| textSigle textSigle STRING |
| fileDesc > titleStmt > t\.title title TEXT |
| (analytic, monogr) h\.title[type=main] title TEXT |
| (analytic, monogr) h\.title[type=sub] subTitle TEXT |
| (analytic, monogr) h\.author author TEXT |
| (analytic, monogr) editor[role!=translator] editor ATTACHEMENT |
| sourceDesc reference[type=complete] reference ATTACHEMENT |
| textDesc > column textColumn STRING |
| biblStruct biblScope[type=pp] srcPages ATTACHEMENT |
| |
| =item B<On document level> |
| |
| dokumentSigle docSigle STRING |
| fileDesc > titleStmt > d\.title docTitle TEXT |
| (analytic, monogr) h\.title[type=main] docTitle TEXT |
| (analytic, monogr) h\.title[type=sub] docSubTitle TEXT |
| (analytic, monogr) h\.author docAuthor TEXT |
| (analytic, monogr) editor[role!=translator] docEditor ATTACHEMENT |
| |
| =item B<On corpus level> |
| |
| korpusSigle corpusSigle STRING |
| fileDesc > titleStmt > c\.title corpusTitle TEXT |
| (analytic, monogr) h\.title[type=main] corpusTitle TEXT |
| (analytic, monogr) h\.title[type=sub] corpusSubTitle TEXT |
| (analytic, monogr) h\.author corpusAuthor TEXT |
| (analytic, monogr) editor[role!=translator] corpusEditor ATTACHEMENT |
| |
| =back |
| |
| Some fields are specially formated, like C<srcPages> or dates. |
| In case of Wikipedia texts, C<sourceDesc reference[type=complete]> will be |
| turned into an C<externalLink>. In case of DGD/AGD documents, an external link |
| to the DGD will be created as C<externalLink>. |
| |
| |
| =head1 AVAILABILITY |
| |
| https://github.com/KorAP/KorAP-XML-Krill |
| |
| |
| =head1 COPYRIGHT AND LICENSE |
| |
| Copyright (C) 2015-2020, L<IDS Mannheim|https://www.ids-mannheim.de/> |
| Author: L<Nils Diewald|https://nils-diewald.de/> |
| |
| KorAP::XML::Krill is developed as part of the |
| L<KorAP|https://korap.ids-mannheim.de/> |
| Corpus Analysis Platform at the |
| L<Institute for the German Language (IDS)|https://www.ids-mannheim.de/>, |
| member of the |
| L<Leibniz-Gemeinschaft|https://www.leibniz-gemeinschaft.de/en/> |
| and supported by the L<KobRA|http://www.kobra.tu-dortmund.de> project, |
| funded by the |
| L<Federal Ministry of Education and Research (BMBF)|http://www.bmbf.de/en/>. |
| |
| KorAP::XML::Krill is free software published under the |
| L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>. |
| |
| =cut |