blob: a8b31a5c5eadbfdc1b3ef45f0ee9616cf339c095 [file] [log] [blame]
package KorAP::XML::Meta::I5;
use KorAP::XML::Meta::Base;
use Mojo::Util qw/url_escape/;
# Pattern splitting a sigle into up to three captures:
#   $1 - corpus part (anything but '_' and '/')
#   $2 - optional document part, separated from the corpus part by '_' or '/'
#   $3 - optional text part, separated from the document part by '.'
our $SIGLE_RE = qr/^([^_\/]+)(?:[_\/]([^\._\/]+?)(?:\.(.+?))?)?$/;
# Prefix prepended to the values stored in the A_externalLink field
our $KORAP_LINK_PREF = 'data:application/x.korap-link;';
# STRING:
# "pubPlace",
# "textSigle",
# "docSigle",
# "corpusSigle",
# "textType",
# "textTypeArt",
# "textTypeRef",
# "textColumn",
# "textDomain",
# "availability",
# "language",
# "corpusID", // Deprecated!
# "ID" // Deprecated!
#
# TEXT:
# "author",
# "title",
# "subTitle",
# "corpusTitle",
# "corpusSubTitle",
# "corpusAuthor",
# "docTitle",
# "docSubTitle",
# "docAuthor"
#
# KEYWORDS:
# "textClass",
# "foundries",
# "keywords"
#
# STORE:
# "docEditor",
# "tokenSource",
# "layerInfos",
# "publisher",
# "editor",
# "fileEditionStatement",
# "biblEditionStatement",
# "reference",
# "corpusEditor"
# "distributor"
#
# DATE:
# "pubDate",
# "creationDate"
# Normalize a metadata string and return the normalized copy:
#   - every run of two or more whitespace characters becomes one blank
#     (a single whitespace character is kept as it is),
#   - leading and trailing whitespace is removed,
#   - a value consisting of dashes only becomes the empty string.
#
# An undefined argument yields the empty string.
#
# The ($) prototype is kept on purpose: callers in this file use the
# function like a unary operator (e.g. "_squish $elem->all_text").
sub _squish ($) {
  # Work on a copy. @_ aliases the caller's variables, so editing $_[0]
  # directly would silently change the caller's data and would die with
  # "Modification of a read-only value attempted" for literals.
  my $text = $_[0] // '';

  $text =~ s!\s\s+! !g;
  $text =~ s!^\s*!!;
  $text =~ s!\s*$!!;

  # Placeholder values like '-' or '---' carry no information
  $text =~ s!^\-+$!!g;

  return $text;
};
# Parse the meta data of one I5 header level and store it in the object.
# This will normally be called in the order corpus, doc, text.
#
# Arguments:
#   $dom  - DOM of the header; queried with at(), find(), attr(), text()
#           and all_text() (presumably a Mojo::DOM object - confirm with
#           the caller)
#   $type - level the header belongs to: 'corpus', 'doc' or 'text'
#
# Results are written to hash slots of $self. Judging from the field list
# at the top of this file, the key prefix names the metadata category
# (S_ string, T_ text, K_ keywords, A_ stored attachment, D_ date), while
# the sigles are kept in _text_sigle, _doc_sigle and _corpus_sigle.
#
# Always returns 1.
sub parse {
  my ($self, $dom, $type) = @_;

  # Parse text sigle
  if ($type eq 'text' && !$self->text_sigle) {
    my $v = $dom->at('textSigle');
    if ($v) {
      $self->{_text_sigle} = _squish($v->text);

      # Normalize to CORPUS/DOC/TEXT and derive the upper level sigles
      if ($self->{_text_sigle} =~ $SIGLE_RE) {
        $self->{_text_sigle}   = join('/', $1, $2, $3);
        $self->{_doc_sigle}    = join('/', $1, $2);
        $self->{_corpus_sigle} = $1;
      };
    };
  }

  # Parse document sigle
  elsif ($type eq 'doc' && !$self->doc_sigle) {
    my $v = $dom->at('dokumentSigle');
    if ($v) {
      # Squished like the text sigle, so surrounding whitespace
      # can't end up in the sigle
      $self->{_doc_sigle} = _squish($v->text);
      if ($self->{_doc_sigle} =~ $SIGLE_RE) {
        $self->{_doc_sigle}    = join('/', $1, $2);
        $self->{_corpus_sigle} = $1;
      };
    };
  }

  # Parse corpus sigle
  elsif ($type eq 'corpus' && !$self->corpus_sigle) {
    my $v = $dom->at('korpusSigle');
    $self->{_corpus_sigle} = _squish($v->text) if $v;
  };

  # TODO: May have analytic AND monogr
  foreach my $analytic ($dom->at('analytic'), $dom->at('monogr')) {
    next unless $analytic;

    # There is an analytic element
    # Get title, subtitle, author, editor
    my $title     = $analytic->at('h\.title[type=main]');
    my $sub_title = $analytic->at('h\.title[type=sub]');
    my $author    = $analytic->at('h\.author');
    my $editor    = $analytic->at('editor');

    # Editor contains translator
    if ($editor && $editor->attr('role') && $editor->attr('role') eq 'translator') {

      # Translator is only supported on the text level currently
      my $translator = _squish($editor->all_text);
      $self->{A_translator} = $translator if $translator;
      $editor = undef;
    }
    else {
      $editor = $editor ? _squish($editor->all_text) : undef;
    };

    $title     = $title     ? _squish($title->all_text)     : undef;
    $sub_title = $sub_title ? _squish($sub_title->all_text) : undef;
    $author    = $author    ? _squish($author->all_text)    : undef;

    # Text meta data
    if ($type eq 'text') {
      unless ($self->{T_title} || $self->{T_sub_title}) {
        $self->{T_title} = _remove_prefix($title, $self->text_sigle) if $title;
        $self->{T_sub_title} = $sub_title if $sub_title;
      };
      $self->{A_editor} //= $editor if $editor;
      $self->{T_author} //= $author if $author;
    }

    # Doc meta data
    elsif ($type eq 'doc') {
      unless ($self->{T_doc_title} || $self->{T_doc_sub_title}) {
        $self->{T_doc_title} //= _remove_prefix($title, $self->doc_sigle) if $title;
        $self->{T_doc_sub_title} //= $sub_title if $sub_title;
      };
      $self->{T_doc_author} //= $author if $author;
      $self->{A_doc_editor} //= $editor if $editor;
    }

    # Corpus meta data
    elsif ($type eq 'corpus') {
      unless ($self->{T_corpus_title} || $self->{T_corpus_sub_title}) {
        $self->{T_corpus_title} //= _remove_prefix($title, $self->corpus_sigle) if $title;
        $self->{T_corpus_sub_title} //= $sub_title if $sub_title;
      };
      $self->{T_corpus_author} //= $author if $author;
      $self->{A_corpus_editor} //= $editor if $editor;
    };
  };

  # Title not in analytic: fall back to the title statement of the level.
  # Each rule lists: target field, selector, accessor of the level's sigle
  state $TITLE_RULE = {
    corpus => ['T_corpus_title', 'fileDesc > titleStmt > c\.title', 'corpus_sigle'],
    doc    => ['T_doc_title',    'fileDesc > titleStmt > d\.title', 'doc_sigle'],
    text   => ['T_title',        'fileDesc > titleStmt > t\.title', 'text_sigle']
  };

  if (my $rule = $TITLE_RULE->{$type}) {
    my ($field, $selector, $sigle_method) = @$rule;

    # Title not yet given
    unless ($self->{$field}) {
      if (my $title_elem = $dom->at($selector)) {
        my $title = _squish($title_elem->all_text);
        $self->{$field} = _remove_prefix($title, $self->$sigle_method) if $title;
      };
    };
  };

  # Get PubPlace
  if (my $pub_place = $dom->at('pubPlace')) {
    my $place_attr = $pub_place->attr('key');
    $self->{S_pub_place_key} = $place_attr if $place_attr;
    my $place = _squish($pub_place->all_text);
    $self->{S_pub_place} = $place if $place;
  };

  # Get Publisher
  if (my $publisher = $dom->at('imprint publisher')) {
    $publisher = _squish($publisher->all_text);
    $self->{A_publisher} = $publisher if $publisher;
  };

  # Get text type, text domain, text type art and text type ref
  if (my $text_desc = $dom->at('textDesc')) {
    state $TEXT_DESC_FIELD = [
      [textType    => 'S_text_type'],
      [textDomain  => 'S_text_domain'],
      [textTypeArt => 'S_text_type_art'],
      [textTypeRef => 'S_text_type_ref']
    ];

    foreach my $pair (@$TEXT_DESC_FIELD) {
      my ($tag, $field) = @$pair;
      my $elem = $text_desc->at($tag) or next;
      my $value = _squish($elem->all_text);
      $self->{$field} = $value if $value;
    };
  };

  state $NR_RE = qr/^\d+$/;
  state $REF_RE = qr!^[a-zA-Z0-9]+\/[a-zA-Z0-9]+\.\d+[\s:]\s*!;

  # Get pubDate
  # Year, month and day are siblings, so they are looked up below the
  # parent of each year element. The last year element found wins.
  $dom->find('pubDate[type=year]')->each(
    sub {
      my $parent = shift->parent;

      my $year = $parent->at('pubDate[type=year]') or return;
      $year = $year->text;

      my $month = $parent->at('pubDate[type=month]');
      $month = $month ? $month->text : 0;

      my $day = $parent->at('pubDate[type=day]');
      $day = $day ? $day->text : 0;

      # Non numerical values are treated as unknown
      $year  = 0 if $year  !~ $NR_RE;
      $month = 0 if $month !~ $NR_RE;
      $day   = 0 if $day   !~ $NR_RE;

      # Years below 100 are read as years of the 21st century
      my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
      $date .= length($month) == 1 ? '0' . $month : $month;
      $date .= length($day)   == 1 ? '0' . $day   : $day;

      $self->{D_pub_date} = $date;
    }
  );

  # creatDate
  my $create_date = $dom->at('creatDate');
  if ($create_date && $create_date->text) {
    $create_date = _squish($create_date->all_text);

    # Only the start of a date range is used
    if (index($create_date, '-') > -1) {
      $self->log->warn("Creation date ranges are not supported");
      ($create_date) = split /\s*-\s*/, $create_date;

      # split() returns nothing for values like '-'
      $create_date //= '';
    };

    # Pad YYYY and YYYY.MM to the full YYYY.MM.DD form
    $create_date =~ s{^(\d{4})$}{$1\.00\.00}
      or $create_date =~ s{^(\d{4})\.(\d{2})$}{$1\.$2\.00};

    # After padding, every supported date is in the full form
    if ($create_date =~ /^\d{4}\.\d{2}\.\d{2}$/) {
      $create_date =~ tr/\.//d;
      $self->{D_creation_date} = $create_date;
    };
  };

  if (my $text_class = $dom->at('textClass')) {

    # Get textClasses
    my @topic;
    $text_class->find('catRef')->each(
      sub {
        my $target = $_->attr('target');

        # catRef without target carries no class information
        return unless defined $target;

        # The first non-empty segment of the target is not a topic
        my ($scheme, @segment) =
          grep { $_ } map { _squish($_) } split('\.', $target);
        push(@topic, @segment);
      }
    );
    $self->{K_text_class} = [@topic] if @topic > 0;

    # Get keywords
    # The selector has to be single-quoted: in a double-quoted string the
    # backslash is dropped and 'h.keywords' would select an element 'h'
    # with the class 'keywords' instead of the element 'h.keywords'.
    # The text of the elements is squished, not the elements themselves.
    my @keywords = $text_class->find('h\.keywords > keyTerm')
      ->map(sub { _squish($_->all_text) })
      ->grep(sub { $_ })
      ->each;

    # Append to the stored list, creating it when it does not exist yet
    # (a list created on a lexical copy would be lost)
    push(@{ $self->{K_keywords} //= [] }, @keywords) if @keywords > 0;
  };

  if (my $bibl_edition = $dom->at('biblFull editionStmt')) {
    $bibl_edition = _squish($bibl_edition->all_text);
    $self->{A_bibl_edition_statement} = $bibl_edition if $bibl_edition;
  };

  if (my $file_desc = $dom->at('fileDesc')) {
    my $value;

    if (my $edition_stmt = $file_desc->at('editionStmt')) {
      $value = _squish($edition_stmt->all_text);
      $self->{A_file_edition_statement} = $value if $value;
    };

    if (my $availability = $file_desc->at('publicationStmt > availability')) {
      $value = _squish($availability->all_text);
      $self->{S_availability} = $value if $value;
    };

    if (my $distributor = $file_desc->at('publicationStmt > distributor')) {
      $value = _squish($distributor->all_text);
      $self->{A_distributor} = $value if $value;
    };
  };

  if (my $language = $dom->at('profileDesc > langUsage > language[id]')) {
    $self->{S_language} = $language->attr('id') if $language->attr('id');
  };

  # Some meta data only available in the corpus
  #if ($type eq 'corpus') {
  #}

  # Some meta data only relevant from the text
  if ($type eq 'text') {

    if (my $reference = $dom->at('sourceDesc reference[type=complete]')) {
      if (my $ref_text = _squish($reference->all_text)) {

        # Remove the leading sigle from the reference
        $ref_text =~ s!$REF_RE!!;
        $self->{A_reference} = $ref_text;

        # In case of Wikipedia texts, take the URL (http or https)
        if ($ref_text =~ /URL:(https?:.+?):\s+Wikipedia,\s+\d+\s*$/) {
          $self->{A_externalLink} = $KORAP_LINK_PREF . 'title=Wikipedia,' . $1;
        };
      };
    };

    my $column = $dom->at('textDesc > column');
    if ($column && ($column = _squish($column->all_text))) {
      $self->{S_text_column} = $column;
    };

    if (my $pages = $dom->at('biblStruct biblScope[type=pp]')) {
      $pages = _squish($pages->all_text);
      if ($pages && $pages =~ m/(\d+)\s*-\s*(\d+)/) {
        $self->{A_src_pages} = $1 . '-' . $2;
      };
    };

    # DGD treatment
    # The corpus sigle may be unknown at this point, so it is
    # defaulted to avoid matching against an undefined value
    if ($self->{T_title} &&
          !$self->{A_externalLink} &&
          ($self->{_corpus_sigle} // '') =~ /^(?:[AD]GD|FOLK)$/) {
      my $transcript = $self->{T_title};
      $transcript =~ s/_DF_\d+$//i;
      $self->{A_externalLink} = $KORAP_LINK_PREF . 'title=DGD,' .
        'https://dgd.ids-mannheim.de/DGD2Web/ExternalAccessServlet?command=displayData&id=' .
        url_escape($transcript);
    };
  };

  return 1;
};
# Strip a leading sigle from a title.
#
# Arguments: the title and the sigle of the same level.
# Returns the title unchanged when no (true) sigle is given; otherwise the
# squished title, with the sigle and one following separator (one of
# '-', ';', ':' or ',') removed if the title starts with the sigle.
sub _remove_prefix {
  my ($title, $sigle) = @_;

  # This may render some titles wrong, e.g. 'VDI nachrichten 2014' ...
  return $title unless $sigle;

  # $sigle =~ tr!_!/!;

  # In titles the text part follows the document part after a dot,
  # so 'CORPUS/DOC/TEXT' is compared as 'CORPUS/DOC.TEXT'
  $sigle =~ s!^([^/]+?/[^/]+?)/!$1\.!;

  # Cut off the sigle and the separator following it
  if (index($title, $sigle) == 0) {
    substr($title, 0, length($sigle), '');
    $title =~ s!^\s*[-;:,]\s*!!;
  };

  return _squish($title);
};
1;
__END__
=pod
=encoding utf8
=head1 NAME
KorAP::XML::Meta::I5 - Parses I5 meta data of a KorAP-XML document
=head1 DESCRIPTION
Parses I5 meta data of a KorAP-XML document.
Following the data model, all 3 levels of metadata are parsed, while not all
metadata levels contain the same information. The precedence is that metadata
defined on the text level will override metadata on the document level. And
metadata on the document level will override metadata on the corpus level.
=head2 Metadata categories
Krill currently supports the following types of metadata to be indexed.
They differ especially in the way they can be used to construct a virtual corpus.
=over 2
=item B<String>
A simple string representation of a meta data field. Useful for fixed values,
such as I<corpusSigle> or I<language>.
=item B<Text>
A string representation that will be indexed as a text, so fulltext search
(like phrase search) is supported. Useful for values where partial matches are
useful, like I<title> or I<author>.
=item B<Keywords>
Multiple string representations. Identical to string, but supports multiple
values in the same field. Useful for multiple given values such as I<textClass>.
=item B<Attachement>
Values that can't be used for the construction of virtual corpora, but are stored
per document and can be retrieved. Useful for static data to be retrieved such as
I<reference> or I<externalLink>.
=item B<Date>
A representation of a date, that can later be used for date range queries to construct
virtual corpora. Useful for all date related information, such as I<pubDate> or I<creationDate>.
=back
=head2 Metadata fields
Currently L<KorAP::XML::Meta::I5> recognizes and transfers the following fields, given as
a SCSS selector rule (plus C<@> for attribute values) followed by the field name and
the metadata category.
The order may indicate a field to be overwritten.
=over 2
=item B<On all levels>
(analytic, monogr) editor[role=translator] translator ATTACHEMENT
pubPlace@key pubPlaceKey STRING
pubPlace pubPlace STRING
imprint publisher publisher ATTACHEMENT
textDesc textType textType STRING
textDesc textDomain textDomain STRING
textDesc textTypeArt textTypeArt STRING
textDesc textTypeRef textTypeRef STRING
pubDate[type=year]
& pubDate[type=month]
& pubDate[type=day] pubDate DATE
creatDate creationDate DATE
textClass catRef@target textClass KEYWORDS
textClass h\.keywords > keyTerm keywords KEYWORDS
biblFull editionStmt biblEditionStatement ATTACHEMENT
fileDesc editionStmt fileEditionStatement ATTACHEMENT
fileDesc publicationStmt > availability availability STRING
fileDesc publicationStmt > distributor distributor ATTACHEMENT
profileDesc > langUsage > language[id]@id language STRING
=item B<On text level>
textSigle textSigle STRING
fileDesc > titleStmt > t\.title title TEXT
(analytic, monogr) h\.title[type=main] title TEXT
(analytic, monogr) h\.title[type=sub] subTitle TEXT
(analytic, monogr) h\.author author TEXT
(analytic, monogr) editor[role!=translator] editor ATTACHEMENT
sourceDesc reference[type=complete] reference ATTACHEMENT
textDesc > column textColumn STRING
biblStruct biblScope[type=pp] srcPages ATTACHEMENT
=item B<On document level>
dokumentSigle docSigle STRING
fileDesc > titleStmt > d\.title docTitle TEXT
(analytic, monogr) h\.title[type=main] docTitle TEXT
(analytic, monogr) h\.title[type=sub] docSubTitle TEXT
(analytic, monogr) h\.author docAuthor TEXT
(analytic, monogr) editor[role!=translator] docEditor ATTACHEMENT
=item B<On corpus level>
korpusSigle corpusSigle STRING
fileDesc > titleStmt > c\.title corpusTitle TEXT
(analytic, monogr) h\.title[type=main] corpusTitle TEXT
(analytic, monogr) h\.title[type=sub] corpusSubTitle TEXT
(analytic, monogr) h\.author corpusAuthor TEXT
(analytic, monogr) editor[role!=translator] corpusEditor ATTACHEMENT
=back
Some fields are specially formatted, like C<srcPages> or dates.
In case of Wikipedia texts, the URL given in C<sourceDesc reference[type=complete]>
will additionally be stored as an C<externalLink>. In case of DGD/AGD/FOLK documents,
an external link to the DGD will be created as C<externalLink>.
=head1 AVAILABILITY
https://github.com/KorAP/KorAP-XML-Krill
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2015-2020, L<IDS Mannheim|https://www.ids-mannheim.de/>
Author: L<Nils Diewald|https://nils-diewald.de/>
KorAP::XML::Krill is developed as part of the
L<KorAP|https://korap.ids-mannheim.de/>
Corpus Analysis Platform at the
L<Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
member of the
L<Leibniz-Gemeinschaft|https://www.leibniz-gemeinschaft.de/en/>
and supported by the L<KobRA|http://www.kobra.tu-dortmund.de> project,
funded by the
L<Federal Ministry of Education and Research (BMBF)|http://www.bmbf.de/en/>.
KorAP::XML::Krill is free software published under the
L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
=cut