blob: 4af6c1a098243da91a06bca7164c6eab74412fbb [file] [log] [blame]
package KorAP::Document;
use Mojo::Base -base;
use Mojo::ByteStream 'b';
use Mojo::Util qw/encode/;
use XML::Fast;
use Try::Tiny;
use Carp qw/croak/;
use KorAP::Document::Primary;
use Log::Log4perl;
use KorAP::Log;
use Mojo::DOM;
use Data::Dumper;
use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
our @ATTR = qw/text_sigle
doc_sigle
corpus_sigle
title
pub_date
sub_title
pub_place
author/;
our @ADVANCED_ATTR = qw/publisher
editor
text_type
text_type_art
text_type_ref
text_column
text_domain
creation_date
license
pages
file_edition_statement
bibl_edition_statement
reference
language
doc_title
doc_sub_title
doc_editor
doc_author
corpus_author
corpus_title
corpus_sub_title
corpus_editor
/;
# Separate: text_class, keywords
# Removed: coll_title, coll_sub_title, coll_author, coll_editor
# Introduced: doc_title, doc_sub_title, corpus_editor, doc_editor, corpus_author, doc_author
has 'path';
has [@ATTR, @ADVANCED_ATTR];
has log => sub {
if(Log::Log4perl->initialized()) {
state $log = Log::Log4perl->get_logger(__PACKAGE__);
};
state $log = KorAP::Log->new;
return $log;
};
sub new {
my $class = shift;
my $self = bless { @_ }, $class;
if (exists $self->{path}) {
$self->{path} = rel2abs($self->{path});
if ($self->{path} !~ m!\/$!) {
$self->{path} .= '/';
};
};
return $self;
};
# parse document
sub parse {
my $self = shift;
my $data_xml = $self->path . 'data.xml';
my ($rt, $error, $file);
my $unable = 'Unable to parse document ' . $self->path;
unless (-e $data_xml) {
$self->log->warn($unable . ' - no data.xml found');
$error = 1;
}
else {
$file = b($data_xml)->slurp;
try {
local $SIG{__WARN__} = sub {
$error = 1;
};
$rt = xml2hash($file, text => '#text', attr => '-')->{raw_text};
}
catch {
$self->log->warn($unable);
$error = 1;
};
};
return if $error;
$self->log->debug('Parse document ' . $self->path);
# Get document id and corpus id
if ($rt && $rt->{'-docid'}) {
$self->text_sigle($rt->{'-docid'});
if ($self->text_sigle =~ /^(([^_]+)_[^\._]+?)\.\d+$/) {
$self->corpus_sigle($2);
$self->doc_sigle($1);
}
else {
croak $unable . ': ID not parseable';
};
}
else {
croak $unable . ': No raw_text found or no ID';
};
# Get primary data
my $pd = $rt->{text};
if ($pd) {
$self->{pd} = KorAP::Document::Primary->new($pd);
}
else {
croak $unable;
};
my @path = grep { $_ } splitdir($self->path);
my @header;
# Parse the corpus file, the doc file, and the text file for meta information
foreach (0..2) {
unshift @header, '/' . catfile(@path, 'header.xml');
pop @path;
};
my @type = qw/corpus doc text/;
foreach (@header) {
# Get corpus, doc and text meta data
my $type = shift(@type);
$self->_parse_meta($_, $type) if -e $_;
};
return 1;
};
# Primary data
sub primary {
$_[0]->{pd};
};
#sub author {
# my $self = shift;
#
# # Set authors
# if ($_[0]) {
# return $self->{authors} = [
# grep { $_ !~ m{^\s*u\.a\.\s*$} } split(/;\s+/, shift())
# ];
# }
# return ($self->{authors} // []);
#};
sub text_class {
my $self = shift;
if ($_[0]) {
return $self->{topics} = [ @_ ];
};
return ($self->{topics} // []);
};
sub text_class_string {
return join ' ', @{shift->text_class};
}
sub keywords {
my $self = shift;
if ($_[0]) {
return $self->{keywords} = [ @_ ];
};
return ($self->{keywords} // []);
};
sub keywords_string {
return join ' ', @{shift->keywords};
}
sub _remove_prefix {
return $_[0];
# This may render some titles wrong, e.g. 'VDI nachrichten 2014' ...
my $title = shift;
my $prefix = shift;
$prefix =~ tr!_!/!;
if (index($title, $prefix) == 0) {
$title = substr($title, length($prefix));
$title =~ s/^\s+//;
$title =~ s/\s+$//;
};
return $title;
};
sub _parse_meta {
my $self = shift;
my $header_xml = shift;
my $type = shift;
my $file = b($header_xml)->slurp->decode('iso-8859-1');
my $dom = Mojo::DOM->new($file);
my $analytic = $dom->at('analytic');
# There is an analytic element
if ($analytic) {
# Get title, subtitle, author, editor
my $title = $analytic->at('h\.title[type=main]');
my $sub_title = $analytic->at('h\.title[type=sub]');
my $author = $analytic->at('h\.author');
my $editor = $analytic->at('editor');
$title = $title ? $title->all_text : undef;
$sub_title = $sub_title ? $sub_title->all_text : undef;
$author = $author ? $author->all_text : undef;
$editor = $editor ? $editor->all_text : undef;
if ($type eq 'text') {
$self->title(_remove_prefix($title, $self->text_sigle)) if $title;
$self->sub_title($sub_title) if $sub_title;
$self->editor($editor) if $editor;
$self->author($author) if $author;
}
elsif ($type eq 'doc') {
$self->doc_title(_remove_prefix($title, $self->doc_sigle)) if $title;
$self->doc_sub_title($sub_title) if $sub_title;
$self->doc_author($author) if $author;
$self->doc_editor($editor) if $editor;
}
elsif ($type eq 'corpus') {
$self->corpus_title(_remove_prefix($title, $self->corpus_sigle)) if $title;
$self->corpus_sub_title($sub_title) if $sub_title;
$self->corpus_author($author) if $author;
$self->corpus_editor($editor) if $editor;
};
};
# Not in analytic
if ($type eq 'corpus') {
unless ($self->corpus_title) {
if (my $title = $dom->at('fileDesc > titleStmt > c\.title')) {
$self->corpus_title(_remove_prefix($title->all_text, $self->corpus_sigle)) if $title->all_text;
};
};
}
# doc title
elsif ($type eq 'doc') {
unless ($self->doc_title) {
if (my $title = $dom->at('fileDesc > titleStmt > d\.title')) {
$self->doc_title(_remove_prefix($title->all_text, $self->doc_sigle)) if $title->all_text;
};
};
}
# text title
elsif ($type eq 'text') {
unless ($self->title) {
if (my $title = $dom->at('fileDesc > titleStmt > t\.title')) {
$self->title(_remove_prefix($title->all_text, $self->text_sigle)) if $title->all_text;
};
};
};
# Get PubPlace
if (my $place = $dom->at('pubPlace')) {
$self->pub_place($place->all_text) if $place->all_text;
};
# Get Publisher
if (my $publisher = $dom->at('imprint publisher')) {
$self->publisher($publisher->all_text) if $publisher->all_text;
};
# my $mono = $dom->at('monogr');
# if ($mono) {
#
# # Get title, subtitle, author, editor
# my $title = $mono->at('h\.title[type=main]');
# my $sub_title = $mono->at('h\.title[type=sub]');
# my $author = $mono->at('h\.author');
# my $editor = $mono->at('editor');
#
# $title = $title ? $title->all_text : undef;
# $sub_title = $sub_title ? $sub_title->all_text : undef;
# $author = $author ? $author->all_text : undef;
# $editor = $editor ? $editor->all_text : undef;
#
# if ($type eq 'text') {
# $self->title($title) if $title && !$self->title;
# $self->sub_title($sub_title) if $sub_title && !$self->sub_title;
# $self->editor($editor) if $editor && !$self->editor;
# $self->author($author) if $author && !$self->author;
# }
# elsif ($type eq 'doc') {
# $self->doc_title($title) if $title && !$self->doc_title;
# $self->doc_sub_title($sub_title) if $sub_title && !$self->doc_sub_title;
# $self->doc_author($author) if $author && !$self->doc_author;
# $self->doc_editor($editor) if $editor && !$self->doc_editor;
# }
# elsif ($type eq 'corpus') {
# $self->corpus_title($title) if $title && !$self->corpus_title;
# $self->corpus_sub_title($sub_title) if $sub_title && !$self->corpus_sub_title;
# $self->corpus_author($author) if $author && !$self->corpus_author;
# $self->corpus_editor($editor) if $editor && !$self->corpus_editor;
# };
# };
# Get text type
my $text_desc = $dom->at('textDesc');
if ($text_desc) {
if (my $text_type = $text_desc->at('textType')) {
$self->text_type($text_type->all_text) if $text_type->all_text;
};
# Get text domain
if (my $text_domain = $text_desc->at('textDomain')) {
$self->text_domain($text_domain->all_text) if $text_domain->all_text;
};
# Get text type art
if (my $text_type_art = $text_desc->at('textTypeArt')) {
$self->text_type_art($text_type_art->all_text) if $text_type_art->all_text;
};
# Get text type art
if (my $text_type_ref = $text_desc->at('textTypeRef')) {
$self->text_type_ref($text_type_ref->all_text) if $text_type_ref->all_text;
};
};
# Get pubDate
my $pub_date = $dom->find('pubDate[type=year]');
$pub_date->each(
sub {
my $x = shift->parent;
my $year = $x->at("pubDate[type=year]");
return unless $year;
$year = $year ? $year->text : 0;
my $month = $x->at("pubDate[type=month]");
$month = $month ? $month->text : 0;
my $day = $x->at("pubDate[type=day]");
$day = $day ? $day->text : 0;
$year = 0 if $year !~ /^\d+$/;
$month = 0 if $month !~ /^\d+$/;
$day = 0 if $day !~ /^\d+$/;
my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
$date .= length($month) == 1 ? '0' . $month : $month;
$date .= length($day) == 1 ? '0' . $day : $day;
$self->pub_date($date);
});
# creatDate
my $create_date = $dom->at('creatDate');
if ($create_date && $create_date->text) {
$create_date = $create_date->all_text;
if (index($create_date, '-') > -1) {
$self->log->warn("Creation date ranges are not supported");
($create_date) = split /\s*-\s*/, $create_date;
}
$create_date =~ s{^(\d{4})$}{$1\.00};
$create_date =~ s{^(\d{4})\.(\d{2})$}{$1\.$2\.00};
if ($create_date =~ /^\d{4}\.\d{2}\.\d{2}$/) {
$create_date =~ tr/\.//d;
$self->creation_date($create_date);
};
};
my $text_class = $dom->at('textClass');
if ($text_class) {
# Get textClasses
my @topic;
$text_class->find("catRef")->each(
sub {
my ($ign, @ttopic) = split('\.', $_->attr('target'));
push(@topic, @ttopic);
}
);
$self->text_class(@topic) if @topic > 0;
my @keywords = $text_class->find("h\.keywords > keyTerm")->each;
$self->keywords(@keywords) if @keywords > 0;
};
if (my $edition_statement = $dom->at('biblFull editionStmt')) {
$self->bibl_edition_statement($edition_statement->all_text)
if $edition_statement->text;
};
if (my $edition_statement = $dom->at('fileDescl editionStmt')) {
$self->file_edition_statement($edition_statement->all_text)
if $edition_statement->text;
};
if (my $file_desc = $dom->at('fileDesc')) {
if (my $availability = $file_desc->at('publicationStmt > availability')) {
$self->license($availability->all_text);
};
};
# Some meta data only available in the corpus
if ($type eq 'corpus') {
if (my $language = $dom->at('profileDesc > langUsage > language[id]')) {
$self->language($language->attr('id'));
};
}
# Some meta data only reevant from the text
elsif ($type eq 'text') {
if (my $reference = $dom->at('sourceDesc reference[type=complete]')) {
if (my $ref_text = $reference->all_text) {
$ref_text =~ s!^[a-zA-Z0-9]+\/[a-zA-Z0-9]+\.\d+[\s:]\s*!!;
$self->reference($ref_text);
};
};
my $column = $dom->at('textDesc > column');
$self->text_column($column->all_text) if $column;
if (my $pages = $dom->at('biblStruct biblScope[type="pp"]')) {
$pages = $pages->all_text;
if ($pages && $pages =~ m/(\d+)\s*-\s*(\d+)/) {
$self->pages($1 . '-' . $2);
};
};
};
};
sub to_string {
my $self = shift;
my $string;
foreach (@ATTR) {
if (my $att = $self->$_) {
$att =~ s/\n/ /g;
$att =~ s/\s\s+/ /g;
$string .= $_ . ' = ' . $att . "\n";
};
};
# if ($self->author) {
# foreach (@{$self->author}) {
# $_ =~ s/\n/ /g;
# $_ =~ s/\s\s+/ /g;
# $string .= 'author = ' . $_ . "\n";
# };
# };
if ($self->text_class) {
foreach (@{$self->text_class}) {
$string .= 'text_class = ' . $_ . "\n";
};
};
return $string;
};
sub _k {
my $x = $_[0];
$x =~ s/_(\w)/\U$1\E/g;
$x =~ s/id$/ID/gi;
return $x;
};
sub to_hash {
my $self = shift;
$self->parse unless $self->text_sigle;
my %hash;
foreach (@ATTR, @ADVANCED_ATTR) {
if (my $att = $self->$_) {
$att =~ s/\n/ /g;
$att =~ s/\s\s+/ /g;
$hash{_k($_)} = $att;
};
};
for (qw/text_class keywords/) {
my @array = @{ $self->$_ };
next unless @array;
$hash{_k($_)} = join(' ', @array);
};
return \%hash;
};
# Don't work that well
sub _parse_meta_fast {
my $self = shift;
# my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1');
my $file = b($self->path . 'header.xml')->slurp;
my ($meta, $error);
my $unable = 'Unable to parse document ' . $self->path;
try {
local $SIG{__WARN__} = sub {
$error = 1;
};
$meta = xml2hash(
$file,
text => '#text',
attr => '-',
array => ['h.title', 'imprint', 'catRef', 'h.author']
)->{idsHeader};
}
catch {
$self->log->warn($unable);
$error = 1;
};
return if $error;
my $bibl_struct = $meta->{fileDesc}->{sourceDesc}->{biblStruct};
my $analytic = $bibl_struct->{analytic};
my $titles = $analytic->{'h.title'};
foreach (@$titles) {
if ($_->{'-type'} eq 'main') {
$self->title($_->{'#text'});
}
elsif ($_->{'-type'} eq 'sub') {
$self->sub_title($_->{'#text'});
};
};
# Get Author
if (my $author = $analytic->{'h.author'}) {
$self->author($author->[0]);
};
# Get pubDate
my $date = $bibl_struct->{monogr}->{imprint};
my ($year, $month, $day) = (0,0,0);
foreach (@$date) {
if ($date->{-type} eq 'year') {
$year = $date->{'#text'};
}
elsif ($date->{-type} eq 'month') {
$month = $date->{'#text'};
}
elsif ($date->{-type} eq 'day') {
$day = $date->{'#text'};
};
};
$year = 0 if $year !~ /^\d+$/;
$month = 0 if $month !~ /^\d+$/;
$day = 0 if $day !~ /^\d+$/;
$date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
$date .= length($month) == 1 ? '0' . $month : $month;
$date .= length($day) == 1 ? '0' . $day : $day;
$self->pub_date($date);
# Get textClasses
my @topic;
my $textClass = $meta->{profileDesc}->{textClass}->{catRef};
foreach (@$textClass) {
my ($ign, @ttopic) = split('\.', $_->{'-target'});
push(@topic, @ttopic);
};
$self->text_class(@topic);
};
1;
__END__
=pod
=head1 NAME
KorAP::Document
=head1 SYNOPSIS
my $doc = KorAP::Document->new(
path => 'mydoc-1/'
);
$doc->parse;
print $doc->title;
=head1 DESCRIPTION
Parse the primary and meta data of a document.
=head2 ATTRIBUTES
=head2 text_sigle
$doc->text_sigle(75476);
print $doc->text_sigle;
The unique identifier of the text.
=head2 doc_sigle
$doc->doc_sigle(75476);
print $doc->doc_sigle;
The unique identifier of the document.
=head2 corpus_sigle
$doc->corpus_sigle(4);
print $doc->corpus_sigle;
The unique identifier of the corpus.
=head2 path
$doc->path("example-004/");
print $doc->path;
The path of the document.
=head2 title
$doc->title("Der Name der Rose");
print $doc->title;
The title of the document.
=head2 sub_title
$doc->sub_title("Natürlich eine Handschrift");
print $doc->sub_title;
The title of the document.
=head2 pub_place
$doc->pub_place("Rom");
print $doc->pub_place;
The publication place of the document.
=head2 pub_date
$doc->pub_place("19800404");
print $doc->pub_place;
The publication date of the document,
in the format "YYYYMMDD".
=head2 primary
print $doc->primary->data(0,20);
The L<KorAP::Document::Primary> object containing the primary data.
=head2 author
$doc->author('Binks, Jar Jar; Luke Skywalker');
print $doc->author->[0];
Set the author value as semikolon separated list of names or
get an array reference of author names.
=head2 text_class
$doc->text_class(qw/news sports/);
print $doc->text_class->[0];
Set the text class as an array or get an array
reference of text classes.
=head1 METHODS
=head2 parse
$doc->parse;
Run the parsing process of the document
=cut
Deal with:
<attribute name="info">
<documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">kind of
information expressed by the given layer of annotation (there may, and often will, be
more than one)</documentation>
<list>
<oneOrMore>
<choice>
<value type="NCName">pos</value>
<value type="NCName">lemma</value>
<value type="NCName">msd</value>
<documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'msd' is
the traditional abbreviation for "morphosyntactic description", listing info on
e.g. tense, person, case, etc.</documentation>
<value type="NCName">dep</value>
<documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'dep' is
information about types of relations, used in dependency-style annotations; it is
an indication for the visualiser that word-to-word relationships should be
displayed</documentation>
<value type="NCName">lbl</value>
<documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'lbl'
indicates the presence of labels over dependency relations</documentation>
<value type="NCName">const</value>
<documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'const'
stands for 'constituency' or hierarchical, tree-based annotations; it is an
indication for the visualiser that it should display syntactic
trees</documentation>
<value type="NCName">cat</value>
<documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'cat' is
used for syntactic categories, as separate from pos; note that these sets need not
be disjoint (at the lexical level, they usually overlap), but the frontend prefers
to keep them separate. 'cat' will be found in the context of chunking or
hierarchical parsing and will characterise nodes; it may also be found in
dependency annotations, to indicate labels on nodes, as opposed to labels on arcs
(the latter are signalled by 'lbl')</documentation>
<value type="NCName">struct</value>
<documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">all
non-linguistic information (headers, highlights, etc.)</documentation>
<value type="NCName">frag</value>
<documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0"
>non-exhaustive coverage (when spanList/@fragmented="true")</documentation>
<value type="NCName">ne</value>
<documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">named
entities</documentation>
</choice>
</oneOrMore>
</list>
</attribute>