blob: b4ab7def989d4c2d650c1955f266b558aefa1935 [file] [log] [blame]
package KorAP::Document;
use Mojo::Base -base;
use v5.16;
use Mojo::ByteStream 'b';
use Mojo::Util qw/encode/;
use XML::Fast;
use Try::Tiny;
use Carp qw/croak/;
use KorAP::Document::Primary;
use Log::Log4perl;
use KorAP::Log;
use Mojo::DOM;
use Data::Dumper;
our @ATTR = qw/id
corpus_id
pub_date
title
sub_title
pub_place/;
our @ADVANCED_ATTR = qw/publisher
editor
text_type
text_type_art
creation_date
coll_title
coll_sub_title
coll_author
coll_editor
/;
has 'path';
has [@ATTR, @ADVANCED_ATTR];
has log => sub {
if(Log::Log4perl->initialized()) {
state $log = Log::Log4perl->get_logger(__PACKAGE__);
};
state $log = KorAP::Log->new;
return $log;
};
sub new {
my $class = shift;
my $self = bless { @_ }, $class;
if (exists $self->{path} && $self->{path} !~ m!\/$!) {
$self->{path} .= '/';
};
return $self;
};
# parse document
sub parse {
my $self = shift;
my $data_xml = $self->path . 'data.xml';
my ($rt, $error, $file);
my $unable = 'Unable to parse document ' . $self->path;
unless (-e $data_xml) {
$self->log->warn($unable . ' - no data.xml found');
$error = 1;
}
else {
$file = b($data_xml)->slurp;
try {
local $SIG{__WARN__} = sub {
$error = 1;
};
$rt = xml2hash($file, text => '#text', attr => '-')->{raw_text};
}
catch {
$self->log->warn($unable);
$error = 1;
};
};
return if $error;
$self->log->debug('Parse document ' . $self->path);
# Get document id and corpus id
if ($rt && $rt->{'-docid'}) {
$self->id($rt->{'-docid'});
if ($self->id =~ /^([^_]+)_/) {
$self->corpus_id($1);
}
else {
croak $unable . ': ID not parseable';
};
}
else {
croak $unable . ': No raw_text found or no ID';
};
# Get primary data
my $pd = $rt->{text};
if ($pd) {
$self->{pd} = KorAP::Document::Primary->new($pd);
}
else {
croak $unable;
};
# Get meta data
$self->_parse_meta;
return 1;
};
# Primary data
sub primary {
$_[0]->{pd};
};
sub author {
my $self = shift;
# Set authors
if ($_[0]) {
return $self->{authors} = [
grep { $_ !~ m{^\s*u\.a\.\s*$} } split(/;\s+/, shift())
];
}
return ($self->{authors} // []);
};
sub text_class {
my $self = shift;
if ($_[0]) {
return $self->{topics} = [ @_ ];
};
return ($self->{topics} // []);
};
sub _parse_meta {
my $self = shift;
my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1');
my $dom = Mojo::DOM->new($file);
my $analytic = $dom->at('analytic');
if ($analytic) {
# Get title
my $title = $analytic->at('h\.title[type=main]');
$self->title($title->text) if $title;
# Get Subtitle
my $sub_title = $analytic->at('h\.title[type=sub]');
$self->sub_title($sub_title->text) if $sub_title;
# Get Author
my $author = $analytic->at('h\.author');
$self->author($author->all_text) if $author;
# Get Editor
my $editor = $analytic->at('editor');
$self->editor($editor->all_text) if $editor;
};
# Get PubPlace
my $place = $dom->at('pubPlace');
$self->pub_place($place->all_text) if $place;
# Get Publisher
my $publisher = $dom->at('publisher');
$self->publisher($publisher->all_text) if $publisher;
my $mono = $dom->at('monogr');
if ($mono) {
# Get title
my $title = $mono->at('h\.title[type=main]');
# It's a monograph
if (!$self->title) {
$self->title($title->text) if $title;
# Get Subtitle
my $sub_title = $mono->at('h\.title[type=sub]');
$self->sub_title($sub_title->text) if $sub_title;
}
else {
$self->coll_title($title->text) if $title;
# Get Subtitle
my $sub_title = $mono->at('h\.title[type=sub]');
$self->coll_sub_title($sub_title->text) if $sub_title;
};
# Get Author
my $author = $mono->at('h\.author');
$self->coll_author($author->all_text) if $author;
# Get editor
my $editor = $mono->at('editor');
$self->coll_editor($editor->all_text) if $editor;
};
# Get text type
my $text_type = $dom->at('textDesc textType');
$self->text_type($text_type->all_text) if $text_type;
# Get text type
my $text_type_art = $dom->at('textDesc textTypeArt');
$self->text_type_art($text_type_art->all_text) if $text_type_art;
# Get pubDate
my $year = $dom->at("pubDate[type=year]");
$year = $year ? $year->text : 0;
my $month = $dom->at("pubDate[type=month]");
$month = $month ? $month->text : 0;
my $day = $dom->at("pubDate[type=day]");
$day = $day ? $day->text : 0;
$year = 0 if $year !~ /^\d+$/;
$month = 0 if $month !~ /^\d+$/;
$day = 0 if $day !~ /^\d+$/;
my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
$date .= length($month) == 1 ? '0' . $month : $month;
$date .= length($day) == 1 ? '0' . $day : $day;
$self->pub_date($date);
# creatDate
my $createdate = $dom->at('creatDate');
if ($createdate) {
$createdate = $createdate->all_text;
if (index($createdate, '-') > -1) {
$self->log->warn("Creation date ranges are not supported yet");
}
else {
$createdate =~ s{^(\d{4})$}{$1\.00};
$createdate =~ s{^(\d{4})\.(\d{2})$}{$1\.$2\.00};
if ($createdate =~ /^\d{4}\.\d{2}\.\d{2}$/) {
$createdate =~ tr/\.//d;
$self->creation_date($createdate);
};
};
};
# Get textClasses
my @topic;
$dom->find("textClass catRef")->each(
sub {
my ($ign, @ttopic) = split('\.', $_->attr('target'));
push(@topic, @ttopic);
}
);
$self->text_class(@topic);
};
sub to_string {
my $self = shift;
my $string;
foreach (@ATTR) {
if (my $att = $self->$_) {
$att =~ s/\n/ /g;
$att =~ s/\s\s+/ /g;
$string .= $_ . ' = ' . $att . "\n";
};
};
if ($self->author) {
foreach (@{$self->author}) {
$_ =~ s/\n/ /g;
$_ =~ s/\s\s+/ /g;
$string .= 'author = ' . $_ . "\n";
};
};
if ($self->text_class) {
foreach (@{$self->text_class}) {
$string .= 'text_class = ' . $_ . "\n";
};
};
return $string;
};
sub _k {
my $x = $_[0];
$x =~ s/_(\w)/\U$1\E/g;
$x =~ s/id$/ID/gi;
return $x;
};
sub to_hash {
my $self = shift;
$self->parse unless $self->id;
my %hash;
foreach (@ATTR) {
if (my $att = $self->$_) {
$att =~ s/\n/ /g;
$att =~ s/\s\s+/ /g;
$hash{_k($_)} = $att;
};
};
for ('author') {
$hash{_k($_)} = join(',', @{ $self->$_ });
};
for ('text_class') {
$hash{_k($_)} = join(' ', @{ $self->$_ });
};
return \%hash;
};
sub _parse_meta_fast {
my $self = shift;
# my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1');
my $file = b($self->path . 'header.xml')->slurp;
my ($meta, $error);
my $unable = 'Unable to parse document ' . $self->path;
try {
local $SIG{__WARN__} = sub {
$error = 1;
};
$meta = xml2hash($file, text => '#text', attr => '-', array => ['h.title', 'imprint', 'catRef', 'h.author'])->{idsHeader};
}
catch {
$self->log->warn($unable);
$error = 1;
};
return if $error;
my $bibl_struct = $meta->{fileDesc}->{sourceDesc}->{biblStruct};
my $analytic = $bibl_struct->{analytic};
my $titles = $analytic->{'h.title'};
foreach (@$titles) {
if ($_->{'-type'} eq 'main') {
$self->title($_->{'#text'});
}
elsif ($_->{'-type'} eq 'sub') {
$self->sub_title($_->{'#text'});
};
};
# Get Author
if (my $author = $analytic->{'h.author'}) {
$self->author($author->[0]);
};
# Get pubDate
my $date = $bibl_struct->{monogr}->{imprint};
my ($year, $month, $day) = (0,0,0);
foreach (@$date) {
warn $date;
if ($date->{-type} eq 'year') {
$year = $date->{'#text'};
}
elsif ($date->{-type} eq 'month') {
$month = $date->{'#text'};
}
elsif ($date->{-type} eq 'day') {
$day = $date->{'#text'};
};
};
$year = 0 if $year !~ /^\d+$/;
$month = 0 if $month !~ /^\d+$/;
$day = 0 if $day !~ /^\d+$/;
$date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
$date .= length($month) == 1 ? '0' . $month : $month;
$date .= length($day) == 1 ? '0' . $day : $day;
$self->pub_date($date);
# Get textClasses
my @topic;
my $textClass = $meta->{profileDesc}->{textClass}->{catRef};
foreach (@$textClass) {
my ($ign, @ttopic) = split('\.', $_->{'-target'});
push(@topic, @ttopic);
};
$self->text_class(@topic);
};
1;
__END__
=pod
=head1 NAME
KorAP::Document
=head1 SYNOPSIS
my $doc = KorAP::Document->new(
path => 'mydoc-1/'
);
$doc->parse;
print $doc->title;
=head1 DESCRIPTION
Parse the primary and meta data of a document.
=head2 ATTRIBUTES
=head2 id
$doc->id(75476);
print $doc->id;
The unique identifier of the document.
=head2 corpus_id
$doc->corpus_id(4);
print $doc->corpus_id;
The unique identifier of the corpus.
=head2 path
$doc->path("example-004/");
print $doc->path;
The path of the document.
=head2 title
$doc->title("Der Name der Rose");
print $doc->title;
The title of the document.
=head2 sub_title
$doc->sub_title("Natürlich eine Handschrift");
print $doc->sub_title;
The title of the document.
=head2 pub_place
$doc->pub_place("Rom");
print $doc->pub_place;
The publication place of the document.
=head2 pub_date
$doc->pub_place("19800404");
print $doc->pub_place;
The publication date of the document,
in the format "YYYYMMDD".
=head2 primary
print $doc->primary->data(0,20);
The L<KorAP::Document::Primary> object containing the primary data.
=head2 author
$doc->author('Binks, Jar Jar; Luke Skywalker');
print $doc->author->[0];
Set the author value as semikolon separated list of names or
get an array reference of author names.
=head2 text_class
$doc->text_class(qw/news sports/);
print $doc->text_class->[0];
Set the text class as an array or get an array
reference of text classes.
=head1 METHODS
=head2 parse
$doc->parse;
Run the parsing process of the document
=cut