blob: 5b4ef73907e88d529685cbd9b84f283addf54e00 [file] [log] [blame]
package KorAP::Document;
use Mojo::Base -base;
use v5.16;
use Mojo::ByteStream 'b';
use Mojo::DOM;
use Carp qw/croak/;
use KorAP::Document::Primary;
our @ATTR = qw/id corpus_id pub_date
title sub_title pub_place/;
has 'path';
has [@ATTR];
has log => sub { Log::Log4perl->get_logger(__PACKAGE__) };
# parse document
sub parse {
my $self = shift;
my $file = b($self->path . 'data.xml')->slurp;
state $unable = 'Unable to parse document ' . $self->path;
$self->log->debug('Parse document ' . $self->path);
my $dom = Mojo::DOM->new($file);
my $rt = $dom->at('raw_text');
# Get document id and corpus id
if ($rt && $rt->attr('docid')) {
$self->id($rt->attr('docid'));
if ($self->id =~ /^([^_]+)_/) {
$self->corpus_id($1);
}
else {
croak $unable . ': ID not parseable';
};
}
else {
croak $unable . ': No raw_text found or no ID';
};
# Get primary data
my $pd = $rt->at('text');
if ($pd) {
$pd = b($pd->text)->decode;
$self->{pd} = KorAP::Document::Primary->new($pd->to_string);
}
else {
croak $unable;
};
# Get meta data
$self->_parse_meta;
return 1;
};
# Primary data
sub primary {
$_[0]->{pd};
};
sub author {
my $self = shift;
# Set authors
if ($_[0]) {
return $self->{authors} = [
grep { $_ !~ m{^\s*u\.a\.\s*$} } split(/;\s+/, shift())
];
}
return ($self->{authors} // []);
};
sub text_class {
my $self = shift;
if ($_[0]) {
return $self->{topics} = [ @_ ];
};
return ($self->{topics} // []);
};
sub _parse_meta {
my $self = shift;
my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1');
my $dom = Mojo::DOM->new($file);
my $analytic = $dom->at('analytic');
# Get title
my $title = $analytic->at('h\.title[type=main]');
$self->title($title->text) if $title;
# Get Subtitle
my $sub_title = $analytic->at('h\.title[type=sub]');
$self->sub_title($sub_title->text) if $sub_title;
# Get Author
my $author = $analytic->at('h\.author');
$self->author($author->all_text) if $author;
# Get pubDate
my $year = $dom->at("pubDate[type=year]");
$year = $year ? $year->text : 0;
my $month = $dom->at("pubDate[type=month]");
$month = $month ? $month->text : 0;
my $day = $dom->at("pubDate[type=day]");
$day = $day ? $day->text : 0;
$year = 0 if $year !~ /^\d+$/;
$month = 0 if $month !~ /^\d+$/;
$day = 0 if $day !~ /^\d+$/;
my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
$date .= length($month) == 1 ? '0' . $month : $month;
$date .= length($day) == 1 ? '0' . $day : $day;
$self->pub_date($date);
# Get textClasses
my @topic;
$dom->find("textClass catRef")->each(
sub {
my ($ign, @ttopic) = split('\.', $_->attr('target'));
push(@topic, @ttopic);
}
);
$self->text_class(@topic);
};
sub to_string {
my $self = shift;
my $string;
foreach (@ATTR) {
if (my $att = $self->$_) {
$att =~ s/\n/ /g;
$att =~ s/\s\s+/ /g;
$string .= $_ . ' = ' . $att . "\n";
};
};
if ($self->author) {
foreach (@{$self->author}) {
$_ =~ s/\n/ /g;
$_ =~ s/\s\s+/ /g;
$string .= 'author = ' . $_ . "\n";
};
};
if ($self->text_class) {
foreach (@{$self->text_class}) {
$string .= 'text_class = ' . $_ . "\n";
};
};
return $string;
};
sub _k {
my $x = $_[0];
$x =~ s/_(\w)/\U$1\E/g;
$x =~ s/id$/ID/gi;
return $x;
};
sub to_hash {
my $self = shift;
my %hash;
foreach (@ATTR) {
if (my $att = $self->$_) {
$att =~ s/\n/ /g;
$att =~ s/\s\s+/ /g;
$hash{_k($_)} = $att;
};
};
for ('author') {
$hash{_k($_)} = join(',', @{ $self->$_ });
};
for ('text_class') {
$hash{_k($_)} = join(' ', @{ $self->$_ });
};
return \%hash;
};
1;
__END__
=pod
=head1 NAME
KorAP::Document
=head1 SYNOPSIS
my $doc = KorAP::Document->new(
path => 'mydoc-1/'
);
$doc->parse;
print $doc->title;
=head1 DESCRIPTION
Parse the primary and meta data of a document.
=head2 ATTRIBUTES
=head2 id
$doc->id(75476);
print $doc->id;
The unique identifier of the document.
=head2 corpus_id
$doc->corpus_id(4);
print $doc->corpus_id;
The unique identifier of the corpus.
=head2 path
$doc->path("example-004/");
print $doc->path;
The path of the document.
=head2 title
$doc->title("Der Name der Rose");
print $doc->title;
The title of the document.
=head2 sub_title
$doc->sub_title("Natürlich eine Handschrift");
print $doc->sub_title;
The title of the document.
=head2 pub_place
$doc->pub_place("Rom");
print $doc->pub_place;
The publication place of the document.
=head2 pub_date
$doc->pub_place("19800404");
print $doc->pub_place;
The publication date of the document,
in the format "YYYYMMDD".
=head2 primary
print $doc->primary->data(0,20);
The L<KorAP::Document::Primary> object containing the primary data.
=head2 author
$doc->author('Binks, Jar Jar; Luke Skywalker');
print $doc->author->[0];
Set the author value as semikolon separated list of names or
get an array reference of author names.
=head2 text_class
$doc->text_class(qw/news sports/);
print $doc->text_class->[0];
Set the text class as an array or get an array
reference of text classes.
=head1 METHODS
=head2 parse
$doc->parse;
Run the parsing process of the document
=cut