| package KorAP::Document; |
| use Mojo::Base -base; |
| use Mojo::ByteStream 'b'; |
| use Mojo::Util qw/encode/; |
| use XML::Fast; |
| use Try::Tiny; |
| use Carp qw/croak/; |
| use KorAP::Document::Primary; |
| use Log::Log4perl; |
| use KorAP::Log; |
| use Mojo::DOM; |
| use Data::Dumper; |
| use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/; |
| |
| # TODO: Currently metadata is processed multiple times - that's horrible! |
| # Due to the kind of processing, processed metadata may be stored in |
| # a multiprocess cache instead. |
| |
# Basic metadata attributes.
# The order matters: to_string() emits the fields in this sequence.
our @ATTR = qw(
  text_sigle
  doc_sigle
  corpus_sigle
  title
  pub_date
  sub_title
  pub_place
  author
);

# Extended metadata attributes, only serialized by to_hash().
our @ADVANCED_ATTR = qw(
  publisher
  editor
  text_type
  text_type_art
  text_type_ref
  text_column
  text_domain
  creation_date
  license
  pages
  file_edition_statement
  bibl_edition_statement
  reference
  language

  doc_title
  doc_sub_title
  doc_editor
  doc_author

  corpus_author
  corpus_title
  corpus_sub_title
  corpus_editor
);
# Separate: text_class, keywords (array based, they have their own accessors)

# Removed: coll_title, coll_sub_title, coll_author, coll_editor
# Introduced: doc_title, doc_sub_title, corpus_editor, doc_editor, corpus_author, doc_author


# Generate plain accessors for the path and for every metadata attribute
has [ 'path', @ATTR, @ADVANCED_ATTR ];
| |
# Logger used by the document.
#
# Yields the Log::Log4perl logger of this package when Log4perl has been
# initialized, and a KorAP::Log object otherwise. Each of the two loggers
# is created once and then reused.
has log => sub {
  if (Log::Log4perl->initialized) {
    state $log4perl = Log::Log4perl->get_logger(__PACKAGE__);

    # Return right here - without this the fallback below always wins
    # and the Log4perl logger is never used
    return $log4perl;
  };

  state $fallback = KorAP::Log->new;
  return $fallback;
};
| |
# Construct a new document object.
#
# Accepts a list of key/value pairs that become the object's fields.
# A given "path" is turned into an absolute path that always ends
# with a slash.
sub new {
  my ($class, @fields) = @_;
  my $self = bless {@fields}, $class;

  # Nothing to normalize without a path
  return $self unless exists $self->{path};

  my $absolute = rel2abs($self->{path});
  $absolute .= '/' unless $absolute =~ m!\/$!;
  $self->{path} = $absolute;

  return $self;
};
| |
# Parse the document.
#
# Reads "data.xml" below the document path, derives the text, document
# and corpus sigles from the "docid" attribute of the raw_text element,
# wraps the primary text in a KorAP::Document::Primary object and finally
# parses the header.xml files of the corpus, the document and the text
# directory (in this order) for metadata.
#
# Returns 1 on success. Logs a warning and returns nothing if data.xml
# is missing or cannot be parsed. Croaks if the ID is missing or not
# parseable, or if there is no primary text.
sub parse {
  my $self = shift;

  my $path     = $self->path;
  my $data_xml = $path . 'data.xml';
  my $unable   = 'Unable to parse document ' . $path;

  unless (-e $data_xml) {
    $self->log->warn($unable . ' - no data.xml found');
    return;
  };

  my $file = b($data_xml)->slurp;

  my ($rt, $failed);
  try {

    # Any warning raised while parsing counts as a parse failure
    local $SIG{__WARN__} = sub {
      $failed = 1;
    };
    $rt = xml2hash($file, text => '#text', attr => '-')->{raw_text};
  }
  catch {
    $failed = 1;
  };

  # Report warnings and exceptions alike, so no failure stays silent
  if ($failed) {
    $self->log->warn($unable);
    return;
  };

  $self->log->debug('Parse document ' . $path);

  # Get document id and corpus id
  croak $unable . ': No raw_text found or no ID'
    unless $rt && $rt->{'-docid'};

  $self->text_sigle($rt->{'-docid'});
  if ($self->text_sigle =~ /^(([^_]+)_[^\._]+?)\..+?$/) {
    $self->corpus_sigle($2);
    $self->doc_sigle($1);
  }
  else {
    croak $unable . ': ID not parseable';
  };

  # Get primary data
  my $pd = $rt->{text};
  croak $unable unless $pd;
  $self->{pd} = KorAP::Document::Primary->new($pd);

  # Only empty path components are dropped - a directory named "0"
  # is a valid component and has to be kept
  my @dirs = grep { length } splitdir($path);

  # Walk up from the text directory to the corpus directory and remember
  # each header file together with its level, topmost level first
  my @levels;
  foreach my $type (qw/text doc corpus/) {
    unshift @levels, [ $type, '/' . catfile(@dirs, 'header.xml') ];
    pop @dirs;
  };

  # Get corpus, doc and text meta data
  foreach my $level (@levels) {
    my ($type, $header) = @$level;
    $self->_parse_meta($header, $type) if -e $header;
  };

  return 1;
};
| |
| |
# Get the primary data object of the document
# (stored in the "pd" field by parse()).
sub primary {
  my ($self) = @_;
  return $self->{pd};
};
| |
| #sub author { |
| # my $self = shift; |
| # |
| # # Set authors |
| # if ($_[0]) { |
| # return $self->{authors} = [ |
| # grep { $_ !~ m{^\s*u\.a\.\s*$} } split(/;\s+/, shift()) |
| # ]; |
| # } |
| # return ($self->{authors} // []); |
| #}; |
| |
# Get or set the text classes (topics) of the document.
#
# Called with a list whose first element is true, the list is stored and
# returned as a new array reference. Otherwise the stored array
# reference (or an empty one) is returned.
sub text_class {
  my ($self, @topics) = @_;

  # Getter
  return ($self->{topics} // []) unless $topics[0];

  # Setter
  $self->{topics} = [@topics];
  return $self->{topics};
};
| |
# Get all text classes as a single string, separated by blanks.
sub text_class_string {
  my ($self) = @_;
  my $topics = $self->text_class;
  return join ' ', @$topics;
}
| |
# Get or set the keywords of the document.
#
# Called with a list whose first element is true, the list is stored and
# returned as a new array reference. Otherwise the stored array
# reference (or an empty one) is returned.
sub keywords {
  my ($self, @terms) = @_;

  # Getter
  return ($self->{keywords} // []) unless $terms[0];

  # Setter
  $self->{keywords} = [@terms];
  return $self->{keywords};
};
| |
# Get all keywords as a single string, separated by blanks.
sub keywords_string {
  my ($self) = @_;
  my $terms = $self->keywords;
  return join ' ', @$terms;
}
| |
# Hook for removing a leading sigle from a title.
#
# Arguments:
#   $title  - the title as found in the header
#   $prefix - the sigle the title may start with (currently ignored)
#
# The removal is switched off on purpose and the title is returned
# unchanged, because stripping may render some titles wrong,
# e.g. 'VDI nachrichten 2014' ...
#
# The disabled implementation is kept for reference:
#
#   $prefix =~ tr!_!/!;
#   if (index($title, $prefix) == 0) {
#     $title = substr($title, length($prefix));
#     $title =~ s/^\s+//;
#     $title =~ s/\s+$//;
#   };
sub _remove_prefix {
  my ($title) = @_;
  return $title;
};
| |
| |
# Parse one header file and copy its metadata into the document.
#
# Arguments:
#   $header_xml - path of the header.xml file to read
#   $type       - level of the header: 'corpus', 'doc' or 'text'
#
# Title, subtitle, author and editor are stored in level specific
# attributes (corpus_title, doc_title, title, ...). All other fields are
# shared between the levels, so a header parsed later overwrites the
# values of a header parsed earlier.
sub _parse_meta {
  my ($self, $header_xml, $type) = @_;

  # The header is decoded as ISO-8859-1
  my $file = b($header_xml)->slurp->decode('iso-8859-1');

  my $dom = Mojo::DOM->new($file);

  # Per level: prefix of the attribute names and the element holding the
  # fallback title in the title statement
  my %level_of = (
    corpus => { prefix => 'corpus_', title => 'c\.title' },
    doc    => { prefix => 'doc_',    title => 'd\.title' },
    text   => { prefix => '',        title => 't\.title' },
  );

  if (my $level = $level_of{$type}) {
    my $prefix     = $level->{prefix};
    my $title_attr = $prefix . 'title';
    my $sigle_attr = $type . '_sigle';

    # There is an analytic element
    if (my $analytic = $dom->at('analytic')) {

      # Get title, subtitle, author, editor
      my @fields = (
        [ title     => 'h\.title[type=main]' ],
        [ sub_title => 'h\.title[type=sub]' ],
        [ author    => 'h\.author' ],
        [ editor    => 'editor' ],
      );

      foreach (@fields) {
        my ($field, $selector) = @$_;
        my $node  = $analytic->at($selector) or next;
        my $value = $node->all_text          or next;

        $value = _remove_prefix($value, $self->$sigle_attr)
          if $field eq 'title';

        my $attr = $prefix . $field;
        $self->$attr($value);
      };
    };

    # Not in analytic - take the title from the title statement
    unless ($self->$title_attr) {
      if (my $title = $dom->at('fileDesc > titleStmt > ' . $level->{title})) {
        my $value = $title->all_text;
        $self->$title_attr(_remove_prefix($value, $self->$sigle_attr))
          if $value;
      };
    };
  };

  # Get PubPlace
  if (my $place = $dom->at('pubPlace')) {
    my $value = $place->all_text;
    $self->pub_place($value) if $value;
  };

  # Get Publisher
  if (my $publisher = $dom->at('imprint publisher')) {
    my $value = $publisher->all_text;
    $self->publisher($value) if $value;
  };

  # NOTE: A fallback reading title, subtitle, author and editor from the
  # "monogr" element (same selectors as for "analytic", only applied when
  # the attribute was not set yet) was disabled and is not applied.

  # Get text type, text domain, text type art and text type ref
  if (my $text_desc = $dom->at('textDesc')) {
    my @fields = (
      [ text_type     => 'textType' ],
      [ text_domain   => 'textDomain' ],
      [ text_type_art => 'textTypeArt' ],
      [ text_type_ref => 'textTypeRef' ],
    );

    foreach (@fields) {
      my ($attr, $selector) = @$_;
      my $node  = $text_desc->at($selector) or next;
      my $value = $node->all_text           or next;
      $self->$attr($value);
    };
  };

  # Get pubDate - year, month and day are siblings below a common parent
  $dom->find('pubDate[type=year]')->each(
    sub {
      my $scope = shift->parent;
      return unless $scope->at('pubDate[type=year]');

      # Missing or non-numeric parts count as 0
      my ($year, $month, $day) = map {
        my $node  = $scope->at("pubDate[type=$_]");
        my $value = $node ? $node->text : 0;
        $value =~ /^\d+$/ ? $value : 0;
      } qw/year month day/;

      # Build the date in the format YYYYMMDD
      my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
      $date .= length($month) == 1 ? '0' . $month : $month;
      $date .= length($day) == 1 ? '0' . $day : $day;
      $self->pub_date($date);
    });

  # creatDate
  my $create_date = $dom->at('creatDate');
  if ($create_date && $create_date->text) {
    $create_date = $create_date->all_text;

    # Of a range only the start is used
    if (index($create_date, '-') > -1) {
      $self->log->warn("Creation date ranges are not supported");
      ($create_date) = split /\s*-\s*/, $create_date;
    }

    # Pad missing month and day with 00, then strip the dots
    $create_date =~ s{^(\d{4})$}{$1\.00};
    $create_date =~ s{^(\d{4})\.(\d{2})$}{$1\.$2\.00};
    if ($create_date =~ /^\d{4}\.\d{2}\.\d{2}$/) {
      $create_date =~ tr/\.//d;
      $self->creation_date($create_date);
    };
  };

  if (my $text_class = $dom->at('textClass')) {

    # Get textClasses - the first segment of each target is ignored
    my @topic;
    $text_class->find('catRef')->each(
      sub {
        my ($cat_ref) = @_;

        # A catRef without target contributes nothing
        my ($ign, @ttopic) = split /\./, $cat_ref->attr('target') // '';
        push(@topic, @ttopic);
      }
    );
    $self->text_class(@topic) if @topic > 0;

    # The selector has to be single quoted: in double quotes the
    # backslash is lost and "h.keywords" would mean a tag "h" with the
    # class "keywords". The keywords are stored as text, not as nodes.
    my @keywords = grep { defined && length }
      map { $_->all_text }
      $text_class->find('h\.keywords > keyTerm')->each;
    $self->keywords(@keywords) if @keywords > 0;
  };

  if (my $edition_statement = $dom->at('biblFull editionStmt')) {
    $self->bibl_edition_statement($edition_statement->all_text)
      if $edition_statement->text;
  };

  # Edition statement of the file description itself. The child
  # combinator avoids picking up the one nested in biblFull.
  if (my $edition_statement = $dom->at('fileDesc > editionStmt')) {
    $self->file_edition_statement($edition_statement->all_text)
      if $edition_statement->text;
  };

  if (my $file_desc = $dom->at('fileDesc')) {
    if (my $availability = $file_desc->at('publicationStmt > availability')) {
      $self->license($availability->all_text);
    };
  };

  # Some meta data only available in the corpus
  if ($type eq 'corpus') {
    if (my $language = $dom->at('profileDesc > langUsage > language[id]')) {
      $self->language($language->attr('id'));
    };
  }

  # Some meta data only relevant from the text
  elsif ($type eq 'text') {

    if (my $reference = $dom->at('sourceDesc reference[type=complete]')) {
      if (my $ref_text = $reference->all_text) {

        # Strip the leading sigle from the reference
        $ref_text =~ s!^[a-zA-Z0-9]+\/[a-zA-Z0-9]+\.\d+[\s:]\s*!!;
        $self->reference($ref_text);
      };
    };

    my $column = $dom->at('textDesc > column');
    $self->text_column($column->all_text) if $column;

    if (my $pages = $dom->at('biblStruct biblScope[type="pp"]')) {
      $pages = $pages->all_text;
      if ($pages && $pages =~ m/(\d+)\s*-\s*(\d+)/) {
        $self->pages($1 . '-' . $2);
      };
    };
  };
};
| |
| |
| |
# Serialize the basic metadata as "name = value" lines.
#
# Each set attribute of @ATTR yields one line, with line breaks and
# runs of whitespace in the value reduced to single blanks. Each text
# class yields a "text_class = ..." line. Returns undef if there is
# nothing to serialize.
sub to_string {
  my ($self) = @_;

  my $string;

  for my $name (@ATTR) {
    my $value = $self->$name or next;
    $value =~ s/\n/ /g;
    $value =~ s/\s\s+/ /g;
    $string .= "$name = $value\n";
  };

  if (my $topics = $self->text_class) {
    $string .= "text_class = $_\n" for @$topics;
  };

  return $string;
};
| |
# Convert an attribute name to the key used by to_hash():
# snake_case becomes camelCase and a trailing "id" becomes "ID".
sub _k {
  my ($name) = @_;
  (my $key = $name) =~ s/_(\w)/\U$1\E/g;
  $key =~ s/id$/ID/gi;
  return $key;
};
| |
| |
# Get the metadata of the document as a hash reference.
#
# The document is parsed first if it has no text sigle yet. Keys are
# the attribute names converted by _k(). Scalar values have line breaks
# and runs of whitespace reduced to single blanks; text classes and
# keywords are joined by blanks. Unset and empty fields are omitted.
sub to_hash {
  my ($self) = @_;

  $self->parse unless $self->text_sigle;

  my %hash;

  # Scalar attributes
  for my $name (@ATTR, @ADVANCED_ATTR) {
    my $value = $self->$name or next;
    $value =~ s/\n/ /g;
    $value =~ s/\s\s+/ /g;
    $hash{ _k($name) } = $value;
  };

  # Array based attributes
  for my $name (qw/text_class keywords/) {
    my @values = @{ $self->$name };
    $hash{ _k($name) } = join(' ', @values) if @values;
  };

  return \%hash;
};
| |
| |
# Alternative metadata parser based on XML::Fast instead of Mojo::DOM.
# Doesn't work that well.
#
# Reads "header.xml" below the document path and sets title, sub_title,
# author, pub_date and the text classes. Logs a warning and returns
# nothing if the header cannot be parsed; any warning raised while
# parsing aborts as well.
sub _parse_meta_fast {
  my $self = shift;

  # my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1');
  my $file = b($self->path . 'header.xml')->slurp;

  my ($meta, $error);
  my $unable = 'Unable to parse document ' . $self->path;

  try {
    local $SIG{__WARN__} = sub {
      $error = 1;
    };
    $meta = xml2hash(
      $file,
      text => '#text',
      attr => '-',
      array => ['h.title', 'imprint', 'catRef', 'h.author']
    )->{idsHeader};
  }
  catch {
    $self->log->warn($unable);
    $error = 1;
  };

  return if $error;

  my $bibl_struct = $meta->{fileDesc}->{sourceDesc}->{biblStruct};
  my $analytic = $bibl_struct->{analytic};

  # Get title and subtitle - entries without attributes are plain
  # strings and carry no type, so they are skipped
  foreach my $title (@{ $analytic->{'h.title'} // [] }) {
    next unless ref $title eq 'HASH';
    my $kind = $title->{'-type'} // '';

    if ($kind eq 'main') {
      $self->title($title->{'#text'});
    }
    elsif ($kind eq 'sub') {
      $self->sub_title($title->{'#text'});
    };
  };

  # Get Author
  if (my $author = $analytic->{'h.author'}) {
    $self->author($author->[0]);
  };

  # Get pubDate
  # NOTE(review): assumes the typed date parts are the pubDate children
  # of each imprint entry, as the selectors in _parse_meta() suggest -
  # confirm against real headers.
  my ($year, $month, $day) = (0,0,0);
  foreach my $imprint (@{ $bibl_struct->{monogr}->{imprint} // [] }) {
    next unless ref $imprint eq 'HASH';

    my $pub_dates = $imprint->{pubDate};
    next unless defined $pub_dates;

    # A single pubDate is not wrapped in an array
    $pub_dates = [$pub_dates] unless ref $pub_dates eq 'ARRAY';

    foreach my $part (@$pub_dates) {
      next unless ref $part eq 'HASH';
      my $kind = $part->{'-type'} // '';

      if ($kind eq 'year') {
        $year = $part->{'#text'} // 0;
      }
      elsif ($kind eq 'month') {
        $month = $part->{'#text'} // 0;
      }
      elsif ($kind eq 'day') {
        $day = $part->{'#text'} // 0;
      };
    };
  };

  # Non-numeric parts count as 0
  $year = 0 if $year !~ /^\d+$/;
  $month = 0 if $month !~ /^\d+$/;
  $day = 0 if $day !~ /^\d+$/;

  # Build the date in the format YYYYMMDD
  my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
  $date .= length($month) == 1 ? '0' . $month : $month;
  $date .= length($day) == 1 ? '0' . $day : $day;

  $self->pub_date($date);

  # Get textClasses - the first segment of each target is ignored
  my @topic;
  foreach my $cat_ref (@{ $meta->{profileDesc}->{textClass}->{catRef} // [] }) {
    next unless ref $cat_ref eq 'HASH';
    my ($ign, @ttopic) = split /\./, $cat_ref->{'-target'} // '';
    push(@topic, @ttopic);
  };
  $self->text_class(@topic);
};
| |
| |
| |
| 1; |
| |
| |
| __END__ |
| |
| =pod |
| |
| =head1 NAME |
| |
| KorAP::Document |
| |
| |
| =head1 SYNOPSIS |
| |
| my $doc = KorAP::Document->new( |
| path => 'mydoc-1/' |
| ); |
| |
| $doc->parse; |
| |
| print $doc->title; |
| |
| |
| =head1 DESCRIPTION |
| |
| Parse the primary and meta data of a document. |
| |
| |
| =head1 ATTRIBUTES |
| |
| =head2 text_sigle |
| |
| $doc->text_sigle(75476); |
| print $doc->text_sigle; |
| |
| The unique identifier of the text. |
| |
| |
| =head2 doc_sigle |
| |
| $doc->doc_sigle(75476); |
| print $doc->doc_sigle; |
| |
| The unique identifier of the document. |
| |
| |
| =head2 corpus_sigle |
| |
| $doc->corpus_sigle(4); |
| print $doc->corpus_sigle; |
| |
| The unique identifier of the corpus. |
| |
| |
| =head2 path |
| |
| $doc->path("example-004/"); |
| print $doc->path; |
| |
| The path of the document. |
| |
| |
| =head2 title |
| |
| $doc->title("Der Name der Rose"); |
| print $doc->title; |
| |
| The title of the document. |
| |
| |
| =head2 sub_title |
| |
| $doc->sub_title("Natürlich eine Handschrift"); |
| print $doc->sub_title; |
| |
| The subtitle of the document. |
| |
| |
| =head2 pub_place |
| |
| $doc->pub_place("Rom"); |
| print $doc->pub_place; |
| |
| The publication place of the document. |
| |
| |
| =head2 pub_date |
| |
| $doc->pub_date("19800404"); |
| print $doc->pub_date; |
| |
| The publication date of the document, |
| in the format "YYYYMMDD". |
| |
| |
| =head2 primary |
| |
| print $doc->primary->data(0,20); |
| |
| The L<KorAP::Document::Primary> object containing the primary data. |
| |
| |
| =head2 author |
| |
| $doc->author('Binks, Jar Jar; Luke Skywalker'); |
| print $doc->author; |
| |
| Set or get the author of the document as a plain string. Several |
| names are kept in one string, e.g. as a semicolon separated list. |
| |
| =head2 text_class |
| |
| $doc->text_class(qw/news sports/); |
| print $doc->text_class->[0]; |
| |
| Set the text class as an array or get an array |
| reference of text classes. |
| |
| |
| =head1 METHODS |
| |
| =head2 parse |
| |
| $doc->parse; |
| |
| Run the parsing process of the document |
| |
| |
| =cut |
| |
| |
| Deal with: |
| <attribute name="info"> |
| <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">kind of |
| information expressed by the given layer of annotation (there may, and often will, be |
| more than one)</documentation> |
| <list> |
| <oneOrMore> |
| <choice> |
| <value type="NCName">pos</value> |
| <value type="NCName">lemma</value> |
| <value type="NCName">msd</value> |
| <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'msd' is |
| the traditional abbreviation for "morphosyntactic description", listing info on |
| e.g. tense, person, case, etc.</documentation> |
| <value type="NCName">dep</value> |
| <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'dep' is |
| information about types of relations, used in dependency-style annotations; it is |
| an indication for the visualiser that word-to-word relationships should be |
| displayed</documentation> |
| <value type="NCName">lbl</value> |
| <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'lbl' |
| indicates the presence of labels over dependency relations</documentation> |
| <value type="NCName">const</value> |
| <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'const' |
| stands for 'constituency' or hierarchical, tree-based annotations; it is an |
| indication for the visualiser that it should display syntactic |
| trees</documentation> |
| <value type="NCName">cat</value> |
| <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'cat' is |
| used for syntactic categories, as separate from pos; note that these sets need not |
| be disjoint (at the lexical level, they usually overlap), but the frontend prefers |
| to keep them separate. 'cat' will be found in the context of chunking or |
| hierarchical parsing and will characterise nodes; it may also be found in |
| dependency annotations, to indicate labels on nodes, as opposed to labels on arcs |
| (the latter are signalled by 'lbl')</documentation> |
| <value type="NCName">struct</value> |
| <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">all |
| non-linguistic information (headers, highlights, etc.)</documentation> |
| <value type="NCName">frag</value> |
| <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0" |
| >non-exhaustive coverage (when spanList/@fragmented="true")</documentation> |
| <value type="NCName">ne</value> |
| <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">named |
| entities</documentation> |
| </choice> |
| </oneOrMore> |
| </list> |
| </attribute> |