Lucene field indexer written in perl

commit: 2db9ad0d484214d641313937810225a635a997a4 [log] [tgz]
author: Nils Diewald <nils@diewald-online.de> Tue Oct 29 19:26:43 2013 +0000
committer: Nils Diewald <nils@diewald-online.de> Tue Oct 29 19:26:43 2013 +0000
tree: 46f404294a6ec09faf4d1a551a3a03599473dd61
diff --git a/lib/KorAP/Document.pm b/lib/KorAP/Document.pm
new file mode 100644
index 0000000..8f80e6c
--- /dev/null
+++ b/lib/KorAP/Document.pm

@@ -0,0 +1,250 @@
+package KorAP::Document;
+use Mojo::Base -base;
+use v5.16;
+
+use Mojo::ByteStream 'b';
+use Mojo::DOM;
+use Carp qw/croak carp/;
+use KorAP::Document::Primary;
+
+has [qw/id corpus_id path/];
+has [qw/pub_date title sub_title pub_place/];
+
+# parse document
+sub parse {
+  my $self = shift;
+  my $file = b($self->path . 'data.xml')->slurp;
+
+  state $unable = 'Unable to parse document';
+
+  carp 'Parse document ' . $self->path;
+
+  my $dom = Mojo::DOM->new($file);
+
+  my $rt = $dom->at('raw_text');
+
+  # Get document id and corpus id
+  if ($rt && $rt->attr('docid')) {
+    $self->id($rt->attr('docid'));
+    if ($self->id =~ /^([^_]+)_/) {
+      $self->corpus_id($1);
+    }
+    else {
+      croak $unable;
+    };
+  }
+  else {
+    croak $unable;
+  };
+
+  # Get primary data
+  my $pd = $rt->at('text');
+  if ($pd) {
+
+    $pd = b($pd->text)->decode;
+    $self->{pd} = KorAP::Document::Primary->new($pd->to_string);
+  }
+  else {
+    croak $unable;
+  };
+
+  # Get meta data
+  $self->_parse_meta;
+  return 1;
+};
+
+
+# Primary data
+sub primary {
+  $_[0]->{pd};
+};
+
+sub author {
+  my $self = shift;
+
+  # Set authors
+  if ($_[0]) {
+    return $self->{authors} = [
+      grep { $_ !~ m{^\s*u\.a\.\s*$} } split(/;\s+/, shift())
+    ];
+  }
+  return ($self->{authors} // []);
+};
+
+sub text_class {
+  my $self = shift;
+  if ($_[0]) {
+    return $self->{topics} = [ @_ ];
+  };
+  return ($self->{topics} // []);
+};
+
+
+
+sub _parse_meta {
+  my $self = shift;
+
+  my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1')->encode;
+
+  my $dom = Mojo::DOM->new($file);
+  my $monogr = $dom->at('monogr');
+
+  # Get title
+  my $title = $monogr->at('h\.title[type=main]');
+  $self->title($title->text) if $title;
+
+  # Get Subtitle
+  my $sub_title = $monogr->at('h\.title[type=sub]');
+  $self->sub_title($sub_title->text) if $sub_title;
+
+  # Get Author
+  my $author = $monogr->at('h\.author');
+  $self->author($author->all_text) if $author;
+
+  # Get pubDate
+  my $year = $dom->at("pubDate[type=year]");
+  $year = $year ? $year->text : 0;
+  my $month = $dom->at("pubDate[type=month]");
+  $month = $month ? $month->text : 0;
+  my $day = $dom->at("pubDate[type=day]");
+  $day = $day ? $day->text : 0;
+
+  my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
+  $date .= length($month) == 1 ? '0' . $month : $month;
+  $date .= length($day) == 1 ? '0' . $day : $day;
+
+  $self->pub_date($date);
+
+  # Get textClasses
+  my @topic;
+  $dom->find("textClass catRef")->each(
+    sub {
+      my ($ign, @ttopic) = split('\.', $_->attr('target'));
+      push(@topic, @ttopic);
+    }
+  );
+  $self->text_class(@topic);
+};
+
+
+1;
+
+
+__END__
+
+=pod
+
+=head1 NAME
+
+KorAP::Document
+
+
+=head1 SYNOPSIS
+
+  my $doc = KorAP::Document->new(
+    path => 'mydoc-1/'
+  );
+
+  $doc->parse;
+
+  print $doc->title;
+
+
+=head1 DESCRIPTION
+
+Parse the primary and meta data of a document.
+
+
+=head2 ATTRIBUTES
+
+=head2 id
+
+  $doc->id(75476);
+  print $doc->id;
+
+The unique identifier of the document.
+
+
+=head2 corpus_id
+
+  $doc->corpus_id(4);
+  print $doc->corpus_id;
+
+The unique identifier of the corpus.
+
+
+=head2 path
+
+  $doc->path("example-004/");
+  print $doc->path;
+
+The path of the document.
+
+
+=head2 title
+
+  $doc->title("Der Name der Rose");
+  print $doc->title;
+
+The title of the document.
+
+
+=head2 sub_title
+
+  $doc->sub_title("Natürlich eine Handschrift");
+  print $doc->sub_title;
+
+The title of the document.
+
+
+=head2 pub_place
+
+  $doc->pub_place("Rom");
+  print $doc->pub_place;
+
+The publication place of the document.
+
+
+=head2 pub_date
+
+  $doc->pub_place("19800404");
+  print $doc->pub_place;
+
+The publication date of the document,
+in the format "YYYYMMDD".
+
+
+=head2 primary
+
+  print $doc->primary->data(0,20);
+
+The L<KorAP::Document::Primary> object containing the primary data.
+
+
+=head2 author
+
+  $doc->author('Binks, Jar Jar; Luke Skywalker');
+  print $doc->author->[0];
+
+Set the author value as semikolon separated list of names or
+get an array reference of author names.
+
+=head2 text_class
+
+  $doc->text_class(qw/news sports/);
+  print $doc->text_class->[0];
+
+Set the text class as an array or get an array
+reference of text classes.
+
+
+=head1 METHODS
+
+=head2 parse
+
+  $doc->parse;
+
+Run the parsing process of the document
+
+
+=cut
commit	2db9ad0d484214d641313937810225a635a997a4	[log] [tgz]
author	Nils Diewald <nils@diewald-online.de>	Tue Oct 29 19:26:43 2013 +0000
committer	Nils Diewald <nils@diewald-online.de>	Tue Oct 29 19:26:43 2013 +0000
tree	46f404294a6ec09faf4d1a551a3a03599473dd61