Simplified and modularized metadata processing
Change-Id: I63e78fd5994126c954263324bcfc2fd9d51e39ea
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 519e26e..087b2ad 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -18,53 +18,11 @@
# Due to the kind of processing, processed metadata may be stored in
# a multiprocess cache instead.
-our $VERSION = '0.14';
-
-our @ATTR = qw/text_sigle
- doc_sigle
- corpus_sigle
- title
- pub_date
- sub_title
- pub_place
- author/;
-
-our @ADVANCED_ATTR = qw/publisher
- editor
- text_type
- text_type_art
- text_type_ref
- text_column
- text_domain
- creation_date
- license
- pages
- file_edition_statement
- bibl_edition_statement
- reference
- language
-
- doc_title
- doc_sub_title
- doc_editor
- doc_author
-
- corpus_author
- corpus_title
- corpus_sub_title
- corpus_editor
-
- availability
- pub_place_key
- /;
-# Separate: text_class, keywords
-
-# Removed: coll_title, coll_sub_title, coll_author, coll_editor
-# Introduced: doc_title, doc_sub_title, corpus_editor, doc_editor, corpus_author, doc_author
-
+our $VERSION = '0.15';
has 'path';
-has [@ATTR, @ADVANCED_ATTR];
+has [qw/text_sigle doc_sigle corpus_sigle/];
+has 'meta_type' => 'I5';
has log => sub {
if(Log::Log4perl->initialized()) {
@@ -74,7 +32,6 @@
return $log;
};
-
sub new {
my $class = shift;
my $self = bless { @_ }, $class;
@@ -87,9 +44,11 @@
return $self;
};
-# parse document
+
+# Parse document (primary data and metadata)
sub parse {
my $self = shift;
+ my $meta_data_type = $self->meta_type;
my $data_xml = $self->path . 'data.xml';
@@ -97,6 +56,7 @@
my $unable = 'Unable to parse document ' . $self->path;
+ # No primary data found
unless (-e $data_xml) {
$self->log->warn($unable . ' - no data.xml found');
$error = 1;
@@ -104,6 +64,7 @@
else {
+ # Load file
$file = b($data_xml)->slurp;
try {
@@ -148,12 +109,30 @@
my @path = grep { $_ } splitdir($self->path);
my @header;
- # Parse the corpus file, the doc file, and the text file for meta information
+ # Parse the corpus file, the doc file,
+ # and the text file for meta information
foreach (0..2) {
unshift @header, '/' . catfile(@path, 'header.xml');
pop @path;
};
+
+ my $meta_class = 'KorAP::XML::Meta::' . $meta_data_type;
+ my $meta;
+
+ if ($meta_class->can('new') || eval("require $meta_class; 1;")) {
+ $meta = $meta_class->new(
+ log => $self->log,
+ corpus_sigle => $self->corpus_sigle,
+ doc_sigle => $self->doc_sigle,
+ text_sigle => $self->text_sigle
+ );
+
+ $self->{meta} = $meta;
+ };
+
+ return unless $meta;
+
my @type = qw/corpus doc text/;
foreach (@header) {
# Get corpus, doc and text meta data
@@ -161,19 +140,22 @@
next unless -e $_;
+ # Slurp data and probably decode
my $slurp = b($_)->slurp;
- $slurp =~ /^[^>]+encoding\s*=\s*(["'])([^\1]+?)\1/;
+ $slurp =~ /^[^>]+encoding\s*=\s*(["'])([^\1]+?)\1/o;
my $file = $slurp->decode($2 // 'UTF-8');
# Get DOM
my $dom = Mojo::DOM->new($file);
- if ($dom->at('idsHeader') || $dom->at('idsheader')) {
- $self->_parse_meta_i5($dom, $type);
- }
- else {
- $self->_parse_meta_tei($dom, $type);
- };
+ # Choose which metadata parser to use
+# if ($dom->at('idsHeader') || $dom->at('idsheader')) {
+# $self->_parse_meta_i5($dom, $type);
+# }
+# else {
+# $self->_parse_meta_tei($dom, $type);
+# };
+ $meta->parse($dom, $type);
};
return $self;
@@ -241,405 +223,66 @@
$_[0]->{pd};
};
-#sub author {
-# my $self = shift;
-#
-# # Set authors
-# if ($_[0]) {
-# return $self->{authors} = [
-# grep { $_ !~ m{^\s*u\.a\.\s*$} } split(/;\s+/, shift())
-# ];
-# }
-# return ($self->{authors} // []);
-#};
-
-sub text_class {
- my $self = shift;
- if ($_[0]) {
- return $self->{topics} = [ @_ ];
- };
- return ($self->{topics} //= []);
+sub meta {
+ return $_[0]->{meta};
};
-sub text_class_string {
- return join ' ', @{shift->text_class};
-}
-
-sub keywords {
+sub to_hash {
my $self = shift;
- if ($_[0]) {
- return $self->{keywords} = [ @_ ];
- };
- return ($self->{keywords} //= []);
-};
-sub keywords_string {
- return join ' ', @{shift->keywords};
-}
+ $self->parse unless $self->text_sigle;
-sub _remove_prefix {
-# return $_[0];
+ my %hash;
- # This may render some titles wrong, e.g. 'VDI nachrichten 2014' ...
- my $title = shift;
- my $prefix = shift;
- $prefix =~ tr!_!/!;
- if (index($title, $prefix) == 0) {
- $title = substr($title, length($prefix));
- $title =~ s/^\s+//;
- $title =~ s/\s+$//;
- };
- return $title;
-};
+ # Get meta object
+ my $meta = $self->meta;
+ foreach (keys %$meta) {
+ # Ignore private keys
+ next if index($_, '_') == 0;
-sub _parse_meta_tei {
- my $self = shift;
- my $dom = shift;
- my $type = shift;
-
- my $stmt;
- if ($type eq 'text') {
-
- # Publisher
- try {
- $self->publisher($dom->at('publisher')->all_text);
- };
-
- # Date of publication
- try {
- my $date = $dom->at('date')->all_text;
- $self->store(sgbrDate => $date);
- if ($date =~ s!^\s*(\d{4})-(\d{2})-(\d{2}).*$!$1$2$3!) {
- $self->pub_date($date);
- }
- else {
- $self->log->warn('"' . $date . '" is not a compatible pubDate');
- };
- };
-
- # Publication place
- try {
- my $pp = $dom->at('pubPlace');
- if ($pp) {
- $self->pub_place($pp->all_text) if $pp->all_text;
- };
- if ($pp->attr('ref')) {
- $self->reference($pp->attr('ref'));
- };
- };
-
- if ($stmt = $dom->at('titleStmt')) {
- # Title
- try {
- $stmt->find('title')->each(
- sub {
- my $type = $_->attr('type') || 'main';
- $self->title($_->all_text) if $type eq 'main';
-
- # Only support the first subtitle
- $self->sub_title($_->all_text) if $type eq 'sub' && !$self->sub_title;
- }
- );
- };
-
- # Author
- try {
- my $author = $stmt->at('author')->attr('ref');
-
- $author = $self->{ref_author}->{$author};
-
- if ($author) {
- my $array = $self->keywords;
- $self->author($author->{name} // $author->{id});
-
- if ($author->{age}) {
- $self->store('sgbrAuthorAgeClass' => $author->{age});
- push @$array, 'sgbrAuthorAgeClass:' . $author->{age};
- };
- if ($author->{sex}) {
- $self->store('sgbrAuthorSex' => $author->{sex});
- push @$array, 'sgbrAuthorSex:' . $author->{sex};
- };
- };
- };
- };
-
- try {
- my $kodex = $dom->at('item[rend]')->attr('rend');
- if ($kodex) {
- my $array = $self->keywords;
- $self->store('sgbrKodex' => $kodex);
- push @$array, 'sgbrKodex:' . $kodex;
- };
- };
- }
-
- elsif ($type eq 'doc') {
- try {
- $dom->find('particDesc person')->each(
- sub {
-
- my $hash = $self->{ref_author}->{'#' . $_->attr('xml:id')} = {
- age => $_->attr('age'),
- sex => $_->attr('sex'),
- id => $_->attr('xml:id')
- };
-
- # Get name
- if ($_->at('persName')) {
- $hash->{name} = $_->at('persName')->all_text;
- };
- });
- };
-
- try {
- my $lang = $dom->at('language[ident]')->attr('ident');
- $self->language($lang);
- };
-
- try {
- $self->store('funder', $dom->at('funder > orgName')->all_text);
- };
-
- try {
- $stmt = $dom->find('fileDesc > titleStmt > title')->each(
- sub {
- my $type = $_->attr('type') || 'main';
- $self->doc_title($_->all_text) if $type eq 'main';
- if ($type eq 'sub') {
- my $sub_title = $self->doc_sub_title;
- $self->doc_sub_title(
- ($sub_title ? $sub_title . ', ' : '') . $_->all_text
- );
- };
- }
- );
- };
- };
- return;
-};
-
-
-
-sub _parse_meta_i5 {
- my $self = shift;
- my $dom = shift;
- my $type = shift;
-
- my $analytic = $dom->at('analytic') || $dom->at('monogr');
-
- # There is an analytic element
- if ($analytic) {
-
- # Get title, subtitle, author, editor
- my $title = $analytic->at('h\.title[type=main]');
- my $sub_title = $analytic->at('h\.title[type=sub]');
- my $author = $analytic->at('h\.author');
- my $editor = $analytic->at('editor');
-
- $title = $title ? $title->all_text : undef;
- $sub_title = $sub_title ? $sub_title->all_text : undef;
- $author = $author ? $author->all_text : undef;
- $editor = $editor ? $editor->all_text : undef;
-
- if ($type eq 'text') {
- $self->title(_remove_prefix($title, $self->text_sigle)) if $title;
- $self->sub_title($sub_title) if $sub_title;
- $self->editor($editor) if $editor;
- $self->author($author) if $author;
+ my $v = $meta->{$_};
+ if (ref $v) {
+ $hash{_k($_)} = $meta->keywords($_);
}
- elsif ($type eq 'doc') {
- $self->doc_title(_remove_prefix($title, $self->doc_sigle)) if $title;
- $self->doc_sub_title($sub_title) if $sub_title;
- $self->doc_author($author) if $author;
- $self->doc_editor($editor) if $editor;
- }
- elsif ($type eq 'corpus') {
- $self->corpus_title(_remove_prefix($title, $self->corpus_sigle)) if $title;
- $self->corpus_sub_title($sub_title) if $sub_title;
- $self->corpus_author($author) if $author;
- $self->corpus_editor($editor) if $editor;
+ else {
+ $v =~ s/\n/ /g;
+ $v =~ s/\s\s+/ /g;
+ $hash{_k($_)} = $v;
};
};
- # Not in analytic
- if ($type eq 'corpus') {
- unless ($self->corpus_title) {
- if (my $title = $dom->at('fileDesc > titleStmt > c\.title')) {
- $self->corpus_title(_remove_prefix($title->all_text, $self->corpus_sigle))
- if $title->all_text;
- };
- };
- }
-
- # doc title
- elsif ($type eq 'doc') {
- unless ($self->doc_title) {
- if (my $title = $dom->at('fileDesc > titleStmt > d\.title')) {
- $self->doc_title(_remove_prefix($title->all_text, $self->doc_sigle))
- if $title->all_text;
- };
- };
- }
-
- # text title
- elsif ($type eq 'text') {
- unless ($self->title) {
- if (my $title = $dom->at('fileDesc > titleStmt > t\.title')) {
- $self->title(_remove_prefix($title->all_text, $self->text_sigle))
- if $title->all_text;
- }
- };
+ foreach (qw/corpus doc text/) {
+ $hash{$_ . 'Sigle'} = $self->{$_ . '_sigle'};
};
- # Get PubPlace
- if (my $place = $dom->at('pubPlace')) {
- $self->pub_place($place->all_text) if $place->all_text;
- $self->pub_place_key($place->attr('key')) if $place->attr('key');
- };
-
- # Get Publisher
- if (my $publisher = $dom->at('imprint publisher')) {
- $self->publisher($publisher->all_text) if $publisher->all_text;
- };
-
- # Get text type
- my $text_desc = $dom->at('textDesc');
-
- if ($text_desc) {
- if (my $text_type = $text_desc->at('textType')) {
- $self->text_type($text_type->all_text) if $text_type->all_text;
- };
-
- # Get text domain
- if (my $text_domain = $text_desc->at('textDomain')) {
- $self->text_domain($text_domain->all_text) if $text_domain->all_text;
- };
-
- # Get text type art
- if (my $text_type_art = $text_desc->at('textTypeArt')) {
- $self->text_type_art($text_type_art->all_text) if $text_type_art->all_text;
- };
-
- # Get text type art
- if (my $text_type_ref = $text_desc->at('textTypeRef')) {
- $self->text_type_ref($text_type_ref->all_text) if $text_type_ref->all_text;
- };
- };
-
- # Availability
- try {
- $self->availability(
- $dom->at('availability')->all_text
- );
- };
-
- # Get pubDate
- my $pub_date = $dom->find('pubDate[type=year]');
- $pub_date->each(
- sub {
- my $x = shift->parent;
- my $year = $x->at("pubDate[type=year]");
- return unless $year;
-
- $year = $year ? $year->text : 0;
- my $month = $x->at("pubDate[type=month]");
- $month = $month ? $month->text : 0;
- my $day = $x->at("pubDate[type=day]");
- $day = $day ? $day->text : 0;
-
- $year = 0 if $year !~ /^\d+$/;
- $month = 0 if $month !~ /^\d+$/;
- $day = 0 if $day !~ /^\d+$/;
-
- my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
- $date .= length($month) == 1 ? '0' . $month : $month;
- $date .= length($day) == 1 ? '0' . $day : $day;
- $self->pub_date($date);
- });
-
- # creatDate
- my $create_date = $dom->at('creatDate');
- if ($create_date && $create_date->text) {
- $create_date = $create_date->all_text;
- if (index($create_date, '-') > -1) {
- $self->log->warn("Creation date ranges are not supported");
- ($create_date) = split /\s*-\s*/, $create_date;
- }
-
- $create_date =~ s{^(\d{4})$}{$1\.00};
- $create_date =~ s{^(\d{4})\.(\d{2})$}{$1\.$2\.00};
- if ($create_date =~ /^\d{4}\.\d{2}\.\d{2}$/) {
- $create_date =~ tr/\.//d;
- $self->creation_date($create_date);
- };
- };
-
- my $text_class = $dom->at('textClass');
- if ($text_class) {
- # Get textClasses
- my @topic;
-
- $text_class->find("catRef")->each(
- sub {
- my ($ign, @ttopic) = split('\.', $_->attr('target'));
- push(@topic, @ttopic);
- }
- );
- $self->text_class(@topic) if @topic > 0;
-
- my $kws = $self->keywords;
- my @keywords = $text_class->find("h\.keywords > keyTerm")->each;
- push(@$kws, @keywords) if @keywords > 0;
- };
-
- if (my $edition_statement = $dom->at('biblFull editionStmt')) {
- $self->bibl_edition_statement($edition_statement->all_text)
- if $edition_statement->text;
- };
-
- if (my $edition_statement = $dom->at('fileDescl editionStmt')) {
- $self->file_edition_statement($edition_statement->all_text)
- if $edition_statement->text;
- };
-
- if (my $file_desc = $dom->at('fileDesc')) {
- if (my $availability = $file_desc->at('publicationStmt > availability')) {
- $self->license($availability->all_text);
- };
- };
-
- # Some meta data only available in the corpus
- if ($type eq 'corpus') {
- if (my $language = $dom->at('profileDesc > langUsage > language[id]')) {
- $self->language($language->attr('id'));
- };
- }
-
- # Some meta data only reevant from the text
- elsif ($type eq 'text') {
-
- if (my $reference = $dom->at('sourceDesc reference[type=complete]')) {
- if (my $ref_text = $reference->all_text) {
- $ref_text =~ s!^[a-zA-Z0-9]+\/[a-zA-Z0-9]+\.\d+[\s:]\s*!!;
- $self->reference($ref_text);
- };
- };
-
- my $column = $dom->at('textDesc > column');
- $self->text_column($column->all_text) if $column;
-
- if (my $pages = $dom->at('biblStruct biblScope[type="pp"]')) {
- $pages = $pages->all_text;
- if ($pages && $pages =~ m/(\d+)\s*-\s*(\d+)/) {
- $self->pages($1 . '-' . $2);
- };
- };
- };
+ return \%hash;
};
+sub _k {
+ my $x = $_[0];
+ $x =~ s/_(\w)/\U$1\E/g;
+ $x =~ s/id$/ID/gi;
+ return $x;
+};
+
+
+sub to_json {
+ my $self = shift;
+ unless ($self->{tokenizer}) {
+ $self->log->warn('No tokenizer defined');
+ return;
+ };
+
+ return $self->{tokenizer}->to_json;
+};
+
+
+1;
+
+
+__END__
sub to_string {
my $self = shift;
@@ -660,38 +303,6 @@
return $string;
};
-sub _k {
- my $x = $_[0];
- $x =~ s/_(\w)/\U$1\E/g;
- $x =~ s/id$/ID/gi;
- return $x;
-};
-
-
-sub to_hash {
- my $self = shift;
-
- $self->parse unless $self->text_sigle;
-
- my %hash;
-
- foreach (@ATTR, @ADVANCED_ATTR, 'store') {
- if (my $att = $self->$_) {
- $att =~ s/\n/ /g;
- $att =~ s/\s\s+/ /g;
- $hash{_k($_)} = $att;
- };
- };
-
- for (qw/text_class keywords/) {
- my @array = @{ $self->$_ };
- next unless @array;
- $hash{_k($_)} = join(' ', @array);
- };
-
- return \%hash;
-};
-
# Todo: Make this a KoralQuery serializer
sub to_koral_query {
my $self = shift;
@@ -703,17 +314,6 @@
};
-sub to_json {
- my $self = shift;
- unless ($self->{tokenizer}) {
- $self->log->warn('No tokenizer defined');
- return;
- };
-
- return $self->{tokenizer}->to_json;
-};
-
-
1;
diff --git a/lib/KorAP/XML/Meta/Base.pm b/lib/KorAP/XML/Meta/Base.pm
new file mode 100644
index 0000000..a92ebd5
--- /dev/null
+++ b/lib/KorAP/XML/Meta/Base.pm
@@ -0,0 +1,53 @@
+package KorAP::XML::Meta::Base;
+use strict;
+use warnings;
+
+# Importing method
+sub import {
+ my $class = shift;
+ my $caller = caller;
+
+ no strict 'refs';
+
+ push @{"${caller}::ISA"}, $class;
+
+ strict->import;
+ warnings->import;
+ utf8->import;
+ feature->import(':5.10');
+};
+
+sub log {
+ return $_[0]->{_log};
+};
+
+sub corpus_sigle {
+ $_[0]->{_corpus_sigle};
+};
+
+sub doc_sigle {
+ $_[0]->{_doc_sigle};
+};
+
+sub text_sigle {
+ $_[0]->{_text_sigle};
+};
+
+sub new {
+ my $class = shift;
+ my %hash = @_;
+ my $copy = {};
+ foreach (qw/log corpus_sigle doc_sigle text_sigle/) {
+ $copy->{'_' . $_} = $hash{$_};
+ };
+
+ bless $copy, $class;
+};
+
+sub keywords {
+ my $self = shift;
+ return join(' ', @{$self->{$_[0]} // []});
+};
+
+
+1;
diff --git a/lib/KorAP/XML/Meta/I5.pm b/lib/KorAP/XML/Meta/I5.pm
new file mode 100644
index 0000000..e03640c
--- /dev/null
+++ b/lib/KorAP/XML/Meta/I5.pm
@@ -0,0 +1,274 @@
+package KorAP::XML::Meta::I5;
+use KorAP::XML::Meta::Base;
+use Try::Tiny;
+
+# Parse meta data
+sub parse {
+ my $self = shift;
+ my $dom = shift;
+ my $type = shift;
+
+ my $analytic = $dom->at('analytic') || $dom->at('monogr');
+
+ # There is an analytic element
+ if ($analytic) {
+
+ # Get title, subtitle, author, editor
+ my $title = $analytic->at('h\.title[type=main]');
+ my $sub_title = $analytic->at('h\.title[type=sub]');
+ my $author = $analytic->at('h\.author');
+ my $editor = $analytic->at('editor');
+
+ $title = $title ? $title->all_text : undef;
+ $sub_title = $sub_title ? $sub_title->all_text : undef;
+ $author = $author ? $author->all_text : undef;
+ $editor = $editor ? $editor->all_text : undef;
+
+ if ($type eq 'text') {
+ $self->{title} =_remove_prefix($title, $self->text_sigle) if $title;
+ $self->{sub_title} = $sub_title if $sub_title;
+ $self->{editor} = $editor if $editor;
+ $self->{author} = $author if $author;
+ }
+ elsif ($type eq 'doc') {
+ $self->{doc_title} = _remove_prefix($title, $self->doc_sigle) if $title;
+ $self->{doc_sub_title} = $sub_title if $sub_title;
+ $self->{doc_author} = $author if $author;
+ $self->{doc_editor} = $editor if $editor;
+ }
+ elsif ($type eq 'corpus') {
+ $self->{corpus_title} = _remove_prefix($title, $self->corpus_sigle) if $title;
+ $self->{corpus_sub_title} = $sub_title if $sub_title;
+ $self->{corpus_author} = $author if $author;
+ $self->{corpus_editor} = $editor if $editor;
+ };
+ };
+
+ # Not in analytic
+ if ($type eq 'corpus') {
+ unless ($self->{corpus_title}) {
+ if (my $title = $dom->at('fileDesc > titleStmt > c\.title')) {
+ $self->{corpus_title} = _remove_prefix($title->all_text, $self->corpus_sigle)
+ if $title->all_text;
+ };
+ };
+ }
+
+ # doc title
+ elsif ($type eq 'doc') {
+ unless ($self->{doc_title}) {
+ if (my $title = $dom->at('fileDesc > titleStmt > d\.title')) {
+ $self->{doc_title} = _remove_prefix($title->all_text, $self->doc_sigle)
+ if $title->all_text;
+ };
+ };
+ }
+
+ # text title
+ elsif ($type eq 'text') {
+ unless ($self->{title}) {
+ if (my $title = $dom->at('fileDesc > titleStmt > t\.title')) {
+ $self->{title} = _remove_prefix($title->all_text, $self->text_sigle)
+ if $title->all_text;
+ }
+ };
+ };
+
+ # Get PubPlace
+ if (my $place = $dom->at('pubPlace')) {
+ $self->{pub_place} = $place->all_text if $place->all_text;
+ $self->{pub_place_key} = $place->attr('key') if $place->attr('key');
+ };
+
+ # Get Publisher
+ if (my $publisher = $dom->at('imprint publisher')) {
+ $self->{publisher} = $publisher->all_text if $publisher->all_text;
+ };
+
+ # Get text type
+ my $text_desc = $dom->at('textDesc');
+
+ if ($text_desc) {
+ if (my $text_type = $text_desc->at('textType')) {
+ $self->{text_type} = $text_type->all_text if $text_type->all_text;
+ };
+
+ # Get text domain
+ if (my $text_domain = $text_desc->at('textDomain')) {
+ $self->{text_domain} = $text_domain->all_text if $text_domain->all_text;
+ };
+
+ # Get text type art
+ if (my $text_type_art = $text_desc->at('textTypeArt')) {
+ $self->{text_type_art} = $text_type_art->all_text if $text_type_art->all_text;
+ };
+
+ # Get text type art
+ if (my $text_type_ref = $text_desc->at('textTypeRef')) {
+ $self->{text_type_ref} = $text_type_ref->all_text if $text_type_ref->all_text;
+ };
+ };
+
+ # Availability
+ try {
+ $self->{availability} = $dom->at('availability')->all_text;
+ };
+
+ # Get pubDate
+ my $pub_date = $dom->find('pubDate[type=year]');
+ $pub_date->each(
+ sub {
+ my $x = shift->parent;
+ my $year = $x->at("pubDate[type=year]");
+ return unless $year;
+
+ $year = $year ? $year->text : 0;
+ my $month = $x->at("pubDate[type=month]");
+ $month = $month ? $month->text : 0;
+ my $day = $x->at("pubDate[type=day]");
+ $day = $day ? $day->text : 0;
+
+ $year = 0 if $year !~ /^\d+$/;
+ $month = 0 if $month !~ /^\d+$/;
+ $day = 0 if $day !~ /^\d+$/;
+
+ my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
+ $date .= length($month) == 1 ? '0' . $month : $month;
+ $date .= length($day) == 1 ? '0' . $day : $day;
+ $self->{pub_date} = $date;
+ });
+
+ # creatDate
+ my $create_date = $dom->at('creatDate');
+ if ($create_date && $create_date->text) {
+ $create_date = $create_date->all_text;
+ if (index($create_date, '-') > -1) {
+ $self->log->warn("Creation date ranges are not supported");
+ ($create_date) = split /\s*-\s*/, $create_date;
+ }
+
+ $create_date =~ s{^(\d{4})$}{$1\.00};
+ $create_date =~ s{^(\d{4})\.(\d{2})$}{$1\.$2\.00};
+ if ($create_date =~ /^\d{4}\.\d{2}\.\d{2}$/) {
+ $create_date =~ tr/\.//d;
+ $self->{creation_date} = $create_date;
+ };
+ };
+
+ my $text_class = $dom->at('textClass');
+ if ($text_class) {
+ # Get textClasses
+ my @topic;
+
+ $text_class->find("catRef")->each(
+ sub {
+ my ($ign, @ttopic) = split('\.', $_->attr('target'));
+ push(@topic, @ttopic);
+ }
+ );
+ $self->{text_class} = [@topic] if @topic > 0;
+
+ my $kws = $self->{keywords};
+ my @keywords = $text_class->find("h\.keywords > keyTerm")->each;
+ push(@$kws, @keywords) if @keywords > 0;
+ };
+
+ if (my $edition_statement = $dom->at('biblFull editionStmt')) {
+ $self->{bibl_edition_statement} = $edition_statement->all_text
+ if $edition_statement->text;
+ };
+
+ if (my $edition_statement = $dom->at('fileDescl editionStmt')) {
+ $self->{file_edition_statement} = $edition_statement->all_text
+ if $edition_statement->text;
+ };
+
+ if (my $file_desc = $dom->at('fileDesc')) {
+ if (my $availability = $file_desc->at('publicationStmt > availability')) {
+ $self->{license} = $availability->all_text;
+ };
+ };
+
+ # Some meta data only available in the corpus
+ if ($type eq 'corpus') {
+ if (my $language = $dom->at('profileDesc > langUsage > language[id]')) {
+ $self->{language} = $language->attr('id');
+ };
+ }
+
+ # Some meta data only reevant from the text
+ elsif ($type eq 'text') {
+
+ if (my $reference = $dom->at('sourceDesc reference[type=complete]')) {
+ if (my $ref_text = $reference->all_text) {
+ $ref_text =~ s!^[a-zA-Z0-9]+\/[a-zA-Z0-9]+\.\d+[\s:]\s*!!;
+ $self->{reference} = $ref_text;
+ };
+ };
+
+ my $column = $dom->at('textDesc > column');
+ $self->{text_column} = $column->all_text if $column;
+
+ if (my $pages = $dom->at('biblStruct biblScope[type="pp"]')) {
+ $pages = $pages->all_text;
+ if ($pages && $pages =~ m/(\d+)\s*-\s*(\d+)/) {
+ $self->{pages} = $1 . '-' . $2;
+ };
+ };
+ };
+};
+
+
+sub _remove_prefix {
+# return $_[0];
+
+ # This may render some titles wrong, e.g. 'VDI nachrichten 2014' ...
+ my $title = shift;
+ my $prefix = shift or return $title;
+ $prefix =~ tr!_!/!;
+ if (index($title, $prefix) == 0) {
+ $title = substr($title, length($prefix));
+ $title =~ s/^\s+//;
+ $title =~ s/\s+$//;
+ };
+ return $title;
+};
+
+
+#sub author {
+# my $self = shift;
+#
+# # Set authors
+# if ($_[0]) {
+# return $self->{authors} = [
+# grep { $_ !~ m{^\s*u\.a\.\s*$} } split(/;\s+/, shift())
+# ];
+# }
+# return ($self->{authors} // []);
+#};
+#sub text_class {
+# my $self = shift;
+# if ($_[0]) {
+# return $self->{topics} = [ @_ ];
+# };
+# return ($self->{topics} //= []);
+#};
+
+#sub text_class_string {
+# return join ' ', @{shift->text_class};
+#}
+
+#sub keywords {
+# my $self = shift;
+# if ($_[0]) {
+# return $self->{keywords} = [ @_ ];
+# };
+# return ($self->{keywords} //= []);
+#};
+
+#sub keywords_string {
+# return join ' ', @{shift->keywords};
+#}
+
+
+1;
diff --git a/lib/KorAP/XML/Meta/Sgbr.pm b/lib/KorAP/XML/Meta/Sgbr.pm
new file mode 100644
index 0000000..2d33975
--- /dev/null
+++ b/lib/KorAP/XML/Meta/Sgbr.pm
@@ -0,0 +1,133 @@
+package KorAP::XML::Meta::Sgbr;
+use KorAP::XML::Meta::Base;
+use Try::Tiny;
+
+# Parse meta data
+sub parse {
+ my $self = shift;
+ my $dom = shift;
+ my $type = shift;
+
+ my $stmt;
+ if ($type eq 'text') {
+
+ # Publisher
+ try {
+ $self->{publisher} = $dom->at('publisher')->all_text;
+ };
+
+ # Date of publication
+ try {
+ my $date = $dom->at('date')->all_text;
+ $self->{sgbr_date} = $date;
+ if ($date =~ s!^\s*(\d{4})-(\d{2})-(\d{2}).*$!$1$2$3!) {
+ $self->{pub_date} = $date;
+ }
+ else {
+ $self->log->warn('"' . $date . '" is not a compatible pubDate');
+ };
+ };
+
+ # Publication place
+ try {
+ my $pp = $dom->at('pubPlace');
+ if ($pp) {
+ $self->{pub_place} = $pp->all_text if $pp->all_text;
+ };
+ if ($pp->attr('ref')) {
+ $self->{reference} = $pp->attr('ref');
+ };
+ };
+
+ if ($stmt = $dom->at('titleStmt')) {
+ # Title
+ try {
+ $stmt->find('title')->each(
+ sub {
+ my $type = $_->attr('type') || 'main';
+ $self->{title} = $_->all_text if $type eq 'main';
+
+ # Only support the first subtitle
+ $self->{sub_title} = $_->all_text
+ if $type eq 'sub' && !$self->sub_title;
+ }
+ );
+ };
+
+ # Author
+ try {
+ my $author = $stmt->at('author')->attr('ref');
+
+ $author = $self->{_ref_author}->{$author};
+
+ if ($author) {
+ my $array = ($self->{keywords} //= []);
+ $self->{author} = $author->{name} // $author->{id};
+
+ if ($author->{age}) {
+ $self->{'sgbr_author_age_class'} = $author->{age};
+ push @$array, 'sgbrAuthorAgeClass:' . $author->{age};
+ };
+ if ($author->{sex}) {
+ $self->{'sgbr_author_sex'} = $author->{sex};
+ push @$array, 'sgbrAuthorSex:' . $author->{sex};
+ };
+ };
+ };
+ };
+
+ try {
+ my $kodex = $dom->at('item[rend]')->attr('rend');
+ if ($kodex) {
+ my $array = ($self->{keywords} //= []);
+ $self->{'sgbr_kodex'} = $kodex;
+ push @$array, 'sgbrKodex:' . $kodex;
+ };
+ };
+ }
+
+ elsif ($type eq 'doc') {
+ try {
+ $dom->find('particDesc person')->each(
+ sub {
+
+ my $hash = $self->{_ref_author}->{'#' . $_->attr('xml:id')} = {
+ age => $_->attr('age'),
+ sex => $_->attr('sex'),
+ id => $_->attr('xml:id')
+ };
+
+ # Get name
+ if ($_->at('persName')) {
+ $hash->{name} = $_->at('persName')->all_text;
+ };
+ });
+ };
+
+ try {
+ my $lang = $dom->at('language[ident]')->attr('ident');
+ $self->{language} = $lang;
+ };
+
+ try {
+ $self->{'funder'} = $dom->at('funder > orgName')->all_text;
+ };
+
+ try {
+ $stmt = $dom->find('fileDesc > titleStmt > title')->each(
+ sub {
+ my $type = $_->attr('type') || 'main';
+ $self->{doc_title} = $_->all_text if $type eq 'main';
+ if ($type eq 'sub') {
+ my $sub_title = $self->{doc_sub_title};
+ $self->{doc_sub_title} =
+ ($sub_title ? $sub_title . ', ' : '') . $_->all_text;
+ };
+ }
+ );
+ };
+ };
+ return;
+};
+
+1;