Cleanup metadata files
Change-Id: Ibf3d07f300141c4be65b2bff2f5529ededa87b80
diff --git a/lib/KorAP/XML/Document/Primary.pm b/lib/KorAP/XML/Document/Primary.pm
index 54ed099..b2fab55 100644
--- a/lib/KorAP/XML/Document/Primary.pm
+++ b/lib/KorAP/XML/Document/Primary.pm
@@ -13,7 +13,7 @@
# Constructor
sub new {
my $class = shift;
- bless [shift()], $class;
+ bless [$_[0]], $class;
};
@@ -21,15 +21,14 @@
sub data {
my ($self, $from, $to) = @_;
+ # Get range data from primary
return substr($self->[0], $from) if $from && !$to;
+ # Get full data
return $self->[0] unless $to;
- my $substr = substr($self->[0], $from, $to - $from);
-
- return $substr if defined $substr;
-
- return;
+ # Return substring
+ return (substr($self->[0], $from, $to - $from) // undef);
};
@@ -95,8 +94,7 @@
# Calculate character offsets
sub _calc_chars {
use bytes;
- my $self = shift;
- my $text = shift;
+ my ($self, $text) = @_;
tie my @array, 'Packed::Array';
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 087b2ad..590e15d 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -32,9 +32,12 @@
return $log;
};
+# Constructor
sub new {
my $class = shift;
my $self = bless { @_ }, $class;
+
+ # Path is defined
if (exists $self->{path}) {
$self->{path} = rel2abs($self->{path});
if ($self->{path} !~ m!\/$!) {
@@ -50,8 +53,10 @@
my $self = shift;
my $meta_data_type = $self->meta_type;
- my $data_xml = $self->path . 'data.xml';
+ state $ENC_RE = qr/^[^>]+encoding\s*=\s*(["'])([^\1]+?)\1/o;
+ # Path to primary
+ my $data_xml = $self->path . 'data.xml';
my ($rt, $error, $file);
my $unable = 'Unable to parse document ' . $self->path;
@@ -99,12 +104,11 @@
# Get primary data
my $pd = $rt->{text};
- if ($pd) {
- $self->{pd} = KorAP::XML::Document::Primary->new($pd);
- }
- else {
- croak $unable;
- };
+
+ croak $unable unless $pd;
+
+ # Associate primary data
+ $self->{pd} = KorAP::XML::Document::Primary->new($pd);
my @path = grep { $_ } splitdir($self->path);
my @header;
@@ -116,22 +120,27 @@
pop @path;
};
-
+ # Get metadata class and create an object
my $meta_class = 'KorAP::XML::Meta::' . $meta_data_type;
my $meta;
if ($meta_class->can('new') || eval("require $meta_class; 1;")) {
$meta = $meta_class->new(
- log => $self->log,
+ log => $self->log,
corpus_sigle => $self->corpus_sigle,
doc_sigle => $self->doc_sigle,
text_sigle => $self->text_sigle
);
+ # Associate meta object
$self->{meta} = $meta;
};
- return unless $meta;
+ unless ($meta) {
+ $self->log->warn(
+ "Metadata object for $meta_data_type not initializable"
+ );
+ };
my @type = qw/corpus doc text/;
foreach (@header) {
@@ -142,19 +151,13 @@
# Slurp data and probably decode
my $slurp = b($_)->slurp;
- $slurp =~ /^[^>]+encoding\s*=\s*(["'])([^\1]+?)\1/o;
+ $slurp =~ $ENC_RE;
my $file = $slurp->decode($2 // 'UTF-8');
# Get DOM
my $dom = Mojo::DOM->new($file);
- # Choose which metadata parser to use
-# if ($dom->at('idsHeader') || $dom->at('idsheader')) {
-# $self->_parse_meta_i5($dom, $type);
-# }
-# else {
-# $self->_parse_meta_tei($dom, $type);
-# };
+ # Parse object based on DOM
$meta->parse($dom, $type);
};
diff --git a/lib/KorAP/XML/Meta/I5.pm b/lib/KorAP/XML/Meta/I5.pm
index e03640c..42b3496 100644
--- a/lib/KorAP/XML/Meta/I5.pm
+++ b/lib/KorAP/XML/Meta/I5.pm
@@ -4,9 +4,7 @@
# Parse meta data
sub parse {
- my $self = shift;
- my $dom = shift;
- my $type = shift;
+ my ($self, $dom, $type) = @_;
my $analytic = $dom->at('analytic') || $dom->at('monogr');
@@ -24,18 +22,23 @@
$author = $author ? $author->all_text : undef;
$editor = $editor ? $editor->all_text : undef;
+ # Text meta data
if ($type eq 'text') {
$self->{title} =_remove_prefix($title, $self->text_sigle) if $title;
$self->{sub_title} = $sub_title if $sub_title;
$self->{editor} = $editor if $editor;
$self->{author} = $author if $author;
}
+
+ # Doc meta data
elsif ($type eq 'doc') {
$self->{doc_title} = _remove_prefix($title, $self->doc_sigle) if $title;
$self->{doc_sub_title} = $sub_title if $sub_title;
$self->{doc_author} = $author if $author;
$self->{doc_editor} = $editor if $editor;
}
+
+ # Corpus meta data
elsif ($type eq 'corpus') {
$self->{corpus_title} = _remove_prefix($title, $self->corpus_sigle) if $title;
$self->{corpus_sub_title} = $sub_title if $sub_title;
@@ -45,11 +48,17 @@
};
# Not in analytic
+ my $title;
if ($type eq 'corpus') {
+
+ # Corpus title not yet given
unless ($self->{corpus_title}) {
- if (my $title = $dom->at('fileDesc > titleStmt > c\.title')) {
- $self->{corpus_title} = _remove_prefix($title->all_text, $self->corpus_sigle)
- if $title->all_text;
+ if ($title = $dom->at('fileDesc > titleStmt > c\.title')) {
+ $title = $title->all_text;
+
+ if ($title) {
+ $self->{corpus_title} = _remove_prefix($title, $self->corpus_sigle);
+ };
};
};
}
@@ -57,9 +66,12 @@
# doc title
elsif ($type eq 'doc') {
unless ($self->{doc_title}) {
- if (my $title = $dom->at('fileDesc > titleStmt > d\.title')) {
- $self->{doc_title} = _remove_prefix($title->all_text, $self->doc_sigle)
- if $title->all_text;
+ if ($title = $dom->at('fileDesc > titleStmt > d\.title')) {
+ $title = $title->all_text;
+
+ if ($title) {
+ $self->{doc_title} = _remove_prefix($title, $self->doc_sigle);
+ };
};
};
}
@@ -67,70 +79,78 @@
# text title
elsif ($type eq 'text') {
unless ($self->{title}) {
- if (my $title = $dom->at('fileDesc > titleStmt > t\.title')) {
- $self->{title} = _remove_prefix($title->all_text, $self->text_sigle)
- if $title->all_text;
+ if ($title = $dom->at('fileDesc > titleStmt > t\.title')) {
+ $title = $title->all_text;
+ if ($title) {
+ $self->{title} = _remove_prefix($title, $self->text_sigle);
+ };
}
};
};
+ my $temp;
+
# Get PubPlace
- if (my $place = $dom->at('pubPlace')) {
- $self->{pub_place} = $place->all_text if $place->all_text;
- $self->{pub_place_key} = $place->attr('key') if $place->attr('key');
+ if ($temp = $dom->at('pubPlace')) {
+ my $place_attr = $temp->attr('key');
+ $self->{pub_place_key} = $place_attr if $place_attr;
+ $temp = $temp->all_text;
+ $self->{pub_place} = $temp if $temp;
};
# Get Publisher
- if (my $publisher = $dom->at('imprint publisher')) {
- $self->{publisher} = $publisher->all_text if $publisher->all_text;
+ if ($temp = $dom->at('imprint publisher')) {
+ $temp = $temp->all_text;
+ $self->{publisher} = $temp if $temp;
};
# Get text type
- my $text_desc = $dom->at('textDesc');
+ $temp = $dom->at('textDesc');
+ my $temp_2;
- if ($text_desc) {
- if (my $text_type = $text_desc->at('textType')) {
- $self->{text_type} = $text_type->all_text if $text_type->all_text;
+ if ($temp) {
+ if ($temp_2 = $temp->at('textType')) {
+ $temp_2 = $temp_2->all_text;
+ $self->{text_type} = $temp_2 if $temp_2;
};
# Get text domain
- if (my $text_domain = $text_desc->at('textDomain')) {
- $self->{text_domain} = $text_domain->all_text if $text_domain->all_text;
+ if ($temp_2 = $temp->at('textDomain')) {
+ $temp_2 = $temp_2->all_text;
+ $self->{text_domain} = $temp_2 if $temp_2;
};
# Get text type art
- if (my $text_type_art = $text_desc->at('textTypeArt')) {
- $self->{text_type_art} = $text_type_art->all_text if $text_type_art->all_text;
+ if ($temp_2 = $temp->at('textTypeArt')) {
+ $temp_2 = $temp_2->all_text;
+ $self->{text_type_art} = $temp_2 if $temp_2;
};
- # Get text type art
- if (my $text_type_ref = $text_desc->at('textTypeRef')) {
- $self->{text_type_ref} = $text_type_ref->all_text if $text_type_ref->all_text;
+ # Get text type ref
+ if ($temp_2 = $temp->at('textTypeRef')) {
+ $temp_2 = $temp_2->all_text;
+ $self->{text_type_ref} = $temp_2 if $temp_2;
};
};
- # Availability
- try {
- $self->{availability} = $dom->at('availability')->all_text;
- };
+ state $NR_RE = qr/^\d+$/;
+ state $REF_RE = qr!^[a-zA-Z0-9]+\/[a-zA-Z0-9]+\.\d+[\s:]\s*!;
# Get pubDate
my $pub_date = $dom->find('pubDate[type=year]');
$pub_date->each(
sub {
my $x = shift->parent;
- my $year = $x->at("pubDate[type=year]");
- return unless $year;
-
+ my $year = $x->at('pubDate[type=year]') or return;
$year = $year ? $year->text : 0;
- my $month = $x->at("pubDate[type=month]");
+ my $month = $x->at('pubDate[type=month]');
$month = $month ? $month->text : 0;
- my $day = $x->at("pubDate[type=day]");
+ my $day = $x->at('pubDate[type=day]');
$day = $day ? $day->text : 0;
- $year = 0 if $year !~ /^\d+$/;
- $month = 0 if $month !~ /^\d+$/;
- $day = 0 if $day !~ /^\d+$/;
+ $year = 0 if $year !~ $NR_RE;
+ $month = 0 if $month !~ $NR_RE;
+ $day = 0 if $day !~ $NR_RE;
my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
$date .= length($month) == 1 ? '0' . $month : $month;
@@ -145,22 +165,24 @@
if (index($create_date, '-') > -1) {
$self->log->warn("Creation date ranges are not supported");
($create_date) = split /\s*-\s*/, $create_date;
- }
-
- $create_date =~ s{^(\d{4})$}{$1\.00};
- $create_date =~ s{^(\d{4})\.(\d{2})$}{$1\.$2\.00};
- if ($create_date =~ /^\d{4}\.\d{2}\.\d{2}$/) {
+ };
+ unless ($create_date =~ s{^(\d{4})$}{$1\.00\.00}) {
+ unless ($create_date =~ s{^(\d{4})\.(\d{2})$}{$1\.$2\.00}) {
+ $create_date =~ /^\d{4}\.\d{2}\.\d{2}$/;
+ };
+ };
+ if ($create_date =~ /^\d{4}(?:\.\d{2}(?:\.\d{2})?)?$/) {
$create_date =~ tr/\.//d;
$self->{creation_date} = $create_date;
};
};
- my $text_class = $dom->at('textClass');
- if ($text_class) {
+ $temp = $dom->at('textClass');
+ if ($temp) {
# Get textClasses
my @topic;
- $text_class->find("catRef")->each(
+ $temp->find("catRef")->each(
sub {
my ($ign, @ttopic) = split('\.', $_->attr('target'));
push(@topic, @ttopic);
@@ -169,49 +191,52 @@
$self->{text_class} = [@topic] if @topic > 0;
my $kws = $self->{keywords};
- my @keywords = $text_class->find("h\.keywords > keyTerm")->each;
+ my @keywords = $temp->find("h\.keywords > keyTerm")->each;
push(@$kws, @keywords) if @keywords > 0;
};
- if (my $edition_statement = $dom->at('biblFull editionStmt')) {
- $self->{bibl_edition_statement} = $edition_statement->all_text
- if $edition_statement->text;
+ if ($temp = $dom->at('biblFull editionStmt')) {
+ $temp = $temp->all_text;
+ $self->{bibl_edition_statement} = $temp if $temp;
};
- if (my $edition_statement = $dom->at('fileDescl editionStmt')) {
- $self->{file_edition_statement} = $edition_statement->all_text
- if $edition_statement->text;
+ if ($temp = $dom->at('fileDescl editionStmt')) {
+ $temp = $temp->all_text;
+ $self->{file_edition_statement} = $temp if $temp;
};
- if (my $file_desc = $dom->at('fileDesc')) {
- if (my $availability = $file_desc->at('publicationStmt > availability')) {
- $self->{license} = $availability->all_text;
+ if ($temp = $dom->at('fileDesc')) {
+ if (my $availability = $temp->at('publicationStmt > availability')) {
+ $temp = $availability->all_text;
+ $self->{availability} = $temp if $temp;
};
};
# Some meta data only available in the corpus
if ($type eq 'corpus') {
- if (my $language = $dom->at('profileDesc > langUsage > language[id]')) {
- $self->{language} = $language->attr('id');
+ if ($temp = $dom->at('profileDesc > langUsage > language[id]')) {
+ $self->{language} = $temp->attr('id') if $temp->attr('id');
};
}
# Some meta data only reevant from the text
elsif ($type eq 'text') {
- if (my $reference = $dom->at('sourceDesc reference[type=complete]')) {
- if (my $ref_text = $reference->all_text) {
- $ref_text =~ s!^[a-zA-Z0-9]+\/[a-zA-Z0-9]+\.\d+[\s:]\s*!!;
+ if ($temp = $dom->at('sourceDesc reference[type=complete]')) {
+ if (my $ref_text = $temp->all_text) {
+ $ref_text =~ s!$REF_RE!!;
$self->{reference} = $ref_text;
};
};
- my $column = $dom->at('textDesc > column');
- $self->{text_column} = $column->all_text if $column;
+ $temp = $dom->at('textDesc > column');
+ if ($temp && ($temp = $temp->all_text)) {
+ $self->{text_column} = $temp;
+ };
- if (my $pages = $dom->at('biblStruct biblScope[type="pp"]')) {
- $pages = $pages->all_text;
- if ($pages && $pages =~ m/(\d+)\s*-\s*(\d+)/) {
+ if ($temp = $dom->at('biblStruct biblScope[type=pp]')) {
+ $temp = $temp->all_text;
+ if ($temp && $temp =~ m/(\d+)\s*-\s*(\d+)/) {
$self->{pages} = $1 . '-' . $2;
};
};
@@ -220,55 +245,19 @@
sub _remove_prefix {
-# return $_[0];
-
# This may render some titles wrong, e.g. 'VDI nachrichten 2014' ...
- my $title = shift;
- my $prefix = shift or return $title;
+ return $_[0] unless $_[1];
+
+ my ($title, $prefix) = @_;
$prefix =~ tr!_!/!;
if (index($title, $prefix) == 0) {
$title = substr($title, length($prefix));
$title =~ s/^\s+//;
$title =~ s/\s+$//;
};
+
return $title;
};
-#sub author {
-# my $self = shift;
-#
-# # Set authors
-# if ($_[0]) {
-# return $self->{authors} = [
-# grep { $_ !~ m{^\s*u\.a\.\s*$} } split(/;\s+/, shift())
-# ];
-# }
-# return ($self->{authors} // []);
-#};
-#sub text_class {
-# my $self = shift;
-# if ($_[0]) {
-# return $self->{topics} = [ @_ ];
-# };
-# return ($self->{topics} //= []);
-#};
-
-#sub text_class_string {
-# return join ' ', @{shift->text_class};
-#}
-
-#sub keywords {
-# my $self = shift;
-# if ($_[0]) {
-# return $self->{keywords} = [ @_ ];
-# };
-# return ($self->{keywords} //= []);
-#};
-
-#sub keywords_string {
-# return join ' ', @{shift->keywords};
-#}
-
-
1;
diff --git a/t/meta.t b/t/meta.t
index e7d9f0e..a2ef1f9 100644
--- a/t/meta.t
+++ b/t/meta.t
@@ -310,7 +310,7 @@
ok(!$meta->{text_column}, 'text column');
ok(!$meta->{text_domain}, 'text domain');
ok(!$meta->{creation_date}, 'creation date');
-ok(!$meta->{license}, 'License');
+ok(!$meta->{availability}, 'License');
ok(!$meta->{pages}, 'Pages');
ok(!$meta->{file_edition_statement}, 'file edition statement');
ok(!$meta->{bibl_edition_statement}, 'bibl edition statement');
@@ -360,7 +360,7 @@
ok(!$meta->{text_domain}, 'text domain');
is($meta->{creation_date}, '20070707', 'creation date');
-is($meta->{license}, 'CC-BY-SA', 'License');
+is($meta->{availability}, 'CC-BY-SA', 'License');
ok(!$meta->{pages}, 'Pages');
ok(!$meta->{file_edition_statement}, 'file edition statement');
ok(!$meta->{bibl_edition_statement}, 'bibl edition statement');
diff --git a/t/real/bzk.t b/t/real/bzk.t
index e6ef8d5..8999cc5 100644
--- a/t/real/bzk.t
+++ b/t/real/bzk.t
@@ -45,7 +45,7 @@
is($meta->{pub_date}, '19590101', 'Creation date');
is($meta->{creation_date}, '19590101', 'Creation date');
-is($meta->{license}, 'ACA-NC-LC', 'License');
+is($meta->{availability}, 'ACA-NC-LC', 'License');
ok(!$meta->{pages}, 'Pages');
ok(!$meta->{file_edition_statement}, 'File Statement');
@@ -110,7 +110,7 @@
is($output->{textDomain}, 'Politik', 'Correct Text Domain');
is($output->{creationDate}, '19590101', 'Creation date');
-is($output->{license}, 'ACA-NC-LC', 'License');
+is($output->{availability}, 'ACA-NC-LC', 'License');
ok(!exists $output->{pages}, 'Pages');
ok(!exists $output->{fileEditionStatement}, 'File Statement');
ok(!exists $output->{biblEditionStatement}, 'Bibl Statement');
diff --git a/t/real/bzk_2.t b/t/real/bzk_2.t
index dba97cc..a44806b 100644
--- a/t/real/bzk_2.t
+++ b/t/real/bzk_2.t
@@ -46,7 +46,7 @@
is($meta->{creation_date}, '19590219', 'Creation date');
-is($meta->{license}, 'ACA-NC-LC', 'License');
+is($meta->{availability}, 'ACA-NC-LC', 'License');
ok(!$meta->{pages}, 'Pages');
ok(!$meta->{file_edition_statement}, 'File Statement');
@@ -112,7 +112,7 @@
is($output->{textClass}, 'politik ausland', 'Correct Text Domain');
is($output->{creationDate}, '19590219', 'Creation date');
-is($output->{license}, 'ACA-NC-LC', 'License');
+is($output->{availability}, 'ACA-NC-LC', 'License');
ok(!exists $output->{pages}, 'Pages');
ok(!exists $output->{fileEditionStatement}, 'File Statement');
ok(!exists $output->{biblEditionStatement}, 'Bibl Statement');
diff --git a/t/real/goethe.t b/t/real/goethe.t
index 204e217..9769af7 100644
--- a/t/real/goethe.t
+++ b/t/real/goethe.t
@@ -42,7 +42,7 @@
ok(!$meta->{text_column}, 'Correct Text Column');
ok(!$meta->{text_domain}, 'Correct Text Domain');
is($meta->{creation_date}, '18200000', 'Creation Date');
-is($meta->{license}, 'QAO-NC', 'License');
+is($meta->{availability}, 'QAO-NC', 'License');
is($meta->{pages}, '529-547', 'Pages');
ok(!$meta->{file_edition_statement}, 'File Ed Statement');
ok(!$meta->{bibl_edition_statement}, 'Bibl Ed Statement');
@@ -107,7 +107,7 @@
ok(!exists $output->{textColumn}, 'Correct Text Type');
ok(!exists $output->{textDomain}, 'Correct Text Type');
is($output->{creationDate}, '18200000', 'Creation Date');
-is($output->{license}, 'QAO-NC', 'License');
+is($output->{availability}, 'QAO-NC', 'License');
is($output->{pages}, '529-547', 'Pages');
ok(!exists $output->{fileEditionStatement}, 'Correct Text Type');
ok(!exists $output->{biblEditionStatement}, 'Correct Text Type');
diff --git a/t/real/wdd.t b/t/real/wdd.t
index 9d867f9..71d16d4 100644
--- a/t/real/wdd.t
+++ b/t/real/wdd.t
@@ -39,7 +39,7 @@
ok(!$meta->{text_type_ref}, 'Correct Text Type Ref');
ok(!$meta->{text_domain}, 'Correct Text Domain');
is($meta->{creation_date}, '20070707', 'Creation date');
-is($meta->{license}, 'CC-BY-SA', 'License');
+is($meta->{availability}, 'CC-BY-SA', 'License');
ok(!$meta->{pages}, 'Pages');
ok(!$meta->{file_edition_statement}, 'File Statement');
ok(!$meta->{bibl_edition_statement}, 'Bibl Statement');
@@ -100,7 +100,7 @@
ok(!$output->{textTypeRef}, 'Correct Text Type Ref');
ok(!$output->{textDomain}, 'Correct Text Domain');
is($output->{creationDate}, '20070707', 'Creation date');
-is($output->{license}, 'CC-BY-SA', 'License');
+is($output->{availability}, 'CC-BY-SA', 'License');
ok(!$output->{pages}, 'Pages');
ok(!$output->{fileEditionStatement}, 'File Statement');
ok(!$output->{biblEditionStatement}, 'Bibl Statement');