Simplified and modularized metadata processing
Change-Id: I63e78fd5994126c954263324bcfc2fd9d51e39ea
diff --git a/.gitignore b/.gitignore
index 7332d99..8518f50 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,8 @@
Makefile
pm_to_blib
t/corpus/BRZ13
+nytprof.out
+nytprof
*.tar.gz
*~
*.sqlite
diff --git a/Changes b/Changes
index 24425d0..96c7f55 100644
--- a/Changes
+++ b/Changes
@@ -1,7 +1,13 @@
+0.15 2016-03-17
+ - Modularized metadata handling.
+ - Simplified metadata handling.
+ - Added --meta option to script.
+ - Removed deprecated --human option from script.
+
0.14 2016-03-15
- Renamed ::Index to ::Annotate and ::Field to ::Index.
- Renamed 'allow' to 'anno' as parameters of the script.
- - Added readme
+ - Added readme.
0.13 2016-03-10
- Removed korapxml2krill_dir.
diff --git a/MANIFEST b/MANIFEST
index d2dfd51..cce10c3 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -13,6 +13,9 @@
lib/KorAP/XML/Index/MultiTermToken.pm
lib/KorAP/XML/Index/MultiTermTokenStream.pm
lib/KorAP/XML/Document/Primary.pm
+lib/KorAP/XML/Meta/Base.pm
+lib/KorAP/XML/Meta/I5.pm
+lib/KorAP/XML/Meta/Sgbr.pm
lib/KorAP/XML/Annotation/Base.pm
lib/KorAP/XML/Annotation/Base/Paragraphs.pm
lib/KorAP/XML/Annotation/Base/Sentences.pm
@@ -76,6 +79,7 @@
t/annotation/xip_morpho.t
t/annotation/xip_sentences.t
t/annotation/koralquery.t
+t/benchmark/parse_document.t
t/real/bzk.t
t/real/bzk_2.t
t/real/goethe.t
diff --git a/Readme.pod b/Readme.pod
index 149dab0..62662df 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -98,14 +98,16 @@
Defaults to C<0>.
This is I<experimental>.
-=item B<--human|-m>
+=item B<--meta|-m>
-Represent the data in an alternative human readible format.
-This is I<deprecated>.
+Define the metadata parser to use. Defaults to C<I5>.
+Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
+This is I<experimental>.
=item B<--pretty|-y>
Pretty print JSON output. Defaults to C<false>.
+This is I<deprecated>.
=item B<--gzip|-z>
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 519e26e..087b2ad 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -18,53 +18,11 @@
# Due to the kind of processing, processed metadata may be stored in
# a multiprocess cache instead.
-our $VERSION = '0.14';
-
-our @ATTR = qw/text_sigle
- doc_sigle
- corpus_sigle
- title
- pub_date
- sub_title
- pub_place
- author/;
-
-our @ADVANCED_ATTR = qw/publisher
- editor
- text_type
- text_type_art
- text_type_ref
- text_column
- text_domain
- creation_date
- license
- pages
- file_edition_statement
- bibl_edition_statement
- reference
- language
-
- doc_title
- doc_sub_title
- doc_editor
- doc_author
-
- corpus_author
- corpus_title
- corpus_sub_title
- corpus_editor
-
- availability
- pub_place_key
- /;
-# Separate: text_class, keywords
-
-# Removed: coll_title, coll_sub_title, coll_author, coll_editor
-# Introduced: doc_title, doc_sub_title, corpus_editor, doc_editor, corpus_author, doc_author
-
+our $VERSION = '0.15';
has 'path';
-has [@ATTR, @ADVANCED_ATTR];
+has [qw/text_sigle doc_sigle corpus_sigle/];
+has 'meta_type' => 'I5';
has log => sub {
if(Log::Log4perl->initialized()) {
@@ -74,7 +32,6 @@
return $log;
};
-
sub new {
my $class = shift;
my $self = bless { @_ }, $class;
@@ -87,9 +44,11 @@
return $self;
};
-# parse document
+
+# Parse document (primary data and metadata)
sub parse {
my $self = shift;
+ my $meta_data_type = $self->meta_type;
my $data_xml = $self->path . 'data.xml';
@@ -97,6 +56,7 @@
my $unable = 'Unable to parse document ' . $self->path;
+ # No primary data found
unless (-e $data_xml) {
$self->log->warn($unable . ' - no data.xml found');
$error = 1;
@@ -104,6 +64,7 @@
else {
+ # Load file
$file = b($data_xml)->slurp;
try {
@@ -148,12 +109,30 @@
my @path = grep { $_ } splitdir($self->path);
my @header;
- # Parse the corpus file, the doc file, and the text file for meta information
+ # Parse the corpus file, the doc file,
+ # and the text file for meta information
foreach (0..2) {
unshift @header, '/' . catfile(@path, 'header.xml');
pop @path;
};
+
+ my $meta_class = 'KorAP::XML::Meta::' . $meta_data_type;
+ my $meta;
+
+ if ($meta_class->can('new') || eval("require $meta_class; 1;")) {
+ $meta = $meta_class->new(
+ log => $self->log,
+ corpus_sigle => $self->corpus_sigle,
+ doc_sigle => $self->doc_sigle,
+ text_sigle => $self->text_sigle
+ );
+
+ $self->{meta} = $meta;
+ };
+
+ return unless $meta;
+
my @type = qw/corpus doc text/;
foreach (@header) {
# Get corpus, doc and text meta data
@@ -161,19 +140,22 @@
next unless -e $_;
+ # Slurp data and probably decode
my $slurp = b($_)->slurp;
- $slurp =~ /^[^>]+encoding\s*=\s*(["'])([^\1]+?)\1/;
+ $slurp =~ /^[^>]+encoding\s*=\s*(["'])([^\1]+?)\1/o;
my $file = $slurp->decode($2 // 'UTF-8');
# Get DOM
my $dom = Mojo::DOM->new($file);
- if ($dom->at('idsHeader') || $dom->at('idsheader')) {
- $self->_parse_meta_i5($dom, $type);
- }
- else {
- $self->_parse_meta_tei($dom, $type);
- };
+ # Choose which metadata parser to use
+# if ($dom->at('idsHeader') || $dom->at('idsheader')) {
+# $self->_parse_meta_i5($dom, $type);
+# }
+# else {
+# $self->_parse_meta_tei($dom, $type);
+# };
+ $meta->parse($dom, $type);
};
return $self;
@@ -241,405 +223,66 @@
$_[0]->{pd};
};
-#sub author {
-# my $self = shift;
-#
-# # Set authors
-# if ($_[0]) {
-# return $self->{authors} = [
-# grep { $_ !~ m{^\s*u\.a\.\s*$} } split(/;\s+/, shift())
-# ];
-# }
-# return ($self->{authors} // []);
-#};
-
-sub text_class {
- my $self = shift;
- if ($_[0]) {
- return $self->{topics} = [ @_ ];
- };
- return ($self->{topics} //= []);
+# Accessor for the metadata object that parse() stores on the document.
+sub meta {
+  my $self = shift;
+  return $self->{meta};
};
-sub text_class_string {
- return join ' ', @{shift->text_class};
-}
-
-sub keywords {
+sub to_hash {
my $self = shift;
- if ($_[0]) {
- return $self->{keywords} = [ @_ ];
- };
- return ($self->{keywords} //= []);
-};
-sub keywords_string {
- return join ' ', @{shift->keywords};
-}
+ $self->parse unless $self->text_sigle;
-sub _remove_prefix {
-# return $_[0];
+ my %hash;
- # This may render some titles wrong, e.g. 'VDI nachrichten 2014' ...
- my $title = shift;
- my $prefix = shift;
- $prefix =~ tr!_!/!;
- if (index($title, $prefix) == 0) {
- $title = substr($title, length($prefix));
- $title =~ s/^\s+//;
- $title =~ s/\s+$//;
- };
- return $title;
-};
+ # Get meta object
+ my $meta = $self->meta;
+ foreach (keys %$meta) {
+ # Ignore private keys
+ next if index($_, '_') == 0;
-sub _parse_meta_tei {
- my $self = shift;
- my $dom = shift;
- my $type = shift;
-
- my $stmt;
- if ($type eq 'text') {
-
- # Publisher
- try {
- $self->publisher($dom->at('publisher')->all_text);
- };
-
- # Date of publication
- try {
- my $date = $dom->at('date')->all_text;
- $self->store(sgbrDate => $date);
- if ($date =~ s!^\s*(\d{4})-(\d{2})-(\d{2}).*$!$1$2$3!) {
- $self->pub_date($date);
- }
- else {
- $self->log->warn('"' . $date . '" is not a compatible pubDate');
- };
- };
-
- # Publication place
- try {
- my $pp = $dom->at('pubPlace');
- if ($pp) {
- $self->pub_place($pp->all_text) if $pp->all_text;
- };
- if ($pp->attr('ref')) {
- $self->reference($pp->attr('ref'));
- };
- };
-
- if ($stmt = $dom->at('titleStmt')) {
- # Title
- try {
- $stmt->find('title')->each(
- sub {
- my $type = $_->attr('type') || 'main';
- $self->title($_->all_text) if $type eq 'main';
-
- # Only support the first subtitle
- $self->sub_title($_->all_text) if $type eq 'sub' && !$self->sub_title;
- }
- );
- };
-
- # Author
- try {
- my $author = $stmt->at('author')->attr('ref');
-
- $author = $self->{ref_author}->{$author};
-
- if ($author) {
- my $array = $self->keywords;
- $self->author($author->{name} // $author->{id});
-
- if ($author->{age}) {
- $self->store('sgbrAuthorAgeClass' => $author->{age});
- push @$array, 'sgbrAuthorAgeClass:' . $author->{age};
- };
- if ($author->{sex}) {
- $self->store('sgbrAuthorSex' => $author->{sex});
- push @$array, 'sgbrAuthorSex:' . $author->{sex};
- };
- };
- };
- };
-
- try {
- my $kodex = $dom->at('item[rend]')->attr('rend');
- if ($kodex) {
- my $array = $self->keywords;
- $self->store('sgbrKodex' => $kodex);
- push @$array, 'sgbrKodex:' . $kodex;
- };
- };
- }
-
- elsif ($type eq 'doc') {
- try {
- $dom->find('particDesc person')->each(
- sub {
-
- my $hash = $self->{ref_author}->{'#' . $_->attr('xml:id')} = {
- age => $_->attr('age'),
- sex => $_->attr('sex'),
- id => $_->attr('xml:id')
- };
-
- # Get name
- if ($_->at('persName')) {
- $hash->{name} = $_->at('persName')->all_text;
- };
- });
- };
-
- try {
- my $lang = $dom->at('language[ident]')->attr('ident');
- $self->language($lang);
- };
-
- try {
- $self->store('funder', $dom->at('funder > orgName')->all_text);
- };
-
- try {
- $stmt = $dom->find('fileDesc > titleStmt > title')->each(
- sub {
- my $type = $_->attr('type') || 'main';
- $self->doc_title($_->all_text) if $type eq 'main';
- if ($type eq 'sub') {
- my $sub_title = $self->doc_sub_title;
- $self->doc_sub_title(
- ($sub_title ? $sub_title . ', ' : '') . $_->all_text
- );
- };
- }
- );
- };
- };
- return;
-};
-
-
-
-sub _parse_meta_i5 {
- my $self = shift;
- my $dom = shift;
- my $type = shift;
-
- my $analytic = $dom->at('analytic') || $dom->at('monogr');
-
- # There is an analytic element
- if ($analytic) {
-
- # Get title, subtitle, author, editor
- my $title = $analytic->at('h\.title[type=main]');
- my $sub_title = $analytic->at('h\.title[type=sub]');
- my $author = $analytic->at('h\.author');
- my $editor = $analytic->at('editor');
-
- $title = $title ? $title->all_text : undef;
- $sub_title = $sub_title ? $sub_title->all_text : undef;
- $author = $author ? $author->all_text : undef;
- $editor = $editor ? $editor->all_text : undef;
-
- if ($type eq 'text') {
- $self->title(_remove_prefix($title, $self->text_sigle)) if $title;
- $self->sub_title($sub_title) if $sub_title;
- $self->editor($editor) if $editor;
- $self->author($author) if $author;
+ my $v = $meta->{$_};
+ if (ref $v) {
+ $hash{_k($_)} = $meta->keywords($_);
}
- elsif ($type eq 'doc') {
- $self->doc_title(_remove_prefix($title, $self->doc_sigle)) if $title;
- $self->doc_sub_title($sub_title) if $sub_title;
- $self->doc_author($author) if $author;
- $self->doc_editor($editor) if $editor;
- }
- elsif ($type eq 'corpus') {
- $self->corpus_title(_remove_prefix($title, $self->corpus_sigle)) if $title;
- $self->corpus_sub_title($sub_title) if $sub_title;
- $self->corpus_author($author) if $author;
- $self->corpus_editor($editor) if $editor;
+ else {
+ $v =~ s/\n/ /g;
+ $v =~ s/\s\s+/ /g;
+ $hash{_k($_)} = $v;
};
};
- # Not in analytic
- if ($type eq 'corpus') {
- unless ($self->corpus_title) {
- if (my $title = $dom->at('fileDesc > titleStmt > c\.title')) {
- $self->corpus_title(_remove_prefix($title->all_text, $self->corpus_sigle))
- if $title->all_text;
- };
- };
- }
-
- # doc title
- elsif ($type eq 'doc') {
- unless ($self->doc_title) {
- if (my $title = $dom->at('fileDesc > titleStmt > d\.title')) {
- $self->doc_title(_remove_prefix($title->all_text, $self->doc_sigle))
- if $title->all_text;
- };
- };
- }
-
- # text title
- elsif ($type eq 'text') {
- unless ($self->title) {
- if (my $title = $dom->at('fileDesc > titleStmt > t\.title')) {
- $self->title(_remove_prefix($title->all_text, $self->text_sigle))
- if $title->all_text;
- }
- };
+ foreach (qw/corpus doc text/) {
+ $hash{$_ . 'Sigle'} = $self->{$_ . '_sigle'};
};
- # Get PubPlace
- if (my $place = $dom->at('pubPlace')) {
- $self->pub_place($place->all_text) if $place->all_text;
- $self->pub_place_key($place->attr('key')) if $place->attr('key');
- };
-
- # Get Publisher
- if (my $publisher = $dom->at('imprint publisher')) {
- $self->publisher($publisher->all_text) if $publisher->all_text;
- };
-
- # Get text type
- my $text_desc = $dom->at('textDesc');
-
- if ($text_desc) {
- if (my $text_type = $text_desc->at('textType')) {
- $self->text_type($text_type->all_text) if $text_type->all_text;
- };
-
- # Get text domain
- if (my $text_domain = $text_desc->at('textDomain')) {
- $self->text_domain($text_domain->all_text) if $text_domain->all_text;
- };
-
- # Get text type art
- if (my $text_type_art = $text_desc->at('textTypeArt')) {
- $self->text_type_art($text_type_art->all_text) if $text_type_art->all_text;
- };
-
- # Get text type art
- if (my $text_type_ref = $text_desc->at('textTypeRef')) {
- $self->text_type_ref($text_type_ref->all_text) if $text_type_ref->all_text;
- };
- };
-
- # Availability
- try {
- $self->availability(
- $dom->at('availability')->all_text
- );
- };
-
- # Get pubDate
- my $pub_date = $dom->find('pubDate[type=year]');
- $pub_date->each(
- sub {
- my $x = shift->parent;
- my $year = $x->at("pubDate[type=year]");
- return unless $year;
-
- $year = $year ? $year->text : 0;
- my $month = $x->at("pubDate[type=month]");
- $month = $month ? $month->text : 0;
- my $day = $x->at("pubDate[type=day]");
- $day = $day ? $day->text : 0;
-
- $year = 0 if $year !~ /^\d+$/;
- $month = 0 if $month !~ /^\d+$/;
- $day = 0 if $day !~ /^\d+$/;
-
- my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
- $date .= length($month) == 1 ? '0' . $month : $month;
- $date .= length($day) == 1 ? '0' . $day : $day;
- $self->pub_date($date);
- });
-
- # creatDate
- my $create_date = $dom->at('creatDate');
- if ($create_date && $create_date->text) {
- $create_date = $create_date->all_text;
- if (index($create_date, '-') > -1) {
- $self->log->warn("Creation date ranges are not supported");
- ($create_date) = split /\s*-\s*/, $create_date;
- }
-
- $create_date =~ s{^(\d{4})$}{$1\.00};
- $create_date =~ s{^(\d{4})\.(\d{2})$}{$1\.$2\.00};
- if ($create_date =~ /^\d{4}\.\d{2}\.\d{2}$/) {
- $create_date =~ tr/\.//d;
- $self->creation_date($create_date);
- };
- };
-
- my $text_class = $dom->at('textClass');
- if ($text_class) {
- # Get textClasses
- my @topic;
-
- $text_class->find("catRef")->each(
- sub {
- my ($ign, @ttopic) = split('\.', $_->attr('target'));
- push(@topic, @ttopic);
- }
- );
- $self->text_class(@topic) if @topic > 0;
-
- my $kws = $self->keywords;
- my @keywords = $text_class->find("h\.keywords > keyTerm")->each;
- push(@$kws, @keywords) if @keywords > 0;
- };
-
- if (my $edition_statement = $dom->at('biblFull editionStmt')) {
- $self->bibl_edition_statement($edition_statement->all_text)
- if $edition_statement->text;
- };
-
- if (my $edition_statement = $dom->at('fileDescl editionStmt')) {
- $self->file_edition_statement($edition_statement->all_text)
- if $edition_statement->text;
- };
-
- if (my $file_desc = $dom->at('fileDesc')) {
- if (my $availability = $file_desc->at('publicationStmt > availability')) {
- $self->license($availability->all_text);
- };
- };
-
- # Some meta data only available in the corpus
- if ($type eq 'corpus') {
- if (my $language = $dom->at('profileDesc > langUsage > language[id]')) {
- $self->language($language->attr('id'));
- };
- }
-
- # Some meta data only reevant from the text
- elsif ($type eq 'text') {
-
- if (my $reference = $dom->at('sourceDesc reference[type=complete]')) {
- if (my $ref_text = $reference->all_text) {
- $ref_text =~ s!^[a-zA-Z0-9]+\/[a-zA-Z0-9]+\.\d+[\s:]\s*!!;
- $self->reference($ref_text);
- };
- };
-
- my $column = $dom->at('textDesc > column');
- $self->text_column($column->all_text) if $column;
-
- if (my $pages = $dom->at('biblStruct biblScope[type="pp"]')) {
- $pages = $pages->all_text;
- if ($pages && $pages =~ m/(\d+)\s*-\s*(\d+)/) {
- $self->pages($1 . '-' . $2);
- };
- };
- };
+ return \%hash;
};
+# Turn a snake_case field name into the camelCase key used in the
+# serialized hash; a trailing "id" (any case) is written as "ID".
+sub _k {
+  my ($key) = @_;
+
+  # Uppercase the character following each underscore
+  $key =~ s/_(\w)/\U$1\E/g;
+
+  # Normalize a trailing id
+  $key =~ s/id$/ID/gi;
+
+  return $key;
+};
+
+
+# Serialize the document by delegating to the tokenizer stored on it.
+# Logs a warning and returns nothing if no tokenizer is set.
+sub to_json {
+  my ($self) = @_;
+
+  # Delegate to the tokenizer, if there is one
+  if (my $tokenizer = $self->{tokenizer}) {
+    return $tokenizer->to_json;
+  };
+
+  $self->log->warn('No tokenizer defined');
+  return;
+};
+
+
+1;
+
+
+__END__
sub to_string {
my $self = shift;
@@ -660,38 +303,6 @@
return $string;
};
-sub _k {
- my $x = $_[0];
- $x =~ s/_(\w)/\U$1\E/g;
- $x =~ s/id$/ID/gi;
- return $x;
-};
-
-
-sub to_hash {
- my $self = shift;
-
- $self->parse unless $self->text_sigle;
-
- my %hash;
-
- foreach (@ATTR, @ADVANCED_ATTR, 'store') {
- if (my $att = $self->$_) {
- $att =~ s/\n/ /g;
- $att =~ s/\s\s+/ /g;
- $hash{_k($_)} = $att;
- };
- };
-
- for (qw/text_class keywords/) {
- my @array = @{ $self->$_ };
- next unless @array;
- $hash{_k($_)} = join(' ', @array);
- };
-
- return \%hash;
-};
-
# Todo: Make this a KoralQuery serializer
sub to_koral_query {
my $self = shift;
@@ -703,17 +314,6 @@
};
-sub to_json {
- my $self = shift;
- unless ($self->{tokenizer}) {
- $self->log->warn('No tokenizer defined');
- return;
- };
-
- return $self->{tokenizer}->to_json;
-};
-
-
1;
diff --git a/lib/KorAP/XML/Meta/Base.pm b/lib/KorAP/XML/Meta/Base.pm
new file mode 100644
index 0000000..a92ebd5
--- /dev/null
+++ b/lib/KorAP/XML/Meta/Base.pm
@@ -0,0 +1,53 @@
+package KorAP::XML::Meta::Base;
+use strict;
+use warnings;
+
+# Base class for metadata parsers in the KorAP::XML::Meta namespace.
+# Internal state (logger and sigles) is kept under keys with a leading
+# underscore; parser classes store the parsed fields as further keys.
+
+# Importing method: makes the calling package a subclass of this class
+# and enables strict, warnings, utf8 and the 5.10 feature bundle in it.
+sub import {
+  my $class = shift;
+  my $caller = caller;
+
+  {
+    no strict 'refs';
+    push @{"${caller}::ISA"}, $class;
+  }
+
+  # utf8 and feature are not loaded anywhere else in this file, and
+  # a call to a missing import method is skipped without any error -
+  # so load them explicitly before importing.
+  require utf8;
+  require feature;
+
+  strict->import;
+  warnings->import;
+  utf8->import;
+  feature->import(':5.10');
+};
+
+# Logger passed to the constructor
+sub log {
+  return $_[0]->{_log};
+};
+
+# Sigle of the corpus passed to the constructor
+sub corpus_sigle {
+  $_[0]->{_corpus_sigle};
+};
+
+# Sigle of the document passed to the constructor
+sub doc_sigle {
+  $_[0]->{_doc_sigle};
+};
+
+# Sigle of the text passed to the constructor
+sub text_sigle {
+  $_[0]->{_text_sigle};
+};
+
+# Constructor. Accepts the named parameters log, corpus_sigle,
+# doc_sigle and text_sigle; all other parameters are ignored.
+sub new {
+  my ($class, %param) = @_;
+
+  # Copy the supported parameters to underscore-prefixed keys
+  my %self = map {
+    ('_' . $_ => $param{$_})
+  } qw/log corpus_sigle doc_sigle text_sigle/;
+
+  return bless \%self, $class;
+};
+
+# Return the list stored under the given key as one space separated
+# string (the empty string, if nothing is stored under that key).
+sub keywords {
+  my ($self, $key) = @_;
+  return join(' ', @{ $self->{$key} // [] });
+};
+
+
+1;
diff --git a/lib/KorAP/XML/Meta/I5.pm b/lib/KorAP/XML/Meta/I5.pm
new file mode 100644
index 0000000..e03640c
--- /dev/null
+++ b/lib/KorAP/XML/Meta/I5.pm
@@ -0,0 +1,274 @@
+package KorAP::XML::Meta::I5;
+use KorAP::XML::Meta::Base;
+use Try::Tiny;
+
+# Parse the metadata of one I5 header and store the values on the object.
+#
+# Parameters:
+#   $dom  - DOM of the header file (queried with at() and find())
+#   $type - level of the header: 'corpus', 'doc' or 'text'
+#
+# The parsed fields are stored as keys on $self; the return value
+# carries no information.
+sub parse {
+  my ($self, $dom, $type) = @_;
+
+  my $analytic = $dom->at('analytic') || $dom->at('monogr');
+
+  # There is an analytic (or monogr) element
+  if ($analytic) {
+
+    # Get title, subtitle, author, editor
+    my $title = $analytic->at('h\.title[type=main]');
+    my $sub_title = $analytic->at('h\.title[type=sub]');
+    my $author = $analytic->at('h\.author');
+    my $editor = $analytic->at('editor');
+
+    $title = $title ? $title->all_text : undef;
+    $sub_title = $sub_title ? $sub_title->all_text : undef;
+    $author = $author ? $author->all_text : undef;
+    $editor = $editor ? $editor->all_text : undef;
+
+    if ($type eq 'text') {
+      $self->{title} = _remove_prefix($title, $self->text_sigle) if $title;
+      $self->{sub_title} = $sub_title if $sub_title;
+      $self->{editor} = $editor if $editor;
+      $self->{author} = $author if $author;
+    }
+    elsif ($type eq 'doc') {
+      $self->{doc_title} = _remove_prefix($title, $self->doc_sigle) if $title;
+      $self->{doc_sub_title} = $sub_title if $sub_title;
+      $self->{doc_author} = $author if $author;
+      $self->{doc_editor} = $editor if $editor;
+    }
+    elsif ($type eq 'corpus') {
+      $self->{corpus_title} = _remove_prefix($title, $self->corpus_sigle) if $title;
+      $self->{corpus_sub_title} = $sub_title if $sub_title;
+      $self->{corpus_author} = $author if $author;
+      $self->{corpus_editor} = $editor if $editor;
+    };
+  };
+
+  # Title not found in analytic - fall back to the title statement
+  if ($type eq 'corpus') {
+    unless ($self->{corpus_title}) {
+      if (my $title = $dom->at('fileDesc > titleStmt > c\.title')) {
+        $self->{corpus_title} = _remove_prefix($title->all_text, $self->corpus_sigle)
+          if $title->all_text;
+      };
+    };
+  }
+
+  # doc title
+  elsif ($type eq 'doc') {
+    unless ($self->{doc_title}) {
+      if (my $title = $dom->at('fileDesc > titleStmt > d\.title')) {
+        $self->{doc_title} = _remove_prefix($title->all_text, $self->doc_sigle)
+          if $title->all_text;
+      };
+    };
+  }
+
+  # text title
+  elsif ($type eq 'text') {
+    unless ($self->{title}) {
+      if (my $title = $dom->at('fileDesc > titleStmt > t\.title')) {
+        $self->{title} = _remove_prefix($title->all_text, $self->text_sigle)
+          if $title->all_text;
+      }
+    };
+  };
+
+  # Get PubPlace
+  if (my $place = $dom->at('pubPlace')) {
+    $self->{pub_place} = $place->all_text if $place->all_text;
+    $self->{pub_place_key} = $place->attr('key') if $place->attr('key');
+  };
+
+  # Get Publisher
+  if (my $publisher = $dom->at('imprint publisher')) {
+    $self->{publisher} = $publisher->all_text if $publisher->all_text;
+  };
+
+  # Get text type
+  my $text_desc = $dom->at('textDesc');
+
+  if ($text_desc) {
+    if (my $text_type = $text_desc->at('textType')) {
+      $self->{text_type} = $text_type->all_text if $text_type->all_text;
+    };
+
+    # Get text domain
+    if (my $text_domain = $text_desc->at('textDomain')) {
+      $self->{text_domain} = $text_domain->all_text if $text_domain->all_text;
+    };
+
+    # Get text type art
+    if (my $text_type_art = $text_desc->at('textTypeArt')) {
+      $self->{text_type_art} = $text_type_art->all_text if $text_type_art->all_text;
+    };
+
+    # Get text type ref
+    if (my $text_type_ref = $text_desc->at('textTypeRef')) {
+      $self->{text_type_ref} = $text_type_ref->all_text if $text_type_ref->all_text;
+    };
+  };
+
+  # Availability (stays unset, if there is no such element)
+  try {
+    $self->{availability} = $dom->at('availability')->all_text;
+  };
+
+  # Get pubDate as YYYYMMDD - unknown or non-numeric parts become zeros
+  my $pub_date = $dom->find('pubDate[type=year]');
+  $pub_date->each(
+    sub {
+      my $x = shift->parent;
+      my $year = $x->at("pubDate[type=year]");
+      return unless $year;
+
+      $year = $year ? $year->text : 0;
+      my $month = $x->at("pubDate[type=month]");
+      $month = $month ? $month->text : 0;
+      my $day = $x->at("pubDate[type=day]");
+      $day = $day ? $day->text : 0;
+
+      $year = 0 if $year !~ /^\d+$/;
+      $month = 0 if $month !~ /^\d+$/;
+      $day = 0 if $day !~ /^\d+$/;
+
+      # Years below 100 are prefixed with '20'
+      my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
+      $date .= length($month) == 1 ? '0' . $month : $month;
+      $date .= length($day) == 1 ? '0' . $day : $day;
+      $self->{pub_date} = $date;
+    });
+
+  # creatDate
+  my $create_date = $dom->at('creatDate');
+  if ($create_date && $create_date->text) {
+    $create_date = $create_date->all_text;
+
+    # Only the start of a date range is used
+    if (index($create_date, '-') > -1) {
+      $self->log->warn("Creation date ranges are not supported");
+      ($create_date) = split /\s*-\s*/, $create_date;
+    }
+
+    # Pad missing month and day with zeros
+    $create_date =~ s{^(\d{4})$}{$1\.00};
+    $create_date =~ s{^(\d{4})\.(\d{2})$}{$1\.$2\.00};
+    if ($create_date =~ /^\d{4}\.\d{2}\.\d{2}$/) {
+      $create_date =~ tr/\.//d;
+      $self->{creation_date} = $create_date;
+    };
+  };
+
+  my $text_class = $dom->at('textClass');
+  if ($text_class) {
+    # Get textClasses - the first segment of each target is ignored
+    my @topic;
+
+    $text_class->find("catRef")->each(
+      sub {
+        my ($ign, @ttopic) = split('\.', $_->attr('target') // '');
+        push(@topic, @ttopic);
+      }
+    );
+    $self->{text_class} = [@topic] if @topic > 0;
+
+    # Get keywords. The selector has to be single quoted: in double
+    # quotes the backslash is dropped and 'h.keywords' is read as
+    # tag 'h' with class 'keywords' instead of the tag 'h.keywords'.
+    my @keywords = map { $_->all_text }
+      $text_class->find('h\.keywords > keyTerm')->each;
+
+    # Append to the list stored on the object (created on first use)
+    push(@{ $self->{keywords} //= [] }, @keywords) if @keywords > 0;
+  };
+
+  if (my $edition_statement = $dom->at('biblFull editionStmt')) {
+    $self->{bibl_edition_statement} = $edition_statement->all_text
+      if $edition_statement->text;
+  };
+
+  # NOTE(review): 'fileDescl' looks like a typo for 'fileDesc' - as
+  # written this selector presumably never matches. Confirm against the
+  # corpus and the tests before changing it.
+  if (my $edition_statement = $dom->at('fileDescl editionStmt')) {
+    $self->{file_edition_statement} = $edition_statement->all_text
+      if $edition_statement->text;
+  };
+
+  if (my $file_desc = $dom->at('fileDesc')) {
+    if (my $availability = $file_desc->at('publicationStmt > availability')) {
+      $self->{license} = $availability->all_text;
+    };
+  };
+
+  # Some meta data only available in the corpus
+  if ($type eq 'corpus') {
+    if (my $language = $dom->at('profileDesc > langUsage > language[id]')) {
+      $self->{language} = $language->attr('id');
+    };
+  }
+
+  # Some meta data only relevant from the text
+  elsif ($type eq 'text') {
+
+    if (my $reference = $dom->at('sourceDesc reference[type=complete]')) {
+      if (my $ref_text = $reference->all_text) {
+        # Remove a leading sigle from the reference
+        $ref_text =~ s!^[a-zA-Z0-9]+\/[a-zA-Z0-9]+\.\d+[\s:]\s*!!;
+        $self->{reference} = $ref_text;
+      };
+    };
+
+    my $column = $dom->at('textDesc > column');
+    $self->{text_column} = $column->all_text if $column;
+
+    if (my $pages = $dom->at('biblStruct biblScope[type="pp"]')) {
+      $pages = $pages->all_text;
+      if ($pages && $pages =~ m/(\d+)\s*-\s*(\d+)/) {
+        $self->{pages} = $1 . '-' . $2;
+      };
+    };
+  };
+
+  return;
+};
+
+
+# Strip a leading sigle from a title.
+# The sigle is compared with its underscores turned into slashes. If the
+# title starts with it, the sigle and the surrounding whitespace are
+# removed; otherwise (or without a true sigle) the title is returned as is.
+# Caveat: This may render some titles wrong, e.g. 'VDI nachrichten 2014' ...
+sub _remove_prefix {
+  my ($title, $sigle) = @_;
+  return $title unless $sigle;
+
+  (my $prefix = $sigle) =~ tr!_!/!;
+
+  # The title does not start with the prefix
+  return $title if index($title, $prefix) != 0;
+
+  my $rest = substr($title, length($prefix));
+  $rest =~ s/^\s+//;
+  $rest =~ s/\s+$//;
+  return $rest;
+};
+
+
+#sub author {
+# my $self = shift;
+#
+# # Set authors
+# if ($_[0]) {
+# return $self->{authors} = [
+# grep { $_ !~ m{^\s*u\.a\.\s*$} } split(/;\s+/, shift())
+# ];
+# }
+# return ($self->{authors} // []);
+#};
+#sub text_class {
+# my $self = shift;
+# if ($_[0]) {
+# return $self->{topics} = [ @_ ];
+# };
+# return ($self->{topics} //= []);
+#};
+
+#sub text_class_string {
+# return join ' ', @{shift->text_class};
+#}
+
+#sub keywords {
+# my $self = shift;
+# if ($_[0]) {
+# return $self->{keywords} = [ @_ ];
+# };
+# return ($self->{keywords} //= []);
+#};
+
+#sub keywords_string {
+# return join ' ', @{shift->keywords};
+#}
+
+
+1;
diff --git a/lib/KorAP/XML/Meta/Sgbr.pm b/lib/KorAP/XML/Meta/Sgbr.pm
new file mode 100644
index 0000000..2d33975
--- /dev/null
+++ b/lib/KorAP/XML/Meta/Sgbr.pm
@@ -0,0 +1,133 @@
+package KorAP::XML::Meta::Sgbr;
+use KorAP::XML::Meta::Base;
+use Try::Tiny;
+
+# Parse the metadata of one Sgbr header and store the values on the object.
+#
+# Parameters:
+#   $dom  - DOM of the header file (queried with at() and find())
+#   $type - level of the header; only 'text' and 'doc' are evaluated
+#
+# Most lookups are wrapped in try, so a missing element just leaves
+# the respective field unset. Always returns nothing.
+sub parse {
+  my ($self, $dom, $type) = @_;
+
+  if ($type eq 'text') {
+
+    # Publisher
+    try {
+      $self->{publisher} = $dom->at('publisher')->all_text;
+    };
+
+    # Date of publication - the raw value is kept as sgbr_date,
+    # the normalized form (YYYYMMDD) as pub_date
+    try {
+      my $date = $dom->at('date')->all_text;
+      $self->{sgbr_date} = $date;
+      if ($date =~ s!^\s*(\d{4})-(\d{2})-(\d{2}).*$!$1$2$3!) {
+        $self->{pub_date} = $date;
+      }
+      else {
+        $self->log->warn('"' . $date . '" is not a compatible pubDate');
+      };
+    };
+
+    # Publication place
+    try {
+      if (my $pp = $dom->at('pubPlace')) {
+        $self->{pub_place} = $pp->all_text if $pp->all_text;
+        $self->{reference} = $pp->attr('ref') if $pp->attr('ref');
+      };
+    };
+
+    if (my $stmt = $dom->at('titleStmt')) {
+      # Title
+      try {
+        $stmt->find('title')->each(
+          sub {
+            my $title_type = $_->attr('type') || 'main';
+            $self->{title} = $_->all_text if $title_type eq 'main';
+
+            # Only support the first subtitle.
+            # The field is read directly, as the base class provides
+            # no sub_title accessor.
+            $self->{sub_title} = $_->all_text
+              if $title_type eq 'sub' && !$self->{sub_title};
+          }
+        );
+      };
+
+      # Author - resolved against the persons collected from the doc header
+      try {
+        my $author = $stmt->at('author')->attr('ref');
+
+        $author = $self->{_ref_author}->{$author};
+
+        if ($author) {
+          my $array = ($self->{keywords} //= []);
+          $self->{author} = $author->{name} // $author->{id};
+
+          if ($author->{age}) {
+            $self->{'sgbr_author_age_class'} = $author->{age};
+            push @$array, 'sgbrAuthorAgeClass:' . $author->{age};
+          };
+          if ($author->{sex}) {
+            $self->{'sgbr_author_sex'} = $author->{sex};
+            push @$array, 'sgbrAuthorSex:' . $author->{sex};
+          };
+        };
+      };
+    };
+
+    # Kodex
+    try {
+      my $kodex = $dom->at('item[rend]')->attr('rend');
+      if ($kodex) {
+        my $array = ($self->{keywords} //= []);
+        $self->{'sgbr_kodex'} = $kodex;
+        push @$array, 'sgbrKodex:' . $kodex;
+      };
+    };
+  }
+
+  elsif ($type eq 'doc') {
+
+    # Collect all persons by id, so text headers can refer to them
+    try {
+      $dom->find('particDesc person')->each(
+        sub {
+
+          my $hash = $self->{_ref_author}->{'#' . $_->attr('xml:id')} = {
+            age => $_->attr('age'),
+            sex => $_->attr('sex'),
+            id => $_->attr('xml:id')
+          };
+
+          # Get name
+          if ($_->at('persName')) {
+            $hash->{name} = $_->at('persName')->all_text;
+          };
+        });
+    };
+
+    # Language
+    try {
+      my $lang = $dom->at('language[ident]')->attr('ident');
+      $self->{language} = $lang;
+    };
+
+    # Funder
+    try {
+      $self->{'funder'} = $dom->at('funder > orgName')->all_text;
+    };
+
+    # Document title - multiple sub titles are joined by commas
+    try {
+      $dom->find('fileDesc > titleStmt > title')->each(
+        sub {
+          my $title_type = $_->attr('type') || 'main';
+          $self->{doc_title} = $_->all_text if $title_type eq 'main';
+          if ($title_type eq 'sub') {
+            my $sub_title = $self->{doc_sub_title};
+            $self->{doc_sub_title} =
+              ($sub_title ? $sub_title . ', ' : '') . $_->all_text;
+          };
+        }
+      );
+    };
+  };
+  return;
+};
+
+1;
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 0d7fb40..c5db742 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -46,9 +46,12 @@
#
# 2016/02/27
# - Added extract function
+#
+# 2016/03/17
+# - Added meta switch
# ----------------------------------------------------------
-our $LAST_CHANGE = '2016/03/02';
+our $LAST_CHANGE = '2016/03/17';
our $LOCAL = $FindBin::Bin;
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
@@ -63,13 +66,15 @@
};
my (@skip, @sigle);
+my $text;
# Parse options from the command line
GetOptions(
'input|i=s' => \(my $input),
'output|o=s' => \(my $output),
'overwrite|w' => \(my $overwrite),
- 'human|m' => \(my $text),
+# 'human|m' => \(my $text),
+ 'meta|m=s' => \(my $meta),
'token|t=s' => \(my $token_base),
'gzip|z' => \(my $gzip),
'skip|s=s' => \@skip,
@@ -136,7 +141,8 @@
my $call = 'perl ' . $LOCAL . '/korapxml2krill -i ' .
$anno . ' -o ' . $output . '/' . $file . '.json';
$call .= '.gz -z' if $gzip;
- $call .= ' -m' if $text;
+# $call .= ' -m' if $text;
+ $call .= ' -m ' . $meta if $meta;
$call .= ' -w' if $overwrite;
$call .= ' -t ' . $token_base if $token_base;
$call .= ' -l ' . $log_level if $log_level;
@@ -185,7 +191,10 @@
# Create and parse new document
$input =~ s{([^/])$}{$1/};
- my $doc = KorAP::XML::Krill->new( path => $input );
+ my $doc = KorAP::XML::Krill->new(
+ path => $input,
+ meta_type => ($meta // 'I5')
+ );
unless ($doc->parse) {
$log->warn($output . " can't be processed - no document data");
@@ -278,9 +287,7 @@
};
my $file;
-
- my $print_text = $text ? $tokens->to_string($primary) :
- ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));
+ my $print_text = ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));
if ($output) {
@@ -597,14 +604,16 @@
Defaults to C<0>.
This is I<experimental>.
-=item B<--human|-m>
+=item B<--meta|-m>
-Represent the data in an alternative human readible format.
-This is I<deprecated>.
+Define the metadata parser to use. Defaults to C<I5>.
+Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
+This is I<experimental>.
=item B<--pretty|-y>
Pretty print JSON output. Defaults to C<false>.
+This is I<deprecated>.
=item B<--gzip|-z>
diff --git a/t/annotation/meta.t b/t/annotation/meta.t
index 713d8fd..f89c934 100644
--- a/t/annotation/meta.t
+++ b/t/annotation/meta.t
@@ -2,6 +2,7 @@
use strict;
use warnings;
use utf8;
+use JSON::XS;
use Test::More;
use Scalar::Util qw/weaken/;
use Data::Dumper;
@@ -22,36 +23,39 @@
is($doc->doc_sigle, 'Corpus_Doc', 'ID-doc');
is($doc->corpus_sigle, 'Corpus', 'ID-corpus');
-is($doc->title, 'Beispiel Text', 'title');
-is($doc->sub_title, 'Beispiel Text Untertitel', 'title');
-is($doc->pub_date, '20010402', 'Publication date');
-is($doc->pub_place, 'Mannheim', 'Publication place');
-is($doc->author, 'Mustermann, Max', 'Author');
+my $meta = $doc->meta;
-is($doc->publisher, 'Artificial articles Inc.', 'Publisher');
-is($doc->editor, 'Monika Mustermann', 'Editor');
-is($doc->text_type, 'Zeitung: Tageszeitung', 'Text Type');
-is($doc->text_type_art, 'Bericht', 'Text Type Art');
-is($doc->text_type_ref, 'Aphorismen', 'Text Type Ref');
-ok(!$doc->text_column, 'Text Column');
-ok(!$doc->text_domain, 'Text Domain');
-is($doc->creation_date, '19990601', 'Creation Date');
-ok(!$doc->license, 'License');
-ok(!$doc->pages, 'Pages');
-ok(!$doc->file_edition_statement, 'File Edition Statement');
-ok(!$doc->bibl_edition_statement, 'Bibl Edition Statement');
-ok(!$doc->reference, 'Reference');
-is($doc->language, 'de', 'Language');
+is($meta->{title}, 'Beispiel Text', 'title');
+is($meta->{sub_title}, 'Beispiel Text Untertitel', 'title');
+is($meta->{pub_date}, '20010402', 'Publication date');
+is($meta->{pub_place}, 'Mannheim', 'Publication place');
+is($meta->{author}, 'Mustermann, Max', 'Author');
-is($doc->doc_title, 'Beispiel Dokument', 'Doc: title');
-ok(!$doc->doc_sub_title, 'Doc: subtitle');
-ok(!$doc->doc_editor, 'Doc: editor');
-ok(!$doc->doc_author, 'Doc: author');
+is($meta->{publisher}, 'Artificial articles Inc.', 'Publisher');
+is($meta->{editor}, 'Monika Mustermann', 'Editor');
+is($meta->{text_type}, 'Zeitung: Tageszeitung', 'Text Type');
+is($meta->{text_type_art}, 'Bericht', 'Text Type Art');
+is($meta->{text_type_ref}, 'Aphorismen', 'Text Type Ref');
+ok(!$meta->{text_column}, 'Text Column');
+ok(!$meta->{text_domain}, 'Text Domain');
+is($meta->{creation_date}, '19990601', 'Creation Date');
+ok(!$meta->{license}, 'License');
+ok(!$meta->{pages}, 'Pages');
+ok(!$meta->{file_edition_statement}, 'File Edition Statement');
+ok(!$meta->{bibl_edition_statement}, 'Bibl Edition Statement');
+ok(!$meta->{reference}, 'Reference');
+is($meta->{language}, 'de', 'Language');
-is($doc->corpus_title, 'Werke von Beispiel', 'Corpus: title');
-ok(!$doc->corpus_sub_title, 'Corpus: subtitle');
-is($doc->corpus_editor, 'Mustermann, Monika', 'Corpus: editor');
-is($doc->corpus_author, 'Mustermann, Max', 'Corpus: author');
+is($meta->{doc_title}, 'Beispiel Dokument', 'Doc: title');
+ok(!$meta->{doc_sub_title}, 'Doc: subtitle');
+ok(!$meta->{doc_editor}, 'Doc: editor');
+ok(!$meta->{doc_author}, 'Doc: author');
+
+is($meta->{corpus_title}, 'Werke von Beispiel', 'Corpus: title');
+ok(!$meta->{corpus_sub_title}, 'Corpus: subtitle');
+is($meta->{corpus_editor}, 'Mustermann, Monika', 'Corpus: editor');
+is($meta->{corpus_author}, 'Mustermann, Max', 'Corpus: author');
+
done_testing;
diff --git a/t/meta.t b/t/meta.t
index d09e4ec..e7d9f0e 100644
--- a/t/meta.t
+++ b/t/meta.t
@@ -23,17 +23,18 @@
# Metdata
is($doc->text_sigle, 'WPD_AAA.00001', 'ID');
-is($doc->title, 'A', 'title');
-ok(!$doc->sub_title, 'subTitle');
+my $meta = $doc->meta;
+is($meta->{title}, 'A', 'title');
+ok(!$meta->{sub_title}, 'subTitle');
is($doc->corpus_sigle, 'WPD', 'corpusID');
-is($doc->pub_date, '20050328', 'pubDate');
-is($doc->pub_place, 'URL:http://de.wikipedia.org', 'pubPlace');
-is($doc->text_class->[0], 'freizeit-unterhaltung', 'TextClass');
-is($doc->text_class->[1], 'reisen', 'TextClass');
-is($doc->text_class->[2], 'wissenschaft', 'TextClass');
-is($doc->text_class->[3], 'populaerwissenschaft', 'TextClass');
-ok(!$doc->text_class->[4], 'TextClass');
-is($doc->author, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author');
+is($meta->{pub_date}, '20050328', 'pubDate');
+is($meta->{pub_place}, 'URL:http://de.wikipedia.org', 'pubPlace');
+is($meta->{text_class}->[0], 'freizeit-unterhaltung', 'TextClass');
+is($meta->{text_class}->[1], 'reisen', 'TextClass');
+is($meta->{text_class}->[2], 'wissenschaft', 'TextClass');
+is($meta->{text_class}->[3], 'populaerwissenschaft', 'TextClass');
+ok(!$meta->{text_class}->[4], 'TextClass');
+is($meta->{author}, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author');
#is($doc->author->[0], 'Ruru', 'author');
#is($doc->author->[1], 'Jens.Ol', 'author');
@@ -41,16 +42,16 @@
#ok(!$doc->author->[3], 'author');
# Additional information
-ok(!$doc->editor, 'Editor');
-is($doc->publisher, 'Wikipedia', 'Publisher');
-is($doc->creation_date, '20050000', 'Creation date');
-ok(!$doc->text_type, 'No text_type');
-ok(!$doc->text_type_art, 'no text_type art');
-ok(!$doc->text_type_ref, 'no text_type ref');
-ok(!$doc->text_domain, 'no text_domain');
-ok(!$doc->text_column, 'no text_column');
-ok(!$doc->keywords_string, 'no keywords');
-is($doc->text_class_string, 'freizeit-unterhaltung reisen wissenschaft populaerwissenschaft', 'no text classes');
+ok(!$meta->{editor}, 'Editor');
+is($meta->{publisher}, 'Wikipedia', 'Publisher');
+is($meta->{creation_date}, '20050000', 'Creation date');
+ok(!$meta->{text_type}, 'No text_type');
+ok(!$meta->{text_type_art}, 'no text_type art');
+ok(!$meta->{text_type_ref}, 'no text_type ref');
+ok(!$meta->{text_domain}, 'no text_domain');
+ok(!$meta->{text_column}, 'no text_column');
+ok(!$meta->keywords('keywords'), 'no keywords');
+is($meta->keywords('text_class'), 'freizeit-unterhaltung reisen wissenschaft populaerwissenschaft', 'text classes');
#is($doc->coll_title, 'Wikipedia', 'Collection title');
#is($doc->coll_sub_title, 'Die freie Enzyklopädie', 'Collection subtitle');
@@ -62,50 +63,54 @@
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
-is($doc->title, 'Fischer und Kolp im Sonnenhügel', 'title');
-ok(!$doc->sub_title, 'subTitle');
+$meta = $doc->meta;
+is($meta->{title}, 'Fischer und Kolp im Sonnenhügel', 'title');
+
+ok(!$meta->{sub_title}, 'subTitle');
is($doc->text_sigle, 'A01_APR.13047', 'ID');
is($doc->corpus_sigle, 'A01', 'corpusID');
-is($doc->pub_date, '20010402', 'pubDate');
-ok(!$doc->pub_place, 'pubPlace');
-is($doc->text_class->[0], 'freizeit-unterhaltung', 'TextClass');
-is($doc->text_class->[1], 'vereine-veranstaltungen', 'TextClass');
-ok(!$doc->text_class->[2], 'TextClass');
-ok(!$doc->author, 'author');
+is($meta->{pub_date}, '20010402', 'pubDate');
+ok(!$meta->{pub_place}, 'pubPlace');
+is($meta->{text_class}->[0], 'freizeit-unterhaltung', 'TextClass');
+is($meta->{text_class}->[1], 'vereine-veranstaltungen', 'TextClass');
+ok(!$meta->{text_class}->[2], 'TextClass');
+ok(!$meta->{author}, 'author');
# Additional information
-ok(!$doc->editor, 'Editor');
-ok(!$doc->publisher, 'Publisher');
-is($doc->creation_date, '20010402', 'Creation date');
+ok(!$meta->{editor}, 'Editor');
+ok(!$meta->{publisher}, 'Publisher');
+is($meta->{creation_date}, '20010402', 'Creation date');
#ok(!$doc->coll_title, 'Collection title');
#ok(!$doc->coll_sub_title, 'Collection subtitle');
#ok(!$doc->coll_editor, 'Collection editor');
#ok(!$doc->coll_author, 'Collection author');
-ok(!$doc->text_type, 'text_type');
-is($doc->text_type_art, 'Bericht', 'text_type art');
+ok(!$meta->{text_type}, 'text_type');
+is($meta->{text_type_art}, 'Bericht', 'text_type art');
# ERL/0001
$path = catdir(dirname(__FILE__), 'corpus/ERL/00001');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
-is($doc->title, 'Amtsblatt des Landesbezirks Baden [diverse Erlasse]', 'title'); # Amtsblatt des Landesbezirks Baden [diverse Erlasse]
+
+$meta = $doc->meta;
+is($meta->{title}, 'Amtsblatt des Landesbezirks Baden [diverse Erlasse]', 'title'); # Amtsblatt des Landesbezirks Baden [diverse Erlasse]
# MK2/ERL.00001
-ok(!$doc->sub_title, 'subTitle');
+ok(!$meta->{sub_title}, 'subTitle');
is($doc->text_sigle, 'MK2_ERL.00001', 'ID');
is($doc->corpus_sigle, 'MK2', 'corpusID');
-is($doc->pub_date, '00000000', 'pubDate');
-is($doc->pub_place, 'Karlsruhe', 'pubPlace');
-is($doc->text_class->[0], 'politik', 'TextClass');
-is($doc->text_class->[1], 'kommunalpolitik', 'TextClass');
-ok(!$doc->text_class->[2], 'TextClass');
-ok(!$doc->author, 'author');
+is($meta->{pub_date}, '00000000', 'pubDate');
+is($meta->{pub_place}, 'Karlsruhe', 'pubPlace');
+is($meta->{text_class}->[0], 'politik', 'TextClass');
+is($meta->{text_class}->[1], 'kommunalpolitik', 'TextClass');
+ok(!$meta->{text_class}->[2], 'TextClass');
+ok(!$meta->{author}, 'author');
# Additional information
-ok(!$doc->editor, 'Editor');
-is($doc->publisher, 'Badenia Verlag und Druckerei', 'Publisher');
-is($doc->creation_date, '19600000', 'Creation date');
+ok(!$meta->{editor}, 'Editor');
+is($meta->{publisher}, 'Badenia Verlag und Druckerei', 'Publisher');
+is($meta->{creation_date}, '19600000', 'Creation date');
# !!!
# diag 'Non-acceptance of creation date ranges may be temporary';
@@ -115,121 +120,126 @@
#ok(!$doc->coll_sub_title, 'Collection subtitle');
#ok(!$doc->coll_editor, 'Collection editor');
#ok(!$doc->coll_author, 'Collection author');
-is($doc->text_type, 'Erlass', 'text_type');
-ok(!$doc->text_type_art, 'text_type art');
+is($meta->{text_type}, 'Erlass', 'text_type');
+ok(!$meta->{text_type_art}, 'text_type art');
+
# A01/02035-substring
$path = catdir(dirname(__FILE__), 'corpus/A00/02035-substring');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
-
ok($doc->parse, 'Parse document');
-is($doc->title, 'St. Galler Tagblatt, 11.01.2000, Ressort: TB-RSP (Abk.)', 'title'); # A00/JAN.02035
-ok(!$doc->sub_title, 'subTitle');
+
+$meta = $doc->meta;
+
+is($meta->{title}, 'St. Galler Tagblatt, 11.01.2000, Ressort: TB-RSP (Abk.)', 'title'); # A00/JAN.02035
+ok(!$meta->{sub_title}, 'subTitle');
is($doc->text_sigle, 'A00_JAN.02035', 'ID');
is($doc->corpus_sigle, 'A00', 'corpusID');
-is($doc->pub_date, '20000111', 'pubDate');
-ok(!$doc->pub_place, 'pubPlace');
-is($doc->text_class->[0], 'sport', 'TextClass');
-is($doc->text_class->[1], 'ballsport', 'TextClass');
-ok(!$doc->text_class->[2], 'TextClass');
-ok(!$doc->author, 'author');
+is($meta->{pub_date}, '20000111', 'pubDate');
+ok(!$meta->{pub_place}, 'pubPlace');
+is($meta->{text_class}->[0], 'sport', 'TextClass');
+is($meta->{text_class}->[1], 'ballsport', 'TextClass');
+ok(!$meta->{text_class}->[2], 'TextClass');
+ok(!$meta->{author}, 'author');
# Additional information
-ok(!$doc->editor, 'Editor');
-ok(!$doc->publisher, 'Publisher');
-is($doc->creation_date, "20000111", 'Creation date');
+ok(!$meta->{editor}, 'Editor');
+ok(!$meta->{publisher}, 'Publisher');
+is($meta->{creation_date}, "20000111", 'Creation date');
#ok(!$doc->coll_title, 'Collection title');
#ok(!$doc->coll_sub_title, 'Collection subtitle');
#ok(!$doc->coll_editor, 'Collection editor');
#ok(!$doc->coll_author, 'Collection author');
-ok(!$doc->text_type, 'text_type');
-is($doc->text_type_art, 'Bericht', 'text_type art');
+ok(!$meta->{text_type}, 'text_type');
+is($meta->{text_type_art}, 'Bericht', 'text_type art');
# A01/02873-meta
$path = catdir(dirname(__FILE__), 'corpus/A00/02873-meta');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
-
ok($doc->parse, 'Parse document');
-is($doc->title, 'Tradition und Moderne', 'title');
-ok(!$doc->sub_title, 'subTitle');
+$meta = $doc->meta;
+
+is($meta->{title}, 'Tradition und Moderne', 'title');
+ok(!$meta->{sub_title}, 'subTitle');
is($doc->text_sigle, 'A00_JAN.02873', 'ID');
is($doc->corpus_sigle, 'A00', 'corpusID');
-is($doc->pub_date, '20000113', 'pubDate');
-ok(!$doc->pub_place, 'pubPlace');
-is($doc->text_class->[0], 'kultur', 'TextClass');
-is($doc->text_class->[1], 'film', 'TextClass');
-ok(!$doc->text_class->[2], 'TextClass');
-ok(!$doc->author, 'author');
+is($meta->{pub_date}, '20000113', 'pubDate');
+ok(!$meta->{pub_place}, 'pubPlace');
+is($meta->{text_class}->[0], 'kultur', 'TextClass');
+is($meta->{text_class}->[1], 'film', 'TextClass');
+ok(!$meta->{text_class}->[2], 'TextClass');
+ok(!$meta->{author}, 'author');
# Additional information
-ok(!$doc->editor, 'Editor');
-ok(!$doc->publisher, 'Publisher');
-is($doc->creation_date, "20000113", 'Creation date');
+ok(!$meta->{editor}, 'Editor');
+ok(!$meta->{publisher}, 'Publisher');
+is($meta->{creation_date}, "20000113", 'Creation date');
#ok(!$doc->coll_title, 'Collection title');
#ok(!$doc->coll_sub_title, 'Collection subtitle');
#ok(!$doc->coll_editor, 'Collection editor');
#ok(!$doc->coll_author, 'Collection author');
-ok(!$doc->text_type, 'text_type');
-is($doc->text_type_art, 'Bericht', 'text_type art');
+ok(!$meta->{text_type}, 'text_type');
+is($meta->{text_type_art}, 'Bericht', 'text_type art');
# A01/05663-unbalanced
$path = catdir(dirname(__FILE__), 'corpus/A00/05663-unbalanced');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
-
ok($doc->parse, 'Parse document');
-is($doc->title, 'Mehr Arbeitslose im Dezember', 'title');
-ok(!$doc->sub_title, 'subTitle');
+$meta = $doc->meta;
+
+is($meta->{title}, 'Mehr Arbeitslose im Dezember', 'title');
+ok(!$meta->{sub_title}, 'subTitle');
is($doc->text_sigle, 'A00_JAN.05663', 'ID');
is($doc->corpus_sigle, 'A00', 'corpusID');
-is($doc->pub_date, '20000124', 'pubDate');
-ok(!$doc->pub_place, 'pubPlace');
-is($doc->text_class->[0], 'gesundheit-ernaehrung', 'TextClass');
-is($doc->text_class->[1], 'gesundheit', 'TextClass');
-ok(!$doc->text_class->[2], 'TextClass');
-ok(!$doc->author, 'author');
+is($meta->{pub_date}, '20000124', 'pubDate');
+ok(!$meta->{pub_place}, 'pubPlace');
+is($meta->{text_class}->[0], 'gesundheit-ernaehrung', 'TextClass');
+is($meta->{text_class}->[1], 'gesundheit', 'TextClass');
+ok(!$meta->{text_class}->[2], 'TextClass');
+ok(!$meta->{author}, 'author');
# Additional information
-ok(!$doc->editor, 'Editor');
-ok(!$doc->publisher, 'Publisher');
-is($doc->creation_date, "20000124", 'Creation date');
+ok(!$meta->{editor}, 'Editor');
+ok(!$meta->{publisher}, 'Publisher');
+is($meta->{creation_date}, "20000124", 'Creation date');
#ok(!$doc->coll_title, 'Collection title');
#ok(!$doc->coll_sub_title, 'Collection subtitle');
#ok(!$doc->coll_editor, 'Collection editor');
#ok(!$doc->coll_author, 'Collection author');
-ok(!$doc->text_type, 'text_type');
-is($doc->text_type_art, 'Bericht', 'text_type art');
-
+ok(!$meta->{text_type}, 'text_type');
+is($meta->{text_type_art}, 'Bericht', 'text_type art');
# A01/07452-deep
$path = catdir(dirname(__FILE__), 'corpus/A00/07452-deep');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
-
ok($doc->parse, 'Parse document');
-is($doc->title, 'Wil im Dezember 1999', 'title');
-ok(!$doc->sub_title, 'subTitle');
+$meta = $doc->meta;
+
+is($meta->{title}, 'Wil im Dezember 1999', 'title');
+ok(!$meta->{sub_title}, 'subTitle');
is($doc->text_sigle, 'A00_JAN.07452', 'ID');
is($doc->corpus_sigle, 'A00', 'corpusID');
-is($doc->pub_date, '20000129', 'pubDate');
-ok(!$doc->pub_place, 'pubPlace');
-is($doc->text_class->[0], 'politik', 'TextClass');
-is($doc->text_class->[1], 'kommunalpolitik', 'TextClass');
-ok(!$doc->text_class->[2], 'TextClass');
-ok(!$doc->author, 'author');
+is($meta->{pub_date}, '20000129', 'pubDate');
+ok(!$meta->{pub_place}, 'pubPlace');
+is($meta->{text_class}->[0], 'politik', 'TextClass');
+is($meta->{text_class}->[1], 'kommunalpolitik', 'TextClass');
+ok(!$meta->{text_class}->[2], 'TextClass');
+ok(!$meta->{author}, 'author');
# Additional information
-ok(!$doc->editor, 'Editor');
-ok(!$doc->publisher, 'Publisher');
-is($doc->creation_date, "20000129", 'Creation date');
+ok(!$meta->{editor}, 'Editor');
+ok(!$meta->{publisher}, 'Publisher');
+is($meta->{creation_date}, "20000129", 'Creation date');
#ok(!$doc->coll_title, 'Collection title');
#ok(!$doc->coll_sub_title, 'Collection subtitle');
#ok(!$doc->coll_editor, 'Collection editor');
#ok(!$doc->coll_author, 'Collection author');
-ok(!$doc->text_type, 'text_type');
-is($doc->text_type_art, 'Bericht', 'text_type art');
+ok(!$meta->{text_type}, 'text_type');
+is($meta->{text_type_art}, 'Bericht', 'text_type art');
# ART
$path = catdir(dirname(__FILE__), 'corpus/artificial');
@@ -240,33 +250,35 @@
#is($doc->path, $path . '/', 'Path');
ok($doc->parse, 'Parse document');
+$meta = $doc->meta;
# Metdata
-is($doc->title, 'Artificial Title', 'title');
-is($doc->sub_title, 'Artificial Subtitle', 'subTitle');
+is($meta->{title}, 'Artificial Title', 'title');
+is($meta->{sub_title}, 'Artificial Subtitle', 'subTitle');
is($doc->text_sigle, 'ART_ABC.00001', 'ID');
is($doc->corpus_sigle, 'ART', 'corpusID');
-is($doc->pub_date, '20010402', 'pubDate');
-is($doc->pub_place, 'Mannheim', 'pubPlace');
-is($doc->pub_place_key, 'DE', 'pubPlace key');
-is($doc->text_class->[0], 'freizeit-unterhaltung', 'TextClass');
-is($doc->text_class->[1], 'vereine-veranstaltungen', 'TextClass');
-ok(!$doc->text_class->[2], 'TextClass');
+is($meta->{pub_date}, '20010402', 'pubDate');
+is($meta->{pub_place}, 'Mannheim', 'pubPlace');
+is($meta->{pub_place_key}, 'DE', 'pubPlace key');
+is($meta->{text_class}->[0], 'freizeit-unterhaltung', 'TextClass');
+is($meta->{text_class}->[1], 'vereine-veranstaltungen', 'TextClass');
+ok(!$meta->{text_class}->[2], 'TextClass');
#is($doc->author->[0], 'Ruru', 'author');
#is($doc->author->[1], 'Jens.Ol', 'author');
#is($doc->author->[2], 'Aglarech', 'author');
-is($doc->author, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author');
+is($meta->{author}, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author');
# Additional information
-is($doc->editor, 'Nils Diewald', 'Editor');
-is($doc->publisher, 'Artificial articles Inc.', 'Publisher');
-is($doc->creation_date, '19990601', 'Creation date');
+is($meta->{editor}, 'Nils Diewald', 'Editor');
+is($meta->{publisher}, 'Artificial articles Inc.', 'Publisher');
+is($meta->{creation_date}, '19990601', 'Creation date');
#is($doc->coll_title, 'Artificial articles', 'Collection title');
#is($doc->coll_sub_title, 'Best of!', 'Collection subtitle');
#is($doc->coll_editor, 'Nils Diewald', 'Collection editor');
#is($doc->coll_author, 'Nils Diewald', 'Collection author');
-is($doc->text_type, 'Zeitung: Tageszeitung', 'No text_type');
-is($doc->text_type_art, 'Bericht', 'text_type art');
+is($meta->{text_type}, 'Zeitung: Tageszeitung', 'text_type');
+is($meta->{text_type_art}, 'Bericht', 'text_type art');
+
# Multipath headers
$path = catdir(dirname(__FILE__), 'corpus/VDI/JAN/00001');
@@ -277,99 +289,99 @@
like($doc->path, qr!$path/$!, 'Path');
ok($doc->parse, 'Parse document');
-
+$meta = $doc->meta;
is($doc->text_sigle, 'VDI14_JAN.00001', 'text sigle');
is($doc->doc_sigle, 'VDI14_JAN', 'doc sigle');
-is($doc->corpus_sigle, 'VDI14', 'corpus sigle');
+is($meta->corpus_sigle, 'VDI14', 'corpus sigle');
-is($doc->title, '10- Zz mit Zahl', 'title');
+is($meta->{title}, '10- Zz mit Zahl', 'title');
-ok(!$doc->sub_title, 'subtitle');
-is($doc->pub_date, '20140117', 'pubdate');
-is($doc->pub_place, 'Düsseldorf', 'pubplace');
-is($doc->author, 'Windhövel, Kerstin', 'author');
-is($doc->publisher, 'VDI Verlag GmbH', 'publisher');
-ok(!$doc->editor, 'editor');
+ok(!$meta->{sub_title}, 'subtitle');
+is($meta->{pub_date}, '20140117', 'pubdate');
+is($meta->{pub_place}, 'Düsseldorf', 'pubplace');
+is($meta->{author}, 'Windhövel, Kerstin', 'author');
+is($meta->{publisher}, 'VDI Verlag GmbH', 'publisher');
+ok(!$meta->{editor}, 'editor');
-ok(!$doc->text_type, 'text type');
-ok(!$doc->text_type_art, 'text type art');
-ok(!$doc->text_type_ref, 'text type ref');
-ok(!$doc->text_column, 'text column');
-ok(!$doc->text_domain, 'text domain');
-ok(!$doc->creation_date, 'creation date');
-ok(!$doc->license, 'License');
-ok(!$doc->pages, 'Pages');
-ok(!$doc->file_edition_statement, 'file edition statement');
-ok(!$doc->bibl_edition_statement, 'bibl edition statement');
-is($doc->reference, 'VDI nachrichten, 17.01.2014, S. 10; 10- Zz mit Zahl [Ausführliche Zitierung nicht verfügbar]', 'Reference');
+ok(!$meta->{text_type}, 'text type');
+ok(!$meta->{text_type_art}, 'text type art');
+ok(!$meta->{text_type_ref}, 'text type ref');
+ok(!$meta->{text_column}, 'text column');
+ok(!$meta->{text_domain}, 'text domain');
+ok(!$meta->{creation_date}, 'creation date');
+ok(!$meta->{license}, 'License');
+ok(!$meta->{pages}, 'Pages');
+ok(!$meta->{file_edition_statement}, 'file edition statement');
+ok(!$meta->{bibl_edition_statement}, 'bibl edition statement');
+is($meta->{reference}, 'VDI nachrichten, 17.01.2014, S. 10; 10- Zz mit Zahl [Ausführliche Zitierung nicht verfügbar]', 'Reference');
-ok(!$doc->language, 'Language');
+ok(!$meta->{language}, 'Language');
# !!!
# diag 'This may be "de" in the future';
-is($doc->doc_title, 'VDI nachrichten, Januar 2014', 'Doc title');
-ok(!$doc->doc_sub_title, 'Doc Sub title');
-ok(!$doc->doc_editor, 'Doc editor');
-ok(!$doc->doc_author, 'Doc author');
+is($meta->{doc_title}, 'VDI nachrichten, Januar 2014', 'Doc title');
+ok(!$meta->{doc_sub_title}, 'Doc Sub title');
+ok(!$meta->{doc_editor}, 'Doc editor');
+ok(!$meta->{doc_author}, 'Doc author');
-is($doc->corpus_title, 'VDI nachrichten', 'Corpus title');
-ok(!$doc->corpus_sub_title, 'Corpus Sub title');
-is($doc->corpus_editor, 'Verein Deutscher Ingenieure', 'Corpus editor');
-ok(!$doc->corpus_author, 'Corpus author');
+is($meta->{corpus_title}, 'VDI nachrichten', 'Corpus title');
+ok(!$meta->{corpus_sub_title}, 'Corpus Sub title');
+is($meta->{corpus_editor}, 'Verein Deutscher Ingenieure', 'Corpus editor');
+ok(!$meta->{corpus_author}, 'Corpus author');
-is($doc->keywords_string, '', 'Keywords');
-is($doc->text_class_string, 'Freizeit-Unterhaltung Reisen Politik Ausland', 'Text class');
+is($meta->keywords('keywords'), '', 'Keywords');
+is($meta->keywords('text_class'), 'Freizeit-Unterhaltung Reisen Politik Ausland', 'Text class');
# WDD
$path = catdir(dirname(__FILE__), 'corpus/WDD/G27/38989');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
like($doc->path, qr!$path/!, 'Path');
ok($doc->parse, 'Parse document');
+$meta = $doc->meta;
is($doc->text_sigle, 'WDD11_G27.38989', 'text sigle');
is($doc->doc_sigle, 'WDD11_G27', 'doc sigle');
is($doc->corpus_sigle, 'WDD11', 'corpus sigle');
-is($doc->title, 'Diskussion:Gunter A. Pilz', 'title');
-ok(!$doc->sub_title, 'subtitle');
-is($doc->pub_date, '20111029', 'pubdate');
-is($doc->pub_place, 'URL:http://de.wikipedia.org', 'pubplace');
+is($meta->{title}, 'Diskussion:Gunter A. Pilz', 'title');
+ok(!$meta->{sub_title}, 'subtitle');
+is($meta->{pub_date}, '20111029', 'pubdate');
+is($meta->{pub_place}, 'URL:http://de.wikipedia.org', 'pubplace');
-is($doc->author, '€pa, u.a.', 'author');
-is($doc->publisher, 'Wikipedia', 'publisher');
-ok(!$doc->editor, 'editor');
+is($meta->{author}, '€pa, u.a.', 'author');
+is($meta->{publisher}, 'Wikipedia', 'publisher');
+ok(!$meta->{editor}, 'editor');
-is($doc->text_type, 'Diskussionen zu Enzyklopädie-Artikeln', 'text type');
-ok(!$doc->text_type_art, 'text type art');
-ok(!$doc->text_type_ref, 'text type ref');
-ok(!$doc->text_column, 'text column');
-ok(!$doc->text_domain, 'text domain');
+is($meta->{text_type}, 'Diskussionen zu Enzyklopädie-Artikeln', 'text type');
+ok(!$meta->{text_type_art}, 'text type art');
+ok(!$meta->{text_type_ref}, 'text type ref');
+ok(!$meta->{text_column}, 'text column');
+ok(!$meta->{text_domain}, 'text domain');
-is($doc->creation_date, '20070707', 'creation date');
-is($doc->license, 'CC-BY-SA', 'License');
-ok(!$doc->pages, 'Pages');
-ok(!$doc->file_edition_statement, 'file edition statement');
-ok(!$doc->bibl_edition_statement, 'bibl edition statement');
-is($doc->reference, 'Diskussion:Gunter A. Pilz, In: Wikipedia - URL:http://de.wikipedia.org/wiki/Diskussion:Gunter_A._Pilz: Wikipedia, 2007', 'Reference');
+is($meta->{creation_date}, '20070707', 'creation date');
+is($meta->{license}, 'CC-BY-SA', 'License');
+ok(!$meta->{pages}, 'Pages');
+ok(!$meta->{file_edition_statement}, 'file edition statement');
+ok(!$meta->{bibl_edition_statement}, 'bibl edition statement');
+is($meta->{reference}, 'Diskussion:Gunter A. Pilz, In: Wikipedia - URL:http://de.wikipedia.org/wiki/Diskussion:Gunter_A._Pilz: Wikipedia, 2007', 'Reference');
-is($doc->language, 'de', 'Language');
+is($meta->{language}, 'de', 'Language');
-is($doc->doc_title, 'Wikipedia, Diskussionen zu Artikeln mit Anfangsbuchstabe G, Teil 27', 'Doc title');
-ok(!$doc->doc_sub_title, 'Doc Sub title');
-ok(!$doc->doc_editor, 'Doc editor');
-ok(!$doc->doc_author, 'Doc author');
+is($meta->{doc_title}, 'Wikipedia, Diskussionen zu Artikeln mit Anfangsbuchstabe G, Teil 27', 'Doc title');
+ok(!$meta->{doc_sub_title}, 'Doc Sub title');
+ok(!$meta->{doc_editor}, 'Doc editor');
+ok(!$meta->{doc_author}, 'Doc author');
-is($doc->corpus_title, 'Wikipedia', 'Corpus title');
-ok(!$doc->corpus_sub_title, 'Corpus Sub title');
-is($doc->corpus_editor, 'wikipedia.org', 'Corpus editor');
-ok(!$doc->corpus_author, 'Corpus author');
+is($meta->{corpus_title}, 'Wikipedia', 'Corpus title');
+ok(!$meta->{corpus_sub_title}, 'Corpus Sub title');
+is($meta->{corpus_editor}, 'wikipedia.org', 'Corpus editor');
+ok(!$meta->{corpus_author}, 'Corpus author');
-is($doc->keywords_string, '', 'Keywords');
-is($doc->text_class_string, '', 'Text class');
+is($meta->keywords('keywords'), '', 'Keywords');
+is($meta->keywords('text_class'), '', 'Text class');
-
-is($doc->availability, 'CC-BY-SA', 'Availability');
+is($meta->{availability}, 'CC-BY-SA', 'Availability');
done_testing;
diff --git a/t/real/bzk.t b/t/real/bzk.t
index 196a420..e6ef8d5 100644
--- a/t/real/bzk.t
+++ b/t/real/bzk.t
@@ -25,45 +25,46 @@
is($doc->doc_sigle, 'BZK_D59', 'Correct document sigle');
is($doc->corpus_sigle, 'BZK', 'Correct corpus sigle');
-is($doc->title, 'Unser gemeinsames Werk wird siegreich sein', 'Title');
-ok(!$doc->sub_title, 'No SubTitle');
-ok(!$doc->author, 'Author');
-ok(!$doc->editor, 'Editor');
-is($doc->pub_place, 'Berlin', 'PubPlace');
-ok(!$doc->publisher, 'Publisher');
+my $meta = $doc->meta;
+is($meta->{title}, 'Unser gemeinsames Werk wird siegreich sein', 'Title');
+ok(!$meta->{sub_title}, 'No SubTitle');
+ok(!$meta->{author}, 'Author');
+ok(!$meta->{editor}, 'Editor');
+is($meta->{pub_place}, 'Berlin', 'PubPlace');
+ok(!$meta->{publisher}, 'Publisher');
-is($doc->text_type, 'Zeitung: Tageszeitung', 'Correct Text Type');
+is($meta->{text_type}, 'Zeitung: Tageszeitung', 'Correct Text Type');
-ok(!$doc->text_type_art, 'Correct Text Type Art');
-is($doc->text_type_ref, 'Tageszeitung', 'Correct Text Type Ref');
-is($doc->text_domain, 'Politik', 'Correct Text Domain');
-is($doc->text_column, 'POLITIK', 'Correct Text Column');
-is($doc->text_class->[0], 'politik', 'Correct Text Class');
-is($doc->text_class->[1], 'ausland', 'Correct Text Class');
-ok(!$doc->text_class->[2], 'Correct Text Class');
+ok(!$meta->{text_type_art}, 'Correct Text Type Art');
+is($meta->{text_type_ref}, 'Tageszeitung', 'Correct Text Type Ref');
+is($meta->{text_domain}, 'Politik', 'Correct Text Domain');
+is($meta->{text_column}, 'POLITIK', 'Correct Text Column');
+is($meta->{text_class}->[0], 'politik', 'Correct Text Class');
+is($meta->{text_class}->[1], 'ausland', 'Correct Text Class');
+ok(!$meta->{text_class}->[2], 'Correct Text Class');
-is($doc->pub_date, '19590101', 'Creation date');
-is($doc->creation_date, '19590101', 'Creation date');
-is($doc->license, 'ACA-NC-LC', 'License');
-ok(!$doc->pages, 'Pages');
+is($meta->{pub_date}, '19590101', 'Publication date');
+is($meta->{creation_date}, '19590101', 'Creation date');
+is($meta->{license}, 'ACA-NC-LC', 'License');
+ok(!$meta->{pages}, 'Pages');
-ok(!$doc->file_edition_statement, 'File Statement');
-ok(!$doc->bibl_edition_statement, 'Bibl Statement');
+ok(!$meta->{file_edition_statement}, 'File Statement');
+ok(!$meta->{bibl_edition_statement}, 'Bibl Statement');
-is($doc->reference . "\n", <<'REF', 'Reference');
+is($meta->{reference} . "\n", <<'REF', 'Reference');
Neues Deutschland, [Tageszeitung], 01.01.1959, Jg. 14, Berliner Ausgabe, S. 1. - Sachgebiet: Politik, Originalressort: POLITIK; Unser gemeinsames Werk wird siegreich sein
REF
-is($doc->language, 'de', 'Language');
+is($meta->{language}, 'de', 'Language');
-is($doc->corpus_title, 'Bonner Zeitungskorpus', 'Correct Corpus title');
-ok(!$doc->corpus_sub_title, 'Correct Corpus sub title');
-ok(!$doc->corpus_author, 'Correct Corpus author');
-ok(!$doc->corpus_editor, 'Correct Corpus editor');
+is($meta->{corpus_title}, 'Bonner Zeitungskorpus', 'Correct Corpus title');
+ok(!$meta->{corpus_sub_title}, 'Correct Corpus sub title');
+ok(!$meta->{corpus_author}, 'Correct Corpus author');
+ok(!$meta->{corpus_editor}, 'Correct Corpus editor');
-is($doc->doc_title, 'Neues Deutschland', 'Correct Doc title');
-is($doc->doc_sub_title, 'Organ des Zentralkomitees der Sozialistischen Einheitspartei Deutschlands', 'Correct Doc sub title');
-ok(!$doc->doc_author, 'Correct Doc author');
-ok(!$doc->doc_editor, 'Correct doc editor');
+is($meta->{doc_title}, 'Neues Deutschland', 'Correct Doc title');
+is($meta->{doc_sub_title}, 'Organ des Zentralkomitees der Sozialistischen Einheitspartei Deutschlands', 'Correct Doc sub title');
+ok(!$meta->{doc_author}, 'Correct Doc author');
+ok(!$meta->{doc_editor}, 'Correct doc editor');
# Tokenization
use_ok('KorAP::XML::Tokenizer');
diff --git a/t/real/bzk_2.t b/t/real/bzk_2.t
index 4b958a2..dba97cc 100644
--- a/t/real/bzk_2.t
+++ b/t/real/bzk_2.t
@@ -25,46 +25,47 @@
is($doc->doc_sigle, 'BZK_D59', 'Correct document sigle');
is($doc->corpus_sigle, 'BZK', 'Correct corpus sigle');
-is($doc->title, 'Saragat-Partei zerfällt', 'Title');
-ok(!$doc->sub_title, 'No SubTitle');
-ok(!$doc->author, 'Author');
-ok(!$doc->editor, 'Editor');
-is($doc->pub_place, 'Berlin', 'PubPlace');
-is($doc->pub_date, '19590219', 'PubDate');
-ok(!$doc->publisher, 'Publisher');
+my $meta = $doc->meta;
+is($meta->{title}, 'Saragat-Partei zerfällt', 'Title');
+ok(!$meta->{sub_title}, 'No SubTitle');
+ok(!$meta->{author}, 'Author');
+ok(!$meta->{editor}, 'Editor');
+is($meta->{pub_place}, 'Berlin', 'PubPlace');
+is($meta->{pub_date}, '19590219', 'PubDate');
+ok(!$meta->{publisher}, 'Publisher');
-is($doc->text_type, 'Zeitung: Tageszeitung', 'Correct Text Type');
+is($meta->{text_type}, 'Zeitung: Tageszeitung', 'Correct Text Type');
-ok(!$doc->text_type_art, 'Correct Text Type Art');
-is($doc->text_type_ref, 'Tageszeitung', 'Correct Text Type Ref');
-is($doc->text_domain, 'Politik', 'Correct Text Domain');
-is($doc->text_column, 'POLITIK', 'Correct Text Column');
-is($doc->text_class->[0], 'politik', 'Correct Text Class');
-is($doc->text_class->[1], 'ausland', 'Correct Text Class');
-ok(!$doc->text_class->[2], 'Correct Text Class');
+ok(!$meta->{text_type_art}, 'Correct Text Type Art');
+is($meta->{text_type_ref}, 'Tageszeitung', 'Correct Text Type Ref');
+is($meta->{text_domain}, 'Politik', 'Correct Text Domain');
+is($meta->{text_column}, 'POLITIK', 'Correct Text Column');
+is($meta->{text_class}->[0], 'politik', 'Correct Text Class');
+is($meta->{text_class}->[1], 'ausland', 'Correct Text Class');
+ok(!$meta->{text_class}->[2], 'Correct Text Class');
-is($doc->creation_date, '19590219', 'Creation date');
-is($doc->license, 'ACA-NC-LC', 'License');
-ok(!$doc->pages, 'Pages');
+is($meta->{creation_date}, '19590219', 'Creation date');
+is($meta->{license}, 'ACA-NC-LC', 'License');
+ok(!$meta->{pages}, 'Pages');
-ok(!$doc->file_edition_statement, 'File Statement');
-ok(!$doc->bibl_edition_statement, 'Bibl Statement');
+ok(!$meta->{file_edition_statement}, 'File Statement');
+ok(!$meta->{bibl_edition_statement}, 'Bibl Statement');
-is($doc->reference . "\n", <<'REF', 'Reference');
+is($meta->{reference} . "\n", <<'REF', 'Reference');
Neues Deutschland, [Tageszeitung], 19.02.1959, Jg. 14, Berliner Ausgabe, S. 7. - Sachgebiet: Politik, Originalressort: POLITIK; Saragat-Partei zerfällt
REF
-is($doc->language, 'de', 'Language');
+is($meta->{language}, 'de', 'Language');
-is($doc->corpus_title, 'Bonner Zeitungskorpus', 'Correct Corpus title');
-ok(!$doc->corpus_sub_title, 'Correct Corpus sub title');
-ok(!$doc->corpus_author, 'Correct Corpus author');
-ok(!$doc->corpus_editor, 'Correct Corpus editor');
+is($meta->{corpus_title}, 'Bonner Zeitungskorpus', 'Correct Corpus title');
+ok(!$meta->{corpus_sub_title}, 'Correct Corpus sub title');
+ok(!$meta->{corpus_author}, 'Correct Corpus author');
+ok(!$meta->{corpus_editor}, 'Correct Corpus editor');
-is($doc->doc_title, 'Neues Deutschland', 'Correct Doc title');
-is($doc->doc_sub_title, 'Organ des Zentralkomitees der Sozialistischen Einheitspartei Deutschlands', 'Correct Doc sub title');
-ok(!$doc->doc_author, 'Correct Doc author');
-ok(!$doc->doc_editor, 'Correct doc editor');
+is($meta->{doc_title}, 'Neues Deutschland', 'Correct Doc title');
+is($meta->{doc_sub_title}, 'Organ des Zentralkomitees der Sozialistischen Einheitspartei Deutschlands', 'Correct Doc sub title');
+ok(!$meta->{doc_author}, 'Correct Doc author');
+ok(!$meta->{doc_editor}, 'Correct doc editor');
# Tokenization
use_ok('KorAP::XML::Tokenizer');
diff --git a/t/real/goethe.t b/t/real/goethe.t
index 770f70e..204e217 100644
--- a/t/real/goethe.t
+++ b/t/real/goethe.t
@@ -27,40 +27,41 @@
is($doc->doc_sigle, 'GOE_AGA', 'Correct document sigle');
is($doc->corpus_sigle, 'GOE', 'Correct corpus sigle');
-is($doc->title, 'Autobiographische Einzelheiten', 'Title');
-is($doc->pub_place, 'München', 'PubPlace');
-is($doc->pub_date, '19820000', 'Creation Date');
-ok(!$doc->sub_title, 'SubTitle');
-is($doc->author, 'Goethe, Johann Wolfgang von', 'Author');
+my $meta = $doc->meta;
+is($meta->{title}, 'Autobiographische Einzelheiten', 'Title');
+is($meta->{pub_place}, 'München', 'PubPlace');
+is($meta->{pub_date}, '19820000', 'Publication Date');
+ok(!$meta->{sub_title}, 'SubTitle');
+is($meta->{author}, 'Goethe, Johann Wolfgang von', 'Author');
-is($doc->publisher, 'Verlag C. H. Beck', 'Publisher');
-ok(!$doc->editor, 'Publisher');
-is($doc->text_type, 'Autobiographie', 'Correct Text Type');
-ok(!$doc->text_type_art, 'Correct Text Type Art');
-ok(!$doc->text_type_ref, 'Correct Text Type Ref');
-ok(!$doc->text_column, 'Correct Text Column');
-ok(!$doc->text_domain, 'Correct Text Domain');
-is($doc->creation_date, '18200000', 'Creation Date');
-is($doc->license, 'QAO-NC', 'License');
-is($doc->pages, '529-547', 'Pages');
-ok(!$doc->file_edition_statement, 'File Ed Statement');
-ok(!$doc->bibl_edition_statement, 'Bibl Ed Statement');
-is($doc->reference . "\n", <<'REF', 'Author');
+is($meta->{publisher}, 'Verlag C. H. Beck', 'Publisher');
+ok(!$meta->{editor}, 'Editor');
+is($meta->{text_type}, 'Autobiographie', 'Correct Text Type');
+ok(!$meta->{text_type_art}, 'Correct Text Type Art');
+ok(!$meta->{text_type_ref}, 'Correct Text Type Ref');
+ok(!$meta->{text_column}, 'Correct Text Column');
+ok(!$meta->{text_domain}, 'Correct Text Domain');
+is($meta->{creation_date}, '18200000', 'Creation Date');
+is($meta->{license}, 'QAO-NC', 'License');
+is($meta->{pages}, '529-547', 'Pages');
+ok(!$meta->{file_edition_statement}, 'File Ed Statement');
+ok(!$meta->{bibl_edition_statement}, 'Bibl Ed Statement');
+is($meta->{reference} . "\n", <<'REF', 'Reference');
Goethe, Johann Wolfgang von: Autobiographische Einzelheiten, (Geschrieben bis 1832), In: Goethe, Johann Wolfgang von: Goethes Werke, Bd. 10, Autobiographische Schriften II, Hrsg.: Trunz, Erich. München: Verlag C. H. Beck, 1982, S. 529-547
REF
-is($doc->language, 'de', 'Language');
+is($meta->{language}, 'de', 'Language');
-is($doc->corpus_title, 'Goethes Werke', 'Correct Corpus title');
-ok(!$doc->corpus_sub_title, 'Correct Corpus Sub title');
-is($doc->corpus_author, 'Goethe, Johann Wolfgang von', 'Correct Corpus author');
-is($doc->corpus_editor, 'Trunz, Erich', 'Correct Corpus editor');
+is($meta->{corpus_title}, 'Goethes Werke', 'Correct Corpus title');
+ok(!$meta->{corpus_sub_title}, 'Correct Corpus Sub title');
+is($meta->{corpus_author}, 'Goethe, Johann Wolfgang von', 'Correct Corpus author');
+is($meta->{corpus_editor}, 'Trunz, Erich', 'Correct Corpus editor');
-is($doc->doc_title, 'Goethe: Autobiographische Schriften II, (1817-1825, 1832)',
+is($meta->{doc_title}, 'Goethe: Autobiographische Schriften II, (1817-1825, 1832)',
'Correct Doc title');
-ok(!$doc->doc_sub_title, 'Correct Doc Sub title');
-ok(!$doc->doc_author, 'Correct Doc author');
-ok(!$doc->doc_editor, 'Correct Doc editor');
+ok(!$meta->{doc_sub_title}, 'Correct Doc Sub title');
+ok(!$meta->{doc_author}, 'Correct Doc author');
+ok(!$meta->{doc_editor}, 'Correct Doc editor');
# Tokenization
use_ok('KorAP::XML::Tokenizer');
diff --git a/t/real/wdd.t b/t/real/wdd.t
index 41e8e60..9d867f9 100644
--- a/t/real/wdd.t
+++ b/t/real/wdd.t
@@ -26,36 +26,37 @@
is($doc->doc_sigle, 'WDD11_G27', 'Correct document sigle');
is($doc->corpus_sigle, 'WDD11', 'Correct corpus sigle');
-is($doc->title, 'Diskussion:Gunter A. Pilz', 'Title');
-ok(!$doc->sub_title, 'No SubTitle');
-is($doc->author, '€pa, u.a.', 'Author');
-ok(!$doc->editor, 'Publisher');
+my $meta = $doc->meta;
+is($meta->{title}, 'Diskussion:Gunter A. Pilz', 'Title');
+ok(!$meta->{sub_title}, 'No SubTitle');
+is($meta->{author}, '€pa, u.a.', 'Author');
+ok(!$meta->{editor}, 'Editor');
-is($doc->pub_place, 'URL:http://de.wikipedia.org', 'PubPlace');
-is($doc->publisher, 'Wikipedia', 'Publisher');
-is($doc->text_type, 'Diskussionen zu Enzyklopädie-Artikeln', 'Correct Text Type');
-ok(!$doc->text_type_art, 'Correct Text Type Art');
-ok(!$doc->text_type_ref, 'Correct Text Type Ref');
-ok(!$doc->text_domain, 'Correct Text Domain');
-is($doc->creation_date, '20070707', 'Creation date');
-is($doc->license, 'CC-BY-SA', 'License');
-ok(!$doc->pages, 'Pages');
-ok(!$doc->file_edition_statement, 'File Statement');
-ok(!$doc->bibl_edition_statement, 'Bibl Statement');
-is($doc->reference . "\n", <<'REF', 'Reference');
+is($meta->{pub_place}, 'URL:http://de.wikipedia.org', 'PubPlace');
+is($meta->{publisher}, 'Wikipedia', 'Publisher');
+is($meta->{text_type}, 'Diskussionen zu Enzyklopädie-Artikeln', 'Correct Text Type');
+ok(!$meta->{text_type_art}, 'Correct Text Type Art');
+ok(!$meta->{text_type_ref}, 'Correct Text Type Ref');
+ok(!$meta->{text_domain}, 'Correct Text Domain');
+is($meta->{creation_date}, '20070707', 'Creation date');
+is($meta->{license}, 'CC-BY-SA', 'License');
+ok(!$meta->{pages}, 'Pages');
+ok(!$meta->{file_edition_statement}, 'File Statement');
+ok(!$meta->{bibl_edition_statement}, 'Bibl Statement');
+is($meta->{reference} . "\n", <<'REF', 'Reference');
Diskussion:Gunter A. Pilz, In: Wikipedia - URL:http://de.wikipedia.org/wiki/Diskussion:Gunter_A._Pilz: Wikipedia, 2007
REF
-is($doc->language, 'de', 'Language');
+is($meta->{language}, 'de', 'Language');
-is($doc->corpus_title, 'Wikipedia', 'Correct Corpus title');
-ok(!$doc->corpus_sub_title, 'Correct Corpus sub title');
-ok(!$doc->corpus_author, 'Correct Corpus author');
-is($doc->corpus_editor, 'wikipedia.org', 'Correct Corpus editor');
+is($meta->{corpus_title}, 'Wikipedia', 'Correct Corpus title');
+ok(!$meta->{corpus_sub_title}, 'Correct Corpus sub title');
+ok(!$meta->{corpus_author}, 'Correct Corpus author');
+is($meta->{corpus_editor}, 'wikipedia.org', 'Correct Corpus editor');
-is($doc->doc_title, 'Wikipedia, Diskussionen zu Artikeln mit Anfangsbuchstabe G, Teil 27', 'Correct Doc title');
-ok(!$doc->doc_sub_title, 'Correct Doc sub title');
-ok(!$doc->doc_author, 'Correct Doc author');
-ok(!$doc->doc_editor, 'Correct doc editor');
+is($meta->{doc_title}, 'Wikipedia, Diskussionen zu Artikeln mit Anfangsbuchstabe G, Teil 27', 'Correct Doc title');
+ok(!$meta->{doc_sub_title}, 'Correct Doc sub title');
+ok(!$meta->{doc_author}, 'Correct Doc author');
+ok(!$meta->{doc_editor}, 'Correct doc editor');
# Tokenization
use_ok('KorAP::XML::Tokenizer');
diff --git a/t/real/wpd.t b/t/real/wpd.t
index 45ffd01..1cf1711 100644
--- a/t/real/wpd.t
+++ b/t/real/wpd.t
@@ -27,19 +27,20 @@
is($doc->doc_sigle, 'WPD_AAA', 'Correct document sigle');
is($doc->corpus_sigle, 'WPD', 'Correct corpus sigle');
-is($doc->title, 'A', 'Title');
-is($doc->pub_place, 'URL:http://de.wikipedia.org', 'PubPlace');
-is($doc->pub_date, '20050328', 'Creation Date');
-ok(!$doc->sub_title, 'SubTitle');
-is($doc->author, 'Ruru; Jens.Ol; Aglarech; u.a.', 'Author');
+my $meta = $doc->meta;
+is($meta->{title}, 'A', 'Title');
+is($meta->{pub_place}, 'URL:http://de.wikipedia.org', 'PubPlace');
+is($meta->{pub_date}, '20050328', 'Publication Date');
+ok(!$meta->{sub_title}, 'SubTitle');
+is($meta->{author}, 'Ruru; Jens.Ol; Aglarech; u.a.', 'Author');
-ok(!$doc->doc_title, 'Correct Doc title');
-ok(!$doc->doc_sub_title, 'Correct Doc Sub title');
-ok(!$doc->doc_author, 'Correct Doc author');
-ok(!$doc->doc_editor, 'Correct Doc editor');
+ok(!$meta->{doc_title}, 'Correct Doc title');
+ok(!$meta->{doc_sub_title}, 'Correct Doc Sub title');
+ok(!$meta->{doc_author}, 'Correct Doc author');
+ok(!$meta->{doc_editor}, 'Correct Doc editor');
-ok(!$doc->corpus_title, 'Correct Corpus title');
-ok(!$doc->corpus_sub_title, 'Correct Corpus Sub title');
+ok(!$meta->{corpus_title}, 'Correct Corpus title');
+ok(!$meta->{corpus_sub_title}, 'Correct Corpus Sub title');
# Tokenization
use_ok('KorAP::XML::Tokenizer');
diff --git a/t/sgbr/base.t b/t/sgbr/base.t
index 6eb7d62..718a0ba 100644
--- a/t/sgbr/base.t
+++ b/t/sgbr/base.t
@@ -14,7 +14,7 @@
path => $path . '/'
), 'Create Document');
-ok($doc->parse, 'Parse document');
+ok($doc->parse('Sgbr'), 'Parse document');
ok(my $tokens = KorAP::XML::Tokenizer->new(
path => $doc->path,
diff --git a/t/sgbr/meta.t b/t/sgbr/meta.t
index c5f9b60..a1c15c0 100644
--- a/t/sgbr/meta.t
+++ b/t/sgbr/meta.t
@@ -11,7 +11,8 @@
my $path = catdir(dirname(__FILE__), 'TEST', 'BSP', 1);
ok(my $doc = KorAP::XML::Krill->new(
- path => $path . '/'
+ path => $path . '/',
+ meta_type => 'Sgbr'
), 'Create Document');
ok($doc->parse, 'Parse document');
@@ -23,46 +24,48 @@
is($doc->doc_sigle, 'TEST_BSP', 'ID-doc');
is($doc->corpus_sigle, 'TEST', 'ID-corpus');
-is($doc->title, 'Sommerüberraschung', 'title');
-is($doc->author, 'TEST.BSP.Autoren.1', 'Author');
-is($doc->store('sgbrAuthorAgeClass'), 'X', 'AgeClass');
+my $meta = $doc->meta;
-is($doc->store('sgbrAuthorSex'), 'M', 'Sex');
-is($doc->store('sgbrKodex'), 'M', 'Kodex');
+is($meta->{title}, 'Sommerüberraschung', 'title');
+is($meta->{author}, 'TEST.BSP.Autoren.1', 'Author');
+is($meta->{'sgbr_author_age_class'}, 'X', 'AgeClass');
-is($doc->doc_title, 'Beispielkorpus', 'Doc: title');
-is($doc->doc_sub_title, 'Subkorpus Beispieltext', 'Doc: subtitle');
+is($meta->{'sgbr_author_sex'}, 'M', 'Sex');
+is($meta->{'sgbr_kodex'}, 'M', 'Kodex');
-is($doc->language, 'de', 'Language');
+is($meta->{doc_title}, 'Beispielkorpus', 'Doc: title');
+is($meta->{doc_sub_title}, 'Subkorpus Beispieltext', 'Doc: subtitle');
-ok(!$doc->publisher, 'Publisher');
-ok(!$doc->editor, 'Editor');
-ok(!$doc->text_type, 'Text Type');
-ok(!$doc->text_type_art, 'Text Type Art');
-ok(!$doc->text_type_ref, 'Text Type Ref');
-ok(!$doc->text_column, 'Text Column');
-ok(!$doc->text_domain, 'Text Domain');
-ok(!$doc->creation_date, 'Creation Date');
-ok(!$doc->license, 'License');
-ok(!$doc->pages, 'Pages');
-ok(!$doc->file_edition_statement, 'File Edition Statement');
-ok(!$doc->bibl_edition_statement, 'Bibl Edition Statement');
-ok(!$doc->reference, 'Reference');
+is($meta->{language}, 'de', 'Language');
-ok(!$doc->doc_editor, 'Doc: editor');
-ok(!$doc->doc_author, 'Doc: author');
+ok(!$meta->{publisher}, 'Publisher');
+ok(!$meta->{editor}, 'Editor');
+ok(!$meta->{text_type}, 'Text Type');
+ok(!$meta->{text_type_art}, 'Text Type Art');
+ok(!$meta->{text_type_ref}, 'Text Type Ref');
+ok(!$meta->{text_column}, 'Text Column');
+ok(!$meta->{text_domain}, 'Text Domain');
+ok(!$meta->{creation_date}, 'Creation Date');
+ok(!$meta->{license}, 'License');
+ok(!$meta->{pages}, 'Pages');
+ok(!$meta->{file_edition_statement}, 'File Edition Statement');
+ok(!$meta->{bibl_edition_statement}, 'Bibl Edition Statement');
+ok(!$meta->{reference}, 'Reference');
-ok(!$doc->corpus_title, 'Corpus: title');
-ok(!$doc->corpus_sub_title, 'Corpus: subtitle');
-ok(!$doc->corpus_editor, 'Corpus: editor');
-ok(!$doc->corpus_author, 'Corpus: author');
+ok(!$meta->{doc_editor}, 'Doc: editor');
+ok(!$meta->{doc_author}, 'Doc: author');
+
+ok(!$meta->{corpus_title}, 'Corpus: title');
+ok(!$meta->{corpus_sub_title}, 'Corpus: subtitle');
+ok(!$meta->{corpus_editor}, 'Corpus: editor');
+ok(!$meta->{corpus_author}, 'Corpus: author');
my $hash = $doc->to_hash;
is($hash->{title}, 'Sommerüberraschung', 'Corpus title');
-is($hash->{store}->{sgbrAuthorSex}, 'M', 'store');
+is($hash->{sgbrAuthorSex}, 'M', 'additional');
# Sgbr specific keywords
-is($doc->keywords_string, 'sgbrAuthorAgeClass:X sgbrAuthorSex:M sgbrKodex:M');
+is($meta->keywords('keywords'), 'sgbrAuthorAgeClass:X sgbrAuthorSex:M sgbrKodex:M');
done_testing;
diff --git a/t/sgbr/meta_duden.t b/t/sgbr/meta_duden.t
index 7cae3c5..c808022 100644
--- a/t/sgbr/meta_duden.t
+++ b/t/sgbr/meta_duden.t
@@ -11,7 +11,8 @@
my $path = catdir(dirname(__FILE__), 'PRO-DUD', 'BSP-2013-01', 32);
ok(my $doc = KorAP::XML::Krill->new(
- path => $path . '/'
+ path => $path . '/',
+ meta_type => 'Sgbr'
), 'Create Document');
ok($doc->parse, 'Parse document');
@@ -20,57 +21,57 @@
# Metdata
is($doc->text_sigle, 'PRO-DUD_BSP-2013-01.32', 'ID-text');
-
is($doc->doc_sigle, 'PRO-DUD_BSP-2013-01', 'ID-doc');
is($doc->corpus_sigle, 'PRO-DUD', 'ID-corpus');
-is($doc->title, 'Nur Platt, kein Deutsch', 'title');
-ok(!$doc->sub_title, 'no subtitle');
+my $meta = $doc->meta;
+is($meta->{title}, 'Nur Platt, kein Deutsch', 'title');
+ok(!$meta->{sub_title}, 'no subtitle');
-is($doc->publisher, 'Dorfblatt GmbH', 'Publisher');
-is($doc->pub_date, '20130126');
-is($doc->store('sgbrDate'), '2013-01-26');
-is($doc->pub_place, 'Stadtingen');
+is($meta->{publisher}, 'Dorfblatt GmbH', 'Publisher');
+is($meta->{pub_date}, '20130126', 'Pub date');
+is($meta->{sgbr_date}, '2013-01-26', 'Sgbr date');
+is($meta->{pub_place}, 'Stadtingen', 'Pub place');
-is($doc->doc_title, 'Korpus zur Beobachtung des Schreibgebrauchs im Deutschen', 'Doc title');
-is($doc->doc_sub_title, 'Subkorpus Ortsblatt, Jahrgang 2013, Monat Januar', 'Doc Sub title');
+is($meta->{doc_title}, 'Korpus zur Beobachtung des Schreibgebrauchs im Deutschen', 'Doc title');
+is($meta->{doc_sub_title}, 'Subkorpus Ortsblatt, Jahrgang 2013, Monat Januar', 'Doc Sub title');
-is($doc->store('funder'), 'Bundesministerium für Bildung und Forschung', 'Funder');
+is($meta->{'funder'}, 'Bundesministerium für Bildung und Forschung', 'Funder');
-is($doc->author, 'unbekannt', 'Author');
-ok(!$doc->store('sgbrAuthorSex'), 'No Sex');
-is($doc->store('sgbrKodex'), 'T', '');
+is($meta->{author}, 'unbekannt', 'Author');
+ok(!$meta->{'sgbr_author_sex'}, 'No Sex');
+is($meta->{'sgbr_kodex'}, 'T', 'Kodex');
-is($doc->keywords_string, 'sgbrKodex:T');
+is($meta->keywords('keywords'), 'sgbrKodex:T');
-is($doc->language, 'de', 'Language');
+is($meta->{language}, 'de', 'Language');
-ok(!$doc->editor, 'Editor');
+ok(!$meta->{editor}, 'Editor');
-ok(!$doc->text_type, 'Text Type');
-ok(!$doc->text_type_art, 'Text Type Art');
-ok(!$doc->text_type_ref, 'Text Type Ref');
-ok(!$doc->text_column, 'Text Column');
-ok(!$doc->text_domain, 'Text Domain');
-ok(!$doc->creation_date, 'Creation Date');
-ok(!$doc->license, 'License');
-ok(!$doc->pages, 'Pages');
-ok(!$doc->file_edition_statement, 'File Edition Statement');
-ok(!$doc->bibl_edition_statement, 'Bibl Edition Statement');
-ok(!$doc->reference, 'Reference');
+ok(!$meta->{text_type}, 'Text Type');
+ok(!$meta->{text_type_art}, 'Text Type Art');
+ok(!$meta->{text_type_ref}, 'Text Type Ref');
+ok(!$meta->{text_column}, 'Text Column');
+ok(!$meta->{text_domain}, 'Text Domain');
+ok(!$meta->{creation_date}, 'Creation Date');
+ok(!$meta->{license}, 'License');
+ok(!$meta->{pages}, 'Pages');
+ok(!$meta->{file_edition_statement}, 'File Edition Statement');
+ok(!$meta->{bibl_edition_statement}, 'Bibl Edition Statement');
+ok(!$meta->{reference}, 'Reference');
-ok(!$doc->doc_editor, 'Doc: editor');
-ok(!$doc->doc_author, 'Doc: author');
+ok(!$meta->{doc_editor}, 'Doc: editor');
+ok(!$meta->{doc_author}, 'Doc: author');
-ok(!$doc->corpus_title, 'Corpus: title');
-ok(!$doc->corpus_sub_title, 'Corpus: subtitle');
-ok(!$doc->corpus_editor, 'Corpus: editor');
-ok(!$doc->corpus_author, 'Corpus: author');
+ok(!$meta->{corpus_title}, 'Corpus: title');
+ok(!$meta->{corpus_sub_title}, 'Corpus: subtitle');
+ok(!$meta->{corpus_editor}, 'Corpus: editor');
+ok(!$meta->{corpus_author}, 'Corpus: author');
my $hash = $doc->to_hash;
is($hash->{title}, 'Nur Platt, kein Deutsch', 'Corpus title');
-is($hash->{store}->{sgbrKodex}, 'T', 'store');
+is($hash->{sgbrKodex}, 'T', 'additional');
done_testing;
diff --git a/t/sgbr/meta_ids.t b/t/sgbr/meta_ids.t
index 4128f5d..b040e09 100644
--- a/t/sgbr/meta_ids.t
+++ b/t/sgbr/meta_ids.t
@@ -11,7 +11,8 @@
my $path = catdir(dirname(__FILE__), 'CMC-TSK', '2014-09', '2843');
ok(my $doc = KorAP::XML::Krill->new(
- path => $path . '/'
+ path => $path . '/',
+ meta_type => 'Sgbr'
), 'Create Document');
ok($doc->parse, 'Parse document');
@@ -24,61 +25,62 @@
is($doc->doc_sigle, 'CMC-TSK_2014-09', 'ID-doc');
is($doc->corpus_sigle, 'CMC-TSK', 'ID-corpus');
-is($doc->title, '@ Koelle_am_Rhing 10:18', 'title');
+my $meta = $doc->meta;
-ok(!$doc->sub_title, 'no subtitle');
+is($meta->{title}, '@ Koelle_am_Rhing 10:18', 'title');
-is($doc->publisher, 'tagesschau.de', 'Publisher');
+ok(!$meta->{sub_title}, 'no subtitle');
-is($doc->pub_date, '20140930');
+is($meta->{publisher}, 'tagesschau.de', 'Publisher');
-ok(!$doc->pub_place, 'No pub place');
+is($meta->{pub_date}, '20140930');
-is($doc->doc_title, 'Korpus zur Beobachtung des Schreibgebrauchs im Deutschen', 'Doc title');
-is($doc->doc_sub_title, 'Subkorpus Internettexte, Subkorpus Leserkommentare Tagesschau, Subkorpus September 2014, Subkorpus Beispielauszug', 'Doc Sub title');
+ok(!$meta->{pub_place}, 'No pub place');
-is($doc->store('funder'), 'Bundesministerium für Bildung und Forschung', 'Funder');
+is($meta->{doc_title}, 'Korpus zur Beobachtung des Schreibgebrauchs im Deutschen', 'Doc title');
+is($meta->{doc_sub_title}, 'Subkorpus Internettexte, Subkorpus Leserkommentare Tagesschau, Subkorpus September 2014, Subkorpus Beispielauszug', 'Doc Sub title');
-is($doc->author, 'privat23', 'Author');
-ok(!$doc->store('sgbrAuthorSex'), 'No Sex');
-ok(!$doc->store('sgbrKodex'), 'No kodex');
-is($doc->reference, 'http://meta.tagesschau.de/node/090285#comment-1732187', 'Publace ref');
+is($meta->{'funder'}, 'Bundesministerium für Bildung und Forschung', 'Funder');
-is($doc->keywords_string, '');
+is($meta->{author}, 'privat23', 'Author');
+ok(!$meta->{'sgbr_author_sex'}, 'No Sex');
+ok(!$meta->{'sgbr_kodex'}, 'No kodex');
+is($meta->{reference}, 'http://meta.tagesschau.de/node/090285#comment-1732187', 'PubPlace ref');
-is($doc->language, 'de', 'Language');
+is($meta->keywords('keywords'), '');
-ok(!$doc->editor, 'Editor');
+is($meta->{language}, 'de', 'Language');
-ok(!$doc->text_type, 'Text Type');
-ok(!$doc->text_type_art, 'Text Type Art');
-ok(!$doc->text_type_ref, 'Text Type Ref');
-ok(!$doc->text_column, 'Text Column');
-ok(!$doc->text_domain, 'Text Domain');
-ok(!$doc->creation_date, 'Creation Date');
-ok(!$doc->license, 'License');
-ok(!$doc->pages, 'Pages');
-ok(!$doc->file_edition_statement, 'File Edition Statement');
-ok(!$doc->bibl_edition_statement, 'Bibl Edition Statement');
+ok(!$meta->{editor}, 'Editor');
-ok(!$doc->doc_editor, 'Doc: editor');
-ok(!$doc->doc_author, 'Doc: author');
+ok(!$meta->{text_type}, 'Text Type');
+ok(!$meta->{text_type_art}, 'Text Type Art');
+ok(!$meta->{text_type_ref}, 'Text Type Ref');
+ok(!$meta->{text_column}, 'Text Column');
+ok(!$meta->{text_domain}, 'Text Domain');
+ok(!$meta->{creation_date}, 'Creation Date');
+ok(!$meta->{license}, 'License');
+ok(!$meta->{pages}, 'Pages');
+ok(!$meta->{file_edition_statement}, 'File Edition Statement');
+ok(!$meta->{bibl_edition_statement}, 'Bibl Edition Statement');
-ok(!$doc->corpus_title, 'Corpus: title');
-ok(!$doc->corpus_sub_title, 'Corpus: subtitle');
-ok(!$doc->corpus_editor, 'Corpus: editor');
-ok(!$doc->corpus_author, 'Corpus: author');
+ok(!$meta->{doc_editor}, 'Doc: editor');
+ok(!$meta->{doc_author}, 'Doc: author');
+
+ok(!$meta->{corpus_title}, 'Corpus: title');
+ok(!$meta->{corpus_sub_title}, 'Corpus: subtitle');
+ok(!$meta->{corpus_editor}, 'Corpus: editor');
+ok(!$meta->{corpus_author}, 'Corpus: author');
my $hash = $doc->to_hash;
is($hash->{title}, '@ Koelle_am_Rhing 10:18', 'Corpus title');
-
# Second document
-
$path = catdir(dirname(__FILE__), 'CMC-TSK', '2014-09', '3401');
ok($doc = KorAP::XML::Krill->new(
- path => $path . '/'
+ path => $path . '/',
+ meta_type => 'Sgbr'
), 'Create Document');
ok($doc->parse, 'Parse document');
@@ -91,51 +93,53 @@
is($doc->doc_sigle, 'CMC-TSK_2014-09', 'ID-doc');
is($doc->corpus_sigle, 'CMC-TSK', 'ID-corpus');
-is($doc->title, '@fitnessfrosch', 'title');
-ok(!$doc->sub_title, 'no subtitle');
+$meta = $doc->meta;
+is($meta->{title}, '@fitnessfrosch', 'title');
-is($doc->publisher, 'tagesschau.de', 'Publisher');
+ok(!$meta->{sub_title}, 'no subtitle');
-is($doc->pub_date, '20141001');
-is($doc->store('sgbrDate'), '2014-10-01 00:50:00');
+is($meta->{publisher}, 'tagesschau.de', 'Publisher');
-ok(!$doc->pub_place, 'No pub place');
+is($meta->{pub_date}, '20141001');
+is($meta->{'sgbr_date'}, '2014-10-01 00:50:00');
-is($doc->doc_title, 'Korpus zur Beobachtung des Schreibgebrauchs im Deutschen', 'Doc title');
-is($doc->doc_sub_title, 'Subkorpus Internettexte, Subkorpus Leserkommentare Tagesschau, Subkorpus September 2014, Subkorpus Beispielauszug', 'Doc Sub title');
+ok(!$meta->{pub_place}, 'No pub place');
-is($doc->store('funder'), 'Bundesministerium für Bildung und Forschung', 'Funder');
+is($meta->{doc_title}, 'Korpus zur Beobachtung des Schreibgebrauchs im Deutschen', 'Doc title');
+is($meta->{doc_sub_title}, 'Subkorpus Internettexte, Subkorpus Leserkommentare Tagesschau, Subkorpus September 2014, Subkorpus Beispielauszug', 'Doc Sub title');
-is($doc->author, 'weltoffen', 'Author');
-ok(!$doc->store('sgbrAuthorSex'), 'No Sex');
-ok(!$doc->store('sgbrKodex'), 'No kodex');
-is($doc->reference, 'http://meta.tagesschau.de/node/090308#comment-1732754', 'Publace ref');
+is($meta->{'funder'}, 'Bundesministerium für Bildung und Forschung', 'Funder');
-is($doc->keywords_string, '');
+is($meta->{author}, 'weltoffen', 'Author');
+ok(!$meta->{'sgbr_author_sex'}, 'No Sex');
+ok(!$meta->{'sgbr_kodex'}, 'No kodex');
+is($meta->{reference}, 'http://meta.tagesschau.de/node/090308#comment-1732754', 'PubPlace ref');
-is($doc->language, 'de', 'Language');
+is($meta->keywords('keywords'), '');
-ok(!$doc->editor, 'Editor');
+is($meta->{language}, 'de', 'Language');
-ok(!$doc->text_type, 'Text Type');
-ok(!$doc->text_type_art, 'Text Type Art');
-ok(!$doc->text_type_ref, 'Text Type Ref');
-ok(!$doc->text_column, 'Text Column');
-ok(!$doc->text_domain, 'Text Domain');
-ok(!$doc->creation_date, 'Creation Date');
-ok(!$doc->license, 'License');
-ok(!$doc->pages, 'Pages');
-ok(!$doc->file_edition_statement, 'File Edition Statement');
-ok(!$doc->bibl_edition_statement, 'Bibl Edition Statement');
+ok(!$meta->{editor}, 'Editor');
-ok(!$doc->doc_editor, 'Doc: editor');
-ok(!$doc->doc_author, 'Doc: author');
+ok(!$meta->{text_type}, 'Text Type');
+ok(!$meta->{text_type_art}, 'Text Type Art');
+ok(!$meta->{text_type_ref}, 'Text Type Ref');
+ok(!$meta->{text_column}, 'Text Column');
+ok(!$meta->{text_domain}, 'Text Domain');
+ok(!$meta->{creation_date}, 'Creation Date');
+ok(!$meta->{license}, 'License');
+ok(!$meta->{pages}, 'Pages');
+ok(!$meta->{file_edition_statement}, 'File Edition Statement');
+ok(!$meta->{bibl_edition_statement}, 'Bibl Edition Statement');
-ok(!$doc->corpus_title, 'Corpus: title');
-ok(!$doc->corpus_sub_title, 'Corpus: subtitle');
-ok(!$doc->corpus_editor, 'Corpus: editor');
-ok(!$doc->corpus_author, 'Corpus: author');
+ok(!$meta->{doc_editor}, 'Doc: editor');
+ok(!$meta->{doc_author}, 'Doc: author');
+
+ok(!$meta->{corpus_title}, 'Corpus: title');
+ok(!$meta->{corpus_sub_title}, 'Corpus: subtitle');
+ok(!$meta->{corpus_editor}, 'Corpus: editor');
+ok(!$meta->{corpus_author}, 'Corpus: author');
$hash = $doc->to_hash;
is($hash->{title}, '@fitnessfrosch', 'Corpus title');
diff --git a/t/transform.t b/t/transform.t
index 026b5d3..7c9a15e 100644
--- a/t/transform.t
+++ b/t/transform.t
@@ -80,19 +80,21 @@
ok($doc->parse, 'Parse document');
# Metdata
-is($doc->title, 'A', 'title');
-ok(!$doc->sub_title, 'subTitle');
+my $meta = $doc->meta;
+is($meta->{title}, 'A', 'title');
+ok(!$meta->{sub_title}, 'subTitle');
is($doc->text_sigle, 'WPD_AAA.00001', 'ID');
is($doc->corpus_sigle, 'WPD', 'corpusID');
-is($doc->pub_date, '20050328', 'pubDate');
-is($doc->pub_place, 'URL:http://de.wikipedia.org', 'pubPlace');
-is($doc->text_class->[0], 'freizeit-unterhaltung', 'TextClass');
-is($doc->text_class->[1], 'reisen', 'TextClass');
-is($doc->text_class->[2], 'wissenschaft', 'TextClass');
-is($doc->text_class->[3], 'populaerwissenschaft', 'TextClass');
-ok(!$doc->text_class->[4], 'TextClass');
-is($doc->author, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author');
+
+is($meta->{pub_date}, '20050328', 'pubDate');
+is($meta->{pub_place}, 'URL:http://de.wikipedia.org', 'pubPlace');
+is($meta->{text_class}->[0], 'freizeit-unterhaltung', 'TextClass');
+is($meta->{text_class}->[1], 'reisen', 'TextClass');
+is($meta->{text_class}->[2], 'wissenschaft', 'TextClass');
+is($meta->{text_class}->[3], 'populaerwissenschaft', 'TextClass');
+ok(!$meta->{text_class}->[4], 'TextClass');
+is($meta->{author}, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author');
# Get tokens
use_ok('KorAP::XML::Tokenizer');