Simplified and modularized metadata processing
Change-Id: I63e78fd5994126c954263324bcfc2fd9d51e39ea
diff --git a/t/real/bzk.t b/t/real/bzk.t
index 196a420..e6ef8d5 100644
--- a/t/real/bzk.t
+++ b/t/real/bzk.t
@@ -25,45 +25,46 @@
is($doc->doc_sigle, 'BZK_D59', 'Correct document sigle');
is($doc->corpus_sigle, 'BZK', 'Correct corpus sigle');
-is($doc->title, 'Unser gemeinsames Werk wird siegreich sein', 'Title');
-ok(!$doc->sub_title, 'No SubTitle');
-ok(!$doc->author, 'Author');
-ok(!$doc->editor, 'Editor');
-is($doc->pub_place, 'Berlin', 'PubPlace');
-ok(!$doc->publisher, 'Publisher');
+my $meta = $doc->meta;
+is($meta->{title}, 'Unser gemeinsames Werk wird siegreich sein', 'Title');
+ok(!$meta->{sub_title}, 'No SubTitle');
+ok(!$meta->{author}, 'Author');
+ok(!$meta->{editor}, 'Editor');
+is($meta->{pub_place}, 'Berlin', 'PubPlace');
+ok(!$meta->{publisher}, 'Publisher');
-is($doc->text_type, 'Zeitung: Tageszeitung', 'Correct Text Type');
+is($meta->{text_type}, 'Zeitung: Tageszeitung', 'Correct Text Type');
-ok(!$doc->text_type_art, 'Correct Text Type Art');
-is($doc->text_type_ref, 'Tageszeitung', 'Correct Text Type Ref');
-is($doc->text_domain, 'Politik', 'Correct Text Domain');
-is($doc->text_column, 'POLITIK', 'Correct Text Column');
-is($doc->text_class->[0], 'politik', 'Correct Text Class');
-is($doc->text_class->[1], 'ausland', 'Correct Text Class');
-ok(!$doc->text_class->[2], 'Correct Text Class');
+ok(!$meta->{text_type_art}, 'Correct Text Type Art');
+is($meta->{text_type_ref}, 'Tageszeitung', 'Correct Text Type Ref');
+is($meta->{text_domain}, 'Politik', 'Correct Text Domain');
+is($meta->{text_column}, 'POLITIK', 'Correct Text Column');
+is($meta->{text_class}->[0], 'politik', 'Correct Text Class');
+is($meta->{text_class}->[1], 'ausland', 'Correct Text Class');
+ok(!$meta->{text_class}->[2], 'Correct Text Class');
-is($doc->pub_date, '19590101', 'Creation date');
-is($doc->creation_date, '19590101', 'Creation date');
-is($doc->license, 'ACA-NC-LC', 'License');
-ok(!$doc->pages, 'Pages');
+is($meta->{pub_date}, '19590101', 'PubDate');
+is($meta->{creation_date}, '19590101', 'Creation date');
+is($meta->{license}, 'ACA-NC-LC', 'License');
+ok(!$meta->{pages}, 'Pages');
-ok(!$doc->file_edition_statement, 'File Statement');
-ok(!$doc->bibl_edition_statement, 'Bibl Statement');
+ok(!$meta->{file_edition_statement}, 'File Statement');
+ok(!$meta->{bibl_edition_statement}, 'Bibl Statement');
-is($doc->reference . "\n", <<'REF', 'Reference');
+is($meta->{reference} . "\n", <<'REF', 'Reference');
Neues Deutschland, [Tageszeitung], 01.01.1959, Jg. 14, Berliner Ausgabe, S. 1. - Sachgebiet: Politik, Originalressort: POLITIK; Unser gemeinsames Werk wird siegreich sein
REF
-is($doc->language, 'de', 'Language');
+is($meta->{language}, 'de', 'Language');
-is($doc->corpus_title, 'Bonner Zeitungskorpus', 'Correct Corpus title');
-ok(!$doc->corpus_sub_title, 'Correct Corpus sub title');
-ok(!$doc->corpus_author, 'Correct Corpus author');
-ok(!$doc->corpus_editor, 'Correct Corpus editor');
+is($meta->{corpus_title}, 'Bonner Zeitungskorpus', 'Correct Corpus title');
+ok(!$meta->{corpus_sub_title}, 'Correct Corpus sub title');
+ok(!$meta->{corpus_author}, 'Correct Corpus author');
+ok(!$meta->{corpus_editor}, 'Correct Corpus editor');
-is($doc->doc_title, 'Neues Deutschland', 'Correct Doc title');
-is($doc->doc_sub_title, 'Organ des Zentralkomitees der Sozialistischen Einheitspartei Deutschlands', 'Correct Doc sub title');
-ok(!$doc->doc_author, 'Correct Doc author');
-ok(!$doc->doc_editor, 'Correct doc editor');
+is($meta->{doc_title}, 'Neues Deutschland', 'Correct Doc title');
+is($meta->{doc_sub_title}, 'Organ des Zentralkomitees der Sozialistischen Einheitspartei Deutschlands', 'Correct Doc sub title');
+ok(!$meta->{doc_author}, 'Correct Doc author');
+ok(!$meta->{doc_editor}, 'Correct doc editor');
# Tokenization
use_ok('KorAP::XML::Tokenizer');
diff --git a/t/real/bzk_2.t b/t/real/bzk_2.t
index 4b958a2..dba97cc 100644
--- a/t/real/bzk_2.t
+++ b/t/real/bzk_2.t
@@ -25,46 +25,47 @@
is($doc->doc_sigle, 'BZK_D59', 'Correct document sigle');
is($doc->corpus_sigle, 'BZK', 'Correct corpus sigle');
-is($doc->title, 'Saragat-Partei zerfällt', 'Title');
-ok(!$doc->sub_title, 'No SubTitle');
-ok(!$doc->author, 'Author');
-ok(!$doc->editor, 'Editor');
-is($doc->pub_place, 'Berlin', 'PubPlace');
-is($doc->pub_date, '19590219', 'PubDate');
-ok(!$doc->publisher, 'Publisher');
+my $meta = $doc->meta;
+is($meta->{title}, 'Saragat-Partei zerfällt', 'Title');
+ok(!$meta->{sub_title}, 'No SubTitle');
+ok(!$meta->{author}, 'Author');
+ok(!$meta->{editor}, 'Editor');
+is($meta->{pub_place}, 'Berlin', 'PubPlace');
+is($meta->{pub_date}, '19590219', 'PubDate');
+ok(!$meta->{publisher}, 'Publisher');
-is($doc->text_type, 'Zeitung: Tageszeitung', 'Correct Text Type');
+is($meta->{text_type}, 'Zeitung: Tageszeitung', 'Correct Text Type');
-ok(!$doc->text_type_art, 'Correct Text Type Art');
-is($doc->text_type_ref, 'Tageszeitung', 'Correct Text Type Ref');
-is($doc->text_domain, 'Politik', 'Correct Text Domain');
-is($doc->text_column, 'POLITIK', 'Correct Text Column');
-is($doc->text_class->[0], 'politik', 'Correct Text Class');
-is($doc->text_class->[1], 'ausland', 'Correct Text Class');
-ok(!$doc->text_class->[2], 'Correct Text Class');
+ok(!$meta->{text_type_art}, 'Correct Text Type Art');
+is($meta->{text_type_ref}, 'Tageszeitung', 'Correct Text Type Ref');
+is($meta->{text_domain}, 'Politik', 'Correct Text Domain');
+is($meta->{text_column}, 'POLITIK', 'Correct Text Column');
+is($meta->{text_class}->[0], 'politik', 'Correct Text Class');
+is($meta->{text_class}->[1], 'ausland', 'Correct Text Class');
+ok(!$meta->{text_class}->[2], 'Correct Text Class');
-is($doc->creation_date, '19590219', 'Creation date');
-is($doc->license, 'ACA-NC-LC', 'License');
-ok(!$doc->pages, 'Pages');
+is($meta->{creation_date}, '19590219', 'Creation date');
+is($meta->{license}, 'ACA-NC-LC', 'License');
+ok(!$meta->{pages}, 'Pages');
-ok(!$doc->file_edition_statement, 'File Statement');
-ok(!$doc->bibl_edition_statement, 'Bibl Statement');
+ok(!$meta->{file_edition_statement}, 'File Statement');
+ok(!$meta->{bibl_edition_statement}, 'Bibl Statement');
-is($doc->reference . "\n", <<'REF', 'Reference');
+is($meta->{reference} . "\n", <<'REF', 'Reference');
Neues Deutschland, [Tageszeitung], 19.02.1959, Jg. 14, Berliner Ausgabe, S. 7. - Sachgebiet: Politik, Originalressort: POLITIK; Saragat-Partei zerfällt
REF
-is($doc->language, 'de', 'Language');
+is($meta->{language}, 'de', 'Language');
-is($doc->corpus_title, 'Bonner Zeitungskorpus', 'Correct Corpus title');
-ok(!$doc->corpus_sub_title, 'Correct Corpus sub title');
-ok(!$doc->corpus_author, 'Correct Corpus author');
-ok(!$doc->corpus_editor, 'Correct Corpus editor');
+is($meta->{corpus_title}, 'Bonner Zeitungskorpus', 'Correct Corpus title');
+ok(!$meta->{corpus_sub_title}, 'Correct Corpus sub title');
+ok(!$meta->{corpus_author}, 'Correct Corpus author');
+ok(!$meta->{corpus_editor}, 'Correct Corpus editor');
-is($doc->doc_title, 'Neues Deutschland', 'Correct Doc title');
-is($doc->doc_sub_title, 'Organ des Zentralkomitees der Sozialistischen Einheitspartei Deutschlands', 'Correct Doc sub title');
-ok(!$doc->doc_author, 'Correct Doc author');
-ok(!$doc->doc_editor, 'Correct doc editor');
+is($meta->{doc_title}, 'Neues Deutschland', 'Correct Doc title');
+is($meta->{doc_sub_title}, 'Organ des Zentralkomitees der Sozialistischen Einheitspartei Deutschlands', 'Correct Doc sub title');
+ok(!$meta->{doc_author}, 'Correct Doc author');
+ok(!$meta->{doc_editor}, 'Correct doc editor');
# Tokenization
use_ok('KorAP::XML::Tokenizer');
diff --git a/t/real/goethe.t b/t/real/goethe.t
index 770f70e..204e217 100644
--- a/t/real/goethe.t
+++ b/t/real/goethe.t
@@ -27,40 +27,41 @@
is($doc->doc_sigle, 'GOE_AGA', 'Correct document sigle');
is($doc->corpus_sigle, 'GOE', 'Correct corpus sigle');
-is($doc->title, 'Autobiographische Einzelheiten', 'Title');
-is($doc->pub_place, 'München', 'PubPlace');
-is($doc->pub_date, '19820000', 'Creation Date');
-ok(!$doc->sub_title, 'SubTitle');
-is($doc->author, 'Goethe, Johann Wolfgang von', 'Author');
+my $meta = $doc->meta;
+is($meta->{title}, 'Autobiographische Einzelheiten', 'Title');
+is($meta->{pub_place}, 'München', 'PubPlace');
+is($meta->{pub_date}, '19820000', 'PubDate');
+ok(!$meta->{sub_title}, 'SubTitle');
+is($meta->{author}, 'Goethe, Johann Wolfgang von', 'Author');
-is($doc->publisher, 'Verlag C. H. Beck', 'Publisher');
-ok(!$doc->editor, 'Publisher');
-is($doc->text_type, 'Autobiographie', 'Correct Text Type');
-ok(!$doc->text_type_art, 'Correct Text Type Art');
-ok(!$doc->text_type_ref, 'Correct Text Type Ref');
-ok(!$doc->text_column, 'Correct Text Column');
-ok(!$doc->text_domain, 'Correct Text Domain');
-is($doc->creation_date, '18200000', 'Creation Date');
-is($doc->license, 'QAO-NC', 'License');
-is($doc->pages, '529-547', 'Pages');
-ok(!$doc->file_edition_statement, 'File Ed Statement');
-ok(!$doc->bibl_edition_statement, 'Bibl Ed Statement');
-is($doc->reference . "\n", <<'REF', 'Author');
+is($meta->{publisher}, 'Verlag C. H. Beck', 'Publisher');
+ok(!$meta->{editor}, 'Editor');
+is($meta->{text_type}, 'Autobiographie', 'Correct Text Type');
+ok(!$meta->{text_type_art}, 'Correct Text Type Art');
+ok(!$meta->{text_type_ref}, 'Correct Text Type Ref');
+ok(!$meta->{text_column}, 'Correct Text Column');
+ok(!$meta->{text_domain}, 'Correct Text Domain');
+is($meta->{creation_date}, '18200000', 'Creation Date');
+is($meta->{license}, 'QAO-NC', 'License');
+is($meta->{pages}, '529-547', 'Pages');
+ok(!$meta->{file_edition_statement}, 'File Ed Statement');
+ok(!$meta->{bibl_edition_statement}, 'Bibl Ed Statement');
+is($meta->{reference} . "\n", <<'REF', 'Reference');
Goethe, Johann Wolfgang von: Autobiographische Einzelheiten, (Geschrieben bis 1832), In: Goethe, Johann Wolfgang von: Goethes Werke, Bd. 10, Autobiographische Schriften II, Hrsg.: Trunz, Erich. München: Verlag C. H. Beck, 1982, S. 529-547
REF
-is($doc->language, 'de', 'Language');
+is($meta->{language}, 'de', 'Language');
-is($doc->corpus_title, 'Goethes Werke', 'Correct Corpus title');
-ok(!$doc->corpus_sub_title, 'Correct Corpus Sub title');
-is($doc->corpus_author, 'Goethe, Johann Wolfgang von', 'Correct Corpus author');
-is($doc->corpus_editor, 'Trunz, Erich', 'Correct Corpus editor');
+is($meta->{corpus_title}, 'Goethes Werke', 'Correct Corpus title');
+ok(!$meta->{corpus_sub_title}, 'Correct Corpus Sub title');
+is($meta->{corpus_author}, 'Goethe, Johann Wolfgang von', 'Correct Corpus author');
+is($meta->{corpus_editor}, 'Trunz, Erich', 'Correct Corpus editor');
-is($doc->doc_title, 'Goethe: Autobiographische Schriften II, (1817-1825, 1832)',
+is($meta->{doc_title}, 'Goethe: Autobiographische Schriften II, (1817-1825, 1832)',
'Correct Doc title');
-ok(!$doc->doc_sub_title, 'Correct Doc Sub title');
-ok(!$doc->doc_author, 'Correct Doc author');
-ok(!$doc->doc_editor, 'Correct Doc editor');
+ok(!$meta->{doc_sub_title}, 'Correct Doc Sub title');
+ok(!$meta->{doc_author}, 'Correct Doc author');
+ok(!$meta->{doc_editor}, 'Correct Doc editor');
# Tokenization
use_ok('KorAP::XML::Tokenizer');
diff --git a/t/real/wdd.t b/t/real/wdd.t
index 41e8e60..9d867f9 100644
--- a/t/real/wdd.t
+++ b/t/real/wdd.t
@@ -26,36 +26,37 @@
is($doc->doc_sigle, 'WDD11_G27', 'Correct document sigle');
is($doc->corpus_sigle, 'WDD11', 'Correct corpus sigle');
-is($doc->title, 'Diskussion:Gunter A. Pilz', 'Title');
-ok(!$doc->sub_title, 'No SubTitle');
-is($doc->author, '€pa, u.a.', 'Author');
-ok(!$doc->editor, 'Publisher');
+my $meta = $doc->meta;
+is($meta->{title}, 'Diskussion:Gunter A. Pilz', 'Title');
+ok(!$meta->{sub_title}, 'No SubTitle');
+is($meta->{author}, '€pa, u.a.', 'Author');
+ok(!$meta->{editor}, 'Editor');
-is($doc->pub_place, 'URL:http://de.wikipedia.org', 'PubPlace');
-is($doc->publisher, 'Wikipedia', 'Publisher');
-is($doc->text_type, 'Diskussionen zu Enzyklopädie-Artikeln', 'Correct Text Type');
-ok(!$doc->text_type_art, 'Correct Text Type Art');
-ok(!$doc->text_type_ref, 'Correct Text Type Ref');
-ok(!$doc->text_domain, 'Correct Text Domain');
-is($doc->creation_date, '20070707', 'Creation date');
-is($doc->license, 'CC-BY-SA', 'License');
-ok(!$doc->pages, 'Pages');
-ok(!$doc->file_edition_statement, 'File Statement');
-ok(!$doc->bibl_edition_statement, 'Bibl Statement');
-is($doc->reference . "\n", <<'REF', 'Reference');
+is($meta->{pub_place}, 'URL:http://de.wikipedia.org', 'PubPlace');
+is($meta->{publisher}, 'Wikipedia', 'Publisher');
+is($meta->{text_type}, 'Diskussionen zu Enzyklopädie-Artikeln', 'Correct Text Type');
+ok(!$meta->{text_type_art}, 'Correct Text Type Art');
+ok(!$meta->{text_type_ref}, 'Correct Text Type Ref');
+ok(!$meta->{text_domain}, 'Correct Text Domain');
+is($meta->{creation_date}, '20070707', 'Creation date');
+is($meta->{license}, 'CC-BY-SA', 'License');
+ok(!$meta->{pages}, 'Pages');
+ok(!$meta->{file_edition_statement}, 'File Statement');
+ok(!$meta->{bibl_edition_statement}, 'Bibl Statement');
+is($meta->{reference} . "\n", <<'REF', 'Reference');
Diskussion:Gunter A. Pilz, In: Wikipedia - URL:http://de.wikipedia.org/wiki/Diskussion:Gunter_A._Pilz: Wikipedia, 2007
REF
-is($doc->language, 'de', 'Language');
+is($meta->{language}, 'de', 'Language');
-is($doc->corpus_title, 'Wikipedia', 'Correct Corpus title');
-ok(!$doc->corpus_sub_title, 'Correct Corpus sub title');
-ok(!$doc->corpus_author, 'Correct Corpus author');
-is($doc->corpus_editor, 'wikipedia.org', 'Correct Corpus editor');
+is($meta->{corpus_title}, 'Wikipedia', 'Correct Corpus title');
+ok(!$meta->{corpus_sub_title}, 'Correct Corpus sub title');
+ok(!$meta->{corpus_author}, 'Correct Corpus author');
+is($meta->{corpus_editor}, 'wikipedia.org', 'Correct Corpus editor');
-is($doc->doc_title, 'Wikipedia, Diskussionen zu Artikeln mit Anfangsbuchstabe G, Teil 27', 'Correct Doc title');
-ok(!$doc->doc_sub_title, 'Correct Doc sub title');
-ok(!$doc->doc_author, 'Correct Doc author');
-ok(!$doc->doc_editor, 'Correct doc editor');
+is($meta->{doc_title}, 'Wikipedia, Diskussionen zu Artikeln mit Anfangsbuchstabe G, Teil 27', 'Correct Doc title');
+ok(!$meta->{doc_sub_title}, 'Correct Doc sub title');
+ok(!$meta->{doc_author}, 'Correct Doc author');
+ok(!$meta->{doc_editor}, 'Correct doc editor');
# Tokenization
use_ok('KorAP::XML::Tokenizer');
diff --git a/t/real/wpd.t b/t/real/wpd.t
index 45ffd01..1cf1711 100644
--- a/t/real/wpd.t
+++ b/t/real/wpd.t
@@ -27,19 +27,20 @@
is($doc->doc_sigle, 'WPD_AAA', 'Correct document sigle');
is($doc->corpus_sigle, 'WPD', 'Correct corpus sigle');
-is($doc->title, 'A', 'Title');
-is($doc->pub_place, 'URL:http://de.wikipedia.org', 'PubPlace');
-is($doc->pub_date, '20050328', 'Creation Date');
-ok(!$doc->sub_title, 'SubTitle');
-is($doc->author, 'Ruru; Jens.Ol; Aglarech; u.a.', 'Author');
+my $meta = $doc->meta;
+is($meta->{title}, 'A', 'Title');
+is($meta->{pub_place}, 'URL:http://de.wikipedia.org', 'PubPlace');
+is($meta->{pub_date}, '20050328', 'PubDate');
+ok(!$meta->{sub_title}, 'SubTitle');
+is($meta->{author}, 'Ruru; Jens.Ol; Aglarech; u.a.', 'Author');
-ok(!$doc->doc_title, 'Correct Doc title');
-ok(!$doc->doc_sub_title, 'Correct Doc Sub title');
-ok(!$doc->doc_author, 'Correct Doc author');
-ok(!$doc->doc_editor, 'Correct Doc editor');
+ok(!$meta->{doc_title}, 'Correct Doc title');
+ok(!$meta->{doc_sub_title}, 'Correct Doc Sub title');
+ok(!$meta->{doc_author}, 'Correct Doc author');
+ok(!$meta->{doc_editor}, 'Correct Doc editor');
-ok(!$doc->corpus_title, 'Correct Corpus title');
-ok(!$doc->corpus_sub_title, 'Correct Corpus Sub title');
+ok(!$meta->{corpus_title}, 'Correct Corpus title');
+ok(!$meta->{corpus_sub_title}, 'Correct Corpus Sub title');
# Tokenization
use_ok('KorAP::XML::Tokenizer');