# Source blob: cb8c96ae52dca9e44b811f36065b8a45137bfd82
# Metadata tests for KorAP::XML::Krill documents and the I5 meta parser.
use strict;
use warnings;
use utf8;    # expected values below contain UTF-8 literals
use Test::More;
use Benchmark ':hireswallclock';
use Mojo::DOM;
use Mojo::File;
use Mojo::ByteStream 'b';
use Data::Dumper;
use lib 'lib', '../lib';

use File::Basename 'dirname';
use File::Spec::Functions 'catdir';

use_ok('KorAP::XML::Krill');

# WPD/00001
# The fixture is loaded twice, once with and once without a trailing
# slash. The second path check is anchored at the end, so the document
# is expected to report its path with a trailing '/'.
my $path = catdir(dirname(__FILE__), 'corpus/WPD/00001');
ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
like($doc->path, qr!\Q$path\E/!, 'Path');

ok($doc = KorAP::XML::Krill->new( path => $path ), 'Load Korap::Document');
like($doc->path, qr!\Q$path\E/$!, 'Path');

ok($doc->parse, 'Parse document');

# Metadata
is($doc->text_sigle, 'WPD/AAA/00001', 'ID');

my $meta = $doc->meta;
is($meta->{T_title}, 'A', 'title');

ok(!$meta->{T_sub_title}, 'subTitle');
is($doc->corpus_sigle, 'WPD', 'corpusID');
is($meta->{D_pub_date}, '20050328', 'pubDate');
is($meta->{S_pub_place}, 'URL:http://de.wikipedia.org', 'pubPlace');
is($meta->{K_text_class}->[0], 'freizeit-unterhaltung', 'TextClass');
is($meta->{K_text_class}->[1], 'reisen', 'TextClass');
is($meta->{K_text_class}->[2], 'wissenschaft', 'TextClass');
is($meta->{K_text_class}->[3], 'populaerwissenschaft', 'TextClass');
ok(!$meta->{K_text_class}->[4], 'TextClass');
is($meta->{T_author}, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author');


# Disabled per-author checks; the author is asserted as a single string above.
#is($doc->author->[0], 'Ruru', 'author');
#is($doc->author->[1], 'Jens.Ol', 'author');
#is($doc->author->[2], 'Aglarech', 'author');
#ok(!$doc->author->[3], 'author');

# Additional information
is($meta->{A_editor}, 'wikipedia.org', 'Editor');
is($meta->{A_publisher}, 'Wikipedia', 'Publisher');
is($meta->{D_creation_date}, '20050000', 'Creation date');
ok(!$meta->{S_text_type}, 'No text_type');
ok(!$meta->{S_text_type_art}, 'no text_type art');
ok(!$meta->{S_text_type_ref}, 'no text_type ref');
ok(!$meta->{S_text_domain}, 'no text_domain');
ok(!$meta->{S_text_column}, 'no text_column');
ok(!$meta->keywords('K_keywords'), 'no keywords');
# The joined keyword string must list all four text classes asserted above.
is($meta->keywords('K_text_class'), 'freizeit-unterhaltung reisen wissenschaft populaerwissenschaft', 'text classes');

#is($doc->coll_title, 'Wikipedia', 'Collection title');
#is($doc->coll_sub_title, 'Die freie Enzyklopädie', 'Collection subtitle');
#is($doc->coll_editor, 'wikipedia.org', 'Collection editor');
#ok(!$doc->coll_author, 'Collection author');

# A01/13047
# Reuses $path, $doc and $meta declared for the first fixture.
$path = catdir(dirname(__FILE__), 'corpus/A01/13047');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');

ok($doc->parse, 'Parse document');
$meta = $doc->meta;
is($meta->{T_title}, 'Fischer und Kolp im Sonnenhügel', 'title');

ok(!$meta->{T_sub_title}, 'subTitle');
is($doc->text_sigle, 'A01/APR/13047', 'ID');
is($doc->corpus_sigle, 'A01', 'corpusID');
is($meta->{D_pub_date}, '20010402', 'pubDate');
ok(!$meta->{S_pub_place}, 'pubPlace');
is($meta->{K_text_class}->[0], 'freizeit-unterhaltung', 'TextClass');
is($meta->{K_text_class}->[1], 'vereine-veranstaltungen', 'TextClass');
ok(!$meta->{K_text_class}->[2], 'TextClass');
ok(!$meta->{T_author}, 'author');

# Additional information
ok(!$meta->{A_editor}, 'Editor');
ok(!$meta->{A_publisher}, 'Publisher');
is($meta->{D_creation_date}, '20010402', 'Creation date');
#ok(!$doc->coll_title, 'Collection title');
#ok(!$doc->coll_sub_title, 'Collection subtitle');
#ok(!$doc->coll_editor, 'Collection editor');
#ok(!$doc->coll_author, 'Collection author');
ok(!$meta->{S_text_type}, 'text_type');
is($meta->{S_text_type_art}, 'Bericht', 'text_type art');

# ERL/00001
# The fixture lives under corpus/ERL, but the sigles asserted below
# belong to the MK2 corpus (MK2/ERL/00001).
$path = catdir(dirname(__FILE__), 'corpus/ERL/00001');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');

ok($doc->parse, 'Parse document');

$meta = $doc->meta;
is($meta->{T_title}, 'Amtsblatt des Landesbezirks Baden [diverse Erlasse]', 'title');
# MK2/ERL.00001

ok(!$meta->{T_sub_title}, 'subTitle');
is($doc->text_sigle, 'MK2/ERL/00001', 'ID');
is($doc->corpus_sigle, 'MK2', 'corpusID');
is($meta->{D_pub_date}, '00000000', 'pubDate');
is($meta->{S_pub_place}, 'Karlsruhe', 'pubPlace');
is($meta->{K_text_class}->[0], 'politik', 'TextClass');
is($meta->{K_text_class}->[1], 'kommunalpolitik', 'TextClass');
ok(!$meta->{K_text_class}->[2], 'TextClass');
ok(!$meta->{T_author}, 'author');

# Additional information
ok(!$meta->{A_editor}, 'Editor');
is($meta->{A_publisher}, 'Badenia Verlag und Druckerei', 'Publisher');
is($meta->{D_creation_date}, '19600000', 'Creation date');

# !!!
# diag 'Non-acceptance of creation date ranges may be temporary';


#ok(!$doc->coll_title, 'Collection title');
#ok(!$doc->coll_sub_title, 'Collection subtitle');
#ok(!$doc->coll_editor, 'Collection editor');
#ok(!$doc->coll_author, 'Collection author');
is($meta->{S_text_type}, 'Erlass', 'text_type');
ok(!$meta->{S_text_type_art}, 'text_type art');


# A00/02035-substring
$path = catdir(dirname(__FILE__), 'corpus/A00/02035-substring');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');

$meta = $doc->meta;

is($meta->{T_title}, 'St. Galler Tagblatt, 11.01.2000, Ressort: TB-RSP (Abk.)', 'title'); # A00/JAN.02035
ok(!$meta->{T_sub_title}, 'subTitle');
is($doc->text_sigle, 'A00/JAN/02035', 'ID');
is($doc->corpus_sigle, 'A00', 'corpusID');
is($meta->{D_pub_date}, '20000111', 'pubDate');
ok(!$meta->{S_pub_place}, 'pubPlace');
is($meta->{K_text_class}->[0], 'sport', 'TextClass');
is($meta->{K_text_class}->[1], 'ballsport', 'TextClass');
ok(!$meta->{K_text_class}->[2], 'TextClass');
ok(!$meta->{T_author}, 'author');

# Additional information
ok(!$meta->{A_editor}, 'Editor');
ok(!$meta->{A_publisher}, 'Publisher');
is($meta->{D_creation_date}, "20000111", 'Creation date');
#ok(!$doc->coll_title, 'Collection title');
#ok(!$doc->coll_sub_title, 'Collection subtitle');
#ok(!$doc->coll_editor, 'Collection editor');
#ok(!$doc->coll_author, 'Collection author');
ok(!$meta->{S_text_type}, 'text_type');
is($meta->{S_text_type_art}, 'Bericht', 'text_type art');

# A00/02873-meta
$path = catdir(dirname(__FILE__), 'corpus/A00/02873-meta');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
$meta = $doc->meta;

is($meta->{T_title}, 'Tradition und Moderne', 'title');
ok(!$meta->{T_sub_title}, 'subTitle');
is($doc->text_sigle, 'A00/JAN/02873', 'ID');
is($doc->corpus_sigle, 'A00', 'corpusID');
is($meta->{D_pub_date}, '20000113', 'pubDate');
ok(!$meta->{S_pub_place}, 'pubPlace');
is($meta->{K_text_class}->[0], 'kultur', 'TextClass');
is($meta->{K_text_class}->[1], 'film', 'TextClass');
ok(!$meta->{K_text_class}->[2], 'TextClass');
ok(!$meta->{T_author}, 'author');


# Additional information
ok(!$meta->{A_editor}, 'Editor');
ok(!$meta->{A_publisher}, 'Publisher');
is($meta->{D_creation_date}, "20000113", 'Creation date');
#ok(!$doc->coll_title, 'Collection title');
#ok(!$doc->coll_sub_title, 'Collection subtitle');
#ok(!$doc->coll_editor, 'Collection editor');
#ok(!$doc->coll_author, 'Collection author');
ok(!$meta->{S_text_type}, 'text_type');
is($meta->{S_text_type_art}, 'Bericht', 'text_type art');


# A00/05663-unbalanced
$path = catdir(dirname(__FILE__), 'corpus/A00/05663-unbalanced');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
$meta = $doc->meta;

is($meta->{T_title}, 'Mehr Arbeitslose im Dezember', 'title');
ok(!$meta->{T_sub_title}, 'subTitle');
is($doc->text_sigle, 'A00/JAN/05663', 'ID');
is($doc->corpus_sigle, 'A00', 'corpusID');
is($meta->{D_pub_date}, '20000124', 'pubDate');
ok(!$meta->{S_pub_place}, 'pubPlace');
is($meta->{K_text_class}->[0], 'gesundheit-ernaehrung', 'TextClass');
is($meta->{K_text_class}->[1], 'gesundheit', 'TextClass');
ok(!$meta->{K_text_class}->[2], 'TextClass');
ok(!$meta->{T_author}, 'author');


# Additional information
ok(!$meta->{A_editor}, 'Editor');
ok(!$meta->{A_publisher}, 'Publisher');
is($meta->{D_creation_date}, "20000124", 'Creation date');
#ok(!$doc->coll_title, 'Collection title');
#ok(!$doc->coll_sub_title, 'Collection subtitle');
#ok(!$doc->coll_editor, 'Collection editor');
#ok(!$doc->coll_author, 'Collection author');
ok(!$meta->{S_text_type}, 'text_type');
is($meta->{S_text_type_art}, 'Bericht', 'text_type art');

# A00/07452-deep
$path = catdir(dirname(__FILE__), 'corpus/A00/07452-deep');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
$meta = $doc->meta;

is($meta->{T_title}, 'Wil im Dezember 1999', 'title');
ok(!$meta->{T_sub_title}, 'subTitle');
is($doc->text_sigle, 'A00/JAN/07452', 'ID');
is($doc->corpus_sigle, 'A00', 'corpusID');
is($meta->{D_pub_date}, '20000129', 'pubDate');
ok(!$meta->{S_pub_place}, 'pubPlace');
is($meta->{K_text_class}->[0], 'politik', 'TextClass');
is($meta->{K_text_class}->[1], 'kommunalpolitik', 'TextClass');
ok(!$meta->{K_text_class}->[2], 'TextClass');
ok(!$meta->{T_author}, 'author');


# Additional information
ok(!$meta->{A_editor}, 'Editor');
ok(!$meta->{A_publisher}, 'Publisher');
is($meta->{D_creation_date}, "20000129", 'Creation date');
#ok(!$doc->coll_title, 'Collection title');
#ok(!$doc->coll_sub_title, 'Collection subtitle');
#ok(!$doc->coll_editor, 'Collection editor');
#ok(!$doc->coll_author, 'Collection author');
ok(!$meta->{S_text_type}, 'text_type');
is($meta->{S_text_type_art}, 'Bericht', 'text_type art');

# ART
# Artificial fixture; unlike the corpus samples above, the assertions
# below expect nearly every metadata field to be populated.
$path = catdir(dirname(__FILE__), 'corpus/artificial');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
#is($doc->path, $path . '/', 'Path');

ok($doc = KorAP::XML::Krill->new( path => $path ), 'Load Korap::Document');
#is($doc->path, $path . '/', 'Path');

ok($doc->parse, 'Parse document');
$meta = $doc->meta;

# Metadata
is($meta->{T_title}, 'Artificial Title', 'title');
is($meta->{T_sub_title}, 'Artificial Subtitle', 'subTitle');
is($doc->text_sigle, 'ART/ABC/00001', 'ID');
is($doc->corpus_sigle, 'ART', 'corpusID');
is($meta->{D_pub_date}, '20010402', 'pubDate');
is($meta->{S_pub_place}, 'Mannheim', 'pubPlace');
is($meta->{S_pub_place_key}, 'DE', 'pubPlace key');
is($meta->{K_text_class}->[0], 'freizeit-unterhaltung', 'TextClass');
is($meta->{K_text_class}->[1], 'vereine-veranstaltungen', 'TextClass');
ok(!$meta->{K_text_class}->[2], 'TextClass');
#is($doc->author->[0], 'Ruru', 'author');
#is($doc->author->[1], 'Jens.Ol', 'author');
#is($doc->author->[2], 'Aglarech', 'author');
is($meta->{T_author}, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author');

# Additional information
is($meta->{A_editor}, 'Nils Diewald', 'Editor');
is($meta->{A_publisher}, 'Artificial articles Inc.', 'Publisher');
is($meta->{D_creation_date}, '19990601', 'Creation date');
#is($doc->coll_title, 'Artificial articles', 'Collection title');
#is($doc->coll_sub_title, 'Best of!', 'Collection subtitle');
#is($doc->coll_editor, 'Nils Diewald', 'Collection editor');
#is($doc->coll_author, 'Nils Diewald', 'Collection author');
# A text type IS expected here, so the description must not say "No".
is($meta->{S_text_type}, 'Zeitung: Tageszeitung', 'text_type');
is($meta->{S_text_type_art}, 'Bericht', 'text_type art');


# Multipath headers
# VDI/JAN/00001: besides the text-level fields, this fixture is checked
# for document-level (T_doc_*) and corpus-level (T_corpus_*) metadata.
$path = catdir(dirname(__FILE__), 'corpus/VDI/JAN/00001');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
like($doc->path, qr!\Q$path\E/!, 'Path');

ok($doc = KorAP::XML::Krill->new( path => $path ), 'Load Korap::Document');
like($doc->path, qr!\Q$path\E/$!, 'Path');

ok($doc->parse, 'Parse document');
$meta = $doc->meta;

is($doc->text_sigle, 'VDI14/JAN/00001', 'text sigle');
is($doc->doc_sigle, 'VDI14/JAN', 'doc sigle');
is($meta->corpus_sigle, 'VDI14', 'corpus sigle');

is($meta->{T_title}, '10- Zz mit Zahl', 'title');

ok(!$meta->{T_sub_title}, 'subtitle');
is($meta->{D_pub_date}, '20140117', 'pubdate');
is($meta->{S_pub_place}, 'Düsseldorf', 'pubplace');
is($meta->{T_author}, 'Windhövel, Kerstin', 'author');
is($meta->{A_publisher}, 'VDI Verlag GmbH', 'publisher');
ok(!$meta->{A_editor}, 'editor');

ok(!$meta->{S_text_type}, 'text type');
ok(!$meta->{S_text_type_art}, 'text type art');
ok(!$meta->{S_text_type_ref}, 'text type ref');
ok(!$meta->{S_text_column}, 'text column');
ok(!$meta->{S_text_domain}, 'text domain');
ok(!$meta->{D_creation_date}, 'creation date');
ok(!$meta->{S_availability}, 'License');
ok(!$meta->{pages}, 'Pages');
ok(!$meta->{A_file_edition_statement}, 'file edition statement');
ok(!$meta->{A_bibl_edition_statement}, 'bibl edition statement');
is($meta->{A_reference}, 'VDI nachrichten, 17.01.2014, S. 10; 10- Zz mit Zahl [Ausführliche Zitierung nicht verfügbar]', 'Reference');

# The language is read from the meta object like every other S_* field;
# looking it up on $doc would pass regardless of the parsed header.
# NOTE(review): if this now fails, the fixture yields a language and the
# expectation below needs updating (see the remark that follows).
ok(!$meta->{S_language}, 'Language');
# !!!
# diag 'This may be "de" in the future';

is($meta->{T_doc_title}, 'VDI nachrichten, Januar 2014', 'Doc title');
ok(!$meta->{T_doc_sub_title}, 'Doc Sub title');
ok(!$meta->{A_doc_editor}, 'Doc editor');
ok(!$meta->{T_doc_author}, 'Doc author');

is($meta->{T_corpus_title}, 'VDI nachrichten', 'Corpus title');
ok(!$meta->{T_corpus_sub_title}, 'Corpus Sub title');
is($meta->{A_corpus_editor}, 'Verein Deutscher Ingenieure', 'Corpus editor');
ok(!$meta->{T_corpus_author}, 'Corpus author');

is($meta->keywords('K_keywords'), '', 'Keywords');
is($meta->keywords('K_text_class'), 'Freizeit-Unterhaltung Reisen Politik Ausland', 'Text class');

# WDD
# WDD/G27/38989: the sigles asserted below use the corpus sigle WDD11.
$path = catdir(dirname(__FILE__), 'corpus/WDD/G27/38989');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
like($doc->path, qr!\Q$path\E/!, 'Path');
ok($doc->parse, 'Parse document');
$meta = $doc->meta;

is($doc->text_sigle, 'WDD11/G27/38989', 'text sigle');
is($doc->doc_sigle, 'WDD11/G27', 'doc sigle');
is($doc->corpus_sigle, 'WDD11', 'corpus sigle');

is($meta->{T_title}, 'Diskussion:Gunter A. Pilz', 'title');
ok(!$meta->{T_sub_title}, 'subtitle');
is($meta->{D_pub_date}, '20111029', 'pubdate');
is($meta->{S_pub_place}, 'URL:http://de.wikipedia.org', 'pubplace');

is($meta->{T_author}, '€pa, u.a.', 'author');
is($meta->{A_publisher}, 'Wikipedia', 'publisher');
is($meta->{A_editor}, 'wikipedia.org', 'Editor');

is($meta->{S_text_type}, 'Diskussionen zu Enzyklopädie-Artikeln', 'text type');
ok(!$meta->{S_text_type_art}, 'text type art');
ok(!$meta->{S_text_type_ref}, 'text type ref');
ok(!$meta->{S_text_column}, 'text column');
ok(!$meta->{S_text_domain}, 'text domain');

is($meta->{D_creation_date}, '20070707', 'creation date');
is($meta->{S_availability}, 'CC-BY-SA', 'License');
ok(!$meta->{pages}, 'Pages');
ok(!$meta->{A_file_edition_statement}, 'file edition statement');
ok(!$meta->{A_bibl_edition_statement}, 'bibl edition statement');
is($meta->{A_reference}, 'Diskussion:Gunter A. Pilz, In: Wikipedia - URL:http://de.wikipedia.org/wiki/Diskussion:Gunter_A._Pilz: Wikipedia, 2007', 'Reference');

is($meta->{S_language}, 'de', 'Language');

is($meta->{T_doc_title}, 'Wikipedia, Diskussionen zu Artikeln mit Anfangsbuchstabe G, Teil 27', 'Doc title');
ok(!$meta->{T_doc_sub_title}, 'Doc Sub title');
ok(!$meta->{A_doc_editor}, 'Doc editor');
ok(!$meta->{T_doc_author}, 'Doc author');

is($meta->{T_corpus_title}, 'Wikipedia', 'Corpus title');
ok(!$meta->{T_corpus_sub_title}, 'Corpus Sub title');
is($meta->{A_corpus_editor}, 'wikipedia.org', 'Corpus editor');
ok(!$meta->{T_corpus_author}, 'Corpus author');

# Use the K_-prefixed field names, matching the other keywords() calls in
# this file; the unprefixed names are not used by any other check here.
# NOTE(review): confirm this fixture really has no keywords/text classes.
is($meta->keywords('K_keywords'), '', 'Keywords');
is($meta->keywords('K_text_class'), '', 'Text class');

is($meta->{S_availability}, 'CC-BY-SA', 'Availability');

use_ok('KorAP::XML::Meta::I5');

# Parse the three idsHeader elements of one I5 file into the same meta
# object, one level at a time (corpus, doc, text). After each step the
# fields asserted for the previous levels are checked again.
$path = catdir(dirname(__FILE__), 'corpus', 'I5', 'rei-example.i5');
ok($meta = KorAP::XML::Meta::I5->new, 'Construct meta object');
my $dom = Mojo::DOM->new->parse(Mojo::File->new($path)->slurp);
ok($meta->parse($dom->at('idsHeader'), 'corpus'), 'Parse corpus header');

my $hash = $meta->to_hash;
is($hash->{S_availability}, 'CC-BY-SA', 'Availability');
is($hash->{S_language}, 'de', 'Language');
is($hash->{T_corpus_title}, 'Reden und Interviews', 'Corpus title');
is($hash->{corpus_sigle}, 'REI', 'Corpus Sigle');

# Second header: document level
ok($meta->parse($dom->find('idsHeader')->[1], 'doc'), 'Parse doc header');

$hash = $meta->to_hash;
is($hash->{S_availability}, 'CC-BY-SA', 'Availability');
is($hash->{S_language}, 'de', 'Language');
is($hash->{T_corpus_title}, 'Reden und Interviews', 'Corpus title');
is($hash->{corpus_sigle}, 'REI', 'Corpus Sigle');
is($hash->{doc_sigle}, 'REI/BNG', 'Document Sigle');
is($hash->{T_doc_title}, 'Reden der Bundestagsfraktion Bündnis 90/DIE GRÜNEN, (2002-2006)', 'Document title');

# Third header: text level
ok($meta->parse($dom->find('idsHeader')->[2], 'text'), 'Parse text header');

$hash = $meta->to_hash;
is($hash->{S_availability}, 'CC-BY-SA', 'Availability');
is($hash->{S_language}, 'de', 'Language');
is($hash->{T_corpus_title}, 'Reden und Interviews', 'Corpus title');
is($hash->{corpus_sigle}, 'REI', 'Corpus Sigle');
is($hash->{doc_sigle}, 'REI/BNG', 'Document Sigle');
is($hash->{T_doc_title}, 'Reden der Bundestagsfraktion Bündnis 90/DIE GRÜNEN, (2002-2006)', 'Document title');

is($hash->{text_sigle}, 'REI/BNG/00001', 'Text sigle');
is($hash->{T_title}, 'Energiewirtschaft', 'Title');
is($hash->{T_sub_title}, 'Rede im Deutschen Bundestag am 19.01.2002', 'Sub title');
is($hash->{D_creation_date}, '20020119', 'Creation date');
is($hash->{D_pub_date}, '20020119', 'Pub date');
is($hash->{S_pub_place_key}, 'DE', 'Pub place key');
is($hash->{A_reference}, 'Hustedt, Michaele: Energiewirtschaft. Rede im Deutschen Bundestag am 19.01.2002, Hrsg: Bundestagsfraktion Bündnis 90/DIE GRÜNEN [Ausführliche Zitierung nicht verfügbar]', 'Reference');
is($hash->{K_text_class}->[0], 'politik', 'Text class');
is($hash->{K_text_class}->[1], 'inland', 'Text class');
is($hash->{T_author}, 'Hustedt, Michaele', 'Author');
is($hash->{S_pub_place}, 'Berlin', 'Pub place');


# UMB45/D38/00001
# Loaded without a trailing slash; the path check still expects one.
$path = catdir(dirname(__FILE__), 'corpus','UMB45','D38','00001');
ok($doc = KorAP::XML::Krill->new( path => $path), 'Load Korap::Document');
like($doc->path, qr!\Q$path\E/!, 'Path');

ok($doc->parse, 'Parse document');
$meta = $doc->meta;

is($doc->text_sigle, 'UMB45/D38/00001', 'text sigle');
is($doc->doc_sigle, 'UMB45/D38', 'doc sigle');
is($doc->corpus_sigle, 'UMB45', 'corpus sigle');

is($meta->{T_title}, 'In: Über Schuld und Aufgabe der geistigen Führungsschicht im deutschen politischen Leben der Gegenwart. - Göttingen, 1955', 'title');


done_testing;
__END__
