| Akron | 414ec95 | 2020-08-03 15:48:43 +0200 | [diff] [blame] | 1 | use strict; | 
|  | 2 | use warnings; | 
|  | 3 | use utf8; | 
|  | 4 | use Test::More; | 
|  | 5 | use Benchmark ':hireswallclock'; | 
|  | 6 | use Mojo::DOM; | 
|  | 7 | use Mojo::File; | 
|  | 8 | use Mojo::ByteStream 'b'; | 
|  | 9 | use Data::Dumper; | 
|  | 10 | use lib 'lib', '../lib'; | 
|  | 11 |  | 
|  | 12 | if ($ENV{SKIP_REAL}) { | 
|  | 13 | plan skip_all => 'Skip real tests'; | 
|  | 14 | }; | 
|  | 15 |  | 
|  | 16 | use File::Basename 'dirname'; | 
|  | 17 | use File::Spec::Functions 'catdir'; | 
|  | 18 |  | 
|  | 19 | use_ok('KorAP::XML::Krill'); | 
|  | 20 |  | 
|  | 21 | # WPD/00001 - the document is loaded with and without a trailing slash; path() must carry one either way | 
|  | 22 | my $path = catdir(dirname(__FILE__), 'corpus','WPD','00001'); | 
|  | 23 | ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document'); | 
|  | 24 | like($doc->path, qr!\Q$path\E/!, 'Path'); | 
|  | 25 |  | 
|  | 26 | ok($doc = KorAP::XML::Krill->new( path => $path ), 'Load Korap::Document'); | 
|  | 27 | like($doc->path, qr!\Q$path\E/$!, 'Path'); | 
|  | 28 |  | 
|  | 29 | ok($doc->parse, 'Parse document'); | 
|  | 30 |  | 
|  | 31 | # Metadata | 
|  | 32 | is($doc->text_sigle, 'WPD/AAA/00001', 'ID'); | 
|  | 33 |  | 
|  | 34 | my $meta = $doc->meta; | 
|  | 35 | is($meta->{T_title}, 'A', 'title'); | 
|  | 36 |  | 
|  | 37 | ok(!$meta->{T_sub_title}, 'subTitle'); | 
|  | 38 | is($doc->corpus_sigle, 'WPD', 'corpusID'); | 
|  | 39 | is($meta->{D_pub_date}, '20050328', 'pubDate'); | 
|  | 40 | is($meta->{S_pub_place}, 'URL:http://de.wikipedia.org', 'pubPlace'); | 
|  | 41 | is($meta->{K_text_class}->[0], 'freizeit-unterhaltung', 'TextClass'); | 
|  | 42 | is($meta->{K_text_class}->[1], 'reisen', 'TextClass'); | 
|  | 43 | is($meta->{K_text_class}->[2], 'wissenschaft', 'TextClass'); | 
|  | 44 | is($meta->{K_text_class}->[3], 'populaerwissenschaft', 'TextClass'); | 
|  | 45 | ok(!$meta->{K_text_class}->[4], 'TextClass'); | 
|  | 46 | is($meta->{T_author}, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author'); | 
|  | 47 |  | 
|  | 48 |  | 
|  | 49 | #is($doc->author->[0], 'Ruru', 'author'); | 
|  | 50 | #is($doc->author->[1], 'Jens.Ol', 'author'); | 
|  | 51 | #is($doc->author->[2], 'Aglarech', 'author'); | 
|  | 52 | #ok(!$doc->author->[3], 'author'); | 
|  | 53 |  | 
|  | 54 | # Additional information (the text type, domain and column fields are expected to be unset) | 
|  | 55 | is($meta->{A_editor}, 'wikipedia.org', 'Editor'); | 
|  | 56 | is($meta->{A_publisher}, 'Wikipedia', 'Publisher'); | 
|  | 57 | is($meta->{D_creation_date}, '20050000', 'Creation date'); | 
|  | 58 | ok(!$meta->{S_text_type}, 'No text_type'); | 
|  | 59 | ok(!$meta->{S_text_type_art}, 'no text_type art'); | 
|  | 60 | ok(!$meta->{S_text_type_ref}, 'no text_type ref'); | 
|  | 61 | ok(!$meta->{S_text_domain}, 'no text_domain'); | 
|  | 62 | ok(!$meta->{S_text_column}, 'no text_column'); | 
|  | 63 | ok(!$meta->keywords('K_keywords'), 'no keywords'); | 
|  | 64 | is($meta->keywords('K_text_class'), 'freizeit-unterhaltung reisen wissenschaft populaerwissenschaft', 'text classes'); | 
|  | 65 |  | 
|  | 66 | #is($doc->coll_title, 'Wikipedia', 'Collection title'); | 
|  | 67 | #is($doc->coll_sub_title, 'Die freie Enzyklopädie', 'Collection subtitle'); | 
|  | 68 | #is($doc->coll_editor, 'wikipedia.org', 'Collection editor'); | 
|  | 69 | #ok(!$doc->coll_author, 'Collection author'); | 
|  | 70 |  | 
|  | 71 | # A01/13047 - expected to have no sub title, pub place, author, editor or publisher | 
|  | 72 | $path = catdir(dirname(__FILE__), 'corpus','A01','13047'); | 
|  | 73 | ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document'); | 
|  | 74 |  | 
|  | 75 | ok($doc->parse, 'Parse document'); | 
|  | 76 | $meta = $doc->meta; | 
|  | 77 | is($meta->{T_title}, 'Fischer und Kolp im Sonnenhügel', 'title'); | 
|  | 78 |  | 
|  | 79 | ok(!$meta->{T_sub_title}, 'subTitle'); | 
|  | 80 | is($doc->text_sigle, 'A01/APR/13047', 'ID'); | 
|  | 81 | is($doc->corpus_sigle, 'A01', 'corpusID'); | 
|  | 82 | is($meta->{D_pub_date}, '20010402', 'pubDate'); | 
|  | 83 | ok(!$meta->{S_pub_place}, 'pubPlace'); | 
|  | 84 | is($meta->{K_text_class}->[0], 'freizeit-unterhaltung', 'TextClass'); | 
|  | 85 | is($meta->{K_text_class}->[1], 'vereine-veranstaltungen', 'TextClass'); | 
|  | 86 | ok(!$meta->{K_text_class}->[2], 'TextClass'); | 
|  | 87 | ok(!$meta->{T_author}, 'author'); | 
|  | 88 |  | 
|  | 89 | # Additional information (creation date is expected to equal the pub date checked above) | 
|  | 90 | ok(!$meta->{A_editor}, 'Editor'); | 
|  | 91 | ok(!$meta->{A_publisher}, 'Publisher'); | 
|  | 92 | is($meta->{D_creation_date}, '20010402', 'Creation date'); | 
|  | 93 | #ok(!$doc->coll_title, 'Collection title'); | 
|  | 94 | #ok(!$doc->coll_sub_title, 'Collection subtitle'); | 
|  | 95 | #ok(!$doc->coll_editor, 'Collection editor'); | 
|  | 96 | #ok(!$doc->coll_author, 'Collection author'); | 
|  | 97 | ok(!$meta->{S_text_type}, 'text_type'); | 
|  | 98 | is($meta->{S_text_type_art}, 'Bericht', 'text_type art'); | 
|  | 99 |  | 
|  | 100 | # ERL/00001 - loaded from corpus/ERL/00001; the expected text sigle is MK2/ERL/00001 | 
|  | 101 | $path = catdir(dirname(__FILE__), 'corpus','ERL','00001'); | 
|  | 102 | ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document'); | 
|  | 103 |  | 
|  | 104 | ok($doc->parse, 'Parse document'); | 
|  | 105 |  | 
|  | 106 | $meta = $doc->meta; | 
|  | 107 | is($meta->{T_title}, 'Amtsblatt des Landesbezirks Baden [diverse Erlasse]', 'title'); # Amtsblatt des Landesbezirks Baden [diverse Erlasse] | 
|  | 108 | # MK2/ERL.00001 | 
|  | 109 |  | 
|  | 110 | ok(!$meta->{T_sub_title}, 'subTitle'); | 
|  | 111 | is($doc->text_sigle, 'MK2/ERL/00001', 'ID'); | 
|  | 112 | is($doc->corpus_sigle, 'MK2', 'corpusID'); | 
|  | 113 | is($meta->{D_pub_date}, '00000000', 'pubDate'); | 
|  | 114 | is($meta->{S_pub_place}, 'Karlsruhe', 'pubPlace'); | 
|  | 115 | is($meta->{K_text_class}->[0], 'politik', 'TextClass'); | 
|  | 116 | is($meta->{K_text_class}->[1], 'kommunalpolitik', 'TextClass'); | 
|  | 117 | ok(!$meta->{K_text_class}->[2], 'TextClass'); | 
|  | 118 | ok(!$meta->{T_author}, 'author'); | 
|  | 119 |  | 
|  | 120 | # Additional information | 
|  | 121 | ok(!$meta->{A_editor}, 'Editor'); | 
|  | 122 | is($meta->{A_publisher}, 'Badenia Verlag und Druckerei', 'Publisher'); | 
|  | 123 | is($meta->{D_creation_date}, '19600000', 'Creation date'); | 
|  | 124 |  | 
|  | 125 | # NOTE: the following diagnostic about the creation date check above is disabled | 
|  | 126 | # diag 'Non-acceptance of creation date ranges may be temporary'; | 
|  | 127 |  | 
|  | 128 |  | 
|  | 129 | #ok(!$doc->coll_title, 'Collection title'); | 
|  | 130 | #ok(!$doc->coll_sub_title, 'Collection subtitle'); | 
|  | 131 | #ok(!$doc->coll_editor, 'Collection editor'); | 
|  | 132 | #ok(!$doc->coll_author, 'Collection author'); | 
|  | 133 | is($meta->{S_text_type}, 'Erlass', 'text_type'); | 
|  | 134 | ok(!$meta->{S_text_type_art}, 'text_type art'); | 
|  | 135 |  | 
|  | 136 |  | 
|  | 137 | # A00/02035-substring - loaded from corpus/A00/02035-substring; the expected text sigle is A00/JAN/02035 | 
|  | 138 | $path = catdir(dirname(__FILE__), 'corpus','A00','02035-substring'); | 
|  | 139 | ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document'); | 
|  | 140 | ok($doc->parse, 'Parse document'); | 
|  | 141 |  | 
|  | 142 | $meta = $doc->meta; | 
|  | 143 |  | 
|  | 144 | is($meta->{T_title}, 'St. Galler Tagblatt, 11.01.2000, Ressort: TB-RSP (Abk.)', 'title'); # A00/JAN.02035 | 
|  | 145 | ok(!$meta->{T_sub_title}, 'subTitle'); | 
|  | 146 | is($doc->text_sigle, 'A00/JAN/02035', 'ID'); | 
|  | 147 | is($doc->corpus_sigle, 'A00', 'corpusID'); | 
|  | 148 | is($meta->{D_pub_date}, '20000111', 'pubDate'); | 
|  | 149 | ok(!$meta->{S_pub_place}, 'pubPlace'); | 
|  | 150 | is($meta->{K_text_class}->[0], 'sport', 'TextClass'); | 
|  | 151 | is($meta->{K_text_class}->[1], 'ballsport', 'TextClass'); | 
|  | 152 | ok(!$meta->{K_text_class}->[2], 'TextClass'); | 
|  | 153 | ok(!$meta->{T_author}, 'author'); | 
|  | 154 |  | 
|  | 155 | # Additional information (creation date is expected to equal the pub date checked above) | 
|  | 156 | ok(!$meta->{A_editor}, 'Editor'); | 
|  | 157 | ok(!$meta->{A_publisher}, 'Publisher'); | 
|  | 158 | is($meta->{D_creation_date}, "20000111", 'Creation date'); | 
|  | 159 | #ok(!$doc->coll_title, 'Collection title'); | 
|  | 160 | #ok(!$doc->coll_sub_title, 'Collection subtitle'); | 
|  | 161 | #ok(!$doc->coll_editor, 'Collection editor'); | 
|  | 162 | #ok(!$doc->coll_author, 'Collection author'); | 
|  | 163 | ok(!$meta->{S_text_type}, 'text_type'); | 
|  | 164 | is($meta->{S_text_type_art}, 'Bericht', 'text_type art'); | 
|  | 165 |  | 
|  | 166 | # A00/02873-meta - loaded from corpus/A00/02873-meta; the expected text sigle is A00/JAN/02873 | 
|  | 167 | $path = catdir(dirname(__FILE__), 'corpus','A00','02873-meta'); | 
|  | 168 | ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document'); | 
|  | 169 | ok($doc->parse, 'Parse document'); | 
|  | 170 | $meta = $doc->meta; | 
|  | 171 |  | 
|  | 172 | is($meta->{T_title}, 'Tradition und Moderne', 'title'); | 
|  | 173 | ok(!$meta->{T_sub_title}, 'subTitle'); | 
|  | 174 | is($doc->text_sigle, 'A00/JAN/02873', 'ID'); | 
|  | 175 | is($doc->corpus_sigle, 'A00', 'corpusID'); | 
|  | 176 | is($meta->{D_pub_date}, '20000113', 'pubDate'); | 
|  | 177 | ok(!$meta->{S_pub_place}, 'pubPlace'); | 
|  | 178 | is($meta->{K_text_class}->[0], 'kultur', 'TextClass'); | 
|  | 179 | is($meta->{K_text_class}->[1], 'film', 'TextClass'); | 
|  | 180 | ok(!$meta->{K_text_class}->[2], 'TextClass'); | 
|  | 181 | ok(!$meta->{T_author}, 'author'); | 
|  | 182 |  | 
|  | 183 |  | 
|  | 184 | # Additional information (creation date is expected to equal the pub date checked above) | 
|  | 185 | ok(!$meta->{A_editor}, 'Editor'); | 
|  | 186 | ok(!$meta->{A_publisher}, 'Publisher'); | 
|  | 187 | is($meta->{D_creation_date}, "20000113", 'Creation date'); | 
|  | 188 | #ok(!$doc->coll_title, 'Collection title'); | 
|  | 189 | #ok(!$doc->coll_sub_title, 'Collection subtitle'); | 
|  | 190 | #ok(!$doc->coll_editor, 'Collection editor'); | 
|  | 191 | #ok(!$doc->coll_author, 'Collection author'); | 
|  | 192 | ok(!$meta->{S_text_type}, 'text_type'); | 
|  | 193 | is($meta->{S_text_type_art}, 'Bericht', 'text_type art'); | 
|  | 194 |  | 
|  | 195 |  | 
|  | 196 | # A00/05663-unbalanced - loaded from corpus/A00/05663-unbalanced; the expected text sigle is A00/JAN/05663 | 
|  | 197 | $path = catdir(dirname(__FILE__), 'corpus','A00','05663-unbalanced'); | 
|  | 198 | ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document'); | 
|  | 199 | ok($doc->parse, 'Parse document'); | 
|  | 200 | $meta = $doc->meta; | 
|  | 201 |  | 
|  | 202 | is($meta->{T_title}, 'Mehr Arbeitslose im Dezember', 'title'); | 
|  | 203 | ok(!$meta->{T_sub_title}, 'subTitle'); | 
|  | 204 | is($doc->text_sigle, 'A00/JAN/05663', 'ID'); | 
|  | 205 | is($doc->corpus_sigle, 'A00', 'corpusID'); | 
|  | 206 | is($meta->{D_pub_date}, '20000124', 'pubDate'); | 
|  | 207 | ok(!$meta->{S_pub_place}, 'pubPlace'); | 
|  | 208 | is($meta->{K_text_class}->[0], 'gesundheit-ernaehrung', 'TextClass'); | 
|  | 209 | is($meta->{K_text_class}->[1], 'gesundheit', 'TextClass'); | 
|  | 210 | ok(!$meta->{K_text_class}->[2], 'TextClass'); | 
|  | 211 | ok(!$meta->{T_author}, 'author'); | 
|  | 212 |  | 
|  | 213 |  | 
|  | 214 | # Additional information (creation date is expected to equal the pub date checked above) | 
|  | 215 | ok(!$meta->{A_editor}, 'Editor'); | 
|  | 216 | ok(!$meta->{A_publisher}, 'Publisher'); | 
|  | 217 | is($meta->{D_creation_date}, "20000124", 'Creation date'); | 
|  | 218 | #ok(!$doc->coll_title, 'Collection title'); | 
|  | 219 | #ok(!$doc->coll_sub_title, 'Collection subtitle'); | 
|  | 220 | #ok(!$doc->coll_editor, 'Collection editor'); | 
|  | 221 | #ok(!$doc->coll_author, 'Collection author'); | 
|  | 222 | ok(!$meta->{S_text_type}, 'text_type'); | 
|  | 223 | is($meta->{S_text_type_art}, 'Bericht', 'text_type art'); | 
|  | 224 |  | 
|  | 225 | # A00/07452-deep - loaded from corpus/A00/07452-deep; the expected text sigle is A00/JAN/07452 | 
|  | 226 | $path = catdir(dirname(__FILE__), 'corpus','A00','07452-deep'); | 
|  | 227 | ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document'); | 
|  | 228 | ok($doc->parse, 'Parse document'); | 
|  | 229 | $meta = $doc->meta; | 
|  | 230 |  | 
|  | 231 | is($meta->{T_title}, 'Wil im Dezember 1999', 'title'); | 
|  | 232 | ok(!$meta->{T_sub_title}, 'subTitle'); | 
|  | 233 | is($doc->text_sigle, 'A00/JAN/07452', 'ID'); | 
|  | 234 | is($doc->corpus_sigle, 'A00', 'corpusID'); | 
|  | 235 | is($meta->{D_pub_date}, '20000129', 'pubDate'); | 
|  | 236 | ok(!$meta->{S_pub_place}, 'pubPlace'); | 
|  | 237 | is($meta->{K_text_class}->[0], 'politik', 'TextClass'); | 
|  | 238 | is($meta->{K_text_class}->[1], 'kommunalpolitik', 'TextClass'); | 
|  | 239 | ok(!$meta->{K_text_class}->[2], 'TextClass'); | 
|  | 240 | ok(!$meta->{T_author}, 'author'); | 
|  | 241 |  | 
|  | 242 |  | 
|  | 243 | # Additional information (creation date is expected to equal the pub date checked above) | 
|  | 244 | ok(!$meta->{A_editor}, 'Editor'); | 
|  | 245 | ok(!$meta->{A_publisher}, 'Publisher'); | 
|  | 246 | is($meta->{D_creation_date}, "20000129", 'Creation date'); | 
|  | 247 | #ok(!$doc->coll_title, 'Collection title'); | 
|  | 248 | #ok(!$doc->coll_sub_title, 'Collection subtitle'); | 
|  | 249 | #ok(!$doc->coll_editor, 'Collection editor'); | 
|  | 250 | #ok(!$doc->coll_author, 'Collection author'); | 
|  | 251 | ok(!$meta->{S_text_type}, 'text_type'); | 
|  | 252 | is($meta->{S_text_type_art}, 'Bericht', 'text_type art'); | 
|  | 253 |  | 
|  | 254 | # Multipath headers: VDI/JAN/00001 - text, doc and corpus sigles plus doc- and corpus-level titles are checked | 
|  | 255 | $path = catdir(dirname(__FILE__), 'corpus','VDI','JAN','00001'); | 
|  | 256 | ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document'); | 
|  | 257 | like($doc->path, qr!\Q$path\E/!, 'Path'); | 
|  | 258 |  | 
|  | 259 | ok($doc = KorAP::XML::Krill->new( path => $path ), 'Load Korap::Document'); | 
|  | 260 | like($doc->path, qr!\Q$path\E/$!, 'Path'); | 
|  | 261 |  | 
|  | 262 | ok($doc->parse, 'Parse document'); | 
|  | 263 | $meta = $doc->meta; | 
|  | 264 |  | 
|  | 265 | is($doc->text_sigle, 'VDI14/JAN/00001', 'text sigle'); | 
|  | 266 | is($doc->doc_sigle, 'VDI14/JAN', 'doc sigle'); | 
|  | 267 | is($meta->corpus_sigle, 'VDI14', 'corpus sigle'); | 
|  | 268 |  | 
|  | 269 | is($meta->{T_title}, '10- Zz mit Zahl', 'title'); | 
|  | 270 |  | 
|  | 271 | ok(!$meta->{T_sub_title}, 'subtitle'); | 
|  | 272 | is($meta->{D_pub_date}, '20140117', 'pubdate'); | 
|  | 273 | is($meta->{S_pub_place}, 'Düsseldorf', 'pubplace'); | 
|  | 274 | is($meta->{T_author}, 'Windhövel, Kerstin', 'author'); | 
|  | 275 | is($meta->{A_publisher}, 'VDI Verlag GmbH', 'publisher'); | 
|  | 276 | ok(!$meta->{A_editor}, 'editor'); | 
|  | 277 |  | 
|  | 278 | ok(!$meta->{S_text_type}, 'text type'); | 
|  | 279 | ok(!$meta->{S_text_type_art}, 'text type art'); | 
|  | 280 | ok(!$meta->{S_text_type_ref}, 'text type ref'); | 
|  | 281 | ok(!$meta->{S_text_column}, 'text column'); | 
|  | 282 | ok(!$meta->{S_text_domain}, 'text domain'); | 
|  | 283 | ok(!$meta->{D_creation_date}, 'creation date'); | 
|  | 284 | ok(!$meta->{S_availability}, 'License'); | 
|  | 285 | ok(!$meta->{pages}, 'Pages'); | 
|  | 286 | ok(!$meta->{A_file_edition_statement}, 'file edition statement'); | 
|  | 287 | ok(!$meta->{A_bibl_edition_statement}, 'bibl edition statement'); | 
|  | 288 | is($meta->{A_reference}, 'VDI nachrichten, 17.01.2014, S. 10; 10- Zz mit Zahl [Ausführliche Zitierung nicht verfügbar]', 'Reference'); | 
|  | 289 |  | 
|  | 290 | ok(!$doc->{S_language}, 'Language'); | 
|  | 291 | # NOTE(review): the check above reads $doc->{S_language} while the neighbouring checks read $meta - confirm this is intended | 
|  | 292 | # diag 'This may be "de" in the future'; | 
|  | 293 |  | 
|  | 294 | is($meta->{T_doc_title}, 'VDI nachrichten, Januar 2014', 'Doc title'); | 
|  | 295 | ok(!$meta->{T_doc_sub_title}, 'Doc Sub title'); | 
|  | 296 | ok(!$meta->{A_doc_editor}, 'Doc editor'); | 
|  | 297 | ok(!$meta->{T_doc_author}, 'Doc author'); | 
|  | 298 |  | 
|  | 299 | is($meta->{T_corpus_title}, 'VDI nachrichten', 'Corpus title'); | 
|  | 300 | ok(!$meta->{T_corpus_sub_title}, 'Corpus Sub title'); | 
|  | 301 | is($meta->{A_corpus_editor}, 'Verein Deutscher Ingenieure', 'Corpus editor'); | 
|  | 302 | ok(!$meta->{T_corpus_author}, 'Corpus author'); | 
|  | 303 |  | 
|  | 304 | is($meta->keywords('K_keywords'), '', 'Keywords'); | 
|  | 305 | is($meta->keywords('K_text_class'), 'Freizeit-Unterhaltung Reisen Politik Ausland', 'Text class'); | 
|  | 306 |  | 
|  | 307 | # WDD/G27/38989 - NOTE(review): the two keywords() calls near the end of this section pass 'keywords' and 'text_class' without the K_ prefix used by the other keywords() calls in this file; confirm this is intended | 
|  | 308 | $path = catdir(dirname(__FILE__), 'corpus','WDD','G27','38989'); | 
|  | 309 | ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document'); | 
|  | 310 | like($doc->path, qr!\Q$path\E/!, 'Path'); | 
|  | 311 | ok($doc->parse, 'Parse document'); | 
|  | 312 | $meta = $doc->meta; | 
|  | 313 |  | 
|  | 314 | is($doc->text_sigle, 'WDD11/G27/38989', 'text sigle'); | 
|  | 315 | is($doc->doc_sigle, 'WDD11/G27', 'doc sigle'); | 
|  | 316 | is($doc->corpus_sigle, 'WDD11', 'corpus sigle'); | 
|  | 317 |  | 
|  | 318 | is($meta->{T_title}, 'Diskussion:Gunter A. Pilz', 'title'); | 
|  | 319 | ok(!$meta->{T_sub_title}, 'subtitle'); | 
|  | 320 | is($meta->{D_pub_date}, '20111029', 'pubdate'); | 
|  | 321 | is($meta->{S_pub_place}, 'URL:http://de.wikipedia.org', 'pubplace'); | 
|  | 322 |  | 
|  | 323 | is($meta->{T_author}, '€pa, u.a.', 'author'); | 
|  | 324 | is($meta->{A_publisher}, 'Wikipedia', 'publisher'); | 
|  | 325 | is($meta->{A_editor}, 'wikipedia.org', 'Editor'); | 
|  | 326 |  | 
|  | 327 | is($meta->{S_text_type}, 'Diskussionen zu Enzyklopädie-Artikeln', 'text type'); | 
|  | 328 | ok(!$meta->{S_text_type_art}, 'text type art'); | 
|  | 329 | ok(!$meta->{S_text_type_ref}, 'text type ref'); | 
|  | 330 | ok(!$meta->{S_text_column}, 'text column'); | 
|  | 331 | ok(!$meta->{S_text_domain}, 'text domain'); | 
|  | 332 |  | 
|  | 333 | is($meta->{D_creation_date}, '20070707', 'creation date'); | 
|  | 334 | is($meta->{S_availability}, 'CC-BY-SA', 'License'); | 
|  | 335 | ok(!$meta->{pages}, 'Pages'); | 
|  | 336 | ok(!$meta->{A_file_edition_statement}, 'file edition statement'); | 
|  | 337 | ok(!$meta->{A_bibl_edition_statement}, 'bibl edition statement'); | 
|  | 338 | is($meta->{A_reference}, 'Diskussion:Gunter A. Pilz, In: Wikipedia - URL:http://de.wikipedia.org/wiki/Diskussion:Gunter_A._Pilz: Wikipedia, 2007', 'Reference'); | 
|  | 339 |  | 
|  | 340 | is($meta->{S_language}, 'de', 'Language'); | 
|  | 341 |  | 
|  | 342 | is($meta->{T_doc_title}, 'Wikipedia, Diskussionen zu Artikeln mit Anfangsbuchstabe G, Teil 27', 'Doc title'); | 
|  | 343 | ok(!$meta->{T_doc_sub_title}, 'Doc Sub title'); | 
|  | 344 | ok(!$meta->{A_doc_editor}, 'Doc editor'); | 
|  | 345 | ok(!$meta->{T_doc_author}, 'Doc author'); | 
|  | 346 |  | 
|  | 347 | is($meta->{T_corpus_title}, 'Wikipedia', 'Corpus title'); | 
|  | 348 | ok(!$meta->{T_corpus_sub_title}, 'Corpus Sub title'); | 
|  | 349 | is($meta->{A_corpus_editor}, 'wikipedia.org', 'Corpus editor'); | 
|  | 350 | ok(!$meta->{T_corpus_author}, 'Corpus author'); | 
|  | 351 |  | 
|  | 352 | is($meta->keywords('keywords'), '', 'Keywords'); | 
|  | 353 | is($meta->keywords('text_class'), '', 'Text class'); | 
|  | 354 |  | 
|  | 355 | is($meta->{S_availability}, 'CC-BY-SA', 'Availability'); | 
|  | 356 |  | 
|  | 357 | use_ok('KorAP::XML::Meta::I5'); | 
|  | 358 | # Feed the idsHeader elements of one I5 file into a single meta object in turn: corpus, then doc, then text | 
|  | 359 | $path = catdir(dirname(__FILE__), 'corpus', 'I5', 'rei-example.i5'); | 
|  | 360 | ok($meta = KorAP::XML::Meta::I5->new, 'Construct meta object'); | 
|  | 361 | my $dom = Mojo::DOM->new->parse(Mojo::File->new($path)->slurp); | 
|  | 362 | ok($meta->parse($dom->at('idsHeader'), 'corpus'), 'Parse corpus header'); | 
|  | 363 |  | 
|  | 364 | my $hash = $meta->to_hash; | 
|  | 365 | is($hash->{S_availability}, 'CC-BY-SA', 'Availability'); | 
|  | 366 | is($hash->{S_language}, 'de', 'Language'); | 
|  | 367 | is($hash->{T_corpus_title}, 'Reden und Interviews', 'Corpus title'); | 
|  | 368 | is($hash->{corpus_sigle}, 'REI', 'Corpus Sigle'); | 
|  | 369 | # Second idsHeader (doc level): the corpus-level values checked above are expected to persist | 
|  | 370 | ok($meta->parse($dom->find('idsHeader')->[1], 'doc'), 'Parse doc header'); | 
|  | 371 |  | 
|  | 372 | $hash = $meta->to_hash; | 
|  | 373 | is($hash->{S_availability}, 'CC-BY-SA', 'Availability'); | 
|  | 374 | is($hash->{S_language}, 'de', 'Language'); | 
|  | 375 | is($hash->{T_corpus_title}, 'Reden und Interviews', 'Corpus title'); | 
|  | 376 | is($hash->{corpus_sigle}, 'REI', 'Corpus Sigle'); | 
|  | 377 | is($hash->{doc_sigle}, 'REI/BNG', 'Document Sigle'); | 
|  | 378 | is($hash->{T_doc_title}, 'Reden der Bundestagsfraktion Bündnis 90/DIE GRÜNEN, (2002-2006)', 'Document title'); | 
|  | 379 | # Third idsHeader (text level): the corpus- and doc-level values are expected to persist | 
|  | 380 | ok($meta->parse($dom->find('idsHeader')->[2], 'text'), 'Parse text header'); | 
|  | 381 |  | 
|  | 382 | $hash = $meta->to_hash; | 
|  | 383 | is($hash->{S_availability}, 'CC-BY-SA', 'Availability'); | 
|  | 384 | is($hash->{S_language}, 'de', 'Language'); | 
|  | 385 | is($hash->{T_corpus_title}, 'Reden und Interviews', 'Corpus title'); | 
|  | 386 | is($hash->{corpus_sigle}, 'REI', 'Corpus Sigle'); | 
|  | 387 | is($hash->{doc_sigle}, 'REI/BNG', 'Document Sigle'); | 
|  | 388 | is($hash->{T_doc_title}, 'Reden der Bundestagsfraktion Bündnis 90/DIE GRÜNEN, (2002-2006)', 'Document title'); | 
|  | 389 |  | 
|  | 390 | is($hash->{text_sigle}, 'REI/BNG/00001', 'Text sigle'); | 
|  | 391 | is($hash->{T_title}, 'Energiewirtschaft', 'Title'); | 
|  | 392 | is($hash->{T_sub_title}, 'Rede im Deutschen Bundestag am 19.01.2002', 'Sub title'); | 
|  | 393 | is($hash->{D_creation_date}, '20020119', 'Creation date'); | 
|  | 394 | is($hash->{D_pub_date}, '20020119', 'Pub date'); | 
|  | 395 | is($hash->{S_pub_place_key}, 'DE', 'Pub place key'); | 
|  | 396 | is($hash->{A_reference}, 'Hustedt, Michaele: Energiewirtschaft. Rede im Deutschen Bundestag am 19.01.2002, Hrsg: Bundestagsfraktion Bündnis 90/DIE GRÜNEN [Ausführliche Zitierung nicht verfügbar]', 'Reference'); | 
|  | 397 | is($hash->{K_text_class}->[0], 'politik', 'Text class'); | 
|  | 398 | is($hash->{K_text_class}->[1], 'inland', 'Text class'); | 
|  | 399 | is($hash->{T_author}, 'Hustedt, Michaele', 'Author'); | 
|  | 400 | is($hash->{S_pub_place}, 'Berlin', 'Pub place'); | 
|  | 401 |  | 
|  | 402 |  | 
|  | 403 | # UMB45/D38/00001 - the path is passed without a trailing slash; path() is expected to contain it followed by a slash | 
|  | 404 | $path = catdir(dirname(__FILE__), 'corpus','UMB45','D38','00001'); | 
|  | 405 | ok($doc = KorAP::XML::Krill->new( path => $path), 'Load Korap::Document'); | 
|  | 406 | like($doc->path, qr!\Q$path\E/!, 'Path'); | 
|  | 407 |  | 
|  | 408 | ok($doc->parse, 'Parse document'); | 
|  | 409 | $meta = $doc->meta; | 
|  | 410 |  | 
|  | 411 | is($doc->text_sigle, 'UMB45/D38/00001', 'text sigle'); | 
|  | 412 | is($doc->doc_sigle, 'UMB45/D38', 'doc sigle'); | 
|  | 413 | is($doc->corpus_sigle, 'UMB45', 'corpus sigle'); | 
|  | 414 |  | 
|  | 415 | is($meta->{T_title}, 'In: Über Schuld und Aufgabe der geistigen Führungsschicht im deutschen politischen Leben der Gegenwart. - Göttingen, 1955', 'title'); | 
|  | 416 |  | 
|  | 417 |  | 
|  | 418 | done_testing; | 
|  | 419 | __END__ | 
|  | 420 |  | 
|  | 421 |  | 
|  | 422 |  |