blob: c9a83c7ad6c3a1aa490d3559c89ece7a31143cdd [file] [log] [blame]
Akron8c84aa52016-02-13 21:26:54 +01001use strict;
2use warnings;
3use Test::More;
4use File::Basename 'dirname';
5use File::Spec::Functions 'catdir';
6use Data::Dumper;
7use KorAP::XML::Tokenizer;
8use KorAP::XML::Krill;
9use utf8;
10
# First document: fixture directory CMC-TSK/2014-09/2843, read with
# meta_type 'Sgbr'.
my $path = catdir(dirname(__FILE__), 'CMC-TSK', '2014-09', '2843');

ok(my $doc = KorAP::XML::Krill->new(
  path => $path . '/',
  meta_type => 'Sgbr'
), 'Create Document');

ok($doc->parse, 'Parse document');

# \Q...\E quotes the directory name so it is matched literally
like($doc->path, qr!\Q$path\E/!, 'Path');

# Metadata
is($doc->text_sigle, 'CMC-TSK/2014-09/2843', 'ID-text');

is($doc->doc_sigle, 'CMC-TSK/2014-09', 'ID-doc');
is($doc->corpus_sigle, 'CMC-TSK', 'ID-corpus');

my $meta = $doc->meta;

# Text-level fields that are expected to be set
is($meta->{T_title}, '@ Koelle_am_Rhing 10:18', 'title');

ok(!$meta->{T_sub_title}, 'no subtitle');

is($meta->{A_publisher}, 'tagesschau.de', 'Publisher');

is($meta->{D_pub_date}, '20140930', 'Publication date');

ok(!$meta->{S_pub_place}, 'No pub place');

# Document-level fields
is($meta->{T_doc_title}, 'Korpus zur Beobachtung des Schreibgebrauchs im Deutschen', 'Doc title');
is($meta->{T_doc_sub_title}, 'Subkorpus Internettexte, Subkorpus Leserkommentare Tagesschau, Subkorpus September 2014, Subkorpus Beispielauszug', 'Doc Sub title');

is($meta->{'A_funder'}, 'Bundesministerium für Bildung und Forschung', 'Funder');

is($meta->{T_author}, 'privat23', 'Author');
ok(!$meta->{'S_sgbr_author_sex'}, 'No Sex');
ok(!$meta->{'S_sgbr_kodex'}, 'No kodex');
is($meta->{A_reference}, 'http://meta.tagesschau.de/node/090285#comment-1732187', 'Publace ref');

# keywords() is expected to yield an empty string for this document
is($meta->keywords('K_keywords'), '', 'No keywords');

is($meta->{S_language}, 'de', 'Language');

# All remaining text-level fields are expected to be unset (false)
ok(!$meta->{A_editor}, 'Editor');

ok(!$meta->{S_text_type}, 'Text Type');
ok(!$meta->{S_text_type_art}, 'Text Type Art');
ok(!$meta->{S_text_type_ref}, 'Text Type Ref');
ok(!$meta->{S_text_column}, 'Text Column');
ok(!$meta->{S_text_domain}, 'Text Domain');
ok(!$meta->{D_creation_date}, 'Creation Date');
ok(!$meta->{S_license}, 'License');
ok(!$meta->{A_pages}, 'Pages');
ok(!$meta->{A_file_edition_statement}, 'File Edition Statement');
ok(!$meta->{A_bibl_edition_statement}, 'Bibl Edition Statement');

# Document- and corpus-level editor/author/title fields are expected to be unset
ok(!$meta->{A_doc_editor}, 'Doc: editor');
ok(!$meta->{T_doc_author}, 'Doc: author');

ok(!$meta->{T_corpus_title}, 'Corpus: title');
ok(!$meta->{T_corpus_sub_title}, 'Corpus: subtitle');
ok(!$meta->{A_corpus_editor}, 'Corpus: editor');
ok(!$meta->{T_corpus_author}, 'Corpus: author');

# The hash produced by to_hash carries the text title under 'title'
my $hash = $doc->to_hash;
is($hash->{title}, '@ Koelle_am_Rhing 10:18', 'Title in hash');
77
# Second document: fixture directory CMC-TSK/2014-09/3401, read with
# meta_type 'Sgbr'.
$path = catdir(dirname(__FILE__), 'CMC-TSK', '2014-09', '3401');

ok($doc = KorAP::XML::Krill->new(
  path => $path . '/',
  meta_type => 'Sgbr'
), 'Create Document');

ok($doc->parse, 'Parse document');

# \Q...\E quotes the directory name so it is matched literally
like($doc->path, qr!\Q$path\E/!, 'Path');

# Metadata
is($doc->text_sigle, 'CMC-TSK/2014-09/3401', 'ID-text');

is($doc->doc_sigle, 'CMC-TSK/2014-09', 'ID-doc');
is($doc->corpus_sigle, 'CMC-TSK', 'ID-corpus');


# Text-level fields that are expected to be set
$meta = $doc->meta;
is($meta->{T_title}, '@fitnessfrosch', 'title');

ok(!$meta->{T_sub_title}, 'no subtitle');

is($meta->{A_publisher}, 'tagesschau.de', 'Publisher');

is($meta->{D_pub_date}, '20141001', 'Publication date');
is($meta->{'D_sgbr_date'}, '2014-10-01 00:50:00', 'Sgbr date');

ok(!$meta->{S_pub_place}, 'No pub place');

# Document-level fields
is($meta->{T_doc_title}, 'Korpus zur Beobachtung des Schreibgebrauchs im Deutschen', 'Doc title');
is($meta->{T_doc_sub_title}, 'Subkorpus Internettexte, Subkorpus Leserkommentare Tagesschau, Subkorpus September 2014, Subkorpus Beispielauszug', 'Doc Sub title');

is($meta->{'A_funder'}, 'Bundesministerium für Bildung und Forschung', 'Funder');

is($meta->{T_author}, 'weltoffen', 'Author');
ok(!$meta->{'S_sgbr_author_sex'}, 'No Sex');
ok(!$meta->{'S_sgbr_kodex'}, 'No kodex');
is($meta->{A_reference}, 'http://meta.tagesschau.de/node/090308#comment-1732754', 'Publace ref');

# keywords() is expected to yield an empty string for this document
is($meta->keywords('K_keywords'), '', 'No keywords');

is($meta->{S_language}, 'de', 'Language');

# All remaining text-level fields are expected to be unset (false)
ok(!$meta->{A_editor}, 'Editor');

ok(!$meta->{S_text_type}, 'Text Type');
ok(!$meta->{S_text_type_art}, 'Text Type Art');
ok(!$meta->{S_text_type_ref}, 'Text Type Ref');
ok(!$meta->{S_text_column}, 'Text Column');
ok(!$meta->{S_text_domain}, 'Text Domain');
ok(!$meta->{D_creation_date}, 'Creation Date');
ok(!$meta->{S_license}, 'License');
ok(!$meta->{A_pages}, 'Pages');
ok(!$meta->{A_file_edition_statement}, 'File Edition Statement');
ok(!$meta->{A_bibl_edition_statement}, 'Bibl Edition Statement');

# Document- and corpus-level editor/author/title fields are expected to be unset
ok(!$meta->{A_doc_editor}, 'Doc: editor');
ok(!$meta->{T_doc_author}, 'Doc: author');

ok(!$meta->{T_corpus_title}, 'Corpus: title');
ok(!$meta->{T_corpus_sub_title}, 'Corpus: subtitle');
ok(!$meta->{A_corpus_editor}, 'Corpus: editor');
ok(!$meta->{T_corpus_author}, 'Corpus: author');

# The hash produced by to_hash carries the text title under 'title'
$hash = $doc->to_hash;
is($hash->{title}, '@fitnessfrosch', 'Title in hash');

done_testing;
__END__
149