blob: 66ed637bd3aabe0457ccaf0ac2e6a457b7bfcaf2 [file] [log] [blame]
Akron88d063a2022-03-21 15:10:01 +01001use strict;
2use warnings;
3use Test::More;
4use Data::Dumper;
5use JSON::XS;
6
# Real-corpus tests can be switched off through the SKIP_REAL
# environment variable; the whole file is then reported as skipped.
plan skip_all => 'Skip real tests' if $ENV{SKIP_REAL};
10
11use utf8;
12use lib 'lib', '../lib';
13
14use File::Basename 'dirname';
15use File::Spec::Functions 'catdir';
16
# Every module exercised below has to load cleanly
for my $module (
  'KorAP::XML::Krill',
  'KorAP::XML::Meta::I5',
  'KorAP::XML::Meta::NKJP',
  'KorAP::XML::Annotation::NKJP::NamedEntities',
) {
  use_ok($module);
}

# Fixture directory of the "KOT" sample text
my $path = catdir(dirname(__FILE__), qw/corpus NKJP NKJP KOT/);

# No language is passed here; the assertions below expect the
# Polish metadata values.
my $doc = KorAP::XML::Krill->new(path => "$path/", meta_type => 'NKJP');
ok($doc, 'Load Korap::Document');
ok($doc->parse, 'Parse document');

# Sigles at text, document and corpus level:
# [accessor, expected value, test name]
for my $sigle (
  [text_sigle   => 'NKJP/NKJP/KOT', 'Correct text sigle'],
  [doc_sigle    => 'NKJP/NKJP',     'Correct document sigle'],
  [corpus_sigle => 'NKJP',          'Correct corpus sigle'],
) {
  my ($accessor, $expected, $label) = @$sigle;
  is($doc->$accessor, $expected, $label);
}

my $meta = $doc->meta;

is($meta->{T_title}, 'TEI P5 encoded version of sample(s) of "Kot"', 'Title');
is($meta->{T_corpus_title}, 'Narodowy Korpus Języka Polskiego -- podkorpus zawierający 1 milion słów', 'Title');

# Channel and type are list valued; exactly one entry is expected in each:
# [metadata key, expected first value, test name]
for my $field (
  [K_nkjp_channel => 'miesiecznik',                       'NKJP-Channel'],
  [K_nkjp_type    => 'publicystyka i wiadomości prasowe', 'NKJP-Type'],
) {
  my ($key, $first, $label) = @$field;
  is($meta->{$key}->[0], $first, $label);
  ok(!$meta->{$key}->[1], $label);
}
40
# Load the same text once more, this time asking for English metadata
$doc = KorAP::XML::Krill->new(path => "$path/", meta_type => 'NKJP', lang => 'en');
ok($doc, 'Load Korap::Document');
ok($doc->parse, 'Parse document');
$meta = $doc->meta;

is($meta->{T_title}, 'TEI P5 encoded version of sample(s) of "Kot"', 'Title');
is($meta->{T_corpus_title}, 'National Corpus of Polish -- the 1 million word subcorpus', 'Language sensitive Title');

# Metadata fields that must stay empty for this text:
# [metadata key, test name]
for my $absent (
  [T_sub_title     => 'SubTitle'],
  [T_author        => 'Author'],
  [A_editor        => 'Editor'],
  [S_pub_place     => 'PubPlace'],
  [A_publisher     => 'Publisher'],
  [S_text_type     => 'No Text Type'],
  [S_text_type_art => 'No Text Type Art'],
  [S_text_type_ref => 'No Text Type Ref'],
  [S_text_domain   => 'No Text Domain'],
  [S_text_column   => 'No Text Column'],
) {
  my ($key, $label) = @$absent;
  ok(!$meta->{$key}, $label);
}

# Channel and type are expected in English now, still single valued:
# [metadata key, expected first value, test name]
for my $field (
  [K_nkjp_channel => 'monthly',    'NKJP-Channel'],
  [K_nkjp_type    => 'journalism', 'NKJP-Type'],
) {
  my ($key, $first, $label) = @$field;
  is($meta->{$key}->[0], $first, $label);
  ok(!$meta->{$key}->[1], $label);
}
Akron88d063a2022-03-21 15:10:01 +010064
# Tokenization
use_ok('KorAP::XML::Tokenizer');

# Foundry and layer handed to the tokenizer (reused further down)
my ($token_base_foundry, $token_base_layer) = ('nkjp', 'Morpho');

# Get tokenization
my $tokens = KorAP::XML::Tokenizer->new(
  path    => $doc->path,
  doc     => $doc,
  foundry => $token_base_foundry,
  layer   => $token_base_layer,
  name    => 'tokens',
);
ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

# Decode the JSON serialisation to inspect the term stream
my $output = decode_json($tokens->to_json);

# Spot checks: [stream position, term index, expected term]
for my $probe (
  [0, 0, '-:tokens$<i>43'],
  [0, 3, 'i:nie'],
  [1, 2, 's:zdążyła'],
) {
  my ($position, $index, $term) = @$probe;
  is($output->{data}->{stream}->[$position]->[$index], $term, 't');
}
86
## Base
# Add the DeReKo structure annotation and the NKJP morphology to the
# token stream, then inspect the resulting data structure.
ok(
  $tokens->add('DeReKo', 'Structure', 'base_sentences_paragraphs'),
  'Add DeReKo structure (base_sentences_paragraphs)'
);
ok($tokens->add('NKJP', 'Morpho'), 'Add Morpho');

$output = $tokens->to_data;

is($output->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base_sentences_paragraphs nkjp nkjp/morpho', 'Foundries');

is($output->{data}->{layerInfos}, 'dereko/s=spans nkjp/l=tokens nkjp/m=tokens nkjp/ov=tokens nkjp/p=tokens', 'layerInfos');

# All terms at stream position 7, joined so each one can be probed
# with a regular expression.
my $token = join('||', @{$output->{data}->{stream}->[7]});

# Every term carries its layer prefix ('i:', 's:', 'nkjp/l:', ...), so
# the surface and lemma probes spell the prefix out. The position term
# follows the '_<position>$<i><start><i><end>' shape used for the other
# probed tokens in this file; the offsets are not pinned here.
# Each entry: [pattern, test name]
for my $probe (
  [qr!<>:dereko/s:seg\$<b>64!, 'Token 7: dereko/s:seg span'],
  [qr!_7\$<i>\d+<i>\d+!,       'Token 7: position term'],
  [qr!i:sadu!,                 'Token 7: i: term'],
  [qr!nkjp/l:sad!,             'Token 7: nkjp/l term'],
  [qr!nkjp/m:number:sg!,       'Token 7: nkjp/m number'],
  [qr!nkjp/m:case:gen!,        'Token 7: nkjp/m case'],
  [qr!nkjp/m:gender:m3!,       'Token 7: nkjp/m gender'],
  [qr!nkjp/p:subst!,           'Token 7: nkjp/p term'],
  [qr!s:sadu!,                 'Token 7: s: term'],
) {
  my ($pattern, $label) = @$probe;
  like($token, $pattern, $label);
}
108
Akronafb98562022-06-08 14:45:09 +0200109
# KolakowskiOco
$path = catdir(dirname(__FILE__), qw/corpus NKJP NKJP KolakowskiOco/);

# Polish is requested explicitly for this text
$doc = KorAP::XML::Krill->new(path => "$path/", meta_type => 'NKJP', lang => 'pl');
ok($doc, 'Load Korap::Document');
ok($doc->parse, 'Parse document');

# Sigles at text, document and corpus level:
# [accessor, expected value, test name]
for my $sigle (
  [text_sigle   => 'NKJP/NKJP/KolakowskiOco', 'Correct text sigle'],
  [doc_sigle    => 'NKJP/NKJP',               'Correct document sigle'],
  [corpus_sigle => 'NKJP',                    'Correct corpus sigle'],
) {
  my ($accessor, $expected, $label) = @$sigle;
  is($doc->$accessor, $expected, $label);
}

$meta = $doc->meta;

is($meta->{T_title}, 'TEI P5 encoded version of sample(s) of "O co nas pytają wielcy filozofowie. Seria 3 "', 'Title');

# Metadata fields that must stay empty for this text:
# [metadata key, test name]
for my $absent (
  [T_sub_title     => 'SubTitle'],
  [T_author        => 'Author'],
  [A_editor        => 'Editor'],
  [S_pub_place     => 'PubPlace'],
  [A_publisher     => 'Publisher'],
  [S_text_type     => 'No Text Type'],
  [S_text_type_art => 'No Text Type Art'],
  [S_text_type_ref => 'No Text Type Ref'],
  [S_text_domain   => 'No Text Domain'],
  [S_text_column   => 'No Text Column'],
) {
  my ($key, $label) = @$absent;
  ok(!$meta->{$key}, $label);
}

# Channel and type are list valued; exactly one entry is expected in each:
# [metadata key, expected first value, test name]
for my $field (
  [K_nkjp_channel => 'książka',           'NKJP-Channel'],
  [K_nkjp_type    => 'literatura piękna', 'NKJP-Type'],
) {
  my ($key, $first, $label) = @$field;
  is($meta->{$key}->[0], $first, $label);
  ok(!$meta->{$key}->[1], $label);
}
139
140
# Get tokenization
$tokens = KorAP::XML::Tokenizer->new(
  path    => $doc->path,
  doc     => $doc,
  foundry => $token_base_foundry,
  layer   => $token_base_layer,
  name    => 'tokens'
);
ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

# Decode the JSON serialisation to inspect the term stream
$output = decode_json($tokens->to_json);

# Spot checks: [stream position, term index, expected term]
for my $probe (
  [0, 0, '-:tokens$<i>117'],
  [0, 3, 'i:czy'],
  [1, 2, 's:zdarza'],
) {
  my ($position, $index, $term) = @$probe;
  is($output->{data}->{stream}->[$position]->[$index], $term, 't');
}
157
## Base
# Add the DeReKo structure annotation, the NKJP morphology and the NKJP
# named entities to the token stream, then inspect the resulting data.
ok(
  $tokens->add('DeReKo', 'Structure', 'base_sentences_paragraphs'),
  'Add DeReKo structure (base_sentences_paragraphs)'
);
ok($tokens->add('NKJP', 'Morpho'), 'Add Morpho');
ok($tokens->add('NKJP', 'NamedEntities'), 'Add NamedEntities');

$output = $tokens->to_data;

is($output->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base_sentences_paragraphs nkjp nkjp/morpho nkjp/namedentities', 'Foundries');

is($output->{data}->{layerInfos}, 'dereko/s=spans nkjp/l=tokens nkjp/m=tokens nkjp/ne=tokens nkjp/ov=tokens nkjp/p=tokens', 'layerInfos');

# All terms at stream position 5, joined so each one can be probed
# with a regular expression.
$token = join('||', @{$output->{data}->{stream}->[5]});

# Each entry: [pattern, test name]. The names make a failing probe
# identifiable without counting test numbers.
for my $probe (
  [qr!<>:dereko/s:seg\$<b>64<i>23<i>28<i>6<b>4<s>1!, 'Token 5: dereko/s:seg span'],
  [qr!_5\$<i>23<i>28!,                               'Token 5: position term'],
  [qr!i:takie!,                                      'Token 5: i: term'],
  [qr!nkjp/l:taki!,                                  'Token 5: nkjp/l term'],
  [qr!nkjp/m:number:sg!,                             'Token 5: nkjp/m number'],
  [qr!nkjp/m:case:nom!,                              'Token 5: nkjp/m case'],
  [qr!nkjp/m:gender:n!,                              'Token 5: nkjp/m gender'],
  [qr!nkjp/m:degree:pos!,                            'Token 5: nkjp/m degree'],
  [qr!nkjp/p:adj!,                                   'Token 5: nkjp/p term'],
  [qr!s:takie!,                                      'Token 5: s: term'],
  [qr!nkjp/ov:takie!,                                'Token 5: nkjp/ov term'],
) {
  my ($pattern, $label) = @$probe;
  like($token, $pattern, $label);
}

# Same probing for stream position 67, which additionally carries
# span attributes and a named-entity term.
$token = join('||', @{$output->{data}->{stream}->[67]});

for my $probe (
  [qr!<>:dereko/s:seg\$<b>64<i>464<i>475<i>68<b>4<s>1!, 'Token 67: dereko/s:seg span'],
  [qr!\@:dereko\/s:corresp:ann_segmentation\.xml\\#segm_2\.2-seg\$<b>17<s>1<i>68!, 'Token 67: corresp attribute'],
  [qr!\@:dereko\/s:id:morph_2\.2-seg\$<b>17<s>1<i>68!,  'Token 67: id attribute'],
  [qr!_67\$<i>464<i>475!,                               'Token 67: position term'],
  [qr!i:kierkegaard!,                                   'Token 67: i: term'],
  [qr!nkjp/l:Kierkegaard!,                              'Token 67: nkjp/l term'],
  [qr!nkjp/m:number:sg!,                                'Token 67: nkjp/m number'],
  [qr!nkjp/m:case:nom!,                                 'Token 67: nkjp/m case'],
  [qr!nkjp/m:gender:m1!,                                'Token 67: nkjp/m gender'],
  [qr!nkjp/ne:persName:surname!,                        'Token 67: nkjp/ne term'],
  [qr!nkjp/p:subst!,                                    'Token 67: nkjp/p term'],
  [qr!s:Kierkegaard!,                                   'Token 67: s: term'],
  [qr!nkjp/ov:Kierkegaard!,                             'Token 67: nkjp/ov term'],
) {
  my ($pattern, $label) = @$probe;
  like($token, $pattern, $label);
}

done_testing;
202__END__
203