blob: 8f94f440a4f8f9c6f74cbd0b301396eae4308060 [file] [log] [blame]
Akron88d063a2022-03-21 15:10:01 +01001use strict;
2use warnings;
3use Test::More;
4use Data::Dumper;
5use JSON::XS;
6
# Skip the whole file when SKIP_REAL is set in the environment.
plan skip_all => 'Skip real tests' if $ENV{SKIP_REAL};
10
11use utf8;
12use lib 'lib', '../lib';
13
14use File::Basename 'dirname';
15use File::Spec::Functions 'catdir';
16
# Check that each module used by the tests below can be loaded.
for my $module (
  'KorAP::XML::Krill',
  'KorAP::XML::Meta::I5',
  'KorAP::XML::Annotation::NKJP::NamedEntities',
) {
  use_ok($module);
}
Akron88d063a2022-03-21 15:10:01 +010020
# KOT: load the document from the test corpus directory next to this file
# and check its sigles and metadata.
my $path = catdir(dirname(__FILE__), 'corpus', 'NKJP', 'NKJP', 'KOT');

ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');

is($doc->text_sigle, 'NKJP/NKJP/KOT', 'Correct text sigle');
is($doc->doc_sigle, 'NKJP/NKJP', 'Correct document sigle');
is($doc->corpus_sigle, 'NKJP', 'Correct corpus sigle');

my $meta = $doc->meta;

# Without a lang option the corpus title is expected in Polish.
is($meta->{T_title}, 'TEI P5 encoded version of sample(s) of "Kot"', 'Title');
is($meta->{T_corpus_title}, 'Narodowy Korpus Języka Polskiego -- podkorpus zawierający 1 milion słów', 'Corpus title');

# Load the same document again with lang => 'en': the text title is expected
# to be unchanged, while the corpus title is expected in English.
ok($doc = KorAP::XML::Krill->new( path => $path . '/', lang => 'en' ), 'Load Korap::Document (lang => en)');
ok($doc->parse, 'Parse document (lang => en)');
$meta = $doc->meta;

is($meta->{T_title}, 'TEI P5 encoded version of sample(s) of "Kot"', 'Title (lang => en)');
is($meta->{T_corpus_title}, 'National Corpus of Polish -- the 1 million word subcorpus', 'Language sensitive Title');

# The following metadata fields are expected to be absent (or false).
ok(!$meta->{T_sub_title}, 'SubTitle');
ok(!$meta->{T_author}, 'Author');
ok(!$meta->{A_editor}, 'Editor');
ok(!$meta->{S_pub_place}, 'PubPlace');
ok(!$meta->{A_publisher}, 'Publisher');

ok(!$meta->{S_text_type}, 'No Text Type');
ok(!$meta->{S_text_type_art}, 'No Text Type Art');
ok(!$meta->{S_text_type_ref}, 'No Text Type Ref');
ok(!$meta->{S_text_domain}, 'No Text Domain');
ok(!$meta->{S_text_column}, 'No Text Column');
53
54
# Tokenization
use_ok('KorAP::XML::Tokenizer');

# Foundry and layer passed to the tokenizer; the same pair is reused for
# the KolakowskiOco document further down.
my ($token_base_foundry, $token_base_layer) = ('nkjp', 'Morpho');

# Get tokenization
my $tokens = KorAP::XML::Tokenizer->new(
  path    => $doc->path,
  doc     => $doc,
  foundry => $token_base_foundry,
  layer   => $token_base_layer,
  name    => 'tokens',
);
ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

# Round-trip through JSON and spot-check individual terms in the stream.
my $output = decode_json( $tokens->to_json );

is($output->{data}->{stream}->[0]->[0], '-:tokens$<i>43', 'KOT: token 0, term 0');
is($output->{data}->{stream}->[0]->[3], 'i:nie', 'KOT: token 0, term 3');
is($output->{data}->{stream}->[1]->[2], 's:zdążyła', 'KOT: token 1, term 2');
76
## Base
# Add the structure and morphology annotations to the token stream.
ok($tokens->add('DeReKo', 'Structure', 'base_sentences_paragraphs'), 'Add base sentences and paragraphs');
ok($tokens->add('NKJP', 'Morpho'), 'Add Morpho');

$output = $tokens->to_data;

is($output->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base_sentences_paragraphs nkjp nkjp/morpho', 'Foundries');

is($output->{data}->{layerInfos}, 'dereko/s=spans nkjp/l=tokens nkjp/m=tokens nkjp/p=tokens', 'layerInfos');

# Join all terms of token 7 so that each expectation is a single regex match.
my $token = join('||', @{$output->{data}->{stream}->[7]});

# NOTE(review): `iadu`, `nkjp/lad` and `sadu` carry no `i:` / `l:` / `s:`
# prefix, unlike the corresponding checks for the KolakowskiOco document
# further down (`i:takie`, `nkjp/l:taki`, `s:takie`) - confirm against the
# fixture that these patterns are intended.
for my $expected (
  qr!<>:dereko/s:seg\$<b>64!,
  qr!iadu!,
  qr!nkjp/lad!,
  qr!nkjp/m:sg:gen:m3!,
  qr!nkjp/p:subst!,
  qr!sadu!,
) {
  like($token, $expected, "KOT: token 7 matches $expected");
}
96
Akronafb98562022-06-08 14:45:09 +020097
# KolakowskiOco
# Second document: same sigle and metadata checks as for the first one.
$path = catdir(dirname(__FILE__), qw(corpus NKJP NKJP KolakowskiOco));

$doc = KorAP::XML::Krill->new( path => $path . '/' );
ok($doc, 'Load Korap::Document');
ok($doc->parse, 'Parse document');

# Accessor name, expected sigle, test description.
for my $sigle_check (
  [ text_sigle   => 'NKJP/NKJP/KolakowskiOco', 'Correct text sigle' ],
  [ doc_sigle    => 'NKJP/NKJP',               'Correct document sigle' ],
  [ corpus_sigle => 'NKJP',                    'Correct corpus sigle' ],
) {
  my ($accessor, $expected, $label) = @$sigle_check;
  is($doc->$accessor, $expected, $label);
}

$meta = $doc->meta;

is($meta->{T_title}, 'TEI P5 encoded version of sample(s) of "O co nas pytają wielcy filozofowie. Seria 3 "', 'Title');

# Metadata key and test description; each key is expected to be absent (or false).
for my $absent_check (
  [ T_sub_title     => 'SubTitle' ],
  [ T_author        => 'Author' ],
  [ A_editor        => 'Editor' ],
  [ S_pub_place     => 'PubPlace' ],
  [ A_publisher     => 'Publisher' ],
  [ S_text_type     => 'No Text Type' ],
  [ S_text_type_art => 'No Text Type Art' ],
  [ S_text_type_ref => 'No Text Type Ref' ],
  [ S_text_domain   => 'No Text Domain' ],
  [ S_text_column   => 'No Text Column' ],
) {
  my ($key, $label) = @$absent_check;
  ok(!$meta->{$key}, $label);
}
122
# Get tokenization
# Same foundry/layer pair as for the first document.
$tokens = KorAP::XML::Tokenizer->new(
  path    => $doc->path,
  doc     => $doc,
  foundry => $token_base_foundry,
  layer   => $token_base_layer,
  name    => 'tokens',
);
ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

# Round-trip through JSON and spot-check individual terms in the stream.
$output = decode_json( $tokens->to_json );

is($output->{data}->{stream}->[0]->[0], '-:tokens$<i>117', 'KolakowskiOco: token 0, term 0');
is($output->{data}->{stream}->[0]->[3], 'i:czy', 'KolakowskiOco: token 0, term 3');
is($output->{data}->{stream}->[1]->[2], 's:zdarza', 'KolakowskiOco: token 1, term 2');
139
## Base
# Add structure, morphology and named-entity annotations to the token stream.
ok($tokens->add('DeReKo', 'Structure', 'base_sentences_paragraphs'), 'Add base sentences and paragraphs');
ok($tokens->add('NKJP', 'Morpho'), 'Add Morpho');
ok($tokens->add('NKJP', 'NamedEntities'), 'Add NamedEntities');

$output = $tokens->to_data;

is($output->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base_sentences_paragraphs nkjp nkjp/morpho nkjp/namedentities', 'Foundries');

is($output->{data}->{layerInfos}, 'dereko/s=spans nkjp/l=tokens nkjp/m=tokens nkjp/ne=tokens nkjp/p=tokens', 'layerInfos');

# Stream index => patterns that the '||'-joined terms of that token must match.
for my $token_check (
  [
    5 => [
      qr!<>:dereko/s:seg\$<b>64<i>23<i>28<i>6<b>4<s>1!,
      qr!_5\$<i>23<i>28!,
      qr!i:takie!,
      qr!nkjp/l:taki!,
      qr!nkjp/m:sg:nom:n:pos!,
      qr!nkjp/p:adj!,
      qr!s:takie!,
    ],
  ],
  [
    67 => [
      qr!<>:dereko/s:seg\$<b>64<i>464<i>475<i>68<b>4<s>1!,
      qr!\@:dereko\/s:corresp:ann_segmentation\.xml\\#segm_2\.2-seg\$<b>17<s>1<i>68!,
      qr!\@:dereko\/s:id:morph_2\.2-seg\$<b>17<s>1<i>68!,
      qr!_67\$<i>464<i>475!,
      qr!i:kierkegaard!,
      qr!nkjp/l:Kierkegaard!,
      qr!nkjp/m:sg:nom:m1!,
      qr!nkjp/ne:persName:surname!,
      qr!nkjp/p:subst!,
      qr!s:Kierkegaard!,
    ],
  ],
) {
  my ($index, $patterns) = @$token_check;

  # Join all terms of the token so that each expectation is a single regex match.
  $token = join('||', @{$output->{data}->{stream}->[$index]});

  for my $expected (@$patterns) {
    like($token, $expected, "KolakowskiOco: token $index matches $expected");
  }
}

done_testing;
__END__
