blob: 0bf6f126b8989dd673a2fb706aeead20b231feb3 [file] [log] [blame]
use strict;
use warnings;
use utf8;

use Test::More;
use Data::Dumper;
use JSON::XS;
use Benchmark qw/:hireswallclock/;
use File::Basename 'dirname';
use File::Spec::Functions 'catdir';

# Skip the whole file when SKIP_REAL is set in the environment.
plan skip_all => 'Skip real tests' if $ENV{SKIP_REAL};

# Benchmark object created at start-up (not read again within this file).
my $t = Benchmark->new;

# The document class must be loadable before anything else is tested.
use_ok('KorAP::XML::Krill');
20
# Load and parse the AGD sample text that lives next to this test file,
# then verify its sigles and metadata fields.
my $path = catdir(dirname(__FILE__), 'corpus', 'AGD-scrambled', 'DOC', '00001');

ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');

is($doc->text_sigle, 'AGD/DOC/00001', 'Correct text sigle');
is($doc->doc_sigle, 'AGD/DOC', 'Correct document sigle');
is($doc->corpus_sigle, 'AGD', 'Correct corpus sigle');

my $meta = $doc->meta;
is($meta->{T_title}, 'FOLK_E_00321_SE_01_T_01_DF_01', 'Title');

# Named after the field it checks so a failure is not mistaken for the
# title assertion above.
is($meta->{D_creation_date}, '20181112', 'Creation date');

is($meta->{A_externalLink}, 'data:application/x.korap-link;title=DGD,'.
     'https%3A%2F%2Fdgd.ids-mannheim.de%2FDGD2Web%2FExternalAccessServlet%3F'.
     'command%3DdisplayData%26id%3DFOLK_E_00321_SE_01_T_01', 'External link');
# Tokenization
use_ok('KorAP::XML::Tokenizer');

# Foundry and layer passed to the tokenizer as its token base
my ($token_base_foundry, $token_base_layer) = ('DGD', 'Annot');

# Get tokenization
my $tokens = KorAP::XML::Tokenizer->new(
  path              => $doc->path,
  doc               => $doc,
  foundry           => $token_base_foundry,
  layer             => $token_base_layer,
  name              => 'tokens',
  non_verbal_tokens => 1
);

ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

my $output = decode_json( $tokens->to_json );

{
  # Short alias for the "data" section of the serialized document
  my $data = $output->{data};

  is(substr($data->{text}, 0, 100),
     '+++++++++ ku sqn alxv a pwm ▮ xnj nq qtl ohmdgjqp ▮ ▮ ▮ ▮ ▮ fi ▮ sna ▮ alxv hn ▮ zjc ahyx ftwbramn l',
     'Primary Data');

  is($data->{name}, 'tokens', 'tokenName');
  is($data->{tokenSource}, 'dgd#annot', 'tokenSource');

  is($output->{version}, '0.03', 'version');

  # No annotation layers have been added yet, so both lists are empty
  is($data->{foundries}, '', 'Foundries');
  is($data->{layerInfos}, '', 'layerInfos');

  # Each row: [token position, term index, expected term]
  for my $probe ([0, 4, 's:ku'], [1, 2, 's:sqn'], [2, 2, 's:alxv']) {
    my ($token_pos, $term_idx, $expected) = @{$probe};
    is($data->{stream}->[$token_pos]->[$term_idx], $expected, 'data');
  }
}

is($output->{textSigle}, 'AGD/DOC/00001', 'Correct text sigle');
is($output->{docSigle}, 'AGD/DOC', 'Correct document sigle');
is($output->{corpusSigle}, 'AGD', 'Correct corpus sigle');

is($output->{title}, 'FOLK_E_00321_SE_01_T_01_DF_01', 'Title');
76
## DeReKo
# Add the DeReKo structure layer and re-serialize to inspect the result.
$tokens->add('DeReKo', 'Structure');

$output = decode_json( $tokens->to_json );

is($output->{data}->{foundries},
   'dereko dereko/structure',
   'Foundries');
is($output->{data}->{layerInfos}, 'dereko/s=spans', 'layerInfos');

# Join all terms of the first token so a single regex can search them
my $first_token = join('||', @{$output->{data}->{stream}->[0]});
like($first_token, qr!<>:dereko/s:text!, 'First token has dereko/s:text span');
89
## DGD
# Add the DGD morpho layer and check the advertised foundries/layers
# as well as the terms attached to the third token.
ok($tokens->add('DGD', 'Morpho'), 'Add Morpho');

$output = decode_json( $tokens->to_json );
is($output->{data}->{foundries},
   'dereko dereko/structure dgd dgd/morpho',
   'Foundries');
is($output->{data}->{layerInfos}, 'dereko/s=spans dgd/l=tokens dgd/p=tokens dgd/para=tokens',
   'layerInfos');

# Join all terms of the third token so each regex can search them
my $third_token = join('||', @{$output->{data}->{stream}->[2]});
like($third_token, qr!dgd/l:alui!, 'Third token has dgd/l:alui');
like($third_token, qr!dgd/p:VMGWY!, 'Third token has dgd/p:VMGWY');
like($third_token, qr!i:alxv!, 'Third token has i:alxv');
like($third_token, qr!s:alxv!, 'Third token has s:alxv');
105
## DGD base sentences
ok($tokens->add('DGD', 'Structure'), 'Add sentences');
$output = decode_json( $tokens->to_json );

# The offsets are not set optimally, but they are good enough

# For each inspected token, all of its terms are joined into one string
# that the regex is matched against.
$first_token = join('||', @{$output->{data}->{stream}->[0]});
like($first_token, qr!<>:base/s:s\$<b>64<i>0<i>16<i>2<b>1!,
     'Token 0 has base/s:s span');

my $token = join('||', @{$output->{data}->{stream}->[2]});
like($token, qr!<>:base/s:s\$<b>64<i>16<i>23<i>4<b>1!,
     'Token 2 has base/s:s span');

$token = join('||', @{$output->{data}->{stream}->[3]});
unlike($token, qr!<>:base/s:s!, 'Token 3 has no base/s:s span');

$token = join('||', @{$output->{data}->{stream}->[4]});
like($token, qr!<>:base/s:s\$<b>64<i>23<i>27<i>5<b>1!,
     'Token 4 has base/s:s span');

$token = join('||', @{$output->{data}->{stream}->[5]});
like($token, qr!dgd/para:pause!, 'Token 5 has dgd/para:pause');
Akron57510c12019-01-04 14:58:53 +0100125
Akron1cdbc9d2020-05-07 15:28:54 +0200126
# New revision
# Load and parse the FOLK sample text and verify its sigles and metadata.
$path = catdir(dirname(__FILE__), 'corpus', 'FOLK-scrambled', '00068-SE-01', 'T-05');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');

is($doc->text_sigle, 'FOLK/00068-SE-01/T-05', 'Correct text sigle');
is($doc->doc_sigle, 'FOLK/00068-SE-01', 'Correct document sigle');
is($doc->corpus_sigle, 'FOLK', 'Correct corpus sigle');

$meta = $doc->meta;
is($meta->{T_title}, 'FOLK_E_00068_SE_01_T_05_DF_01', 'Title');

# Carries the same description as the AGD external-link assertion so a
# failure is identifiable in the test output.
is($meta->{A_externalLink}, 'data:application/x.korap-link;title=DGD,'.
     'https%3A%2F%2Fdgd.ids-mannheim.de%2FDGD2Web%2FExternalAccessServlet'.
     '%3Fcommand%3DdisplayData%26id%3DFOLK_E_00068_SE_01_T_05', 'External link');
Akron1cdbc9d2020-05-07 15:28:54 +0200142
# Tokenization
use_ok('KorAP::XML::Tokenizer');

($token_base_foundry, $token_base_layer) = ('DGD', 'Annot');

# Get tokenization
$tokens = do {
  # Constructor arguments kept as an ordered list, then passed through as-is
  my @tokenizer_args = (
    path              => $doc->path,
    doc               => $doc,
    foundry           => $token_base_foundry,
    layer             => $token_base_layer,
    name              => 'tokens',
    non_verbal_tokens => 1
  );
  KorAP::XML::Tokenizer->new(@tokenizer_args);
};

ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

## DeReKo
# $tokens->add('DeReKo', 'Structure');

## DGD
ok($tokens->add('DGD', 'Morpho'), 'Add Morpho');
166
$output = decode_json( $tokens->to_json );

{
  # Short alias for the "data" section of the serialized document
  my $data = $output->{data};

  is(substr($data->{text}, 11, 30),
     'ogeuy Nva wvho zhl usblyuug Kt',
     'Primary Data');
  is($data->{name}, 'tokens', 'tokenName');
  is($data->{tokenSource}, 'dgd#annot', 'tokenSource');

  # Expected stream terms; each row: [token position, term index, term]
  my @expected_terms = (
    [ 0, 1, '<>:base/s:t$<b>64<i>0<i>39384<i>7190<b>0'],
    [ 0, 2, '@:dgd/para:type:micro$<b>16<s>1'],
    [ 0, 3, '@:dgd/para:rend:(.)$<b>16<s>1'],
    [ 0, 5, 'dgd/para:pause$<b>128<s>1'],
    [ 1, 0, '@:dgd/para:desc:short breathe in$<b>16<s>1'],
    [ 1, 1, "\@:dgd/para:rend:\x{b0}h\$<b>16<s>1"],
    [ 1, 3, 'dgd/para:vocal$<b>128<s>1'],
    [97, 1, 'dgd/l:ui'],
    [97, 2, 'dgd/p:AUUK'],
    [97, 3, 'dgd/trans:rh'],
    [97, 4, 'dgd/type:assimilated'],
  );

  # Run the checks in table order, one assertion per row
  for my $row (@expected_terms) {
    my ($token_pos, $term_idx, $expected) = @{$row};
    is($data->{stream}->[$token_pos]->[$term_idx], $expected, 'data');
  }
}

done_testing;
__END__