# blob: ebb658b0dad63de8a795cc0a2bbe4b2a13c80d43
use strict;
use warnings;
use Test::More;
use Data::Dumper;
use JSON::XS;
use Log::Log4perl;
use utf8;

# Skip the whole file when the SKIP_REAL environment variable is set to a
# true value.
if ($ENV{SKIP_REAL}) {
  plan skip_all => 'Skip real tests';
}

use Benchmark qw/:hireswallclock/;

# Benchmark object taken at startup; it is not read again in this file.
my $t = Benchmark->new;

# Initialize log4perl object
#Log::Log4perl->init({
#  'log4perl.rootLogger' => 'TRACE, STDERR',
#  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
#  'log4perl.appender.STDERR.layout' => 'PatternLayout',
#  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
#});


use File::Basename 'dirname';
use File::Spec::Functions 'catdir';

use_ok('KorAP::XML::Krill');

# First fixture: a text directory below 'corpus/AGD-scrambled', resolved
# relative to this test file.
my $path = catdir(dirname(__FILE__), 'corpus', 'AGD-scrambled', 'DOC', '00001');

ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');

# Sigles on the parsed document
is($doc->text_sigle, 'AGD/DOC/00001', 'Correct text sigle');
is($doc->doc_sigle, 'AGD/DOC', 'Correct document sigle');
is($doc->corpus_sigle, 'AGD', 'Correct corpus sigle');

# Metadata fields
my $meta = $doc->meta;
is($meta->{T_title}, 'FOLK_E_00321_SE_01_T_01_DF_01', 'Title');
is($meta->{D_creation_date}, '20181112', 'Creation date');

is($meta->{A_externalLink}, 'data:application/x.korap-link;title=DGD,'.
     'https://dgd.ids-mannheim.de/DGD2Web/ExternalAccessServlet?command=displayData'.
     '&id=FOLK_E_00321_SE_01_T_01', 'External link');

# Tokenization
use_ok('KorAP::XML::Tokenizer');

# Foundry and layer the base tokenization is read from
my ($token_base_foundry, $token_base_layer) = (qw/DGD Annot/);

# Get tokenization
my $tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => $token_base_foundry,
  layer => $token_base_layer,
  name => 'tokens',
  non_verbal_tokens => 1
);

ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

# Round-trip through JSON so the assertions run on the serialized form
my $output = decode_json( $tokens->to_json );

# The expected literal is exactly 100 characters long
is(substr($output->{data}->{text}, 0, 100),
   '+++++++++ ku sqn alxv a pwm ▮ xnj nq qtl ohmdgjqp ▮ ▮ ▮ ▮ ▮ fi ▮ sna ▮ alxv hn ▮ zjc ahyx ftwbramn l',
   'Primary Data');

is($output->{data}->{name}, 'tokens', 'tokenName');
is($output->{data}->{tokenSource}, 'dgd#annot', 'tokenSource');

is($output->{version}, '0.03', 'version');

# No annotation layer has been added yet, so both lists are empty
is($output->{data}->{foundries}, '', 'Foundries');
is($output->{data}->{layerInfos}, '', 'layerInfos');
is($output->{data}->{stream}->[0]->[4], 's:ku', 'data');
is($output->{data}->{stream}->[1]->[2], 's:sqn', 'data');
is($output->{data}->{stream}->[2]->[2], 's:alxv', 'data');
is($output->{textSigle}, 'AGD/DOC/00001', 'Correct text sigle');
is($output->{docSigle}, 'AGD/DOC', 'Correct document sigle');
is($output->{corpusSigle}, 'AGD', 'Correct corpus sigle');

is($output->{title}, 'FOLK_E_00321_SE_01_T_01_DF_01', 'Title');

## DeReKo
# Add the DeReKo structure layer and re-serialize
$tokens->add('DeReKo', 'Structure');

$output = decode_json( $tokens->to_json );

is($output->{data}->{foundries},
   'dereko dereko/structure',
   'Foundries');
is($output->{data}->{layerInfos}, 'dereko/s=spans', 'layerInfos');

# Join all terms of the first stream entry so one regex can search them
my $first_token = join('||', @{$output->{data}->{stream}->[0]});
like($first_token, qr!<>:dereko/s:text!, 'First token has dereko/s:text span');

## DGD
# Add the DGD morpho layer on top of the DeReKo structure
ok($tokens->add('DGD', 'Morpho'), 'Add Morpho');

$output = decode_json( $tokens->to_json );
is($output->{data}->{foundries},
   'dereko dereko/structure dgd dgd/morpho',
   'Foundries');
is($output->{data}->{layerInfos}, 'dereko/s=spans dgd/l=tokens dgd/p=tokens dgd/para=tokens',
   'layerInfos');

# Join all terms of the third stream entry so each regex can search them
my $third_token = join('||', @{$output->{data}->{stream}->[2]});
like($third_token, qr!dgd/l:alui!, 'Third token has dgd/l:alui');
like($third_token, qr!dgd/p:VMGWY!, 'Third token has dgd/p:VMGWY');
like($third_token, qr!i:alxv!, 'Third token has i:alxv');
like($third_token, qr!s:alxv!, 'Third token has s:alxv');

## DGD base sentences
ok($tokens->add('DGD', 'Structure'), 'Add sentences');
$output = decode_json( $tokens->to_json );

# The offsets are not set optimally, but they are good enough

$first_token = join('||', @{$output->{data}->{stream}->[0]});
like($first_token, qr!<>:base/s:s\$<b>64<i>0<i>16<i>2<b>1!,
     'First token has base/s:s span');

my $token = join('||', @{$output->{data}->{stream}->[2]});
like($token, qr!<>:base/s:s\$<b>64<i>16<i>23<i>4<b>1!,
     'Third token has base/s:s span');
$token = join('||', @{$output->{data}->{stream}->[3]});
unlike($token, qr!<>:base/s:s!, 'Fourth token has no base/s:s span');

$token = join('||', @{$output->{data}->{stream}->[4]});
like($token, qr!<>:base/s:s\$<b>64<i>23<i>27<i>5<b>1!,
     'Fifth token has base/s:s span');

$token = join('||', @{$output->{data}->{stream}->[5]});
like($token, qr!dgd/para:pause!, 'Sixth token has dgd/para:pause');


# New revision
# Second fixture: a text directory below 'corpus/FOLK-scrambled'. $path,
# $doc and $meta are reused from the first fixture.
$path = catdir(dirname(__FILE__), 'corpus', 'FOLK-scrambled', '00068-SE-01', 'T-05');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');

is($doc->text_sigle, 'FOLK/00068-SE-01/T-05', 'Correct text sigle');
is($doc->doc_sigle, 'FOLK/00068-SE-01', 'Correct document sigle');
is($doc->corpus_sigle, 'FOLK', 'Correct corpus sigle');

$meta = $doc->meta;
is($meta->{T_title}, 'FOLK_E_00068_SE_01_T_05_DF_01', 'Title');

is($meta->{A_externalLink}, 'data:application/x.korap-link;title=DGD,'.
     'https://dgd.ids-mannheim.de/DGD2Web/ExternalAccessServlet?command=displayData'.
     '&id=FOLK_E_00068_SE_01_T_05', 'External link');

# Tokenization
use_ok('KorAP::XML::Tokenizer');

($token_base_foundry, $token_base_layer) = (qw/DGD Annot/);

# Get tokenization
$tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => $token_base_foundry,
  layer => $token_base_layer,
  name => 'tokens',
  non_verbal_tokens => 1
);

ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

## DeReKo
# Deliberately left disabled for this fixture:
# $tokens->add('DeReKo', 'Structure');

## DGD
ok($tokens->add('DGD', 'Morpho'), 'Add Morpho');

$output = decode_json( $tokens->to_json );

# The expected literal is exactly 30 characters long, starting at offset 11
is(substr($output->{data}->{text}, 11, 30),
   'ogeuy Nva wvho zhl usblyuug Kt',
   'Primary Data');
is($output->{data}->{name}, 'tokens', 'tokenName');
is($output->{data}->{tokenSource}, 'dgd#annot', 'tokenSource');

# Spot checks on individual terms of the serialized token stream.
# Indexing is stream->[entry]->[term].

# First stream entry
is($output->{data}->{stream}->[0]->[1],
   '<>:base/s:t$<b>64<i>0<i>39384<i>7190<b>0',
   'data'
 );

is($output->{data}->{stream}->[0]->[2],
   '@:dgd/para:type:micro$<b>16<s>1',
   'data'
 );

is($output->{data}->{stream}->[0]->[3],
   '@:dgd/para:rend:(.)$<b>16<s>1',
   'data'
 );

is($output->{data}->{stream}->[0]->[5],
   'dgd/para:pause$<b>128<s>1',
   'data'
 );

# Second stream entry
is($output->{data}->{stream}->[1]->[0],
   '@:dgd/para:desc:short breathe in$<b>16<s>1',
   'data'
 );

# Double-quoted so \x{b0} yields the degree sign; '@' and '$' are escaped
# to suppress interpolation.
is($output->{data}->{stream}->[1]->[1],
   "\@:dgd/para:rend:\x{b0}h\$<b>16<s>1",
   'data'
 );

is($output->{data}->{stream}->[1]->[3],
   'dgd/para:vocal$<b>128<s>1',
   'data'
 );

# Stream entry 97
is($output->{data}->{stream}->[97]->[1],
   'dgd/l:ui',
   'data'
 );

is($output->{data}->{stream}->[97]->[2],
   'dgd/p:AUUK',
   'data'
 );

is($output->{data}->{stream}->[97]->[3],
   'dgd/trans:rh',
   'data'
 );

is($output->{data}->{stream}->[97]->[4],
   'dgd/type:assimilated',
   'data'
 );


# No fixed plan is declared; done_testing marks the end of the assertions.
done_testing;
__END__