use strict;
use warnings;
use Test::More;
use Data::Dumper;
use JSON::XS;
use Log::Log4perl;
# The expected strings below contain non-ASCII literals (e.g. the
# placeholder character in the primary text), so the source is UTF-8.
use utf8;

use Benchmark qw/:hireswallclock/;

# Start timestamp. It is not read again anywhere in this file.
my $t = Benchmark->new;

# Initialize log4perl object
# (kept disabled; uncomment to trace the conversion on STDERR)
#Log::Log4perl->init({
#  'log4perl.rootLogger' => 'TRACE, STDERR',
#  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
#  'log4perl.appender.STDERR.layout' => 'PatternLayout',
#  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
#});


use File::Basename 'dirname';
use File::Spec::Functions 'catdir';

# Module under test
use_ok('KorAP::XML::Krill');

# First scenario: the scrambled AGD fixture, located relative to this test file.
my $path = catdir(dirname(__FILE__), '..', 'corpus', 'AGD-scrambled', 'DOC', '00001');

# The constructor is given the directory path with a trailing slash.
ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');

# Sigles are checked at all three levels: text, document, corpus.
is($doc->text_sigle, 'AGD/DOC/00001', 'Correct text sigle');
is($doc->doc_sigle, 'AGD/DOC', 'Correct document sigle');
is($doc->corpus_sigle, 'AGD', 'Correct corpus sigle');

# Metadata fields exposed by the parsed document
my $meta = $doc->meta;
is($meta->{T_title}, 'FOLK_E_00321_SE_01_T_01_DF_01', 'Title');
is($meta->{D_creation_date}, '20181112', 'Creation date');

is($meta->{A_externalLink}, 'data:application/x.korap-link;title=DGD,'.
     'https://dgd.ids-mannheim.de/DGD2Web/ExternalAccessServlet?command=displayData'.
     '&id=FOLK_E_00321_SE_01_T_01', 'External link');

# Tokenization
use_ok('KorAP::XML::Tokenizer');

# Foundry and layer that provide the base tokenization
my ($token_base_foundry, $token_base_layer) = (qw/DGD Annot/);

# Get tokenization
# NOTE(review): non_verbal_tokens presumably keeps pauses and other
# non-verbal events as tokens of their own - confirm against
# KorAP::XML::Tokenizer.
my $tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => $token_base_foundry,
  layer => $token_base_layer,
  name => 'tokens',
  non_verbal_tokens => 1
);

ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

# Serialize to JSON and inspect the decoded structure
my $output = decode_json( $tokens->to_json );

# Only the first 100 characters of the primary text are compared
is(substr($output->{data}->{text}, 0, 100),
   '+++++++++ ku sqn alxv a pwm ▮ xnj nq qtl ohmdgjqp ▮ ▮ ▮ ▮ ▮ fi ▮ sna ▮ alxv hn ▮ zjc ahyx ftwbramn l',
   'Primary Data');

is($output->{data}->{name}, 'tokens', 'tokenName');
is($output->{data}->{tokenSource}, 'dgd#annot', 'tokenSource');

is($output->{version}, '0.03', 'version');

# No annotation layers have been added yet, so both lists are empty
is($output->{data}->{foundries}, '', 'Foundries');
is($output->{data}->{layerInfos}, '', 'layerInfos');

# Surface forms of the first three stream positions
is($output->{data}->{stream}->[0]->[4], 's:ku', 'data');
is($output->{data}->{stream}->[1]->[2], 's:sqn', 'data');
is($output->{data}->{stream}->[2]->[2], 's:alxv', 'data');

# Sigles and title are repeated at the top level of the serialization
is($output->{textSigle}, 'AGD/DOC/00001', 'Correct text sigle');
is($output->{docSigle}, 'AGD/DOC', 'Correct document sigle');
is($output->{corpusSigle}, 'AGD', 'Correct corpus sigle');

is($output->{title}, 'FOLK_E_00321_SE_01_T_01_DF_01', 'Title');

## DeReKo
$tokens->add('DeReKo', 'Structure');

# Re-serialize after each added layer so the checks see the new state
$output = decode_json( $tokens->to_json );

is($output->{data}->{foundries},
   'dereko dereko/structure',
   'Foundries');
is($output->{data}->{layerInfos}, 'dereko/s=spans', 'layerInfos');

# All annotations of a stream position are joined into one string so
# single entries can be matched regardless of their index.
my $first_token = join('||', @{$output->{data}->{stream}->[0]});
like($first_token, qr!<>:dereko/s:text!, 'dereko/s text span on first token');

## DGD
ok($tokens->add('DGD', 'Morpho'), 'Add Morpho');

$output = decode_json( $tokens->to_json );
is($output->{data}->{foundries},
   'dereko dereko/structure dgd dgd/morpho',
   'Foundries');
is($output->{data}->{layerInfos}, 'dereko/s=spans dgd/l=tokens dgd/p=tokens dgd/para=tokens',
   'layerInfos');

my $third_token = join('||', @{$output->{data}->{stream}->[2]});
like($third_token, qr!dgd/l:alui!, 'dgd/l annotation on third token');
like($third_token, qr!dgd/p:VMGWY!, 'dgd/p annotation on third token');
like($third_token, qr!i:alxv!, 'i: term on third token');
like($third_token, qr!s:alxv!, 's: term on third token');

## DGD base sentences
ok($tokens->add('DGD', 'Structure'), 'Add sentences');
$output = decode_json( $tokens->to_json );

# The offsets are not set optimally, but they are good enough

$first_token = join('||', @{$output->{data}->{stream}->[0]});
like($first_token, qr!<>:base/s:s\$<b>64<i>0<i>16<i>2<b>1!,
     'base/s:s span on stream position 0');

my $token = join('||', @{$output->{data}->{stream}->[2]});
like($token, qr!<>:base/s:s\$<b>64<i>16<i>23<i>4<b>1!,
     'base/s:s span on stream position 2');

# Position 3 must not carry a base/s:s span
$token = join('||', @{$output->{data}->{stream}->[3]});
unlike($token, qr!<>:base/s:s!, 'No base/s:s span on stream position 3');

$token = join('||', @{$output->{data}->{stream}->[4]});
like($token, qr!<>:base/s:s\$<b>64<i>23<i>27<i>5<b>1!,
     'base/s:s span on stream position 4');

$token = join('||', @{$output->{data}->{stream}->[5]});
like($token, qr!dgd/para:pause!, 'dgd/para pause on stream position 5');


# New revision
# Second scenario: the scrambled FOLK fixture. $path, $doc and $meta
# from the first scenario are reused.
$path = catdir(dirname(__FILE__), '..', 'corpus', 'FOLK-scrambled', '00068-SE-01', 'T-05');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');

is($doc->text_sigle, 'FOLK/00068-SE-01/T-05', 'Correct text sigle');
is($doc->doc_sigle, 'FOLK/00068-SE-01', 'Correct document sigle');
is($doc->corpus_sigle, 'FOLK', 'Correct corpus sigle');

$meta = $doc->meta;
is($meta->{T_title}, 'FOLK_E_00068_SE_01_T_05_DF_01', 'Title');

is($meta->{A_externalLink}, 'data:application/x.korap-link;title=DGD,https://dgd.ids-mannheim.de/DGD2Web/ExternalAccessServlet?command=displayData&id=FOLK_E_00068_SE_01_T_05', 'External link');

# Tokenization
use_ok('KorAP::XML::Tokenizer');

# Same base foundry and layer as in the first scenario
($token_base_foundry, $token_base_layer) = (qw/DGD Annot/);

# Get tokenization
# A fresh tokenizer is built for the newly loaded document.
$tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => $token_base_foundry,
  layer => $token_base_layer,
  name => 'tokens',
  non_verbal_tokens => 1
);

ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

## DeReKo
# (deliberately not added in this scenario)
# $tokens->add('DeReKo', 'Structure');

## DGD
ok($tokens->add('DGD', 'Morpho'), 'Add Morpho');

$output = decode_json( $tokens->to_json );

# A 30-character window of the primary text, starting at offset 11
is(substr($output->{data}->{text}, 11, 30),
   'ogeuy Nva wvho zhl usblyuug Kt',
   'Primary Data');
is($output->{data}->{name}, 'tokens', 'tokenName');
is($output->{data}->{tokenSource}, 'dgd#annot', 'tokenSource');

# Stream position 0: text span plus dgd/para annotation and its attributes
is($output->{data}->{stream}->[0]->[1],
   '<>:base/s:t$<b>64<i>0<i>39384<i>7190<b>0',
   'data'
 );

is($output->{data}->{stream}->[0]->[2],
   '@:dgd/para:type:micro$<b>16<s>1',
   'data'
 );

is($output->{data}->{stream}->[0]->[3],
   '@:dgd/para:rend:(.)$<b>16<s>1',
   'data'
 );

is($output->{data}->{stream}->[0]->[5],
   'dgd/para:pause$<b>128<s>1',
   'data'
 );

# Stream position 1: dgd/para vocal annotation and its attributes
is($output->{data}->{stream}->[1]->[0],
   '@:dgd/para:desc:short breathe in$<b>16<s>1',
   'data'
 );

# Double-quoted so \x{b0} yields the degree sign; '@' and '$' are
# escaped to prevent interpolation.
is($output->{data}->{stream}->[1]->[1],
   "\@:dgd/para:rend:\x{b0}h\$<b>16<s>1",
   'data'
 );

is($output->{data}->{stream}->[1]->[3],
   'dgd/para:vocal$<b>128<s>1',
   'data'
 );

# Stream position 97: token-level dgd annotations
is($output->{data}->{stream}->[97]->[1],
   'dgd/l:ui',
   'data'
 );

is($output->{data}->{stream}->[97]->[2],
   'dgd/p:AUUK',
   'data'
 );

is($output->{data}->{stream}->[97]->[3],
   'dgd/trans:rh',
   'data'
 );

is($output->{data}->{stream}->[97]->[4],
   'dgd/type:assimilated',
   'data'
 );


done_testing;
__END__