blob: f7f946841c91f35f3fa33daedec54c1d8d799fce [file] [log] [blame]
Akron797e8072020-02-13 07:59:40 +01001use strict;
2use warnings;
3use File::Basename 'dirname';
4use File::Spec::Functions qw/catfile/;
Akron2a60c532020-02-13 15:52:18 +01005use IO::Uncompress::Unzip qw(unzip $UnzipError);
Akron797e8072020-02-13 07:59:40 +01006
7use Test::More;
8use Test::Output;
Akrond89ef822020-02-17 12:42:09 +01009use Test::XML::Loy;
Peter Harders42e18a62020-07-21 02:43:26 +020010
11use FindBin;
12BEGIN {
13 unshift @INC, "$FindBin::Bin/../lib";
14};
Akron5fb5e8d2020-07-23 17:45:13 +020015use Test::KorAP::XML::TEI qw!korap_tempfile!;
Peter Harders57c884e2020-07-16 01:28:52 +020016
# Resolve the converter script relative to the directory of this test file
my $f      = dirname(__FILE__);
my $script = catfile($f, '..', 'script', 'tei2korapxml');
ok(-f $script, 'Script found');

# Every informational flag has to print its characteristic text to STDOUT
for my $case (
  [
    '--help',
    qr!This\s*program\s*is\s*usually\s*called\s*from\s*inside\s*another\s*script\.!,
    'Help'
  ],
  [
    '--version',
    qr!tei2korapxml - v\d+?\.\d+?!,
    'Version'
  ],
) {
  my ($flag, $pattern, $label) = @$case;

  # List form of system() runs the script without an intermediate shell
  stdout_like(
    sub { system('perl', $script, $flag) },
    $pattern,
    $label
  );
}
# Load example file
my $file = catfile($f, 'data', 'goe_sample.i5.xml');

# $fh is not used below; presumably keeping it in scope keeps the
# temporary file alive for the rest of the test - TODO confirm
my ($fh, $outzip) = korap_tempfile('script_out');

# Generate zip file (unportable: needs a shell providing cat, pipes
# and output redirection).
#
# Sketches for writing through $fh instead of a shell redirect
# (the OO interface would be the better choice then):
#   sub { open STDOUT, '>&', $fh; system("cat '$file' | perl '$script'") },
#   sub { open(my $pipe, "cat '$file' | perl '$script'|"); while(<$pipe>){$fh->print($_)}; $fh->close },
#   sub {
#     defined(my $pid = fork) or die "fork: $!";
#     if (!$pid) {
#       open STDOUT, '>&', $fh;
#       exec "cat '$file' | perl '$script'"
#     }
#     waitpid $pid, 0;
#     $fh->close;
#   },
stderr_like(
  sub { qx{cat '$file' | perl '$script' -ti > '$outzip'} },
  qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
  'Processing'
);

ok(-e $outzip, "File $outzip exists");
# Uncompress GOE/header.xml from zip file
my $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/header.xml');

ok($zip, 'Zip-File is created');

# TODO: check wrong encoding in header-files (compare with input document)!
# Read GOE/header.xml line by line until the member is exhausted
my $header_xml = '';
until ($zip->eof) {
  $header_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

my $t = Test::XML::Loy->new($header_xml);

# Corpus level metadata: [selector, expected text, test description]
for my $check (
  ['korpusSigle',          'GOE',                         'korpusSigle'],
  ['h\.title[type=main]',  'Goethes Werke',               'h.title'],
  ['h\.author',            'Goethe, Johann Wolfgang von', 'h.author'],
  ['pubDate[type=year]',   '1982',                        'pubDate'],
) {
  $t->text_is(@$check);
}
# Uncompress GOE/AGA/header.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/header.xml');

ok($zip, 'Zip-File is found');

# Read GOE/AGA/header.xml line by line until the member is exhausted
$header_xml = '';
until ($zip->eof) {
  $header_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

$t = Test::XML::Loy->new($header_xml);

# Document level metadata: [selector, expected text, test description]
for my $check (
  ['dokumentSigle', 'GOE/AGA', 'dokumentSigle'],
  ['d\.title', 'Goethe: Autobiographische Schriften II, (1817-1825, 1832)', 'd.title'],
  ['creatDate', '1820-1822', 'creatDate'],
) {
  $t->text_is(@$check);
}
# Uncompress GOE/AGA/00000/header.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/header.xml');

ok($zip, 'Zip-File is found');

# Read GOE/AGA/00000/header.xml line by line until the member is exhausted
$header_xml = '';
until ($zip->eof) {
  $header_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

# Text level metadata
$t = Test::XML::Loy->new($header_xml);
$t->text_is('textSigle', 'GOE/AGA.00000', 'textSigle');
$t->text_is('analytic > h\.title[type=main]', 'Campagne in Frankreich', 'h.title');
# Uncompress GOE/AGA/00000/data.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/data.xml');

ok($zip, 'Zip-File is found');

# Read GOE/AGA/00000/data.xml line by line until the member is exhausted
my $data_xml = '';
until ($zip->eof) {
  $data_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

# The primary data carries the text id and the complete raw text
$t = Test::XML::Loy->new($data_xml);
$t->attr_is('raw_text', 'docid', 'GOE_AGA.00000', 'text id');
$t->text_like(
  'raw_text > text',
  qr!^Campagne in Frankreich 1792.*?uns allein begl.*cke\.$!,
  'text content'
);
# Uncompress GOE/AGA/00000/struct/structure.xml from zip file
$zip = IO::Uncompress::Unzip->new(
  $outzip,
  Name => 'GOE/AGA/00000/struct/structure.xml'
);

ok($zip, 'Zip-File is found');

# Read GOE/AGA/00000/struct/structure.xml line by line
my $struct_xml = '';
until ($zip->eof) {
  $struct_xml .= $zip->getline;
}

ok($zip->close, 'Closed');

# Spot check a single annotation value inside the structure layer
$t = Test::XML::Loy->new($struct_xml);
$t->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content');
# The archive was generated with -ti only (no -tc), so it is expected
# to contain no tokens.xml member
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens.xml');
ok(!$zip, 'External not generated');

# Uncompress GOE/AGA/00000/base/tokens_aggressive.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_aggressive.xml');

# Report a missing member as a regular test failure (including the
# reason given by the unzip module) before the handle is dereferenced
ok($zip, 'Zip-File is found') or diag("Unzip failed: $UnzipError");

# Read GOE/AGA/00000/base/tokens_aggressive.xml
my $tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check character offsets of selected tokens and the total token count
$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8);

$t->attr_is('spanList span#t_1', 'from', 9);
$t->attr_is('spanList span#t_1', 'to', 11);

$t->attr_is('spanList span#t_67', 'from', 427);
$t->attr_is('spanList span#t_67', 'to', 430);

$t->attr_is('spanList span#t_214', 'from', 1209);
$t->attr_is('spanList span#t_214', 'to', 1212);

$t->element_count_is('spanList span', 227);
# Uncompress GOE/AGA/00000/base/tokens_conservative.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_conservative.xml');

# Report a missing member as a regular test failure (including the
# reason given by the unzip module) before the handle is dereferenced
ok($zip, 'Zip-File is found') or diag("Unzip failed: $UnzipError");

# Read GOE/AGA/00000/base/tokens_conservative.xml
$tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check character offsets of selected tokens and the total token count
$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8);

$t->attr_is('spanList span#t_1', 'from', 9);
$t->attr_is('spanList span#t_1', 'to', 11);

$t->attr_is('spanList span#t_67', 'from', 427);
$t->attr_is('spanList span#t_67', 'to', 430);

$t->attr_is('spanList span#t_214', 'from', 1209);
$t->attr_is('spanList span#t_214', 'to', 1212);

$t->element_count_is('spanList span', 227);
# Tokenize with external tokenizer
my $cmd = catfile($f, 'cmd', 'tokenizer.pl');

my ($fh2, $outzip2) = korap_tempfile('script_out2');

# Run the conversion again, this time passing the tokenizer command via -tc
stderr_like(
  sub { qx{cat '$file' | perl '$script' -tc='perl $cmd' > '$outzip2'} },
  qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
  'Processing'
);

# Uncompress GOE/AGA/00000/base/tokens.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip2, Name => 'GOE/AGA/00000/base/tokens.xml');
ok($zip, 'Found');
ok(!$zip->eof, 'Readable');

# Read GOE/AGA/00000/base/tokens.xml line by line
$tokens_xml = '';
until ($zip->eof) {
  $tokens_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8);

# Expected character offsets: [selector, from, to]
for my $span (
  ['spanList span#t_1',      9,   11],
  ['spanList span#t_67',   427,  430],
  ['spanList span#t_214', 1209, 1212],
) {
  my ($selector, $from, $to) = @$span;

  $t->attr_is($selector, 'from', $from);
  $t->attr_is($selector, 'to', $to);
}

$t->element_count_is('spanList span', 227);
my ($fh3, $outzip3) = korap_tempfile('script_out3');


# ~ test conservative tokenization ~

$file = catfile($f, 'data', 'text_with_blanks.i5.xml');

# The dot in the text id is escaped so that only a literal '.' matches
stderr_like(
  sub { `cat '$file' | perl '$script' --ti > '$outzip3'` },
  qr!tei2korapxml: .*? text_id=CORP_DOC\.00001!,
  'Processing'
);

ok(-e $outzip3, "File $outzip3 exists");

$zip = IO::Uncompress::Unzip->new($outzip3, Name => 'CORP/DOC/00001/base/tokens_conservative.xml');

ok($zip, 'Zip-File is created');

# Read CORP/DOC/00001/base/tokens_conservative.xml
my $cons = '';
$cons .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check character offsets of selected tokens and the total token count
$t = Test::XML::Loy->new($cons);
$t->attr_is('spanList span:nth-child(1)', 'to', 6);

$t->attr_is('spanList span#t_1', 'from', 7);
$t->attr_is('spanList span#t_1', 'to', 9);

$t->attr_is('spanList span#t_3', 'from', 12);
$t->attr_is('spanList span#t_3', 'to', 16);

$t->attr_is('spanList span#t_9', 'from', 36);
$t->attr_is('spanList span#t_9', 'to', 37);

$t->attr_is('spanList span#t_13', 'from', 44);
$t->attr_is('spanList span#t_13', 'to', 45); # "

$t->attr_is('spanList span#t_14', 'from', 45); # twenty-two
$t->attr_is('spanList span#t_14', 'to', 55);

$t->attr_is('spanList span#t_15', 'from', 55); # "
$t->attr_is('spanList span#t_15', 'to', 56);

$t->attr_is('spanList span#t_19', 'from', 66);
$t->attr_is('spanList span#t_19', 'to', 67);

$t->element_count_is('spanList span', 20);
# ~ test aggressive tokenization ~

$zip = IO::Uncompress::Unzip->new(
  $outzip3,
  Name => 'CORP/DOC/00001/base/tokens_aggressive.xml'
);

ok($zip, 'Zip-File is created');

# Read CORP/DOC/00001/base/tokens_aggressive.xml line by line
my $aggr = '';
until ($zip->eof) {
  $aggr .= $zip->getline;
}
ok($zip->close, 'Closed');

$t = Test::XML::Loy->new($aggr);

$t->attr_is('spanList span:nth-child(1)', 'to', 6);

# Expected character offsets: [selector, from, to]
for my $span (
  ['spanList span#t_1',   7,  9],
  ['spanList span#t_3',  12, 16],
  ['spanList span#t_9',  36, 37],
  ['spanList span#t_13', 44, 45], # "
  ['spanList span#t_14', 45, 51], # twenty
  ['spanList span#t_15', 51, 52], # -
  ['spanList span#t_16', 52, 55], # two
  ['spanList span#t_17', 55, 56], # "
  ['spanList span#t_21', 66, 67],
) {
  my ($selector, $from, $to) = @$span;

  $t->attr_is($selector, 'from', $from);
  $t->attr_is($selector, 'to', $to);
}

$t->element_count_is('spanList span', 22);
subtest 'Check Tokenization Flags' => sub {

  # Example file and external tokenizer, both relative to this test file
  my $f    = dirname(__FILE__);
  my $file = catfile($f, 'data', 'goe_sample.i5.xml');
  my $cmd  = catfile($f, 'cmd', 'tokenizer.pl');

  my ($fh, $outzip) = korap_tempfile('script_tokflags');

  # Generate zip file (unportable!) with -ti and -tc given at the same time
  stderr_like(
    sub { qx{cat '$file' | perl '$script' -ti -tc 'perl $cmd' > '$outzip'} },
    qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
    'Processing'
  );

  ok(-e $outzip, "File $outzip exists");

  # All three token layers are expected in the archive:
  # [member name, test description]
  for my $member (
    ['GOE/AGA/00000/base/tokens_aggressive.xml',   'Aggressive generated'],
    ['GOE/AGA/00000/base/tokens_conservative.xml', 'Conservative generated'],
    ['GOE/AGA/00000/base/tokens.xml',              'External generated'],
  ) {
    my ($name, $label) = @$member;

    $zip = IO::Uncompress::Unzip->new($outzip, Name => $name);
    ok($zip, $label);
  }
};

done_testing;