blob: 401d76bd6d43c9913b17fdb1fc1d570fd755162b [file] [log] [blame]
Akron797e8072020-02-13 07:59:40 +01001use strict;
2use warnings;
3use File::Basename 'dirname';
4use File::Spec::Functions qw/catfile/;
Peter Harders57c884e2020-07-16 01:28:52 +02005use File::Temp qw/tempfile/;
Akron2a60c532020-02-13 15:52:18 +01006use IO::Uncompress::Unzip qw(unzip $UnzipError);
Akron797e8072020-02-13 07:59:40 +01007
8use Test::More;
9use Test::Output;
10
Akrond89ef822020-02-17 12:42:09 +010011use Test::XML::Loy;
Akron2a60c532020-02-13 15:52:18 +010012
our %ENV;

# Temporary files created through tempfile() are removed by default.
# To keep them for inspection, set KORAPXMLTEI_DONTUNLINK to a true value, e.g.
#   KORAPXMLTEI_DONTUNLINK=1 prove -lr t/script.t
my $_UNLINK = 1;
$_UNLINK = 0 if $ENV{KORAPXMLTEI_DONTUNLINK};

# Locate the script under test relative to the directory of this test file
my $f      = dirname(__FILE__);
my $script = catfile($f, qw/.. script tei2korapxml/);
ok(-f $script, 'Script found');
21
# The script is run with an informational flag and its STDOUT is matched.
# Each entry holds: command line flag, expected output pattern, test label.
my @cli_checks = (
  [
    '--help',
    qr!This\s*program\s*is\s*usually\s*called\s*from\s*inside\s*another\s*script\.!,
    'Help'
  ],
  [
    '--version',
    qr!tei2korapxml - v\d+?\.\d+?!,
    'Version'
  ],
);

for my $check (@cli_checks) {
  my ($flag, $pattern, $label) = @$check;
  stdout_like(
    sub { system('perl', $script, $flag) },
    $pattern,
    $label
  );
}
33
34
# Load example file
my $file = catfile($f, 'data', 'goe_sample.i5.xml');

# Temporary file that receives the generated zip archive. The archive is
# written through a shell redirection to the path; $fh is only referenced
# by the commented-out alternatives below.
my ($fh, $outzip) = tempfile(
  "KorAP-XML-TEI_script_XXXXXXXXXX",
  SUFFIX => ".tmp",
  TMPDIR => 1,
  UNLINK => $_UNLINK
);

# Alternative approaches for working with $fh
# (it would also be better to use the OO interface then):
#   sub { open STDOUT, '>&', $fh; system("cat '$file' | perl '$script'") },
#   sub { open(my $pipe, "cat '$file' | perl '$script'|"); while(<$pipe>){$fh->print($_)}; $fh->close },
#   sub {
#     defined(my $pid = fork) or die "fork: $!";
#     if (!$pid) {
#       open STDOUT, '>&', $fh;
#       exec "cat '$file' | perl '$script'"
#     }
#     waitpid $pid, 0;
#     $fh->close;
#   },

# Generate zip file (unportable!): the progress message is expected on STDERR
stderr_like(
  sub { qx{cat '$file' | perl '$script' -ti > '$outzip'} },
  qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
  'Processing'
);

ok(-e $outzip, "File $outzip exists");
60
# Uncompress GOE/header.xml from zip file
my $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/header.xml');

ok($zip, 'Zip-File is created');

# TODO: check wrong encoding in header-files (compare with input document)!
# Read GOE/header.xml line by line until the member is exhausted
my $header_xml = '';
until ($zip->eof) {
  $header_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

my $t = Test::XML::Loy->new($header_xml);

# Each entry holds: selector, expected text, test description
for my $expect (
  ['korpusSigle',          'GOE',                         'korpusSigle'],
  ['h\.title[type=main]',  'Goethes Werke',               'h.title'],
  ['h\.author',            'Goethe, Johann Wolfgang von', 'h.author'],
  ['pubDate[type=year]',   '1982',                        'pubDate'],
) {
  $t->text_is(@$expect);
}
Akron2a60c532020-02-13 15:52:18 +010078
Akron68966082020-02-13 15:52:18 +010079
# Uncompress GOE/AGA/header.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/header.xml');

ok($zip, 'Zip-File is found');

# Read GOE/AGA/header.xml
$header_xml = '';
until ($zip->eof) {
  $header_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

$t = Test::XML::Loy->new($header_xml);

# Document level metadata
$t->text_is('dokumentSigle', 'GOE/AGA', 'dokumentSigle');
$t->text_is(
  'd\.title',
  'Goethe: Autobiographische Schriften II, (1817-1825, 1832)',
  'd.title'
);
$t->text_is('creatDate', '1820-1822', 'creatDate');
Akron2a60c532020-02-13 15:52:18 +010095
# Uncompress GOE/AGA/00000/header.xml from zip file
$zip = IO::Uncompress::Unzip->new(
  $outzip,
  Name => 'GOE/AGA/00000/header.xml'
);

ok($zip, 'Zip-File is found');

# Read GOE/AGA/00000/header.xml
$header_xml = '';
while (!$zip->eof) {
  $header_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

# Text level metadata
$t = Test::XML::Loy->new($header_xml);
$t->text_is('textSigle', 'GOE/AGA.00000', 'textSigle');
$t->text_is(
  'analytic > h\.title[type=main]',
  'Campagne in Frankreich',
  'h.title'
);
Akron2a60c532020-02-13 15:52:18 +0100109
# Uncompress GOE/AGA/00000/data.xml from zip file
$zip = IO::Uncompress::Unzip->new(
  $outzip,
  Name => 'GOE/AGA/00000/data.xml'
);

ok($zip, 'Zip-File is found');

# Read GOE/AGA/00000/data.xml
my $data_xml = '';
until ($zip->eof) {
  $data_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

# The raw text carries the text id and the expected first and last words
$t = Test::XML::Loy->new($data_xml);
$t->attr_is('raw_text', 'docid', 'GOE_AGA.00000', 'text id');
$t->text_like(
  'raw_text > text',
  qr!^Campagne in Frankreich 1792.*?uns allein begl.*cke\.$!,
  'text content'
);
Akron2a60c532020-02-13 15:52:18 +0100123
# Uncompress GOE/AGA/00000/struct/structure.xml from zip file
$zip = IO::Uncompress::Unzip->new(
  $outzip,
  Name => 'GOE/AGA/00000/struct/structure.xml'
);

ok($zip, 'Zip-File is found');

# Read GOE/AGA/00000/struct/structure.xml
my $struct_xml = '';
while (!$zip->eof) {
  $struct_xml .= $zip->getline;
}

ok($zip->close, 'Closed');

# The span with id "s3" must contain a descendant named "type" with this text
$t = Test::XML::Loy->new($struct_xml);
$t->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content');
Akron797e8072020-02-13 07:59:40 +0100137
Akroneac374d2020-07-07 09:00:44 +0200138
# Uncompress GOE/AGA/00000/base/tokens_aggressive.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_aggressive.xml');

# Report a missing member as a test failure (with the unzip error) before
# any method is called on the handle, as the other sections of this file do
ok($zip, 'Zip-File is found') or diag("Unzip failed: $UnzipError");

# Read GOE/AGA/00000/base/tokens_aggressive.xml
my $tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Spot-check character offsets of individual token spans
$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8);

$t->attr_is('spanList span#t_1', 'from', 9);
$t->attr_is('spanList span#t_1', 'to', 11);

$t->attr_is('spanList span#t_67', 'from', 427);
$t->attr_is('spanList span#t_67', 'to', 430);

$t->attr_is('spanList span#t_214', 'from', 1209);
$t->attr_is('spanList span#t_214', 'to', 1212);

# Total number of token spans
$t->element_count_is('spanList span', 227);
160
161
# Uncompress GOE/AGA/00000/base/tokens_conservative.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_conservative.xml');

# Report a missing member as a test failure (with the unzip error) before
# any method is called on the handle, as the other sections of this file do
ok($zip, 'Zip-File is found') or diag("Unzip failed: $UnzipError");

# Read GOE/AGA/00000/base/tokens_conservative.xml
$tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Spot-check character offsets of individual token spans
$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8);

$t->attr_is('spanList span#t_1', 'from', 9);
$t->attr_is('spanList span#t_1', 'to', 11);

$t->attr_is('spanList span#t_67', 'from', 427);
$t->attr_is('spanList span#t_67', 'to', 430);

$t->attr_is('spanList span#t_214', 'from', 1209);
$t->attr_is('spanList span#t_214', 'to', 1212);

# Total number of token spans
$t->element_count_is('spanList span', 227);
182
# Tokenize with external tokenizer
my $cmd = catfile($f, 'cmd', 'tokenizer.pl');

# Separate temporary file for the archive produced with the external tokenizer
my ($fh2, $outzip2) = tempfile("KorAP-XML-TEI_script_XXXXXXXXXX", SUFFIX => ".tmp", TMPDIR => 1, UNLINK => $_UNLINK);

# Generate zip file (unportable!), passing the tokenizer command via --tc
stderr_like(
  sub { `cat '$file' | perl '$script' --tc='perl $cmd' > '$outzip2'` },
  qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
  'Processing'
);

# Uncompress GOE/AGA/00000/base/tokens.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip2, Name => 'GOE/AGA/00000/base/tokens.xml');

# Report a missing member as a test failure (with the unzip error) before
# any method is called on the handle, as the other sections of this file do
ok($zip, 'Zip-File is found') or diag("Unzip failed: $UnzipError");

# Read GOE/AGA/00000/base/tokens.xml
$tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Spot-check character offsets of individual token spans
$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8);

$t->attr_is('spanList span#t_1', 'from', 9);
$t->attr_is('spanList span#t_1', 'to', 11);

$t->attr_is('spanList span#t_67', 'from', 427);
$t->attr_is('spanList span#t_67', 'to', 430);

$t->attr_is('spanList span#t_214', 'from', 1209);
$t->attr_is('spanList span#t_214', 'to', 1212);

# Total number of token spans
$t->element_count_is('spanList span', 227);
215
Peter Harders71f072b2020-07-15 14:15:01 +0200216
217
218# TODO: call $script with approp. parameter for internal tokenization (actual: '$_GEN_TOK_INT = 1' hardcoded)
219
220
# Temporary file for the archive generated from the text_with_blanks sample
my ($fh3, $outzip3) = tempfile("KorAP-XML-TEI_script_XXXXXXXXXX", SUFFIX => ".tmp", TMPDIR => 1, UNLINK => $_UNLINK);


# ~ test conservative tokenization ~

$file = catfile($f, 'data', 'text_with_blanks.i5.xml');

# Generate zip file (unportable!). The dot in the text id is escaped so that
# it only matches a literal ".", like the other 'Processing' patterns here.
stderr_like(
  sub { `cat '$file' | perl '$script' --ti > '$outzip3'` },
  qr!tei2korapxml: .*? text_id=CORP_DOC\.00001!,
  'Processing'
);

ok(-e $outzip3, "File $outzip3 exists");

$zip = IO::Uncompress::Unzip->new($outzip3, Name => 'CORP/DOC/00001/base/tokens_conservative.xml');

ok($zip, 'Zip-File is created');

# Read CORP/DOC/00001/base/tokens_conservative.xml
my $cons = '';
$cons .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Spot-check character offsets of individual token spans
$t = Test::XML::Loy->new($cons);
$t->attr_is('spanList span:nth-child(1)', 'to', 6);

$t->attr_is('spanList span#t_1', 'from', 7);
$t->attr_is('spanList span#t_1', 'to', 9);

$t->attr_is('spanList span#t_3', 'from', 12);
$t->attr_is('spanList span#t_3', 'to', 16);

$t->attr_is('spanList span#t_9', 'from', 36);
$t->attr_is('spanList span#t_9', 'to', 37);

$t->attr_is('spanList span#t_13', 'from', 44);
$t->attr_is('spanList span#t_13', 'to', 45);    # "

$t->attr_is('spanList span#t_14', 'from', 45);  # twenty-two
$t->attr_is('spanList span#t_14', 'to', 55);

$t->attr_is('spanList span#t_15', 'from', 55);  # "
$t->attr_is('spanList span#t_15', 'to', 56);

$t->attr_is('spanList span#t_19', 'from', 66);
$t->attr_is('spanList span#t_19', 'to', 67);

# Total number of token spans
$t->element_count_is('spanList span', 20);
269
270
# ~ test aggressive tokenization ~

$zip = IO::Uncompress::Unzip->new(
  $outzip3,
  Name => 'CORP/DOC/00001/base/tokens_aggressive.xml'
);

ok($zip, 'Zip-File is created');

# Read CORP/DOC/00001/base/tokens_aggressive.xml
my $aggr = '';
until ($zip->eof) {
  $aggr .= $zip->getline;
}
ok($zip->close, 'Closed');

$t = Test::XML::Loy->new($aggr);

$t->attr_is('spanList span:nth-child(1)', 'to', 6);

# Expected character offsets per token span: id, from, to
my @aggr_offsets = (
  [t_1  => 7,  9],
  [t_3  => 12, 16],
  [t_9  => 36, 37],
  [t_13 => 44, 45],    # "
  [t_14 => 45, 51],    # twenty
  [t_15 => 51, 52],    # -
  [t_16 => 52, 55],    # two
  [t_17 => 55, 56],    # "
  [t_21 => 66, 67],
);

for my $row (@aggr_offsets) {
  my ($id, $from, $to) = @$row;
  $t->attr_is("spanList span#$id", 'from', $from);
  $t->attr_is("spanList span#$id", 'to', $to);
}

# Total number of token spans
$t->element_count_is('spanList span', 22);


done_testing();