blob: 85c2cea9348b50ac5ab39172cf5968a034c6c29e [file] [log] [blame]
Akron797e8072020-02-13 07:59:40 +01001use strict;
2use warnings;
3use File::Basename 'dirname';
4use File::Spec::Functions qw/catfile/;
Peter Harders57c884e2020-07-16 01:28:52 +02005use File::Temp qw/tempfile/;
Akron2a60c532020-02-13 15:52:18 +01006use IO::Uncompress::Unzip qw(unzip $UnzipError);
Akron797e8072020-02-13 07:59:40 +01007
8use Test::More;
9use Test::Output;
10
Akrond89ef822020-02-17 12:42:09 +010011use Test::XML::Loy;
Akron2a60c532020-02-13 15:52:18 +010012
Peter Harders57c884e2020-07-16 01:28:52 +020013our %ENV;
14# default: remove temp. file created by func. tempfile
15# to keep temp. files use e.g. 'KORAPXMLTEI_DONTUNLINK=1 prove -lr t/script.t'
16my $_UNLINK = $ENV{KORAPXMLTEI_DONTUNLINK}?0:1;
17
Akron797e8072020-02-13 07:59:40 +010018my $f = dirname(__FILE__);
19my $script = catfile($f, '..', 'script', 'tei2korapxml');
20ok(-f $script, 'Script found');
21
Akrond949e182020-02-14 12:23:57 +010022stdout_like(
Akron797e8072020-02-13 07:59:40 +010023 sub { system('perl', $script, '--help') },
Akrond949e182020-02-14 12:23:57 +010024 qr!This\s*program\s*is\s*usually\s*called\s*from\s*inside\s*another\s*script\.!,
Akron797e8072020-02-13 07:59:40 +010025 'Help'
26);
27
Akrond949e182020-02-14 12:23:57 +010028stdout_like(
29 sub { system('perl', $script, '--version') },
30 qr!tei2korapxml - v\d+?\.\d+?!,
31 'Version'
32);
33
34
Akron2a60c532020-02-13 15:52:18 +010035# Load example file
36my $file = catfile($f, 'data', 'goe_sample.i5.xml');
Peter Harders57c884e2020-07-16 01:28:52 +020037
38my ($fh, $outzip) = tempfile("KorAP-XML-TEI_script_XXXXXXXXXX", SUFFIX => ".tmp", TMPDIR => 1, UNLINK => $_UNLINK);
Akron2a60c532020-02-13 15:52:18 +010039
40# Generate zip file (unportable!)
41stderr_like(
42 sub { `cat '$file' | perl '$script' > '$outzip'` },
Peter Harders57c884e2020-07-16 01:28:52 +020043# approaches for working with $fh (also better use OO interface then)
44# sub { open STDOUT, '>&', $fh; system("cat '$file' | perl '$script'") },
45# sub { open(my $pipe, "cat '$file' | perl '$script'|"); while(<$pipe>){$fh->print($_)}; $fh->close },
46# sub {
47# defined(my $pid = fork) or die "fork: $!";
48# if (!$pid) {
49# open STDOUT, '>&', $fh;
50# exec "cat '$file' | perl '$script'"
51# }
52# waitpid $pid, 0;
53# $fh->close;
54# },
Akron2a60c532020-02-13 15:52:18 +010055 qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
56 'Processing'
57);
58
Akron85717512020-07-08 11:19:19 +020059ok(-e $outzip, "File $outzip exists");
60
Akron2a60c532020-02-13 15:52:18 +010061# Uncompress GOE/header.xml from zip file
62my $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/header.xml');
63
64ok($zip, 'Zip-File is created');
65
Peter Harders57c884e2020-07-16 01:28:52 +020066# TODO: check wrong encoding in header-files (compare with input document)!
Akron2a60c532020-02-13 15:52:18 +010067# Read GOE/header.xml
68my $header_xml = '';
69$header_xml .= $zip->getline while !$zip->eof;
70ok($zip->close, 'Closed');
71
Akrond89ef822020-02-17 12:42:09 +010072my $t = Test::XML::Loy->new($header_xml);
Akron2a60c532020-02-13 15:52:18 +010073
Akrond89ef822020-02-17 12:42:09 +010074$t->text_is('korpusSigle', 'GOE', 'korpusSigle')
75 ->text_is('h\.title[type=main]', 'Goethes Werke', 'h.title')
76 ->text_is('h\.author', 'Goethe, Johann Wolfgang von', 'h.author')
77 ->text_is('pubDate[type=year]', '1982', 'pubDate');
Akron2a60c532020-02-13 15:52:18 +010078
Akron68966082020-02-13 15:52:18 +010079
Akron2a60c532020-02-13 15:52:18 +010080# Uncompress GOE/AGA/header.xml from zip file
81$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/header.xml');
82
83ok($zip, 'Zip-File is found');
84
85# Read GOE/AGA/header.xml
86$header_xml = '';
87$header_xml .= $zip->getline while !$zip->eof;
88ok($zip->close, 'Closed');
89
Akrond89ef822020-02-17 12:42:09 +010090$t = Test::XML::Loy->new($header_xml);
Akron2a60c532020-02-13 15:52:18 +010091
Akrond89ef822020-02-17 12:42:09 +010092$t->text_is('dokumentSigle', 'GOE/AGA', 'dokumentSigle')
93 ->text_is('d\.title', 'Goethe: Autobiographische Schriften II, (1817-1825, 1832)', 'd.title')
94 ->text_is('creatDate', '1820-1822', 'creatDate');
Akron2a60c532020-02-13 15:52:18 +010095
96# Uncompress GOE/AGA/00000/header.xml from zip file
97$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/header.xml');
98
99ok($zip, 'Zip-File is found');
100
101# Read GOE/AGA/00000/header.xml
102$header_xml = '';
103$header_xml .= $zip->getline while !$zip->eof;
104ok($zip->close, 'Closed');
105
Akrond89ef822020-02-17 12:42:09 +0100106$t = Test::XML::Loy->new($header_xml);
107$t->text_is('textSigle', 'GOE/AGA.00000', 'textSigle')
108 ->text_is('analytic > h\.title[type=main]', 'Campagne in Frankreich', 'h.title');
Akron2a60c532020-02-13 15:52:18 +0100109
110# Uncompress GOE/AGA/00000/data.xml from zip file
111$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/data.xml');
112
113ok($zip, 'Zip-File is found');
114
115# Read GOE/AGA/00000/data.xml
116my $data_xml = '';
117$data_xml .= $zip->getline while !$zip->eof;
118ok($zip->close, 'Closed');
119
Akrond89ef822020-02-17 12:42:09 +0100120$t = Test::XML::Loy->new($data_xml);
121$t->attr_is('raw_text', 'docid', 'GOE_AGA.00000', 'text id')
122 ->text_like('raw_text > text', qr!^Campagne in Frankreich 1792.*?uns allein begl.*cke\.$!, 'text content');
Akron2a60c532020-02-13 15:52:18 +0100123
124# Uncompress GOE/AGA/00000/struct/structure.xml from zip file
125$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/struct/structure.xml');
126
127ok($zip, 'Zip-File is found');
128
129# Read GOE/AGA/00000/struct/structure.xml
130my $struct_xml = '';
131$struct_xml .= $zip->getline while !$zip->eof;
Peter Harders57c884e2020-07-16 01:28:52 +0200132
Akron2a60c532020-02-13 15:52:18 +0100133ok($zip->close, 'Closed');
134
Akrond89ef822020-02-17 12:42:09 +0100135$t = Test::XML::Loy->new($struct_xml);
136$t->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content');
Akron797e8072020-02-13 07:59:40 +0100137
Akroneac374d2020-07-07 09:00:44 +0200138
139# Uncompress GOE/AGA/00000/base/tokens_aggressive.xml from zip file
140$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_aggressive.xml');
141
142# Read GOE/AGA/00000/base/tok.xml
143my $tokens_xml = '';
144$tokens_xml .= $zip->getline while !$zip->eof;
145ok($zip->close, 'Closed');
146
147$t = Test::XML::Loy->new($tokens_xml);
148$t->attr_is('spanList span:nth-child(1)', 'to', 8);
149
150$t->attr_is('spanList span#t_1', 'from', 9);
151$t->attr_is('spanList span#t_1', 'to', 11);
152
153$t->attr_is('spanList span#t_67', 'from', 427);
154$t->attr_is('spanList span#t_67', 'to', 430);
155
156$t->attr_is('spanList span#t_214', 'from', 1209);
157$t->attr_is('spanList span#t_214', 'to', 1212);
158
159$t->element_count_is('spanList span', 227);
160
161
162# Uncompress GOE/AGA/00000/base/tokens_conservative.xml from zip file
163$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_conservative.xml');
164
Akron8b511f92020-07-09 17:28:08 +0200165$tokens_xml = '';
166$tokens_xml .= $zip->getline while !$zip->eof;
167ok($zip->close, 'Closed');
168
169$t = Test::XML::Loy->new($tokens_xml);
170$t->attr_is('spanList span:nth-child(1)', 'to', 8);
171
172$t->attr_is('spanList span#t_1', 'from', 9);
173$t->attr_is('spanList span#t_1', 'to', 11);
174
175$t->attr_is('spanList span#t_67', 'from', 427);
176$t->attr_is('spanList span#t_67', 'to', 430);
177
178$t->attr_is('spanList span#t_214', 'from', 1209);
179$t->attr_is('spanList span#t_214', 'to', 1212);
180
181$t->element_count_is('spanList span', 227);
182
183# Tokenize with external tokenizer
184my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
185
186stderr_like(
187 sub { `cat '$file' | perl '$script' --tc='perl $cmd' > '$outzip'` },
188 qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
189 'Processing'
190);
191
Peter Harders71f072b2020-07-15 14:15:01 +0200192# Uncompress GOE/AGA/00000/base/tokens.xml from zip file
Akron8b511f92020-07-09 17:28:08 +0200193$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens.xml');
194
195# Read GOE/AGA/00000/base/tokens.xml
Akroneac374d2020-07-07 09:00:44 +0200196$tokens_xml = '';
197$tokens_xml .= $zip->getline while !$zip->eof;
198ok($zip->close, 'Closed');
199
200$t = Test::XML::Loy->new($tokens_xml);
201$t->attr_is('spanList span:nth-child(1)', 'to', 8);
202
203$t->attr_is('spanList span#t_1', 'from', 9);
204$t->attr_is('spanList span#t_1', 'to', 11);
205
206$t->attr_is('spanList span#t_67', 'from', 427);
207$t->attr_is('spanList span#t_67', 'to', 430);
208
209$t->attr_is('spanList span#t_214', 'from', 1209);
210$t->attr_is('spanList span#t_214', 'to', 1212);
211
212$t->element_count_is('spanList span', 227);
213
Peter Harders71f072b2020-07-15 14:15:01 +0200214
215
216# TODO: call $script with approp. parameter for internal tokenization (actual: '$_GEN_TOK_INT = 1' hardcoded)
217
218
219# ~ test conservative tokenization ~
220
221$file = catfile($f, 'data', 'text_with_blanks.i5.xml');
222
223stderr_like(
224 sub { `cat '$file' | perl '$script' > '$outzip'` },
225 qr!tei2korapxml: .*? text_id=CORP_DOC.00001!,
226 'Processing'
227);
228
229ok(-e $outzip, "File $outzip exists");
230
231$zip = IO::Uncompress::Unzip->new($outzip, Name => 'CORP/DOC/00001/base/tokens_conservative.xml');
232
233ok($zip, 'Zip-File is created');
234
235my $cons = '';
236$cons .= $zip->getline while !$zip->eof;
237ok($zip->close, 'Closed');
238
239$t = Test::XML::Loy->new($cons);
240$t->attr_is('spanList span:nth-child(1)', 'to', 6);
241
242$t->attr_is('spanList span#t_1', 'from', 7);
243$t->attr_is('spanList span#t_1', 'to', 9);
244
245$t->attr_is('spanList span#t_3', 'from', 12);
246$t->attr_is('spanList span#t_3', 'to', 16);
247
248$t->attr_is('spanList span#t_9', 'from', 36);
249$t->attr_is('spanList span#t_9', 'to', 37);
250
251$t->attr_is('spanList span#t_13', 'from', 44);
252$t->attr_is('spanList span#t_13', 'to', 45); # "
253
254$t->attr_is('spanList span#t_14', 'from', 45); # twenty-two
255$t->attr_is('spanList span#t_14', 'to', 55);
256
257$t->attr_is('spanList span#t_15', 'from', 55); # "
258$t->attr_is('spanList span#t_15', 'to', 56);
259
260$t->attr_is('spanList span#t_19', 'from', 66);
261$t->attr_is('spanList span#t_19', 'to', 67);
262
263$t->element_count_is('spanList span', 20);
264
265
266# ~ test aggressive tokenization ~
267
268$zip = IO::Uncompress::Unzip->new($outzip, Name => 'CORP/DOC/00001/base/tokens_aggressive.xml');
269
270ok($zip, 'Zip-File is created');
271
272my $aggr = '';
273$aggr .= $zip->getline while !$zip->eof;
274ok($zip->close, 'Closed');
275
276$t = Test::XML::Loy->new($aggr);
277
278$t->attr_is('spanList span:nth-child(1)', 'to', 6);
279
280$t->attr_is('spanList span#t_1', 'from', 7);
281$t->attr_is('spanList span#t_1', 'to', 9);
282
283$t->attr_is('spanList span#t_3', 'from', 12);
284$t->attr_is('spanList span#t_3', 'to', 16);
285
286$t->attr_is('spanList span#t_9', 'from', 36);
287$t->attr_is('spanList span#t_9', 'to', 37);
288
289$t->attr_is('spanList span#t_13', 'from', 44);
290$t->attr_is('spanList span#t_13', 'to', 45); # "
291
292$t->attr_is('spanList span#t_14', 'from', 45); # twenty
293$t->attr_is('spanList span#t_14', 'to', 51);
294
295$t->attr_is('spanList span#t_15', 'from', 51); # -
296$t->attr_is('spanList span#t_15', 'to', 52);
297
298$t->attr_is('spanList span#t_16', 'from', 52); # two
299$t->attr_is('spanList span#t_16', 'to', 55);
300
301$t->attr_is('spanList span#t_17', 'from', 55); # "
302$t->attr_is('spanList span#t_17', 'to', 56);
303
304$t->attr_is('spanList span#t_21', 'from', 66);
305$t->attr_is('spanList span#t_21', 'to', 67);
306
307$t->element_count_is('spanList span', 22);
308
309
Akron797e8072020-02-13 07:59:40 +0100310done_testing;