# blob: 8d17be63dccc7ba15d9f0344099817210ea211ea [file] [log] [blame]
use strict;
use warnings;
use File::Basename 'dirname';
use File::Spec::Functions qw/catfile/;
use IO::Uncompress::Unzip qw(unzip $UnzipError);

use Test::More;
use Test::Output;
use Test::XML::Loy;
use Test::KorAP::XML::TEI qw!korap_tempfile!;

# Locate the tei2korapxml script relative to the directory of this test file.
# $f (the test directory) and $script are reused by all following sections.
my $f = dirname(__FILE__);
my $script = catfile($f, '..', 'script', 'tei2korapxml');
ok(-f $script, 'Script found');

# Run the script with --help and check that the usage notice is printed
# to STDOUT.
stdout_like(
  sub { system('perl', $script, '--help') },
  qr!This\s*program\s*is\s*usually\s*called\s*from\s*inside\s*another\s*script\.!,
  'Help'
);

# Run the script with --version and check that the program name followed
# by a version number is printed to STDOUT.
stdout_like(
  sub { system('perl', $script, '--version') },
  qr!tei2korapxml - v\d+?\.\d+?!,
  'Version'
);


# Load example file
my $file = catfile($f, 'data', 'goe_sample.i5.xml');

# $outzip is used below as the path the generated zip archive is written to
my ($fh, $outzip) = korap_tempfile('script_out');

# Generate zip file (unportable!)
# The input is piped through the shell into the script; the archive goes to
# $outzip, while the progress message on STDERR is what gets matched here.
stderr_like(
  sub { `cat '$file' | perl '$script' -ti > '$outzip'` },
# approaches for working with $fh (also better use OO interface then)
# sub { open STDOUT, '>&', $fh; system("cat '$file' | perl '$script'") },
# sub { open(my $pipe, "cat '$file' | perl '$script'|"); while(<$pipe>){$fh->print($_)}; $fh->close },
# sub {
#   defined(my $pid = fork) or die "fork: $!";
#   if (!$pid) {
#     open STDOUT, '>&', $fh;
#     exec "cat '$file' | perl '$script'"
#   }
#   waitpid $pid, 0;
#   $fh->close;
# },
  qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
  'Processing'
);

ok(-e $outzip, "File $outzip exists");

# Uncompress GOE/header.xml from zip file
my $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/header.xml');

ok($zip, 'Zip-File is created');

# TODO: check wrong encoding in header-files (compare with input document)!
# Read GOE/header.xml
my $header_xml = '';
$header_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check the corpus-level metadata
my $t = Test::XML::Loy->new($header_xml);

$t->text_is('korpusSigle', 'GOE', 'korpusSigle')
  ->text_is('h\.title[type=main]', 'Goethes Werke', 'h.title')
  ->text_is('h\.author', 'Goethe, Johann Wolfgang von', 'h.author')
  ->text_is('pubDate[type=year]', '1982', 'pubDate');


# Uncompress GOE/AGA/header.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/header.xml');

ok($zip, 'Zip-File is found');

# Read GOE/AGA/header.xml
$header_xml = '';
$header_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check the document-level metadata
$t = Test::XML::Loy->new($header_xml);

$t->text_is('dokumentSigle', 'GOE/AGA', 'dokumentSigle')
  ->text_is('d\.title', 'Goethe: Autobiographische Schriften II, (1817-1825, 1832)', 'd.title')
  ->text_is('creatDate', '1820-1822', 'creatDate');

# Uncompress GOE/AGA/00000/header.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/header.xml');

ok($zip, 'Zip-File is found');

# Read GOE/AGA/00000/header.xml
$header_xml = '';
$header_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check the text-level metadata
$t = Test::XML::Loy->new($header_xml);
$t->text_is('textSigle', 'GOE/AGA.00000', 'textSigle')
  ->text_is('analytic > h\.title[type=main]', 'Campagne in Frankreich', 'h.title');

# Uncompress GOE/AGA/00000/data.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/data.xml');

ok($zip, 'Zip-File is found');

# Read GOE/AGA/00000/data.xml
my $data_xml = '';
$data_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check the document id and the beginning and end of the raw text
$t = Test::XML::Loy->new($data_xml);
$t->attr_is('raw_text', 'docid', 'GOE_AGA.00000', 'text id')
  ->text_like('raw_text > text', qr!^Campagne in Frankreich 1792.*?uns allein begl.*cke\.$!, 'text content');

# Uncompress GOE/AGA/00000/struct/structure.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/struct/structure.xml');

ok($zip, 'Zip-File is found');

# Read GOE/AGA/00000/struct/structure.xml
my $struct_xml = '';
$struct_xml .= $zip->getline while !$zip->eof;

ok($zip->close, 'Closed');

# Check the "type" value inside the span with id s3
$t = Test::XML::Loy->new($struct_xml);
$t->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content');


# Uncompress GOE/AGA/00000/base/tokens_aggressive.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_aggressive.xml');

# Verify the member could be opened before reading from it, as the other
# sections of this test do
ok($zip, 'Zip-File is found');

# Read GOE/AGA/00000/base/tokens_aggressive.xml
my $tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check selected token offsets and the total number of token spans
$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8);

$t->attr_is('spanList span#t_1', 'from', 9);
$t->attr_is('spanList span#t_1', 'to', 11);

$t->attr_is('spanList span#t_67', 'from', 427);
$t->attr_is('spanList span#t_67', 'to', 430);

$t->attr_is('spanList span#t_214', 'from', 1209);
$t->attr_is('spanList span#t_214', 'to', 1212);

$t->element_count_is('spanList span', 227);


# Uncompress GOE/AGA/00000/base/tokens_conservative.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_conservative.xml');

# Verify the member could be opened before reading from it, as the other
# sections of this test do
ok($zip, 'Zip-File is found');

# Read GOE/AGA/00000/base/tokens_conservative.xml
$tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check selected token offsets and the total number of token spans
$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8);

$t->attr_is('spanList span#t_1', 'from', 9);
$t->attr_is('spanList span#t_1', 'to', 11);

$t->attr_is('spanList span#t_67', 'from', 427);
$t->attr_is('spanList span#t_67', 'to', 430);

$t->attr_is('spanList span#t_214', 'from', 1209);
$t->attr_is('spanList span#t_214', 'to', 1212);

$t->element_count_is('spanList span', 227);

# Tokenize with external tokenizer
my $cmd = catfile($f, 'cmd', 'tokenizer.pl');

# $outzip2 is used below as the path of the archive written by this run
my ($fh2, $outzip2) = korap_tempfile('script_out2');

# Run the conversion again, passing the tokenizer command via --tc
stderr_like(
  sub { `cat '$file' | perl '$script' --tc='perl $cmd' > '$outzip2'` },
  qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
  'Processing'
);

# Same existence check as for the other conversion runs
ok(-e $outzip2, "File $outzip2 exists");

# Uncompress GOE/AGA/00000/base/tokens.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip2, Name => 'GOE/AGA/00000/base/tokens.xml');

# Verify the member could be opened before reading from it
ok($zip, 'Zip-File is found');

# Read GOE/AGA/00000/base/tokens.xml
$tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check selected token offsets and the total number of token spans
$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8);

$t->attr_is('spanList span#t_1', 'from', 9);
$t->attr_is('spanList span#t_1', 'to', 11);

$t->attr_is('spanList span#t_67', 'from', 427);
$t->attr_is('spanList span#t_67', 'to', 430);

$t->attr_is('spanList span#t_214', 'from', 1209);
$t->attr_is('spanList span#t_214', 'to', 1212);

$t->element_count_is('spanList span', 227);



# TODO: call $script with approp. parameter for internal tokenization (actual: '$_GEN_TOK_INT = 1' hardcoded)


# $outzip3 is used below as the path of the archive written by this run
my ($fh3, $outzip3) = korap_tempfile('script_out3');


# ~ test conservative tokenization ~

$file = catfile($f, 'data', 'text_with_blanks.i5.xml');

# The dot in the text id is escaped so that it only matches a literal '.',
# as in the patterns of the other conversion runs
stderr_like(
  sub { `cat '$file' | perl '$script' --ti > '$outzip3'` },
  qr!tei2korapxml: .*? text_id=CORP_DOC\.00001!,
  'Processing'
);

ok(-e $outzip3, "File $outzip3 exists");

$zip = IO::Uncompress::Unzip->new($outzip3, Name => 'CORP/DOC/00001/base/tokens_conservative.xml');

ok($zip, 'Zip-File is created');

# Read CORP/DOC/00001/base/tokens_conservative.xml
my $cons = '';
$cons .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check selected token offsets and the total number of token spans
$t = Test::XML::Loy->new($cons);
$t->attr_is('spanList span:nth-child(1)', 'to', 6);

$t->attr_is('spanList span#t_1', 'from', 7);
$t->attr_is('spanList span#t_1', 'to', 9);

$t->attr_is('spanList span#t_3', 'from', 12);
$t->attr_is('spanList span#t_3', 'to', 16);

$t->attr_is('spanList span#t_9', 'from', 36);
$t->attr_is('spanList span#t_9', 'to', 37);

$t->attr_is('spanList span#t_13', 'from', 44);
$t->attr_is('spanList span#t_13', 'to', 45);    # "

$t->attr_is('spanList span#t_14', 'from', 45);  # twenty-two
$t->attr_is('spanList span#t_14', 'to', 55);

$t->attr_is('spanList span#t_15', 'from', 55);  # "
$t->attr_is('spanList span#t_15', 'to', 56);

$t->attr_is('spanList span#t_19', 'from', 66);
$t->attr_is('spanList span#t_19', 'to', 67);

$t->element_count_is('spanList span', 20);


# ~ test aggressive tokenization ~

$zip = IO::Uncompress::Unzip->new($outzip3, Name => 'CORP/DOC/00001/base/tokens_aggressive.xml');

ok($zip, 'Zip-File is created');

# Read CORP/DOC/00001/base/tokens_aggressive.xml
my $aggr = '';
$aggr .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check selected token offsets and the total number of token spans
$t = Test::XML::Loy->new($aggr);

$t->attr_is('spanList span:nth-child(1)', 'to', 6);

$t->attr_is('spanList span#t_1', 'from', 7);
$t->attr_is('spanList span#t_1', 'to', 9);

$t->attr_is('spanList span#t_3', 'from', 12);
$t->attr_is('spanList span#t_3', 'to', 16);

$t->attr_is('spanList span#t_9', 'from', 36);
$t->attr_is('spanList span#t_9', 'to', 37);

$t->attr_is('spanList span#t_13', 'from', 44);
$t->attr_is('spanList span#t_13', 'to', 45);    # "

$t->attr_is('spanList span#t_14', 'from', 45);  # twenty
$t->attr_is('spanList span#t_14', 'to', 51);

$t->attr_is('spanList span#t_15', 'from', 51);  # -
$t->attr_is('spanList span#t_15', 'to', 52);

$t->attr_is('spanList span#t_16', 'from', 52);  # two
$t->attr_is('spanList span#t_16', 'to', 55);

$t->attr_is('spanList span#t_17', 'from', 55);  # "
$t->attr_is('spanList span#t_17', 'to', 56);

$t->attr_is('spanList span#t_21', 'from', 66);
$t->attr_is('spanList span#t_21', 'to', 67);

$t->element_count_is('spanList span', 22);


# No fixed plan is declared; signal that all tests have been run
done_testing;