blob: eaf63483c7f938513c7786e0f4c8335a7edbfd82 [file] [log] [blame]
Akron797e8072020-02-13 07:59:40 +01001use strict;
2use warnings;
3use File::Basename 'dirname';
4use File::Spec::Functions qw/catfile/;
Akron2a60c532020-02-13 15:52:18 +01005use File::Temp ':POSIX';
6use IO::Uncompress::Unzip qw(unzip $UnzipError);
Akron797e8072020-02-13 07:59:40 +01007
8use Test::More;
9use Test::Output;
10
Akrond89ef822020-02-17 12:42:09 +010011use Test::XML::Loy;
Akron2a60c532020-02-13 15:52:18 +010012
# Locate the converter script relative to the directory of this test file
my $f = dirname(__FILE__);
my $script = catfile($f, qw(.. script tei2korapxml));
ok(-f $script, 'Script found');

# Every informational switch has to print its expected text on STDOUT.
# Each entry: command line switch, expected output pattern, test label.
for my $check (
  [
    '--help',
    qr!This\s*program\s*is\s*usually\s*called\s*from\s*inside\s*another\s*script\.!,
    'Help'
  ],
  [
    '--version',
    qr!tei2korapxml - v\d+?\.\d+?!,
    'Version'
  ],
) {
  my ($switch, $expected, $label) = @$check;

  stdout_like(
    sub { system('perl', $script, $switch) },
    $expected,
    $label
  );
}

28
29
# Sample I5 input file and a temporary path for the generated archive
my $file = catfile($f, 'data', 'goe_sample.i5.xml');
my $outzip = tmpnam();

# Convert the sample through a shell pipeline; the command depends on
# `cat` and shell redirection (unportable!)
stderr_like(
  sub {
    my $pipeline = "cat '$file' | perl '$script' > '$outzip'";
    return qx{$pipeline};
  },
  qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
  'Processing'
);

# The redirect above is what creates the archive
ok(-e $outzip, "File $outzip exists");
42
# Open the corpus level header (GOE/header.xml) inside the archive
my $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/header.xml');

ok($zip, 'Zip-File is created');

# Collect GOE/header.xml line by line until the stream is exhausted
my $header_xml = '';
until ($zip->eof) {
  $header_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

# Check the corpus metadata
my $t = Test::XML::Loy->new($header_xml);

$t->text_is('korpusSigle', 'GOE', 'korpusSigle');
$t->text_is('h\.title[type=main]', 'Goethes Werke', 'h.title');
$t->text_is('h\.author', 'Goethe, Johann Wolfgang von', 'h.author');
$t->text_is('pubDate[type=year]', '1982', 'pubDate');
Akron2a60c532020-02-13 15:52:18 +010059
Akron68966082020-02-13 15:52:18 +010060
# Open the document level header (GOE/AGA/header.xml) inside the archive
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/header.xml');

ok($zip, 'Zip-File is found');

# Collect GOE/AGA/header.xml line by line until the stream is exhausted
$header_xml = '';
until ($zip->eof) {
  $header_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

# Check the document metadata
$t = Test::XML::Loy->new($header_xml);

$t->text_is('dokumentSigle', 'GOE/AGA', 'dokumentSigle');
$t->text_is(
  'd\.title',
  'Goethe: Autobiographische Schriften II, (1817-1825, 1832)',
  'd.title'
);
$t->text_is('creatDate', '1820-1822', 'creatDate');
Akron2a60c532020-02-13 15:52:18 +010076
# Open the text level header (GOE/AGA/00000/header.xml) inside the archive
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/header.xml');

ok($zip, 'Zip-File is found');

# Collect GOE/AGA/00000/header.xml line by line until the stream is exhausted
$header_xml = '';
until ($zip->eof) {
  $header_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

# Check the text metadata
$t = Test::XML::Loy->new($header_xml);
$t->text_is('textSigle', 'GOE/AGA.00000', 'textSigle');
$t->text_is('analytic > h\.title[type=main]', 'Campagne in Frankreich', 'h.title');
Akron2a60c532020-02-13 15:52:18 +010090
# Open the primary data (GOE/AGA/00000/data.xml) inside the archive
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/data.xml');

ok($zip, 'Zip-File is found');

# Collect GOE/AGA/00000/data.xml line by line until the stream is exhausted
my $raw_data = '';
until ($zip->eof) {
  $raw_data .= $zip->getline;
}
ok($zip->close, 'Closed');

# Check the document id and the first and last words of the raw text
$t = Test::XML::Loy->new($raw_data);
$t->attr_is('raw_text', 'docid', 'GOE_AGA.00000', 'text id');
$t->text_like(
  'raw_text > text',
  qr!^Campagne in Frankreich 1792.*?uns allein begl.*cke\.$!,
  'text content'
);
Akron2a60c532020-02-13 15:52:18 +0100104
# Open the structure annotation (GOE/AGA/00000/struct/structure.xml)
# inside the archive
$zip = IO::Uncompress::Unzip->new(
  $outzip,
  Name => 'GOE/AGA/00000/struct/structure.xml'
);

ok($zip, 'Zip-File is found');

# Collect GOE/AGA/00000/struct/structure.xml line by line until the stream
# is exhausted
my $structure = '';
until ($zip->eof) {
  $structure .= $zip->getline;
}
ok($zip->close, 'Closed');

# Check the text of the element with name=type inside span s3
$t = Test::XML::Loy->new($structure);
$t->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content');
Akron797e8072020-02-13 07:59:40 +0100117
Akroneac374d2020-07-07 09:00:44 +0200118
# Uncompress GOE/AGA/00000/base/tokens_aggressive.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_aggressive.xml');

# As for the other archive members: report a missing member as a failed
# test (including the unzip error) before the handle is read from
ok($zip, 'Zip-File is found') or diag("Unzip failed: $UnzipError");

# Read GOE/AGA/00000/base/tokens_aggressive.xml
my $tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check character offsets of selected token spans and the total span count
$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8);

$t->attr_is('spanList span#t_1', 'from', 9);
$t->attr_is('spanList span#t_1', 'to', 11);

$t->attr_is('spanList span#t_67', 'from', 427);
$t->attr_is('spanList span#t_67', 'to', 430);

$t->attr_is('spanList span#t_214', 'from', 1209);
$t->attr_is('spanList span#t_214', 'to', 1212);

$t->element_count_is('spanList span', 227);
140
141
# Uncompress GOE/AGA/00000/base/tokens_conservative.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_conservative.xml');

# As for the other archive members: report a missing member as a failed
# test (including the unzip error) before the handle is read from
ok($zip, 'Zip-File is found') or diag("Unzip failed: $UnzipError");

# Read GOE/AGA/00000/base/tokens_conservative.xml
$tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check character offsets of selected token spans and the total span count
$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8);

$t->attr_is('spanList span#t_1', 'from', 9);
$t->attr_is('spanList span#t_1', 'to', 11);

$t->attr_is('spanList span#t_67', 'from', 427);
$t->attr_is('spanList span#t_67', 'to', 430);

$t->attr_is('spanList span#t_214', 'from', 1209);
$t->attr_is('spanList span#t_214', 'to', 1212);

$t->element_count_is('spanList span', 227);
162
# Tokenize with external tokenizer
my $cmd = catfile($f, 'cmd', 'tokenizer.pl');

# Regenerate the archive, this time passing the tokenizer command via --tc
stderr_like(
  sub { `cat '$file' | perl '$script' --tc='perl $cmd' > '$outzip'` },
  qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
  'Processing'
);

# Uncompress GOE/AGA/00000/base/tokens.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens.xml');

# As for the other archive members: report a missing member as a failed
# test (including the unzip error) before the handle is read from
ok($zip, 'Zip-File is found') or diag("Unzip failed: $UnzipError");

# Read GOE/AGA/00000/base/tokens.xml
$tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check character offsets of selected token spans and the total span count
$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8);

$t->attr_is('spanList span#t_1', 'from', 9);
$t->attr_is('spanList span#t_1', 'to', 11);

$t->attr_is('spanList span#t_67', 'from', 427);
$t->attr_is('spanList span#t_67', 'to', 430);

$t->attr_is('spanList span#t_214', 'from', 1209);
$t->attr_is('spanList span#t_214', 'to', 1212);

$t->element_count_is('spanList span', 227);
193
Peter Harders71f072b2020-07-15 14:15:01 +0200194
195
# TODO: call $script with the appropriate parameter for internal tokenization (currently '$_GEN_TOK_INT = 1' is hardcoded)
197
198
# ~ test conservative tokenization ~

$file = catfile($f, 'data', 'text_with_blanks.i5.xml');

# Regenerate the archive from the second sample. The dot in the text id is
# escaped so that it only matches a literal '.', as in the other
# 'Processing' checks.
stderr_like(
  sub { `cat '$file' | perl '$script' > '$outzip'` },
  qr!tei2korapxml: .*? text_id=CORP_DOC\.00001!,
  'Processing'
);

ok(-e $outzip, "File $outzip exists");

$zip = IO::Uncompress::Unzip->new($outzip, Name => 'CORP/DOC/00001/base/tokens_conservative.xml');

ok($zip, 'Zip-File is created');

# Read CORP/DOC/00001/base/tokens_conservative.xml
my $cons = '';
$cons .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check character offsets of selected token spans and the total span count
$t = Test::XML::Loy->new($cons);
$t->attr_is('spanList span:nth-child(1)', 'to', 6);

$t->attr_is('spanList span#t_1', 'from', 7);
$t->attr_is('spanList span#t_1', 'to', 9);

$t->attr_is('spanList span#t_3', 'from', 12);
$t->attr_is('spanList span#t_3', 'to', 16);

$t->attr_is('spanList span#t_9', 'from', 36);
$t->attr_is('spanList span#t_9', 'to', 37);

$t->attr_is('spanList span#t_13', 'from', 44);
$t->attr_is('spanList span#t_13', 'to', 45);             # "

$t->attr_is('spanList span#t_14', 'from', 45);           # twenty-two
$t->attr_is('spanList span#t_14', 'to', 55);

$t->attr_is('spanList span#t_15', 'from', 55);           # "
$t->attr_is('spanList span#t_15', 'to', 56);

$t->attr_is('spanList span#t_19', 'from', 66);
$t->attr_is('spanList span#t_19', 'to', 67);

$t->element_count_is('spanList span', 20);
244
245
# ~ test aggressive tokenization ~

$zip = IO::Uncompress::Unzip->new(
  $outzip,
  Name => 'CORP/DOC/00001/base/tokens_aggressive.xml'
);

ok($zip, 'Zip-File is created');

# Collect CORP/DOC/00001/base/tokens_aggressive.xml line by line until the
# stream is exhausted
my $aggressive_xml = '';
until ($zip->eof) {
  $aggressive_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

$t = Test::XML::Loy->new($aggressive_xml);

$t->attr_is('spanList span:nth-child(1)', 'to', 6);

# Expected character offsets per token span: id, from, to
for my $span (
  [t_1  => 7,  9],
  [t_3  => 12, 16],
  [t_9  => 36, 37],
  [t_13 => 44, 45],    # "
  [t_14 => 45, 51],    # twenty
  [t_15 => 51, 52],    # -
  [t_16 => 52, 55],    # two
  [t_17 => 55, 56],    # "
  [t_21 => 66, 67],
) {
  my ($id, $from, $to) = @$span;

  $t->attr_is("spanList span#$id", 'from', $from);
  $t->attr_is("spanList span#$id", 'to', $to);
}

$t->element_count_is('spanList span', 22);
288
289
# Remove the temporary archive written to the tmpnam() path; no other
# statement in this script deletes it
unlink $outzip if -e $outzip;

done_testing;