blob: 753326c884f4ab03d581e60ecf983a2f35decb8f [file] [log] [blame]
Akron797e8072020-02-13 07:59:40 +01001use strict;
2use warnings;
3use File::Basename 'dirname';
4use File::Spec::Functions qw/catfile/;
Peter Harders57c884e2020-07-16 01:28:52 +02005use File::Temp qw/tempfile/;
Akron2a60c532020-02-13 15:52:18 +01006use IO::Uncompress::Unzip qw(unzip $UnzipError);
Akron797e8072020-02-13 07:59:40 +01007
8use Test::More;
9use Test::Output;
10
Akrond89ef822020-02-17 12:42:09 +010011use Test::XML::Loy;
Akron2a60c532020-02-13 15:52:18 +010012
# By default the temporary file created by tempfile() is removed again.
# To keep temporary files for inspection set KORAPXMLTEI_DONTUNLINK, e.g.
#   KORAPXMLTEI_DONTUNLINK=1 prove -lr t/script.t
# (%ENV is a built-in global and needs no "our" declaration.)
my $_UNLINK = $ENV{KORAPXMLTEI_DONTUNLINK} ? 0 : 1;
17
# Directory of this test file; the script under test lives in ../script
my $f = dirname(__FILE__);
my $script = catfile($f, qw/.. script tei2korapxml/);

# Nothing below can work without the script, so check for it first
ok(-f $script, 'Script found');
21
# The script has to print its usage text and its version on request.
# It is started with $^X, i.e. the interpreter that runs this test, so that
# a different "perl" found first in PATH is not picked up by accident.
stdout_like(
  sub { system($^X, $script, '--help') },
  qr!This\s*program\s*is\s*usually\s*called\s*from\s*inside\s*another\s*script\.!,
  'Help'
);

stdout_like(
  sub { system($^X, $script, '--version') },
  qr!tei2korapxml - v\d+?\.\d+?!,
  'Version'
);
33
34
# Load example file
my $file = catfile($f, 'data', 'goe_sample.i5.xml');

# Temporary file that receives the generated zip archive.
# tempfile() already creates the (empty) file; $fh itself is not written to.
my ($fh, $outzip) = tempfile("KorAP-XML-TEI_script_XXXXXXXXXX", SUFFIX => ".tmp", TMPDIR => 1, UNLINK => $_UNLINK);

# Generate zip file (unportable: needs a shell with pipes, redirection and cat!)
# The script is run with $^X, the interpreter that executes this test.
# TODO:
#   Call with aggressive and conservative tokenizations!
stderr_like(
  sub { `cat '$file' | '$^X' '$script' > '$outzip'` },
# approaches for working with $fh (also better use OO interface then)
#  sub { open STDOUT, '>&', $fh; system("cat '$file' | perl '$script'") },
#  sub { open(my $pipe, "cat '$file' | perl '$script'|"); while(<$pipe>){$fh->print($_)}; $fh->close },
#  sub {
#    defined(my $pid = fork) or die "fork: $!";
#    if (!$pid) {
#      open STDOUT, '>&', $fh;
#      exec "cat '$file' | perl '$script'"
#    }
#    waitpid $pid, 0;
#    $fh->close;
#  },
  qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
  'Processing'
);

# The file exists as soon as tempfile() returns, so existence alone proves
# nothing - require that the script actually wrote data into it.
ok(-s $outzip, "File $outzip exists and is not empty");
62
# Open the member GOE/header.xml of the generated zip file
my $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/header.xml');

ok($zip, 'Zip-File is created');

# TODO: check wrong encoding in header-files (compare with input document)!
# Collect GOE/header.xml line by line until the member is exhausted
my $header_xml = '';
until ($zip->eof) {
  $header_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

# Parse the header and compare selected element texts
my $t = Test::XML::Loy->new($header_xml);

$t->text_is('korpusSigle', 'GOE', 'korpusSigle')
  ->text_is('h\.title[type=main]', 'Goethes Werke', 'h.title')
  ->text_is('h\.author', 'Goethe, Johann Wolfgang von', 'h.author')
  ->text_is('pubDate[type=year]', '1982', 'pubDate');
Akron2a60c532020-02-13 15:52:18 +010080
Akron68966082020-02-13 15:52:18 +010081
# Open the member GOE/AGA/header.xml of the generated zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/header.xml');

ok($zip, 'Zip-File is found');

# Collect GOE/AGA/header.xml (the buffer of the previous header is reused)
$header_xml = '';
while (!$zip->eof) {
  $header_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

# Parse the header and compare selected element texts
$t = Test::XML::Loy->new($header_xml);

$t->text_is('dokumentSigle', 'GOE/AGA', 'dokumentSigle')
  ->text_is('d\.title', 'Goethe: Autobiographische Schriften II, (1817-1825, 1832)', 'd.title')
  ->text_is('creatDate', '1820-1822', 'creatDate');
Akron2a60c532020-02-13 15:52:18 +010097
# Open the member GOE/AGA/00000/header.xml of the generated zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/header.xml');

ok($zip, 'Zip-File is found');

# Collect GOE/AGA/00000/header.xml line by line
$header_xml = '';
until ($zip->eof) {
  $header_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

# Parse the header and compare the sigle and the main title
$t = Test::XML::Loy->new($header_xml);
$t->text_is('textSigle', 'GOE/AGA.00000', 'textSigle')
  ->text_is('analytic > h\.title[type=main]', 'Campagne in Frankreich', 'h.title');
Akron2a60c532020-02-13 15:52:18 +0100111
# Open the member GOE/AGA/00000/data.xml of the generated zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/data.xml');

ok($zip, 'Zip-File is found');

# Collect GOE/AGA/00000/data.xml line by line
my $data_xml = '';
$data_xml .= $zip->getline until $zip->eof;
ok($zip->close, 'Closed');

# Check the docid attribute and the beginning and end of the raw text
$t = Test::XML::Loy->new($data_xml);
$t->attr_is('raw_text', 'docid', 'GOE_AGA.00000', 'text id')
  ->text_like('raw_text > text', qr!^Campagne in Frankreich 1792.*?uns allein begl.*cke\.$!, 'text content');
Akron2a60c532020-02-13 15:52:18 +0100125
# Open the member GOE/AGA/00000/struct/structure.xml of the generated zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/struct/structure.xml');

ok($zip, 'Zip-File is found');

# Collect GOE/AGA/00000/struct/structure.xml line by line
my $struct_xml = '';
while (!$zip->eof) {
  $struct_xml .= $zip->getline;
}

ok($zip->close, 'Closed');

# Check the text of the name=type descendant of span s3
$t = Test::XML::Loy->new($struct_xml);
$t->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content');
Akron797e8072020-02-13 07:59:40 +0100139
Akroneac374d2020-07-07 09:00:44 +0200140
# Uncompress GOE/AGA/00000/base/tokens_aggressive.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_aggressive.xml');

# Report a missing member as a failed test (with the unzip error) before
# the handle is used, as the other sections of this file do
ok($zip, 'Zip-File is found') or diag("Unzip failed: $UnzipError");

# Read GOE/AGA/00000/base/tokens_aggressive.xml
my $tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check the offsets of selected token spans and the overall span count
$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8);

$t->attr_is('spanList span#t_1', 'from', 9);
$t->attr_is('spanList span#t_1', 'to', 11);

$t->attr_is('spanList span#t_67', 'from', 427);
$t->attr_is('spanList span#t_67', 'to', 430);

$t->attr_is('spanList span#t_214', 'from', 1209);
$t->attr_is('spanList span#t_214', 'to', 1212);

$t->element_count_is('spanList span', 227);
162
163
# Uncompress GOE/AGA/00000/base/tokens_conservative.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_conservative.xml');

# Report a missing member as a failed test (with the unzip error) before
# the handle is used, as the other sections of this file do
ok($zip, 'Zip-File is found') or diag("Unzip failed: $UnzipError");

# Read GOE/AGA/00000/base/tokens_conservative.xml
$tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check the offsets of selected token spans and the overall span count
$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8);

$t->attr_is('spanList span#t_1', 'from', 9);
$t->attr_is('spanList span#t_1', 'to', 11);

$t->attr_is('spanList span#t_67', 'from', 427);
$t->attr_is('spanList span#t_67', 'to', 430);

$t->attr_is('spanList span#t_214', 'from', 1209);
$t->attr_is('spanList span#t_214', 'to', 1212);

$t->element_count_is('spanList span', 227);
184
# Tokenize with external tokenizer
my $cmd = catfile($f, 'cmd', 'tokenizer.pl');

# Regenerate the zip file, passing the tokenizer command via --tc.
# The script itself is run with $^X, the interpreter that executes this
# test; the --tc command string is handed over unchanged.
stderr_like(
  sub { `cat '$file' | '$^X' '$script' --tc='perl $cmd' > '$outzip'` },
  qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
  'Processing'
);
193
# Uncompress GOE/AGA/00000/base/tokens.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens.xml');

# Report a missing member as a failed test (with the unzip error) before
# the handle is used, as the other sections of this file do
ok($zip, 'Zip-File is found') or diag("Unzip failed: $UnzipError");

# Read GOE/AGA/00000/base/tokens.xml
$tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check the offsets of selected token spans and the overall span count
$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8);

$t->attr_is('spanList span#t_1', 'from', 9);
$t->attr_is('spanList span#t_1', 'to', 11);

$t->attr_is('spanList span#t_67', 'from', 427);
$t->attr_is('spanList span#t_67', 'to', 430);

$t->attr_is('spanList span#t_214', 'from', 1209);
$t->attr_is('spanList span#t_214', 'to', 1212);

$t->element_count_is('spanList span', 227);

done_testing;