# blob: 501078937a7d3f9b8f86ecf428d0ef3fe9de7016 [file] [log] [blame]
Akron797e8072020-02-13 07:59:40 +01001use strict;
2use warnings;
3use File::Basename 'dirname';
4use File::Spec::Functions qw/catfile/;
Akron2a60c532020-02-13 15:52:18 +01005use File::Temp ':POSIX';
6use IO::Uncompress::Unzip qw(unzip $UnzipError);
Akron797e8072020-02-13 07:59:40 +01007
8use Test::More;
9use Test::Output;
10
Akrond89ef822020-02-17 12:42:09 +010011use Test::XML::Loy;
Akron2a60c532020-02-13 15:52:18 +010012
# Locate the tei2korapxml script relative to the directory of this test file.
my $f = dirname(__FILE__);
my $script = catfile($f, '..', 'script', 'tei2korapxml');
ok(-f $script, 'Script found');

# The script is started with $^X (the interpreter running this test) rather
# than whatever "perl" comes first in PATH, so the child process uses the
# same Perl installation as the test itself.

# --help has to print the usage text on STDOUT.
stdout_like(
  sub { system($^X, $script, '--help') },
  qr!This\s*program\s*is\s*usually\s*called\s*from\s*inside\s*another\s*script\.!,
  'Help'
);

# --version has to print the program name followed by a version number.
stdout_like(
  sub { system($^X, $script, '--version') },
  qr!tei2korapxml - v\d+?\.\d+?!,
  'Version'
);
28
29
# Load example file
my $file = catfile($f, 'data', 'goe_sample.i5.xml');

# tmpnam() only hands back a file name; the file itself is created by the
# shell redirection further down.
my $outzip = tmpnam();

# Remove the generated archive when the test script exits, so that repeated
# runs do not pile up stray files in the temp directory.
END { unlink $outzip if defined $outzip && -e $outzip }

# Generate zip file (unportable: needs a POSIX shell and "cat").
# The script is run with $^X so it uses the same interpreter as this test
# instead of whatever "perl" comes first in PATH.
# TODO:
#   Call with aggressive and conservative tokenizations!
stderr_like(
  sub { `cat '$file' | '$^X' '$script' > '$outzip'` },
  qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
  'Processing'
);

ok(-e $outzip, "File $outzip exists");
44
# Open the member GOE/header.xml inside the generated archive
my $zip = IO::Uncompress::Unzip->new(
  $outzip,
  Name => 'GOE/header.xml'
);

ok($zip, 'Zip-File is created');

# Collect the member line by line until the stream reports end of file
my $header_xml = '';
until ($zip->eof) {
  $header_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

my $t = Test::XML::Loy->new($header_xml);

# Each entry: selector, expected text, test description
my @corpus_header_checks = (
  ['korpusSigle',          'GOE',                         'korpusSigle'],
  ['h\.title[type=main]',  'Goethes Werke',               'h.title'],
  ['h\.author',            'Goethe, Johann Wolfgang von', 'h.author'],
  ['pubDate[type=year]',   '1982',                        'pubDate'],
);
$t->text_is(@$_) for @corpus_header_checks;
Akron2a60c532020-02-13 15:52:18 +010061
Akron68966082020-02-13 15:52:18 +010062
# Open the member GOE/AGA/header.xml inside the generated archive
$zip = IO::Uncompress::Unzip->new(
  $outzip,
  Name => 'GOE/AGA/header.xml'
);

ok($zip, 'Zip-File is found');

# Collect the member line by line until the stream reports end of file
$header_xml = '';
until ($zip->eof) {
  $header_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

$t = Test::XML::Loy->new($header_xml);

# Each entry: selector, expected text, test description
my @document_header_checks = (
  ['dokumentSigle', 'GOE/AGA', 'dokumentSigle'],
  [
    'd\.title',
    'Goethe: Autobiographische Schriften II, (1817-1825, 1832)',
    'd.title'
  ],
  ['creatDate', '1820-1822', 'creatDate'],
);
$t->text_is(@$_) for @document_header_checks;
Akron2a60c532020-02-13 15:52:18 +010078
# Open the member GOE/AGA/00000/header.xml inside the generated archive
$zip = IO::Uncompress::Unzip->new(
  $outzip,
  Name => 'GOE/AGA/00000/header.xml'
);

ok($zip, 'Zip-File is found');

# Collect the member line by line until the stream reports end of file
$header_xml = '';
until ($zip->eof) {
  $header_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

$t = Test::XML::Loy->new($header_xml);

# Each entry: selector, expected text, test description
my @text_header_checks = (
  ['textSigle', 'GOE/AGA.00000', 'textSigle'],
  ['analytic > h\.title[type=main]', 'Campagne in Frankreich', 'h.title'],
);
$t->text_is(@$_) for @text_header_checks;
Akron2a60c532020-02-13 15:52:18 +010092
# Open the member GOE/AGA/00000/data.xml inside the generated archive
$zip = IO::Uncompress::Unzip->new(
  $outzip,
  Name => 'GOE/AGA/00000/data.xml'
);

ok($zip, 'Zip-File is found');

# Collect the member line by line until the stream reports end of file
my $data_xml = '';
until ($zip->eof) {
  $data_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

$t = Test::XML::Loy->new($data_xml);

# The docid attribute on <raw_text> is compared literally
$t->attr_is('raw_text', 'docid', 'GOE_AGA.00000', 'text id');

# The text content is only matched at its beginning and end
$t->text_like(
  'raw_text > text',
  qr!^Campagne in Frankreich 1792.*?uns allein begl.*cke\.$!,
  'text content'
);
Akron2a60c532020-02-13 15:52:18 +0100106
# Open the member GOE/AGA/00000/struct/structure.xml inside the generated archive
$zip = IO::Uncompress::Unzip->new(
  $outzip,
  Name => 'GOE/AGA/00000/struct/structure.xml'
);

ok($zip, 'Zip-File is found');

# Collect the member line by line until the stream reports end of file
my $struct_xml = '';
until ($zip->eof) {
  $struct_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

$t = Test::XML::Loy->new($struct_xml);

# Below span s3, the element with name="type" must contain "Autobiographie"
$t->text_is(
  'span[id=s3] *[name=type]',
  'Autobiographie',
  'text content'
);
Akron797e8072020-02-13 07:59:40 +0100119
Akroneac374d2020-07-07 09:00:44 +0200120
# Uncompress GOE/AGA/00000/base/tokens_aggressive.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_aggressive.xml');

# Report a missing archive member as a failed test (as the sections above
# do) before the handle is used for reading.
ok($zip, 'Zip-File is found');

# Read GOE/AGA/00000/base/tokens_aggressive.xml
my $tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Spot-check the offsets of a few spans and the overall number of spans
$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8, 'aggressive: first span to');

$t->attr_is('spanList span#t_1', 'from', 9, 'aggressive: t_1 from');
$t->attr_is('spanList span#t_1', 'to', 11, 'aggressive: t_1 to');

$t->attr_is('spanList span#t_67', 'from', 427, 'aggressive: t_67 from');
$t->attr_is('spanList span#t_67', 'to', 430, 'aggressive: t_67 to');

$t->attr_is('spanList span#t_214', 'from', 1209, 'aggressive: t_214 from');
$t->attr_is('spanList span#t_214', 'to', 1212, 'aggressive: t_214 to');

$t->element_count_is('spanList span', 227);
142
143
# Uncompress GOE/AGA/00000/base/tokens_conservative.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_conservative.xml');

# Report a missing archive member as a failed test (as the header, data
# and structure sections do) before the handle is used for reading.
ok($zip, 'Zip-File is found');

# Read GOE/AGA/00000/base/tokens_conservative.xml
$tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Spot-check the offsets of a few spans and the overall number of spans
$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8, 'conservative: first span to');

$t->attr_is('spanList span#t_1', 'from', 9, 'conservative: t_1 from');
$t->attr_is('spanList span#t_1', 'to', 11, 'conservative: t_1 to');

$t->attr_is('spanList span#t_67', 'from', 427, 'conservative: t_67 from');
$t->attr_is('spanList span#t_67', 'to', 430, 'conservative: t_67 to');

$t->attr_is('spanList span#t_214', 'from', 1209, 'conservative: t_214 from');
$t->attr_is('spanList span#t_214', 'to', 1212, 'conservative: t_214 to');

$t->element_count_is('spanList span', 227);
164
# Tokenize with external tokenizer
my $cmd = catfile($f, 'cmd', 'tokenizer.pl');

# Regenerate the archive, this time passing a tokenizer command via --tc.
# The script itself is run with $^X so it uses the same interpreter as this
# test; the --tc value is handed to the script unchanged.
stderr_like(
  sub { `cat '$file' | '$^X' '$script' --tc='perl $cmd' > '$outzip'` },
  qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
  'Processing'
);

# Uncompress GOE/AGA/00000/base/tokens.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens.xml');

# Report a missing archive member as a failed test (as the header, data
# and structure sections do) before the handle is used for reading.
ok($zip, 'Zip-File is found');

# Read GOE/AGA/00000/base/tokens.xml
$tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Spot-check the offsets of a few spans and the overall number of spans
$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8, 'external: first span to');

$t->attr_is('spanList span#t_1', 'from', 9, 'external: t_1 from');
$t->attr_is('spanList span#t_1', 'to', 11, 'external: t_1 to');

$t->attr_is('spanList span#t_67', 'from', 427, 'external: t_67 from');
$t->attr_is('spanList span#t_67', 'to', 430, 'external: t_67 to');

$t->attr_is('spanList span#t_214', 'from', 1209, 'external: t_214 from');
$t->attr_is('spanList span#t_214', 'to', 1212, 'external: t_214 to');

$t->element_count_is('spanList span', 227);

done_testing;