blob: 3ac91d17c7c066ab3ac5fa810a8e8210b75a5734 [file] [log] [blame]
Akron797e8072020-02-13 07:59:40 +01001use strict;
2use warnings;
3use File::Basename 'dirname';
4use File::Spec::Functions qw/catfile/;
Akron2a60c532020-02-13 15:52:18 +01005use File::Temp ':POSIX';
6use IO::Uncompress::Unzip qw(unzip $UnzipError);
Akron797e8072020-02-13 07:59:40 +01007
8use Test::More;
9use Test::Output;
10
Akrond89ef822020-02-17 12:42:09 +010011use Test::XML::Loy;
Akron2a60c532020-02-13 15:52:18 +010012
# Locate the converter script relative to the directory of this test file.
my $f = dirname(__FILE__);
my $script = catfile($f, '..', 'script', 'tei2korapxml');
ok(-f $script, 'Script found');

# The script is started with $^X (the interpreter that is executing this
# test) rather than a bare "perl", so the child process does not silently
# pick up a different perl that happens to come first in PATH.

# --help has to print the usage text to STDOUT.
stdout_like(
  sub { system($^X, $script, '--help') },
  qr!This\s*program\s*is\s*usually\s*called\s*from\s*inside\s*another\s*script\.!,
  'Help'
);

# --version has to print the program name followed by a version number.
stdout_like(
  sub { system($^X, $script, '--version') },
  qr!tei2korapxml - v\d+?\.\d+?!,
  'Version'
);
28
29
# Load example file
my $file = catfile($f, 'data', 'goe_sample.i5.xml');
my $outzip = tmpnam();

# Generate zip file: feed the sample to the script on STDIN and write the
# zip stream from its STDOUT to $outzip, while the progress messages on
# STDERR are captured and checked.
# This still needs a shell for the redirections (unportable!), but it no
# longer depends on an external "cat" binary, and it runs the script with
# the interpreter executing this test ($^X) instead of the first "perl"
# found in PATH.
stderr_like(
  sub {
    my $cmd = sprintf q{'%s' '%s' < '%s' > '%s'}, $^X, $script, $file, $outzip;
    system($cmd);
  },
  qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
  'Processing'
);

ok(-e $outzip, "File $outzip exists");
42
# Open the member GOE/header.xml inside the generated zip file
my $zip = IO::Uncompress::Unzip->new(
  $outzip,
  Name => 'GOE/header.xml'
);

ok($zip, 'Zip-File is created');

# Collect the whole member, one line at a time, until the stream ends
my $header_xml = '';
until ($zip->eof) {
  $header_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

# Parse the XML and check selected elements via CSS selectors
my $t = Test::XML::Loy->new($header_xml);

$t
  ->text_is('korpusSigle', 'GOE', 'korpusSigle')
  ->text_is('h\.title[type=main]', 'Goethes Werke', 'h.title')
  ->text_is('h\.author', 'Goethe, Johann Wolfgang von', 'h.author')
  ->text_is('pubDate[type=year]', '1982', 'pubDate');
Akron2a60c532020-02-13 15:52:18 +010059
Akron68966082020-02-13 15:52:18 +010060
# Open the member GOE/AGA/header.xml inside the generated zip file
$zip = IO::Uncompress::Unzip->new(
  $outzip,
  Name => 'GOE/AGA/header.xml'
);

ok($zip, 'Zip-File is found');

# Collect the whole member, one line at a time, until the stream ends
$header_xml = '';
while (!$zip->eof) {
  $header_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

# Parse the XML and check selected elements via CSS selectors
$t = Test::XML::Loy->new($header_xml);

$t
  ->text_is('dokumentSigle', 'GOE/AGA', 'dokumentSigle')
  ->text_is('d\.title', 'Goethe: Autobiographische Schriften II, (1817-1825, 1832)', 'd.title')
  ->text_is('creatDate', '1820-1822', 'creatDate');
Akron2a60c532020-02-13 15:52:18 +010076
# Open the member GOE/AGA/00000/header.xml inside the generated zip file
$zip = IO::Uncompress::Unzip->new(
  $outzip,
  Name => 'GOE/AGA/00000/header.xml'
);

ok($zip, 'Zip-File is found');

# Collect the whole member, one line at a time, until the stream ends
$header_xml = '';
until ($zip->eof) {
  $header_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

# Parse the XML and check the text sigle and the main title
$t = Test::XML::Loy->new($header_xml);
$t
  ->text_is('textSigle', 'GOE/AGA.00000', 'textSigle')
  ->text_is('analytic > h\.title[type=main]', 'Campagne in Frankreich', 'h.title');
Akron2a60c532020-02-13 15:52:18 +010090
# Open the member GOE/AGA/00000/data.xml inside the generated zip file
$zip = IO::Uncompress::Unzip->new(
  $outzip,
  Name => 'GOE/AGA/00000/data.xml'
);

ok($zip, 'Zip-File is found');

# Collect the whole member, one line at a time, until the stream ends
my $data_xml = '';
while (!$zip->eof) {
  $data_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

# Parse the XML and check the document id as well as the start and the
# end of the raw text
$t = Test::XML::Loy->new($data_xml);
$t
  ->attr_is('raw_text', 'docid', 'GOE_AGA.00000', 'text id')
  ->text_like('raw_text > text', qr!^Campagne in Frankreich 1792.*?uns allein begl.*cke\.$!, 'text content');
Akron2a60c532020-02-13 15:52:18 +0100104
# Open the member GOE/AGA/00000/struct/structure.xml inside the generated
# zip file
$zip = IO::Uncompress::Unzip->new(
  $outzip,
  Name => 'GOE/AGA/00000/struct/structure.xml'
);

ok($zip, 'Zip-File is found');

# Collect the whole member, one line at a time, until the stream ends
my $struct_xml = '';
until ($zip->eof) {
  $struct_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

# Parse the XML and check the text of the "type" entry below span s3
$t = Test::XML::Loy->new($struct_xml);
$t->text_is(
  'span[id=s3] *[name=type]',
  'Autobiographie',
  'text content'
);
Akron797e8072020-02-13 07:59:40 +0100117
Akroneac374d2020-07-07 09:00:44 +0200118
# Uncompress GOE/AGA/00000/base/tokens_aggressive.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_aggressive.xml');

# Check the handle like the other zip members do: without this, a missing
# member would only show up as a "method called on undefined value" death
# below instead of as a failed assertion with the unzip error.
ok($zip, 'Zip-File is found') or diag "Unzip failed: $UnzipError";

# Read GOE/AGA/00000/base/tokens_aggressive.xml
my $tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check the character offsets of selected token spans
$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8);

$t->attr_is('spanList span#t_1', 'from', 9);
$t->attr_is('spanList span#t_1', 'to', 11);

$t->attr_is('spanList span#t_67', 'from', 427);
$t->attr_is('spanList span#t_67', 'to', 430);

$t->attr_is('spanList span#t_214', 'from', 1209);
$t->attr_is('spanList span#t_214', 'to', 1212);

# Check the total number of token spans
$t->element_count_is('spanList span', 227);
140
141
# Uncompress GOE/AGA/00000/base/tokens_conservative.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_conservative.xml');

# Check the handle like the other zip members do: without this, a missing
# member would only show up as a "method called on undefined value" death
# below instead of as a failed assertion with the unzip error.
ok($zip, 'Zip-File is found') or diag "Unzip failed: $UnzipError";

# Read GOE/AGA/00000/base/tokens_conservative.xml
$tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check the character offsets of selected token spans
$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8);

$t->attr_is('spanList span#t_1', 'from', 9);
$t->attr_is('spanList span#t_1', 'to', 11);

$t->attr_is('spanList span#t_67', 'from', 427);
$t->attr_is('spanList span#t_67', 'to', 430);

$t->attr_is('spanList span#t_214', 'from', 1209);
$t->attr_is('spanList span#t_214', 'to', 1212);

# Check the total number of token spans
$t->element_count_is('spanList span', 227);

done_testing;