blob: da4563338a1979f16020947fd83ff94e1d22f4c8 [file] [log] [blame]
Akron797e8072020-02-13 07:59:40 +01001use strict;
2use warnings;
3use File::Basename 'dirname';
4use File::Spec::Functions qw/catfile/;
Akron2a60c532020-02-13 15:52:18 +01005use File::Temp ':POSIX';
6use IO::Uncompress::Unzip qw(unzip $UnzipError);
Akron797e8072020-02-13 07:59:40 +01007
8use Test::More;
9use Test::Output;
10
Akrond89ef822020-02-17 12:42:09 +010011use Test::XML::Loy;
Akron2a60c532020-02-13 15:52:18 +010012
# Build the path to the converter script relative to this test file.
my $f      = dirname(__FILE__);
my $script = catfile($f, qw/.. script tei2korapxml/);

# All following tests execute the script, so make sure it is there.
ok(-f $script, 'Script found');

# --help must print the usage notice on STDOUT.
# The script is started with $^X (the interpreter running this test) rather
# than whichever 'perl' comes first in PATH, so that the same perl and the
# same installed modules are exercised.
stdout_like(
  sub { system($^X, $script, '--help') },
  qr!This\s*program\s*is\s*usually\s*called\s*from\s*inside\s*another\s*script\.!,
  'Help'
);

# --version must print the program name followed by a version number.
# The script is started with $^X (the interpreter running this test) rather
# than whichever 'perl' comes first in PATH, so that the same perl and the
# same installed modules are exercised.
stdout_like(
  sub { system($^X, $script, '--version') },
  qr!tei2korapxml - v\d+?\.\d+?!,
  'Version'
);


# Load example file
my $file = catfile($f, 'data', 'goe_sample.i5.xml');

# tmpnam() only hands out a file name; nothing removes the file later.
# Delete the generated archive when this test script exits.
my $outzip = tmpnam();
END { unlink $outzip if defined $outzip && -e $outzip }

# Generate zip file: feed the sample to the script on STDIN and redirect its
# STDOUT into the temporary file. This still requires a POSIX-like shell for
# the quoting and the redirections (unportable!), but no external 'cat'.
# The script is run with $^X, the perl that executes this test.
stderr_like(
  sub {
    my $perl = $^X;
    `'$perl' '$script' < '$file' > '$outzip'`
  },
  qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
  'Processing'
);

# Open the member GOE/header.xml of the generated zip file
my $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/header.xml');

ok($zip, 'Zip-File is created');

# Collect the member line by line until the stream reports EOF
my $header_xml = '';
until ($zip->eof) {
  $header_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

# Parse the header and check the expected element texts
my $t = Test::XML::Loy->new($header_xml);

$t->text_is('korpusSigle', 'GOE', 'korpusSigle');
$t->text_is('h\.title[type=main]', 'Goethes Werke', 'h.title');
$t->text_is('h\.author', 'Goethe, Johann Wolfgang von', 'h.author');
$t->text_is('pubDate[type=year]', '1982', 'pubDate');


# Open the member GOE/AGA/header.xml of the generated zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/header.xml');

ok($zip, 'Zip-File is found');

# Collect the member line by line until the stream reports EOF
$header_xml = '';
until ($zip->eof) {
  $header_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

# Parse the header and check the expected element texts
$t = Test::XML::Loy->new($header_xml);

$t->text_is('dokumentSigle', 'GOE/AGA', 'dokumentSigle');
$t->text_is('d\.title', 'Goethe: Autobiographische Schriften II, (1817-1825, 1832)', 'd.title');
$t->text_is('creatDate', '1820-1822', 'creatDate');

# Open the member GOE/AGA/00000/header.xml of the generated zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/header.xml');

ok($zip, 'Zip-File is found');

# Collect the member line by line until the stream reports EOF
$header_xml = '';
until ($zip->eof) {
  $header_xml .= $zip->getline;
}
ok($zip->close, 'Closed');

# Parse the header and check the expected element texts
$t = Test::XML::Loy->new($header_xml);
$t->text_is('textSigle', 'GOE/AGA.00000', 'textSigle');
$t->text_is('analytic > h\.title[type=main]', 'Campagne in Frankreich', 'h.title');

# Open the member GOE/AGA/00000/data.xml of the generated zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/data.xml');

ok($zip, 'Zip-File is found');

# Collect the member line by line until the stream reports EOF
my $data_content = '';
until ($zip->eof) {
  $data_content .= $zip->getline;
}
ok($zip->close, 'Closed');

# Check the docid attribute and the start and end of the text content
$t = Test::XML::Loy->new($data_content);
$t->attr_is('raw_text', 'docid', 'GOE_AGA.00000', 'text id');
$t->text_like('raw_text > text', qr!^Campagne in Frankreich 1792.*?uns allein begl.*cke\.$!, 'text content');

# Open the member GOE/AGA/00000/struct/structure.xml of the generated zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/struct/structure.xml');

ok($zip, 'Zip-File is found');

# Collect the member line by line until the stream reports EOF
my $structure_content = '';
until ($zip->eof) {
  $structure_content .= $zip->getline;
}
ok($zip->close, 'Closed');

# Check the text of the 'type' entry below the span with id s3
$t = Test::XML::Loy->new($structure_content);
$t->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content');


# Uncompress GOE/AGA/00000/base/tokens_aggressive.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_aggressive.xml');

# Report a missing member as a test failure, as done for every other member
ok($zip, 'Zip-File is found');

# Read GOE/AGA/00000/base/tokens_aggressive.xml
my $tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check the offsets of selected token spans and the total number of spans
$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8);

$t->attr_is('spanList span#t_1', 'from', 9);
$t->attr_is('spanList span#t_1', 'to', 11);

$t->attr_is('spanList span#t_67', 'from', 427);
$t->attr_is('spanList span#t_67', 'to', 430);

$t->attr_is('spanList span#t_214', 'from', 1209);
$t->attr_is('spanList span#t_214', 'to', 1212);

$t->element_count_is('spanList span', 227);


# Uncompress GOE/AGA/00000/base/tokens_conservative.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_conservative.xml');

# Report a missing member as a test failure, as done for every other member
ok($zip, 'Zip-File is found');

# Read GOE/AGA/00000/base/tokens_conservative.xml
$tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');

# Check the offsets of selected token spans and the total number of spans
$t = Test::XML::Loy->new($tokens_xml);
$t->attr_is('spanList span:nth-child(1)', 'to', 8);

$t->attr_is('spanList span#t_1', 'from', 9);
$t->attr_is('spanList span#t_1', 'to', 11);

$t->attr_is('spanList span#t_67', 'from', 427);
$t->attr_is('spanList span#t_67', 'to', 430);

$t->attr_is('spanList span#t_214', 'from', 1209);
$t->attr_is('spanList span#t_214', 'to', 1212);

$t->element_count_is('spanList span', 227);

# No fixed plan: the number of tests is determined by what was run above
done_testing;