| Akron | 797e807 | 2020-02-13 07:59:40 +0100 | [diff] [blame] | 1 | use strict; | 
|  | 2 | use warnings; | 
|  | 3 | use File::Basename 'dirname'; | 
|  | 4 | use File::Spec::Functions qw/catfile/; | 
| Peter Harders | 57c884e | 2020-07-16 01:28:52 +0200 | [diff] [blame] | 5 | use File::Temp qw/tempfile/; | 
| Akron | 2a60c53 | 2020-02-13 15:52:18 +0100 | [diff] [blame] | 6 | use IO::Uncompress::Unzip qw(unzip $UnzipError); | 
| Akron | 797e807 | 2020-02-13 07:59:40 +0100 | [diff] [blame] | 7 |  | 
|  | 8 | use Test::More; | 
|  | 9 | use Test::Output; | 
|  | 10 |  | 
| Akron | d89ef82 | 2020-02-17 12:42:09 +0100 | [diff] [blame] | 11 | use Test::XML::Loy; | 
| Akron | 2a60c53 | 2020-02-13 15:52:18 +0100 | [diff] [blame] | 12 |  | 
| Peter Harders | 57c884e | 2020-07-16 01:28:52 +0200 | [diff] [blame] | 13 | our %ENV; | 
|  | 14 | # default: remove temp. file created by func. tempfile | 
|  | 15 | #  to keep temp. files use e.g. 'KORAPXMLTEI_DONTUNLINK=1 prove -lr t/script.t' | 
|  | 16 | my $_UNLINK = $ENV{KORAPXMLTEI_DONTUNLINK}?0:1; | 
|  | 17 |  | 
| Akron | 797e807 | 2020-02-13 07:59:40 +0100 | [diff] [blame] | 18 | my $f = dirname(__FILE__); | 
|  | 19 | my $script = catfile($f, '..', 'script', 'tei2korapxml'); | 
|  | 20 | ok(-f $script, 'Script found'); | 
|  | 21 |  | 
| Akron | d949e18 | 2020-02-14 12:23:57 +0100 | [diff] [blame] | 22 | stdout_like( | 
| Akron | 797e807 | 2020-02-13 07:59:40 +0100 | [diff] [blame] | 23 | sub { system('perl', $script, '--help') }, | 
| Akron | d949e18 | 2020-02-14 12:23:57 +0100 | [diff] [blame] | 24 | qr!This\s*program\s*is\s*usually\s*called\s*from\s*inside\s*another\s*script\.!, | 
| Akron | 797e807 | 2020-02-13 07:59:40 +0100 | [diff] [blame] | 25 | 'Help' | 
|  | 26 | ); | 
|  | 27 |  | 
| Akron | d949e18 | 2020-02-14 12:23:57 +0100 | [diff] [blame] | 28 | stdout_like( | 
|  | 29 | sub { system('perl', $script, '--version') }, | 
|  | 30 | qr!tei2korapxml - v\d+?\.\d+?!, | 
|  | 31 | 'Version' | 
|  | 32 | ); | 
|  | 33 |  | 
|  | 34 |  | 
| Akron | 2a60c53 | 2020-02-13 15:52:18 +0100 | [diff] [blame] | 35 | # Load example file | 
|  | 36 | my $file = catfile($f, 'data', 'goe_sample.i5.xml'); | 
| Peter Harders | 57c884e | 2020-07-16 01:28:52 +0200 | [diff] [blame] | 37 |  | 
|  | 38 | my ($fh, $outzip) = tempfile("KorAP-XML-TEI_script_XXXXXXXXXX", SUFFIX => ".tmp", TMPDIR => 1, UNLINK => $_UNLINK); | 
| Akron | 2a60c53 | 2020-02-13 15:52:18 +0100 | [diff] [blame] | 39 |  | 
|  | 40 | # Generate zip file (unportable!) | 
|  | 41 | stderr_like( | 
|  | 42 | sub { `cat '$file' | perl '$script' > '$outzip'` }, | 
| Peter Harders | 57c884e | 2020-07-16 01:28:52 +0200 | [diff] [blame] | 43 | # approaches for working with $fh (also better use OO interface then) | 
|  | 44 | #  sub { open STDOUT, '>&', $fh; system("cat '$file' | perl '$script'") }, | 
|  | 45 | #  sub { open(my $pipe, "cat '$file' | perl '$script'|"); while(<$pipe>){$fh->print($_)}; $fh->close }, | 
|  | 46 | #  sub { | 
|  | 47 | #    defined(my $pid = fork) or die "fork: $!"; | 
|  | 48 | #    if (!$pid) { | 
|  | 49 | #      open STDOUT, '>&', $fh; | 
|  | 50 | #      exec "cat '$file' | perl '$script'" | 
|  | 51 | #    } | 
|  | 52 | #    waitpid $pid, 0; | 
|  | 53 | #    $fh->close; | 
|  | 54 | #  }, | 
| Akron | 2a60c53 | 2020-02-13 15:52:18 +0100 | [diff] [blame] | 55 | qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!, | 
|  | 56 | 'Processing' | 
|  | 57 | ); | 
|  | 58 |  | 
| Akron | 8571751 | 2020-07-08 11:19:19 +0200 | [diff] [blame] | 59 | ok(-e $outzip, "File $outzip exists"); | 
|  | 60 |  | 
| Akron | 2a60c53 | 2020-02-13 15:52:18 +0100 | [diff] [blame] | 61 | # Uncompress GOE/header.xml from zip file | 
|  | 62 | my $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/header.xml'); | 
|  | 63 |  | 
|  | 64 | ok($zip, 'Zip-File is created'); | 
|  | 65 |  | 
| Peter Harders | 57c884e | 2020-07-16 01:28:52 +0200 | [diff] [blame] | 66 | # TODO: check wrong encoding in header-files (compare with input document)! | 
| Akron | 2a60c53 | 2020-02-13 15:52:18 +0100 | [diff] [blame] | 67 | # Read GOE/header.xml | 
|  | 68 | my $header_xml = ''; | 
|  | 69 | $header_xml .= $zip->getline while !$zip->eof; | 
|  | 70 | ok($zip->close, 'Closed'); | 
|  | 71 |  | 
| Akron | d89ef82 | 2020-02-17 12:42:09 +0100 | [diff] [blame] | 72 | my $t = Test::XML::Loy->new($header_xml); | 
| Akron | 2a60c53 | 2020-02-13 15:52:18 +0100 | [diff] [blame] | 73 |  | 
| Akron | d89ef82 | 2020-02-17 12:42:09 +0100 | [diff] [blame] | 74 | $t->text_is('korpusSigle', 'GOE', 'korpusSigle') | 
|  | 75 | ->text_is('h\.title[type=main]', 'Goethes Werke', 'h.title') | 
|  | 76 | ->text_is('h\.author', 'Goethe, Johann Wolfgang von', 'h.author') | 
|  | 77 | ->text_is('pubDate[type=year]', '1982', 'pubDate'); | 
| Akron | 2a60c53 | 2020-02-13 15:52:18 +0100 | [diff] [blame] | 78 |  | 
| Akron | 6896608 | 2020-02-13 15:52:18 +0100 | [diff] [blame] | 79 |  | 
| Akron | 2a60c53 | 2020-02-13 15:52:18 +0100 | [diff] [blame] | 80 | # Uncompress GOE/AGA/header.xml from zip file | 
|  | 81 | $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/header.xml'); | 
|  | 82 |  | 
|  | 83 | ok($zip, 'Zip-File is found'); | 
|  | 84 |  | 
|  | 85 | # Read GOE/AGA/header.xml | 
|  | 86 | $header_xml = ''; | 
|  | 87 | $header_xml .= $zip->getline while !$zip->eof; | 
|  | 88 | ok($zip->close, 'Closed'); | 
|  | 89 |  | 
| Akron | d89ef82 | 2020-02-17 12:42:09 +0100 | [diff] [blame] | 90 | $t = Test::XML::Loy->new($header_xml); | 
| Akron | 2a60c53 | 2020-02-13 15:52:18 +0100 | [diff] [blame] | 91 |  | 
| Akron | d89ef82 | 2020-02-17 12:42:09 +0100 | [diff] [blame] | 92 | $t->text_is('dokumentSigle', 'GOE/AGA', 'dokumentSigle') | 
|  | 93 | ->text_is('d\.title', 'Goethe: Autobiographische Schriften II, (1817-1825, 1832)', 'd.title') | 
|  | 94 | ->text_is('creatDate', '1820-1822', 'creatDate'); | 
| Akron | 2a60c53 | 2020-02-13 15:52:18 +0100 | [diff] [blame] | 95 |  | 
|  | 96 | # Uncompress GOE/AGA/00000/header.xml from zip file | 
|  | 97 | $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/header.xml'); | 
|  | 98 |  | 
|  | 99 | ok($zip, 'Zip-File is found'); | 
|  | 100 |  | 
|  | 101 | # Read GOE/AGA/00000/header.xml | 
|  | 102 | $header_xml = ''; | 
|  | 103 | $header_xml .= $zip->getline while !$zip->eof; | 
|  | 104 | ok($zip->close, 'Closed'); | 
|  | 105 |  | 
| Akron | d89ef82 | 2020-02-17 12:42:09 +0100 | [diff] [blame] | 106 | $t = Test::XML::Loy->new($header_xml); | 
|  | 107 | $t->text_is('textSigle', 'GOE/AGA.00000', 'textSigle') | 
|  | 108 | ->text_is('analytic > h\.title[type=main]', 'Campagne in Frankreich', 'h.title'); | 
| Akron | 2a60c53 | 2020-02-13 15:52:18 +0100 | [diff] [blame] | 109 |  | 
|  | 110 | # Uncompress GOE/AGA/00000/data.xml from zip file | 
|  | 111 | $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/data.xml'); | 
|  | 112 |  | 
|  | 113 | ok($zip, 'Zip-File is found'); | 
|  | 114 |  | 
|  | 115 | # Read GOE/AGA/00000/data.xml | 
|  | 116 | my $data_xml = ''; | 
|  | 117 | $data_xml .= $zip->getline while !$zip->eof; | 
|  | 118 | ok($zip->close, 'Closed'); | 
|  | 119 |  | 
| Akron | d89ef82 | 2020-02-17 12:42:09 +0100 | [diff] [blame] | 120 | $t = Test::XML::Loy->new($data_xml); | 
|  | 121 | $t->attr_is('raw_text', 'docid', 'GOE_AGA.00000', 'text id') | 
|  | 122 | ->text_like('raw_text > text', qr!^Campagne in Frankreich 1792.*?uns allein begl.*cke\.$!, 'text content'); | 
| Akron | 2a60c53 | 2020-02-13 15:52:18 +0100 | [diff] [blame] | 123 |  | 
|  | 124 | # Uncompress GOE/AGA/00000/struct/structure.xml from zip file | 
|  | 125 | $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/struct/structure.xml'); | 
|  | 126 |  | 
|  | 127 | ok($zip, 'Zip-File is found'); | 
|  | 128 |  | 
|  | 129 | # Read GOE/AGA/00000/struct/structure.xml | 
|  | 130 | my $struct_xml = ''; | 
|  | 131 | $struct_xml .= $zip->getline while !$zip->eof; | 
| Peter Harders | 57c884e | 2020-07-16 01:28:52 +0200 | [diff] [blame] | 132 |  | 
| Akron | 2a60c53 | 2020-02-13 15:52:18 +0100 | [diff] [blame] | 133 | ok($zip->close, 'Closed'); | 
|  | 134 |  | 
| Akron | d89ef82 | 2020-02-17 12:42:09 +0100 | [diff] [blame] | 135 | $t = Test::XML::Loy->new($struct_xml); | 
|  | 136 | $t->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content'); | 
| Akron | 797e807 | 2020-02-13 07:59:40 +0100 | [diff] [blame] | 137 |  | 
| Akron | eac374d | 2020-07-07 09:00:44 +0200 | [diff] [blame] | 138 |  | 
|  | 139 | # Uncompress GOE/AGA/00000/base/tokens_aggressive.xml from zip file | 
|  | 140 | $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_aggressive.xml'); | 
|  | 141 |  | 
|  | 142 | # Read GOE/AGA/00000/base/tok.xml | 
|  | 143 | my $tokens_xml = ''; | 
|  | 144 | $tokens_xml .= $zip->getline while !$zip->eof; | 
|  | 145 | ok($zip->close, 'Closed'); | 
|  | 146 |  | 
|  | 147 | $t = Test::XML::Loy->new($tokens_xml); | 
|  | 148 | $t->attr_is('spanList span:nth-child(1)', 'to', 8); | 
|  | 149 |  | 
|  | 150 | $t->attr_is('spanList span#t_1', 'from', 9); | 
|  | 151 | $t->attr_is('spanList span#t_1', 'to', 11); | 
|  | 152 |  | 
|  | 153 | $t->attr_is('spanList span#t_67', 'from', 427); | 
|  | 154 | $t->attr_is('spanList span#t_67', 'to', 430); | 
|  | 155 |  | 
|  | 156 | $t->attr_is('spanList span#t_214', 'from', 1209); | 
|  | 157 | $t->attr_is('spanList span#t_214', 'to', 1212); | 
|  | 158 |  | 
|  | 159 | $t->element_count_is('spanList span', 227); | 
|  | 160 |  | 
|  | 161 |  | 
|  | 162 | # Uncompress GOE/AGA/00000/base/tokens_conservative.xml from zip file | 
|  | 163 | $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_conservative.xml'); | 
|  | 164 |  | 
| Akron | 8b511f9 | 2020-07-09 17:28:08 +0200 | [diff] [blame] | 165 | $tokens_xml = ''; | 
|  | 166 | $tokens_xml .= $zip->getline while !$zip->eof; | 
|  | 167 | ok($zip->close, 'Closed'); | 
|  | 168 |  | 
|  | 169 | $t = Test::XML::Loy->new($tokens_xml); | 
|  | 170 | $t->attr_is('spanList span:nth-child(1)', 'to', 8); | 
|  | 171 |  | 
|  | 172 | $t->attr_is('spanList span#t_1', 'from', 9); | 
|  | 173 | $t->attr_is('spanList span#t_1', 'to', 11); | 
|  | 174 |  | 
|  | 175 | $t->attr_is('spanList span#t_67', 'from', 427); | 
|  | 176 | $t->attr_is('spanList span#t_67', 'to', 430); | 
|  | 177 |  | 
|  | 178 | $t->attr_is('spanList span#t_214', 'from', 1209); | 
|  | 179 | $t->attr_is('spanList span#t_214', 'to', 1212); | 
|  | 180 |  | 
|  | 181 | $t->element_count_is('spanList span', 227); | 
|  | 182 |  | 
|  | 183 | # Tokenize with external tokenizer | 
|  | 184 | my $cmd = catfile($f, 'cmd', 'tokenizer.pl'); | 
|  | 185 |  | 
|  | 186 | stderr_like( | 
|  | 187 | sub { `cat '$file' | perl '$script' --tc='perl $cmd' > '$outzip'` }, | 
|  | 188 | qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!, | 
|  | 189 | 'Processing' | 
|  | 190 | ); | 
|  | 191 |  | 
| Peter Harders | 71f072b | 2020-07-15 14:15:01 +0200 | [diff] [blame] | 192 | # Uncompress GOE/AGA/00000/base/tokens.xml from zip file | 
| Akron | 8b511f9 | 2020-07-09 17:28:08 +0200 | [diff] [blame] | 193 | $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens.xml'); | 
|  | 194 |  | 
|  | 195 | # Read GOE/AGA/00000/base/tokens.xml | 
| Akron | eac374d | 2020-07-07 09:00:44 +0200 | [diff] [blame] | 196 | $tokens_xml = ''; | 
|  | 197 | $tokens_xml .= $zip->getline while !$zip->eof; | 
|  | 198 | ok($zip->close, 'Closed'); | 
|  | 199 |  | 
|  | 200 | $t = Test::XML::Loy->new($tokens_xml); | 
|  | 201 | $t->attr_is('spanList span:nth-child(1)', 'to', 8); | 
|  | 202 |  | 
|  | 203 | $t->attr_is('spanList span#t_1', 'from', 9); | 
|  | 204 | $t->attr_is('spanList span#t_1', 'to', 11); | 
|  | 205 |  | 
|  | 206 | $t->attr_is('spanList span#t_67', 'from', 427); | 
|  | 207 | $t->attr_is('spanList span#t_67', 'to', 430); | 
|  | 208 |  | 
|  | 209 | $t->attr_is('spanList span#t_214', 'from', 1209); | 
|  | 210 | $t->attr_is('spanList span#t_214', 'to', 1212); | 
|  | 211 |  | 
|  | 212 | $t->element_count_is('spanList span', 227); | 
|  | 213 |  | 
| Peter Harders | 71f072b | 2020-07-15 14:15:01 +0200 | [diff] [blame] | 214 |  | 
|  | 215 |  | 
|  | 216 | # TODO: call $script with approp. parameter for internal tokenization (actual: '$_GEN_TOK_INT = 1' hardcoded) | 
|  | 217 |  | 
|  | 218 |  | 
|  | 219 | # ~ test conservative tokenization ~ | 
|  | 220 |  | 
|  | 221 | $file = catfile($f, 'data', 'text_with_blanks.i5.xml'); | 
|  | 222 |  | 
|  | 223 | stderr_like( | 
|  | 224 | sub { `cat '$file' | perl '$script' > '$outzip'` }, | 
|  | 225 | qr!tei2korapxml: .*? text_id=CORP_DOC.00001!, | 
|  | 226 | 'Processing' | 
|  | 227 | ); | 
|  | 228 |  | 
|  | 229 | ok(-e $outzip, "File $outzip exists"); | 
|  | 230 |  | 
|  | 231 | $zip = IO::Uncompress::Unzip->new($outzip, Name => 'CORP/DOC/00001/base/tokens_conservative.xml'); | 
|  | 232 |  | 
|  | 233 | ok($zip, 'Zip-File is created'); | 
|  | 234 |  | 
|  | 235 | my $cons = ''; | 
|  | 236 | $cons .= $zip->getline while !$zip->eof; | 
|  | 237 | ok($zip->close, 'Closed'); | 
|  | 238 |  | 
|  | 239 | $t = Test::XML::Loy->new($cons); | 
|  | 240 | $t->attr_is('spanList span:nth-child(1)', 'to', 6); | 
|  | 241 |  | 
|  | 242 | $t->attr_is('spanList span#t_1', 'from', 7); | 
|  | 243 | $t->attr_is('spanList span#t_1', 'to', 9); | 
|  | 244 |  | 
|  | 245 | $t->attr_is('spanList span#t_3', 'from', 12); | 
|  | 246 | $t->attr_is('spanList span#t_3', 'to', 16); | 
|  | 247 |  | 
|  | 248 | $t->attr_is('spanList span#t_9', 'from', 36); | 
|  | 249 | $t->attr_is('spanList span#t_9', 'to', 37); | 
|  | 250 |  | 
|  | 251 | $t->attr_is('spanList span#t_13', 'from', 44); | 
|  | 252 | $t->attr_is('spanList span#t_13', 'to', 45);          # " | 
|  | 253 |  | 
|  | 254 | $t->attr_is('spanList span#t_14', 'from', 45);        # twenty-two | 
|  | 255 | $t->attr_is('spanList span#t_14', 'to', 55); | 
|  | 256 |  | 
|  | 257 | $t->attr_is('spanList span#t_15', 'from', 55);        # " | 
|  | 258 | $t->attr_is('spanList span#t_15', 'to', 56); | 
|  | 259 |  | 
|  | 260 | $t->attr_is('spanList span#t_19', 'from', 66); | 
|  | 261 | $t->attr_is('spanList span#t_19', 'to', 67); | 
|  | 262 |  | 
|  | 263 | $t->element_count_is('spanList span', 20); | 
|  | 264 |  | 
|  | 265 |  | 
|  | 266 | # ~ test aggressive tokenization ~ | 
|  | 267 |  | 
|  | 268 | $zip = IO::Uncompress::Unzip->new($outzip, Name => 'CORP/DOC/00001/base/tokens_aggressive.xml'); | 
|  | 269 |  | 
|  | 270 | ok($zip, 'Zip-File is created'); | 
|  | 271 |  | 
|  | 272 | my $aggr = ''; | 
|  | 273 | $aggr .= $zip->getline while !$zip->eof; | 
|  | 274 | ok($zip->close, 'Closed'); | 
|  | 275 |  | 
|  | 276 | $t = Test::XML::Loy->new($aggr); | 
|  | 277 |  | 
|  | 278 | $t->attr_is('spanList span:nth-child(1)', 'to', 6); | 
|  | 279 |  | 
|  | 280 | $t->attr_is('spanList span#t_1', 'from', 7); | 
|  | 281 | $t->attr_is('spanList span#t_1', 'to', 9); | 
|  | 282 |  | 
|  | 283 | $t->attr_is('spanList span#t_3', 'from', 12); | 
|  | 284 | $t->attr_is('spanList span#t_3', 'to', 16); | 
|  | 285 |  | 
|  | 286 | $t->attr_is('spanList span#t_9', 'from', 36); | 
|  | 287 | $t->attr_is('spanList span#t_9', 'to', 37); | 
|  | 288 |  | 
|  | 289 | $t->attr_is('spanList span#t_13', 'from', 44); | 
|  | 290 | $t->attr_is('spanList span#t_13', 'to', 45);          # " | 
|  | 291 |  | 
|  | 292 | $t->attr_is('spanList span#t_14', 'from', 45);        # twenty | 
|  | 293 | $t->attr_is('spanList span#t_14', 'to', 51); | 
|  | 294 |  | 
|  | 295 | $t->attr_is('spanList span#t_15', 'from', 51);        # - | 
|  | 296 | $t->attr_is('spanList span#t_15', 'to', 52); | 
|  | 297 |  | 
|  | 298 | $t->attr_is('spanList span#t_16', 'from', 52);        # two | 
|  | 299 | $t->attr_is('spanList span#t_16', 'to', 55); | 
|  | 300 |  | 
|  | 301 | $t->attr_is('spanList span#t_17', 'from', 55);        # " | 
|  | 302 | $t->attr_is('spanList span#t_17', 'to', 56); | 
|  | 303 |  | 
|  | 304 | $t->attr_is('spanList span#t_21', 'from', 66); | 
|  | 305 | $t->attr_is('spanList span#t_21', 'to', 67); | 
|  | 306 |  | 
|  | 307 | $t->element_count_is('spanList span', 22); | 
|  | 308 |  | 
|  | 309 |  | 
| Akron | 797e807 | 2020-02-13 07:59:40 +0100 | [diff] [blame] | 310 | done_testing; |