| use utf8; |
| use strict; |
| use warnings; |
| use Test::More; |
| |
| use FindBin; |
| BEGIN { |
| unshift @INC, "$FindBin::Bin/../lib"; |
| }; |
| |
| use Test::KorAP::XML::TEI qw!korap_tempfile test_tei2korapxml!; |
| |
# Load the module under test and request the four helpers exercised below.
use_ok(
    'KorAP::XML::TEI',
    qw(remove_xml_comments escape_xml escape_xml_minimal replace_entities)
);
| |
# Exercise remove_xml_comments() with comments that are complete within the
# passed line and with comments that continue in the lines still to be read
# from the file handle.
subtest 'remove_xml_comments' => sub {
    my ($fh) = korap_tempfile('tei');

    # Remainder of a comment whose opening '<!--' is passed as line argument.
    print {$fh} <<'HTML';
mehrzeiliger
Kommentar
-->
Test
HTML

    is(
        remove_xml_comments($fh, "hallo"),
        "hallo",
        'line without comment is returned unchanged'
    );
    is(
        remove_xml_comments($fh, "hallo <!-- Test -->"),
        "hallo ",
        'trailing single-line comment is removed'
    );
    is(
        remove_xml_comments($fh, "<!-- Test --> hallo"),
        " hallo",
        'leading single-line comment is removed'
    );

    seek($fh, 0, 0) or die "Unable to rewind temporary file: $!";

    is(
        remove_xml_comments($fh, '<!--'),
        "Test\n",
        'comment continued in the file handle is skipped'
    );

    # Rewind and overwrite the fixture; the second fixture is longer than
    # the first one, so nothing of the first fixture remains.
    seek($fh, 0, 0) or die "Unable to rewind temporary file: $!";

    print {$fh} <<'HTML';
mehrzeiliger
Kommentar
--><!-- Versuch
-->ist <!-- a --><!-- b --> ein Test
HTML

    seek($fh, 0, 0) or die "Unable to rewind temporary file: $!";

    # Two spaces are expected between 'ist' and 'ein': the blanks on both
    # sides of the removed inline comments are kept, as in the 'hallo '
    # and ' hallo' cases above.
    # NOTE(review): the expected value was restored from a copy with
    # collapsed whitespace - confirm against remove_xml_comments().
    is(
        remove_xml_comments($fh, 'Dies <!--'),
        "Dies ist  ein Test\n",
        'consecutive multi-line and inline comments are removed'
    );

    close($fh) or die "Unable to close temporary file: $!";
};
| |
| |
# Pass a text that starts with a multi-line XML comment through
# test_tei2korapxml() and check the text found in the resulting data.xml.
subtest 'remove_xml_comments in script' => sub {
    my $data_file = 'A/B/1/data.xml';

    my %template = (
        text      => "<!--\nDies ist ein\nmehrzeiligerKommentar -->Text1",
        textSigle => 'A/B.1',
        pattern   => 'xx',
    );

    my $run = test_tei2korapxml(
        template => \%template,
        param    => '--no-tokenizer',
    );

    $run->file_exists($data_file)
        ->unzip_xml($data_file)
        ->text_is('text', 'Text1');
};
| |
| |
# A template with an empty textSigle must not produce A/B/1/data.xml and
# must report the empty sigle and the skipped text on STDERR.
subtest 'skip missing dir in script' => sub {
    my %template = (
        text      => "Nur ein Test",
        textSigle => '',
        pattern   => 'missing_dir',
    );

    my $run = test_tei2korapxml(
        template => \%template,
        param    => '--no-tokenizer',
    );

    $run->file_exists_not('A/B/1/data.xml')
        ->stderr_like(qr!Empty '<textSigle />' \(L29\) in header!)
        ->stderr_like(qr!skipping this text!);
};
| |
| |
# escape_xml() is expected to replace the characters ", &, < and > by their
# predefined XML entity references.
# NOTE(review): the expected values had been entity-decoded, which made every
# assertion compare the input with itself; they are restored here to the
# predefined XML entities - confirm against escape_xml().
subtest 'escape_xml' => sub {
    is(
        escape_xml('"""'),
        '&quot;&quot;&quot;',
        'double quotes are escaped'
    );

    is(
        escape_xml('&&&'),
        '&amp;&amp;&amp;',
        'ampersands are escaped'
    );

    is(
        escape_xml('<<<'),
        '&lt;&lt;&lt;',
        'less-than signs are escaped'
    );

    is(
        escape_xml('>>>'),
        '&gt;&gt;&gt;',
        'greater-than signs are escaped'
    );

    is(
        escape_xml('<tag att1="foo" att2="bar">C&A</tag>'),
        '&lt;tag att1=&quot;foo&quot; att2=&quot;bar&quot;&gt;C&amp;A&lt;/tag&gt;',
        'markup with attributes is escaped completely'
    );
};
| |
# escape_xml_minimal() is expected to escape markup characters but to leave
# double quotes untouched.
# NOTE(review): the expected values had been entity-decoded, which made every
# assertion compare the input with itself. They are restored on the
# assumption that &, < and > are escaped and " is not - confirm against
# escape_xml_minimal(), in particular for '>'.
subtest 'escape_xml_minimal' => sub {
    is(
        escape_xml_minimal('"""'),
        '"""',
        'double quotes are kept'
    );

    is(
        escape_xml_minimal('&&&'),
        '&amp;&amp;&amp;',
        'ampersands are escaped'
    );

    is(
        escape_xml_minimal('<<<'),
        '&lt;&lt;&lt;',
        'less-than signs are escaped'
    );

    is(
        escape_xml_minimal('>>>'),
        '&gt;&gt;&gt;',
        'greater-than signs are escaped'
    );

    is(
        escape_xml_minimal('<tag att1="foo" att2="bar">C&A</tag>'),
        '&lt;tag att1="foo" att2="bar"&gt;C&amp;A&lt;/tag&gt;',
        'markup is escaped while attribute quotes are kept'
    );
};
| |
# replace_entities() is expected to turn named and numeric character
# references into the characters they denote.
# NOTE(review): the inputs had been entity-decoded (only &Horbar; and &squb;
# survived), so the assertions no longer exercised any replacement. The
# entity names below are reconstructed from the expected characters and from
# the alphabetical order of the list - confirm them against the entity table
# used by replace_entities(), in particular &ap;, &inodot;, &squ; and &squf;.
subtest 'Replace all entities' => sub {
    my $entities = join '', map { "&$_;" } qw(
        alpha ap bdquo blk12 blk14 blk34 block
        boxDL boxdl boxdr boxDR boxH boxh boxhd boxHD boxhu boxHU
        boxUL boxul boxur boxUR boxv boxV boxvh boxVH boxvl boxVL boxVR boxvr
        bull caron ccaron circ dagger Dagger ecaron euro fnof hellip Horbar
        inodot iota ldquo ldquor lhblk lsaquo lsquo lsquor mdash ndash nu
        oelig OElig omega Omega permil phi pi piv rcaron rdquo rho
        rsaquo rsquo rsquor scaron Scaron sigma squ squb squf sub
        tilde trade uhblk Yuml zcaron Zcaron
    );

    is(
        replace_entities($entities),
        'α≈„▒░▓█╗┐┌╔═─┬╦┴╩╝┘└╚│║┼╬┤╣╠├•ˇčˆ†‡ě€ƒ…‗ıι“„▄‹‘‚—–νœŒωΩ‰φπϖř”ρ›’‘šŠσ□■▪⊂˜™▀ŸžŽ',
        'named entities are replaced by their characters'
    );

    is(
        replace_entities('&#65;'),
        'A',
        'decimal character reference is replaced'
    );

    # NOTE(review): presumably a numeric reference compared with its named
    # counterpart - verify that &laquo; is a known entity.
    is(
        replace_entities('&#171;'),
        replace_entities('&laquo;'),
        'numeric and named reference yield the same result'
    );

    is(
        replace_entities('&#x41;'),
        'A',
        'hexadecimal character reference is replaced'
    );

    # NOTE(review): assumes the predefined XML entities are kept, so that
    # the result stays well-formed - TODO confirm.
    is(
        replace_entities('&amp;&lt;&gt;'),
        '&amp;&lt;&gt;',
        'predefined XML entities are kept'
    );
};
| |
# Finish the test run; the plan is derived from the tests executed above.
done_testing();