Marc Kupietz | 8a954e5 | 2021-02-16 22:03:07 +0100 | [diff] [blame] | 1 | use utf8; |
Akron | 4f67cd4 | 2020-07-02 12:27:58 +0200 | [diff] [blame] | 2 | use strict; |
| 3 | use warnings; |
| 4 | use Test::More; |
Akron | 4f67cd4 | 2020-07-02 12:27:58 +0200 | [diff] [blame] | 5 | |
| 6 | use FindBin; |
| 7 | BEGIN { |
| 8 | unshift @INC, "$FindBin::Bin/../lib"; |
| 9 | }; |
| 10 | |
Akron | 19c6c35 | 2020-08-01 13:29:00 +0200 | [diff] [blame] | 11 | use Test::KorAP::XML::TEI qw!korap_tempfile test_tei2korapxml!; |
Peter Harders | 42e18a6 | 2020-07-21 02:43:26 +0200 | [diff] [blame] | 12 | |
Marc Kupietz | 8a954e5 | 2021-02-16 22:03:07 +0100 | [diff] [blame] | 13 | use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml', 'escape_xml_minimal', 'replace_entities'); |
Akron | 4f67cd4 | 2020-07-02 12:27:58 +0200 | [diff] [blame] | 14 | |
# Unit tests for remove_xml_comments($fh, $string).
#
# The cases cover comments that are complete inside the string argument as
# well as comments that are opened in the string and only closed by content
# of the file behind $fh.
subtest 'remove_xml_comments' => sub {

  # Only the handle of the temporary file is needed; the file name that
  # korap_tempfile() also returns is not used by this subtest.
  my ($fh) = korap_tempfile('tei');

  # Continuation of a comment whose opening '<!--' comes from the string
  # argument further down.
  # NOTE(review): leading whitespace inside this heredoc could not be
  # recovered from the reviewed copy - confirm against the repository.
  print {$fh} <<'HTML';
mehrzeiliger
Kommentar
-->
Test
HTML

  # Comments that are complete within the string argument.
  is(remove_xml_comments($fh, "hallo"), "hallo",
     'string without comment is unchanged');
  is(remove_xml_comments($fh, "hallo <!-- Test -->"), "hallo ",
     'comment at end of string is removed');
  is(remove_xml_comments($fh, "<!-- Test --> hallo"), " hallo",
     'comment at start of string is removed');

  # Rewind so the following call sees the file content from its beginning.
  seek($fh, 0, 0);

  is(remove_xml_comments($fh, '<!--'), "Test\n",
     'comment opened in the string is closed by content of the file');

  # Rewind and overwrite the file with a second, longer fixture that
  # contains several consecutive comments.
  seek($fh, 0, 0);

  print {$fh} <<'HTML';
mehrzeiliger
Kommentar
--><!-- Versuch
-->ist <!-- a --><!-- b --> ein Test
HTML

  seek($fh, 0, 0);

  # NOTE(review): the fixture has a blank on both sides of the two inline
  # comments; whitespace in the expected value may have been collapsed in
  # the reviewed copy - confirm whether it should read "Dies ist  ein Test\n".
  is(remove_xml_comments($fh, 'Dies <!--'), "Dies ist ein Test\n",
     'several consecutive comments across string and file are removed');

  # Buffered write errors only surface on close, so report the result.
  ok(close($fh), 'temporary file closed');
};
| 48 | |
Akron | 19c6c35 | 2020-08-01 13:29:00 +0200 | [diff] [blame] | 49 | |
# Script-level check: a text that starts with a multi-line XML comment must
# end up as 'Text1' in the 'text' element of A/B/1/data.xml.
subtest 'remove_xml_comments in script' => sub {

  # Values handed to the test helper as its 'template' argument.
  my %template = (
    text      => "<!--\nDies ist ein\nmehrzeiligerKommentar -->Text1",
    textSigle => 'A/B.1',
    pattern   => 'xx',
  );

  my $run = test_tei2korapxml(
    template => \%template,
    param    => '--no-tokenizer',
  );

  # The same archive member is first checked for existence, then unpacked.
  my $data_file = 'A/B/1/data.xml';

  $run->file_exists($data_file)
      ->unzip_xml($data_file)
      ->text_is('text', 'Text1');
};
| 63 | |
| 64 | |
# Script-level check: a text with an empty textSigle must not produce
# A/B/1/data.xml, and both expected messages must show up on STDERR.
subtest 'skip missing dir in script' => sub {

  # Messages expected on STDERR.
  my $empty_sigle_re = qr!Empty '<textSigle />' \(L29\) in header!;
  my $skipping_re    = qr!skipping this text!;

  my $run = test_tei2korapxml(
    template => {
      text      => "Nur ein Test",
      textSigle => '',
      pattern   => 'missing_dir',
    },
    param => '--no-tokenizer',
  );

  # Kept as one chain so each check is invoked on the return value of the
  # previous one, exactly as before.
  $run->file_exists_not('A/B/1/data.xml')
      ->stderr_like($empty_sigle_re)
      ->stderr_like($skipping_re);
};
| 79 | |
| 80 | |
# Unit tests for escape_xml().
#
# The expected values are the XML predefined entity references for '"',
# '&', '<' and '>'. In the reviewed copy the expected strings were identical
# to the inputs (the entity references had been decoded), which would turn
# every assertion into an identity check instead of an escaping check.
subtest 'escape_xml' => sub {

  # Each case is [ input, expected output ].
  my @cases = (
    [ '"""', '&quot;&quot;&quot;' ],
    [ '&&&', '&amp;&amp;&amp;' ],
    [ '<<<', '&lt;&lt;&lt;' ],
    [ '>>>', '&gt;&gt;&gt;' ],
    [
      '<tag att1="foo" att2="bar">C&A</tag>',
      '&lt;tag att1=&quot;foo&quot; att2=&quot;bar&quot;&gt;C&amp;A&lt;/tag&gt;'
    ],
  );

  for my $case (@cases) {
    my ($input, $expected) = @{$case};
    is(escape_xml($input), $expected, "escape_xml('$input')");
  }
};
| 107 | |
# Unit tests for escape_xml_minimal().
#
# Double quotes are expected to pass through unchanged (first case), while
# markup characters are expected as entity references. In the reviewed copy
# all expected strings were identical to the inputs (the entity references
# had been decoded), which would turn the assertions into identity checks.
subtest 'escape_xml_minimal' => sub {

  # Each case is [ input, expected output ].
  my @cases = (
    [ '"""', '"""' ],
    [ '&&&', '&amp;&amp;&amp;' ],
    [ '<<<', '&lt;&lt;&lt;' ],

    # NOTE(review): assumes '>' is escaped by the minimal variant as well -
    # confirm against the implementation of escape_xml_minimal().
    [ '>>>', '&gt;&gt;&gt;' ],
    [
      '<tag att1="foo" att2="bar">C&A</tag>',
      '&lt;tag att1="foo" att2="bar"&gt;C&amp;A&lt;/tag&gt;'
    ],
  );

  for my $case (@cases) {
    my ($input, $expected) = @{$case};
    is(escape_xml_minimal($input), $expected, "escape_xml_minimal('$input')");
  }
};
Akron | 2d547bc | 2020-07-04 10:34:35 +0200 | [diff] [blame] | 134 | |
# Unit tests for replace_entities().
#
# NOTE(review): apart from '&Horbar;' and '&squb;' the inputs below contain
# literal characters where entity references would be expected; they look
# like they were decoded in the reviewed copy. The literals are reproduced
# unchanged here - restore the original entity names from the repository.
subtest 'Replace all entities' => sub {

  # One long string mixing many characters with the two named entities.
  my $mixed_input =
    'α≈„▒░▓█╗┐┌╔═─┬╦┴╩╝┘└╚│║┼╬┤╣╠├•ˇčˆ†‡ě€ƒ…&Horbar;ıι“„▄‹‘‚—–νœŒωΩ‰φπϖř”ρ›’’šŠσ□&squb;▪⊂˜™▀ŸžŽ';
  my $mixed_expected =
    'α≈„▒░▓█╗┐┌╔═─┬╦┴╩╝┘└╚│║┼╬┤╣╠├•ˇčˆ†‡ě€ƒ…‗ıι“„▄‹‘‚—–νœŒωΩ‰φπϖř”ρ›’‘šŠσ□■▪⊂˜™▀ŸžŽ';

  is(replace_entities($mixed_input), $mixed_expected);

  # Single-character inputs.
  my $first_a = replace_entities('A');
  is($first_a, 'A');

  # Two spellings of the same character must yield the same result.
  my $laquo_left  = replace_entities('«');
  my $laquo_right = replace_entities('«');
  is($laquo_left, $laquo_right);

  my $second_a = replace_entities('A');
  is($second_a, 'A');

  # The markup characters are expected to come back as passed in.
  my $markup = replace_entities('&<>');
  is($markup, '&<>');
};
| 145 | |
Akron | 4f67cd4 | 2020-07-02 12:27:58 +0200 | [diff] [blame] | 146 | done_testing; |