# blob: 98b945e841c6f44b50c1f7c5f4ff9545d6823e19 [file] [log] [blame]
use utf8;    # the expected strings further down contain UTF-8 literals
use strict;
use warnings;
use Test::More;

use FindBin;
BEGIN {
  # Put the "lib" directory that sits beside this script's directory first
  # in @INC, so the modules loaded below are found there before anywhere else.
  unshift @INC, "$FindBin::Bin/../lib";
};

use Test::KorAP::XML::TEI qw!korap_tempfile test_tei2korapxml!;

# Load the module under test and import the four helpers exercised below.
use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml', 'escape_xml_minimal', 'replace_entities');

# Exercises remove_xml_comments() directly. The first argument is a file
# handle and the second the text to clean; the expectations below imply that
# a comment left open in the text is continued from the handle's content.
subtest 'remove_xml_comments' => sub {
  # korap_tempfile() returns more than the handle (the original code also
  # captured a file name), but only the handle is used in this subtest.
  my ($fh) = korap_tempfile('tei');

  print $fh <<'HTML';
mehrzeiliger
Kommentar
 -->
Test
HTML

  # Comments that open and close within the passed text.
  is(remove_xml_comments($fh, "hallo"), "hallo",
     'text without a comment is returned unchanged');
  is(remove_xml_comments($fh, "hallo <!-- Test -->"), "hallo ",
     'trailing comment is removed');
  is(remove_xml_comments($fh, "<!-- Test --> hallo"), " hallo",
     'leading comment is removed');

  # Rewind so the content written above can be read from the start.
  seek($fh, 0, 0);

  is(remove_xml_comments($fh, '<!--'), "Test\n",
     'comment opened in the text is closed by the file content');

  # Rewind and overwrite in place. No truncate is needed because the second
  # payload is longer than the first, so no stale bytes remain behind it.
  seek($fh, 0, 0);

  print $fh <<'HTML';
mehrzeiliger
Kommentar
 --><!-- Versuch
-->ist <!-- a --><!-- b --> ein Test
HTML

  seek($fh, 0, 0);

  is(remove_xml_comments($fh, 'Dies <!--'), "Dies ist ein Test\n",
     'adjacent and inline comments spanning several lines are removed');

  close($fh);
};


# Runs the conversion through the test_tei2korapxml() helper with a text that
# starts with a multi-line XML comment, and expects only the text following
# the comment ("Text1") in the resulting A/B/1/data.xml.
subtest 'remove_xml_comments in script' => sub {
  test_tei2korapxml(
    template => {
      text => "<!--\nDies ist ein\nmehrzeiligerKommentar -->Text1",
      textSigle => 'A/B.1',
      pattern => 'xx'
    },
    param => '--no-tokenizer'
  )
    ->file_exists('A/B/1/data.xml')
    ->unzip_xml('A/B/1/data.xml')
    ->text_is('text', 'Text1');
};


# Runs the conversion with an empty textSigle and expects that no
# A/B/1/data.xml is produced and that STDERR carries both the "Empty
# '<textSigle />'" message and the "skipping this text" message.
# NOTE(review): the "L29" in the first pattern presumably refers to a line of
# the template rendered by the helper - confirm against the helper's template.
subtest 'skip missing dir in script' => sub {
  test_tei2korapxml(
    template => {
      text => "Nur ein Test",
      textSigle => '',
      pattern => 'missing_dir'
    },
    param => '--no-tokenizer'
  )
    ->file_exists_not('A/B/1/data.xml')
    ->stderr_like(qr!Empty '<textSigle />' \(L29\) in header!)
    ->stderr_like(qr!skipping this text!)
    ;
};


# escape_xml() is expected to escape all four characters ", &, < and >.
subtest 'escape_xml' => sub {
  is(
    escape_xml('"""'),
    '&quot;&quot;&quot;',
    'double quotes are escaped'
  );

  is(
    escape_xml('&&&'),
    '&amp;&amp;&amp;',
    'ampersands are escaped'
  );

  is(
    escape_xml('<<<'),
    '&lt;&lt;&lt;',
    'less-than signs are escaped'
  );

  is(
    escape_xml('>>>'),
    '&gt;&gt;&gt;',
    'greater-than signs are escaped'
  );

  is(
    escape_xml('<tag att1="foo" att2="bar">C&A</tag>'),
    '&lt;tag att1=&quot;foo&quot; att2=&quot;bar&quot;&gt;C&amp;A&lt;/tag&gt;',
    'all special characters in a mixed string are escaped'
  );
};

# escape_xml_minimal() is expected to escape &, < and > but, unlike
# escape_xml(), to leave double quotes as they are.
subtest 'escape_xml_minimal' => sub {
  is(
    escape_xml_minimal('"""'),
    '"""',
    'double quotes are left unescaped'
  );

  is(
    escape_xml_minimal('&&&'),
    '&amp;&amp;&amp;',
    'ampersands are escaped'
  );

  is(
    escape_xml_minimal('<<<'),
    '&lt;&lt;&lt;',
    'less-than signs are escaped'
  );

  is(
    escape_xml_minimal('>>>'),
    '&gt;&gt;&gt;',
    'greater-than signs are escaped'
  );

  is(
    escape_xml_minimal('<tag att1="foo" att2="bar">C&A</tag>'),
    '&lt;tag att1="foo" att2="bar"&gt;C&amp;A&lt;/tag&gt;',
    'only &, < and > are escaped in a mixed string'
  );
};

# replace_entities() is expected to turn named entities and numeric character
# references into their characters, while leaving &amp;, &lt; and &gt; as
# entities.
subtest 'Replace all entities' => sub {
  is(
    replace_entities('&alpha;&ap;&bdquo;&blk12;&blk14;&blk34;&block;&boxDL;&boxdl;&boxdr;&boxDR;&boxH;&boxh;&boxhd;&boxHD;&boxhu;&boxHU;&boxUL;&boxul;&boxur;&boxUR;&boxv;&boxV;&boxvh;&boxVH;&boxvl;&boxVL;&boxVR;&boxvr;&bull;&caron;&ccaron;&circ;&dagger;&Dagger;&ecaron;&euro;&fnof;&hellip;&Horbar;&inodot;&iota;&ldquo;&ldquor;&lhblk;&lsaquo;&lsquo;&lsquor;&mdash;&ndash;&nu;&oelig;&OElig;&omega;&Omega;&permil;&phi;&pi;&piv;&rcaron;&rdquo;&rho;&rsaquo;&rsquo;&rsquor;&scaron;&Scaron;&sigma;&squ;&squb;&squf;&sub;&tilde;&trade;&uhblk;&Yuml;&zcaron;&Zcaron;'),
    'α≈„▒░▓█╗┐┌╔═─┬╦┴╩╝┘└╚│║┼╬┤╣╠├•ˇčˆ†‡ě€ƒ…‗ıι“„▄‹‘‚—–νœŒωΩ‰φπϖř”ρ›’‘šŠσ□■▪⊂˜™▀ŸžŽ',
    'named entities are replaced by their characters'
  );
  is(replace_entities('&#65;'), 'A',
     'decimal character reference is replaced');
  is(replace_entities('&#171;'), replace_entities('&#x00AB;'),
     'decimal and hexadecimal references to the same code point agree');
  is(replace_entities('&#x41;'), 'A',
     'hexadecimal character reference is replaced');
  is(replace_entities('&amp;&lt;&gt;'), '&amp;&lt;&gt;',
     'XML-reserved entities are left untouched');
};

# No fixed plan is declared; the number of tests is whatever ran above.
done_testing;