Automatically replace entities with their corresponding characters
The source of the symbolic entities is the entity file of the TEI-I5 DTD
(http://corpora.ids-mannheim.de/I5/DTD/ids-lat1.ent), which contains all
entities that have been used in DeReKo. The list is very similar to,
but not identical with, the Mathematical, Greek and Symbolic characters
for XHTML (http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent).
Numeric (decimal and hexadecimal) character references are replaced, too.
Change-Id: Id00376c6953e9ac96ef04703872f38d37ef68096
diff --git a/t/tei.t b/t/tei.t
index 69b4ee1..be5cb3d 100644
--- a/t/tei.t
+++ b/t/tei.t
@@ -1,3 +1,4 @@
+use utf8;
use strict;
use warnings;
use Test::More;
@@ -9,7 +10,7 @@
use Test::KorAP::XML::TEI qw!korap_tempfile test_tei2korapxml!;
-use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml', 'escape_xml_minimal');
+use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml', 'escape_xml_minimal', 'replace_entities');
subtest 'remove_xml_comments' => sub {
my ($fh, $filename) = korap_tempfile('tei');
@@ -114,4 +115,15 @@
);
};
+subtest 'Replace all entities' => sub { # symbolic, decimal and hexadecimal references
+ is( # &Horbar;, &rsquor; and &squb; must yield U+2017, U+2018 and U+25A0 (see expected string)
+ replace_entities('α≈„▒░▓█╗┐┌╔═─┬╦┴╩╝┘└╚│║┼╬┤╣╠├•ˇčˆ†‡ě€ƒ…&Horbar;ıι“„▄‹‘‚—–νœŒωΩ‰φπϖř”ρ›’&rsquor;šŠσ□&squb;▪⊂˜™▀ŸžŽ'),
+ 'α≈„▒░▓█╗┐┌╔═─┬╦┴╩╝┘└╚│║┼╬┤╣╠├•ˇčˆ†‡ě€ƒ…‗ıι“„▄‹‘‚—–νœŒωΩ‰φπϖř”ρ›’‘šŠσ□■▪⊂˜™▀ŸžŽ',
+ 'symbolic entities are replaced, literal characters are kept');
+ is(replace_entities('&#65;'), 'A', 'decimal character reference is replaced');
+ is(replace_entities('&#171;'), replace_entities('&laquo;'), 'numeric and symbolic reference agree'); # NOTE(review): assumes laquo is defined in ids-lat1.ent - confirm
+ is(replace_entities('&#x41;'), 'A', 'hexadecimal character reference is replaced');
+ is(replace_entities('&<>'), '&<>', 'bare &, < and > are left untouched');
+};
+
done_testing;