Automatically replace entities with their corresponding characters
The source of the symbolic entities is the entity file from the TEI-I5 DTD,
http://corpora.ids-mannheim.de/I5/DTD/ids-lat1.ent, which contains all
entities that have been used in DeReKo. The list is very similar to
the Mathematical, Greek and Symbolic characters for XHTML
(http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent), but not identical.
Numeric decimal and hexadecimal entities are replaced, too.
Change-Id: Id00376c6953e9ac96ef04703872f38d37ef68096
diff --git a/t/script.t b/t/script.t
index 81067d9..fbe28bf 100644
--- a/t/script.t
+++ b/t/script.t
@@ -484,6 +484,20 @@
->attr_is('layer', 'docid', 'WDD19_ß0000.10317');
};
+subtest 'Check entity replacement' => sub {  # Checks that the fixture's output contains literal characters in both data.xml and header.xml.
+ my $t = test_tei2korapxml(  # Project test helper; presumably runs the tei2korapxml script -- confirm in the helper.
+ file => catfile($f, 'data', 'text_with_entities.i5.xml'),  # Input fixture located under $f/data.
+ tmp => 'script_entity_replacement',
+ param => '-ti'  # NOTE(review): meaning of '-ti' is not visible here -- see the script's option docs.
+ )->stderr_like(qr!tei2korapxml: .*? text_id=CORP_DOC.00003!);  # stderr must report text_id=CORP_DOC.00003; the '.' before 00003 is an unescaped wildcard.
+
+ $t->unzip_xml('CORP/DOC/00003/data.xml')  # Inspect data.xml of text 00003 from the produced archive.
+ ->content_like(qr!üüü Aα≈„▒░▓█╗┐┌╔═─┬╦┴╩╝┘└╚│║┼╬┤╣╠├•ˇčˆ†‡ě€ƒ…‗ıι“„▄‹‘‚—–νœŒωΩ‰φπϖř”ρ›’‘šŠσ□■▪⊂˜™▀ŸžŽ!);
+
+ $t->unzip_xml('CORP/DOC/00003/header.xml')  # The header of the same text must contain the expected literal characters as well.
+ ->content_like(qr!üüü x α•α y!);
+};
+
subtest 'Test Log' => sub {
test_tei2korapxml(
tmp => 'script_out',