Automatically replace entities with their corresponding characters

The symbolic entities are taken from the entity file of the TEI-I5 DTD
( http://corpora.ids-mannheim.de/I5/DTD/ids-lat1.ent ), which contains
all entities that have been used in DeReKo. The list is very similar,
but not identical, to the Mathematical, Greek and Symbolic characters
for XHTML ( http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent ).

Numeric decimal and hexadecimal entities are replaced as well.

Change-Id: Id00376c6953e9ac96ef04703872f38d37ef68096
diff --git a/t/data/text_with_entities.i5.xml b/t/data/text_with_entities.i5.xml
new file mode 100644
index 0000000..e675176
--- /dev/null
+++ b/t/data/text_with_entities.i5.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<idsText>
+  <idsHeader type="text">
+    <fileDesc>
+      <titleStmt>
+        <textSigle>CORP/DOC.00003</textSigle>
+        <t.title>üüü x &alpha;&bull;&alpha; y</t.title>
+      </titleStmt>
+    </fileDesc>
+  </idsHeader>
+  <text>
+   <p id="p1">üüü &#x20;&#65;&alpha;&ap;&bdquo;&blk12;&blk14;&blk34;&block;&boxDL;&boxdl;&boxdr;&boxDR;&boxH;&boxh;&boxhd;&boxHD;&boxhu;&boxHU;&boxUL;&boxul;&boxur;&boxUR;&boxv;&boxV;&boxvh;&boxVH;&boxvl;&boxVL;&boxVR;&boxvr;&bull;&caron;&ccaron;&circ;&dagger;&Dagger;&ecaron;&euro;&fnof;&hellip;&Horbar;&inodot;&iota;&ldquo;&ldquor;&lhblk;&lsaquo;&lsquo;&lsquor;&mdash;&ndash;&nu;&oelig;&OElig;&omega;&Omega;&permil;&phi;&pi;&piv;&rcaron;&rdquo;&rho;&rsaquo;&rsquo;&rsquor;&scaron;&Scaron;&sigma;&squ;&squb;&squf;&sub;&tilde;&trade;&uhblk;&Yuml;&zcaron;&Zcaron;</p>
+  </text>
+</idsText>
diff --git a/t/script.t b/t/script.t
index 81067d9..fbe28bf 100644
--- a/t/script.t
+++ b/t/script.t
@@ -484,6 +484,20 @@
       ->attr_is('layer', 'docid', 'WDD19_ß0000.10317');
 };
 
+subtest 'Check entity replacement' => sub {  # end-to-end: entities in the I5 fixture must appear decoded in the output
+  my $t = test_tei2korapxml(
+    file => catfile($f, 'data', 'text_with_entities.i5.xml'),
+    tmp => 'script_entity_replacement',
+    param => '-ti'
+  )->stderr_like(qr!tei2korapxml: .*? text_id=CORP_DOC\.00003!);  # dot escaped so only the literal text sigle matches
+
+  $t->unzip_xml('CORP/DOC/00003/data.xml')  # body text: symbolic, decimal (&#65;) and hexadecimal (&#x20;) entities
+    ->content_like(qr!üüü  Aα≈„▒░▓█╗┐┌╔═─┬╦┴╩╝┘└╚│║┼╬┤╣╠├•ˇčˆ†‡ě€ƒ…‗ıι“„▄‹‘‚—–νœŒωΩ‰φπϖř”ρ›’‘šŠσ□■▪⊂˜™▀ŸžŽ!);
+
+  $t->unzip_xml('CORP/DOC/00003/header.xml')  # header: the title in the fixture contains &alpha;&bull;&alpha;
+    ->content_like(qr!üüü x α•α y!);
+};
+
 subtest 'Test Log' => sub {
   test_tei2korapxml(
     tmp => 'script_out',
diff --git a/t/tei.t b/t/tei.t
index 69b4ee1..be5cb3d 100644
--- a/t/tei.t
+++ b/t/tei.t
@@ -1,3 +1,4 @@
+use utf8;
 use strict;
 use warnings;
 use Test::More;
@@ -9,7 +10,7 @@
 
 use Test::KorAP::XML::TEI qw!korap_tempfile test_tei2korapxml!;
 
-use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml', 'escape_xml_minimal');
+use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml', 'escape_xml_minimal', 'replace_entities');
 
 subtest 'remove_xml_comments' => sub {
   my ($fh, $filename) = korap_tempfile('tei');
@@ -114,4 +115,15 @@
   );
 };
 
+subtest 'Replace all entities' => sub {  # unit test for replace_entities(); each check is named for readable failures
+  is(
+    replace_entities('&alpha;&ap;&bdquo;&blk12;&blk14;&blk34;&block;&boxDL;&boxdl;&boxdr;&boxDR;&boxH;&boxh;&boxhd;&boxHD;&boxhu;&boxHU;&boxUL;&boxul;&boxur;&boxUR;&boxv;&boxV;&boxvh;&boxVH;&boxvl;&boxVL;&boxVR;&boxvr;&bull;&caron;&ccaron;&circ;&dagger;&Dagger;&ecaron;&euro;&fnof;&hellip;&Horbar;&inodot;&iota;&ldquo;&ldquor;&lhblk;&lsaquo;&lsquo;&lsquor;&mdash;&ndash;&nu;&oelig;&OElig;&omega;&Omega;&permil;&phi;&pi;&piv;&rcaron;&rdquo;&rho;&rsaquo;&rsquo;&rsquor;&scaron;&Scaron;&sigma;&squ;&squb;&squf;&sub;&tilde;&trade;&uhblk;&Yuml;&zcaron;&Zcaron;'),
+    'α≈„▒░▓█╗┐┌╔═─┬╦┴╩╝┘└╚│║┼╬┤╣╠├•ˇčˆ†‡ě€ƒ…‗ıι“„▄‹‘‚—–νœŒωΩ‰φπϖř”ρ›’‘šŠσ□■▪⊂˜™▀ŸžŽ', 'symbolic entities are replaced by their characters'
+  );
+  is(replace_entities('&#65;'), 'A', 'decimal numeric entity is replaced');
+  is(replace_entities('&#171;'), replace_entities('&#x00AB;'), 'decimal and hexadecimal forms of one code point agree');
+  is(replace_entities('&#x41;'), 'A', 'hexadecimal numeric entity is replaced');
+  is(replace_entities('&amp;&lt;&gt;'), '&amp;&lt;&gt;', '&amp;, &lt; and &gt; are left untouched');
+};
+
 done_testing;