Automatically replace entities with their corresponding characters

Source of the symbolic entities is the entity file from the TEI-I5 DTD
http://corpora.ids-mannheim.de/I5/DTD/ids-lat1.ent which contains all
entities that have been used in DeReKo. The list is very similar to
the Mathematical, Greek and Symbolic characters for XHTML
http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent, but not identical.

Numeric decimal and hexadecimal entities are replaced, too.

Change-Id: Id00376c6953e9ac96ef04703872f38d37ef68096
diff --git a/Changes b/Changes
index 106043e..d3252c4 100644
--- a/Changes
+++ b/Changes
@@ -1,6 +1,7 @@
         - -s option added that uses sentence boundaries provided by the KorAP tokenizer (-tk)
         - tokenizer invocation comments removed from KorAP XML output
         - indentation of </span> tags fixed
+        - character entities that are used in DeReKo are automatically replaced by their corresponding characters
 0.03 2021-01-12
         - Update KorAP-Tokenizer to released 2.0 version
         - Improve test suite for recent version
diff --git a/lib/KorAP/XML/TEI.pm b/lib/KorAP/XML/TEI.pm
index de68534..493fce5 100644
--- a/lib/KorAP/XML/TEI.pm
+++ b/lib/KorAP/XML/TEI.pm
@@ -1,9 +1,10 @@
+use utf8;
 package KorAP::XML::TEI;
 use strict;
 use warnings;
 
 use Exporter 'import';
-our @EXPORT_OK = qw(remove_xml_comments escape_xml escape_xml_minimal);
+our @EXPORT_OK = qw(remove_xml_comments escape_xml escape_xml_minimal replace_entities);
 
 # convert '&', '<' and '>' into their corresponding sgml-entities
 my %ent_without_quot = (
@@ -17,6 +18,87 @@
   '"' => '&quot;'
 );
 
+# Table generated with: GET http://corpora.ids-mannheim.de/I5/DTD/ids-lat1.ent | perl -C255 -wlne 'print "'\''$1'\''\t=>\t '\''", chr($2), "'\'', # $3" if(/ENTITY (\S*)\s".#(\d+).*<!-- (.*) -->/)'
+my %html_entities = ( # symbolic entity name => replacement character
+  'alpha'  => 'α', # GREEK SMALL LETTER ALPHA
+  'ap'     => '≈', # ALMOST EQUAL TO
+  'bdquo'  => '„', # DOUBLE LOW-9 QUOTATION MARK
+  'blk12'  => '▒', # MEDIUM SHADE
+  'blk14'  => '░', # LIGHT SHADE
+  'blk34'  => '▓', # DARK SHADE
+  'block'  => '█', # FULL BLOCK
+  'boxDL'  => '╗', # BOX DRAWINGS DOUBLE DOWN AND LEFT
+  'boxdl'  => '┐', # BOX DRAWINGS LIGHT DOWN AND LEFT
+  'boxdr'  => '┌', # BOX DRAWINGS LIGHT DOWN AND RIGHT
+  'boxDR'  => '╔', # BOX DRAWINGS DOUBLE DOWN AND RIGHT
+  'boxH'   => '═', # BOX DRAWINGS DOUBLE HORIZONTAL
+  'boxh'   => '─', # BOX DRAWINGS LIGHT HORIZONTAL
+  'boxhd'  => '┬', # BOX DRAWINGS LIGHT DOWN AND HORIZONTAL
+  'boxHD'  => '╦', # BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL
+  'boxhu'  => '┴', # BOX DRAWINGS LIGHT UP AND HORIZONTAL
+  'boxHU'  => '╩', # BOX DRAWINGS DOUBLE UP AND HORIZONTAL
+  'boxUL'  => '╝', # BOX DRAWINGS DOUBLE UP AND LEFT
+  'boxul'  => '┘', # BOX DRAWINGS LIGHT UP AND LEFT
+  'boxur'  => '└', # BOX DRAWINGS LIGHT UP AND RIGHT
+  'boxUR'  => '╚', # BOX DRAWINGS DOUBLE UP AND RIGHT
+  'boxv'   => '│', # BOX DRAWINGS LIGHT VERTICAL
+  'boxV'   => '║', # BOX DRAWINGS DOUBLE VERTICAL
+  'boxvh'  => '┼', # BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL
+  'boxVH'  => '╬', # BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL
+  'boxvl'  => '┤', # BOX DRAWINGS LIGHT VERTICAL AND LEFT
+  'boxVL'  => '╣', # BOX DRAWINGS DOUBLE VERTICAL AND LEFT
+  'boxVR'  => '╠', # BOX DRAWINGS DOUBLE VERTICAL AND RIGHT
+  'boxvr'  => '├', # BOX DRAWINGS LIGHT VERTICAL AND RIGHT
+  'bull'   => '•', # BULLET
+  'caron'  => 'ˇ', # CARON
+  'ccaron' => 'č', # LATIN SMALL LETTER C WITH CARON
+  'circ'   => 'ˆ', # MODIFIER LETTER CIRCUMFLEX ACCENT
+  'dagger' => '†', # DAGGER
+  'Dagger' => '‡', # DOUBLE DAGGER
+  'ecaron' => 'ě', # LATIN SMALL LETTER E WITH CARON
+  'euro'   => '€', # EURO SIGN
+  'fnof'   => 'ƒ', # LATIN SMALL LETTER F WITH HOOK
+  'hellip' => '…', # HORIZONTAL ELLIPSIS
+  'Horbar' => '‗', # DOUBLE LOW LINE
+  'inodot' => 'ı', # LATIN SMALL LETTER DOTLESS I
+  'iota'   => 'ι', # GREEK SMALL LETTER IOTA
+  'ldquo'  => '“', # LEFT DOUBLE QUOTATION MARK
+  'ldquor' => '„', # DOUBLE LOW-9 QUOTATION MARK
+  'lhblk'  => '▄', # LOWER HALF BLOCK
+  'lsaquo' => '‹', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+  'lsquo'  => '‘', # LEFT SINGLE QUOTATION MARK
+  'lsquor' => '‚', # SINGLE LOW-9 QUOTATION MARK
+  'mdash'  => '—', # EM DASH
+  'ndash'  => '–', # EN DASH
+  'nu'     => 'ν', # GREEK SMALL LETTER NU
+  'oelig'  => 'œ', # LATIN SMALL LIGATURE OE
+  'OElig'  => 'Œ', # LATIN CAPITAL LIGATURE OE
+  'omega'  => 'ω', # GREEK SMALL LETTER OMEGA
+  'Omega'  => 'Ω', # GREEK CAPITAL LETTER OMEGA
+  'permil' => '‰', # PER MILLE SIGN
+  'phi'    => 'φ', # GREEK SMALL LETTER PHI
+  'pi'     => 'π', # GREEK SMALL LETTER PI
+  'piv'    => 'ϖ', # GREEK PI SYMBOL
+  'rcaron' => 'ř', # LATIN SMALL LETTER R WITH CARON
+  'rdquo'  => '”', # RIGHT DOUBLE QUOTATION MARK
+  'rho'    => 'ρ', # GREEK SMALL LETTER RHO
+  'rsaquo' => '›', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+  'rsquo'  => '’', # RIGHT SINGLE QUOTATION MARK
+  'rsquor' => '‘', # LEFT SINGLE QUOTATION MARK
+  'scaron' => 'š', # LATIN SMALL LETTER S WITH CARON
+  'Scaron' => 'Š', # LATIN CAPITAL LETTER S WITH CARON
+  'sigma'  => 'σ', # GREEK SMALL LETTER SIGMA
+  'squ'    => '□', # WHITE SQUARE
+  'squb'   => '■', # BLACK SQUARE
+  'squf'   => '▪', # BLACK SMALL SQUARE
+  'sub'    => '⊂', # SUBSET OF
+  'tilde'  => '˜', # SMALL TILDE
+  'trade'  => '™', # TRADE MARK SIGN
+  'uhblk'  => '▀', # UPPER HALF BLOCK
+  'Yuml'   => 'Ÿ', # LATIN CAPITAL LETTER Y WITH DIAERESIS
+  'zcaron' => 'ž', # LATIN SMALL LETTER Z WITH CARON
+  'Zcaron' => 'Ž', # LATIN CAPITAL LETTER Z WITH CARON
+);
 
 # remove xml comments
 sub remove_xml_comments {
@@ -82,5 +164,13 @@
   ($_[0] // '') =~ s/([&<>])/$ent_without_quot{$1}/ger;
 };
 
+# Replace numeric and known symbolic entities in one pass (no double decoding);
+# other names (e.g. &amp;) are kept. NOTE(review): &#38; and &#60; still yield raw & and <
+sub replace_entities {
+  my $text = shift; # lexical copy, so the caller's $_ is left untouched
+  $text =~ s{&(?:#x([0-9A-Fa-f]+)|#([0-9]+)|(\w+));}
+            {defined $1 ? chr(hex $1) : defined $2 ? chr($2) : $html_entities{$3} // "&$3;"}ge;
+  return $text;
+};
 
 1;
diff --git a/lib/KorAP/XML/TEI/Header.pm b/lib/KorAP/XML/TEI/Header.pm
index a03f7c4..c397c7a 100644
--- a/lib/KorAP/XML/TEI/Header.pm
+++ b/lib/KorAP/XML/TEI/Header.pm
@@ -4,6 +4,7 @@
 use Log::Any qw($log);
 use Encode qw(encode decode);
 use KorAP::XML::TEI qw!escape_xml!;
+use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
 
 # Parsing of i5 header files
 
@@ -63,6 +64,7 @@
   while (<$fh>) {
 
     $_ = decode($self->[INPUTENC], $_);
+    $_ = replace_entities($_);
 
     # Change:
     #   This version keeps comments in header files
diff --git a/script/tei2korapxml b/script/tei2korapxml
index f15376f..bd54462 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -19,7 +19,7 @@
   unshift @INC, "$FindBin::Bin/../lib";
 };
 
-use KorAP::XML::TEI qw!remove_xml_comments!;
+use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
 use KorAP::XML::TEI::Tokenizer::External;
 use KorAP::XML::TEI::Tokenizer::Conservative;
 use KorAP::XML::TEI::Tokenizer::Aggressive;
@@ -236,6 +236,7 @@
   };
 
   $_ = decode($input_enc, $_);
+  $_ = replace_entities($_);
 
   if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
 
@@ -258,6 +259,7 @@
 
       $_ = remove_xml_comments( $input_fh, $_ );
       $_ = decode($input_enc, $_);
+      $_ = replace_entities($_);
 
       # ~ end of text body ~
       if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
diff --git a/t/data/text_with_entities.i5.xml b/t/data/text_with_entities.i5.xml
new file mode 100644
index 0000000..e675176
--- /dev/null
+++ b/t/data/text_with_entities.i5.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<idsText>
+  <idsHeader type="text">
+    <fileDesc>
+      <titleStmt>
+        <textSigle>CORP/DOC.00003</textSigle>
+        <t.title>üüü x &alpha;&bull;&alpha; y</t.title>
+      </titleStmt>
+    </fileDesc>
+  </idsHeader>
+  <text>
+   <p id="p1">üüü &#x20;&#65;&alpha;&ap;&bdquo;&blk12;&blk14;&blk34;&block;&boxDL;&boxdl;&boxdr;&boxDR;&boxH;&boxh;&boxhd;&boxHD;&boxhu;&boxHU;&boxUL;&boxul;&boxur;&boxUR;&boxv;&boxV;&boxvh;&boxVH;&boxvl;&boxVL;&boxVR;&boxvr;&bull;&caron;&ccaron;&circ;&dagger;&Dagger;&ecaron;&euro;&fnof;&hellip;&Horbar;&inodot;&iota;&ldquo;&ldquor;&lhblk;&lsaquo;&lsquo;&lsquor;&mdash;&ndash;&nu;&oelig;&OElig;&omega;&Omega;&permil;&phi;&pi;&piv;&rcaron;&rdquo;&rho;&rsaquo;&rsquo;&rsquor;&scaron;&Scaron;&sigma;&squ;&squb;&squf;&sub;&tilde;&trade;&uhblk;&Yuml;&zcaron;&Zcaron;</p>
+  </text>
+</idsText>
diff --git a/t/script.t b/t/script.t
index 81067d9..fbe28bf 100644
--- a/t/script.t
+++ b/t/script.t
@@ -484,6 +484,20 @@
       ->attr_is('layer', 'docid', 'WDD19_ß0000.10317');
 };
 
+subtest 'Check entity replacement' => sub {
+  my $t = test_tei2korapxml(
+    file => catfile($f, 'data', 'text_with_entities.i5.xml'),
+    tmp => 'script_entity_replacement',
+    param => '-ti'
+  )->stderr_like(qr!tei2korapxml: .*? text_id=CORP_DOC.00003!);
+  # Text body: numeric and symbolic entities must show up as plain characters
+  $t->unzip_xml('CORP/DOC/00003/data.xml')
+    ->content_like(qr!üüü  Aα≈„▒░▓█╗┐┌╔═─┬╦┴╩╝┘└╚│║┼╬┤╣╠├•ˇčˆ†‡ě€ƒ…‗ıι“„▄‹‘‚—–νœŒωΩ‰φπϖř”ρ›’‘šŠσ□■▪⊂˜™▀ŸžŽ!);
+  # Header: the entities in the title must be replaced as well
+  $t->unzip_xml('CORP/DOC/00003/header.xml')
+    ->content_like(qr!üüü x α•α y!);
+};
+
 subtest 'Test Log' => sub {
   test_tei2korapxml(
     tmp => 'script_out',
diff --git a/t/tei.t b/t/tei.t
index 69b4ee1..be5cb3d 100644
--- a/t/tei.t
+++ b/t/tei.t
@@ -1,3 +1,4 @@
+use utf8;
 use strict;
 use warnings;
 use Test::More;
@@ -9,7 +10,7 @@
 
 use Test::KorAP::XML::TEI qw!korap_tempfile test_tei2korapxml!;
 
-use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml', 'escape_xml_minimal');
+use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml', 'escape_xml_minimal', 'replace_entities');
 
 subtest 'remove_xml_comments' => sub {
   my ($fh, $filename) = korap_tempfile('tei');
@@ -114,4 +115,15 @@
   );
 };
 
+# Symbolic, decimal and hexadecimal entities are decoded; &amp; &lt; &gt; stay
+subtest 'Replace all entities' => sub {
+  my $encoded = '&alpha;&ap;&bdquo;&blk12;&blk14;&blk34;&block;&boxDL;&boxdl;&boxdr;&boxDR;&boxH;&boxh;&boxhd;&boxHD;&boxhu;&boxHU;&boxUL;&boxul;&boxur;&boxUR;&boxv;&boxV;&boxvh;&boxVH;&boxvl;&boxVL;&boxVR;&boxvr;&bull;&caron;&ccaron;&circ;&dagger;&Dagger;&ecaron;&euro;&fnof;&hellip;&Horbar;&inodot;&iota;&ldquo;&ldquor;&lhblk;&lsaquo;&lsquo;&lsquor;&mdash;&ndash;&nu;&oelig;&OElig;&omega;&Omega;&permil;&phi;&pi;&piv;&rcaron;&rdquo;&rho;&rsaquo;&rsquo;&rsquor;&scaron;&Scaron;&sigma;&squ;&squb;&squf;&sub;&tilde;&trade;&uhblk;&Yuml;&zcaron;&Zcaron;';
+  my $decoded = 'α≈„▒░▓█╗┐┌╔═─┬╦┴╩╝┘└╚│║┼╬┤╣╠├•ˇčˆ†‡ě€ƒ…‗ıι“„▄‹‘‚—–νœŒωΩ‰φπϖř”ρ›’‘šŠσ□■▪⊂˜™▀ŸžŽ';
+  is(replace_entities($encoded), $decoded);
+  is(replace_entities('&#65;'), 'A');
+  is(replace_entities('&#x41;'), 'A');
+  is(replace_entities('&#171;'), replace_entities('&#x00AB;'));
+  is(replace_entities('&amp;&lt;&gt;'), '&amp;&lt;&gt;');
+};
+
 done_testing;