Automatically replace entities with their corresponding characters

The symbolic entities are taken from the entity file of the TEI-I5 DTD,
http://corpora.ids-mannheim.de/I5/DTD/ids-lat1.ent, which contains all
entities that have been used in DeReKo. The list is very similar, but not
identical, to the Mathematical, Greek and Symbolic characters for XHTML
(http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent).

Numeric (decimal and hexadecimal) character references are replaced, too.

Change-Id: Id00376c6953e9ac96ef04703872f38d37ef68096
diff --git a/lib/KorAP/XML/TEI.pm b/lib/KorAP/XML/TEI.pm
index de68534..493fce5 100644
--- a/lib/KorAP/XML/TEI.pm
+++ b/lib/KorAP/XML/TEI.pm
@@ -1,9 +1,10 @@
+use utf8;
 package KorAP::XML::TEI;
 use strict;
 use warnings;
 
 use Exporter 'import';
-our @EXPORT_OK = qw(remove_xml_comments escape_xml escape_xml_minimal);
+our @EXPORT_OK = qw(remove_xml_comments escape_xml escape_xml_minimal replace_entities);
 
 # convert '&', '<' and '>' into their corresponding sgml-entities
 my %ent_without_quot = (
@@ -17,6 +18,87 @@
   '"' => '&quot;'
 );
 
+# Symbolic entity name => literal replacement character (literals rely on 'use utf8'). Generated with: GET http://corpora.ids-mannheim.de/I5/DTD/ids-lat1.ent | perl -C255 -wlne 'print "'\''$1'\''\t=>\t '\''", chr($2), "'\'', # $3" if(/ENTITY (\S*)\s".#(\d+).*<!-- (.*) -->/)'
+my %html_entities = (
+  'alpha'  => 'α', # GREEK SMALL LETTER ALPHA
+  'ap'     => '≈', # ALMOST EQUAL TO
+  'bdquo'  => '„', # DOUBLE LOW-9 QUOTATION MARK
+  'blk12'  => '▒', # MEDIUM SHADE
+  'blk14'  => '░', # LIGHT SHADE
+  'blk34'  => '▓', # DARK SHADE
+  'block'  => '█', # FULL BLOCK
+  'boxDL'  => '╗', # BOX DRAWINGS DOUBLE DOWN AND LEFT
+  'boxdl'  => '┐', # BOX DRAWINGS LIGHT DOWN AND LEFT
+  'boxdr'  => '┌', # BOX DRAWINGS LIGHT DOWN AND RIGHT
+  'boxDR'  => '╔', # BOX DRAWINGS DOUBLE DOWN AND RIGHT
+  'boxH'   => '═', # BOX DRAWINGS DOUBLE HORIZONTAL
+  'boxh'   => '─', # BOX DRAWINGS LIGHT HORIZONTAL
+  'boxhd'  => '┬', # BOX DRAWINGS LIGHT DOWN AND HORIZONTAL
+  'boxHD'  => '╦', # BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL
+  'boxhu'  => '┴', # BOX DRAWINGS LIGHT UP AND HORIZONTAL
+  'boxHU'  => '╩', # BOX DRAWINGS DOUBLE UP AND HORIZONTAL
+  'boxUL'  => '╝', # BOX DRAWINGS DOUBLE UP AND LEFT
+  'boxul'  => '┘', # BOX DRAWINGS LIGHT UP AND LEFT
+  'boxur'  => '└', # BOX DRAWINGS LIGHT UP AND RIGHT
+  'boxUR'  => '╚', # BOX DRAWINGS DOUBLE UP AND RIGHT
+  'boxv'   => '│', # BOX DRAWINGS LIGHT VERTICAL
+  'boxV'   => '║', # BOX DRAWINGS DOUBLE VERTICAL
+  'boxvh'  => '┼', # BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL
+  'boxVH'  => '╬', # BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL
+  'boxvl'  => '┤', # BOX DRAWINGS LIGHT VERTICAL AND LEFT
+  'boxVL'  => '╣', # BOX DRAWINGS DOUBLE VERTICAL AND LEFT
+  'boxVR'  => '╠', # BOX DRAWINGS DOUBLE VERTICAL AND RIGHT
+  'boxvr'  => '├', # BOX DRAWINGS LIGHT VERTICAL AND RIGHT
+  'bull'   => '•', # BULLET
+  'caron'  => 'ˇ', # CARON
+  'ccaron' => 'č', # LATIN SMALL LETTER C WITH CARON
+  'circ'   => 'ˆ', # MODIFIER LETTER CIRCUMFLEX ACCENT
+  'dagger' => '†', # DAGGER
+  'Dagger' => '‡', # DOUBLE DAGGER
+  'ecaron' => 'ě', # LATIN SMALL LETTER E WITH CARON
+  'euro'   => '€', # EURO SIGN
+  'fnof'   => 'ƒ', # LATIN SMALL LETTER F WITH HOOK
+  'hellip' => '…', # HORIZONTAL ELLIPSIS
+  'Horbar' => '‗', # DOUBLE LOW LINE
+  'inodot' => 'ı', # LATIN SMALL LETTER DOTLESS I
+  'iota'   => 'ι', # GREEK SMALL LETTER IOTA
+  'ldquo'  => '“', # LEFT DOUBLE QUOTATION MARK
+  'ldquor' => '„', # DOUBLE LOW-9 QUOTATION MARK
+  'lhblk'  => '▄', # LOWER HALF BLOCK
+  'lsaquo' => '‹', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+  'lsquo'  => '‘', # LEFT SINGLE QUOTATION MARK
+  'lsquor' => '‚', # SINGLE LOW-9 QUOTATION MARK
+  'mdash'  => '—', # EM DASH
+  'ndash'  => '–', # EN DASH
+  'nu'     => 'ν', # GREEK SMALL LETTER NU
+  'oelig'  => 'œ', # LATIN SMALL LIGATURE OE
+  'OElig'  => 'Œ', # LATIN CAPITAL LIGATURE OE
+  'omega'  => 'ω', # GREEK SMALL LETTER OMEGA
+  'Omega'  => 'Ω', # GREEK CAPITAL LETTER OMEGA
+  'permil' => '‰', # PER MILLE SIGN
+  'phi'    => 'φ', # GREEK SMALL LETTER PHI
+  'pi'     => 'π', # GREEK SMALL LETTER PI
+  'piv'    => 'ϖ', # GREEK PI SYMBOL
+  'rcaron' => 'ř', # LATIN SMALL LETTER R WITH CARON
+  'rdquo'  => '”', # RIGHT DOUBLE QUOTATION MARK
+  'rho'    => 'ρ', # GREEK SMALL LETTER RHO
+  'rsaquo' => '›', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+  'rsquo'  => '’', # RIGHT SINGLE QUOTATION MARK
+  'rsquor' => '‘', # LEFT SINGLE QUOTATION MARK
+  'scaron' => 'š', # LATIN SMALL LETTER S WITH CARON
+  'Scaron' => 'Š', # LATIN CAPITAL LETTER S WITH CARON
+  'sigma'  => 'σ', # GREEK SMALL LETTER SIGMA
+  'squ'    => '□', # WHITE SQUARE
+  'squb'   => '■', # BLACK SQUARE
+  'squf'   => '▪', # BLACK SMALL SQUARE
+  'sub'    => '⊂', # SUBSET OF
+  'tilde'  => '˜', # SMALL TILDE
+  'trade'  => '™', # TRADE MARK SIGN
+  'uhblk'  => '▀', # UPPER HALF BLOCK
+  'Yuml'   => 'Ÿ', # LATIN CAPITAL LETTER Y WITH DIAERESIS
+  'zcaron' => 'ž', # LATIN SMALL LETTER Z WITH CARON
+  'Zcaron' => 'Ž', # LATIN CAPITAL LETTER Z WITH CARON
+);
 
 # remove xml comments
 sub remove_xml_comments {
@@ -82,5 +164,13 @@
   ($_[0] // '') =~ s/([&<>])/$ent_without_quot{$1}/ger;
 };
 
+# Replace numeric (&#N;, &#xH;) and known symbolic entities with their characters in ONE pass, so decoded text is never re-scanned; names not in %html_entities (e.g. &amp;) are kept as-is; undef input yields ''
+sub replace_entities {
+  my $text = shift // ''; # lexical copy: the caller's $_ is left untouched
+  $text =~ s{&(?:#x([0-9A-Fa-f]+)|#([0-9]+)|([A-Za-z][A-Za-z0-9]*));}{
+    defined $1 ? chr(hex($1)) : defined $2 ? chr($2) : $html_entities{$3} // "&$3;"
+  }ge;
+  return $text;
+};
 
 1;
diff --git a/lib/KorAP/XML/TEI/Header.pm b/lib/KorAP/XML/TEI/Header.pm
index a03f7c4..c397c7a 100644
--- a/lib/KorAP/XML/TEI/Header.pm
+++ b/lib/KorAP/XML/TEI/Header.pm
@@ -4,6 +4,7 @@
 use Log::Any qw($log);
 use Encode qw(encode decode);
 use KorAP::XML::TEI qw!escape_xml!;
+use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
 
 # Parsing of i5 header files
 
@@ -63,6 +64,7 @@
   while (<$fh>) {
 
     $_ = decode($self->[INPUTENC], $_);
+    $_ = replace_entities($_);
 
     # Change:
     #   This version keeps comments in header files