Automatically replace entities with their corresponding characters
The source of the symbolic entities is the entity file of the TEI-I5 DTD,
http://corpora.ids-mannheim.de/I5/DTD/ids-lat1.ent, which contains all
entities that have been used in DeReKo. The list is very similar to
the Mathematical, Greek and Symbolic characters for XHTML
(http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent), but not identical.
Numeric decimal and hexadecimal character references are replaced, too.
Change-Id: Id00376c6953e9ac96ef04703872f38d37ef68096
diff --git a/lib/KorAP/XML/TEI.pm b/lib/KorAP/XML/TEI.pm
index de68534..493fce5 100644
--- a/lib/KorAP/XML/TEI.pm
+++ b/lib/KorAP/XML/TEI.pm
@@ -1,9 +1,10 @@
+use utf8;
package KorAP::XML::TEI;
use strict;
use warnings;
use Exporter 'import';
-our @EXPORT_OK = qw(remove_xml_comments escape_xml escape_xml_minimal);
+our @EXPORT_OK = qw(remove_xml_comments escape_xml escape_xml_minimal replace_entities);
# convert '&', '<' and '>' into their corresponding sgml-entities
my %ent_without_quot = (
@@ -17,6 +18,87 @@
'"' => '"'
);
+# Symbolic entity name (no leading ampersand, no trailing semicolon) => literal character; replace_entities looks names up in this table. The XML predefined entities (amp, lt, gt, quot, apos) are not listed. Generated with: GET http://corpora.ids-mannheim.de/I5/DTD/ids-lat1.ent | perl -C255 -wlne 'print "'\''$1'\''\t=>\t '\''", chr($2), "'\'', # $3" if(/ENTITY (\S*)\s".#(\d+).*<!-- (.*) -->/)'
+my %html_entities = (
+ 'alpha' => 'α', # GREEK SMALL LETTER ALPHA
+ 'ap' => '≈', # ALMOST EQUAL TO
+ 'bdquo' => '„', # DOUBLE LOW-9 QUOTATION MARK
+ 'blk12' => '▒', # MEDIUM SHADE
+ 'blk14' => '░', # LIGHT SHADE
+ 'blk34' => '▓', # DARK SHADE
+ 'block' => '█', # FULL BLOCK
+ 'boxDL' => '╗', # BOX DRAWINGS DOUBLE DOWN AND LEFT
+ 'boxdl' => '┐', # BOX DRAWINGS LIGHT DOWN AND LEFT
+ 'boxdr' => '┌', # BOX DRAWINGS LIGHT DOWN AND RIGHT
+ 'boxDR' => '╔', # BOX DRAWINGS DOUBLE DOWN AND RIGHT
+ 'boxH' => '═', # BOX DRAWINGS DOUBLE HORIZONTAL
+ 'boxh' => '─', # BOX DRAWINGS LIGHT HORIZONTAL
+ 'boxhd' => '┬', # BOX DRAWINGS LIGHT DOWN AND HORIZONTAL
+ 'boxHD' => '╦', # BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL
+ 'boxhu' => '┴', # BOX DRAWINGS LIGHT UP AND HORIZONTAL
+ 'boxHU' => '╩', # BOX DRAWINGS DOUBLE UP AND HORIZONTAL
+ 'boxUL' => '╝', # BOX DRAWINGS DOUBLE UP AND LEFT
+ 'boxul' => '┘', # BOX DRAWINGS LIGHT UP AND LEFT
+ 'boxur' => '└', # BOX DRAWINGS LIGHT UP AND RIGHT
+ 'boxUR' => '╚', # BOX DRAWINGS DOUBLE UP AND RIGHT
+ 'boxv' => '│', # BOX DRAWINGS LIGHT VERTICAL
+ 'boxV' => '║', # BOX DRAWINGS DOUBLE VERTICAL
+ 'boxvh' => '┼', # BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL
+ 'boxVH' => '╬', # BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL
+ 'boxvl' => '┤', # BOX DRAWINGS LIGHT VERTICAL AND LEFT
+ 'boxVL' => '╣', # BOX DRAWINGS DOUBLE VERTICAL AND LEFT
+ 'boxVR' => '╠', # BOX DRAWINGS DOUBLE VERTICAL AND RIGHT
+ 'boxvr' => '├', # BOX DRAWINGS LIGHT VERTICAL AND RIGHT
+ 'bull' => '•', # BULLET
+ 'caron' => 'ˇ', # CARON
+ 'ccaron' => 'č', # LATIN SMALL LETTER C WITH CARON
+ 'circ' => 'ˆ', # MODIFIER LETTER CIRCUMFLEX ACCENT
+ 'dagger' => '†', # DAGGER
+ 'Dagger' => '‡', # DOUBLE DAGGER
+ 'ecaron' => 'ě', # LATIN SMALL LETTER E WITH CARON
+ 'euro' => '€', # EURO SIGN
+ 'fnof' => 'ƒ', # LATIN SMALL LETTER F WITH HOOK
+ 'hellip' => '…', # HORIZONTAL ELLIPSIS
+ 'Horbar' => '‗', # DOUBLE LOW LINE
+ 'inodot' => 'ı', # LATIN SMALL LETTER DOTLESS I
+ 'iota' => 'ι', # GREEK SMALL LETTER IOTA
+ 'ldquo' => '“', # LEFT DOUBLE QUOTATION MARK
+ 'ldquor' => '„', # DOUBLE LOW-9 QUOTATION MARK (same character as bdquo)
+ 'lhblk' => '▄', # LOWER HALF BLOCK
+ 'lsaquo' => '‹', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+ 'lsquo' => '‘', # LEFT SINGLE QUOTATION MARK
+ 'lsquor' => '‚', # SINGLE LOW-9 QUOTATION MARK
+ 'mdash' => '—', # EM DASH
+ 'ndash' => '–', # EN DASH
+ 'nu' => 'ν', # GREEK SMALL LETTER NU
+ 'oelig' => 'œ', # LATIN SMALL LIGATURE OE
+ 'OElig' => 'Œ', # LATIN CAPITAL LIGATURE OE
+ 'omega' => 'ω', # GREEK SMALL LETTER OMEGA
+ 'Omega' => 'Ω', # GREEK CAPITAL LETTER OMEGA
+ 'permil' => '‰', # PER MILLE SIGN
+ 'phi' => 'φ', # GREEK SMALL LETTER PHI
+ 'pi' => 'π', # GREEK SMALL LETTER PI
+ 'piv' => 'ϖ', # GREEK PI SYMBOL
+ 'rcaron' => 'ř', # LATIN SMALL LETTER R WITH CARON
+ 'rdquo' => '”', # RIGHT DOUBLE QUOTATION MARK
+ 'rho' => 'ρ', # GREEK SMALL LETTER RHO
+ 'rsaquo' => '›', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+ 'rsquo' => '’', # RIGHT SINGLE QUOTATION MARK
+ 'rsquor' => '‘', # LEFT SINGLE QUOTATION MARK (same character as lsquo) - NOTE(review): the name suggests a right quote; confirm against ids-lat1.ent
+ 'scaron' => 'š', # LATIN SMALL LETTER S WITH CARON
+ 'Scaron' => 'Š', # LATIN CAPITAL LETTER S WITH CARON
+ 'sigma' => 'σ', # GREEK SMALL LETTER SIGMA
+ 'squ' => '□', # WHITE SQUARE
+ 'squb' => '■', # BLACK SQUARE
+ 'squf' => '▪', # BLACK SMALL SQUARE
+ 'sub' => '⊂', # SUBSET OF
+ 'tilde' => '˜', # SMALL TILDE
+ 'trade' => '™', # TRADE MARK SIGN
+ 'uhblk' => '▀', # UPPER HALF BLOCK
+ 'Yuml' => 'Ÿ', # LATIN CAPITAL LETTER Y WITH DIAERESIS
+ 'zcaron' => 'ž', # LATIN SMALL LETTER Z WITH CARON
+ 'Zcaron' => 'Ž', # LATIN CAPITAL LETTER Z WITH CARON
+);
# remove xml comments
sub remove_xml_comments {
@@ -82,5 +164,13 @@
($_[0] // '') =~ s/([&<>])/$ent_without_quot{$1}/ger;
};
+# Replace numeric character references (decimal and hexadecimal) and the
+# symbolic entities listed in %html_entities with the characters they
+# stand for.
+#
+# Argument:
+#   the string to process (undef is treated as the empty string)
+# Returns:
+#   a new string; neither the argument nor the caller's $_ is modified
+#
+# Deliberately left untouched:
+#   - entities without an entry in %html_entities (e.g. &amp; &lt; &gt;
+#     &quot;),
+#   - numeric references to '"', '&', '<' and '>', because decoding them
+#     would put raw markup characters into the XML,
+#   - numeric references to code points that are not legal XML characters.
+#
+# All replacements happen in one pass, so the output of a replacement is
+# never decoded a second time ("&#38;alpha;" does not end up as "α").
+sub replace_entities {
+  my $text = shift // '';
+
+  # $1 = whole reference, $2 = hex digits, $3 = decimal digits, $4 = name.
+  # Leading zeros are skipped and the number of significant digits is
+  # capped, so hex() and chr() never receive an overflowing value; longer
+  # digit runs simply do not match and stay as they are. Only ASCII digits
+  # are accepted (\d would also match other Unicode digits).
+  $text =~ s{
+    (
+      &
+      (?:
+        \# (?: x 0* ([0-9A-Fa-f]{1,6}) | 0* ([0-9]{1,7}) )
+        |
+        ([A-Za-z][A-Za-z0-9]*)
+      )
+      ;
+    )
+  }{
+    defined $4
+      ? ($html_entities{$4} // $1)
+      : _decode_char_reference($1, defined $2 ? hex($2) : $3)
+  }gex;
+
+  return $text;
+}
+
+# Return the character for code point $code, or $reference unchanged when
+# the code point is an XML markup character or not a legal XML character.
+sub _decode_char_reference {
+  my ($reference, $code) = @_;
+
+  # '"' (34), '&' (38), '<' (60) and '>' (62) have to stay escaped
+  return $reference
+    if $code == 34 || $code == 38 || $code == 60 || $code == 62;
+
+  # "Char" production of XML 1.0: excludes NUL, most control characters,
+  # surrogates, U+FFFE/U+FFFF and everything above U+10FFFF
+  my $is_xml_char =
+       $code == 0x9 || $code == 0xA || $code == 0xD
+    || ($code >= 0x20    && $code <= 0xD7FF)
+    || ($code >= 0xE000  && $code <= 0xFFFD)
+    || ($code >= 0x10000 && $code <= 0x10FFFF);
+
+  return $is_xml_char ? chr($code) : $reference;
+}
1;
diff --git a/lib/KorAP/XML/TEI/Header.pm b/lib/KorAP/XML/TEI/Header.pm
index a03f7c4..c397c7a 100644
--- a/lib/KorAP/XML/TEI/Header.pm
+++ b/lib/KorAP/XML/TEI/Header.pm
@@ -4,6 +4,7 @@
use Log::Any qw($log);
use Encode qw(encode decode);
use KorAP::XML::TEI qw!escape_xml!;
+use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
# Parsing of i5 header files
@@ -63,6 +64,7 @@
while (<$fh>) {
$_ = decode($self->[INPUTENC], $_);
+ $_ = replace_entities($_);
# Change:
# This version keeps comments in header files