Automatically replace entities with their corresponding characters
Source of the symbolic entities is the entity file from the TEI-I5 DTD
http://corpora.ids-mannheim.de/I5/DTD/ids-lat1.ent which contains all
entities that have been used in DeReKo. The list is very similar to
the Mathematical, Greek and Symbolic characters for XHTML
http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent, but not identical.
Numeric decimal and hexadecimal entities are replaced, too.
Change-Id: Id00376c6953e9ac96ef04703872f38d37ef68096
diff --git a/Changes b/Changes
index 106043e..d3252c4 100644
--- a/Changes
+++ b/Changes
@@ -1,6 +1,7 @@
- -s option added that uses sentence boundaries provided by the KorAP tokenizer (-tk)
- tokenizer invocation comments removed from KorAP XML output
- indentation of </span> tags fixed
+ - character entities that are used in DeReKo are automatically replaced by their corresponding characters
0.03 2021-01-12
- Update KorAP-Tokenizer to released 2.0 version
- Improve test suite for recent version
diff --git a/lib/KorAP/XML/TEI.pm b/lib/KorAP/XML/TEI.pm
index de68534..493fce5 100644
--- a/lib/KorAP/XML/TEI.pm
+++ b/lib/KorAP/XML/TEI.pm
@@ -1,9 +1,10 @@
+use utf8;
package KorAP::XML::TEI;
use strict;
use warnings;
use Exporter 'import';
-our @EXPORT_OK = qw(remove_xml_comments escape_xml escape_xml_minimal);
+our @EXPORT_OK = qw(remove_xml_comments escape_xml escape_xml_minimal replace_entities);
# convert '&', '<' and '>' into their corresponding sgml-entities
my %ent_without_quot = (
@@ -17,6 +18,87 @@
'"' => '"'
);
+# Lookup table used by replace_entities(): entity name (without the leading "&" and the trailing ";") => replacement character; the trailing comments give the Unicode character names. Generated with: GET http://corpora.ids-mannheim.de/I5/DTD/ids-lat1.ent | perl -C255 -wlne 'print "'\''$1'\''\t=>\t '\''", chr($2), "'\'', # $3" if(/ENTITY (\S*)\s".#(\d+).*<!-- (.*) -->/)'
+my %html_entities = (
+ 'alpha' => 'α', # GREEK SMALL LETTER ALPHA
+ 'ap' => '≈', # ALMOST EQUAL TO
+ 'bdquo' => '„', # DOUBLE LOW-9 QUOTATION MARK
+ 'blk12' => '▒', # MEDIUM SHADE
+ 'blk14' => '░', # LIGHT SHADE
+ 'blk34' => '▓', # DARK SHADE
+ 'block' => '█', # FULL BLOCK
+ 'boxDL' => '╗', # BOX DRAWINGS DOUBLE DOWN AND LEFT
+ 'boxdl' => '┐', # BOX DRAWINGS LIGHT DOWN AND LEFT
+ 'boxdr' => '┌', # BOX DRAWINGS LIGHT DOWN AND RIGHT
+ 'boxDR' => '╔', # BOX DRAWINGS DOUBLE DOWN AND RIGHT
+ 'boxH' => '═', # BOX DRAWINGS DOUBLE HORIZONTAL
+ 'boxh' => '─', # BOX DRAWINGS LIGHT HORIZONTAL
+ 'boxhd' => '┬', # BOX DRAWINGS LIGHT DOWN AND HORIZONTAL
+ 'boxHD' => '╦', # BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL
+ 'boxhu' => '┴', # BOX DRAWINGS LIGHT UP AND HORIZONTAL
+ 'boxHU' => '╩', # BOX DRAWINGS DOUBLE UP AND HORIZONTAL
+ 'boxUL' => '╝', # BOX DRAWINGS DOUBLE UP AND LEFT
+ 'boxul' => '┘', # BOX DRAWINGS LIGHT UP AND LEFT
+ 'boxur' => '└', # BOX DRAWINGS LIGHT UP AND RIGHT
+ 'boxUR' => '╚', # BOX DRAWINGS DOUBLE UP AND RIGHT
+ 'boxv' => '│', # BOX DRAWINGS LIGHT VERTICAL
+ 'boxV' => '║', # BOX DRAWINGS DOUBLE VERTICAL
+ 'boxvh' => '┼', # BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL
+ 'boxVH' => '╬', # BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL
+ 'boxvl' => '┤', # BOX DRAWINGS LIGHT VERTICAL AND LEFT
+ 'boxVL' => '╣', # BOX DRAWINGS DOUBLE VERTICAL AND LEFT
+ 'boxVR' => '╠', # BOX DRAWINGS DOUBLE VERTICAL AND RIGHT
+ 'boxvr' => '├', # BOX DRAWINGS LIGHT VERTICAL AND RIGHT
+ 'bull' => '•', # BULLET
+ 'caron' => 'ˇ', # CARON
+ 'ccaron' => 'č', # LATIN SMALL LETTER C WITH CARON
+ 'circ' => 'ˆ', # MODIFIER LETTER CIRCUMFLEX ACCENT
+ 'dagger' => '†', # DAGGER
+ 'Dagger' => '‡', # DOUBLE DAGGER
+ 'ecaron' => 'ě', # LATIN SMALL LETTER E WITH CARON
+ 'euro' => '€', # EURO SIGN
+ 'fnof' => 'ƒ', # LATIN SMALL LETTER F WITH HOOK
+ 'hellip' => '…', # HORIZONTAL ELLIPSIS
+ 'Horbar' => '‗', # DOUBLE LOW LINE
+ 'inodot' => 'ı', # LATIN SMALL LETTER DOTLESS I
+ 'iota' => 'ι', # GREEK SMALL LETTER IOTA
+ 'ldquo' => '“', # LEFT DOUBLE QUOTATION MARK
+ 'ldquor' => '„', # DOUBLE LOW-9 QUOTATION MARK
+ 'lhblk' => '▄', # LOWER HALF BLOCK
+ 'lsaquo' => '‹', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+ 'lsquo' => '‘', # LEFT SINGLE QUOTATION MARK
+ 'lsquor' => '‚', # SINGLE LOW-9 QUOTATION MARK
+ 'mdash' => '—', # EM DASH
+ 'ndash' => '–', # EN DASH
+ 'nu' => 'ν', # GREEK SMALL LETTER NU
+ 'oelig' => 'œ', # LATIN SMALL LIGATURE OE
+ 'OElig' => 'Œ', # LATIN CAPITAL LIGATURE OE
+ 'omega' => 'ω', # GREEK SMALL LETTER OMEGA
+ 'Omega' => 'Ω', # GREEK CAPITAL LETTER OMEGA
+ 'permil' => '‰', # PER MILLE SIGN
+ 'phi' => 'φ', # GREEK SMALL LETTER PHI
+ 'pi' => 'π', # GREEK SMALL LETTER PI
+ 'piv' => 'ϖ', # GREEK PI SYMBOL
+ 'rcaron' => 'ř', # LATIN SMALL LETTER R WITH CARON
+ 'rdquo' => '”', # RIGHT DOUBLE QUOTATION MARK
+ 'rho' => 'ρ', # GREEK SMALL LETTER RHO
+ 'rsaquo' => '›', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+ 'rsquo' => '’', # RIGHT SINGLE QUOTATION MARK
+ 'rsquor' => '‘', # LEFT SINGLE QUOTATION MARK
+ 'scaron' => 'š', # LATIN SMALL LETTER S WITH CARON
+ 'Scaron' => 'Š', # LATIN CAPITAL LETTER S WITH CARON
+ 'sigma' => 'σ', # GREEK SMALL LETTER SIGMA
+ 'squ' => '□', # WHITE SQUARE
+ 'squb' => '■', # BLACK SQUARE
+ 'squf' => '▪', # BLACK SMALL SQUARE
+ 'sub' => '⊂', # SUBSET OF
+ 'tilde' => '˜', # SMALL TILDE
+ 'trade' => '™', # TRADE MARK SIGN
+ 'uhblk' => '▀', # UPPER HALF BLOCK
+ 'Yuml' => 'Ÿ', # LATIN CAPITAL LETTER Y WITH DIAERESIS
+ 'zcaron' => 'ž', # LATIN SMALL LETTER Z WITH CARON
+ 'Zcaron' => 'Ž', # LATIN CAPITAL LETTER Z WITH CARON
+);
# remove xml comments
sub remove_xml_comments {
@@ -82,5 +164,13 @@
($_[0] // '') =~ s/([&<>])/$ent_without_quot{$1}/ger;
};
+# Decide what a single matched reference is replaced with.
+#
+# Arguments:
+#   $ref  - the complete reference as found in the text (returned when the
+#           reference has to stay as it is)
+#   $hex  - hexadecimal digits of a character reference, or undef
+#   $dec  - decimal digits of a character reference, or undef
+#   $name - name of a named entity, or undef
+# Returns: the replacement character or the unchanged reference.
+sub _decode_reference {
+  my ($ref, $hex, $dec, $name) = @_;
+
+  # Named entity: only names listed in %html_entities are replaced
+  if (defined $name) {
+    return exists $html_entities{$name} ? $html_entities{$name} : $ref;
+  }
+
+  my $code_point = defined $hex ? hex($hex) : 0 + $dec;
+
+  # Not a usable character: NUL, surrogates, beyond the Unicode range
+  return $ref if $code_point == 0 || $code_point > 0x10FFFF;
+  return $ref if $code_point >= 0xD800 && $code_point <= 0xDFFF;
+
+  my $char = chr $code_point;
+
+  # The text is still unparsed XML at this point - a raw markup character
+  # would change its structure, so these references stay escaped
+  return $ref if index(q{&<>"'}, $char) >= 0;
+
+  return $char;
+}
+
+# Replace numeric character references (decimal and hexadecimal) and the
+# named entities listed in %html_entities by the characters they stand for.
+#
+# Argument: the text to convert (undef is passed through unchanged).
+# Returns:  the converted copy of the text; the caller's $_ is not modified.
+#
+# Left untouched:
+#  - entity names that are not keys of %html_entities
+#    (e.g. the XML entities for '&', '<' and '>'),
+#  - numeric references to the characters & < > " ',
+#  - numeric references that do not denote a usable code point.
+sub replace_entities {
+  my $text = shift;
+  return $text unless defined $text;
+
+  # All three notations are handled in one pass, so that the result of a
+  # replacement is never decoded a second time.
+  $text =~ s/
+    ( &
+      (?: \# x 0* ([0-9A-Fa-f]{1,6})   # hexadecimal character reference
+        | \#   0* ([0-9]{1,7})         # decimal character reference
+        | ([A-Za-z][A-Za-z0-9]*)       # named entity
+      )
+    ; )
+  /_decode_reference($1, $2, $3, $4)/gex;
+
+  return $text;
+};
1;
diff --git a/lib/KorAP/XML/TEI/Header.pm b/lib/KorAP/XML/TEI/Header.pm
index a03f7c4..c397c7a 100644
--- a/lib/KorAP/XML/TEI/Header.pm
+++ b/lib/KorAP/XML/TEI/Header.pm
@@ -4,6 +4,7 @@
use Log::Any qw($log);
use Encode qw(encode decode);
use KorAP::XML::TEI qw!escape_xml!;
+use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
# Parsing of i5 header files
@@ -63,6 +64,7 @@
while (<$fh>) {
$_ = decode($self->[INPUTENC], $_);
+ $_ = replace_entities($_);
# Change:
# This version keeps comments in header files
diff --git a/script/tei2korapxml b/script/tei2korapxml
index f15376f..bd54462 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -19,7 +19,7 @@
unshift @INC, "$FindBin::Bin/../lib";
};
-use KorAP::XML::TEI qw!remove_xml_comments!;
+use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
@@ -236,6 +236,7 @@
};
$_ = decode($input_enc, $_);
+ $_ = replace_entities($_);
if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
@@ -258,6 +259,7 @@
$_ = remove_xml_comments( $input_fh, $_ );
$_ = decode($input_enc, $_);
+ $_ = replace_entities($_);
# ~ end of text body ~
if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
diff --git a/t/data/text_with_entities.i5.xml b/t/data/text_with_entities.i5.xml
new file mode 100644
index 0000000..e675176
--- /dev/null
+++ b/t/data/text_with_entities.i5.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<idsText>
+ <idsHeader type="text">
+ <fileDesc>
+ <titleStmt>
+ <textSigle>CORP/DOC.00003</textSigle>
+ <t.title>üüü x α•α y</t.title>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <text>
+ <p id="p1">üüü  Aα≈„▒░▓█╗┐┌╔═─┬╦┴╩╝┘└╚│║┼╬┤╣╠├•ˇčˆ†‡ě€ƒ…&Horbar;ıι“„▄‹‘‚—–νœŒωΩ‰φπϖř”ρ›’’šŠσ□&squb;▪⊂˜™▀ŸžŽ</p>
+ </text>
+</idsText>
diff --git a/t/script.t b/t/script.t
index 81067d9..fbe28bf 100644
--- a/t/script.t
+++ b/t/script.t
@@ -484,6 +484,20 @@
->attr_is('layer', 'docid', 'WDD19_ß0000.10317');
};
+subtest 'Check entity replacement' => sub {
+  # Run the conversion on the fixture and make sure the text was processed
+  my $fixture = catfile($f, 'data', 'text_with_entities.i5.xml');
+  my $run = test_tei2korapxml(
+    file => $fixture,
+    tmp => 'script_entity_replacement',
+    param => '-ti'
+  );
+  my $t = $run->stderr_like(qr!tei2korapxml: .*? text_id=CORP_DOC.00003!);
+
+  # Text data
+  my $data = $t->unzip_xml('CORP/DOC/00003/data.xml');
+  $data->content_like(qr!üüü Aα≈„▒░▓█╗┐┌╔═─┬╦┴╩╝┘└╚│║┼╬┤╣╠├•ˇčˆ†‡ě€ƒ…‗ıι“„▄‹‘‚—–νœŒωΩ‰φπϖř”ρ›’‘šŠσ□■▪⊂˜™▀ŸžŽ!);
+
+  # Header
+  my $header = $t->unzip_xml('CORP/DOC/00003/header.xml');
+  $header->content_like(qr!üüü x α•α y!);
+};
+
subtest 'Test Log' => sub {
test_tei2korapxml(
tmp => 'script_out',
diff --git a/t/tei.t b/t/tei.t
index 69b4ee1..be5cb3d 100644
--- a/t/tei.t
+++ b/t/tei.t
@@ -1,3 +1,4 @@
+use utf8;
use strict;
use warnings;
use Test::More;
@@ -9,7 +10,7 @@
use Test::KorAP::XML::TEI qw!korap_tempfile test_tei2korapxml!;
-use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml', 'escape_xml_minimal');
+use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml', 'escape_xml_minimal', 'replace_entities');
subtest 'remove_xml_comments' => sub {
my ($fh, $filename) = korap_tempfile('tei');
@@ -114,4 +115,15 @@
);
};
+subtest 'Replace all entities' => sub {
+  # Every input is assembled from its parts at run time. This guarantees
+  # that replace_entities() really receives entity references - a literal
+  # character in their place would turn the checks into tautologies.
+  my $ref = sub { '&' . $_[0] . ';' };
+
+  # Names of the entities, in the order of the expected characters below
+  my @names = qw(
+    alpha ap bdquo blk12 blk14 blk34 block
+    boxDL boxdl boxdr boxDR boxH boxh boxhd boxHD boxhu boxHU
+    boxUL boxul boxur boxUR boxv boxV boxvh boxVH boxvl boxVL boxVR boxvr
+    bull caron ccaron circ dagger Dagger ecaron euro fnof hellip Horbar
+    inodot iota ldquo ldquor lhblk lsaquo lsquo lsquor mdash ndash nu
+    oelig OElig omega Omega permil phi pi piv rcaron rdquo rho
+    rsaquo rsquo rsquor scaron Scaron sigma squ squb squf sub
+    tilde trade uhblk Yuml zcaron Zcaron
+  );
+
+  is(
+    replace_entities(join('', map { $ref->($_) } @names)),
+    'α≈„▒░▓█╗┐┌╔═─┬╦┴╩╝┘└╚│║┼╬┤╣╠├•ˇčˆ†‡ě€ƒ…‗ıι“„▄‹‘‚—–νœŒωΩ‰φπϖř”ρ›’‘šŠσ□■▪⊂˜™▀ŸžŽ',
+    'named entities are replaced'
+  );
+
+  # Numeric character references
+  is(replace_entities($ref->('#65')), 'A', 'decimal reference');
+  is(replace_entities($ref->('#x41')), 'A', 'hexadecimal reference');
+  is(replace_entities($ref->('#171')), '«', 'decimal reference beyond ASCII');
+  is(
+    replace_entities($ref->('#xab')),
+    replace_entities($ref->('#171')),
+    'hexadecimal and decimal notation agree'
+  );
+
+  # The XML entities for '&', '<' and '>' have to stay as they are
+  my $xml_entities = join('', map { $ref->($_) } qw(amp lt gt));
+  is(replace_entities($xml_entities), $xml_entities, 'XML entities are kept');
+};
+
done_testing;