Automatically replace entities with their corresponding characters
The source of the symbolic entities is the entity file from the TEI-I5 DTD,
http://corpora.ids-mannheim.de/I5/DTD/ids-lat1.ent, which contains all
entities that have been used in DeReKo. The list is very similar to
the Mathematical, Greek and Symbolic characters for XHTML
(http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent), but not identical.
Numeric decimal and hexadecimal entities are replaced, too.
Change-Id: Id00376c6953e9ac96ef04703872f38d37ef68096
diff --git a/t/data/text_with_entities.i5.xml b/t/data/text_with_entities.i5.xml
new file mode 100644
index 0000000..e675176
--- /dev/null
+++ b/t/data/text_with_entities.i5.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<idsText>
+ <idsHeader type="text">
+ <fileDesc>
+ <titleStmt>
+ <textSigle>CORP/DOC.00003</textSigle>
+ <t.title>üüü x α•α y</t.title>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <text>
+ <p id="p1">üüü  Aα≈„▒░▓█╗┐┌╔═─┬╦┴╩╝┘└╚│║┼╬┤╣╠├•ˇčˆ†‡ě€ƒ…&Horbar;ıι“„▄‹‘‚—–νœŒωΩ‰φπϖř”ρ›’’šŠσ□&squb;▪⊂˜™▀ŸžŽ</p>
+ </text>
+</idsText>
diff --git a/t/script.t b/t/script.t
index 81067d9..fbe28bf 100644
--- a/t/script.t
+++ b/t/script.t
@@ -484,6 +484,20 @@
->attr_is('layer', 'docid', 'WDD19_ß0000.10317');
};
+subtest 'Check entity replacement' => sub {
+  my @call = (
+    file => catfile($f, 'data', 'text_with_entities.i5.xml'),
+    tmp => 'script_entity_replacement', param => '-ti',
+  );
+  my $run = test_tei2korapxml(@call)->stderr_like(qr!tei2korapxml: .*? text_id=CORP_DOC.00003!);
+  # data.xml of the converted text must match the expected character sequence.
+  my $data = $run->unzip_xml('CORP/DOC/00003/data.xml');
+  $data->content_like(qr!üüü Aα≈„▒░▓█╗┐┌╔═─┬╦┴╩╝┘└╚│║┼╬┤╣╠├•ˇčˆ†‡ě€ƒ…‗ıι“„▄‹‘‚—–νœŒωΩ‰φπϖř”ρ›’‘šŠσ□■▪⊂˜™▀ŸžŽ!);
+  # header.xml must match the title text from the fixture.
+  my $header = $run->unzip_xml('CORP/DOC/00003/header.xml');
+  $header->content_like(qr!üüü x α•α y!);
+};
+
subtest 'Test Log' => sub {
test_tei2korapxml(
tmp => 'script_out',
diff --git a/t/tei.t b/t/tei.t
index 69b4ee1..be5cb3d 100644
--- a/t/tei.t
+++ b/t/tei.t
@@ -1,3 +1,4 @@
+use utf8;
use strict;
use warnings;
use Test::More;
@@ -9,7 +10,7 @@
use Test::KorAP::XML::TEI qw!korap_tempfile test_tei2korapxml!;
-use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml', 'escape_xml_minimal');
+use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml', 'escape_xml_minimal', 'replace_entities');
subtest 'remove_xml_comments' => sub {
my ($fh, $filename) = korap_tempfile('tei');
@@ -114,4 +115,15 @@
);
};
+subtest 'Replace all entities' => sub {
+  # Symbolic entities are passed by name so the replacement is actually exercised.
+  is(replace_entities('&alpha;&ap;&bdquo;&blk12;&blk14;&blk34;&block;&boxDL;&boxdl;&boxdr;&boxDR;&boxH;&boxh;&boxhd;&boxHD;&boxhu;&boxHU;&boxUL;&boxul;&boxur;&boxUR;&boxv;&boxV;&boxvh;&boxVH;&boxvl;&boxVL;&boxVR;&boxvr;&bull;&caron;&ccaron;&circ;&dagger;&Dagger;&ecaron;&euro;&fnof;&hellip;&Horbar;&inodot;&iota;&ldquo;&ldquor;&lhblk;&lsaquo;&lsquo;&lsquor;&mdash;&ndash;&nu;&oelig;&OElig;&omega;&Omega;&permil;&phi;&pi;&piv;&rcaron;&rdquo;&rho;&rsaquo;&rsquo;&rsquor;&scaron;&Scaron;&sigma;&squ;&squb;&squf;&sub;&tilde;&trade;&uhblk;&Yuml;&zcaron;&Zcaron;'),
+    'α≈„▒░▓█╗┐┌╔═─┬╦┴╩╝┘└╚│║┼╬┤╣╠├•ˇčˆ†‡ě€ƒ…‗ıι“„▄‹‘‚—–νœŒωΩ‰φπϖř”ρ›’‘šŠσ□■▪⊂˜™▀ŸžŽ', 'symbolic entities');
+  # Numeric character references, in decimal and hexadecimal notation.
+  is(replace_entities('&#65;'), 'A', 'decimal reference');
+  is(replace_entities('&#171;'), '«', 'decimal reference outside ASCII');
+  is(replace_entities('&#x41;'), 'A', 'hexadecimal reference');
+  is(replace_entities('&amp;&lt;&gt;'), '&amp;&lt;&gt;', 'XML-special entities'); # presumably kept escaped so the output stays well-formed - confirm
+};
+
done_testing;