Improve stability of XML entity conversion
Change-Id: I341b76b9d898acd59cae0379aa5cc4a5bbbe81e8
diff --git a/Changes b/Changes
index cc16d48..4c8a75b 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+2.3.4 2022-11-04
+ - Improve stability of XML entity replacement.
+
2.3.3 2022-03-28
- Load KorAP-Tokenizer only on request.
diff --git a/lib/KorAP/XML/TEI.pm b/lib/KorAP/XML/TEI.pm
index aa8cd0f..1111c8b 100644
--- a/lib/KorAP/XML/TEI.pm
+++ b/lib/KorAP/XML/TEI.pm
@@ -169,6 +169,11 @@
# Replace all entities, except %ent
sub replace_entities {
$_= shift;
+ s/"/"/gi;
+ s/&/&/gi;
+ s/'/'/gi;
+ s/</</gi;
+ s/>/>/gi;
s/[&]#(x[0-9A-Fa-f]+);/chr(hex("0$1"))/ge;
s/[&]#(\d+);/chr($1)/ge;
s/\&(alpha|ap|bdquo|blk12|blk14|blk34|block|boxDL|boxdl|boxdr|boxDR|boxH|boxh|boxhd|boxHD|boxhu|boxHU|boxUL|boxul|boxur|boxUR|boxv|boxV|boxvh|boxVH|boxvl|boxVL|boxVR|boxvr|bull|caron|ccaron|circ|dagger|Dagger|ecaron|euro|fnof|hellip|Horbar|inodot|iota|ldquo|ldquor|lhblk|lsaquo|lsquo|lsquor|mdash|ndash|nu|oelig|OElig|omega|Omega|permil|phi|pi|piv|rcaron|rdquo|rho|rsaquo|rsquo|rsquor|scaron|Scaron|sigma|squ|squb|squf|sub|tilde|trade|uhblk|Yuml|zcaron|Zcaron);/$html_entities{$1}/ge;
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 4090753..63ea525 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -24,7 +24,7 @@
use KorAP::XML::TEI::Header;
use KorAP::XML::TEI::Inline;
-our $VERSION = '2.3.3';
+our $VERSION = '2.3.4';
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
diff --git a/t/data/text_with_entities.i5.xml b/t/data/text_with_entities.i5.xml
index e675176..ac91fd7 100644
--- a/t/data/text_with_entities.i5.xml
+++ b/t/data/text_with_entities.i5.xml
@@ -9,6 +9,6 @@
</fileDesc>
</idsHeader>
<text>
- <p id="p1">üüü  Aα≈„▒░▓█╗┐┌╔═─┬╦┴╩╝┘└╚│║┼╬┤╣╠├•ˇčˆ†‡ě€ƒ…&Horbar;ıι“„▄‹‘‚—–νœŒωΩ‰φπϖř”ρ›’’šŠσ□&squb;▪⊂˜™▀ŸžŽ</p>
+ <p id="p1" pause="rend=&quot;">üüü  Aα≈„▒░▓█╗┐┌╔═─┬╦┴╩╝┘└╚│║┼╬┤╣╠├•ˇčˆ†‡ě€ƒ…&Horbar;ıι“„▄‹‘‚—–νœŒωΩ‰φπϖř”ρ›’’šŠσ□&squb;▪⊂˜™▀ŸžŽ</p>
</text>
</idsText>