Automatically replace entities with their corresponding characters
The source of the symbolic entities is the entity file from the
TEI-I5 DTD (http://corpora.ids-mannheim.de/I5/DTD/ids-lat1.ent),
which contains all entities that have been used in DeReKo. The list
is very similar, but not identical, to the Mathematical, Greek and
Symbolic characters for XHTML
(http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent).
Numeric decimal and hexadecimal entities are replaced, too.
Change-Id: Id00376c6953e9ac96ef04703872f38d37ef68096
diff --git a/script/tei2korapxml b/script/tei2korapxml
index f15376f..bd54462 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -19,7 +19,7 @@
unshift @INC, "$FindBin::Bin/../lib";
};
-use KorAP::XML::TEI qw!remove_xml_comments!;
+use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
@@ -236,6 +236,7 @@
};
$_ = decode($input_enc, $_);
+ $_ = replace_entities($_);
if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
@@ -258,6 +259,7 @@
$_ = remove_xml_comments( $input_fh, $_ );
$_ = decode($input_enc, $_);
+ $_ = replace_entities($_);
# ~ end of text body ~
if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {