Do not escape double quotes inside raw_text elements
This is not necessary and breaks tokenization compatibility.
Change-Id: Ib43733cf7264ee07b010a3478e8c4b728f7bd708
diff --git a/script/tei2korapxml b/script/tei2korapxml
index ab1975c..eed322d 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -20,7 +20,7 @@
unshift @INC, "$FindBin::Bin/../lib";
};
-use KorAP::XML::TEI qw!remove_xml_comments escape_xml!;
+use KorAP::XML::TEI qw!remove_xml_comments escape_xml escape_xml_minimal!;
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
@@ -364,7 +364,7 @@
};
# Encode and escape data
- $data = escape_xml(encode( "UTF-8", $data ));
+ $data = escape_xml_minimal(encode( "UTF-8", $data ));
# note: the index still refers to the 'single character'-versions,
# which are counted as 1 (search for '&' in data.xml and see
# corresponding indices in $_tokens_file)