Add exportable XML escape function
Change-Id: I50f2ee398e4b1c3dc5bb79009eaf3b204562887f
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 40ad372..66f36b3 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -18,7 +18,7 @@
unshift @INC, "$FindBin::Bin/../lib";
};
-use KorAP::XML::TEI qw'remove_xml_comments';
+use KorAP::XML::TEI qw!remove_xml_comments escape_xml!;
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
@@ -136,11 +136,6 @@
my ( $text_id, $text_id_esc ); # '$text_id_esc' = escaped version of $text_id (see %ent)
-my %ent = ('"', '"', '&','&', # convert '&', '<' and '>' into their corresponding sgml-entities
- '<','<','>','>');
- # note: the index still refers to the 'single character'-versions, which are counted as 1
- # (search for '&' in data.xml and see corresponding indices in $_tokens_file)
-
my ( $data_fl );
my ( $data_prfx1, $data_prfx2, $data_sfx ); # $data_* are written to $_data_file
@@ -348,13 +343,14 @@
$cons_tok->reset;
}
+ # Encode and escape data
+ $data = escape_xml(encode( "UTF-8", $data ));
+ # Note: offsets still refer to the unescaped single-character versions,
+ # each of which is counted as 1 (search for '&amp;' in data.xml and see
+ # the corresponding indices in $_tokens_file)
+
print STDERR "DEBUG ($0): main(): Writing (utf8-formatted) xml file $_root_dir$dir/$_data_file\n" if $_DEBUG;
- $data =~ s/(&|<|>)/$ent{$1}/g;
-
- # convert text strings to binary strings
- $data = encode( "UTF-8", $data );
-
$zipper->new_stream("$_root_dir$dir/$_data_file")
->print("$data_prfx1$text_id_esc$data_prfx2$data$data_sfx");
@@ -910,7 +906,8 @@
for ( $att_idx = 1; $att_idx < $idx; $att_idx += 2 ){
- ${$ref}[ $att_idx+1 ] =~ s/(&|<|>)/$ent{$1}/g; # see explanation in func. 'write_tokens'
+ # see explanation in func. 'write_tokens'
+ ${$ref}[ $att_idx+1 ] = escape_xml(${$ref}[ $att_idx+1 ]);
# attribute (at index $att_idx) with value (at index $att_idx+1)
$output .= " <f name=\"${$ref}[ $att_idx ]\">${$ref}[ $att_idx+1 ]</f>\n";
@@ -979,7 +976,7 @@
for ( $att_idx = 1; $att_idx < $idx; $att_idx += 2 ){
- ${$ref}[ $att_idx+1 ] =~ s/(&|<|>)/$ent{$1}/g; # ... <w lemma=">" ana="PUNCTUATION">></w> ...
+ ${$ref}[ $att_idx+1 ] = escape_xml(${$ref}[ $att_idx+1 ]); # ... <w lemma=">" ana="PUNCTUATION">></w> ...
# the '>' is translated to '&gt;' and hence the result would be '<f name="lemma">&gt;</f>'
if ( $_INLINE_ANNOT && ${$ref}[ $att_idx ] eq "$_INLINE_ATT_RD" ){