Add exportable XML escape function

Change-Id: I50f2ee398e4b1c3dc5bb79009eaf3b204562887f
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 40ad372..66f36b3 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -18,7 +18,7 @@
   unshift @INC, "$FindBin::Bin/../lib";
 };
 
-use KorAP::XML::TEI qw'remove_xml_comments';
+use KorAP::XML::TEI qw!remove_xml_comments escape_xml!;
 use KorAP::XML::TEI::Tokenizer::External;
 use KorAP::XML::TEI::Tokenizer::Conservative;
 use KorAP::XML::TEI::Tokenizer::Aggressive;
@@ -136,11 +136,6 @@
 
 my ( $text_id, $text_id_esc );                       # '$text_id_esc' = escaped version of $text_id (see %ent)
 
-my %ent = ('"', '&quot;', '&','&amp;',               # convert '&', '<' and '>' into their corresponding sgml-entities
-           '<','&lt;','>','&gt;');
-                                                     # note: the index still refers to the 'single character'-versions, which are counted as 1
-                                                     #  (search for '&amp;' in data.xml and see corresponding indices in $_tokens_file)
-
 my ( $data_fl );
 
 my ( $data_prfx1, $data_prfx2, $data_sfx );          # $data_* are written to $_data_file
@@ -348,13 +343,14 @@
           $cons_tok->reset;
         }
 
+        # Encode and escape data
+        $data = escape_xml(encode( "UTF-8", $data ));
+        # Note: indices still refer to the unescaped single characters, so an
+        # escaped entity counts as 1 (search for '&amp;' in data.xml and
+        # compare with the corresponding indices in $_tokens_file).
+
         print STDERR "DEBUG ($0): main(): Writing (utf8-formatted) xml file $_root_dir$dir/$_data_file\n" if $_DEBUG;
 
-        $data =~ s/(&|<|>)/$ent{$1}/g;
-
-        # convert text strings to binary strings
-        $data        = encode( "UTF-8", $data );
-
         $zipper->new_stream("$_root_dir$dir/$_data_file")
           ->print("$data_prfx1$text_id_esc$data_prfx2$data$data_sfx");
 
@@ -910,7 +906,8 @@
 
         for ( $att_idx = 1; $att_idx < $idx; $att_idx += 2 ){
 
-          ${$ref}[ $att_idx+1 ] =~ s/(&|<|>)/$ent{$1}/g; # see explanation in func. 'write_tokens'
+          # see explanation in func. 'write_tokens'
+          ${$ref}[ $att_idx+1 ] = escape_xml(${$ref}[ $att_idx+1 ]);
 
           # attribute (at index $att_idx) with value (at index $att_idx+1)
           $output .= "            <f name=\"${$ref}[ $att_idx ]\">${$ref}[ $att_idx+1 ]</f>\n";
@@ -979,7 +976,7 @@
 
       for ( $att_idx = 1; $att_idx < $idx; $att_idx += 2 ){
 
-        ${$ref}[ $att_idx+1 ] =~ s/(&|<|>)/$ent{$1}/g; # ... <w lemma="&gt;" ana="PUNCTUATION">&gt;</w> ...
+        ${$ref}[ $att_idx+1 ] = escape_xml(${$ref}[ $att_idx+1 ]); # ... <w lemma="&gt;" ana="PUNCTUATION">&gt;</w> ...
                                                        # the '&gt;' is translated to '>' and hence the result would be '<f name="lemma">></f>'
 
         if ( $_INLINE_ANNOT && ${$ref}[ $att_idx ] eq "$_INLINE_ATT_RD" ){