Add exportable XML escape function

Change-Id: I50f2ee398e4b1c3dc5bb79009eaf3b204562887f

commit: 0465e9e255944a60e7e16ca6ab070eb8ac4cb961 [log] [tgz]
author: Akron <nils@diewald-online.de> Mon Jul 27 15:55:21 2020 +0200
committer: Peter Harders <harders@ids-mannheim.de> Tue Jul 28 19:37:29 2020 +0200
tree: 0c881974f41a5adf5d9210c4c5eea803a57b7317
parent: 1c5ce154fac1331d0663633eb212fbfc67b20323 [diff]
diff --git a/lib/KorAP/XML/TEI.pm b/lib/KorAP/XML/TEI.pm
index 2f7f11d..23b6625 100644
--- a/lib/KorAP/XML/TEI.pm
+++ b/lib/KorAP/XML/TEI.pm

@@ -3,7 +3,15 @@
 use warnings;
 
 use Exporter 'import';
-our @EXPORT_OK = qw(remove_xml_comments);
+our @EXPORT_OK = qw(remove_xml_comments escape_xml);
+
+# convert '"', '&', '<' and '>' into their corresponding XML entities
+my %ent = (
+  '"' => '&quot;',
+  '&' => '&amp;',
+  '<' => '&lt;',
+  '>' => '&gt;'
+);
 
 # remove xml comments
 sub remove_xml_comments {
@@ -49,9 +57,18 @@
 
     $html = <$fh>;
     goto CHECK;
-  }
+  };
 
   return $html
-}
+};
+
+
+# Escape strings using XML entities
+sub escape_xml {
+  my $data = shift // '';
+  $data =~ s/([&<>"])/$ent{$1}/ge;
+  return $data;
+};
+
 
 1;

diff --git a/lib/KorAP/XML/TEI/Header.pm b/lib/KorAP/XML/TEI/Header.pm
index ddb9a25..669c64b 100644
--- a/lib/KorAP/XML/TEI/Header.pm
+++ b/lib/KorAP/XML/TEI/Header.pm

@@ -2,6 +2,7 @@
 use strict;
 use warnings;
 use Encode qw(encode decode);
+use KorAP::XML::TEI qw!escape_xml!;
 
 # Parsing of i5 header files
 
@@ -17,13 +18,6 @@
   SIGLE     => 2
 };
 
-# convert '&', '<' and '>' into their corresponding sgml-entities
-our %ent = (
-  '"' => '&quot;',
-  '&' => '&amp;',
-  '<' => '&lt;',
-  '>' => '&gt;'
-);
 
 # convert header type to sigle type
 our %sig = (
@@ -138,13 +132,13 @@
 
 # corpus/doc/text sigle escaped
 sub sigle_esc {
-  $_[0]->[SIGLE] =~ s/("|&|<|>)/$ent{$1}/gr;
+  escape_xml($_[0]->[SIGLE]);
 };
 
 
 # corpus/doc/text id escaped
 sub id_esc {
-  $_[0]->[SIGLE] =~ tr/\//_/r =~ s/("|&|<|>)/$ent{$1}/gr;
+  escape_xml($_[0]->[SIGLE] =~ tr/\//_/r);
 };
 
 

diff --git a/script/tei2korapxml b/script/tei2korapxml
index 40ad372..66f36b3 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -18,7 +18,7 @@
   unshift @INC, "$FindBin::Bin/../lib";
 };
 
-use KorAP::XML::TEI qw'remove_xml_comments';
+use KorAP::XML::TEI qw!remove_xml_comments escape_xml!;
 use KorAP::XML::TEI::Tokenizer::External;
 use KorAP::XML::TEI::Tokenizer::Conservative;
 use KorAP::XML::TEI::Tokenizer::Aggressive;
@@ -136,11 +136,6 @@
 
 my ( $text_id, $text_id_esc );                       # '$text_id_esc' = escaped version of $text_id (see %ent)
 
-my %ent = ('"', '&quot;', '&','&amp;',               # convert '&', '<' and '>' into their corresponding sgml-entities
-           '<','&lt;','>','&gt;');
-                                                     # note: the index still refers to the 'single character'-versions, which are counted as 1
-                                                     #  (search for '&amp;' in data.xml and see corresponding indices in $_tokens_file)
-
 my ( $data_fl );
 
 my ( $data_prfx1, $data_prfx2, $data_sfx );          # $data_* are written to $_data_file
@@ -348,13 +343,14 @@
           $cons_tok->reset;
         }
 
+        # Encode and escape data
+        $data = escape_xml(encode( "UTF-8", $data ));
+        # note: the index still refers to the 'single character'-versions,
+        # which are counted as 1 (search for '&amp;' in data.xml and see
+        # corresponding indices in $_tokens_file)
+
         print STDERR "DEBUG ($0): main(): Writing (utf8-formatted) xml file $_root_dir$dir/$_data_file\n" if $_DEBUG;
 
-        $data =~ s/(&|<|>)/$ent{$1}/g;
-
-        # convert text strings to binary strings
-        $data        = encode( "UTF-8", $data );
-
         $zipper->new_stream("$_root_dir$dir/$_data_file")
           ->print("$data_prfx1$text_id_esc$data_prfx2$data$data_sfx");
 
@@ -910,7 +906,8 @@
 
         for ( $att_idx = 1; $att_idx < $idx; $att_idx += 2 ){
 
-          ${$ref}[ $att_idx+1 ] =~ s/(&|<|>)/$ent{$1}/g; # see explanation in func. 'write_tokens'
+          # see explanation in func. 'write_tokens'
+          ${$ref}[ $att_idx+1 ] = escape_xml(${$ref}[ $att_idx+1 ]);
 
           # attribute (at index $att_idx) with value (at index $att_idx+1)
           $output .= "            <f name=\"${$ref}[ $att_idx ]\">${$ref}[ $att_idx+1 ]</f>\n";
@@ -979,7 +976,7 @@
 
       for ( $att_idx = 1; $att_idx < $idx; $att_idx += 2 ){
 
-        ${$ref}[ $att_idx+1 ] =~ s/(&|<|>)/$ent{$1}/g; # ... <w lemma="&gt;" ana="PUNCTUATION">&gt;</w> ...
+        ${$ref}[ $att_idx+1 ] = escape_xml(${$ref}[ $att_idx+1 ]); # ... <w lemma="&gt;" ana="PUNCTUATION">&gt;</w> ...
                                                        # the '&gt;' is translated to '>' and hence the result would be '<f name="lemma">></f>'
 
         if ( $_INLINE_ANNOT && ${$ref}[ $att_idx ] eq "$_INLINE_ATT_RD" ){

diff --git a/t/tei.t b/t/tei.t
index 6dca05c..928d28f 100644
--- a/t/tei.t
+++ b/t/tei.t

@@ -9,38 +9,68 @@
 
 use Test::KorAP::XML::TEI qw!korap_tempfile!;
 
-use_ok('KorAP::XML::TEI', 'remove_xml_comments');
+use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml');
 
-my ($fh, $filename) = korap_tempfile('tei');
+subtest 'remove_xml_comments' => sub {
+  my ($fh, $filename) = korap_tempfile('tei');
 
-print $fh <<'HTML';
+  print $fh <<'HTML';
 mehrzeiliger
 Kommentar
   -->
 Test
 HTML
 
-is(remove_xml_comments($fh, "hallo"),"hallo");
-is(remove_xml_comments($fh, "hallo <!-- Test -->"),"hallo ");
-is(remove_xml_comments($fh, "<!-- Test --> hallo")," hallo");
+  is(remove_xml_comments($fh, "hallo"),"hallo");
+  is(remove_xml_comments($fh, "hallo <!-- Test -->"),"hallo ");
+  is(remove_xml_comments($fh, "<!-- Test --> hallo")," hallo");
 
-seek($fh, 0, 0);
+  seek($fh, 0, 0);
 
-is(remove_xml_comments($fh, '<!--'), "Test\n");
+  is(remove_xml_comments($fh, '<!--'), "Test\n");
 
-seek($fh, 0, 0);
+  seek($fh, 0, 0);
 
-print $fh <<'HTML';
+  print $fh <<'HTML';
 mehrzeiliger
 Kommentar
   --><!-- Versuch
 -->ist <!-- a --><!-- b --> ein Test
 HTML
 
-seek($fh, 0, 0);
+  seek($fh, 0, 0);
 
-is(remove_xml_comments($fh, 'Dies <!--'), "Dies ist  ein Test\n");
+  is(remove_xml_comments($fh, 'Dies <!--'), "Dies ist  ein Test\n");
 
-close($fh);
+  close($fh);
+};
+
+subtest 'escape_xml' => sub {
+  is(
+    escape_xml('"""'),
+    '&quot;&quot;&quot;'
+  );
+
+  is(
+    escape_xml('&&&'),
+    '&amp;&amp;&amp;'
+  );
+
+  is(
+    escape_xml('<<<'),
+    '&lt;&lt;&lt;'
+  );
+
+  is(
+    escape_xml('>>>'),
+    '&gt;&gt;&gt;'
+  );
+
+  is(
+    escape_xml('<tag att1="foo" att2="bar">C&A</tag>'),
+    '&lt;tag att1=&quot;foo&quot; att2=&quot;bar&quot;&gt;C&amp;A&lt;/tag&gt;'
+  );
+};
+
 
 done_testing;
commit	0465e9e255944a60e7e16ca6ab070eb8ac4cb961	[log] [tgz]
author	Akron <nils@diewald-online.de>	Mon Jul 27 15:55:21 2020 +0200
committer	Peter Harders <harders@ids-mannheim.de>	Tue Jul 28 19:37:29 2020 +0200
tree	0c881974f41a5adf5d9210c4c5eea803a57b7317
parent	1c5ce154fac1331d0663633eb212fbfc67b20323 [diff]