Add exportable XML escape function
Change-Id: I50f2ee398e4b1c3dc5bb79009eaf3b204562887f
diff --git a/lib/KorAP/XML/TEI.pm b/lib/KorAP/XML/TEI.pm
index 2f7f11d..23b6625 100644
--- a/lib/KorAP/XML/TEI.pm
+++ b/lib/KorAP/XML/TEI.pm
@@ -3,7 +3,15 @@
use warnings;
use Exporter 'import';
-our @EXPORT_OK = qw(remove_xml_comments);
+our @EXPORT_OK = qw(remove_xml_comments escape_xml);
+
+# convert '"', '&', '<' and '>' into their corresponding XML entities
+my %ent = (
+ '"' => '&quot;',
+ '&' => '&amp;',
+ '<' => '&lt;',
+ '>' => '&gt;'
+);
# remove xml comments
sub remove_xml_comments {
@@ -49,9 +57,18 @@
$html = <$fh>;
goto CHECK;
- }
+ };
return $html
-}
+};
+
+
+# Escape strings using XML entities
+sub escape_xml {
+ my $data = shift // '';
+ $data =~ s/([&<>"])/$ent{$1}/ge;
+ return $data;
+};
+
1;
diff --git a/lib/KorAP/XML/TEI/Header.pm b/lib/KorAP/XML/TEI/Header.pm
index ddb9a25..669c64b 100644
--- a/lib/KorAP/XML/TEI/Header.pm
+++ b/lib/KorAP/XML/TEI/Header.pm
@@ -2,6 +2,7 @@
use strict;
use warnings;
use Encode qw(encode decode);
+use KorAP::XML::TEI qw!escape_xml!;
# Parsing of i5 header files
@@ -17,13 +18,6 @@
SIGLE => 2
};
-# convert '&', '<' and '>' into their corresponding sgml-entities
-our %ent = (
- '"' => '&quot;',
- '&' => '&amp;',
- '<' => '&lt;',
- '>' => '&gt;'
-);
# convert header type to sigle type
our %sig = (
@@ -138,13 +132,13 @@
# corpus/doc/text sigle escaped
sub sigle_esc {
- $_[0]->[SIGLE] =~ s/("|&|<|>)/$ent{$1}/gr;
+ escape_xml($_[0]->[SIGLE]);
};
# corpus/doc/text id escaped
sub id_esc {
- $_[0]->[SIGLE] =~ tr/\//_/r =~ s/("|&|<|>)/$ent{$1}/gr;
+ escape_xml($_[0]->[SIGLE] =~ tr/\//_/r);
};
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 40ad372..66f36b3 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -18,7 +18,7 @@
unshift @INC, "$FindBin::Bin/../lib";
};
-use KorAP::XML::TEI qw'remove_xml_comments';
+use KorAP::XML::TEI qw!remove_xml_comments escape_xml!;
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
@@ -136,11 +136,6 @@
my ( $text_id, $text_id_esc ); # '$text_id_esc' = escaped version of $text_id (see %ent)
-my %ent = ('"', '&quot;', '&','&amp;', # convert '&', '<' and '>' into their corresponding sgml-entities
- '<','&lt;','>','&gt;');
- # note: the index still refers to the 'single character'-versions, which are counted as 1
- # (search for '&amp;' in data.xml and see corresponding indices in $_tokens_file)
-
my ( $data_fl );
my ( $data_prfx1, $data_prfx2, $data_sfx ); # $data_* are written to $_data_file
@@ -348,13 +343,14 @@
$cons_tok->reset;
}
+ # Encode and escape data
+ $data = escape_xml(encode( "UTF-8", $data ));
+ # note: the index still refers to the 'single character'-versions,
+ # which are counted as 1 (search for '&amp;' in data.xml and see
+ # corresponding indices in $_tokens_file)
+
print STDERR "DEBUG ($0): main(): Writing (utf8-formatted) xml file $_root_dir$dir/$_data_file\n" if $_DEBUG;
- $data =~ s/(&|<|>)/$ent{$1}/g;
-
- # convert text strings to binary strings
- $data = encode( "UTF-8", $data );
-
$zipper->new_stream("$_root_dir$dir/$_data_file")
->print("$data_prfx1$text_id_esc$data_prfx2$data$data_sfx");
@@ -910,7 +906,8 @@
for ( $att_idx = 1; $att_idx < $idx; $att_idx += 2 ){
- ${$ref}[ $att_idx+1 ] =~ s/(&|<|>)/$ent{$1}/g; # see explanation in func. 'write_tokens'
+ # see explanation in func. 'write_tokens'
+ ${$ref}[ $att_idx+1 ] = escape_xml(${$ref}[ $att_idx+1 ]);
# attribute (at index $att_idx) with value (at index $att_idx+1)
$output .= " <f name=\"${$ref}[ $att_idx ]\">${$ref}[ $att_idx+1 ]</f>\n";
@@ -979,7 +976,7 @@
for ( $att_idx = 1; $att_idx < $idx; $att_idx += 2 ){
- ${$ref}[ $att_idx+1 ] =~ s/(&|<|>)/$ent{$1}/g; # ... <w lemma="&gt;" ana="PUNCTUATION">&gt;</w> ...
+ ${$ref}[ $att_idx+1 ] = escape_xml(${$ref}[ $att_idx+1 ]); # ... <w lemma="&gt;" ana="PUNCTUATION">&gt;</w> ...
# the '&gt;' is translated to '>' and hence the result would be '<f name="lemma">></f>'
if ( $_INLINE_ANNOT && ${$ref}[ $att_idx ] eq "$_INLINE_ATT_RD" ){
diff --git a/t/tei.t b/t/tei.t
index 6dca05c..928d28f 100644
--- a/t/tei.t
+++ b/t/tei.t
@@ -9,38 +9,68 @@
use Test::KorAP::XML::TEI qw!korap_tempfile!;
-use_ok('KorAP::XML::TEI', 'remove_xml_comments');
+use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml');
-my ($fh, $filename) = korap_tempfile('tei');
+subtest 'remove_xml_comments' => sub {
+ my ($fh, $filename) = korap_tempfile('tei');
-print $fh <<'HTML';
+ print $fh <<'HTML';
mehrzeiliger
Kommentar
-->
Test
HTML
-is(remove_xml_comments($fh, "hallo"),"hallo");
-is(remove_xml_comments($fh, "hallo <!-- Test -->"),"hallo ");
-is(remove_xml_comments($fh, "<!-- Test --> hallo")," hallo");
+ is(remove_xml_comments($fh, "hallo"),"hallo");
+ is(remove_xml_comments($fh, "hallo <!-- Test -->"),"hallo ");
+ is(remove_xml_comments($fh, "<!-- Test --> hallo")," hallo");
-seek($fh, 0, 0);
+ seek($fh, 0, 0);
-is(remove_xml_comments($fh, '<!--'), "Test\n");
+ is(remove_xml_comments($fh, '<!--'), "Test\n");
-seek($fh, 0, 0);
+ seek($fh, 0, 0);
-print $fh <<'HTML';
+ print $fh <<'HTML';
mehrzeiliger
Kommentar
--><!-- Versuch
-->ist <!-- a --><!-- b --> ein Test
HTML
-seek($fh, 0, 0);
+ seek($fh, 0, 0);
-is(remove_xml_comments($fh, 'Dies <!--'), "Dies ist ein Test\n");
+ is(remove_xml_comments($fh, 'Dies <!--'), "Dies ist ein Test\n");
-close($fh);
+ close($fh);
+};
+
+subtest 'escape_xml' => sub {
+ is(
+ escape_xml('"""'),
+ '&quot;&quot;&quot;'
+ );
+
+ is(
+ escape_xml('&&&'),
+ '&amp;&amp;&amp;'
+ );
+
+ is(
+ escape_xml('<<<'),
+ '&lt;&lt;&lt;'
+ );
+
+ is(
+ escape_xml('>>>'),
+ '&gt;&gt;&gt;'
+ );
+
+ is(
+ escape_xml('<tag att1="foo" att2="bar">C&A</tag>'),
+ '&lt;tag att1=&quot;foo&quot; att2=&quot;bar&quot;&gt;C&amp;A&lt;/tag&gt;'
+ );
+};
+
done_testing;