Do not escape double quotes inside raw_text elements
This is not necessary and breaks tokenization compatibility.
Change-Id: Ib43733cf7264ee07b010a3478e8c4b728f7bd708
diff --git a/lib/KorAP/XML/TEI.pm b/lib/KorAP/XML/TEI.pm
index 23b6625..8f1678d 100644
--- a/lib/KorAP/XML/TEI.pm
+++ b/lib/KorAP/XML/TEI.pm
@@ -3,16 +3,21 @@
use warnings;
use Exporter 'import';
-our @EXPORT_OK = qw(remove_xml_comments escape_xml);
+our @EXPORT_OK = qw(remove_xml_comments escape_xml escape_xml_minimal);
# convert '&', '<' and '>' into their corresponding sgml-entities
-my %ent = (
- '"' => '&quot;',
+my %ent_without_quot = (
'&' => '&amp;',
'<' => '&lt;',
'>' => '&gt;'
);
+my %ent = (
+ %ent_without_quot,
+ '"' => '&quot;'
+);
+
+
# remove xml comments
sub remove_xml_comments {
my ($fh, $html) = @_;
@@ -71,4 +76,12 @@
};
+# Escape '&', '<' and '>' via %ent_without_quot; double quotes are left unescaped
+sub escape_xml_minimal {
+ my $data = shift // '';  # an undefined argument is treated as the empty string
+ $data =~ s/([&<>])/$ent_without_quot{$1}/ge;  # /e evaluates the hash lookup for each matched character
+ return $data;
+};
+
+
1;
diff --git a/script/tei2korapxml b/script/tei2korapxml
index ab1975c..eed322d 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -20,7 +20,7 @@
unshift @INC, "$FindBin::Bin/../lib";
};
-use KorAP::XML::TEI qw!remove_xml_comments escape_xml!;
+use KorAP::XML::TEI qw!remove_xml_comments escape_xml escape_xml_minimal!;
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
@@ -364,7 +364,7 @@
};
# Encode and escape data
- $data = escape_xml(encode( "UTF-8", $data ));
+ $data = escape_xml_minimal(encode( "UTF-8", $data ));
# note: the index still refers to the 'single character'-versions,
# which are counted as 1 (search for '&amp;' in data.xml and see
# corresponding indices in $_tokens_file)
diff --git a/t/script.t b/t/script.t
index 4254937..9521d73 100644
--- a/t/script.t
+++ b/t/script.t
@@ -66,7 +66,8 @@
# Uncompress GOE/AGA/00000/data.xml from zip file
$t->unzip_xml('GOE/AGA/00000/data.xml')
->attr_is('raw_text', 'docid', 'GOE_AGA.00000', 'text id')
- ->text_like('raw_text > text', qr!^Campagne in Frankreich 1792.*?uns allein begl.*cke\.$!, 'text content');
+ ->text_like('raw_text > text', qr!^Campagne in Frankreich 1792.*?uns allein begl.*cke\.$!, 'text content')
+ ->text_like('raw_text > text', qr!unter dem Titel "Kriegstheater"!, 'text content');
$t->unzip_xml('GOE/AGA/00000/struct/structure.xml')
->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content')
diff --git a/t/tei.t b/t/tei.t
index 94f7577..69b4ee1 100644
--- a/t/tei.t
+++ b/t/tei.t
@@ -9,7 +9,7 @@
use Test::KorAP::XML::TEI qw!korap_tempfile test_tei2korapxml!;
-use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml');
+use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml', 'escape_xml_minimal');
subtest 'remove_xml_comments' => sub {
my ($fh, $filename) = korap_tempfile('tei');
@@ -87,5 +87,31 @@
);
};
+subtest 'escape_xml_minimal' => sub {
+ is(
+ escape_xml_minimal('"""'),
+ '"""'
+ );
+
+ is(
+ escape_xml_minimal('&&&'),
+ '&amp;&amp;&amp;'
+ );
+
+ is(
+ escape_xml_minimal('<<<'),
+ '&lt;&lt;&lt;'
+ );
+
+ is(
+ escape_xml_minimal('>>>'),
+ '&gt;&gt;&gt;'
+ );
+
+ is(
+ escape_xml_minimal('<tag att1="foo" att2="bar">C&A</tag>'),
+ '&lt;tag att1="foo" att2="bar"&gt;C&amp;A&lt;/tag&gt;'
+ );
+};
done_testing;