Do not escape double quotes inside raw_text elements
This is not necessary and breaks tokenization compatibility.
Change-Id: Ib43733cf7264ee07b010a3478e8c4b728f7bd708
diff --git a/t/script.t b/t/script.t
index 4254937..9521d73 100644
--- a/t/script.t
+++ b/t/script.t
@@ -66,7 +66,8 @@
# Uncompress GOE/AGA/00000/data.xml from zip file
$t->unzip_xml('GOE/AGA/00000/data.xml')
->attr_is('raw_text', 'docid', 'GOE_AGA.00000', 'text id')
- ->text_like('raw_text > text', qr!^Campagne in Frankreich 1792.*?uns allein begl.*cke\.$!, 'text content');
+ ->text_like('raw_text > text', qr!^Campagne in Frankreich 1792.*?uns allein begl.*cke\.$!, 'text content')
+ ->text_like('raw_text > text', qr!unter dem Titel "Kriegstheater"!, 'text content');
$t->unzip_xml('GOE/AGA/00000/struct/structure.xml')
->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content')
diff --git a/t/tei.t b/t/tei.t
index 94f7577..69b4ee1 100644
--- a/t/tei.t
+++ b/t/tei.t
@@ -9,7 +9,7 @@
use Test::KorAP::XML::TEI qw!korap_tempfile test_tei2korapxml!;
-use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml');
+use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml', 'escape_xml_minimal');
subtest 'remove_xml_comments' => sub {
my ($fh, $filename) = korap_tempfile('tei');
@@ -87,5 +87,31 @@
);
};
+subtest 'escape_xml_minimal' => sub {
+  is(
+    escape_xml_minimal('"""'),
+    '"""', 'double quotes pass through unescaped'
+  );
+  # NOTE(review): entity forms below restored from entity-decoded text; confirm against escape_xml_minimal.
+  is(
+    escape_xml_minimal('&&&'),
+    '&amp;&amp;&amp;', 'ampersands are escaped'
+  );
+
+  is(
+    escape_xml_minimal('<<<'),
+    '&lt;&lt;&lt;', 'opening angle brackets are escaped'
+  );
+
+  is(
+    escape_xml_minimal('>>>'),
+    '&gt;&gt;&gt;', 'closing angle brackets are escaped'
+  );
+
+  is(
+    escape_xml_minimal('<tag att1="foo" att2="bar">C&A</tag>'),
+    '&lt;tag att1="foo" att2="bar"&gt;C&amp;A&lt;/tag&gt;', 'markup is escaped, attribute quotes are kept'
+  );
+};
done_testing;