change utf8_encode and utf8_decode
Ensure strictly valid UTF-8 output by using the strict encoding "UTF-8"
(utf-8-strict) instead of Perl's lax "utf8".
(See 'encode_utf8' and 'UTF-8 vs. utf8 vs. UTF8' in the Encode documentation,
and 'What's the difference between "UTF-8" and "utf8"?' in perlunifaq.)
Change-Id: I6d8797ddd24339ecf2ab4ccacad3801a6a054ca2
diff --git a/t/data/template.i5.xml b/t/data/template.i5.xml
new file mode 100644
index 0000000..9e0b26d
--- /dev/null
+++ b/t/data/template.i5.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-I5 1.0//EN" "http://corpora.ids-mannheim.de/I5/DTD/i5.dtd">
+<idsCorpus>
+ <idsHeader type="corpus">
+ <fileDesc>
+ <titleStmt>
+ <korpusSigle>[KORPUSSIGLE]</korpusSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <idsDoc version="1.0">
+ <idsHeader type="document">
+ <fileDesc>
+ <titleStmt>
+ <dokumentSigle>[DOKUMENTSIGLE]</dokumentSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <idsText version="1.0">
+ <idsHeader type="text">
+ <fileDesc>
+ <titleStmt>
+ <textSigle>[TEXTSIGLE]</textSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <text>
+ [TEXT]
+ </text>
+ </idsText>
+ </idsDoc>
+</idsCorpus>
diff --git a/t/script.t b/t/script.t
index f7f9468..31ff24b 100644
--- a/t/script.t
+++ b/t/script.t
@@ -2,6 +2,7 @@
use warnings;
use File::Basename 'dirname';
use File::Spec::Functions qw/catfile/;
+use Encode qw!encode_utf8 decode_utf8 encode!;
use IO::Uncompress::Unzip qw(unzip $UnzipError);
use Test::More;
@@ -12,6 +13,7 @@
BEGIN {
unshift @INC, "$FindBin::Bin/../lib";
};
+
use Test::KorAP::XML::TEI qw!korap_tempfile!;
my $f = dirname(__FILE__);
@@ -340,4 +342,52 @@
ok($zip, 'External generated');
};
+
+subtest 'Test utf-8 handling' => sub {
+  # Pipe a template containing non-Unicode code points through the script and check the text_id on STDERR.
+  # Load template file (read raw, no decoding layer)
+  $file = catfile($f, 'data', 'template.i5.xml');
+  my $tpl = '';
+  {
+    open(my $tpl_fh, '<', $file) or die "Cannot open $file: $!";
+    $tpl .= <$tpl_fh> while !eof($tpl_fh);
+    close($tpl_fh);
+  }
+
+  # Introduce invalid utf-8 characters
+  my $text_sigle;
+  { no warnings;
+    # $text_sigle printed to file, without encoding: Aþ¿¿¿¿¿A_Bþ¿¿¿¿¿B.Cþ¿¿¿¿¿C
+    # the utf8-sequence 'þ¿¿¿¿¿' encodes 32 bit of data (see 0x7FFF_FFFF in perlunicode)
+    $text_sigle = "A\x{FFFF_FFFF}A_B\x{FFFF_FFFF}B.C\x{FFFF_FFFF}C" }
+  # If CHECK is 0, encoding and decoding replace any malformed character with a substitution character.
+  # � = substitution character
+  my $text_sigle_lax = encode_utf8($text_sigle);      # lax 'utf8': code points are kept as they are
+  my $text_sigle_esc = encode('UTF-8', $text_sigle);  # strict 'UTF-8': code points are substituted
+
+  is(length($text_sigle), 11, 'Length of character string'); # A�A_B�B.C�C (char string => length(�) = 1)
+  is(length($text_sigle_lax), 29, 'Length of lax utf8 byte string'); # Aþ¿¿¿¿¿A_Bþ¿¿¿¿¿B.Cþ¿¿¿¿¿C (byte string)
+  is(length($text_sigle_esc), 17, 'Length of strict UTF-8 byte string'); # A�A_B�B.C�C (byte string => length(�) = 3)
+
+  { no warnings;
+    $tpl =~ s!\[KORPUSSIGLE\]!A\x{FFFF_FFFF}A!;
+    $tpl =~ s!\[DOKUMENTSIGLE\]!A\x{FFFF_FFFF}A_B\x{FFFF_FFFF}B!;
+    $tpl =~ s!\[TEXT\]!<p>d\x{FFFF_FFFF}d e\x{FFFF_FFFF}e f\x{FFFF_FFFF}f</p>! }
+  $tpl =~ s!\[TEXTSIGLE\]!$text_sigle!;
+
+  my ($fh, $tplfile) = korap_tempfile('script_out4');
+  binmode($fh);
+  print {$fh} encode_utf8($tpl); # => text_id=Aþ¿¿¿¿¿A_Bþ¿¿¿¿¿B.Cþ¿¿¿¿¿C
+  close($fh) or die "Cannot close $tplfile: $!"; # buffered write errors surface at close
+
+  my (undef, $outzip) = korap_tempfile('script_out5');
+
+  binmode STDERR, qw{ :encoding(UTF-8) }; # because output 'textid=...' goes to STDERR (see script/tei2korapxml)
+
+  stderr_like(
+    sub { `cat '$tplfile' | perl '$script' -ti > '$outzip'` },
+    qr!tei2korapxml: .*? text_id=\Q$text_sigle_lax\E!, # \Q..\E: match the sigle literally ('.' is no wildcard)
+    'Lax utf8 byte sequence is reported as text_id');
+};
+
done_testing;