Change usage of encode_utf8 and decode_utf8

Ensure strictly valid UTF-8 output by using utf-8-strict ('UTF-8') instead of the lax 'utf8'
(see 'encode_utf8' and 'UTF-8 vs. utf8 vs. UTF8' in Encode,
 and 'What's the difference between "UTF-8" and "utf8"?' in perlunifaq).

Change-Id: I6d8797ddd24339ecf2ab4ccacad3801a6a054ca2
diff --git a/t/data/template.i5.xml b/t/data/template.i5.xml
new file mode 100644
index 0000000..9e0b26d
--- /dev/null
+++ b/t/data/template.i5.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE idsCorpus  PUBLIC "-//IDS//DTD IDS-I5 1.0//EN" "http://corpora.ids-mannheim.de/I5/DTD/i5.dtd">
+<idsCorpus>
+  <idsHeader type="corpus">
+    <fileDesc>
+      <titleStmt>
+        <korpusSigle>[KORPUSSIGLE]</korpusSigle>
+      </titleStmt>
+    </fileDesc>
+  </idsHeader>
+  <idsDoc version="1.0">
+    <idsHeader type="document">
+      <fileDesc>
+        <titleStmt>
+          <dokumentSigle>[DOKUMENTSIGLE]</dokumentSigle>
+        </titleStmt>
+      </fileDesc>
+    </idsHeader>
+    <idsText version="1.0">
+      <idsHeader type="text">
+        <fileDesc>
+          <titleStmt>
+            <textSigle>[TEXTSIGLE]</textSigle>
+          </titleStmt>
+        </fileDesc>
+      </idsHeader>
+      <text>
+        [TEXT]
+      </text>
+    </idsText>
+  </idsDoc>
+</idsCorpus>
diff --git a/t/script.t b/t/script.t
index f7f9468..31ff24b 100644
--- a/t/script.t
+++ b/t/script.t
@@ -2,6 +2,7 @@
 use warnings;
 use File::Basename 'dirname';
 use File::Spec::Functions qw/catfile/;
+use Encode qw!encode_utf8 decode_utf8 encode!;
 use IO::Uncompress::Unzip qw(unzip $UnzipError);
 
 use Test::More;
@@ -12,6 +13,7 @@
 BEGIN {
   unshift @INC, "$FindBin::Bin/../lib";
 };
+
 use Test::KorAP::XML::TEI qw!korap_tempfile!;
 
 my $f = dirname(__FILE__);
@@ -340,4 +342,52 @@
   ok($zip, 'External generated');
 };
 
+
+# Feeds tei2korapxml a document whose sigles contain invalid code points and
+# checks that the text_id reported on STDERR carries the lax (utf8) bytes.
+subtest 'Test utf-8 handling' => sub {
+
+  # Load template file (lexical handle, 3-arg open; fail loudly if it is missing)
+  my $tpl_file = catfile($f, 'data', 'template.i5.xml');
+  open(my $tpl_fh, '<', $tpl_file) or die "Cannot open $tpl_file: $!";
+  my $tpl = do { local $/; <$tpl_fh> }; # slurp; an empty file yields ''
+  close($tpl_fh);
+
+  # Introduce invalid utf-8 characters
+  my $text_sigle;
+  { no warnings;
+  # $text_sigle printed to file, without encoding: Aþƒ¿¿¿¿¿A_Bþƒ¿¿¿¿¿B.Cþƒ¿¿¿¿¿C
+  # the utf8-sequence 'þƒ¿¿¿¿¿' encodes 32 bit of data (see 0x7FFF_FFFF in perlunicode)
+  $text_sigle = "A\x{FFFF_FFFF}A_B\x{FFFF_FFFF}B.C\x{FFFF_FFFF}C" }
+  # If CHECK is 0, encoding and decoding replace any malformed character with a substitution character.
+  # � = substitution character
+  my $text_sigle_lax = encode_utf8($text_sigle);
+  my $text_sigle_esc = encode('UTF-8', $text_sigle);
+
+  is(length($text_sigle), 11, 'Length of character string');  # A�A_B�B.C�C (char string => length(�) = 1)
+  is(length($text_sigle_lax), 29, 'Length of lax utf8 bytes'); # Aþƒ¿¿¿¿¿A_Bþƒ¿¿¿¿¿B.Cþƒ¿¿¿¿¿C (byte string)
+  is(length($text_sigle_esc), 17, 'Length of strict UTF-8 bytes'); # A�A_B�B.C�C (byte string => length(�) = 3)
+
+  { no warnings;
+  $tpl =~ s!\[KORPUSSIGLE\]!A\x{FFFF_FFFF}A!;
+  $tpl =~ s!\[DOKUMENTSIGLE\]!A\x{FFFF_FFFF}A_B\x{FFFF_FFFF}B!;
+  $tpl =~ s!\[TEXT\]!<p>d\x{FFFF_FFFF}d e\x{FFFF_FFFF}e f\x{FFFF_FFFF}f</p>! }
+  $tpl =~ s!\[TEXTSIGLE\]!$text_sigle!;
+
+  my ($fh, $tplfile) = korap_tempfile('script_out4');
+  binmode($fh);
+  print $fh encode_utf8($tpl); # => text_id=Aþƒ¿¿¿¿¿A_Bþƒ¿¿¿¿¿B.Cþƒ¿¿¿¿¿C
+  close($fh) or die "Cannot close $tplfile: $!"; # buffered write errors surface here
+
+  my (undef, $outzip) = korap_tempfile('script_out5');
+
+  binmode STDERR, qw{ :encoding(UTF-8) }; # because output 'textid=...' goes to STDERR (see script/tei2korapxml)
+
+  stderr_like(
+    sub { `cat '$tplfile' | perl '$script' -ti > '$outzip'` },
+    qr!tei2korapxml: .*? text_id=\Q$text_sigle_lax\E!, # \Q..\E: match the bytes literally (incl. '.'); see print above
+    'Text id is reported with lax utf8 bytes'
+  );
+};
+
 done_testing;