change utf8_encode and utf8_decode

ensure strictly valid UTF-8 output by using utf-8-strict instead of utf8
(see in Encode: 'encode_utf8' and 'UTF-8 vs. utf8 vs. UTF8'
 and in perlunifaq: What's the difference between "UTF-8" and "utf8"?)

Change-Id: I6d8797ddd24339ecf2ab4ccacad3801a6a054ca2
diff --git a/lib/KorAP/XML/TEI/Header.pm b/lib/KorAP/XML/TEI/Header.pm
index 3d9c06d..ddb9a25 100644
--- a/lib/KorAP/XML/TEI/Header.pm
+++ b/lib/KorAP/XML/TEI/Header.pm
@@ -1,7 +1,7 @@
 package KorAP::XML::TEI::Header;
 use strict;
 use warnings;
-use Encode qw(encode_utf8);
+use Encode qw(encode decode);
 
 # Parsing of i5 header files
 
@@ -98,10 +98,10 @@
       die "ERROR ($0): main(): input line number $.: line with sigle-tag is not in expected format ... => Aborting\n\tline=$_"
         if $pfx !~ /^\s*$/  || $sfx !~ m!^</$sig_type>\s*$! || $sig =~ /^\s*$/;
 
-      $self->[SIGLE] = encode_utf8($sig);
+      $self->[SIGLE] = encode('UTF-8' , $sig);
 
       # Escape sig
-      my $sig_esc = $self->sigle_esc;
+      my $sig_esc = decode('UTF-8', $self->sigle_esc);
 
       # replace sigle in header, if there's an escaped version that differs
       s!(<$sig_type(?: [^>]*)?>)[^<]+</$sig_type>!$1$sig_esc</$sig_type>! if $sig_esc ne $sig;
@@ -173,7 +173,7 @@
 # Write data to zip stream
 sub to_zip {
   my ($self, $zip) = @_;
-  $zip->print(encode_utf8($self->to_string));
+  $zip->print(encode('UTF-8', $self->to_string));
 };
 
 
diff --git a/script/tei2korapxml b/script/tei2korapxml
index f0e04c3..40ad372 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -8,7 +8,7 @@
 use File::Basename qw(dirname);
 
 use open qw(:std :utf8); # assume utf-8 encoding
-use Encode qw(encode_utf8 decode_utf8);
+use Encode qw(encode decode);
 
 use XML::CompactTree::XS;
 use XML::LibXML::Reader;
@@ -25,6 +25,7 @@
 use KorAP::XML::TEI::Zipper;
 use KorAP::XML::TEI::Header;
 
+
 our $VERSION = '0.01';
 
 our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
@@ -242,6 +243,8 @@
 
   # prevents segfaulting of 'XML::LibXML::Reader' inside 'main()' - see notes on 'PerlIO layers' in  'man XML::LibXML')
   # removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
+  # see in perluniintro: You can switch encodings on an already opened stream by using "binmode()
+  # see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.
   binmode $input_fh;
 
 
@@ -345,13 +348,13 @@
           $cons_tok->reset;
         }
 
-        $data = encode_utf8( $data );
-
         print STDERR "DEBUG ($0): main(): Writing (utf8-formatted) xml file $_root_dir$dir/$_data_file\n" if $_DEBUG;
 
-
         $data =~ s/(&|<|>)/$ent{$1}/g;
 
+        # convert text strings to binary strings
+        $data        = encode( "UTF-8", $data );
+
         $zipper->new_stream("$_root_dir$dir/$_data_file")
           ->print("$data_prfx1$text_id_esc$data_prfx2$data$data_sfx");
 
@@ -474,7 +477,7 @@
           $text_id_esc = $header->id_esc;
 
           # log output for seeing progression
-          print STDERR "$0: main(): text_id=".decode_utf8( $text_id )."\n";
+          print STDERR "$0: main(): text_id=".decode("UTF-8", $text_id )."\n";
 
           $tl = 0; # reset (needed for ~ whitespace handling ~)
         };
@@ -868,7 +871,7 @@
 
   $output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
            ." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\""
-           .decode_utf8($text_id_esc)."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n  <spanList>\n";
+           .decode( "UTF-8", $text_id_esc )."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n  <spanList>\n"; # convert binary string to text string
 
   $c = 0;
 
@@ -926,7 +929,7 @@
 
   $output .= "  </spanList>\n</layer>";
 
-  $output = encode_utf8( $output );
+  $output = encode( "UTF-8", $output ); # convert text string to binary string
 
   $zipper->new_stream("$_root_dir$dir/$_structure_dir/$_structure_file")
     ->print($output);
@@ -950,7 +953,7 @@
 
   $output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
            ." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\""
-           .decode_utf8($text_id_esc)."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n  <spanList>\n";
+           .decode( "UTF-8", $text_id_esc )."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n  <spanList>\n"; # convert binary string to text string
 
   $c = 0;
 
@@ -1023,7 +1026,7 @@
 
   $output .= "  </spanList>\n</layer>";
 
-  $output = encode_utf8( $output );
+  $output = encode( "UTF-8", $output ); # convert text string to binary string
 
   $zipper->new_stream("$_root_dir$dir/$_tokens_dir/$_tokens_file")
     ->print($output);
diff --git a/t/data/template.i5.xml b/t/data/template.i5.xml
new file mode 100644
index 0000000..9e0b26d
--- /dev/null
+++ b/t/data/template.i5.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE idsCorpus  PUBLIC "-//IDS//DTD IDS-I5 1.0//EN" "http://corpora.ids-mannheim.de/I5/DTD/i5.dtd">
+<idsCorpus>
+  <idsHeader type="corpus">
+    <fileDesc>
+      <titleStmt>
+        <korpusSigle>[KORPUSSIGLE]</korpusSigle>
+      </titleStmt>
+    </fileDesc>
+  </idsHeader>
+  <idsDoc version="1.0">
+    <idsHeader type="document">
+      <fileDesc>
+        <titleStmt>
+          <dokumentSigle>[DOKUMENTSIGLE]</dokumentSigle>
+        </titleStmt>
+      </fileDesc>
+    </idsHeader>
+    <idsText version="1.0">
+      <idsHeader type="text">
+        <fileDesc>
+          <titleStmt>
+            <textSigle>[TEXTSIGLE]</textSigle>
+          </titleStmt>
+        </fileDesc>
+      </idsHeader>
+      <text>
+        [TEXT]
+      </text>
+    </idsText>
+  </idsDoc>
+</idsCorpus>
diff --git a/t/script.t b/t/script.t
index f7f9468..31ff24b 100644
--- a/t/script.t
+++ b/t/script.t
@@ -2,6 +2,7 @@
 use warnings;
 use File::Basename 'dirname';
 use File::Spec::Functions qw/catfile/;
+use Encode qw!encode_utf8 decode_utf8 encode!;
 use IO::Uncompress::Unzip qw(unzip $UnzipError);
 
 use Test::More;
@@ -12,6 +13,7 @@
 BEGIN {
   unshift @INC, "$FindBin::Bin/../lib";
 };
+
 use Test::KorAP::XML::TEI qw!korap_tempfile!;
 
 my $f = dirname(__FILE__);
@@ -340,4 +342,52 @@
   ok($zip, 'External generated');
 };
 
+
+subtest 'Test utf-8 handling' => sub {
+
+  # Load template file
+  $file = catfile($f, 'data', 'template.i5.xml');
+  my $tpl = '';
+  {
+    open($fh, $file);
+    $tpl .= <$fh> while !eof($fh);
+    close($fh);
+  }
+
+  # Introduce invalid utf-8 characters
+  my $text_sigle;
+  { no warnings;
+  # $text_sigle printed to file, without encoding: Aþƒ¿¿¿¿¿A_Bþƒ¿¿¿¿¿B.Cþƒ¿¿¿¿¿C
+  # the utf8-sequence 'þƒ¿¿¿¿¿' encodes 32 bit of data (see 0x7FFF_FFFF in perlunicode)
+  $text_sigle = "A\x{FFFF_FFFF}A_B\x{FFFF_FFFF}B.C\x{FFFF_FFFF}C" }
+  # If CHECK is 0, encoding and decoding replace any malformed character with a substitution character.
+  # � = substitution character
+  my $text_sigle_lax = encode_utf8($text_sigle);
+  my $text_sigle_esc = encode('UTF-8', $text_sigle);
+
+  is(length($text_sigle), 11);     # A�A_B�B.C�C (char string => length(�) = 1)
+  is(length($text_sigle_lax), 29); # Aþƒ¿¿¿¿¿A_Bþƒ¿¿¿¿¿B.Cþƒ¿¿¿¿¿C (byte string)
+  is(length($text_sigle_esc), 17); # A�A_B�B.C�C (byte string => length(�) = 3)
+
+  { no warnings;
+  $tpl =~ s!\[KORPUSSIGLE\]!A\x{FFFF_FFFF}A!;
+  $tpl =~ s!\[DOKUMENTSIGLE\]!A\x{FFFF_FFFF}A_B\x{FFFF_FFFF}B!;
+  $tpl =~ s!\[TEXT\]!<p>d\x{FFFF_FFFF}d e\x{FFFF_FFFF}e f\x{FFFF_FFFF}f</p>! }
+  $tpl =~ s!\[TEXTSIGLE\]!$text_sigle!;
+
+  my ($fh, $tplfile) = korap_tempfile('script_out4');
+  binmode($fh);
+  print $fh encode_utf8($tpl); # => text_id=Aþƒ¿¿¿¿¿A_Bþƒ¿¿¿¿¿B.Cþƒ¿¿¿¿¿C
+  close($fh);
+
+  my (undef, $outzip) = korap_tempfile('script_out5');
+
+  binmode STDERR, qw{ :encoding(UTF-8) }; # because output 'textid=...' goes to STDERR (see script/tei2korapxml)
+
+  stderr_like(
+    sub { `cat '$tplfile' | perl '$script' -ti > '$outzip'` },
+    qr!tei2korapxml: .*? text_id=$text_sigle_lax!, # see above: print $fh encode_utf8($tpl);
+  );
+};
+
 done_testing;