Replace encode_utf8 and decode_utf8 with encode/decode("UTF-8", ...)
ensure strictly valid UTF-8 output by using utf-8-strict instead of utf8
(see 'encode_utf8' and 'UTF-8 vs. utf8 vs. UTF8' in the Encode documentation,
 and 'What's the difference between "UTF-8" and "utf8"?' in perlunifaq)
Change-Id: I6d8797ddd24339ecf2ab4ccacad3801a6a054ca2
diff --git a/script/tei2korapxml b/script/tei2korapxml
index f0e04c3..40ad372 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -8,7 +8,7 @@
 use File::Basename qw(dirname);
 
 use open qw(:std :utf8); # assume utf-8 encoding
-use Encode qw(encode_utf8 decode_utf8);
+use Encode qw(encode decode);
 
 use XML::CompactTree::XS;
 use XML::LibXML::Reader;
@@ -25,6 +25,7 @@
 use KorAP::XML::TEI::Zipper;
 use KorAP::XML::TEI::Header;
 
+
 our $VERSION = '0.01';
 
 our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
@@ -242,6 +243,8 @@
 
   # prevents segfaulting of 'XML::LibXML::Reader' inside 'main()' - see notes on 'PerlIO layers' in  'man XML::LibXML')
   # removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
+  # see in perluniintro: "You can switch encodings on an already opened stream by using binmode()"
+  # see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.
   binmode $input_fh;
 
 
@@ -345,13 +348,13 @@
           $cons_tok->reset;
         }
 
-        $data = encode_utf8( $data );
-
         print STDERR "DEBUG ($0): main(): Writing (utf8-formatted) xml file $_root_dir$dir/$_data_file\n" if $_DEBUG;
 
-
         $data =~ s/(&|<|>)/$ent{$1}/g;
 
+        # convert text strings to binary strings
+        $data        = encode( "UTF-8", $data );
+
         $zipper->new_stream("$_root_dir$dir/$_data_file")
           ->print("$data_prfx1$text_id_esc$data_prfx2$data$data_sfx");
 
@@ -474,7 +477,7 @@
           $text_id_esc = $header->id_esc;
 
           # log output for seeing progression
-          print STDERR "$0: main(): text_id=".decode_utf8( $text_id )."\n";
+          print STDERR "$0: main(): text_id=".decode("UTF-8", $text_id )."\n";
 
           $tl = 0; # reset (needed for ~ whitespace handling ~)
         };
@@ -868,7 +871,7 @@
 
   $output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
            ." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\""
-           .decode_utf8($text_id_esc)."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n  <spanList>\n";
+           .decode( "UTF-8", $text_id_esc )."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n  <spanList>\n"; # convert binary string to text string
 
   $c = 0;
 
@@ -926,7 +929,7 @@
 
   $output .= "  </spanList>\n</layer>";
 
-  $output = encode_utf8( $output );
+  $output = encode( "UTF-8", $output ); # convert text string to binary string
 
   $zipper->new_stream("$_root_dir$dir/$_structure_dir/$_structure_file")
     ->print($output);
@@ -950,7 +953,7 @@
 
   $output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
            ." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\""
-           .decode_utf8($text_id_esc)."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n  <spanList>\n";
+           .decode( "UTF-8", $text_id_esc )."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n  <spanList>\n"; # convert binary string to text string
 
   $c = 0;
 
@@ -1023,7 +1026,7 @@
 
   $output .= "  </spanList>\n</layer>";
 
-  $output = encode_utf8( $output );
+  $output = encode( "UTF-8", $output ); # convert text string to binary string
 
   $zipper->new_stream("$_root_dir$dir/$_tokens_dir/$_tokens_file")
     ->print($output);