Replace encode_utf8 and decode_utf8 with encode("UTF-8", ...) and decode("UTF-8", ...)
Ensure strictly valid UTF-8 output by using the strict "UTF-8" encoding (utf-8-strict) instead of Perl's lax "utf8"
(see the Encode documentation, sections 'encode_utf8' and 'UTF-8 vs. utf8 vs. UTF8',
and perlunifaq: 'What's the difference between "UTF-8" and "utf8"?').
Change-Id: I6d8797ddd24339ecf2ab4ccacad3801a6a054ca2
diff --git a/script/tei2korapxml b/script/tei2korapxml
index f0e04c3..40ad372 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -8,7 +8,7 @@
use File::Basename qw(dirname);
use open qw(:std :utf8); # assume utf-8 encoding
-use Encode qw(encode_utf8 decode_utf8);
+use Encode qw(encode decode);
use XML::CompactTree::XS;
use XML::LibXML::Reader;
@@ -25,6 +25,7 @@
use KorAP::XML::TEI::Zipper;
use KorAP::XML::TEI::Header;
+
our $VERSION = '0.01';
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
@@ -242,6 +243,8 @@
# prevents segfaulting of 'XML::LibXML::Reader' inside 'main()' - see notes on 'PerlIO layers' in 'man XML::LibXML')
# removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
+ # see in perluniintro: You can switch encodings on an already opened stream by using "binmode()"
+ # see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.
binmode $input_fh;
@@ -345,13 +348,13 @@
$cons_tok->reset;
}
- $data = encode_utf8( $data );
-
print STDERR "DEBUG ($0): main(): Writing (utf8-formatted) xml file $_root_dir$dir/$_data_file\n" if $_DEBUG;
-
$data =~ s/(&|<|>)/$ent{$1}/g;
+ # convert text strings to binary strings
+ $data = encode( "UTF-8", $data );
+
$zipper->new_stream("$_root_dir$dir/$_data_file")
->print("$data_prfx1$text_id_esc$data_prfx2$data$data_sfx");
@@ -474,7 +477,7 @@
$text_id_esc = $header->id_esc;
# log output for seeing progression
- print STDERR "$0: main(): text_id=".decode_utf8( $text_id )."\n";
+ print STDERR "$0: main(): text_id=".decode("UTF-8", $text_id )."\n";
$tl = 0; # reset (needed for ~ whitespace handling ~)
};
@@ -868,7 +871,7 @@
$output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\""
- .decode_utf8($text_id_esc)."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n <spanList>\n";
+ .decode( "UTF-8", $text_id_esc )."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n <spanList>\n"; # convert binary string to text string
$c = 0;
@@ -926,7 +929,7 @@
$output .= " </spanList>\n</layer>";
- $output = encode_utf8( $output );
+ $output = encode( "UTF-8", $output ); # convert text string to binary string
$zipper->new_stream("$_root_dir$dir/$_structure_dir/$_structure_file")
->print($output);
@@ -950,7 +953,7 @@
$output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\""
- .decode_utf8($text_id_esc)."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n <spanList>\n";
+ .decode( "UTF-8", $text_id_esc )."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n <spanList>\n"; # convert binary string to text string
$c = 0;
@@ -1023,7 +1026,7 @@
$output .= " </spanList>\n</layer>";
- $output = encode_utf8( $output );
+ $output = encode( "UTF-8", $output ); # convert text string to binary string
$zipper->new_stream("$_root_dir$dir/$_tokens_dir/$_tokens_file")
->print($output);