change utf8_encode and utf8_decode
ensure strictly valid UTF-8 output by using utf-8-strict instead of utf8
(see in Encode: 'encode_utf8' and 'UTF-8 vs. utf8 vs. UTF8'
and in perlunifaq: What's the difference between "UTF-8" and "utf8"?)
Change-Id: I6d8797ddd24339ecf2ab4ccacad3801a6a054ca2
diff --git a/lib/KorAP/XML/TEI/Header.pm b/lib/KorAP/XML/TEI/Header.pm
index 3d9c06d..ddb9a25 100644
--- a/lib/KorAP/XML/TEI/Header.pm
+++ b/lib/KorAP/XML/TEI/Header.pm
@@ -1,7 +1,7 @@
package KorAP::XML::TEI::Header;
use strict;
use warnings;
-use Encode qw(encode_utf8);
+use Encode qw(encode decode);
# Parsing of i5 header files
@@ -98,10 +98,10 @@
die "ERROR ($0): main(): input line number $.: line with sigle-tag is not in expected format ... => Aborting\n\tline=$_"
if $pfx !~ /^\s*$/ || $sfx !~ m!^</$sig_type>\s*$! || $sig =~ /^\s*$/;
- $self->[SIGLE] = encode_utf8($sig);
+ $self->[SIGLE] = encode('UTF-8' , $sig);
# Escape sig
- my $sig_esc = $self->sigle_esc;
+ my $sig_esc = decode('UTF-8', $self->sigle_esc);
# replace sigle in header, if there's an escaped version that differs
s!(<$sig_type(?: [^>]*)?>)[^<]+</$sig_type>!$1$sig_esc</$sig_type>! if $sig_esc ne $sig;
@@ -173,7 +173,7 @@
# Write data to zip stream
sub to_zip {
my ($self, $zip) = @_;
- $zip->print(encode_utf8($self->to_string));
+ $zip->print(encode('UTF-8', $self->to_string));
};
diff --git a/script/tei2korapxml b/script/tei2korapxml
index f0e04c3..40ad372 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -8,7 +8,7 @@
use File::Basename qw(dirname);
use open qw(:std :utf8); # assume utf-8 encoding
-use Encode qw(encode_utf8 decode_utf8);
+use Encode qw(encode decode);
use XML::CompactTree::XS;
use XML::LibXML::Reader;
@@ -25,6 +25,7 @@
use KorAP::XML::TEI::Zipper;
use KorAP::XML::TEI::Header;
+
our $VERSION = '0.01';
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
@@ -242,6 +243,8 @@
# prevents segfaulting of 'XML::LibXML::Reader' inside 'main()' - see notes on 'PerlIO layers' in 'man XML::LibXML')
# removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
+ # see in perluniintro: You can switch encodings on an already opened stream by using "binmode()
+ # see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.
binmode $input_fh;
@@ -345,13 +348,13 @@
$cons_tok->reset;
}
- $data = encode_utf8( $data );
-
print STDERR "DEBUG ($0): main(): Writing (utf8-formatted) xml file $_root_dir$dir/$_data_file\n" if $_DEBUG;
-
$data =~ s/(&|<|>)/$ent{$1}/g;
+ # convert text strings to binary strings
+ $data = encode( "UTF-8", $data );
+
$zipper->new_stream("$_root_dir$dir/$_data_file")
->print("$data_prfx1$text_id_esc$data_prfx2$data$data_sfx");
@@ -474,7 +477,7 @@
$text_id_esc = $header->id_esc;
# log output for seeing progression
- print STDERR "$0: main(): text_id=".decode_utf8( $text_id )."\n";
+ print STDERR "$0: main(): text_id=".decode("UTF-8", $text_id )."\n";
$tl = 0; # reset (needed for ~ whitespace handling ~)
};
@@ -868,7 +871,7 @@
$output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\""
- .decode_utf8($text_id_esc)."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n <spanList>\n";
+ .decode( "UTF-8", $text_id_esc )."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n <spanList>\n"; # convert binary string to text string
$c = 0;
@@ -926,7 +929,7 @@
$output .= " </spanList>\n</layer>";
- $output = encode_utf8( $output );
+ $output = encode( "UTF-8", $output ); # convert text string to binary string
$zipper->new_stream("$_root_dir$dir/$_structure_dir/$_structure_file")
->print($output);
@@ -950,7 +953,7 @@
$output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\""
- .decode_utf8($text_id_esc)."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n <spanList>\n";
+ .decode( "UTF-8", $text_id_esc )."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n <spanList>\n"; # convert binary string to text string
$c = 0;
@@ -1023,7 +1026,7 @@
$output .= " </spanList>\n</layer>";
- $output = encode_utf8( $output );
+ $output = encode( "UTF-8", $output ); # convert text string to binary string
$zipper->new_stream("$_root_dir$dir/$_tokens_dir/$_tokens_file")
->print($output);
diff --git a/t/data/template.i5.xml b/t/data/template.i5.xml
new file mode 100644
index 0000000..9e0b26d
--- /dev/null
+++ b/t/data/template.i5.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-I5 1.0//EN" "http://corpora.ids-mannheim.de/I5/DTD/i5.dtd">
+<idsCorpus>
+ <idsHeader type="corpus">
+ <fileDesc>
+ <titleStmt>
+ <korpusSigle>[KORPUSSIGLE]</korpusSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <idsDoc version="1.0">
+ <idsHeader type="document">
+ <fileDesc>
+ <titleStmt>
+ <dokumentSigle>[DOKUMENTSIGLE]</dokumentSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <idsText version="1.0">
+ <idsHeader type="text">
+ <fileDesc>
+ <titleStmt>
+ <textSigle>[TEXTSIGLE]</textSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <text>
+ [TEXT]
+ </text>
+ </idsText>
+ </idsDoc>
+</idsCorpus>
diff --git a/t/script.t b/t/script.t
index f7f9468..31ff24b 100644
--- a/t/script.t
+++ b/t/script.t
@@ -2,6 +2,7 @@
use warnings;
use File::Basename 'dirname';
use File::Spec::Functions qw/catfile/;
+use Encode qw!encode_utf8 decode_utf8 encode!;
use IO::Uncompress::Unzip qw(unzip $UnzipError);
use Test::More;
@@ -12,6 +13,7 @@
BEGIN {
unshift @INC, "$FindBin::Bin/../lib";
};
+
use Test::KorAP::XML::TEI qw!korap_tempfile!;
my $f = dirname(__FILE__);
@@ -340,4 +342,52 @@
ok($zip, 'External generated');
};
+
+subtest 'Test utf-8 handling' => sub {
+
+ # Load template file
+ $file = catfile($f, 'data', 'template.i5.xml');
+ my $tpl = '';
+ {
+ open($fh, $file);
+ $tpl .= <$fh> while !eof($fh);
+ close($fh);
+ }
+
+ # Introduce invalid utf-8 characters
+ my $text_sigle;
+ { no warnings;
+ # $text_sigle printed to file, without encoding: Aþ¿¿¿¿¿A_Bþ¿¿¿¿¿B.Cþ¿¿¿¿¿C
+ # the utf8-sequence 'þ¿¿¿¿¿' encodes 32 bit of data (see 0x7FFF_FFFF in perlunicode)
+ $text_sigle = "A\x{FFFF_FFFF}A_B\x{FFFF_FFFF}B.C\x{FFFF_FFFF}C" }
+ # If CHECK is 0, encoding and decoding replace any malformed character with a substitution character.
+ # � = substitution character
+ my $text_sigle_lax = encode_utf8($text_sigle);
+ my $text_sigle_esc = encode('UTF-8', $text_sigle);
+
+ is(length($text_sigle), 11); # A�A_B�B.C�C (char string => length(�) = 1)
+ is(length($text_sigle_lax), 29); # Aþ¿¿¿¿¿A_Bþ¿¿¿¿¿B.Cþ¿¿¿¿¿C (byte string)
+ is(length($text_sigle_esc), 17); # A�A_B�B.C�C (byte string => length(�) = 3)
+
+ { no warnings;
+ $tpl =~ s!\[KORPUSSIGLE\]!A\x{FFFF_FFFF}A!;
+ $tpl =~ s!\[DOKUMENTSIGLE\]!A\x{FFFF_FFFF}A_B\x{FFFF_FFFF}B!;
+ $tpl =~ s!\[TEXT\]!<p>d\x{FFFF_FFFF}d e\x{FFFF_FFFF}e f\x{FFFF_FFFF}f</p>! }
+ $tpl =~ s!\[TEXTSIGLE\]!$text_sigle!;
+
+ my ($fh, $tplfile) = korap_tempfile('script_out4');
+ binmode($fh);
+ print $fh encode_utf8($tpl); # => text_id=Aþ¿¿¿¿¿A_Bþ¿¿¿¿¿B.Cþ¿¿¿¿¿C
+ close($fh);
+
+ my (undef, $outzip) = korap_tempfile('script_out5');
+
+ binmode STDERR, qw{ :encoding(UTF-8) }; # because output 'textid=...' goes to STDERR (see script/tei2korapxml)
+
+ stderr_like(
+ sub { `cat '$tplfile' | perl '$script' -ti > '$outzip'` },
+ qr!tei2korapxml: .*? text_id=$text_sigle_lax!, # see above: print $fh encode_utf8($tpl);
+ );
+};
+
done_testing;