Switch input encoding based on the XML declaration
Change-Id: I89e20c8af762615d37c216b0c89227fc3644fcb3
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 8c35e90..8ace82a 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -9,8 +9,7 @@
use File::Basename qw(dirname);
-use open qw(:std :utf8); # assume utf-8 encoding
-use Encode qw(encode decode);
+use Encode qw(decode);
use XML::CompactTree::XS;
use XML::LibXML::Reader;
@@ -215,6 +214,7 @@
binmode $input_fh;
my $pos;
+my $input_enc = 'UTF-8';
my $l = length('</' . $_TEXT_BODY) + 1;
# ~ loop (reading input document) ~
@@ -223,6 +223,14 @@
$_ = remove_xml_comments( $input_fh, $_ ); # remove HTML (multi-line) comments (<!--...-->)
+ # Set input encoding from the XML declaration (whitespace around '=' is legal; quotes are excluded explicitly because \1 is an octal escape, not a backreference, inside [...])
+ if ( index($_, '<?xml') == 0 && $_ =~ /\sencoding\s*=\s*(['"])([^'"]+)\1/) {
+ $input_enc = $2;
+ next;
+ };
+
+ $_ = decode($input_enc, $_);
+
if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
# ~ start of text body ~
@@ -243,6 +251,7 @@
while (<$input_fh>) {
$_ = remove_xml_comments( $input_fh, $_ );
+ $_ = decode($input_enc, $_);
# ~ end of text body ~
if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
@@ -418,7 +427,7 @@
};
# Parse header
- my $header = KorAP::XML::TEI::Header->new($content)->parse($input_fh);
+ my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);
# Header was parseable
if ($header) {