Switch input encoding based on the XML declaration

Change-Id: I89e20c8af762615d37c216b0c89227fc3644fcb3
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 8c35e90..8ace82a 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -9,8 +9,7 @@
 
 use File::Basename qw(dirname);
 
-use open qw(:std :utf8); # assume utf-8 encoding
-use Encode qw(encode decode);
+use Encode qw(decode);
 
 use XML::CompactTree::XS;
 use XML::LibXML::Reader;
@@ -215,6 +214,7 @@
 binmode $input_fh;
 
 my $pos;
+my $input_enc = 'UTF-8';
 my $l = length('</' . $_TEXT_BODY) + 1;
 
 # ~ loop (reading input document) ~
@@ -223,6 +223,14 @@
 
   $_ = remove_xml_comments( $input_fh, $_ ); # remove HTML (multi-line) comments (<!--...-->)
 
+  # Set input encoding from the XML declaration (<?xml ... encoding="NAME" ?>)
+  if ( index($_, '<?xml') == 0 && $_ =~ /\sencoding\s*=\s*(['"])([A-Za-z][A-Za-z0-9._-]*)\1/) {
+    $input_enc = $2;
+    next; # skip the remainder of the declaration line
+  };
+
+  $_ = decode($input_enc, $_);
+
   if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
 
     # ~ start of text body ~
@@ -243,6 +251,7 @@
     while (<$input_fh>) {
 
       $_ = remove_xml_comments( $input_fh, $_ );
+      $_ = decode($input_enc, $_);
 
       # ~ end of text body ~
       if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
@@ -418,7 +427,7 @@
     };
 
     # Parse header
-    my $header = KorAP::XML::TEI::Header->new($content)->parse($input_fh);
+    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);
 
     # Header was parseable
     if ($header) {