Use implicit default UTF-8 encoding instead of explicit decode/encode calls
Saves 10% of the run time.
Change-Id: I7b4aa14b8469dbd9e49bc9330449e07f50a11dac
diff --git a/script/korapxml2conllu b/script/korapxml2conllu
index 7f70440..40a06d3 100755
--- a/script/korapxml2conllu
+++ b/script/korapxml2conllu
@@ -6,7 +6,6 @@
use Log::Any::Adapter;
use Pod::Usage;
use Getopt::Long qw(GetOptions :config no_auto_abbrev);
-use Encode;
my $MAX_SENTENCE_LENGTH=10000;
my $COMMENT_START="#";
@@ -35,6 +34,8 @@
DEBUG => $ENV{KORAPXMLCONLLU_DEBUG} // 0
};
+use open ':std', ':encoding(UTF-8)';
+
GetOptions(
'sigle-pattern|p=s' => \(my $sigle_pattern = ''),
'extract-attributes-regex|e=s' => \(my $extract_attributes_regex = ''),
@@ -221,7 +222,7 @@
}
$token=~s/[\t\n\r]//g; # make sure that tokens never contain tabs or newlines
@conll = ("_") x 10;
- $conll[$FORM_idx] = encode("utf-8", $token);
+ $conll[$FORM_idx] = $token;
if($baseOnly) {
my @vals = ($current_from, $current_to);
# $log->debug("joining : ", join(" ", @vals));
@@ -291,11 +292,11 @@
return if(!$offsets);
print "$COMMENT_START start_offsets = ", $current_lines[0]->[0];
foreach my $t (@current_lines) {
- print STDOUT " $t->[0]";
+ print " $t->[0]";
}
print "$COMMENT_END\n$COMMENT_START end_offsets = ", $current_lines[$#current_lines]->[1] if($comments);
foreach my $t (@current_lines) {
- print STDOUT " $t->[1]";
+ print " $t->[1]";
}
print "$COMMENT_END\n";
}
@@ -308,7 +309,7 @@
chomp $current;
$current .= "\n\n";
printTokenRanges();
- print STDOUT $current;
+ print $current;
}
}
@@ -373,7 +374,7 @@
}
}
} elsif (m@<text>(.*)</text>@) {
- $_= decode("utf-8", $1, Encode::FB_DEFAULT);
+ $_= $1;
s/&lt;/</go;
s/&gt;/>/go;
s/&amp;/&/go;
@@ -381,7 +382,7 @@
$plain_texts{$docid} = $_;
last if(!$extract_attributes_regex && ($text_count++ > 1 && $plain_texts{$target_id} && (!$extract_metadata || $metadata{$target_id})));
} elsif (m@<text>(.*)@) {
- $_= decode("utf-8", $1, Encode::FB_DEFAULT);
+ $_= $1;
s/&lt;/</go;
s/&gt;/>/go;
s/&amp;/&/go;
@@ -389,7 +390,7 @@
$plain_texts{$docid} = "$_ ";
$text_started=1;
} elsif ($text_started && m@(.*)</text>@) {
- $_= decode("utf-8", $1, Encode::FB_DEFAULT);
+ $_= $1;
s/&lt;/</go;
s/&gt;/>/go;
s/&amp;/&/go;
@@ -399,7 +400,6 @@
last if(!$extract_attributes_regex && ($text_count++ > 1 && $plain_texts{$target_id} && (!$extract_metadata || $metadata{$target_id})));
} elsif ($text_started) {
chomp;
- $_ = decode("utf-8", $_, Encode::FB_DEFAULT) . ' ';
s/&lt;/</go;
s/&gt;/>/go;
s/&amp;/&/go;
@@ -416,7 +416,7 @@
} else {
$log->debug("Using $docid");
}
- }
+ }
return(1);
}