Support Latin-1 input, UTF-8 output, and the <co> field
Change-Id: I50a0d457207bef8dbc60b3fddd628d41490cceee
diff --git a/lib/KorAP/Def.pm b/lib/KorAP/Def.pm
index b71dce8..a204c5c 100644
--- a/lib/KorAP/Def.pm
+++ b/lib/KorAP/Def.pm
@@ -1,6 +1,7 @@
package KorAP::Def;
use KorAP::VirtualCorpus::Group;
use IO::Uncompress::Bunzip2 qw($Bunzip2Error);
+use Mojo::Util qw!decode!;
use IO::File;
use strict;
use warnings;
@@ -79,6 +80,7 @@
while (!eof($fh)) {
my $line = <$fh>;
chomp $line;
+ $line = decode 'latin-1', $line;
# Skip empty lines
if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) {
@@ -234,7 +236,7 @@
elsif ($key eq 'co') {
# Country, z.B. DE für Text in Deutschland erschienen
- warn $key . ' is not yet supported';
+ ${$vc}->union_field(pubPlaceKey => $value);
}
elsif ($key eq 'tl') {
diff --git a/lib/KorAP/DefList.pm b/lib/KorAP/DefList.pm
index 076b97d..9f58856 100644
--- a/lib/KorAP/DefList.pm
+++ b/lib/KorAP/DefList.pm
@@ -3,6 +3,7 @@
use KorAP::VirtualCorpus::Group;
use KorAP::VirtualCorpus::Doc;
use File::Spec::Functions qw!catfile!;
+use Mojo::Util qw'encode';
use strict;
use warnings;
@@ -48,7 +49,6 @@
elsif ($line =~ /^([^\t]+?)\t+(.*?)\t(.+?)\/?$/) {
$self->regex_to_vc($1,$2,$3);
- # print $1,': /'.$3.'/',"\n";
}
elsif ($line =~ /^\s*$/) {
@@ -79,7 +79,8 @@
# Output
my $out = catfile($self->{output}, $file . '.json');
if (open(my $koral, '>' . $out)) {
- print $koral $def->to_string;
+ binmode($koral);
+ print $koral encode('utf-8', $def->to_string);
close($koral);
return;
};
@@ -98,7 +99,8 @@
# Output
my $out = catfile($self->{output}, $name . '.json');
if (open(my $koral, '>' . $out)) {
- print $koral $vc->to_string;
+ binmode($koral);
+ print $koral encode('utf-8', $vc->to_string);
close($koral);
return;
};
diff --git a/lib/KorAP/VirtualCorpus/Group.pm b/lib/KorAP/VirtualCorpus/Group.pm
index bcdc95c..8e77fff 100644
--- a/lib/KorAP/VirtualCorpus/Group.pm
+++ b/lib/KorAP/VirtualCorpus/Group.pm
@@ -76,6 +76,10 @@
sub to_koral {
my $self = shift;
+ unless ($self->{ops}) {
+ return;
+ };
+
# Single object
if (@{$self->{ops}} == 1) {
return $self->{ops}->[0]->name($self->name)->flatten;
diff --git a/script/cosmasvc2koralquery b/script/cosmasvc2koralquery
index 4c3e5b0..e3edd5f 100755
--- a/script/cosmasvc2koralquery
+++ b/script/cosmasvc2koralquery
@@ -6,6 +6,8 @@
use lib 'lib';
use Getopt::Long;
+binmode(STDERR, ':encoding(UTF-8)');
+
# 2020-05-20
# Preliminary support for C2 def-files.
# 2020-05-29
diff --git a/t/data/corp-a.def b/t/data/corp-a.def
new file mode 100644
index 0000000..ecc71d9
--- /dev/null
+++ b/t/data/corp-a.def
@@ -0,0 +1,3 @@
+<name>Korpora aus Österreich</name>
+<co>A</co>
+<end></end>
diff --git a/t/list2vc-def.t b/t/list2vc-def.t
index 95b212e..17bece0 100644
--- a/t/list2vc-def.t
+++ b/t/list2vc-def.t
@@ -5,9 +5,11 @@
use File::Basename;
use File::Spec::Functions;
use Data::Dumper;
+use utf8;
use Test::Output;
use Mojo::JSON 'decode_json';
+use Mojo::Util qw'decode encode';
my $script = catfile(dirname(__FILE__), '..', 'script', 'cosmasvc2koralquery');
my $list1 = catfile(dirname(__FILE__), 'data', 'list2.def');
@@ -88,5 +90,14 @@
is($json->{value}->[1], 'A97/APR/00002', 'Value');
is($json->{value}->[-1], 'A97/APR/01001', 'Value');
+my $corpa = catfile(dirname(__FILE__), 'data', 'corp-a.def');
+$json = decode_json(encode('utf-8',join('', `$script def $corpa`)));
+
+is($json->{'collection'}->{'@type'}, 'koral:doc', 'type');
+is($json->{'collection'}->{'key'}, 'pubPlaceKey', 'type');
+is($json->{'collection'}->{'value'}, 'A', 'type');
+is($json->{'collection'}->{'comment'}, 'name:"Korpora aus Österreich"', 'type');
+
+
done_testing;
__END__