Support Koral versioning
Change-Id: Id07e878843c790c55aca0e2da0869f4cd2e9150c
diff --git a/Changes b/Changes
index ce01343..db8eed7 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,7 @@
+0.37 2019-02-07
+ - Support for 'koral:field' array.
+ - Support for Koral versioning.
+
0.36 2019-01-22
- Support for non-word tokens (fixes #5).
diff --git a/Readme.pod b/Readme.pod
index 3b3ef8d..8cef05d 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -199,6 +199,17 @@
This is I<experimental>.
+=item B<--koral|-k>
+
+Version of the output format. Supported versions are:
+C<0> for legacy serialization, C<0.03> for serialization
+with metadata fields as key-value pairs on the root object,
+and C<0.4> for serialization with metadata fields as a list
+of C<"@type":"koral:field"> objects.
+
+Currently defaults to C<0.03>.
+
+
=item B<--sequential-extraction|-se>
Flag to indicate, if the C<jobs> value also applies to extraction.
diff --git a/lib/KorAP/XML/Batch/File.pm b/lib/KorAP/XML/Batch/File.pm
index c982bdd..1bb768b 100644
--- a/lib/KorAP/XML/Batch/File.pm
+++ b/lib/KorAP/XML/Batch/File.pm
@@ -19,6 +19,7 @@
layer => $param{layer} || 'Tokens',
anno => $param{anno} || [[]],
log => $param{log} || Mojo::Log->new(level => 'fatal'),
+ koral => $param{koral},
primary => $param{primary},
non_word_tokens => $param{non_word_tokens},
pretty => $param{pretty},
@@ -70,7 +71,11 @@
};
my $file;
- my $print_text = ($self->{pretty} ? $tokens->to_pretty_json(undef, $self->{primary}) : $tokens->to_json(undef, $self->{primary}));
+ my $print_text = (
+ $self->{pretty} ?
+ $tokens->to_pretty_json($self->{koral}, $self->{primary}) :
+ $tokens->to_json($self->{koral}, $self->{primary})
+ );
# There is an output file given
if ($output) {
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 89fadd0..b48f963 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -16,7 +16,7 @@
use Data::Dumper;
use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
-our $VERSION = '0.36';
+our $VERSION = '0.37';
has 'path';
has [qw/text_sigle doc_sigle corpus_sigle/];
diff --git a/script/korapxml2krill b/script/korapxml2krill
index f1ba9d4..669c0d3 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -132,10 +132,15 @@
#
# 2019/01/22
# - Support for non-word tokens.
+#
+# 2019/02/07
+# - Support for 'koral:field' array.
+# - Support for Koral versioning.
# ----------------------------------------------------------
-our $LAST_CHANGE = '2019/01/22';
+our $LAST_CHANGE = '2019/02/07';
our $LOCAL = $FindBin::Bin;
+our $KORAL_VERSION = 0.03;
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION
@@ -177,6 +182,7 @@
'primary|p!' => \(my $primary),
'pretty|y' => \(my $pretty),
'jobs|j=i' => \(my $jobs),
+ 'koral|k=f' => \(my $koral),
'to-tar' => \(my $to_tar),
'non-word-tokens|nwt' => \(my $non_word_tokens),
'sequential-extraction|se' => \(my $sequential_extraction),
@@ -222,6 +228,11 @@
$jobs = $config{jobs};
};
+ # Koral version
+ if (!defined($koral) && defined $config{koral}) {
+ $koral = $config{koral};
+ };
+
# Input root base directory
if (!defined($input_base) && defined $config{'input-base'}) {
$input_base = $config{'input-base'};
@@ -324,6 +335,7 @@
$cache_file //= 'korapxml2krill.cache';
$cache_size //= '50m';
$jobs //= 0;
+$koral //= $KORAL_VERSION;
$cache_delete //= 1;
$cache_init //= 1;
$sequential_extraction //= 0;
@@ -556,6 +568,7 @@
layer => $token_base_layer,
gzip => $gzip,
log => $log,
+ koral => $koral,
primary => $primary,
pretty => $pretty,
anno => \@filtered_anno,
@@ -1253,6 +1266,17 @@
This is I<experimental>.
+=item B<--koral|-k>
+
+Version of the output format. Supported versions are:
+C<0> for legacy serialization, C<0.03> for serialization
+with metadata fields as key-value pairs on the root object,
+and C<0.4> for serialization with metadata fields as a list
+of C<"@type":"koral:field"> objects.
+
+Currently defaults to C<0.03>.
+
+
=item B<--sequential-extraction|-se>
Flag to indicate, if the C<jobs> value also applies to extraction.
diff --git a/t/script/single.t b/t/script/single.t
index 24e2559..3f43da5 100644
--- a/t/script/single.t
+++ b/t/script/single.t
@@ -30,6 +30,7 @@
'--input' => $input,
'--output' => $output,
'--cache' => $cache,
+ '-k' => 0.03,
'-t' => 'OpenNLP#Tokens',
'-l' => 'INFO'
);
@@ -247,6 +248,69 @@
is($json->{title}, 'Autobiographische Einzelheiten', 'title');
is($json->{data}->{stream}->[0]->[-1], '~:base/s:pb$<i>529<i>0', 'Pagebreak annotation');
+
+
+# Koral version 0.4: run the script with '-k 0.4' and verify the output
+$input = catdir($f, '..', 'annotation', 'corpus', 'doc', '0001');
+$call = join(
+ ' ',
+ 'perl', $script,
+ '--input' => $input,
+ '--output' => $output,
+ '--cache' => $cache,
+ '-t' => 'OpenNLP#Tokens',
+ '-k' => 0.4,
+ '-l' => 'INFO'
+);
+
+$call .= ' -w ';
+
+stderr_like(
+ sub {
+ system($call);
+ },
+ qr!The code took!,
+ $call
+);
+
+ok(-f $output, 'Output does exist');
+ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
+ok(($json = decode_json $file), 'decode json');
+ok(!$json->{textType}, 'No textType on the root object');
+ok(!$json->{title}, 'No title on the root object');
+
+# Metadata is expected in the 'fields' list as 'koral:field' objects
+is($json->{fields}->[0]->{key}, 'corpusSigle', 'corpusSigle key');
+is($json->{fields}->[0]->{type}, 'type:string', 'corpusSigle type');
+is($json->{fields}->[0]->{value}, 'Corpus', 'corpusSigle value');
+is($json->{fields}->[0]->{'@type'}, 'koral:field', 'corpusSigle @type');
+
+is($json->{fields}->[8]->{key}, 'textClass', 'textClass key');
+is($json->{fields}->[8]->{value}->[0], 'freizeit-unterhaltung', 'textClass first value');
+is($json->{fields}->[8]->{value}->[1], 'vereine-veranstaltungen', 'textClass second value');
+is($json->{fields}->[8]->{type}, 'type:keywords', 'textClass type');
+is($json->{fields}->[8]->{'@type'}, 'koral:field', 'textClass @type');
+
+is($json->{fields}->[13]->{key}, 'textType', 'textType key');
+is($json->{fields}->[13]->{value}, 'Zeitung: Tageszeitung', 'textType value');
+is($json->{fields}->[13]->{type}, 'type:string', 'textType type');
+is($json->{fields}->[13]->{'@type'}, 'koral:field', 'textType @type');
+
+is($json->{fields}->[21]->{key}, 'title', 'title key');
+is($json->{fields}->[21]->{value}, 'Beispiel Text', 'title value');
+is($json->{fields}->[21]->{type}, 'type:text', 'title type');
+is($json->{fields}->[21]->{'@type'}, 'koral:field', 'title @type');
+
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
+is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
+like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
+is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
+
+# Delete output
+unlink $output;
+ok(!-f $output, 'Output does not exist');
+
+
done_testing;
__END__
diff --git a/t/tokenization.t b/t/tokenization.t
index 4e883d1..33b0911 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t
@@ -4,6 +4,7 @@
use utf8;
use Test::More;
use JSON::XS;
+use Data::Dumper;
use File::Basename 'dirname';
use File::Spec::Functions 'catdir';
@@ -70,6 +71,10 @@
is($json->{fields}->[6]->{key}, 'creationDate');
is($json->{fields}->[6]->{value}, '2005');
+is($json->{data}->{name}, 'tokens');
+is($json->{data}->{tokenSource}, 'opennlp#tokens');
+is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>6083<i>1067<b>0');
+
done_testing;
__END__