Support koral versioning

Change-Id: Id07e878843c790c55aca0e2da0869f4cd2e9150c

commit: 263274ce22923765fb6ab0723bb7558cf85230e2 [log] [tgz]
author: Akron <nils@diewald-online.de> Thu Feb 07 09:48:30 2019 +0100
committer: Akron <nils@diewald-online.de> Thu Feb 07 09:48:30 2019 +0100
tree: 55a3d7389c68c340c637b2cb80c18d9a15f39146
parent: c526e75f0e5b2e2bb232d173cad3e6780eb01944 [diff]
diff --git a/Changes b/Changes
index ce01343..db8eed7 100644
--- a/Changes
+++ b/Changes

@@ -1,3 +1,7 @@
+0.37 2019-02-07
+        - Support for 'koral:field' array.
+        - Support for Koral versioning.
+
 0.36 2019-01-22
         - Support for non-word tokens (fixes #5).
 

diff --git a/Readme.pod b/Readme.pod
index 3b3ef8d..8cef05d 100644
--- a/Readme.pod
+++ b/Readme.pod

@@ -199,6 +199,17 @@
 This is I<experimental>.
 
 
+=item B<--koral|-k>
+
+Version of the output format. Supported versions are:
+C<0> for legacy serialization, C<0.03> for serialization
+with metadata fields as key-values on the root object, and
+C<0.4> for serialization with metadata fields as a list
+of C<"@type":"koral:field"> objects.
+
+Currently defaults to C<0.03>.
+
+
 =item B<--sequential-extraction|-se>
 
 Flag to indicate whether the C<jobs> value also applies to extraction.

diff --git a/lib/KorAP/XML/Batch/File.pm b/lib/KorAP/XML/Batch/File.pm
index c982bdd..1bb768b 100644
--- a/lib/KorAP/XML/Batch/File.pm
+++ b/lib/KorAP/XML/Batch/File.pm

@@ -19,6 +19,7 @@
     layer           => $param{layer}     || 'Tokens',
     anno            => $param{anno}      || [[]],
     log             => $param{log}       || Mojo::Log->new(level => 'fatal'),
+    koral           => $param{koral},
     primary         => $param{primary},
     non_word_tokens => $param{non_word_tokens},
     pretty          => $param{pretty},
@@ -70,7 +71,11 @@
   };
 
   my $file;
-  my $print_text = ($self->{pretty} ? $tokens->to_pretty_json(undef, $self->{primary}) : $tokens->to_json(undef, $self->{primary}));
+  my $print_text = (
+    $self->{pretty} ?
+      $tokens->to_pretty_json($self->{koral}, $self->{primary}) :
+      $tokens->to_json($self->{koral}, $self->{primary})
+    );
 
   # There is an output file given
   if ($output) {

diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 89fadd0..b48f963 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm

@@ -16,7 +16,7 @@
 use Data::Dumper;
 use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
 
-our $VERSION = '0.36';
+our $VERSION = '0.37';
 
 has 'path';
 has [qw/text_sigle doc_sigle corpus_sigle/];

diff --git a/script/korapxml2krill b/script/korapxml2krill
index f1ba9d4..669c0d3 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill

@@ -132,10 +132,15 @@
 #
 # 2019/01/22
 # - Support for non-word tokens.
+#
+# 2019/02/07
+# - Support for 'koral:field' array.
+# - Support for Koral versioning.
 # ----------------------------------------------------------
 
-our $LAST_CHANGE = '2019/01/22';
+our $LAST_CHANGE = '2019/02/07';
 our $LOCAL = $FindBin::Bin;
+our $KORAL_VERSION = 0.03;
 our $VERSION_MSG = <<"VERSION";
 Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
 VERSION
@@ -177,6 +182,7 @@
   'primary|p!'  => \(my $primary),
   'pretty|y'    => \(my $pretty),
   'jobs|j=i'    => \(my $jobs),
+  'koral|k=f'    => \(my $koral),
   'to-tar'      => \(my $to_tar),
   'non-word-tokens|nwt' => \(my $non_word_tokens),
   'sequential-extraction|se' => \(my $sequential_extraction),
@@ -222,6 +228,11 @@
     $jobs = $config{jobs};
   };
 
+  # Koral version
+  if (!defined($koral) && defined $config{koral}) {
+    $koral = $config{koral};
+  };
+
   # Input root base directory
   if (!defined($input_base) && defined $config{'input-base'}) {
     $input_base = $config{'input-base'};
@@ -324,6 +335,7 @@
 $cache_file          //= 'korapxml2krill.cache';
 $cache_size          //= '50m';
 $jobs                //= 0;
+$koral               //= $KORAL_VERSION;
 $cache_delete        //= 1;
 $cache_init          //= 1;
 $sequential_extraction //= 0;
@@ -556,6 +568,7 @@
   layer     => $token_base_layer,
   gzip      => $gzip,
   log       => $log,
+  koral     => $koral,
   primary   => $primary,
   pretty    => $pretty,
   anno      => \@filtered_anno,
@@ -1253,6 +1266,17 @@
 This is I<experimental>.
 
 
+=item B<--koral|-k>
+
+Version of the output format. Supported versions are:
+C<0> for legacy serialization, C<0.03> for serialization
+with metadata fields as key-values on the root object, and
+C<0.4> for serialization with metadata fields as a list
+of C<"@type":"koral:field"> objects.
+
+Currently defaults to C<0.03>.
+
+
 =item B<--sequential-extraction|-se>
 
 Flag to indicate whether the C<jobs> value also applies to extraction.

diff --git a/t/script/single.t b/t/script/single.t
index 24e2559..3f43da5 100644
--- a/t/script/single.t
+++ b/t/script/single.t

@@ -30,6 +30,7 @@
   '--input' => $input,
   '--output' => $output,
   '--cache' => $cache,
+  '-k' => 0.03,
   '-t' => 'OpenNLP#Tokens',
   '-l' => 'INFO'
 );
@@ -247,6 +248,69 @@
 is($json->{title}, 'Autobiographische Einzelheiten', 'title');
 is($json->{data}->{stream}->[0]->[-1], '~:base/s:pb$<i>529<i>0', 'Pagebreak annotation');
 
+
+
+# Koral version
+$input = catdir($f, '..', 'annotation', 'corpus', 'doc', '0001');
+$call = join(
+  ' ',
+  'perl', $script,
+  '--input' => $input,
+  '--output' => $output,
+  '--cache' => $cache,
+  '-t' => 'OpenNLP#Tokens',
+  '-k' => 0.4,
+  '-l' => 'INFO'
+);
+
+$call .= ' -w ';
+
+stderr_like(
+  sub {
+    system($call);
+  },
+  qr!The code took!,
+  $call
+);
+
+ok(-f $output, 'Output does exist');
+ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
+ok(($json = decode_json $file), 'decode json');
+ok(!$json->{textType}, 'No textType on the root object');
+ok(!$json->{title}, 'No title on the root object');
+# With -k 0.4 the metadata is checked as entries of the 'fields' list.
+is($json->{fields}->[0]->{key}, 'corpusSigle', 'corpusSigle key');
+is($json->{fields}->[0]->{type}, 'type:string', 'corpusSigle type');
+is($json->{fields}->[0]->{value}, 'Corpus', 'corpusSigle value');
+is($json->{fields}->[0]->{'@type'}, 'koral:field', 'corpusSigle @type');
+# textClass carries a list of values.
+is($json->{fields}->[8]->{key}, 'textClass', 'textClass key');
+is($json->{fields}->[8]->{value}->[0], 'freizeit-unterhaltung', 'textClass first value');
+is($json->{fields}->[8]->{value}->[1], 'vereine-veranstaltungen', 'textClass second value');
+is($json->{fields}->[8]->{type}, 'type:keywords', 'textClass type');
+is($json->{fields}->[8]->{'@type'}, 'koral:field', 'textClass @type');
+# textType is looked up in the fields list (it is absent on the root).
+is($json->{fields}->[13]->{key}, 'textType', 'textType key');
+is($json->{fields}->[13]->{value}, 'Zeitung: Tageszeitung', 'textType value');
+is($json->{fields}->[13]->{type}, 'type:string', 'textType type');
+is($json->{fields}->[13]->{'@type'}, 'koral:field', 'textType @type');
+# title is looked up in the fields list (it is absent on the root).
+is($json->{fields}->[21]->{key}, 'title', 'title key');
+is($json->{fields}->[21]->{value}, 'Beispiel Text', 'title value');
+is($json->{fields}->[21]->{type}, 'type:text', 'title type');
+is($json->{fields}->[21]->{'@type'}, 'koral:field', 'title @type');
+# The 'data' section: token source, foundries, primary text and stream.
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
+is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
+like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
+is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource (repeated check)');
+
+# Delete output
+unlink $output;
+ok(!-f $output, 'Output does not exist');
+
+
 done_testing;
 __END__
 

diff --git a/t/tokenization.t b/t/tokenization.t
index 4e883d1..33b0911 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t

@@ -4,6 +4,7 @@
 use utf8;
 use Test::More;
 use JSON::XS;
+use Data::Dumper;
 
 use File::Basename 'dirname';
 use File::Spec::Functions 'catdir';
@@ -70,6 +71,10 @@
 is($json->{fields}->[6]->{key}, 'creationDate');
 is($json->{fields}->[6]->{value}, '2005');
 
+is($json->{data}->{name}, 'tokens');
+is($json->{data}->{tokenSource}, 'opennlp#tokens');
+is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>6083<i>1067<b>0');
+
 done_testing;
 
 __END__
commit	263274ce22923765fb6ab0723bb7558cf85230e2	[log] [tgz]
author	Akron <nils@diewald-online.de>	Thu Feb 07 09:48:30 2019 +0100
committer	Akron <nils@diewald-online.de>	Thu Feb 07 09:48:30 2019 +0100
tree	55a3d7389c68c340c637b2cb80c18d9a15f39146
parent	c526e75f0e5b2e2bb232d173cad3e6780eb01944 [diff]