Support non-verbal annotations
Change-Id: I6cc0e7c8279f523d3c4b66b14125866ec0be1695
diff --git a/Readme.pod b/Readme.pod
index 8e82eda..edc314b 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -127,7 +127,8 @@
Define the default tokenization by specifying
the name of the foundry and optionally the name
of the layer-file. Defaults to C<OpenNLP#tokens>.
-
+This will use the file directly instead of running
+the layer implementation!
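+For example, C<-t DGD#Annot> will take the token spans
+directly from the existing DGD annotation layer file.
+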
=item B<--base-sentences|-bs> <foundry>#<layer>
@@ -186,6 +187,15 @@
Defaults to unset.
+
+=item B<--non-verbal-tokens|-nvt>
+
+Tokenize non-verbal tokens that are marked in the primary data as
+the Unicode symbol 'Black Vertical Rectangle', aka C<\x{25AE}>.
+
+Defaults to unset.
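+
+For example, assuming F<mytext/> is the unzipped directory of a single
+KorAP-XML text and F<doc.json> is the desired output file, the
+following call will keep non-verbal tokens in the token stream:
+
+  korapxml2krill --input mytext/ --output doc.json \
+    -t DGD#Annot --non-verbal-tokens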
+
+
=item B<--jobs|-j>
Define the number of concurrent jobs in seperated forks
@@ -436,7 +446,7 @@
Corpus Analysis Platform at the
L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
member of the
-L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
+L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
This program is free software published under the
L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
diff --git a/lib/KorAP/XML/Batch/File.pm b/lib/KorAP/XML/Batch/File.pm
index 1456cf7..01a7cd1 100644
--- a/lib/KorAP/XML/Batch/File.pm
+++ b/lib/KorAP/XML/Batch/File.pm
@@ -22,6 +22,7 @@
koral => $param{koral},
primary => $param{primary},
non_word_tokens => $param{non_word_tokens},
+ non_verbal_tokens => $param{non_verbal_tokens},
pretty => $param{pretty},
gzip => $param{gzip} // 0
}, $class;
@@ -58,7 +59,8 @@
foundry => $self->{foundry},
layer => $self->{layer},
name => 'tokens',
- non_word_tokens => $self->{non_word_tokens}
+ non_word_tokens => $self->{non_word_tokens},
+ non_verbal_tokens => $self->{non_verbal_tokens}
);
# Unable to process base tokenization
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index 92c8d1f..f9595dd 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -35,6 +35,7 @@
has [qw/path foundry doc stream should have name/];
has layer => 'Tokens';
has non_word_tokens => 0;
+has non_verbal_tokens => 0;
has 'error';
@@ -117,7 +118,7 @@
# my $span = $_;
my $mtt;
my $distance = 0;
- my (@non_word_tokens);
+ # my (@non_word_tokens);
foreach my $span (@$tokens) {
my $from = $span->{'-from'};
my $to = $span->{'-to'};
@@ -136,9 +137,10 @@
# This token should be recognized
$should++;
- # Ignore non-word and non-number tokens (sorry!)
- if (!$self->non_word_tokens && $token !~ /[\w\d]/) {
-
+ # Ignore non-word, non-number, and non-verbal tokens by default
+ if ($self->non_verbal_tokens && ord($token) == 9646) {
+ # Keep the non-verbal token (U+25AE 'Black Vertical Rectangle', codepoint 9646)
+ } elsif (!$self->non_word_tokens && $token !~ /[\w\d]/) {
# TODO: Recognize punctuations!
# if ($mtt) {
# my $term = [$token, $from, $to];
diff --git a/script/korapxml2krill b/script/korapxml2krill
index e6754d9..56189aa 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -143,12 +143,13 @@
# 2019/08/08
# - Support for Talismane.
#
-# 2019/12/16
+# 2019/12/17
# - Added support for DGD pseudo-sentences
# based on anchor milestones.
+# - Support for non-verbal annotations.
# ----------------------------------------------------------
-our $LAST_CHANGE = '2019/12/16';
+our $LAST_CHANGE = '2019/12/17';
our $LOCAL = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG = <<"VERSION";
@@ -195,6 +196,7 @@
'koral|k=f' => \(my $koral),
'to-tar' => \(my $to_tar),
'non-word-tokens|nwt' => \(my $non_word_tokens),
+ 'non-verbal-tokens|nvt' => \(my $non_verbal_tokens),
'sequential-extraction|se' => \(my $sequential_extraction),
'cache-size|cs=s' => \(my $cache_size),
'cache-delete|cd!' => \(my $cache_delete),
@@ -258,11 +260,16 @@
$token_base = $config{token};
};
- # temporary-extract
+ # Non-word tokenization
if (!defined($non_word_tokens) && defined $config{'non-word-tokens'}) {
$non_word_tokens = $config{'non-word-tokens'};
};
+ # Non-verbal tokenization
+ if (!defined($non_verbal_tokens) && defined $config{'non-verbal-tokens'}) {
+ $non_verbal_tokens = $config{'non-verbal-tokens'};
+ };
+
# Cache file
if (!defined($cache_file) && defined $config{cache}) {
$cache_file = $config{cache};
@@ -354,6 +361,7 @@
$base_paragraphs //= '';
$base_pagebreaks //= '';
$non_word_tokens //= 0;
+$non_verbal_tokens //= 0;
$base_sentences = lc $base_sentences;
$base_paragraphs = lc $base_paragraphs;
@@ -592,7 +600,8 @@
primary => $primary,
pretty => $pretty,
anno => \@filtered_anno,
- non_word_tokens => $non_word_tokens
+ non_word_tokens => $non_word_tokens,
+ non_verbal_tokens => $non_verbal_tokens
);
# Get file name based on path information
@@ -1228,6 +1237,8 @@
Define the default tokenization by specifying
the name of the foundry and optionally the name
of the layer-file. Defaults to C<OpenNLP#tokens>.
+This will use the file directly instead of running
+the layer implementation!
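+For example, C<-t DGD#Annot> will take the token spans
+directly from the existing DGD annotation layer file.
+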
=item B<--base-sentences|-bs> <foundry>#<layer>
@@ -1287,6 +1298,15 @@
Defaults to unset.
+
+=item B<--non-verbal-tokens|-nvt>
+
+Tokenize non-verbal tokens that are marked in the primary data as
+the Unicode symbol 'Black Vertical Rectangle', aka C<\x{25AE}>.
+
+Defaults to unset.
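+
+For example, assuming F<mytext/> is the unzipped directory of a single
+KorAP-XML text and F<doc.json> is the desired output file, the
+following call will keep non-verbal tokens in the token stream:
+
+  korapxml2krill --input mytext/ --output doc.json \
+    -t DGD#Annot --non-verbal-tokens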
+
+
=item B<--jobs|-j>
Define the number of concurrent jobs in seperated forks
@@ -1537,7 +1557,7 @@
Corpus Analysis Platform at the
L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
member of the
-L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
+L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
This program is free software published under the
L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
diff --git a/t/real/agd.t b/t/real/agd.t
index 498f4e2..6ba7ebe 100644
--- a/t/real/agd.t
+++ b/t/real/agd.t
@@ -53,7 +53,7 @@
foundry => $token_base_foundry,
layer => $token_base_layer,
name => 'tokens',
- non_word_tokens => 1
+ non_verbal_tokens => 1
);
ok($tokens, 'Token Object is fine');
@@ -71,9 +71,9 @@
is($output->{version}, '0.03', 'version');
is($output->{data}->{foundries}, '', 'Foundries');
is($output->{data}->{layerInfos}, '', 'layerInfos');
-is($output->{data}->{stream}->[1]->[2], 's:ku', 'data');
-is($output->{data}->{stream}->[2]->[2], 's:sqn', 'data');
-is($output->{data}->{stream}->[3]->[2], 's:alxv', 'data');
+is($output->{data}->{stream}->[0]->[4], 's:ku', 'data');
+is($output->{data}->{stream}->[1]->[2], 's:sqn', 'data');
+is($output->{data}->{stream}->[2]->[2], 's:alxv', 'data');
is($output->{textSigle}, 'AGD/DOC/00001', 'Correct text sigle');
is($output->{docSigle}, 'AGD/DOC', 'Correct document sigle');
is($output->{corpusSigle}, 'AGD', 'Correct corpus sigle');
@@ -103,7 +103,7 @@
is($output->{data}->{layerInfos}, 'dereko/s=spans dgd/l=tokens dgd/p=tokens dgd/para=tokens',
'layerInfos');
-my $third_token = join('||', @{$output->{data}->{stream}->[3]});
+my $third_token = join('||', @{$output->{data}->{stream}->[2]});
like($third_token, qr!dgd/l:alui!);
like($third_token, qr!dgd/p:VMGWY!);
like($third_token, qr!i:alxv!);
@@ -116,13 +116,18 @@
# Offsets are suboptimal set, but good enough
$first_token = join('||', @{$output->{data}->{stream}->[0]});
-like($first_token, qr!<>:base/s:s\$<b>64<i>0<i>16<i>3<b>1!);
+like($first_token, qr!<>:base/s:s\$<b>64<i>0<i>16<i>2<b>1!);
my $token = join('||', @{$output->{data}->{stream}->[1]});
+like($token, qr!<>:base/s:s\$<b>64<i>16<i>23<i>4<b>1!);
+$token = join('||', @{$output->{data}->{stream}->[2]});
unlike($token, qr!<>:base/s:s!);
-$token = join('||', @{$output->{data}->{stream}->[2]});
-like($token, qr!<>:base/s:s\$<b>64<i>16<i>23<i>5<b>1!);
+$token = join('||', @{$output->{data}->{stream}->[3]});
+like($token, qr!<>:base/s:s\$<b>64<i>23<i>27<i>5<b>1!);
+
+$token = join('||', @{$output->{data}->{stream}->[5]});
+like($token, qr!dgd/para:pause!);
done_testing;
__END__
diff --git a/t/script/non_verbal_tokens.t b/t/script/non_verbal_tokens.t
new file mode 100644
index 0000000..fa460e4
--- /dev/null
+++ b/t/script/non_verbal_tokens.t
@@ -0,0 +1,99 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catdir catfile/;
+use File::Temp qw/ :POSIX /;
+use Mojo::File;
+use Mojo::JSON qw/decode_json/;
+use IO::Uncompress::Gunzip;
+use Test::More;
+use Test::Output;
+use Data::Dumper;
+use utf8;
+
+my $f = dirname(__FILE__);
+my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
+
+my $input = catdir($f, '..', 'corpus', 'AGD-scrambled', 'DOC', '00001');
+ok(-d $input, 'Input directory found');
+
+my $output = tmpnam();
+my $cache = tmpnam();
+
+ok(!-f $output, 'Output does not exist');
+
+my $call = join(
+ ' ',
+ 'perl', $script,
+ '--input' => $input,
+ '--output' => $output,
+ '--cache' => $cache,
+ '-t' => 'DGD#Annot',
+ '-l' => 'INFO'
+);
+
+# Test without compression
+stderr_like(
+ sub {
+ system($call);
+ },
+ qr!The code took!,
+ $call
+);
+
+ok(-f $output, 'Output does exist');
+ok((my $file = Mojo::File->new($output)->slurp), 'Slurp data');
+ok((my $json = decode_json $file), 'decode json');
+
+is($json->{textSigle}, 'AGD/DOC/00001', 'text sigle');
+is($json->{title}, 'FOLK_E_00321_SE_01_T_01_DF_01', 'Title');
+is($json->{data}->{tokenSource}, 'dgd#annot', 'tokenSource');
+is($json->{data}->{foundries}, 'dereko dereko/structure dgd dgd/morpho', 'Foundries');
+my $stream = $json->{data}->{stream};
+my $token = $stream->[4];
+is($token->[3], 'dgd/l:pui', 'Token');
+$token = $stream->[5];
+is($token->[13], 'dgd/l:xui', 'Token');
+
+$call = join(
+ ' ',
+ 'perl', $script,
+ '--input' => $input,
+ '--output' => $output,
+ '--cache' => $cache,
+ '-t' => 'DGD#annot',
+ '-l' => 'INFO',
+ '-w' => '',
+ '-nvt' => ''
+);
+
+# Test with non-verbal tokens, again without compression
+stderr_like(
+ sub {
+ system($call);
+ },
+ qr!The code took!,
+ $call
+);
+
+ok(-f $output, 'Output does exist');
+ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
+ok(($json = decode_json $file), 'decode json');
+$stream = $json->{data}->{stream};
+
+$stream = $json->{data}->{stream};
+
+$token = $stream->[4];
+is($token->[3], 'dgd/l:pui', 'Token');
+
+$token = $stream->[5];
+is($token->[14], 'dgd/para:pause$<b>128<s>5', 'Token');
+
+$token = $stream->[6];
+is($token->[1], 'dgd/l:xui', 'Token');
+
+
+
+done_testing;
+__END__