Support non-verbal annotations
Change-Id: I6cc0e7c8279f523d3c4b66b14125866ec0be1695
diff --git a/Readme.pod b/Readme.pod
index 8e82eda..edc314b 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -127,7 +127,8 @@
Define the default tokenization by specifying
the name of the foundry and optionally the name
of the layer-file. Defaults to C<OpenNLP#tokens>.
-
+This will use the file directly instead of running
+the layer implementation!
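+For example, C<-t DGD#Annot> will take the token spans
+directly from the existing DGD annotation layer file.
+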
=item B<--base-sentences|-bs> <foundry>#<layer>
@@ -186,6 +187,15 @@
Defaults to unset.
+
+=item B<--non-verbal-tokens|-nvt>
+
+Tokenize non-verbal tokens that are marked in the primary data as
+the Unicode symbol 'Black Vertical Rectangle', aka C<\x{25AE}>.
+
+Defaults to unset.
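+
+For example, assuming F<mytext/> is the unzipped directory of a single
+KorAP-XML text and F<doc.json> is the desired output file, the
+following call will keep non-verbal tokens in the token stream:
+
+  korapxml2krill --input mytext/ --output doc.json \
+    -t DGD#Annot --non-verbal-tokens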
+
+
=item B<--jobs|-j>
Define the number of concurrent jobs in seperated forks
@@ -436,7 +446,7 @@
Corpus Analysis Platform at the
L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
member of the
-L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
+L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
This program is free software published under the
L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
diff --git a/lib/KorAP/XML/Batch/File.pm b/lib/KorAP/XML/Batch/File.pm
index 1456cf7..01a7cd1 100644
--- a/lib/KorAP/XML/Batch/File.pm
+++ b/lib/KorAP/XML/Batch/File.pm
@@ -22,6 +22,7 @@
koral => $param{koral},
primary => $param{primary},
non_word_tokens => $param{non_word_tokens},
+ non_verbal_tokens => $param{non_verbal_tokens},
pretty => $param{pretty},
gzip => $param{gzip} // 0
}, $class;
@@ -58,7 +59,8 @@
foundry => $self->{foundry},
layer => $self->{layer},
name => 'tokens',
- non_word_tokens => $self->{non_word_tokens}
+ non_word_tokens => $self->{non_word_tokens},
+ non_verbal_tokens => $self->{non_verbal_tokens}
);
# Unable to process base tokenization
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index 92c8d1f..f9595dd 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -35,6 +35,7 @@
has [qw/path foundry doc stream should have name/];
has layer => 'Tokens';
has non_word_tokens => 0;
+has non_verbal_tokens => 0;
has 'error';
@@ -117,7 +118,7 @@
# my $span = $_;
my $mtt;
my $distance = 0;
- my (@non_word_tokens);
+ # my (@non_word_tokens);
foreach my $span (@$tokens) {
my $from = $span->{'-from'};
my $to = $span->{'-to'};
@@ -136,9 +137,10 @@
# This token should be recognized
$should++;
- # Ignore non-word and non-number tokens (sorry!)
- if (!$self->non_word_tokens && $token !~ /[\w\d]/) {
-
+ # Ignore non-word, non-number, and non-verbal tokens by default
+ if ($self->non_verbal_tokens && ord($token) == 9646) {
+ # Keep the non-verbal token (U+25AE 'Black Vertical Rectangle', codepoint 9646)
+ } elsif (!$self->non_word_tokens && $token !~ /[\w\d]/) {
# TODO: Recognize punctuations!
# if ($mtt) {
# my $term = [$token, $from, $to];
diff --git a/script/korapxml2krill b/script/korapxml2krill
index e6754d9..56189aa 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -143,12 +143,13 @@
# 2019/08/08
# - Support for Talismane.
#
-# 2019/12/16
+# 2019/12/17
# - Added support for DGD pseudo-sentences
# based on anchor milestones.
+# - Support for non-verbal annotations.
# ----------------------------------------------------------
-our $LAST_CHANGE = '2019/12/16';
+our $LAST_CHANGE = '2019/12/17';
our $LOCAL = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG = <<"VERSION";
@@ -195,6 +196,7 @@
'koral|k=f' => \(my $koral),
'to-tar' => \(my $to_tar),
'non-word-tokens|nwt' => \(my $non_word_tokens),
+ 'non-verbal-tokens|nvt' => \(my $non_verbal_tokens),
'sequential-extraction|se' => \(my $sequential_extraction),
'cache-size|cs=s' => \(my $cache_size),
'cache-delete|cd!' => \(my $cache_delete),
@@ -258,11 +260,16 @@
$token_base = $config{token};
};
- # temporary-extract
+ # Non-word tokenization
if (!defined($non_word_tokens) && defined $config{'non-word-tokens'}) {
$non_word_tokens = $config{'non-word-tokens'};
};
+ # Non-verbal tokenization
+ if (!defined($non_verbal_tokens) && defined $config{'non-verbal-tokens'}) {
+ $non_verbal_tokens = $config{'non-verbal-tokens'};
+ };
+
# Cache file
if (!defined($cache_file) && defined $config{cache}) {
$cache_file = $config{cache};
@@ -354,6 +361,7 @@
$base_paragraphs //= '';
$base_pagebreaks //= '';
$non_word_tokens //= 0;
+$non_verbal_tokens //= 0;
$base_sentences = lc $base_sentences;
$base_paragraphs = lc $base_paragraphs;
@@ -592,7 +600,8 @@
primary => $primary,
pretty => $pretty,
anno => \@filtered_anno,
- non_word_tokens => $non_word_tokens
+ non_word_tokens => $non_word_tokens,
+ non_verbal_tokens => $non_verbal_tokens
);
# Get file name based on path information
@@ -1228,6 +1237,8 @@
Define the default tokenization by specifying
the name of the foundry and optionally the name
of the layer-file. Defaults to C<OpenNLP#tokens>.
+This will use the file directly instead of running
+the layer implementation!
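+For example, C<-t DGD#Annot> will take the token spans
+directly from the existing DGD annotation layer file.
+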
=item B<--base-sentences|-bs> <foundry>#<layer>
@@ -1287,6 +1298,15 @@
Defaults to unset.
+
+=item B<--non-verbal-tokens|-nvt>
+
+Tokenize non-verbal tokens that are marked in the primary data as
+the Unicode symbol 'Black Vertical Rectangle', aka C<\x{25AE}>.
+
+Defaults to unset.
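+
+For example, assuming F<mytext/> is the unzipped directory of a single
+KorAP-XML text and F<doc.json> is the desired output file, the
+following call will keep non-verbal tokens in the token stream:
+
+  korapxml2krill --input mytext/ --output doc.json \
+    -t DGD#Annot --non-verbal-tokens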
+
+
=item B<--jobs|-j>
Define the number of concurrent jobs in seperated forks
@@ -1537,7 +1557,7 @@
Corpus Analysis Platform at the
L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
member of the
-L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
+L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
This program is free software published under the
L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
diff --git a/t/real/agd.t b/t/real/agd.t
index 498f4e2..6ba7ebe 100644
--- a/t/real/agd.t
+++ b/t/real/agd.t
@@ -53,7 +53,7 @@
foundry => $token_base_foundry,
layer => $token_base_layer,
name => 'tokens',
- non_word_tokens => 1
+ non_verbal_tokens => 1
);
ok($tokens, 'Token Object is fine');
@@ -71,9 +71,9 @@
is($output->{version}, '0.03', 'version');
is($output->{data}->{foundries}, '', 'Foundries');
is($output->{data}->{layerInfos}, '', 'layerInfos');
-is($output->{data}->{stream}->[1]->[2], 's:ku', 'data');
-is($output->{data}->{stream}->[2]->[2], 's:sqn', 'data');
-is($output->{data}->{stream}->[3]->[2], 's:alxv', 'data');
+is($output->{data}->{stream}->[0]->[4], 's:ku', 'data');
+is($output->{data}->{stream}->[1]->[2], 's:sqn', 'data');
+is($output->{data}->{stream}->[2]->[2], 's:alxv', 'data');
is($output->{textSigle}, 'AGD/DOC/00001', 'Correct text sigle');
is($output->{docSigle}, 'AGD/DOC', 'Correct document sigle');
is($output->{corpusSigle}, 'AGD', 'Correct corpus sigle');
@@ -103,7 +103,7 @@
is($output->{data}->{layerInfos}, 'dereko/s=spans dgd/l=tokens dgd/p=tokens dgd/para=tokens',
'layerInfos');
-my $third_token = join('||', @{$output->{data}->{stream}->[3]});
+my $third_token = join('||', @{$output->{data}->{stream}->[2]});
like($third_token, qr!dgd/l:alui!);
like($third_token, qr!dgd/p:VMGWY!);
like($third_token, qr!i:alxv!);
@@ -116,13 +116,18 @@
# Offsets are suboptimal set, but good enough
$first_token = join('||', @{$output->{data}->{stream}->[0]});
-like($first_token, qr!<>:base/s:s\$<b>64<i>0<i>16<i>3<b>1!);
+like($first_token, qr!<>:base/s:s\$<b>64<i>0<i>16<i>2<b>1!);
my $token = join('||', @{$output->{data}->{stream}->[1]});
+like($token, qr!<>:base/s:s\$<b>64<i>16<i>23<i>4<b>1!);
+$token = join('||', @{$output->{data}->{stream}->[2]});
unlike($token, qr!<>:base/s:s!);
-$token = join('||', @{$output->{data}->{stream}->[2]});
-like($token, qr!<>:base/s:s\$<b>64<i>16<i>23<i>5<b>1!);
+$token = join('||', @{$output->{data}->{stream}->[3]});
+like($token, qr!<>:base/s:s\$<b>64<i>23<i>27<i>5<b>1!);
+
+$token = join('||', @{$output->{data}->{stream}->[5]});
+like($token, qr!dgd/para:pause!);
done_testing;
__END__
diff --git a/t/script/non_verbal_tokens.t b/t/script/non_verbal_tokens.t
new file mode 100644
index 0000000..fa460e4
--- /dev/null
+++ b/t/script/non_verbal_tokens.t
@@ -0,0 +1,99 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catdir catfile/;
+use File::Temp qw/ :POSIX /;
+use Mojo::File;
+use Mojo::JSON qw/decode_json/;
+use IO::Uncompress::Gunzip;
+use Test::More;
+use Test::Output;
+use Data::Dumper;
+use utf8;
+
+my $f = dirname(__FILE__);
+my $script = catfile($f, '..', '..', 'script', 'korapxml2krill');
+
+my $input = catdir($f, '..', 'corpus', 'AGD-scrambled', 'DOC', '00001');
+ok(-d $input, 'Input directory found');
+
+my $output = tmpnam();
+my $cache = tmpnam();
+
+ok(!-f $output, 'Output does not exist');
+
+my $call = join(
+ ' ',
+ 'perl', $script,
+ '--input' => $input,
+ '--output' => $output,
+ '--cache' => $cache,
+ '-t' => 'DGD#Annot',
+ '-l' => 'INFO'
+);
+
+# Test without compression
+stderr_like(
+ sub {
+ system($call);
+ },
+ qr!The code took!,
+ $call
+);
+
+ok(-f $output, 'Output does exist');
+ok((my $file = Mojo::File->new($output)->slurp), 'Slurp data');
+ok((my $json = decode_json $file), 'decode json');
+
+is($json->{textSigle}, 'AGD/DOC/00001', 'text sigle');
+is($json->{title}, 'FOLK_E_00321_SE_01_T_01_DF_01', 'Title');
+is($json->{data}->{tokenSource}, 'dgd#annot', 'tokenSource');
+is($json->{data}->{foundries}, 'dereko dereko/structure dgd dgd/morpho', 'Foundries');
+my $stream = $json->{data}->{stream};
+my $token = $stream->[4];
+is($token->[3], 'dgd/l:pui', 'Token');
+$token = $stream->[5];
+is($token->[13], 'dgd/l:xui', 'Token');
+
+$call = join(
+ ' ',
+ 'perl', $script,
+ '--input' => $input,
+ '--output' => $output,
+ '--cache' => $cache,
+ '-t' => 'DGD#annot',
+ '-l' => 'INFO',
+ '-w' => '',
+ '-nvt' => ''
+);
+
+# Test with non-verbal tokens, again without compression
+stderr_like(
+ sub {
+ system($call);
+ },
+ qr!The code took!,
+ $call
+);
+
+ok(-f $output, 'Output does exist');
+ok(($file = Mojo::File->new($output)->slurp), 'Slurp data');
+ok(($json = decode_json $file), 'decode json');
+$stream = $json->{data}->{stream};
+
+$stream = $json->{data}->{stream};
+
+$token = $stream->[4];
+is($token->[3], 'dgd/l:pui', 'Token');
+
+$token = $stream->[5];
+is($token->[14], 'dgd/para:pause$<b>128<s>5', 'Token');
+
+$token = $stream->[6];
+is($token->[1], 'dgd/l:xui', 'Token');
+
+
+
+done_testing;
+__END__