Support for 'orig' annotations in NKJP data

Change-Id: I1ec02dc7ac8de09bb3ee1d3f0a9f3d9332b4ec01
diff --git a/Changes b/Changes
index 8c3c5ed..63f3079 100644
--- a/Changes
+++ b/Changes
@@ -1,6 +1,7 @@
-0.47 2022-07-27
+0.47 2022-08-08
- Support for preferred language transformation.
- Support for NKJP taxonomies.
+ - Support for NKJP 'orig' values.
0.46 2022-07-21
- Support NKJP Meta, Morpho and NamedEntities.
diff --git a/lib/KorAP/XML/Annotation/NKJP/Morpho.pm b/lib/KorAP/XML/Annotation/NKJP/Morpho.pm
index 68ea04e..b6a7304 100644
--- a/lib/KorAP/XML/Annotation/NKJP/Morpho.pm
+++ b/lib/KorAP/XML/Annotation/NKJP/Morpho.pm
@@ -50,6 +50,12 @@
$mtt->add_by_term('nkjp/l:' . $found);
}
+ # orig tag
+ elsif (($name eq 'orig')
+ && ($found = $f->{'#text'})) {
+ $mtt->add_by_term('nkjp/ov:' . $found);
+ }
+
# msd tag
elsif (($name eq 'msd')
&& ($found = $f->{'#text'})) {
@@ -63,7 +69,7 @@
};
sub layer_info {
- ['nkjp/l=tokens', 'nkjp/p=tokens', 'nkjp/m=tokens']
+ ['nkjp/l=tokens', 'nkjp/p=tokens', 'nkjp/m=tokens', 'nkjp/ov=tokens']
}
1;
diff --git a/t/real/nkjp.t b/t/real/nkjp.t
index 7f651c0..aa9f5df 100644
--- a/t/real/nkjp.t
+++ b/t/real/nkjp.t
@@ -92,7 +92,7 @@
is($output->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base_sentences_paragraphs nkjp nkjp/morpho', 'Foundries');
-is($output->{data}->{layerInfos}, 'dereko/s=spans nkjp/l=tokens nkjp/m=tokens nkjp/p=tokens', 'layerInfos');
+is($output->{data}->{layerInfos}, 'dereko/s=spans nkjp/l=tokens nkjp/m=tokens nkjp/ov=tokens nkjp/p=tokens', 'layerInfos');
my $token = join('||', @{$output->{data}->{stream}->[7]});
@@ -162,7 +162,7 @@
is($output->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base_sentences_paragraphs nkjp nkjp/morpho nkjp/namedentities', 'Foundries');
-is($output->{data}->{layerInfos}, 'dereko/s=spans nkjp/l=tokens nkjp/m=tokens nkjp/ne=tokens nkjp/p=tokens', 'layerInfos');
+is($output->{data}->{layerInfos}, 'dereko/s=spans nkjp/l=tokens nkjp/m=tokens nkjp/ne=tokens nkjp/ov=tokens nkjp/p=tokens', 'layerInfos');
$token = join('||', @{$output->{data}->{stream}->[5]});
@@ -173,6 +173,9 @@
like($token, qr!nkjp/m:sg:nom:n:pos!);
like($token, qr!nkjp/p:adj!);
like($token, qr!s:takie!);
+like($token, qr!nkjp/ov:takie!);
+
+
$token = join('||', @{$output->{data}->{stream}->[67]});
@@ -186,6 +189,7 @@
like($token, qr!nkjp/ne:persName:surname!);
like($token, qr!nkjp/p:subst!);
like($token, qr!s:Kierkegaard!);
+like($token, qr!nkjp/ov:Kierkegaard!);
done_testing;
__END__