Add simple Spacy support
Change-Id: I37ec0dce14ca456c8a4804dc9dd198c3d153b359
diff --git a/Changes b/Changes
index 684dd95..338a5d5 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+0.53 2024-03-20
+ - Added Spacy support. (kupietz)
+
0.52 2023-01-23
- Introduced 'quiet' flag.
diff --git a/Readme.pod b/Readme.pod
index a9ab89c..0b5bcbf 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -436,6 +436,9 @@
#Lemma
#Morpho
+ Spacy
+ #Morpho
+
Talismane
#Dependency
#Morpho
@@ -613,7 +616,7 @@
Author: L<Nils Diewald|https://www.nils-diewald.de/>
-Contributor: Eliza Margaretha
+Contributor: Eliza Margaretha, Marc Kupietz
L<KorAP::XML::Krill> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Corpus Analysis Platform at the
diff --git a/lib/KorAP/XML/Annotation/Spacy/Morpho.pm b/lib/KorAP/XML/Annotation/Spacy/Morpho.pm
new file mode 100644
index 0000000..5c36b98
--- /dev/null
+++ b/lib/KorAP/XML/Annotation/Spacy/Morpho.pm
@@ -0,0 +1,53 @@
+package KorAP::XML::Annotation::Spacy::Morpho;
+use KorAP::XML::Annotation::Base;
+
+# Add the lemma and part-of-speech values of the 'spacy'
+# foundry ('morpho' layer) as terms to the token stream.
+# Returns 1 on success and nothing in case the token data
+# could not be added.
+sub parse {
+  my $self = shift;
+
+  $$self->add_tokendata(
+    foundry => 'spacy',
+    layer   => 'morpho',
+    cb      => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->get_pos);
+
+      my $content = $token->get_hash->{fs}->{f};
+
+      my $array = $content->{fs}->{f} or return;
+
+      # In case there is only a lemma/pos ...
+      $array = ref $array ne 'ARRAY' ? [$array] : $array;
+
+      foreach my $f (@$array) {
+        my $name = $f->{-name} or next;
+
+        # Accept every non-empty value - a plain truth test
+        # would silently drop the valid value "0"
+        my $value = $f->{'#text'};
+        next unless defined $value && length $value;
+
+        # pos tag
+        if ($name eq 'pos') {
+          $mtt->add_by_term('spacy/p:' . $value);
+        }
+
+        # lemma tag
+        elsif ($name eq 'lemma') {
+          $mtt->add_by_term('spacy/l:' . $value);
+        };
+      };
+    }) or return;
+
+  return 1;
+};
+
+# Layer info entries for the lemma (l) and pos (p) terms
+sub layer_info {
+  ['spacy/l=tokens', 'spacy/p=tokens']
+};
+
+1;
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 06dd102..2a16e6b 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -16,7 +16,7 @@
our @EXPORT_OK = qw(get_file_name get_file_name_from_glob);
-our $VERSION = '0.52';
+our $VERSION = '0.53';
has 'path';
has [qw/text_sigle doc_sigle corpus_sigle/];
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 51352e8..77a1260 100755
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -171,9 +171,12 @@
# 2023/02/13
# - Fix temporary-extract handling from configuration file.
#
+# 2024/03/20
+# - Added Spacy support.
+#
# ----------------------------------------------------------
-our $LAST_CHANGE = '2023/05/16';
+our $LAST_CHANGE = '2024/03/20';
our $LOCAL = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG = <<"VERSION";
@@ -514,6 +517,10 @@
['Sgbr', 'Lemma'],
['Sgbr', 'Morpho']);
+# Spacy
+push(@layers,
+ ['Spacy', 'Morpho']);
+
# Talismane
push(@layers,
['Talismane', 'Dependency'],
@@ -1521,6 +1528,9 @@
#Lemma
#Morpho
+ Spacy
+ #Morpho
+
Talismane
#Dependency
#Morpho
diff --git a/t/annotation/corpus/doc/0001/spacy/morpho.xml b/t/annotation/corpus/doc/0001/spacy/morpho.xml
new file mode 100644
index 0000000..3328a8a
--- /dev/null
+++ b/t/annotation/corpus/doc/0001/spacy/morpho.xml
@@ -0,0 +1,206 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<layer docid="Corpus_Doc.0001" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">
+<spanList>
+ <span id="s1_n1" from="0" to="3">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">ADP</f>
+ <f name="lemma">zu</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s1_n2" from="4" to="11">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">ADJ</f>
+ <f name="lemma">letzter</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s1_n3" from="12" to="23">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">ADJ</f>
+ <f name="lemma">kulturell</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s1_n4" from="24" to="30">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NOUN</f>
+ <f name="lemma">Anlass</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s2_n1" from="31" to="35">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">VERB</f>
+ <f name="lemma">laden</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s2_n2" from="36" to="39">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">DET</f>
+ <f name="lemma">der</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s2_n3" from="40" to="47">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NOUN</f>
+ <f name="lemma">Leitung</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n1" from="48" to="51">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">DET</f>
+ <f name="lemma">der</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n2" from="52" to="63">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NOUN</f>
+ <f name="lemma">Schulheim</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n3" from="64" to="73">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">PROPN</f>
+ <f name="lemma">Hofbergli</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n4" from="74" to="77">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">ADV</f>
+ <f name="lemma">ein</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n5" from="77" to="78">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">PUNCT</f>
+ <f name="lemma">--</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n6" from="79" to="84">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">SCONJ</f>
+ <f name="lemma">bevor</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n7" from="85" to="88">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">DET</f>
+ <f name="lemma">der</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n8" from="89" to="96">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NOUN</f>
+ <f name="lemma">Betrieb</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n9" from="97" to="101">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NOUN</f>
+ <f name="lemma">Ende</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n10" from="102" to="111">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NOUN</f>
+ <f name="lemma">Schuljahr</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n11" from="112" to="123">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">VERB</f>
+ <f name="lemma">einstellen</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n12" from="124" to="128">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">AUX</f>
+ <f name="lemma">werden</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s4_n1" from="128" to="129">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">PUNCT</f>
+ <f name="lemma">--</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ </spanList>
+</layer>
diff --git a/t/annotation/spacy_morpho.t b/t/annotation/spacy_morpho.t
new file mode 100644
index 0000000..8999acc
--- /dev/null
+++ b/t/annotation/spacy_morpho.t
@@ -0,0 +1,60 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use utf8;
+use Test::More;
+use KorAP::XML::Annotation::Spacy::Morpho;
+use Scalar::Util qw/weaken/;
+use Data::Dumper;
+use lib 't/annotation';
+use TestInit;
+
+# Tokenize the fixture document and attach the spacy/morpho layer
+ok(my $tokens = TestInit::tokens('0001'), 'Parse tokens');
+
+ok($tokens->add('Spacy', 'Morpho'), 'Add Morpho');
+
+my $data = $tokens->to_data->{data};
+
+# The foundry and both layer infos have to be listed
+like($data->{foundries}, qr!spacy/morpho!, 'Foundries');
+like($data->{layerInfos}, qr!spacy/p=tokens!, 'Layer info (POS)');
+like($data->{layerInfos}, qr!spacy/l=tokens!, 'Layer info (Lemma)');
+
+# Lemma and POS terms at selected stream positions
+is($data->{stream}->[0]->[5], 'spacy/l:zu', 'Lemma');
+is($data->{stream}->[0]->[6], 'spacy/p:ADP', 'POS');
+
+is($data->{stream}->[3]->[3], 'spacy/l:Anlass', 'Lemma');
+is($data->{stream}->[3]->[4], 'spacy/p:NOUN', 'POS');
+
+is($data->{stream}->[10]->[3], 'spacy/l:ein', 'Lemma');
+is($data->{stream}->[10]->[4], 'spacy/p:ADV', 'POS');
+
+is($data->{stream}->[13]->[3], 'spacy/l:Betrieb', 'Lemma');
+
+is($data->{stream}->[-1]->[3], 'spacy/l:werden', 'Lemma');
+is($data->{stream}->[-1]->[4], 'spacy/p:AUX', 'POS');
+
+is($data->{stream}->[11]->[3], 'spacy/l:bevor',
+   'Lemma');
+is($data->{stream}->[11]->[4], 'spacy/p:SCONJ',
+   'POS');
+
+# Surface terms of the final stream positions
+is($data->{stream}->[12]->[1], 'i:der','Surface');
+is($data->{stream}->[13]->[1], 'i:betrieb','Surface');
+is($data->{stream}->[14]->[1], 'i:ende','Surface');
+is($data->{stream}->[15]->[1], 'i:schuljahr','Surface');
+is($data->{stream}->[16]->[1], 'i:eingestellt','Surface');
+is($data->{stream}->[17]->[1], 'i:wird','Surface');
+
+# The stream holds exactly 18 positions
+ok(!$data->{stream}->[18],'Nothing');
+
+is(scalar(@{$data->{stream}}), 18, 'Length');
+
+done_testing;
+
+__END__
+
diff --git a/t/corpus/archives/wpd15-single.spacy.zip b/t/corpus/archives/wpd15-single.spacy.zip
new file mode 100644
index 0000000..5bbc121
--- /dev/null
+++ b/t/corpus/archives/wpd15-single.spacy.zip
Binary files differ
diff --git a/t/script/archive.t b/t/script/archive.t
index 0f57183..41389bb 100644
--- a/t/script/archive.t
+++ b/t/script/archive.t
@@ -123,7 +123,7 @@
ok(($json_1 = decode_json $file), 'decode json');
is($json_1->{data}->{tokenSource}, 'tree_tagger#tokens', 'TokenSource');
- is($json_1->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
+ is($json_1->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences spacy spacy/morpho treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
is($json_1->{textSigle}, 'Corpus/Doc/0001', 'Sigle');
ok(-f $json_2, 'Json file exists');
diff --git a/t/script/single.t b/t/script/single.t
index 01d119f..4772c1c 100644
--- a/t/script/single.t
+++ b/t/script/single.t
@@ -54,7 +54,7 @@
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Title');
-is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
+is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences spacy spacy/morpho treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
@@ -87,7 +87,7 @@
is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
is($json->{title}, 'Beispiel Text', 'Title');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
-is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
+is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences spacy spacy/morpho treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
@@ -227,11 +227,41 @@
is($json->{fields}->[22]->{'@type'}, 'koral:field');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Title');
-is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
+is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences spacy spacy/morpho treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
+my $token = join(',',@{$json->{data}->{stream}->[1]});
+# All terms at stream position 1 ("letzten"), joined for pattern matching
+like($token, qr!<>:xip\/c:AP\$<b>64<i>4<i>11<i>2<b>5!, 'Xip constituency (AP)');
+like($token, qr!<>:xip\/c:ADJ\$<b>64<i>4<i>11<i>2<b>6!, 'Xip constituency (ADJ)');
+like($token, qr!<>:cnx\/c:np\$<b>64<i>4<i>30<i>4<b>0!, 'Connexor constituency (np)');
+like($token, qr!<>:xip\/c:NP\$<b>64<i>4<i>30<i>4<b>3!, 'Xip constituency (NP)');
+like($token, qr!<>:xip\/c:NPA\$<b>64<i>4<i>30<i>4<b>4!, 'Xip constituency (NPA)');
+like($token, qr!>:mate\/d:NK\$<b>32<i>3!, 'Mate dependency (NK)');
+like($token, qr!_1\$<i>4<i>11!, 'Token offsets');
+like($token, qr!cnx\/l:letzt!, 'Connexor lemma');
+like($token, qr!cnx\/p:A!, 'Connexor POS');
+like($token, qr!cnx\/syn:\@PREMOD!, 'Connexor syntax');
+like($token, qr!corenlp\/p:ADJ!, 'CoreNLP POS');
+like($token, qr!glemm\/l:__letzt-!, 'Glemm lemma');
+like($token, qr!i:letzten!, 'Surface (i)');
+like($token, qr!mate\/l:letzter!, 'Mate lemma');
+like($token, qr!mate\/m:case:dat!, 'Mate morphology (case)');
+like($token, qr!mate\/m:degree:pos!, 'Mate morphology (degree)');
+like($token, qr!mate\/m:gender:neut!, 'Mate morphology (gender)');
+like($token, qr!mate\/m:number:sg!, 'Mate morphology (number)');
+like($token, qr!mate\/p:ADJA!, 'Mate POS');
+like($token, qr!opennlp\/p:ADJA!, 'OpenNLP POS');
+like($token, qr!s:letzten!, 'Surface (s)');
+like($token, qr!spacy\/l:letzter!, 'Spacy lemma');
+like($token, qr!spacy\/p:ADJ!, 'Spacy POS');
+like($token, qr!tt\/l:letzt!, 'TreeTagger lemma');
+like($token, qr!tt\/p:ADJA!, 'TreeTagger POS');
+like($token, qr!xip\/l:letzt!, 'Xip lemma');
+like($token, qr!xip\/p:ADJ!, 'Xip POS');
+
# Delete output
unlink $output;
ok(!-f $output, 'Output does not exist');