Add simple Spacy support Change-Id: I37ec0dce14ca456c8a4804dc9dd198c3d153b359

commit: b8c538256f2749fa184894d52052660473c6b9ee [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Sat Mar 16 18:54:08 2024 +0100
committer: Akron <nils@diewald-online.de> Wed Mar 20 11:39:45 2024 +0100
tree: 46b06e4151864f7cf062850e45330a9be4e1723b
parent: a351837ef17c26ce1ade9e503a1617c624bc41b4 [diff]
diff --git a/t/annotation/corpus/doc/0001/spacy/morpho.xml b/t/annotation/corpus/doc/0001/spacy/morpho.xml
new file mode 100644
index 0000000..3328a8a
--- /dev/null
+++ b/t/annotation/corpus/doc/0001/spacy/morpho.xml

@@ -0,0 +1,206 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<layer docid="Corpus_Doc.0001" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">
+<spanList>
+  <span id="s1_n1" from="0" to="3">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">ADP</f>
+      <f name="lemma">zu</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s1_n2" from="4" to="11">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">ADJ</f>
+      <f name="lemma">letzter</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s1_n3" from="12" to="23">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">ADJ</f>
+      <f name="lemma">kulturell</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s1_n4" from="24" to="30">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">NOUN</f>
+      <f name="lemma">Anlass</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s2_n1" from="31" to="35">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">VERB</f>
+      <f name="lemma">laden</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s2_n2" from="36" to="39">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">DET</f>
+      <f name="lemma">der</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s2_n3" from="40" to="47">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">NOUN</f>
+      <f name="lemma">Leitung</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n1" from="48" to="51">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">DET</f>
+      <f name="lemma">der</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n2" from="52" to="63">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">NOUN</f>
+      <f name="lemma">Schulheim</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n3" from="64" to="73">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">PROPN</f>
+      <f name="lemma">Hofbergli</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n4" from="74" to="77">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">ADV</f>
+      <f name="lemma">ein</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n5" from="77" to="78">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">PUNCT</f>
+      <f name="lemma">--</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n6" from="79" to="84">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">SCONJ</f>
+      <f name="lemma">bevor</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n7" from="85" to="88">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">DET</f>
+      <f name="lemma">der</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n8" from="89" to="96">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">NOUN</f>
+      <f name="lemma">Betrieb</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n9" from="97" to="101">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">NOUN</f>
+      <f name="lemma">Ende</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n10" from="102" to="111">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">NOUN</f>
+      <f name="lemma">Schuljahr</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n11" from="112" to="123">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">VERB</f>
+      <f name="lemma">einstellen</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n12" from="124" to="128">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">AUX</f>
+      <f name="lemma">werden</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s4_n1" from="128" to="129">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">PUNCT</f>
+      <f name="lemma">--</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+ </spanList>
+</layer>

diff --git a/t/annotation/spacy_morpho.t b/t/annotation/spacy_morpho.t
new file mode 100644
index 0000000..8999acc
--- /dev/null
+++ b/t/annotation/spacy_morpho.t

@@ -0,0 +1,55 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use utf8;
+use Test::More;
+use KorAP::XML::Annotation::Spacy::Morpho;
+use Scalar::Util qw/weaken/;
+use Data::Dumper;
+use lib 't/annotation';
+use TestInit;
+
+# Load fixture document 0001 and attach the spaCy morpho layer under test.
+ok(my $tokens = TestInit::tokens('0001'), 'Parse tokens');
+ok($tokens->add('Spacy', 'Morpho'), 'Add Structure');
+
+my $data = $tokens->to_data->{data};
+
+like($data->{foundries}, qr!spacy/morpho!, 'Foundry spacy/morpho registered');
+like($data->{layerInfos}, qr!spacy/p=tokens!, 'Layer info for POS');
+like($data->{layerInfos}, qr!spacy/l=tokens!, 'Layer info for lemma');
+
+# NOTE(review): token 0 is read at slots 5/6, all other tokens at 3/4 - presumably extra leading entries; confirm.
+is($data->{stream}->[0]->[5], 'spacy/l:zu', 'Lemma');
+is($data->{stream}->[0]->[6], 'spacy/p:ADP', 'POS');
+
+is($data->{stream}->[3]->[3], 'spacy/l:Anlass', 'Lemma');
+is($data->{stream}->[3]->[4], 'spacy/p:NOUN', 'POS');
+
+is($data->{stream}->[10]->[3], 'spacy/l:ein', 'Lemma');
+is($data->{stream}->[10]->[4], 'spacy/p:ADV', 'POS');
+
+is($data->{stream}->[13]->[3], 'spacy/l:Betrieb', 'Lemma');
+
+is($data->{stream}->[-1]->[3], 'spacy/l:werden', 'Lemma');
+is($data->{stream}->[-1]->[4], 'spacy/p:AUX', 'POS');
+
+is($data->{stream}->[11]->[3], 'spacy/l:bevor', 'Lemma');
+is($data->{stream}->[11]->[4], 'spacy/p:SCONJ', 'POS');
+
+is($data->{stream}->[12]->[1], 'i:der','Surface');
+is($data->{stream}->[13]->[1], 'i:betrieb','Surface');
+is($data->{stream}->[14]->[1], 'i:ende','Surface');
+is($data->{stream}->[15]->[1], 'i:schuljahr','Surface');
+is($data->{stream}->[16]->[1], 'i:eingestellt','Surface');
+is($data->{stream}->[17]->[1], 'i:wird','Surface');
+
+# The stream holds exactly 18 tokens: index 17 is the last one, index 18 must be empty.
+ok(!$data->{stream}->[18],'Nothing');
+
+is(scalar(@{$data->{stream}}), 18, 'Length');
+
+done_testing;
+
+__END__
+

diff --git a/t/corpus/archives/wpd15-single.spacy.zip b/t/corpus/archives/wpd15-single.spacy.zip
new file mode 100644
index 0000000..5bbc121
--- /dev/null
+++ b/t/corpus/archives/wpd15-single.spacy.zip
Binary files differ

diff --git a/t/script/archive.t b/t/script/archive.t
index 0f57183..41389bb 100644
--- a/t/script/archive.t
+++ b/t/script/archive.t

@@ -123,7 +123,7 @@
   ok(($json_1 = decode_json $file), 'decode json');
 
   is($json_1->{data}->{tokenSource}, 'tree_tagger#tokens', 'TokenSource');
-  is($json_1->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
+  is($json_1->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences spacy spacy/morpho treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
   is($json_1->{textSigle}, 'Corpus/Doc/0001', 'Sigle');
 
   ok(-f $json_2, 'Json file exists');

diff --git a/t/script/single.t b/t/script/single.t
index 01d119f..4772c1c 100644
--- a/t/script/single.t
+++ b/t/script/single.t

@@ -54,7 +54,7 @@
 is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
 is($json->{title}, 'Beispiel Text', 'Title');
 is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Title');
-is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
+is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences spacy spacy/morpho treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
 like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
 is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
 is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
@@ -87,7 +87,7 @@
 is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
 is($json->{title}, 'Beispiel Text', 'Title');
 is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
-is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
+is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences spacy spacy/morpho treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
 like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
 is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
 
@@ -227,11 +227,41 @@
 is($json->{fields}->[22]->{'@type'}, 'koral:field');
 
 is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Title');
-is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
+is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences spacy spacy/morpho treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
 like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
 is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
 is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
 
+my $token = join(',',@{$json->{data}->{stream}->[1]});
+# All annotations of stream entry 1 are joined into one string and probed per foundry, now including spacy.
+like($token, qr!<>:xip\/c:AP\$<b>64<i>4<i>11<i>2<b>5!, 'xip constituency AP');
+like($token, qr!<>:xip\/c:ADJ\$<b>64<i>4<i>11<i>2<b>6!, 'xip constituency ADJ');
+like($token, qr!<>:cnx\/c:np\$<b>64<i>4<i>30<i>4<b>0!, 'cnx constituency np');
+like($token, qr!<>:xip\/c:NP\$<b>64<i>4<i>30<i>4<b>3!, 'xip constituency NP');
+like($token, qr!<>:xip\/c:NPA\$<b>64<i>4<i>30<i>4<b>4!, 'xip constituency NPA');
+like($token, qr!>:mate\/d:NK\$<b>32<i>3!, 'mate dependency NK');
+like($token, qr!_1\$<i>4<i>11!, 'Token position and offsets');
+like($token, qr!cnx\/l:letzt!, 'cnx lemma');
+like($token, qr!cnx\/p:A!, 'cnx POS');
+like($token, qr!cnx\/syn:\@PREMOD!, 'cnx syntax');
+like($token, qr!corenlp\/p:ADJ!, 'corenlp POS');
+like($token, qr!glemm\/l:__letzt-!, 'glemm lemma');
+like($token, qr!i:letzten!, 'Surface (i:)');
+like($token, qr!mate\/l:letzter!, 'mate lemma');
+like($token, qr!mate\/m:case:dat!, 'mate morphology case');
+like($token, qr!mate\/m:degree:pos!, 'mate morphology degree');
+like($token, qr!mate\/m:gender:neut!, 'mate morphology gender');
+like($token, qr!mate\/m:number:sg!, 'mate morphology number');
+like($token, qr!mate\/p:ADJA!, 'mate POS');
+like($token, qr!opennlp\/p:ADJA!, 'opennlp POS');
+like($token, qr!s:letzten!, 'Surface (s:)');
+like($token, qr!spacy\/l:letzter!, 'spacy lemma');
+like($token, qr!spacy\/p:ADJ!, 'spacy POS');
+like($token, qr!tt\/l:letzt!, 'tt lemma');
+like($token, qr!tt\/p:ADJA!, 'tt POS');
+like($token, qr!xip\/l:letzt!, 'xip lemma');
+like($token, qr!xip\/p:ADJ!, 'xip POS');
+
 # Delete output
 unlink $output;
 ok(!-f $output, 'Output does not exist');
commit	b8c538256f2749fa184894d52052660473c6b9ee	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Sat Mar 16 18:54:08 2024 +0100
committer	Akron <nils@diewald-online.de>	Wed Mar 20 11:39:45 2024 +0100
tree	46b06e4151864f7cf062850e45330a9be4e1723b
parent	a351837ef17c26ce1ade9e503a1617c624bc41b4 [diff]