Add simple Spacy support Change-Id: I37ec0dce14ca456c8a4804dc9dd198c3d153b359

commit: b8c538256f2749fa184894d52052660473c6b9ee [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Sat Mar 16 18:54:08 2024 +0100
committer: Akron <nils@diewald-online.de> Wed Mar 20 11:39:45 2024 +0100
tree: 46b06e4151864f7cf062850e45330a9be4e1723b
parent: a351837ef17c26ce1ade9e503a1617c624bc41b4 [diff]
diff --git a/Changes b/Changes
index 684dd95..338a5d5 100644
--- a/Changes
+++ b/Changes

@@ -1,3 +1,6 @@
+0.53 2024-03-20
+        - Added Spacy support. (kupietz)
+
 0.52 2023-01-23
         - Introduced 'quiet' flag.
 

diff --git a/Readme.pod b/Readme.pod
index a9ab89c..0b5bcbf 100644
--- a/Readme.pod
+++ b/Readme.pod

@@ -436,6 +436,9 @@
     #Lemma
     #Morpho
 
+  Spacy
+    #Morpho
+
   Talismane
     #Dependency
     #Morpho
@@ -613,7 +616,7 @@
 
 Author: L<Nils Diewald|https://www.nils-diewald.de/>
 
-Contributor: Eliza Margaretha
+Contributor: Eliza Margaretha, Marc Kupietz
 
 L<KorAP::XML::Krill> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
 Corpus Analysis Platform at the

diff --git a/lib/KorAP/XML/Annotation/Spacy/Morpho.pm b/lib/KorAP/XML/Annotation/Spacy/Morpho.pm
new file mode 100644
index 0000000..5c36b98
--- /dev/null
+++ b/lib/KorAP/XML/Annotation/Spacy/Morpho.pm

@@ -0,0 +1,47 @@
+package KorAP::XML::Annotation::Spacy::Morpho;
+use KorAP::XML::Annotation::Base;
+
+# Register a token data callback for the 'spacy' foundry ('morpho' layer)
+# that adds 'spacy/p:' (pos) and 'spacy/l:' (lemma) terms. Returns 1 on success.
+sub parse {
+  my $self = shift;
+
+  # Term prefix for every supported feature name
+  my %prefix_for = (
+    pos   => 'spacy/p:',
+    lemma => 'spacy/l:'
+  );
+
+  # Callback handed to add_tokendata; receives ($stream, $token)
+  my $add_terms = sub {
+    my ($stream, $token) = @_;
+    my $terms = $stream->pos($token->get_pos);
+
+    my $content = $token->get_hash->{fs}->{f};
+    my $feats = $content->{fs}->{f} or return;
+
+    # In case there is only a single feature, wrap it in an array
+    $feats = [$feats] if ref $feats ne 'ARRAY';
+
+    foreach my $f (@$feats) {
+      my $name = $f->{-name} or next;
+      # Skip unsupported features and features without text
+      my $prefix = $prefix_for{$name} or next;
+      my $value = $f->{'#text'} or next;
+      $terms->add_by_term($prefix . $value);
+    };
+  };
+
+  $$self->add_tokendata(
+    foundry => 'spacy',
+    layer => 'morpho',
+    cb => $add_terms
+  ) or return;
+  return 1;
+};
+
+sub layer_info {
+  return [qw!spacy/l=tokens spacy/p=tokens!]; # Layer infos of this annotation
+};
+
+1;

diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 06dd102..2a16e6b 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm

@@ -16,7 +16,7 @@
 
 our @EXPORT_OK = qw(get_file_name get_file_name_from_glob);
 
-our $VERSION = '0.52';
+our $VERSION = '0.53';
 
 has 'path';
 has [qw/text_sigle doc_sigle corpus_sigle/];

diff --git a/script/korapxml2krill b/script/korapxml2krill
index 51352e8..77a1260 100755
--- a/script/korapxml2krill
+++ b/script/korapxml2krill

@@ -171,9 +171,12 @@
 # 2023/02/13
 # - Fix temporary-extract handling from configuration file.
 #
+# 2024/03/20
+# - Added Spacy support.
+#
 # ----------------------------------------------------------
 
-our $LAST_CHANGE = '2023/05/16';
+our $LAST_CHANGE = '2024/03/20';
 our $LOCAL = $FindBin::Bin;
 our $KORAL_VERSION = 0.03;
 our $VERSION_MSG = <<"VERSION";
@@ -514,6 +517,10 @@
      ['Sgbr', 'Lemma'],
      ['Sgbr', 'Morpho']);
 
+# Spacy
+push(@layers,
+     ['Spacy', 'Morpho']);
+
 # Talismane
 push(@layers,
      ['Talismane', 'Dependency'],
@@ -1521,6 +1528,9 @@
     #Lemma
     #Morpho
 
+  Spacy
+    #Morpho
+
   Talismane
     #Dependency
     #Morpho

diff --git a/t/annotation/corpus/doc/0001/spacy/morpho.xml b/t/annotation/corpus/doc/0001/spacy/morpho.xml
new file mode 100644
index 0000000..3328a8a
--- /dev/null
+++ b/t/annotation/corpus/doc/0001/spacy/morpho.xml

@@ -0,0 +1,206 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<layer docid="Corpus_Doc.0001" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">
+<spanList>
+  <span id="s1_n1" from="0" to="3">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">ADP</f>
+      <f name="lemma">zu</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s1_n2" from="4" to="11">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">ADJ</f>
+      <f name="lemma">letzter</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s1_n3" from="12" to="23">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">ADJ</f>
+      <f name="lemma">kulturell</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s1_n4" from="24" to="30">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">NOUN</f>
+      <f name="lemma">Anlass</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s2_n1" from="31" to="35">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">VERB</f>
+      <f name="lemma">laden</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s2_n2" from="36" to="39">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">DET</f>
+      <f name="lemma">der</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s2_n3" from="" to="">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">NOUN</f>
+      <f name="lemma">Leitung</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n1" from="48" to="51">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">DET</f>
+      <f name="lemma">der</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n2" from="52" to="63">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">NOUN</f>
+      <f name="lemma">Schulheim</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n3" from="64" to="73">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">PROPN</f>
+      <f name="lemma">Hofbergli</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n4" from="74" to="77">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">ADV</f>
+      <f name="lemma">ein</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n5" from="77" to="78">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">PUNCT</f>
+      <f name="lemma">--</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n6" from="79" to="84">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">SCONJ</f>
+      <f name="lemma">bevor</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n7" from="85" to="88">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">DET</f>
+      <f name="lemma">der</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n8" from="89" to="96">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">NOUN</f>
+      <f name="lemma">Betrieb</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n9" from="97" to="101">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">NOUN</f>
+      <f name="lemma">Ende</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n10" from="102" to="111">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">NOUN</f>
+      <f name="lemma">Schuljahr</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n11" from="112" to="123">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">VERB</f>
+      <f name="lemma">einstellen</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s3_n12" from="124" to="128">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">AUX</f>
+      <f name="lemma">werden</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+  <span id="s4_n1" from="48" to="51">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="pos">PUNCT</f>
+      <f name="lemma">--</f>
+     </fs>
+    </f>
+   </fs>
+  </span>
+ </spanList>
+</layer>

diff --git a/t/annotation/spacy_morpho.t b/t/annotation/spacy_morpho.t
new file mode 100644
index 0000000..8999acc
--- /dev/null
+++ b/t/annotation/spacy_morpho.t

@@ -0,0 +1,55 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use utf8;
+use Test::More;
+use KorAP::XML::Annotation::Spacy::Morpho;
+use Scalar::Util qw/weaken/;
+use Data::Dumper;
+use lib 't/annotation';
+use TestInit;
+
+ok(my $tokens = TestInit::tokens('0001'), 'Parse tokens');
+
+ok($tokens->add('Spacy', 'Morpho'), 'Add Structure');
+
+my $data = $tokens->to_data->{data};
+
+like($data->{foundries}, qr!spacy/morpho!, 'Foundry is listed');
+like($data->{layerInfos}, qr!spacy/p=tokens!, 'POS layer info is listed');
+like($data->{layerInfos}, qr!spacy/l=tokens!, 'Lemma layer info is listed');
+
+# Lemma ('spacy/l:') and POS ('spacy/p:') terms of selected tokens
+is($data->{stream}->[0]->[5], 'spacy/l:zu', 'Lemma');
+is($data->{stream}->[0]->[6], 'spacy/p:ADP', 'POS');
+
+is($data->{stream}->[3]->[3], 'spacy/l:Anlass', 'Lemma');
+is($data->{stream}->[3]->[4], 'spacy/p:NOUN', 'POS');
+
+is($data->{stream}->[10]->[3], 'spacy/l:ein', 'Lemma');
+is($data->{stream}->[10]->[4], 'spacy/p:ADV', 'POS');
+
+is($data->{stream}->[13]->[3], 'spacy/l:Betrieb', 'Lemma');
+
+is($data->{stream}->[-1]->[3], 'spacy/l:werden', 'Lemma');
+is($data->{stream}->[-1]->[4], 'spacy/p:AUX', 'POS');
+
+is($data->{stream}->[11]->[3], 'spacy/l:bevor', 'Lemma');
+is($data->{stream}->[11]->[4], 'spacy/p:SCONJ', 'POS');
+
+# Surface terms of the last tokens in the stream
+is($data->{stream}->[12]->[1], 'i:der','Surface');
+is($data->{stream}->[13]->[1], 'i:betrieb','Surface');
+is($data->{stream}->[14]->[1], 'i:ende','Surface');
+is($data->{stream}->[15]->[1], 'i:schuljahr','Surface');
+is($data->{stream}->[16]->[1], 'i:eingestellt','Surface');
+is($data->{stream}->[17]->[1], 'i:wird','Surface');
+
+ok(!$data->{stream}->[18],'Nothing');
+
+is(scalar(@{$data->{stream}}), 18, 'Length');
+
+done_testing;
+
+__END__
+

diff --git a/t/corpus/archives/wpd15-single.spacy.zip b/t/corpus/archives/wpd15-single.spacy.zip
new file mode 100644
index 0000000..5bbc121
--- /dev/null
+++ b/t/corpus/archives/wpd15-single.spacy.zip
Binary files differ

diff --git a/t/script/archive.t b/t/script/archive.t
index 0f57183..41389bb 100644
--- a/t/script/archive.t
+++ b/t/script/archive.t

@@ -123,7 +123,7 @@
   ok(($json_1 = decode_json $file), 'decode json');
 
   is($json_1->{data}->{tokenSource}, 'tree_tagger#tokens', 'TokenSource');
-  is($json_1->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
+  is($json_1->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences spacy spacy/morpho treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
   is($json_1->{textSigle}, 'Corpus/Doc/0001', 'Sigle');
 
   ok(-f $json_2, 'Json file exists');

diff --git a/t/script/single.t b/t/script/single.t
index 01d119f..4772c1c 100644
--- a/t/script/single.t
+++ b/t/script/single.t

@@ -54,7 +54,7 @@
 is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
 is($json->{title}, 'Beispiel Text', 'Title');
 is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Title');
-is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
+is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences spacy spacy/morpho treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
 like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
 is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
 is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
@@ -87,7 +87,7 @@
 is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
 is($json->{title}, 'Beispiel Text', 'Title');
 is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
-is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
+is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences spacy spacy/morpho treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
 like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
 is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
 
@@ -227,11 +227,41 @@
 is($json->{fields}->[22]->{'@type'}, 'koral:field');
 
 is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Title');
-is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
+is($json->{data}->{foundries}, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure glemm glemm/morpho mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences spacy spacy/morpho treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Foundries');
 like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Foundries');
 is($json->{data}->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Paragraphs');
 is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
 
+my $token = join(',',@{$json->{data}->{stream}->[1]}); # All terms of the second token
+
+like($token, qr!<>:xip\/c:AP\$<b>64<i>4<i>11<i>2<b>5!, 'xip/c:AP span');
+like($token, qr!<>:xip\/c:ADJ\$<b>64<i>4<i>11<i>2<b>6!, 'xip/c:ADJ span');
+like($token, qr!<>:cnx\/c:np\$<b>64<i>4<i>30<i>4<b>0!, 'cnx/c:np span');
+like($token, qr!<>:xip\/c:NP\$<b>64<i>4<i>30<i>4<b>3!, 'xip/c:NP span');
+like($token, qr!<>:xip\/c:NPA\$<b>64<i>4<i>30<i>4<b>4!, 'xip/c:NPA span');
+like($token, qr!>:mate\/d:NK\$<b>32<i>3!, 'mate/d:NK relation');
+like($token, qr!_1\$<i>4<i>11!, 'Token offsets');
+like($token, qr!cnx\/l:letzt!, 'cnx/l');
+like($token, qr!cnx\/p:A!, 'cnx/p');
+like($token, qr!cnx\/syn:\@PREMOD!, 'cnx/syn');
+like($token, qr!corenlp\/p:ADJ!, 'corenlp/p');
+like($token, qr!glemm\/l:__letzt-!, 'glemm/l');
+like($token, qr!i:letzten!, 'Surface (i:)');
+like($token, qr!mate\/l:letzter!, 'mate/l');
+like($token, qr!mate\/m:case:dat!, 'mate/m:case');
+like($token, qr!mate\/m:degree:pos!, 'mate/m:degree');
+like($token, qr!mate\/m:gender:neut!, 'mate/m:gender');
+like($token, qr!mate\/m:number:sg!, 'mate/m:number');
+like($token, qr!mate\/p:ADJA!, 'mate/p');
+like($token, qr!opennlp\/p:ADJA!, 'opennlp/p');
+like($token, qr!s:letzten!, 'Surface (s:)');
+like($token, qr!spacy\/l:letzter!, 'spacy/l');
+like($token, qr!spacy\/p:ADJ!, 'spacy/p');
+like($token, qr!tt\/l:letzt!, 'tt/l');
+like($token, qr!tt\/p:ADJA!, 'tt/p');
+like($token, qr!xip\/l:letzt!, 'xip/l');
+like($token, qr!xip\/p:ADJ!, 'xip/p');
+
 # Delete output
 unlink $output;
 ok(!-f $output, 'Output does not exist');
commit	b8c538256f2749fa184894d52052660473c6b9ee	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Sat Mar 16 18:54:08 2024 +0100
committer	Akron <nils@diewald-online.de>	Wed Mar 20 11:39:45 2024 +0100
tree	46b06e4151864f7cf062850e45330a9be4e1723b
parent	a351837ef17c26ce1ade9e503a1617c624bc41b4 [diff]