Accept "pos" as an alias for ctag and default certainty to 1 for TreeTagger
Fixes CoNLL-U-Treetagger compatibility.
Change-Id: I6301b3d826da8330ee33d83a286f765b08af04b6
diff --git a/Changes b/Changes
index 338a5d5..aac2050 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,9 @@
0.53 2023-03-20
- Added Spacy support. (kupietz)
+ - Support 'pos' as an alternative to 'ctag'
+ in TreeTagger. (kupietz)
+ - Change default certainty value in TreeTagger
+ to 1.
0.52 2023-01-23
- Introduced 'quiet' flag.
diff --git a/lib/KorAP/XML/Annotation/TreeTagger/Morpho.pm b/lib/KorAP/XML/Annotation/TreeTagger/Morpho.pm
index 81fc525..e66376c 100644
--- a/lib/KorAP/XML/Annotation/TreeTagger/Morpho.pm
+++ b/lib/KorAP/XML/Annotation/TreeTagger/Morpho.pm
@@ -26,7 +26,7 @@
$content = $fs->{fs}->{f};
my @val;
- my $certainty = 0;
+ my $certainty = 1;
foreach (@$content) {
if ($_->{-name} eq 'certainty') {
@@ -54,7 +54,7 @@
};
# pos
- if (($_->{-name} eq 'ctag') && ($found = $_->{'#text'})) {
+ if (($_->{-name} eq 'ctag' || $_->{-name} eq 'pos') && ($found = $_->{'#text'})) {
$pos{$found} += $certainty // 1;
};
};
diff --git a/t/annotation/corpus/doc/0003/data.xml b/t/annotation/corpus/doc/0003/data.xml
new file mode 100644
index 0000000..a1dad20
--- /dev/null
+++ b/t/annotation/corpus/doc/0003/data.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="text.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+
+<raw_text docid="Corpus_Doc.0003" xmlns="http://ids-mannheim.de/ns/KorAP">
+ <metadata file="metadata.xml" />
+ <text>Zum letzten kulturellen Anlass lädt die Leitung des Schulheimes Hofbergli ein, bevor der Betrieb Ende Schuljahr eingestellt wird.</text>
+</raw_text>
diff --git a/t/annotation/corpus/doc/0003/header.xml b/t/annotation/corpus/doc/0003/header.xml
new file mode 100644
index 0000000..dd5c085
--- /dev/null
+++ b/t/annotation/corpus/doc/0003/header.xml
@@ -0,0 +1,66 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsHeader type="text" pattern="text" status="new" version="1.1" TEIform="teiHeader">
+ <fileDesc>
+ <titleStmt>
+ <textSigle>Corpus/Doc.Text</textSigle>
+ <t.title assemblage="regular"/>
+ </titleStmt>
+ <publicationStmt>
+ <distributor/>
+ <pubAddress/>
+ <availability region="world" status="unknown"/>
+ <pubDate/>
+ </publicationStmt>
+ <sourceDesc>
+ <biblStruct>
+ <analytic>
+ <h.title type="main">Beispiel Text</h.title>
+ <h.title type="sub">Beispiel Text Untertitel</h.title>
+ <h.author>Mustermann, Max</h.author>
+ <editor>Monika Mustermann</editor>
+ <imprint/>
+ <biblScope type="pp"/>
+ <biblScope type="suppl"/>
+ <biblScope type="suppltitle"/>
+ <biblNote n="1"/>
+ </analytic>
+ <monogr>
+ <h.title type="main">Beispiel Text</h.title>
+ <h.title type="sub">Best of!</h.title>
+ <h.author>Mustermann, Max</h.author>
+ <editor>Monika Mustermann</editor>
+ <imprint>
+ <publisher>Artificial articles Inc.</publisher>
+ <pubDate type="year">2001</pubDate>
+ <pubDate type="month">04</pubDate>
+ <pubDate type="day">02</pubDate>
+ <pubPlace>Mannheim</pubPlace>
+ </imprint>
+ <biblScope type="issue"/>
+ <biblScope type="issueplace"/>
+ </monogr>
+ </biblStruct>
+ <reference type="complete" assemblage="regular"/>
+ <reference type="short" assemblage="regular"/>
+ </sourceDesc>
+ </fileDesc>
+ <profileDesc>
+ <creation>
+ <creatDate>1999.06.01</creatDate>
+ </creation>
+ <textClass>
+ <catRef n="1" target="topic.freizeit-unterhaltung.vereine-veranstaltungen" scheme="topic"/>
+ <h.keywords>
+ <keyTerm/>
+ </h.keywords>
+ </textClass>
+ <textDesc>
+ <textType>Zeitung: Tageszeitung</textType>
+ <textTypeArt>Bericht</textTypeArt>
+ <textDomain/>
+ <column/>
+ </textDesc>
+ </profileDesc>
+</idsHeader>
diff --git a/t/annotation/corpus/doc/0003/opennlp/tokens.xml b/t/annotation/corpus/doc/0003/opennlp/tokens.xml
new file mode 100644
index 0000000..a56e28c
--- /dev/null
+++ b/t/annotation/corpus/doc/0003/opennlp/tokens.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?><?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?><layer xmlns="http://ids-mannheim.de/ns/KorAP" docid="Corpus_Doc.0003" VERSION="KorAP-0.4">
+<spanList>
+ <span id="s_7" from="0" to="3"/>
+ <span id="s_8" from="4" to="11"/>
+ <span id="s_9" from="12" to="23"/>
+ <span id="s_10" from="24" to="30"/>
+ <span id="s_11" from="31" to="35"/>
+ <span id="s_12" from="36" to="39"/>
+ <span id="s_13" from="40" to="47"/>
+ <span id="s_14" from="48" to="51"/>
+ <span id="s_15" from="52" to="63"/>
+ <span id="s_16" from="64" to="73"/>
+ <span id="s_17" from="74" to="77"/>
+ <span id="s_18" from="77" to="78"/>
+ <span id="s_19" from="79" to="84"/>
+ <span id="s_20" from="85" to="88"/>
+ <span id="s_21" from="89" to="96"/>
+ <span id="s_22" from="97" to="101"/>
+ <span id="s_23" from="102" to="111"/>
+ <span id="s_24" from="112" to="123"/>
+ <span id="s_25" from="124" to="128"/>
+ <span id="s_26" from="128" to="129"/>
+ </spanList>
+</layer>
diff --git a/t/annotation/corpus/doc/0003/tree_tagger/morpho.xml b/t/annotation/corpus/doc/0003/tree_tagger/morpho.xml
new file mode 100644
index 0000000..50228ea
--- /dev/null
+++ b/t/annotation/corpus/doc/0003/tree_tagger/morpho.xml
@@ -0,0 +1,206 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<layer docid="Corpus_Doc.0003" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">
+<spanList>
+ <span id="s1_n1" from="0" to="3">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">APPRART</f>
+ <f name="lemma">zu+die</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s1_n2" from="4" to="11">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">ADJA</f>
+ <f name="lemma">letzt</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s1_n3" from="12" to="23">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">ADJA</f>
+ <f name="lemma">kulturell</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s1_n4" from="24" to="30">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NN</f>
+ <f name="lemma">Anlass</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s2_n1" from="31" to="35">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">VVFIN</f>
+ <f name="lemma">laden</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s2_n2" from="36" to="39">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">ART</f>
+ <f name="lemma">die</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s2_n3" from="40" to="47">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NN</f>
+ <f name="lemma">Leitung</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n1" from="48" to="51">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">ART</f>
+ <f name="lemma">die</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n2" from="52" to="63">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NN</f>
+ <f name="lemma">Schulheim</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n3" from="64" to="73">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NN</f>
+ <f name="lemma">&lt;unknown&gt;</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n4" from="74" to="77">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">PTKVZ</f>
+ <f name="lemma">ein</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n5" from="77" to="78">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">$,</f>
+ <f name="lemma">,</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n6" from="79" to="84">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">KOUS</f>
+ <f name="lemma">bevor</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n7" from="85" to="88">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">ART</f>
+ <f name="lemma">die</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n8" from="89" to="96">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NN</f>
+ <f name="lemma">Betrieb</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n9" from="97" to="101">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NN</f>
+ <f name="lemma">Ende</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n10" from="102" to="111">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NN</f>
+ <f name="lemma">Schuljahr</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n11" from="112" to="123">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">VVPP</f>
+ <f name="lemma">einstellen</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3_n12" from="124" to="128">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">VAFIN</f>
+ <f name="lemma">werden</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s4_n1" from="128" to="129">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">$.</f>
+ <f name="lemma">.</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ </spanList>
+</layer>
diff --git a/t/annotation/corpus/doc/0003/tree_tagger/tokens.xml b/t/annotation/corpus/doc/0003/tree_tagger/tokens.xml
new file mode 100644
index 0000000..a56e28c
--- /dev/null
+++ b/t/annotation/corpus/doc/0003/tree_tagger/tokens.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?><?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?><layer xmlns="http://ids-mannheim.de/ns/KorAP" docid="Corpus_Doc.0003" VERSION="KorAP-0.4">
+<spanList>
+ <span id="s_7" from="0" to="3"/>
+ <span id="s_8" from="4" to="11"/>
+ <span id="s_9" from="12" to="23"/>
+ <span id="s_10" from="24" to="30"/>
+ <span id="s_11" from="31" to="35"/>
+ <span id="s_12" from="36" to="39"/>
+ <span id="s_13" from="40" to="47"/>
+ <span id="s_14" from="48" to="51"/>
+ <span id="s_15" from="52" to="63"/>
+ <span id="s_16" from="64" to="73"/>
+ <span id="s_17" from="74" to="77"/>
+ <span id="s_18" from="77" to="78"/>
+ <span id="s_19" from="79" to="84"/>
+ <span id="s_20" from="85" to="88"/>
+ <span id="s_21" from="89" to="96"/>
+ <span id="s_22" from="97" to="101"/>
+ <span id="s_23" from="102" to="111"/>
+ <span id="s_24" from="112" to="123"/>
+ <span id="s_25" from="124" to="128"/>
+ <span id="s_26" from="128" to="129"/>
+ </spanList>
+</layer>
diff --git a/t/annotation/tt_morpho.t b/t/annotation/tt_morpho.t
index 705f7d0..0544394 100644
--- a/t/annotation/tt_morpho.t
+++ b/t/annotation/tt_morpho.t
@@ -42,8 +42,40 @@
is($data->{stream}->[11]->[6], 'tt/p:PTKVZ$<b>129<b>51',
'Lemma');
+is(scalar(@{$data->{stream}}), 18);
+
+
+ok($tokens = TestInit::tokens('0003'), 'Parse tokens');
+
+ok($tokens->add('TreeTagger', 'Morpho'), 'Add Structure');
+
+$data = $tokens->to_data->{data};
+
+like($data->{foundries}, qr!treetagger/morpho!, 'data');
+like($data->{layerInfos}, qr!tt/p=tokens!, 'data');
+like($data->{layerInfos}, qr!tt/l=tokens!, 'data');
+
+is($data->{stream}->[0]->[5], 'tt/l:zu+die', 'POS');
+is($data->{stream}->[0]->[6], 'tt/p:APPRART', 'POS');
+
+is($data->{stream}->[3]->[3], 'tt/l:Anlass', 'POS');
+is($data->{stream}->[3]->[4], 'tt/p:NN', 'POS');
+
+is($data->{stream}->[10]->[3], 'tt/l:ein', 'POS');
+is($data->{stream}->[10]->[4], 'tt/p:PTKVZ', 'POS');
+
+is($data->{stream}->[13]->[3], 'tt/l:Betrieb', 'POS');
+
+is($data->{stream}->[-1]->[3], 'tt/l:werden', 'POS');
+is($data->{stream}->[-1]->[4], 'tt/p:VAFIN', 'POS');
+
+is($data->{stream}->[11]->[3], 'tt/l:bevor',
+ 'Lemma');
+is($data->{stream}->[11]->[4], 'tt/p:KOUS',
+ 'Lemma');
+ok(!$data->{stream}->[11]->[6], 'No alternatives');
+
+is(scalar(@{$data->{stream}}), 18);
done_testing;
-
__END__
-
diff --git a/t/script/archive.t b/t/script/archive.t
index 41389bb..b0244ca 100644
--- a/t/script/archive.t
+++ b/t/script/archive.t
@@ -103,15 +103,20 @@
# That's not really stable on slow machines!
my $out = stdout_from(sub { system($call); });
- ok($out =~ m!\[\$(\d+?):1\/2\]!s, $call . ' pid 1');
+ ok($out =~ m!\[\$(\d+?):1\/3\]!s, $call . ' pid 1');
my $pid1 = $1;
- ok($out =~ m!\[\$(\d+?):2\/2\]!s, $call . ' pid 2');
+ ok($out =~ m!\[\$(\d+?):2\/3\]!s, $call . ' pid 2');
my $pid2 = $1;
+ ok($out =~ m!\[\$(\d+?):3\/3\]!s, $call . ' pid 3');
+ my $pid3 = $1;
isnt($pid1, $pid2, 'No PID match');
+ isnt($pid2, $pid3, 'No PID match');
+ isnt($pid1, $pid3, 'No PID match');
ok($out =~ m!Processed .+?\/corpus-doc-0001\.json!s, $call);
ok($out =~ m!Processed .+?\/corpus-doc-0002\.json!s, $call);
+ ok($out =~ m!Processed .+?\/corpus-doc-0003\.json!s, $call);
ok(-d $output, 'Temporary directory still exists');
my $json_1 = catfile($output, 'corpus-doc-0001.json');