Add support for Universal Dependency annotations
Change-Id: I2d55192cdc2c442aa27a6c600ae989771c2c7898
diff --git a/lib/KorAP/XML/Annotation/UDPipe/Dependency.pm b/lib/KorAP/XML/Annotation/UDPipe/Dependency.pm
new file mode 100644
index 0000000..10a05ad
--- /dev/null
+++ b/lib/KorAP/XML/Annotation/UDPipe/Dependency.pm
@@ -0,0 +1,112 @@
+package KorAP::XML::Annotation::UDPipe::Dependency;
+use KorAP::XML::Annotation::Base;
+use Data::Dumper;
+use strict;
+use warnings;
+
+# Index the dependency relations of the 'ud' foundry.
+#
+# Every relation of a token is added twice to the token stream: as an
+# outgoing '>:ud/d:LABEL' term at the source token and as an incoming
+# '<:ud/d:LABEL' term at the start of the target. Targets are resolved
+# as tokens first and as spans second; relations with any other target
+# are logged and skipped.
+#
+# Returns 1 on success and nothing if add_tokendata() returns false.
+sub parse {
+  my $self = shift;
+
+  # TODO: Create UD tree here - for indirect dependency
+  #   >>:xip/d:SUBJ<i>566<i>789
+
+  # Relation data
+  $$self->add_tokendata(
+    foundry => 'ud',
+    layer   => 'dependency',
+    cb      => sub {
+      my ($stream, $source, $tokens) = @_;
+
+      # Get MultiTermToken from stream for source
+      my $mtt = $stream->pos($source->get_pos);
+
+      # Serialized information from token
+      my $content = $source->get_hash;
+
+      # Get relation information: a single relation is not wrapped
+      # in an array and a token without relations has nothing to add
+      my $rel = $content->{rel};
+      $rel = defined $rel ? [$rel] : [] unless ref $rel eq 'ARRAY';
+
+      # Iterate over relations
+      foreach my $relation (@$rel) {
+        my $label = $relation->{-label};
+        my $from  = $relation->{span}->{-from};
+        my $to    = $relation->{span}->{-to};
+
+        # TODO: The tui of the left and the right part ('<s>0')
+        #   is not part of the payloads yet
+
+        # Target
+        my $target = $tokens->token($from, $to);
+
+        # Relation is term-to-term with a found target!
+        if ($target) {
+
+          # Outgoing relation at the source token
+          my $mt = $mtt->add_by_term('>:ud/d:' . $label);
+          $mt->set_pti(32); # term-to-term relation
+          $mt->set_payload(
+            '<i>' . $target->get_pos # right part token position
+          );
+
+          # Incoming relation at the target token
+          my $target_mtt = $stream->pos($target->get_pos);
+          $mt = $target_mtt->add_by_term('<:ud/d:' . $label);
+          $mt->set_pti(32); # term-to-term relation
+          $mt->set_payload(
+            '<i>' . $source->get_pos # left part token position
+          );
+        }
+
+        # Relation is possibly term-to-element with a found target!
+        elsif ($target = $tokens->span($from, $to)) {
+
+          # Outgoing relation at the source token
+          my $mt = $mtt->add_by_term('>:ud/d:' . $label);
+          $mt->set_pti(33); # term-to-element relation
+          $mt->set_payload(
+            '<i>' . $target->get_o_start . # start position
+            '<i>' . $target->get_o_end .   # end position
+            '<i>' . $target->get_p_start . # right part start position
+            '<i>' . $target->get_p_end     # right part end position
+          );
+
+          # Incoming relation at the start position of the target span
+          my $target_mtt = $stream->pos($target->get_p_start);
+          $mt = $target_mtt->add_by_term('<:ud/d:' . $label);
+          $mt->set_pti(34); # element-to-term relation
+          $mt->set_payload(
+            '<i>' . $target->get_o_start . # start position
+            '<i>' . $target->get_o_end .   # end position
+            '<i>' . $target->get_p_end .   # right part end position
+            '<i>' . $source->get_pos       # left part token position
+          );
+        }
+
+        # Neither a token nor a span was found for the target
+        else {
+          $$self->log->warn('Relation currently not supported: ' . Dumper($content));
+        };
+      };
+    }) or return;
+
+  return 1;
+};
+
+# List the layers provided by this annotation
+sub layer_info {
+  return ['ud/d=rels'];
+};
+
+
+1;
diff --git a/lib/KorAP/XML/Annotation/UDPipe/Morpho.pm b/lib/KorAP/XML/Annotation/UDPipe/Morpho.pm
new file mode 100644
index 0000000..3c80b87
--- /dev/null
+++ b/lib/KorAP/XML/Annotation/UDPipe/Morpho.pm
@@ -0,0 +1,76 @@
+package KorAP::XML::Annotation::UDPipe::Morpho;
+use KorAP::XML::Annotation::Base;
+use strict;
+use warnings;
+
+# Index the morphological annotations of the 'ud' foundry.
+#
+# Part-of-speech tags are added as 'ud/p:', lemmas as 'ud/l:' and
+# the lowercased features of the msd as 'ud/m:' terms to the token.
+#
+# Returns 1 on success and nothing if add_tokendata() returns false.
+sub parse {
+  my $self = shift;
+
+  $$self->add_tokendata(
+    foundry => 'ud',
+    layer   => 'morpho',
+    cb      => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->get_pos);
+
+      # Outer feature wrapping the feature structure
+      # (assumed to be a single hash - TODO confirm)
+      my $content = $token->get_hash->{fs}->{f};
+
+      # If no array - make array
+      my $features = $content->{fs}->{f};
+      $features = [$features] unless ref($features) eq 'ARRAY';
+
+      foreach my $f (@$features) {
+
+        # Ignore everything that is not a named feature with a value.
+        # The value is checked for length instead of truth, as '0'
+        # is a valid lemma
+        next unless ref($f) eq 'HASH';
+        my $name  = $f->{-name};
+        my $value = $f->{'#text'};
+        next unless defined($name) && defined($value) && length($value);
+
+        # pos tag
+        if ($name eq 'pos') {
+          $mtt->add_by_term('ud/p:' . $value);
+        }
+
+        # lemma tag
+        elsif ($name eq 'lemma') {
+          $mtt->add_by_term('ud/l:' . $value);
+        }
+
+        # msd tag
+        elsif ($name eq 'msd') {
+
+          # Split all values
+          foreach my $feature (split /\|/, $value) {
+            my ($key, $val) = split /=/, lc($feature);
+
+            # Ignore empty features, e.g. between doubled separators
+            next unless defined($key) && length($key);
+
+            # case, tense, number, mood, person, degree, gender
+            my $suffix = (defined($val) && length($val)) ? ':' . $val : '';
+            $mtt->add_by_term('ud/m:' . $key . $suffix);
+          };
+        };
+      };
+    }) or return;
+
+  return 1;
+};
+
+# List the layers provided by this annotation
+sub layer_info {
+  return ['ud/p=tokens', 'ud/l=tokens', 'ud/m=tokens'];
+};
+
+1;
diff --git a/lib/KorAP/XML/Meta/I5.pm b/lib/KorAP/XML/Meta/I5.pm
index a675e4f..14999b8 100644
--- a/lib/KorAP/XML/Meta/I5.pm
+++ b/lib/KorAP/XML/Meta/I5.pm
@@ -242,7 +242,7 @@
# text title
elsif ($type eq 'text') {
unless ($self->{T_title}) {
- if ($titles = $dom->find('fileDesc > titleStmt > t\.title')) {
+ if ($titles = $dom->find('fileDesc > titleStmt > t\.title, fileDesc > titleStmt > title')) {
if ($lang) {
$title = $titles->first(sub{ $_->attr('xml:lang') && lc($_->attr('xml:lang')) eq lc($lang) });
};