| package KorAP::XML::Index::MultiTermToken; |
| use KorAP::XML::Index::MultiTerm; |
| use Scalar::Util qw/blessed/; |
| use List::MoreUtils 'uniq'; |
| use Carp qw/carp croak/; |
| use strict; |
| use warnings; |
| |
| # This tries to be highly optimized (it's not supposed to be readable) |
| # but is rather slow on sorting relations. |
| # Should be replaced by an efficient implementation! |
| |
| sub new { |
| bless [], shift; |
| }; |
| |
| |
| sub add { |
| my $self = shift; |
| |
| my $mt; |
| unless (blessed $_[0]) { |
| if (@_ == 1) { |
| $mt = KorAP::XML::Index::MultiTerm->new(term => $_[0]); |
| } |
| else { |
| $mt = KorAP::XML::Index::MultiTerm->new(@_); |
| }; |
| } |
| else { |
| $mt = $_[0]; |
| }; |
| $self->[0] //= []; |
| push(@{$self->[0]}, $mt); |
| $mt; |
| }; |
| |
| # 0 -> mt |
| |
| # 1 |
| sub o_start { |
| if (defined $_[1]) { |
| return $_[0]->[1] = $_[1]; |
| }; |
| $_[0]->[1]; |
| }; |
| |
| # 2 |
| sub o_end { |
| if (defined $_[1]) { |
| return $_[0]->[2] = $_[1]; |
| }; |
| $_[0]->[2]; |
| }; |
| |
| # 3: Return a new term id |
| sub id_counter { |
| $_[0]->[3] //= 1; |
| return $_[0]->[3]++; |
| }; |
| |
| sub surface { |
| substr($_[0]->[0]->[0]->term,2); |
| }; |
| |
| sub lc_surface { |
| substr($_[0]->[0]->[1]->term,2); |
| }; |
| |
| sub to_array { |
| my $self = shift; |
| [uniq(map($_->to_string, sort _sort @{$self->[0]}))]; |
| }; |
| |
| # Get multiterm based on term content (treat as prefix) |
| # TODO: This currently only works for simple terms! |
| sub grep_mt { |
| my $self = shift; |
| my $term = shift; |
| foreach (@{$self->[0]}) { |
| return $_ if index($_->term, $term) == 0; |
| }; |
| return; |
| }; |
| |
| sub to_string { |
| my $self = shift; |
| my $string = '[(' . $self->o_start . '-'. $self->o_end . ')'; |
| $string .= join ('|', @{$self->to_array}); |
| $string .= ']'; |
| return $string; |
| }; |
| |
| # Get relation based positions |
| sub _rel_right_pos { |
| # Both are either < or > |
| |
| # term to term - right token |
| if ($_[1] =~ m/^<i>(\d+)(?:<s>|$)/o) { |
| return ($1, $1); |
| } |
| |
| # term to span - right token |
| # (including character offsets) |
| elsif ($_[0] == 33 && $_[1] =~ m/^(?:<i>\d+){2}<i>(\d+)<i>(\d+)(?:<s>|$)/o) { |
| return ($1, $2); |
| } |
| |
| # span to term |
| elsif ($_[0] == 34 && $_[1] =~ m/^(?:<i>\d+){3}<i>(\d+)(?:<s>|$)/o) { |
| return ($1, $1); |
| } |
| |
| # span-to-span |
| elsif ($_[0] == 35 && $_[1] =~ m/^(?:<i>\d+){5}<i>(\d+)<i>(\d+)(?:<s>|$)/o) { |
| return ($1, $2); |
| }; |
| |
| # span to term - right token |
| carp 'Unknown relation format! ' .$_[0] . ':' . $_[1]; |
| return (0,0); |
| }; |
| |
| # Sort spans, attributes and relations |
| sub _sort { |
| |
| # Both are no spans |
| if (index($a->[5], '<>:') != 0 && index($b->[5], '<>:') != 0) { |
| |
| # Both are attributes |
| # Order attributes by reference id |
| if (index($a->[5], '@:') == 0 && index($b->[5], '@:') == 0) { |
| |
| # Check TUI |
| my ($a_id) = ($a->[0] =~ m/^<s>(\d+)/); |
| my ($b_id) = ($b->[0] =~ m/^<s>(\d+)/); |
| if ($a_id > $b_id) { |
| return 1; |
| } |
| elsif ($a_id < $b_id) { |
| return -1; |
| } |
| else { |
| return 1; |
| }; |
| } |
| |
| # Both are relations |
| elsif ( |
| (index($a->[5],'<:') == 0 || index($a->[5],'>:') == 0) && |
| (index($b->[5], '<:') == 0 || index($b->[5],'>:') == 0)) { |
| |
| my $a_end = ($a->pti < 34 ? $a->p_start : ( |
| ($a->pti == 35 ? ($a->[0] =~ /^(?:<i>\d+){4}<i>(\d+)</ && $1) : |
| ($a->[0] =~ /^(?:<i>\d+){2}<i>(\d+)</ && $1) |
| ) |
| )); |
| |
| my $b_end = ($b->pti < 34 ? $b->p_start : ( |
| ($b->pti == 35 ? ($b->[0] =~ /^(?:<i>\d+){4}<i>(\d+)</ && $1) : |
| ($b->[0] =~ /^(?:<i>\d+){2}<i>(\d+)</ && $1) |
| ) |
| )); |
| |
| # my $a_end = $a->[2] // 0; |
| # my $b_end = $b->[2] // 0; |
| |
| # left is p_end |
| if ($a_end < $b_end) { |
| return -1; |
| } |
| elsif ($a_end > $b_end) { |
| return 1; |
| } |
| else { |
| # Both are either > or < |
| |
| # Check for right positions |
| (my $a_start, $a_end) = _rel_right_pos($a->pti, $a->[0]); |
| (my $b_start, $b_end) = _rel_right_pos($b->pti, $b->[0]); |
| if ($a_start < $b_start) { |
| return -1; |
| } |
| elsif ($a_start > $b_start) { |
| return 1; |
| } |
| elsif ($a_end < $b_end) { |
| return -1; |
| } |
| elsif ($a_end > $b_end) { |
| return 1; |
| } |
| else { |
| return 1; |
| }; |
| }; |
| }; |
| |
| # This has to be sorted alphabetically! |
| return $a->[5] cmp $b->[5]; |
| } |
| |
| # Not identical |
| elsif (index($a->[5], '<>:') != 0) { |
| return $a->[5] cmp $b->[5]; |
| } |
| # Not identical |
| elsif (index($b->[5], '<>:') != 0) { |
| return $a->[5] cmp $b->[5]; |
| } |
| |
| # Sort both spans |
| else { |
| if ($a->[2] < $b->[2]) { |
| return -1; |
| } |
| elsif ($a->[2] > $b->[2]) { |
| return 1; |
| } |
| |
| # Check depth |
| else { |
| my ($a_depth) = ($a->[0] ? $a->[0] =~ m/<b>(\d+)(?:<s>\d+)?$/ : 0); |
| my ($b_depth) = ($b->[0] ? $b->[0] =~ m/<b>(\d+)(?:<s>\d+)?$/ : 0); |
| |
| $a_depth //= 0; |
| $b_depth //= 0; |
| if ($a_depth < $b_depth) { |
| return -1; |
| } |
| elsif ($a_depth > $b_depth) { |
| return 1; |
| } |
| else { |
| return $a->[5] cmp $b->[5]; |
| }; |
| }; |
| }; |
| }; |
| |
| |
| sub to_solr { |
| my $self = shift; |
| my @array = map { $_->to_solr(0) } @{$self->{mt}}; |
| $array[0]->{i} = 1; |
| return \@array; |
| }; |
| |
| |
| 1; |
| |
| |
| __END__ |
| |
| [ |
| { |
| "e":128, |
| "i":22, |
| "p":"DQ4KDQsODg8=", |
| "s":123, |
| "t":"one", |
| "y":"word" |
| }, |
| { |
| "e":8, |
| "i":1, |
| "s":5, |
| "t":"two", |
| "y":"word" |
| }, |
| { |
| "e":22, |
| "i":1, |
| "s":20, |
| "t":"three", |
| "y":"foobar" |
| } |
| ] |
| |