| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 1 | package KorAP::Field::MultiTermToken; |
| 2 | use KorAP::Field::MultiTerm; |
| Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 3 | use List::MoreUtils 'uniq'; |
| Nils Diewald | a5565f6 | 2014-10-30 23:20:58 +0000 | [diff] [blame^] | 4 | use strict; |
| 5 | use warnings; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 6 | |
| Nils Diewald | a5565f6 | 2014-10-30 23:20:58 +0000 | [diff] [blame^] | 7 | # This tries to be highly optimized - it's not supposed to be readable |
| 8 | |
| 9 | sub new { |
| 10 | bless [], shift; |
| 11 | }; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 12 | |
| 13 | sub add { |
| 14 | my $self = shift; |
| 15 | my $mt; |
| 16 | unless (ref $_[0] eq 'MultiTerm') { |
| 17 | if (@_ == 1) { |
| Nils Diewald | a5565f6 | 2014-10-30 23:20:58 +0000 | [diff] [blame^] | 18 | $mt = KorAP::Field::MultiTerm->new(term => $_[0]); |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 19 | } |
| 20 | else { |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 21 | $mt = KorAP::Field::MultiTerm->new(@_); |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 22 | }; |
| 23 | } |
| 24 | else { |
| Nils Diewald | a5565f6 | 2014-10-30 23:20:58 +0000 | [diff] [blame^] | 25 | $mt = $_[0]; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 26 | }; |
| Nils Diewald | a5565f6 | 2014-10-30 23:20:58 +0000 | [diff] [blame^] | 27 | $self->[0] //= []; |
| 28 | push(@{$self->[0]}, $mt); |
| 29 | $mt; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 30 | }; |
| 31 | |
| Nils Diewald | a5565f6 | 2014-10-30 23:20:58 +0000 | [diff] [blame^] | 32 | # 0 -> mt |
| 33 | |
| 34 | # 1 |
| 35 | sub o_start { |
| 36 | if (defined $_[1]) { |
| 37 | return $_[0]->[1] = $_[1]; |
| 38 | }; |
| 39 | $_[0]->[1]; |
| 40 | }; |
| 41 | |
| 42 | # 2 |
| 43 | sub o_end { |
| 44 | if (defined $_[1]) { |
| 45 | return $_[0]->[2] = $_[1]; |
| 46 | }; |
| 47 | $_[0]->[2]; |
| 48 | }; |
| 49 | |
| 50 | # 3: Return a new term id |
| Nils Diewald | 79a355c | 2014-10-30 00:52:36 +0000 | [diff] [blame] | 51 | sub id_counter { |
| Nils Diewald | a5565f6 | 2014-10-30 23:20:58 +0000 | [diff] [blame^] | 52 | $_[0]->[3] //= 1; |
| 53 | return $_[0]->[3]++; |
| Nils Diewald | 79a355c | 2014-10-30 00:52:36 +0000 | [diff] [blame] | 54 | }; |
| 55 | |
| Nils Diewald | f03c680 | 2014-07-21 16:39:44 +0000 | [diff] [blame] | 56 | sub surface { |
| Nils Diewald | a5565f6 | 2014-10-30 23:20:58 +0000 | [diff] [blame^] | 57 | substr($_[0]->[0]->[0]->term,2); |
| Nils Diewald | f03c680 | 2014-07-21 16:39:44 +0000 | [diff] [blame] | 58 | }; |
| 59 | |
| 60 | sub lc_surface { |
| Nils Diewald | a5565f6 | 2014-10-30 23:20:58 +0000 | [diff] [blame^] | 61 | substr($_[0]->[0]->[1]->term,2); |
| Nils Diewald | f03c680 | 2014-07-21 16:39:44 +0000 | [diff] [blame] | 62 | }; |
| 63 | |
| Nils Diewald | a5565f6 | 2014-10-30 23:20:58 +0000 | [diff] [blame^] | 64 | sub to_array { |
| 65 | my $self = shift; |
| 66 | [uniq(map($_->to_string, sort _sort @{$self->[0]}))]; |
| 67 | }; |
| 68 | |
| 69 | |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 70 | sub to_string { |
| 71 | my $self = shift; |
| 72 | my $string = '[(' . $self->o_start . '-'. $self->o_end . ')'; |
| Nils Diewald | a5565f6 | 2014-10-30 23:20:58 +0000 | [diff] [blame^] | 73 | $string .= join ('|', @{$self->to_array}); |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 74 | $string .= ']'; |
| 75 | return $string; |
| 76 | }; |
| 77 | |
| Nils Diewald | a5565f6 | 2014-10-30 23:20:58 +0000 | [diff] [blame^] | 78 | # Get relation based positions |
| 79 | sub _rel_right_pos { |
| 80 | # token to token - right token |
| 81 | if ($_[0] =~ m/^<i>(\d+)<s>/o) { |
| 82 | return ($1, $1); |
| 83 | } |
| 84 | # token/span to span - right token |
| 85 | elsif ($_[0] =~ m/^<i>(\d+)<i>(\d+)<s>/o) { |
| 86 | return ($1, $2); |
| 87 | } |
| 88 | # span to token - right token |
| 89 | elsif ($_[0] =~ m/^<b>\d+<i>(\d+)<s>/o) { |
| 90 | return ($1, $1); |
| 91 | }; |
| 92 | warn 'Unknown relation format!'; |
| 93 | return (0,0); |
| 94 | }; |
| Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 95 | |
| Nils Diewald | a5565f6 | 2014-10-30 23:20:58 +0000 | [diff] [blame^] | 96 | # Sort spans, attributes and relations |
| 97 | sub _sort { |
| 98 | |
| 99 | # Both are no spans |
| 100 | if (index($a->[5], '<>:') != 0 && index($b->[5], '<>:') != 0) { |
| 101 | |
| 102 | # Both are attributes |
| 103 | # Order attributes by reference id |
| 104 | if (index($a->[5], '@:') == 0 && index($b->[5], '@:') == 0) { |
| 105 | my ($a_id) = ($a->[0] =~ m/^<s>(\d+)/); |
| 106 | my ($b_id) = ($b->[0] =~ m/^<s>(\d+)/); |
| 107 | if ($a_id > $b_id) { |
| 108 | return 1; |
| 109 | } |
| 110 | elsif ($a_id < $b_id) { |
| 111 | return -1; |
| 112 | } |
| 113 | else { |
| 114 | return 1; |
| 115 | }; |
| 116 | } |
| 117 | |
| 118 | # Both are relations |
| 119 | elsif ( |
| 120 | (index($a->[5],'<:') == 0 || index($a->[5],'>:') == 0) && |
| 121 | (index($b->[5], '<:') == 0 || index($b->[5],'>:') == 0)) { |
| 122 | my $a_end = $a->[2] // 0; |
| 123 | my $b_end = $b->[2] // 0; |
| 124 | |
| 125 | # left is p_end |
| 126 | if ($a_end < $b_end) { |
| 127 | return -1; |
| 128 | } |
| 129 | elsif ($a_end > $b_end) { |
| 130 | return 1; |
| 131 | } |
| 132 | else { |
| 133 | # Check for right positions |
| 134 | (my $a_start, $a_end) = _rel_right_pos($a->[0]); |
| 135 | (my $b_start, $b_end) = _rel_right_pos($b->[0]); |
| 136 | if ($a_start < $b_start) { |
| 137 | return -1; |
| 138 | } |
| 139 | elsif ($a_start > $b_start) { |
| 140 | return 1; |
| 141 | } |
| 142 | elsif ($a_end < $b_end) { |
| 143 | return -1; |
| 144 | } |
| 145 | elsif ($a_end > $b_end) { |
| 146 | return 1; |
| 147 | } |
| 148 | else { |
| 149 | return 1; |
| 150 | }; |
| 151 | }; |
| 152 | }; |
| 153 | |
| 154 | # This has to be sorted alphabetically! |
| 155 | return $a->[5] cmp $b->[5]; |
| 156 | } |
| 157 | |
| 158 | # Not identical |
| 159 | elsif (index($a->[5], '<>:') != 0) { |
| 160 | return $a->[5] cmp $b->[5]; |
| 161 | } |
| 162 | # Not identical |
| 163 | elsif (index($b->[5], '<>:') != 0) { |
| 164 | return $a->[5] cmp $b->[5]; |
| 165 | } |
| 166 | |
| 167 | # Sort both spans |
| 168 | else { |
| 169 | if ($a->[2] < $b->[2]) { |
| 170 | return -1; |
| 171 | } |
| 172 | elsif ($a->[2] > $b->[2]) { |
| 173 | return 1; |
| 174 | } |
| 175 | |
| 176 | # Check depth |
| 177 | else { |
| 178 | my ($a_depth) = ($a->[0] =~ m/^<b>(\d+)/); |
| 179 | my ($b_depth) = ($b->[0] =~ m/^<b>(\d+)/); |
| 180 | |
| 181 | $a_depth //= 0; |
| 182 | $b_depth //= 0; |
| 183 | if ($a_depth < $b_depth) { |
| 184 | return -1; |
| 185 | } |
| 186 | elsif ($a_depth > $b_depth) { |
| 187 | return 1; |
| 188 | } |
| 189 | else { |
| 190 | return 1; |
| 191 | }; |
| 192 | }; |
| 193 | }; |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 194 | }; |
| 195 | |
| Nils Diewald | 79a355c | 2014-10-30 00:52:36 +0000 | [diff] [blame] | 196 | |
| Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 197 | sub to_solr { |
| 198 | my $self = shift; |
| 199 | my @array = map { $_->to_solr(0) } @{$self->{mt}}; |
| 200 | $array[0]->{i} = 1; |
| 201 | return \@array; |
| 202 | }; |
| 203 | |
| Nils Diewald | 79a355c | 2014-10-30 00:52:36 +0000 | [diff] [blame] | 204 | |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 205 | 1; |
| Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 206 | |
| 207 | |
| 208 | __END__ |
| 209 | |
| 210 | [ |
| 211 | { |
| 212 | "e":128, |
| 213 | "i":22, |
| 214 | "p":"DQ4KDQsODg8=", |
| 215 | "s":123, |
| 216 | "t":"one", |
| 217 | "y":"word" |
| 218 | }, |
| 219 | { |
| 220 | "e":8, |
| 221 | "i":1, |
| 222 | "s":5, |
| 223 | "t":"two", |
| 224 | "y":"word" |
| 225 | }, |
| 226 | { |
| 227 | "e":22, |
| 228 | "i":1, |
| 229 | "s":20, |
| 230 | "t":"three", |
| 231 | "y":"foobar" |
| 232 | } |
| 233 | ] |
| 234 | |