| Akron | e4c2e41 | 2016-01-28 15:10:50 +0100 | [diff] [blame] | 1 | package KorAP::XML::Field::MultiTermToken; |
| 2 | use KorAP::XML::Field::MultiTerm; |
| Nils Diewald | 3cf08c7 | 2013-12-16 20:31:10 +0000 | [diff] [blame] | 3 | use List::MoreUtils 'uniq'; |
| Nils Diewald | 1448c26 | 2015-10-01 17:25:33 +0000 | [diff] [blame] | 4 | use Carp qw/carp croak/; |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 5 | use strict; |
| 6 | use warnings; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 7 | |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 8 | # This tries to be highly optimized - it's not supposed to be readable |
| 9 | |
| 10 | sub new { |
| 11 | bless [], shift; |
| 12 | }; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 13 | |
| Nils Diewald | 1448c26 | 2015-10-01 17:25:33 +0000 | [diff] [blame] | 14 | |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 15 | sub add { |
| 16 | my $self = shift; |
| 17 | my $mt; |
| 18 | unless (ref $_[0] eq 'MultiTerm') { |
| 19 | if (@_ == 1) { |
| Akron | e4c2e41 | 2016-01-28 15:10:50 +0100 | [diff] [blame] | 20 | $mt = KorAP::XML::Field::MultiTerm->new(term => $_[0]); |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 21 | } |
| 22 | else { |
| Akron | e4c2e41 | 2016-01-28 15:10:50 +0100 | [diff] [blame] | 23 | $mt = KorAP::XML::Field::MultiTerm->new(@_); |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 24 | }; |
| 25 | } |
| 26 | else { |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 27 | $mt = $_[0]; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 28 | }; |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 29 | $self->[0] //= []; |
| 30 | push(@{$self->[0]}, $mt); |
| 31 | $mt; |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 32 | }; |
| 33 | |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 34 | # 0 -> mt |
| 35 | |
| 36 | # 1 |
| 37 | sub o_start { |
| 38 | if (defined $_[1]) { |
| 39 | return $_[0]->[1] = $_[1]; |
| 40 | }; |
| 41 | $_[0]->[1]; |
| 42 | }; |
| 43 | |
| 44 | # 2 |
| 45 | sub o_end { |
| 46 | if (defined $_[1]) { |
| 47 | return $_[0]->[2] = $_[1]; |
| 48 | }; |
| 49 | $_[0]->[2]; |
| 50 | }; |
| 51 | |
| 52 | # 3: Return a new term id |
| Nils Diewald | 32e30f0 | 2014-10-30 00:52:36 +0000 | [diff] [blame] | 53 | sub id_counter { |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 54 | $_[0]->[3] //= 1; |
| 55 | return $_[0]->[3]++; |
| Nils Diewald | 32e30f0 | 2014-10-30 00:52:36 +0000 | [diff] [blame] | 56 | }; |
| 57 | |
| Nils Diewald | f03c680 | 2014-07-21 16:39:44 +0000 | [diff] [blame] | 58 | sub surface { |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 59 | substr($_[0]->[0]->[0]->term,2); |
| Nils Diewald | f03c680 | 2014-07-21 16:39:44 +0000 | [diff] [blame] | 60 | }; |
| 61 | |
| 62 | sub lc_surface { |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 63 | substr($_[0]->[0]->[1]->term,2); |
| Nils Diewald | f03c680 | 2014-07-21 16:39:44 +0000 | [diff] [blame] | 64 | }; |
| 65 | |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 66 | sub to_array { |
| 67 | my $self = shift; |
| 68 | [uniq(map($_->to_string, sort _sort @{$self->[0]}))]; |
| 69 | }; |
| 70 | |
| Akron | 14ca9f0 | 2016-01-29 19:38:18 +0100 | [diff] [blame] | 71 | # Get multiterm based on term content (treat as prefix) |
| 72 | # TODO: This currently only works for simple terms! |
| 73 | sub grep_mt { |
| 74 | my $self = shift; |
| 75 | my $term = shift; |
| 76 | foreach (@{$self->[0]}) { |
| 77 | return $_ if index($_->term, $term) == 0; |
| 78 | }; |
| 79 | return; |
| 80 | }; |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 81 | |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 82 | sub to_string { |
| 83 | my $self = shift; |
| 84 | my $string = '[(' . $self->o_start . '-'. $self->o_end . ')'; |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 85 | $string .= join ('|', @{$self->to_array}); |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 86 | $string .= ']'; |
| 87 | return $string; |
| 88 | }; |
| 89 | |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 90 | # Get relation based positions |
| Akron | 14ca9f0 | 2016-01-29 19:38:18 +0100 | [diff] [blame] | 91 | # TODO: Fix! |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 92 | sub _rel_right_pos { |
| Nils Diewald | 1448c26 | 2015-10-01 17:25:33 +0000 | [diff] [blame] | 93 | |
| 94 | # There are relation ids! |
| 95 | |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 96 | # token to token - right token |
| 97 | if ($_[0] =~ m/^<i>(\d+)<s>/o) { |
| 98 | return ($1, $1); |
| 99 | } |
| Nils Diewald | 1448c26 | 2015-10-01 17:25:33 +0000 | [diff] [blame] | 100 | |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 101 | # token/span to span - right token |
| 102 | elsif ($_[0] =~ m/^<i>(\d+)<i>(\d+)<s>/o) { |
| 103 | return ($1, $2); |
| 104 | } |
| Nils Diewald | 1448c26 | 2015-10-01 17:25:33 +0000 | [diff] [blame] | 105 | |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 106 | # span to token - right token |
| 107 | elsif ($_[0] =~ m/^<b>\d+<i>(\d+)<s>/o) { |
| 108 | return ($1, $1); |
| 109 | }; |
| Nils Diewald | 1448c26 | 2015-10-01 17:25:33 +0000 | [diff] [blame] | 110 | carp 'Unknown relation format!'; |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 111 | return (0,0); |
| 112 | }; |
| Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 113 | |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 114 | # Sort spans, attributes and relations |
| 115 | sub _sort { |
| 116 | |
| 117 | # Both are no spans |
| 118 | if (index($a->[5], '<>:') != 0 && index($b->[5], '<>:') != 0) { |
| 119 | |
| 120 | # Both are attributes |
| 121 | # Order attributes by reference id |
| 122 | if (index($a->[5], '@:') == 0 && index($b->[5], '@:') == 0) { |
| Akron | 126e33c | 2016-01-07 21:08:45 +0100 | [diff] [blame] | 123 | |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 124 | # Check TUI |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 125 | my ($a_id) = ($a->[0] =~ m/^<s>(\d+)/); |
| 126 | my ($b_id) = ($b->[0] =~ m/^<s>(\d+)/); |
| 127 | if ($a_id > $b_id) { |
| 128 | return 1; |
| 129 | } |
| 130 | elsif ($a_id < $b_id) { |
| 131 | return -1; |
| 132 | } |
| 133 | else { |
| 134 | return 1; |
| 135 | }; |
| 136 | } |
| 137 | |
| 138 | # Both are relations |
| 139 | elsif ( |
| 140 | (index($a->[5],'<:') == 0 || index($a->[5],'>:') == 0) && |
| 141 | (index($b->[5], '<:') == 0 || index($b->[5],'>:') == 0)) { |
| 142 | my $a_end = $a->[2] // 0; |
| 143 | my $b_end = $b->[2] // 0; |
| 144 | |
| 145 | # left is p_end |
| 146 | if ($a_end < $b_end) { |
| 147 | return -1; |
| 148 | } |
| 149 | elsif ($a_end > $b_end) { |
| 150 | return 1; |
| 151 | } |
| 152 | else { |
| 153 | # Check for right positions |
| 154 | (my $a_start, $a_end) = _rel_right_pos($a->[0]); |
| 155 | (my $b_start, $b_end) = _rel_right_pos($b->[0]); |
| 156 | if ($a_start < $b_start) { |
| 157 | return -1; |
| 158 | } |
| 159 | elsif ($a_start > $b_start) { |
| 160 | return 1; |
| 161 | } |
| 162 | elsif ($a_end < $b_end) { |
| 163 | return -1; |
| 164 | } |
| 165 | elsif ($a_end > $b_end) { |
| 166 | return 1; |
| 167 | } |
| 168 | else { |
| 169 | return 1; |
| 170 | }; |
| 171 | }; |
| 172 | }; |
| 173 | |
| 174 | # This has to be sorted alphabetically! |
| 175 | return $a->[5] cmp $b->[5]; |
| 176 | } |
| 177 | |
| 178 | # Not identical |
| 179 | elsif (index($a->[5], '<>:') != 0) { |
| 180 | return $a->[5] cmp $b->[5]; |
| 181 | } |
| 182 | # Not identical |
| 183 | elsif (index($b->[5], '<>:') != 0) { |
| 184 | return $a->[5] cmp $b->[5]; |
| 185 | } |
| 186 | |
| 187 | # Sort both spans |
| 188 | else { |
| 189 | if ($a->[2] < $b->[2]) { |
| 190 | return -1; |
| 191 | } |
| 192 | elsif ($a->[2] > $b->[2]) { |
| 193 | return 1; |
| 194 | } |
| 195 | |
| 196 | # Check depth |
| 197 | else { |
| Akron | 31d788e | 2016-02-05 20:49:03 +0100 | [diff] [blame] | 198 | my ($a_depth) = ($a->[0] ? $a->[0] =~ m/<b>(\d+)(?:<s>\d+)?$/ : 0); |
| 199 | my ($b_depth) = ($b->[0] ? $b->[0] =~ m/<b>(\d+)(?:<s>\d+)?$/ : 0); |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 200 | |
| 201 | $a_depth //= 0; |
| 202 | $b_depth //= 0; |
| 203 | if ($a_depth < $b_depth) { |
| 204 | return -1; |
| 205 | } |
| 206 | elsif ($a_depth > $b_depth) { |
| 207 | return 1; |
| 208 | } |
| 209 | else { |
| Akron | ee443f9 | 2016-02-25 23:56:49 +0100 | [diff] [blame^] | 210 | return $a->[5] cmp $b->[5]; |
| Nils Diewald | 6d56507 | 2014-10-30 23:20:58 +0000 | [diff] [blame] | 211 | }; |
| 212 | }; |
| 213 | }; |
| Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 214 | }; |
| 215 | |
| Nils Diewald | 32e30f0 | 2014-10-30 00:52:36 +0000 | [diff] [blame] | 216 | |
| Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 217 | sub to_solr { |
| 218 | my $self = shift; |
| 219 | my @array = map { $_->to_solr(0) } @{$self->{mt}}; |
| 220 | $array[0]->{i} = 1; |
| 221 | return \@array; |
| 222 | }; |
| 223 | |
| Nils Diewald | 32e30f0 | 2014-10-30 00:52:36 +0000 | [diff] [blame] | 224 | |
| Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 225 | 1; |
| Nils Diewald | ff6d078 | 2014-06-10 18:26:36 +0000 | [diff] [blame] | 226 | |
| 227 | |
| 228 | __END__ |
| 229 | |
| 230 | [ |
| 231 | { |
| 232 | "e":128, |
| 233 | "i":22, |
| 234 | "p":"DQ4KDQsODg8=", |
| 235 | "s":123, |
| 236 | "t":"one", |
| 237 | "y":"word" |
| 238 | }, |
| 239 | { |
| 240 | "e":8, |
| 241 | "i":1, |
| 242 | "s":5, |
| 243 | "t":"two", |
| 244 | "y":"word" |
| 245 | }, |
| 246 | { |
| 247 | "e":22, |
| 248 | "i":1, |
| 249 | "s":20, |
| 250 | "t":"three", |
| 251 | "y":"foobar" |
| 252 | } |
| 253 | ] |
| 254 | |