blob: 52c3d2085d1333f83cf05f8f12bfba7b23d0f4d9 [file] [log] [blame]
Akrone4c2e412016-01-28 15:10:50 +01001package KorAP::XML::Field::MultiTermToken;
2use KorAP::XML::Field::MultiTerm;
Nils Diewald3cf08c72013-12-16 20:31:10 +00003use List::MoreUtils 'uniq';
Nils Diewald1448c262015-10-01 17:25:33 +00004use Carp qw/carp croak/;
Nils Diewald6d565072014-10-30 23:20:58 +00005use strict;
6use warnings;
Nils Diewald2db9ad02013-10-29 19:26:43 +00007
Nils Diewald6d565072014-10-30 23:20:58 +00008# This tries to be highly optimized - it's not supposed to be readable
9
10sub new {
11 bless [], shift;
12};
Nils Diewald2db9ad02013-10-29 19:26:43 +000013
Nils Diewald1448c262015-10-01 17:25:33 +000014
Nils Diewald2db9ad02013-10-29 19:26:43 +000015sub add {
16 my $self = shift;
17 my $mt;
18 unless (ref $_[0] eq 'MultiTerm') {
19 if (@_ == 1) {
Akrone4c2e412016-01-28 15:10:50 +010020 $mt = KorAP::XML::Field::MultiTerm->new(term => $_[0]);
Nils Diewald2db9ad02013-10-29 19:26:43 +000021 }
22 else {
Akrone4c2e412016-01-28 15:10:50 +010023 $mt = KorAP::XML::Field::MultiTerm->new(@_);
Nils Diewald2db9ad02013-10-29 19:26:43 +000024 };
25 }
26 else {
Nils Diewald6d565072014-10-30 23:20:58 +000027 $mt = $_[0];
Nils Diewald2db9ad02013-10-29 19:26:43 +000028 };
Nils Diewald6d565072014-10-30 23:20:58 +000029 $self->[0] //= [];
30 push(@{$self->[0]}, $mt);
31 $mt;
Nils Diewald2db9ad02013-10-29 19:26:43 +000032};
33
Nils Diewald6d565072014-10-30 23:20:58 +000034# 0 -> mt
35
36# 1
37sub o_start {
38 if (defined $_[1]) {
39 return $_[0]->[1] = $_[1];
40 };
41 $_[0]->[1];
42};
43
44# 2
45sub o_end {
46 if (defined $_[1]) {
47 return $_[0]->[2] = $_[1];
48 };
49 $_[0]->[2];
50};
51
52# 3: Return a new term id
Nils Diewald32e30f02014-10-30 00:52:36 +000053sub id_counter {
Nils Diewald6d565072014-10-30 23:20:58 +000054 $_[0]->[3] //= 1;
55 return $_[0]->[3]++;
Nils Diewald32e30f02014-10-30 00:52:36 +000056};
57
Nils Diewaldf03c6802014-07-21 16:39:44 +000058sub surface {
Nils Diewald6d565072014-10-30 23:20:58 +000059 substr($_[0]->[0]->[0]->term,2);
Nils Diewaldf03c6802014-07-21 16:39:44 +000060};
61
62sub lc_surface {
Nils Diewald6d565072014-10-30 23:20:58 +000063 substr($_[0]->[0]->[1]->term,2);
Nils Diewaldf03c6802014-07-21 16:39:44 +000064};
65
Nils Diewald6d565072014-10-30 23:20:58 +000066sub to_array {
67 my $self = shift;
68 [uniq(map($_->to_string, sort _sort @{$self->[0]}))];
69};
70
Akron14ca9f02016-01-29 19:38:18 +010071# Get multiterm based on term content (treat as prefix)
72# TODO: This currently only works for simple terms!
73sub grep_mt {
74 my $self = shift;
75 my $term = shift;
76 foreach (@{$self->[0]}) {
77 return $_ if index($_->term, $term) == 0;
78 };
79 return;
80};
Nils Diewald6d565072014-10-30 23:20:58 +000081
Nils Diewald2db9ad02013-10-29 19:26:43 +000082sub to_string {
83 my $self = shift;
84 my $string = '[(' . $self->o_start . '-'. $self->o_end . ')';
Nils Diewald6d565072014-10-30 23:20:58 +000085 $string .= join ('|', @{$self->to_array});
Nils Diewald2db9ad02013-10-29 19:26:43 +000086 $string .= ']';
87 return $string;
88};
89
Nils Diewald6d565072014-10-30 23:20:58 +000090# Get relation based positions
Akron14ca9f02016-01-29 19:38:18 +010091# TODO: Fix!
Nils Diewald6d565072014-10-30 23:20:58 +000092sub _rel_right_pos {
Nils Diewald1448c262015-10-01 17:25:33 +000093
94 # There are relation ids!
95
Nils Diewald6d565072014-10-30 23:20:58 +000096 # token to token - right token
97 if ($_[0] =~ m/^<i>(\d+)<s>/o) {
98 return ($1, $1);
99 }
Nils Diewald1448c262015-10-01 17:25:33 +0000100
Nils Diewald6d565072014-10-30 23:20:58 +0000101 # token/span to span - right token
102 elsif ($_[0] =~ m/^<i>(\d+)<i>(\d+)<s>/o) {
103 return ($1, $2);
104 }
Nils Diewald1448c262015-10-01 17:25:33 +0000105
Nils Diewald6d565072014-10-30 23:20:58 +0000106 # span to token - right token
107 elsif ($_[0] =~ m/^<b>\d+<i>(\d+)<s>/o) {
108 return ($1, $1);
109 };
Nils Diewald1448c262015-10-01 17:25:33 +0000110 carp 'Unknown relation format!';
Nils Diewald6d565072014-10-30 23:20:58 +0000111 return (0,0);
112};
Nils Diewaldff6d0782014-06-10 18:26:36 +0000113
Nils Diewald6d565072014-10-30 23:20:58 +0000114# Sort spans, attributes and relations
115sub _sort {
116
117 # Both are no spans
118 if (index($a->[5], '<>:') != 0 && index($b->[5], '<>:') != 0) {
119
120 # Both are attributes
121 # Order attributes by reference id
122 if (index($a->[5], '@:') == 0 && index($b->[5], '@:') == 0) {
Akron126e33c2016-01-07 21:08:45 +0100123
Akron31d788e2016-02-05 20:49:03 +0100124 # Check TUI
Nils Diewald6d565072014-10-30 23:20:58 +0000125 my ($a_id) = ($a->[0] =~ m/^<s>(\d+)/);
126 my ($b_id) = ($b->[0] =~ m/^<s>(\d+)/);
127 if ($a_id > $b_id) {
128 return 1;
129 }
130 elsif ($a_id < $b_id) {
131 return -1;
132 }
133 else {
134 return 1;
135 };
136 }
137
138 # Both are relations
139 elsif (
140 (index($a->[5],'<:') == 0 || index($a->[5],'>:') == 0) &&
141 (index($b->[5], '<:') == 0 || index($b->[5],'>:') == 0)) {
142 my $a_end = $a->[2] // 0;
143 my $b_end = $b->[2] // 0;
144
145 # left is p_end
146 if ($a_end < $b_end) {
147 return -1;
148 }
149 elsif ($a_end > $b_end) {
150 return 1;
151 }
152 else {
153 # Check for right positions
154 (my $a_start, $a_end) = _rel_right_pos($a->[0]);
155 (my $b_start, $b_end) = _rel_right_pos($b->[0]);
156 if ($a_start < $b_start) {
157 return -1;
158 }
159 elsif ($a_start > $b_start) {
160 return 1;
161 }
162 elsif ($a_end < $b_end) {
163 return -1;
164 }
165 elsif ($a_end > $b_end) {
166 return 1;
167 }
168 else {
169 return 1;
170 };
171 };
172 };
173
174 # This has to be sorted alphabetically!
175 return $a->[5] cmp $b->[5];
176 }
177
178 # Not identical
179 elsif (index($a->[5], '<>:') != 0) {
180 return $a->[5] cmp $b->[5];
181 }
182 # Not identical
183 elsif (index($b->[5], '<>:') != 0) {
184 return $a->[5] cmp $b->[5];
185 }
186
187 # Sort both spans
188 else {
189 if ($a->[2] < $b->[2]) {
190 return -1;
191 }
192 elsif ($a->[2] > $b->[2]) {
193 return 1;
194 }
195
196 # Check depth
197 else {
Akron31d788e2016-02-05 20:49:03 +0100198 my ($a_depth) = ($a->[0] ? $a->[0] =~ m/<b>(\d+)(?:<s>\d+)?$/ : 0);
199 my ($b_depth) = ($b->[0] ? $b->[0] =~ m/<b>(\d+)(?:<s>\d+)?$/ : 0);
Nils Diewald6d565072014-10-30 23:20:58 +0000200
201 $a_depth //= 0;
202 $b_depth //= 0;
203 if ($a_depth < $b_depth) {
204 return -1;
205 }
206 elsif ($a_depth > $b_depth) {
207 return 1;
208 }
209 else {
210 return 1;
211 };
212 };
213 };
Nils Diewald7364d1f2013-11-05 19:26:35 +0000214};
215
Nils Diewald32e30f02014-10-30 00:52:36 +0000216
Nils Diewaldff6d0782014-06-10 18:26:36 +0000217sub to_solr {
218 my $self = shift;
219 my @array = map { $_->to_solr(0) } @{$self->{mt}};
220 $array[0]->{i} = 1;
221 return \@array;
222};
223
Nils Diewald32e30f02014-10-30 00:52:36 +0000224
Nils Diewald2db9ad02013-10-29 19:26:43 +00002251;
Nils Diewaldff6d0782014-06-10 18:26:36 +0000226
227
228__END__
229
230[
231 {
232 "e":128,
233 "i":22,
234 "p":"DQ4KDQsODg8=",
235 "s":123,
236 "t":"one",
237 "y":"word"
238 },
239 {
240 "e":8,
241 "i":1,
242 "s":5,
243 "t":"two",
244 "y":"word"
245 },
246 {
247 "e":22,
248 "i":1,
249 "s":20,
250 "t":"three",
251 "y":"foobar"
252 }
253 ]
254