blob: fb83c1a555047d1593b6565441af6a92408399c0 [file] [log] [blame]
Nils Diewald7364d1f2013-11-05 19:26:35 +00001package KorAP::Field::MultiTermToken;
2use KorAP::Field::MultiTerm;
Nils Diewald3cf08c72013-12-16 20:31:10 +00003use List::MoreUtils 'uniq';
Nils Diewalda5565f62014-10-30 23:20:58 +00004use strict;
5use warnings;
Nils Diewald2db9ad02013-10-29 19:26:43 +00006
Nils Diewalda5565f62014-10-30 23:20:58 +00007# This tries to be highly optimized - it's not supposed to be readable
8
9sub new {
10 bless [], shift;
11};
Nils Diewald2db9ad02013-10-29 19:26:43 +000012
13sub add {
14 my $self = shift;
15 my $mt;
16 unless (ref $_[0] eq 'MultiTerm') {
17 if (@_ == 1) {
Nils Diewalda5565f62014-10-30 23:20:58 +000018 $mt = KorAP::Field::MultiTerm->new(term => $_[0]);
Nils Diewald2db9ad02013-10-29 19:26:43 +000019 }
20 else {
Nils Diewald7364d1f2013-11-05 19:26:35 +000021 $mt = KorAP::Field::MultiTerm->new(@_);
Nils Diewald2db9ad02013-10-29 19:26:43 +000022 };
23 }
24 else {
Nils Diewalda5565f62014-10-30 23:20:58 +000025 $mt = $_[0];
Nils Diewald2db9ad02013-10-29 19:26:43 +000026 };
Nils Diewalda5565f62014-10-30 23:20:58 +000027 $self->[0] //= [];
28 push(@{$self->[0]}, $mt);
29 $mt;
Nils Diewald2db9ad02013-10-29 19:26:43 +000030};
31
Nils Diewalda5565f62014-10-30 23:20:58 +000032# 0 -> mt
33
34# 1
35sub o_start {
36 if (defined $_[1]) {
37 return $_[0]->[1] = $_[1];
38 };
39 $_[0]->[1];
40};
41
42# 2
43sub o_end {
44 if (defined $_[1]) {
45 return $_[0]->[2] = $_[1];
46 };
47 $_[0]->[2];
48};
49
50# 3: Return a new term id
Nils Diewald79a355c2014-10-30 00:52:36 +000051sub id_counter {
Nils Diewalda5565f62014-10-30 23:20:58 +000052 $_[0]->[3] //= 1;
53 return $_[0]->[3]++;
Nils Diewald79a355c2014-10-30 00:52:36 +000054};
55
Nils Diewaldf03c6802014-07-21 16:39:44 +000056sub surface {
Nils Diewalda5565f62014-10-30 23:20:58 +000057 substr($_[0]->[0]->[0]->term,2);
Nils Diewaldf03c6802014-07-21 16:39:44 +000058};
59
60sub lc_surface {
Nils Diewalda5565f62014-10-30 23:20:58 +000061 substr($_[0]->[0]->[1]->term,2);
Nils Diewaldf03c6802014-07-21 16:39:44 +000062};
63
Nils Diewalda5565f62014-10-30 23:20:58 +000064sub to_array {
65 my $self = shift;
66 [uniq(map($_->to_string, sort _sort @{$self->[0]}))];
67};
68
69
Nils Diewald2db9ad02013-10-29 19:26:43 +000070sub to_string {
71 my $self = shift;
72 my $string = '[(' . $self->o_start . '-'. $self->o_end . ')';
Nils Diewalda5565f62014-10-30 23:20:58 +000073 $string .= join ('|', @{$self->to_array});
Nils Diewald2db9ad02013-10-29 19:26:43 +000074 $string .= ']';
75 return $string;
76};
77
Nils Diewalda5565f62014-10-30 23:20:58 +000078# Get relation based positions
79sub _rel_right_pos {
80 # token to token - right token
81 if ($_[0] =~ m/^<i>(\d+)<s>/o) {
82 return ($1, $1);
83 }
84 # token/span to span - right token
85 elsif ($_[0] =~ m/^<i>(\d+)<i>(\d+)<s>/o) {
86 return ($1, $2);
87 }
88 # span to token - right token
89 elsif ($_[0] =~ m/^<b>\d+<i>(\d+)<s>/o) {
90 return ($1, $1);
91 };
92 warn 'Unknown relation format!';
93 return (0,0);
94};
Nils Diewaldff6d0782014-06-10 18:26:36 +000095
Nils Diewalda5565f62014-10-30 23:20:58 +000096# Sort spans, attributes and relations
97sub _sort {
98
99 # Both are no spans
100 if (index($a->[5], '<>:') != 0 && index($b->[5], '<>:') != 0) {
101
102 # Both are attributes
103 # Order attributes by reference id
104 if (index($a->[5], '@:') == 0 && index($b->[5], '@:') == 0) {
105 my ($a_id) = ($a->[0] =~ m/^<s>(\d+)/);
106 my ($b_id) = ($b->[0] =~ m/^<s>(\d+)/);
107 if ($a_id > $b_id) {
108 return 1;
109 }
110 elsif ($a_id < $b_id) {
111 return -1;
112 }
113 else {
114 return 1;
115 };
116 }
117
118 # Both are relations
119 elsif (
120 (index($a->[5],'<:') == 0 || index($a->[5],'>:') == 0) &&
121 (index($b->[5], '<:') == 0 || index($b->[5],'>:') == 0)) {
122 my $a_end = $a->[2] // 0;
123 my $b_end = $b->[2] // 0;
124
125 # left is p_end
126 if ($a_end < $b_end) {
127 return -1;
128 }
129 elsif ($a_end > $b_end) {
130 return 1;
131 }
132 else {
133 # Check for right positions
134 (my $a_start, $a_end) = _rel_right_pos($a->[0]);
135 (my $b_start, $b_end) = _rel_right_pos($b->[0]);
136 if ($a_start < $b_start) {
137 return -1;
138 }
139 elsif ($a_start > $b_start) {
140 return 1;
141 }
142 elsif ($a_end < $b_end) {
143 return -1;
144 }
145 elsif ($a_end > $b_end) {
146 return 1;
147 }
148 else {
149 return 1;
150 };
151 };
152 };
153
154 # This has to be sorted alphabetically!
155 return $a->[5] cmp $b->[5];
156 }
157
158 # Not identical
159 elsif (index($a->[5], '<>:') != 0) {
160 return $a->[5] cmp $b->[5];
161 }
162 # Not identical
163 elsif (index($b->[5], '<>:') != 0) {
164 return $a->[5] cmp $b->[5];
165 }
166
167 # Sort both spans
168 else {
169 if ($a->[2] < $b->[2]) {
170 return -1;
171 }
172 elsif ($a->[2] > $b->[2]) {
173 return 1;
174 }
175
176 # Check depth
177 else {
178 my ($a_depth) = ($a->[0] =~ m/^<b>(\d+)/);
179 my ($b_depth) = ($b->[0] =~ m/^<b>(\d+)/);
180
181 $a_depth //= 0;
182 $b_depth //= 0;
183 if ($a_depth < $b_depth) {
184 return -1;
185 }
186 elsif ($a_depth > $b_depth) {
187 return 1;
188 }
189 else {
190 return 1;
191 };
192 };
193 };
Nils Diewald7364d1f2013-11-05 19:26:35 +0000194};
195
Nils Diewald79a355c2014-10-30 00:52:36 +0000196
Nils Diewaldff6d0782014-06-10 18:26:36 +0000197sub to_solr {
198 my $self = shift;
199 my @array = map { $_->to_solr(0) } @{$self->{mt}};
200 $array[0]->{i} = 1;
201 return \@array;
202};
203
Nils Diewald79a355c2014-10-30 00:52:36 +0000204
Nils Diewald2db9ad02013-10-29 19:26:43 +00002051;
Nils Diewaldff6d0782014-06-10 18:26:36 +0000206
207
208__END__
209
210[
211 {
212 "e":128,
213 "i":22,
214 "p":"DQ4KDQsODg8=",
215 "s":123,
216 "t":"one",
217 "y":"word"
218 },
219 {
220 "e":8,
221 "i":1,
222 "s":5,
223 "t":"two",
224 "y":"word"
225 },
226 {
227 "e":22,
228 "i":1,
229 "s":20,
230 "t":"three",
231 "y":"foobar"
232 }
233 ]
234