blob: ecddc2aff60bb8015f678fe8c5132d7223224d7d [file] [log] [blame]
Eliza Margarethac8d0a7a2014-09-16 13:08:30 +00001package de.ids_mannheim.korap.query.spans;
2
3import java.io.IOException;
Eliza Margaretha57c9fc72014-09-16 16:44:00 +00004import java.nio.ByteBuffer;
Eliza Margarethac8d0a7a2014-09-16 13:08:30 +00005import java.util.ArrayList;
6import java.util.List;
7import java.util.Map;
8
Akron700c1eb2015-09-25 16:57:30 +02009import org.apache.lucene.index.LeafReaderContext;
Eliza Margarethac8d0a7a2014-09-16 13:08:30 +000010import org.apache.lucene.index.Term;
11import org.apache.lucene.index.TermContext;
12import org.apache.lucene.search.spans.Spans;
13import org.apache.lucene.util.Bits;
14
15import de.ids_mannheim.korap.query.SpanExpansionQuery;
16
Eliza Margarethadffd0592015-01-15 18:24:39 +000017/**
Nils Diewaldbb33da22015-03-04 16:24:25 +000018 * Enumeration of Spans expanded with minimum <code>m</code> and
19 * maximum
20 * <code>n</code> tokens, and throughout all the expansions do
21 * <em>not</em>
Eliza Margarethadffd0592015-01-15 18:24:39 +000022 * contain a specific Spans (notClause). See examples in
23 * {@link SpanExpansionQuery}.
Eliza Margaretha7dd87122014-09-16 15:34:46 +000024 *
Nils Diewaldbb33da22015-03-04 16:24:25 +000025 * The expansion direction is designated with the sign of a number,
26 * namely a
27 * negative number signifies the expansion to the <em>left</em> of the
28 * original
29 * span, and a positive number (including 0) signifies the expansion
30 * to the
Eliza Margarethadffd0592015-01-15 18:24:39 +000031 * <em>right</em> of the original span.
32 *
margaretha6cbe3712018-10-23 13:22:49 +020033 * The expansion offsets, namely the start and end positions of an
34 * expansion part, are stored in payloads. A class number is assigned
35 * to the offsets grouping them altogether.
Eliza Margarethaafe98122015-01-23 17:37:57 +000036 *
Eliza Margarethadffd0592015-01-15 18:24:39 +000037 * @author margaretha
Eliza Margaretha6f989202016-10-14 21:48:29 +020038 */
Eliza Margarethadffd0592015-01-15 18:24:39 +000039public class ExpandedExclusionSpans extends SimpleSpans {
Eliza Margarethac8d0a7a2014-09-16 13:08:30 +000040
Eliza Margarethadffd0592015-01-15 18:24:39 +000041 private int min, max;
42 private int direction;
43 private byte classNumber;
44 private List<CandidateSpan> candidateSpans;
45 private boolean hasMoreNotClause;
46 private Spans notClause;
Eliza Margarethac8d0a7a2014-09-16 13:08:30 +000047
Eliza Margarethadffd0592015-01-15 18:24:39 +000048 private long matchCost;
Eliza Margarethac8d0a7a2014-09-16 13:08:30 +000049
Eliza Margarethadffd0592015-01-15 18:24:39 +000050 /**
51 * Constructs ExpandedExclusionSpans from the given
52 * {@link SpanExpansionQuery}.
53 *
Nils Diewaldbb33da22015-03-04 16:24:25 +000054 * @param spanExpansionQuery
55 * a SpanExpansionQuery
Eliza Margarethadffd0592015-01-15 18:24:39 +000056 * @param context
57 * @param acceptDocs
58 * @param termContexts
59 * @throws IOException
60 */
Nils Diewaldbb33da22015-03-04 16:24:25 +000061 public ExpandedExclusionSpans (SpanExpansionQuery spanExpansionQuery,
Akron42993552016-02-04 13:24:24 +010062 LeafReaderContext context, Bits acceptDocs,
Nils Diewaldbb33da22015-03-04 16:24:25 +000063 Map<Term, TermContext> termContexts)
64 throws IOException {
Eliza Margarethadffd0592015-01-15 18:24:39 +000065 super(spanExpansionQuery, context, acceptDocs, termContexts);
66
67 if (spanExpansionQuery.getSecondClause() == null) {
Eliza Margaretha6f989202016-10-14 21:48:29 +020068 throw new IllegalArgumentException("The SpanExpansionQuery "
69 + "is not valid. The spanquery to exclude (notClause) cannot "
70 + "be null.");
Eliza Margarethadffd0592015-01-15 18:24:39 +000071 }
72
73 /*
74 * if (spanExpansionQuery.getMin() < 1){ throw new
75 * IllegalArgumentException("Min occurrence for notClause " +
76 * "must be at least 1."); }
77 */
78
79 this.min = spanExpansionQuery.getMin();
80 this.max = spanExpansionQuery.getMax();
81 this.direction = spanExpansionQuery.getDirection();
82 this.classNumber = spanExpansionQuery.getClassNumber();
83
84 this.notClause = secondSpans;
85 this.hasMoreNotClause = notClause.next();
86
87 candidateSpans = new ArrayList<CandidateSpan>();
88 hasMoreSpans = firstSpans.next();
89 }
90
91 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +000092 public boolean next () throws IOException {
Eliza Margarethadffd0592015-01-15 18:24:39 +000093 matchPayload.clear();
94 isStartEnumeration = false;
95 return advance();
96 }
97
98 /**
99 * Advances the ExpandedExclusionSpans to the next match.
100 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000101 * @return <code>true</code> if a match is found,
102 * <code>false</code>
Eliza Margarethadffd0592015-01-15 18:24:39 +0000103 * otherwise.
104 * @throws IOException
105 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000106 private boolean advance () throws IOException {
Eliza Margarethadffd0592015-01-15 18:24:39 +0000107 while (hasMoreSpans || candidateSpans.size() > 0) {
108 if (candidateSpans.size() > 0) {
109 // set a candidate span as a match
110 CandidateSpan cs = candidateSpans.get(0);
111 matchDocNumber = cs.getDoc();
112 matchStartPosition = cs.getStart();
113 matchEndPosition = cs.getEnd();
114 matchPayload = cs.getPayloads();
115 matchCost = cs.getCost() + notClause.cost();
116 candidateSpans.remove(0);
117 return true;
Nils Diewaldbb33da22015-03-04 16:24:25 +0000118 }
119 else if (!hasMoreNotClause || notClause.doc() > firstSpans.doc()) {
Eliza Margarethadffd0592015-01-15 18:24:39 +0000120 generateCandidates(min, max, direction);
121 hasMoreSpans = firstSpans.next();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000122 }
123 else
Eliza Margarethadffd0592015-01-15 18:24:39 +0000124 findMatches();
125 }
126 return false;
127 }
128
129 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000130 * Finds matches by expanding the firstspans either to the left or
131 * to the
Eliza Margarethadffd0592015-01-15 18:24:39 +0000132 * right.
133 *
134 * @throws IOException
135 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000136 private void findMatches () throws IOException {
Eliza Margarethadffd0592015-01-15 18:24:39 +0000137 while (hasMoreNotClause && notClause.doc() <= firstSpans.doc()) {
138 if (notClause.doc() == firstSpans.doc()) {
139 if (direction < 0) { // left
140 expandLeft();
141 } // right
142 else {
143 expandRight();
144 }
145 break;
Nils Diewaldbb33da22015-03-04 16:24:25 +0000146 }
margarethae43c5e52018-03-20 15:24:53 +0100147 else if (!notClause.next()) hasMoreNotClause = false;
Eliza Margarethadffd0592015-01-15 18:24:39 +0000148 }
149 }
150
151 /**
152 * Expands the firstspans to the left.
153 *
154 * @throws IOException
155 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000156 private void expandLeft () throws IOException {
margaretha6cbe3712018-10-23 13:22:49 +0200157 // int counter = max;
Eliza Margarethadffd0592015-01-15 18:24:39 +0000158 int maxPos = max;
159 CandidateSpan lastNotClause = null;
margaretha6cbe3712018-10-23 13:22:49 +0200160 while (hasMoreNotClause &&
161 notClause.doc() == firstSpans.doc() &&
162 notClause.start() < firstSpans.start()) {
Eliza Margarethadffd0592015-01-15 18:24:39 +0000163
164 // between max and firstspan.start()
165 if (notClause.start() >= firstSpans.start() - maxPos) {
166 maxPos = firstSpans.start() - notClause.start() - 1;
167 lastNotClause = new CandidateSpan(notClause);
margaretha6cbe3712018-10-23 13:22:49 +0200168 // counter--;
Eliza Margarethadffd0592015-01-15 18:24:39 +0000169 }
margarethae43c5e52018-03-20 15:24:53 +0100170 if (!notClause.next()) {
Eliza Margarethadffd0592015-01-15 18:24:39 +0000171 hasMoreNotClause = false;
margarethae43c5e52018-03-20 15:24:53 +0100172 }
Eliza Margarethadffd0592015-01-15 18:24:39 +0000173 }
174
margaretha6cbe3712018-10-23 13:22:49 +0200175 // if a notClause is between max and firstspan.start,
Eliza Margarethadffd0592015-01-15 18:24:39 +0000176 // then maxPos = last NotClause pos -1
177 generateCandidates(min, maxPos, direction);
178
margarethae43c5e52018-03-20 15:24:53 +0100179 if (lastNotClause != null && hasMoreNotClause)
Eliza Margarethadffd0592015-01-15 18:24:39 +0000180 while ((hasMoreSpans = firstSpans.next())
margaretha6cbe3712018-10-23 13:22:49 +0200181 // the next notClause is not in between max and
182 // firstspan.start()
Eliza Margarethadffd0592015-01-15 18:24:39 +0000183 && notClause.start() > firstSpans.start()
margaretha6cbe3712018-10-23 13:22:49 +0200184 // the last notClause is in between max and
185 // firstspan.start()
Eliza Margarethadffd0592015-01-15 18:24:39 +0000186 && lastNotClause.getStart() < firstSpans.start()
187 && lastNotClause.getStart() >= firstSpans.start() - max) {
188
189 maxPos = firstSpans.start() - lastNotClause.getStart() - 1;
190 generateCandidates(min, maxPos, direction);
191 }
margarethae43c5e52018-03-20 15:24:53 +0100192 else {
Eliza Margarethadffd0592015-01-15 18:24:39 +0000193 hasMoreSpans = firstSpans.next();
margarethae43c5e52018-03-20 15:24:53 +0100194 }
Eliza Margarethadffd0592015-01-15 18:24:39 +0000195 }
196
197 /**
198 * Expands the firstspans to the right.
199 *
200 * @throws IOException
201 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000202 private void expandRight () throws IOException {
Eliza Margarethadffd0592015-01-15 18:24:39 +0000203 int expansionEnd = firstSpans.end() + max;
204 int maxPos = max;
205 boolean isFound = false;
206
207 CandidateSpan firstNotClause = null;
margaretha6cbe3712018-10-23 13:22:49 +0200208 // System.out.println("main start:"+firstSpans.start());
Eliza Margarethadffd0592015-01-15 18:24:39 +0000209 while (hasMoreNotClause && notClause.start() < expansionEnd) {
210 // between firstspan.end() and expansionEnd
211 if (!isFound && notClause.start() >= firstSpans.end()) {
212 maxPos = notClause.start() - firstSpans.end() - 1;
213 firstNotClause = new CandidateSpan(notClause);
214 isFound = true;
215 }
margarethae43c5e52018-03-20 15:24:53 +0100216 if (!notClause.next()) {
Eliza Margarethadffd0592015-01-15 18:24:39 +0000217 hasMoreNotClause = false;
margarethae43c5e52018-03-20 15:24:53 +0100218 }
Eliza Margarethadffd0592015-01-15 18:24:39 +0000219 }
220 // if a notClause is between firstSpan.end and max
margaretha6cbe3712018-10-23 13:22:49 +0200221 // then maxPos = the first notClause pos -1
Eliza Margarethadffd0592015-01-15 18:24:39 +0000222 generateCandidates(min, maxPos, direction);
223
224 if (firstNotClause != null) {
225 while ((hasMoreSpans = firstSpans.next())
226 // in between
227 && firstNotClause.getStart() < firstSpans.end() + max
228 && firstNotClause.getStart() >= firstSpans.end()) {
margaretha6cbe3712018-10-23 13:22:49 +0200229 // System.out.println("first
230 // start:"+firstNotClause.getStart()+", main
231 // start:"+firstSpans.start());
Eliza Margarethadffd0592015-01-15 18:24:39 +0000232 maxPos = firstNotClause.getStart() - firstSpans.end() - 1;
233 generateCandidates(min, maxPos, direction);
234 }
Nils Diewaldbb33da22015-03-04 16:24:25 +0000235 }
margarethae43c5e52018-03-20 15:24:53 +0100236 else {
Eliza Margarethadffd0592015-01-15 18:24:39 +0000237 hasMoreSpans = firstSpans.next();
margarethae43c5e52018-03-20 15:24:53 +0100238 }
Eliza Margarethadffd0592015-01-15 18:24:39 +0000239 }
240
241 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000242 * Creates new candidate matches for the given direction, minimum
243 * and
Eliza Margarethadffd0592015-01-15 18:24:39 +0000244 * maximum positions.
245 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000246 * @param minPos
247 * minimum position
248 * @param maxPos
249 * maximum position
250 * @param direction
251 * the expansion direction
Eliza Margarethadffd0592015-01-15 18:24:39 +0000252 * @throws IOException
253 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000254 private void generateCandidates (int minPos, int maxPos, int direction)
Eliza Margarethadffd0592015-01-15 18:24:39 +0000255 throws IOException {
256 int counter;
257 int start, end;
258 CandidateSpan cs;
259 if (direction < 0) { // left
260 counter = maxPos;
261 while (counter >= min) {
262 start = Math.max(0, firstSpans.start() - counter);
263 if (start > -1) {
264 end = firstSpans.end();
margaretha6cbe3712018-10-23 13:22:49 +0200265 // System.out.println(start+","+end);
Eliza Margarethadffd0592015-01-15 18:24:39 +0000266 cs = new CandidateSpan(start, end, firstSpans.doc(),
Eliza Margaretha6f989202016-10-14 21:48:29 +0200267 firstSpans.cost(),
268 createPayloads(start, firstSpans.start()));
Eliza Margarethadffd0592015-01-15 18:24:39 +0000269 candidateSpans.add(cs);
270 }
271 counter--;
272 }
Nils Diewaldbb33da22015-03-04 16:24:25 +0000273 }
274 else { // right
Eliza Margarethadffd0592015-01-15 18:24:39 +0000275 counter = minPos;
276 while (counter <= maxPos) {
277 start = firstSpans.start();
278 end = firstSpans.end() + counter;
margaretha6cbe3712018-10-23 13:22:49 +0200279 // System.out.println(start+","+end);
Eliza Margarethadffd0592015-01-15 18:24:39 +0000280
281 cs = new CandidateSpan(start, end, firstSpans.doc(),
Eliza Margarethaafe98122015-01-23 17:37:57 +0000282 firstSpans.cost(),
283 createPayloads(firstSpans.end(), end));
Eliza Margarethadffd0592015-01-15 18:24:39 +0000284 candidateSpans.add(cs);
285 counter++;
286 }
287 }
288 }
289
290 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000291 * Creates payloads for a candiate match by copying the payloads
292 * of the
293 * firstspans, and adds expansion offsets with the given start and
294 * end
Eliza Margarethadffd0592015-01-15 18:24:39 +0000295 * positions to the payloads, if the class number is set.
296 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000297 * @param start
298 * the start offset
299 * @param end
300 * the end offset
Eliza Margarethadffd0592015-01-15 18:24:39 +0000301 * @return payloads
302 * @throws IOException
303 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000304 private ArrayList<byte[]> createPayloads (int start, int end)
Eliza Margarethadffd0592015-01-15 18:24:39 +0000305 throws IOException {
306
307 ArrayList<byte[]> payload = new ArrayList<byte[]>();
308
309 if (firstSpans.isPayloadAvailable()) {
310 payload.addAll(firstSpans.getPayload());
311 }
312 if (classNumber > 0) {
margaretha6cbe3712018-10-23 13:22:49 +0200313 // System.out.println("Extension offsets "+start+","+end);
Eliza Margarethadffd0592015-01-15 18:24:39 +0000314 payload.add(createExtensionPayloads(start, end));
315 }
316 return payload;
317 }
318
319 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000320 * Generates a byte array of extension offsets and class number to
321 * be added
Eliza Margarethadffd0592015-01-15 18:24:39 +0000322 * into the payloads.
323 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000324 * @param start
325 * the start offset
326 * @param end
327 * the end offset
Eliza Margarethadffd0592015-01-15 18:24:39 +0000328 * @return a byte array of extension offsets and class number
329 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000330 private byte[] createExtensionPayloads (int start, int end) {
Akron42993552016-02-04 13:24:24 +0100331 ByteBuffer buffer = ByteBuffer.allocate(10);
332 Byte classPTI = 0;
333 buffer.put(classPTI);
Eliza Margarethadffd0592015-01-15 18:24:39 +0000334 buffer.putInt(start);
335 buffer.putInt(end);
336 buffer.put(classNumber);
337 return buffer.array();
338 }
339
340 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +0000341 public boolean skipTo (int target) throws IOException {
Eliza Margarethadffd0592015-01-15 18:24:39 +0000342 if (hasMoreSpans && (firstSpans.doc() < target)) {
343 if (!firstSpans.skipTo(target)) {
344 hasMoreSpans = false;
345 return false;
346 }
347 }
348 matchPayload.clear();
349 return advance();
350 }
351
352 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +0000353 public long cost () {
Eliza Margarethadffd0592015-01-15 18:24:39 +0000354 return matchCost;
355 }
Eliza Margarethac8d0a7a2014-09-16 13:08:30 +0000356}