blob: 78c0e760e35ced023665a58d6ea8dded2237cbfd [file] [log] [blame]
Eliza Margarethae335beb2014-02-27 12:56:14 +00001package de.ids_mannheim.korap.query.spans;
2
3import java.io.IOException;
4import java.util.ArrayList;
5import java.util.Iterator;
6import java.util.List;
7import java.util.Map;
8
Akron700c1eb2015-09-25 16:57:30 +02009import org.apache.lucene.index.LeafReaderContext;
Eliza Margarethae335beb2014-02-27 12:56:14 +000010import org.apache.lucene.index.Term;
11import org.apache.lucene.index.TermContext;
12import org.apache.lucene.search.spans.Spans;
13import org.apache.lucene.util.Bits;
14
15import de.ids_mannheim.korap.query.SpanDistanceQuery;
16
Eliza Margarethac8d59202014-12-16 16:21:16 +000017/**
Nils Diewaldbb33da22015-03-04 16:24:25 +000018 * Span enumeration of spans (firstSpans) which do <em>not</em> occur
19 * together
20 * with other spans (secondSpans) on the right side, within a range of
21 * an
22 * element-based distance (i.e. a sentence or a paragraph as the
23 * distance unit).
24 * If the query requires that the spans are ordered, then the
25 * firstSpans must
26 * occur before the secondSpans. In this class, firstSpans are also
27 * referred to
Eliza Margarethac8d59202014-12-16 16:21:16 +000028 * as target spans and second spans as candidate spans.<br/>
29 * <br/>
30 * Note: The element distance unit does not overlap to each other.
Eliza Margarethae335beb2014-02-27 12:56:14 +000031 *
Eliza Margarethac8d59202014-12-16 16:21:16 +000032 * @author margaretha
Eliza Margarethae335beb2014-02-27 12:56:14 +000033 * */
Eliza Margarethac8d59202014-12-16 16:21:16 +000034public class ElementDistanceExclusionSpans extends DistanceSpans {
Eliza Margarethae335beb2014-02-27 12:56:14 +000035
Eliza Margarethac8d59202014-12-16 16:21:16 +000036 private Spans elements;
37 private boolean hasMoreElements;
38 private int elementPosition;
Nils Diewald34eaa862014-06-03 10:56:27 +000039
Eliza Margarethac8d59202014-12-16 16:21:16 +000040 private boolean isOrdered;
41 private boolean hasMoreSecondSpans;
Nils Diewald34eaa862014-06-03 10:56:27 +000042
Eliza Margarethac8d59202014-12-16 16:21:16 +000043 // other first spans occurred between the current target and the second
44 // spans
45 protected List<CandidateSpan> targetList;
46 // secondSpans occurring near the firstSpans
47 protected List<CandidateSpan> candidateList;
48 private int currentDocNum;
Eliza Margarethae335beb2014-02-27 12:56:14 +000049
Eliza Margarethac8d59202014-12-16 16:21:16 +000050 private int minDistance, maxDistance;
51 private int firstSpanPostion;
Eliza Margarethae335beb2014-02-27 12:56:14 +000052
Nils Diewaldbb33da22015-03-04 16:24:25 +000053
Eliza Margarethac8d59202014-12-16 16:21:16 +000054 /**
55 * Constructs ElementDistanceExclusionSpans from the specified
56 * {@link SpanDistanceQuery}.
57 *
Nils Diewaldbb33da22015-03-04 16:24:25 +000058 * @param query
59 * a SpanDistanceQuery
Eliza Margarethac8d59202014-12-16 16:21:16 +000060 * @param context
61 * @param acceptDocs
62 * @param termContexts
63 * @throws IOException
64 */
Nils Diewaldbb33da22015-03-04 16:24:25 +000065 public ElementDistanceExclusionSpans (SpanDistanceQuery query,
Akron700c1eb2015-09-25 16:57:30 +020066 LeafReaderContext context,
Nils Diewaldbb33da22015-03-04 16:24:25 +000067 Bits acceptDocs,
68 Map<Term, TermContext> termContexts)
69 throws IOException {
Eliza Margarethac8d59202014-12-16 16:21:16 +000070 super(query, context, acceptDocs, termContexts);
Nils Diewald34eaa862014-06-03 10:56:27 +000071
Eliza Margarethac8d59202014-12-16 16:21:16 +000072 elements = query.getElementQuery().getSpans(context, acceptDocs,
73 termContexts);
74 hasMoreElements = elements.next();
75 hasMoreSpans = firstSpans.next() && hasMoreElements;
76 hasMoreSecondSpans = secondSpans.next();
Eliza Margarethae335beb2014-02-27 12:56:14 +000077
Eliza Margarethac8d59202014-12-16 16:21:16 +000078 elementPosition = 0;
79 this.isOrdered = query.isOrdered();
80 candidateList = new ArrayList<CandidateSpan>();
81 targetList = new ArrayList<CandidateSpan>();
82 currentDocNum = firstSpans.doc();
Eliza Margarethae335beb2014-02-27 12:56:14 +000083
Eliza Margarethac8d59202014-12-16 16:21:16 +000084 minDistance = query.getMinDistance();
85 maxDistance = query.getMaxDistance();
86 }
87
Nils Diewaldbb33da22015-03-04 16:24:25 +000088
Eliza Margarethac8d59202014-12-16 16:21:16 +000089 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +000090 protected boolean advance () throws IOException {
Eliza Margarethac8d59202014-12-16 16:21:16 +000091 while (!targetList.isEmpty()
92 || (hasMoreSpans && ensureSameDoc(firstSpans, elements))) {
93 if (!targetList.isEmpty()) {
94 if (isFirstTargetValid())
95 return true;
96 else
97 continue;
98 }
99 if (findMatch())
100 return true;
101 }
102 return false;
103 }
104
Nils Diewaldbb33da22015-03-04 16:24:25 +0000105
Eliza Margarethac8d59202014-12-16 16:21:16 +0000106 /**
107 * Tells if the first target from the target list is a match.
108 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000109 * @return <code>true</code> if the first target from the target
110 * list is a
Eliza Margarethac8d59202014-12-16 16:21:16 +0000111 * match, <code>false</code> otherwise.
112 * @throws IOException
113 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000114 private boolean isFirstTargetValid () throws IOException {
Eliza Margarethac8d59202014-12-16 16:21:16 +0000115 CandidateSpan target = targetList.get(0);
116 targetList.remove(0);
117 firstSpanPostion = target.getPosition();
118 filterCandidateList(firstSpanPostion);
119 collectRightCandidates();
120
121 if (isWithinDistance()) {
122 return false;
123 }
124 setMatchProperties(target);
125 return true;
126 }
127
Nils Diewaldbb33da22015-03-04 16:24:25 +0000128
Eliza Margarethac8d59202014-12-16 16:21:16 +0000129 /**
130 * Validate if the current firstSpan is a match.
131 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000132 * @return <code>true</code> if a match is found,
133 * <code>false</code>
Eliza Margarethac8d59202014-12-16 16:21:16 +0000134 * otherwise.
135 * @throws IOException
136 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000137 private boolean findMatch () throws IOException {
Eliza Margarethac8d59202014-12-16 16:21:16 +0000138 if (firstSpans.doc() != currentDocNum) {
139 currentDocNum = firstSpans.doc();
140 candidateList.clear();
141 }
142
143 if (hasMoreSecondSpans) {
144 if (secondSpans.doc() == firstSpans.doc()) {
145 return (isFirstSpanValid() ? true : false);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000146 }
147 else if (secondSpans.doc() < firstSpans.doc()) {
Eliza Margarethac8d59202014-12-16 16:21:16 +0000148 hasMoreSecondSpans = secondSpans.skipTo(firstSpans.doc());
149 return false;
150 }
151 }
152
153 // return (isFirstSpanValid() ? true : false);
154
155 if (candidateList.isEmpty()) {
156 if (isFirstSpanInElement()) {
157 setMatchProperties(new CandidateSpan(firstSpans,
158 elementPosition));
159 hasMoreSpans = firstSpans.next();
160 return true;
161 }
162 hasMoreSpans = firstSpans.next();
163 return false;
164 }
165 return (isFirstSpanValid() ? true : false);
166 }
167
Nils Diewaldbb33da22015-03-04 16:24:25 +0000168
Eliza Margarethac8d59202014-12-16 16:21:16 +0000169 /**
170 * Tells if the current firstSpan is a match.
171 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000172 * @return <code>true</code> if a match is found,
173 * <code>false</code>
Eliza Margarethac8d59202014-12-16 16:21:16 +0000174 * otherwise.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000175 * @throws IOException
176 * <pre>
177 * private boolean isFirstSpanValid() throws
178 * IOException {
179 * if (candidateList.isEmpty()) {
180 * if (isFirstSpanInElement()) {
181 * setMatchProperties(new CandidateSpan(firstSpans,
182 * elementPosition));
Eliza Margarethac8d59202014-12-16 16:21:16 +0000183 * hasMoreSpans = firstSpans.next();
184 * return true;
Nils Diewaldbb33da22015-03-04 16:24:25 +0000185 * }
186 * hasMoreSpans = firstSpans.next();
187 * return false;
188 * }
189 * return (findMatch() ? true : false);
190 * }
191 * </pre>
Eliza Margarethac8d59202014-12-16 16:21:16 +0000192 */
193
194 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000195 * Tells if the given span is in an element distance unit, or not,
196 * by
Eliza Margarethac8d59202014-12-16 16:21:16 +0000197 * advancing the element distance unit to the span position.
198 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000199 * @param span
200 * a span
201 * @return <code>true</code> if the element distance unit can be
202 * advanced to
Eliza Margarethac8d59202014-12-16 16:21:16 +0000203 * contain the given span, <code>false</code> otherwise.
204 * @throws IOException
205 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000206 private boolean advanceElementTo (Spans span) throws IOException {
Eliza Margarethac8d59202014-12-16 16:21:16 +0000207 while (hasMoreElements && elements.doc() == currentDocNum
208 && elements.start() < span.end()) {
209
210 if (span.start() >= elements.start()
211 && span.end() <= elements.end()) {
212 return true;
213 }
214
215 hasMoreElements = elements.next();
216 elementPosition++;
217 }
218 return false;
219 }
220
Nils Diewaldbb33da22015-03-04 16:24:25 +0000221
Eliza Margarethac8d59202014-12-16 16:21:16 +0000222 /**
223 * Tells if the current firstSpan is a match.
224 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000225 * @return <code>true</code> if a match is found,
226 * <code>false</code>
Eliza Margarethac8d59202014-12-16 16:21:16 +0000227 * otherwise.
228 * @throws IOException
229 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000230 private boolean isFirstSpanValid () throws IOException {
Eliza Margarethac8d59202014-12-16 16:21:16 +0000231 if (!isOrdered)
232 collectLeftCandidates();
233
234 if (isFirstSpanInElement()) {
235 CandidateSpan target = new CandidateSpan(firstSpans,
236 elementPosition);
237 hasMoreSpans = firstSpans.next();
238 // Checking if the secondspans in the *left* side are not within the
239 // distance range
240 if (!isOrdered && isWithinDistance())
241 return false;
242 // Checking if the secondspans in the *right* side are not within
243 // the distance range
244 collectRightCandidates();
245 if (isWithinDistance())
246 return false;
247
248 setMatchProperties(target);
249 return true;
250 }
251 hasMoreSpans = firstSpans.next();
252 return false;
253 }
254
Nils Diewaldbb33da22015-03-04 16:24:25 +0000255
Eliza Margarethac8d59202014-12-16 16:21:16 +0000256 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000257 * Collects all second spans (candidates) on the right side of the
258 * current
259 * first span (target) position. At the same time, also collects
260 * all other
Eliza Margarethac8d59202014-12-16 16:21:16 +0000261 * first spans occurring before the second spans.
262 *
263 * @throws IOException
264 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000265 private void collectRightCandidates () throws IOException {
Eliza Margarethac8d59202014-12-16 16:21:16 +0000266 while (hasMoreSecondSpans && secondSpans.doc() == currentDocNum) {
267
268 if (elementPosition > firstSpanPostion + maxDistance) {
269 break;
270 }
271 // stores all first spans occurring before the current second span
272 // in the target list.
273 if (hasMoreSpans && firstSpans.start() < secondSpans.start()
274 && firstSpans.doc() == currentDocNum) {
275 if (advanceElementTo(firstSpans)) {
276 targetList.add(new CandidateSpan(firstSpans,
277 elementPosition));
278 }
279 hasMoreSpans = firstSpans.next();
280 continue;
281 }
282 // collects only second spans occurring inside an element
283 if (advanceElementTo(secondSpans)) {
284 candidateList.add(new CandidateSpan(secondSpans,
285 elementPosition));
286 }
287 hasMoreSecondSpans = secondSpans.next();
288 }
289 }
290
Nils Diewaldbb33da22015-03-04 16:24:25 +0000291
Eliza Margarethac8d59202014-12-16 16:21:16 +0000292 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000293 * Collects all the second spans (candidates) occurring before the
294 * first
Eliza Margarethac8d59202014-12-16 16:21:16 +0000295 * spans, and are within an element distance unit.
296 *
297 * @throws IOException
298 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000299 private void collectLeftCandidates () throws IOException {
Eliza Margarethac8d59202014-12-16 16:21:16 +0000300 while (hasMoreSecondSpans && secondSpans.doc() == firstSpans.doc()
301 && secondSpans.start() < firstSpans.end()) {
302 if (advanceElementTo(secondSpans)) {
303 candidateList.add(new CandidateSpan(secondSpans,
304 elementPosition));
305 filterCandidateList(elementPosition);
306 }
307 hasMoreSecondSpans = secondSpans.next();
308 }
309 }
310
Nils Diewaldbb33da22015-03-04 16:24:25 +0000311
Eliza Margarethac8d59202014-12-16 16:21:16 +0000312 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000313 * Tells if there is a candidate span (second span) occurring
314 * together with
315 * the target span (firstspan) within the minimum and maximum
316 * distance
Eliza Margarethac8d59202014-12-16 16:21:16 +0000317 * range.
318 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000319 * @return <code>true</code> if there is a candidate span (second
320 * span)
321 * occurring together with the target span (firstspan)
322 * within the
323 * minimum and maximum distance range, <code>false</code>
324 * otherwise.
Eliza Margarethac8d59202014-12-16 16:21:16 +0000325 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000326 private boolean isWithinDistance () {
Eliza Margarethac8d59202014-12-16 16:21:16 +0000327 int actualDistance;
328 for (CandidateSpan cs : candidateList) {
329 actualDistance = cs.getPosition() - firstSpanPostion;
330 if (!isOrdered)
331 actualDistance = Math.abs(actualDistance);
332
333 if (minDistance <= actualDistance && actualDistance <= maxDistance)
334 return true;
335 }
336 return false;
337 }
338
Nils Diewaldbb33da22015-03-04 16:24:25 +0000339
Eliza Margarethac8d59202014-12-16 16:21:16 +0000340 /**
341 * Tells if the current firstSpans is in an element.
342 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000343 * @return <code>true</code> if the current firstSpans in is an
344 * element,
Eliza Margarethac8d59202014-12-16 16:21:16 +0000345 * <code>false</code> otherwise.
346 * @throws IOException
347 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000348 private boolean isFirstSpanInElement () throws IOException {
Eliza Margarethac8d59202014-12-16 16:21:16 +0000349 if (advanceElementTo(firstSpans)) {
350 firstSpanPostion = elementPosition;
351 filterCandidateList(firstSpanPostion);
352 return true;
353 }
354 return false;
355 }
356
Nils Diewaldbb33da22015-03-04 16:24:25 +0000357
Eliza Margarethac8d59202014-12-16 16:21:16 +0000358 /**
Nils Diewaldbb33da22015-03-04 16:24:25 +0000359 * From the candidateList, removes all candidate spans that are
360 * too far from
361 * the given target position, and have exactly the same position
362 * as the
363 * target position. Only candidate spans occurring within a range
364 * of
Eliza Margarethac8d59202014-12-16 16:21:16 +0000365 * distance from the target position, are retained.
366 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000367 * @param position
368 * target/firstSpan position
Eliza Margarethac8d59202014-12-16 16:21:16 +0000369 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000370 private void filterCandidateList (int position) {
Eliza Margarethac8d59202014-12-16 16:21:16 +0000371
372 Iterator<CandidateSpan> i = candidateList.iterator();
373 CandidateSpan cs;
374 while (i.hasNext()) {
375 cs = i.next();
376 if (cs.getPosition() == position
377 || cs.getPosition() + maxDistance >= position) {
378 break;
379 }
380 i.remove();
381 }
382 }
383
Nils Diewaldbb33da22015-03-04 16:24:25 +0000384
Eliza Margarethac8d59202014-12-16 16:21:16 +0000385 /**
386 * Sets the given target/match CandidateSpan as the current match.
387 *
Nils Diewaldbb33da22015-03-04 16:24:25 +0000388 * @param match
389 * a target/firstSpan wrapped as a CandidateSpan
Eliza Margarethac8d59202014-12-16 16:21:16 +0000390 * @throws IOException
391 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000392 private void setMatchProperties (CandidateSpan match) throws IOException {
Eliza Margarethac8d59202014-12-16 16:21:16 +0000393 matchDocNumber = match.getDoc();
394 matchStartPosition = match.getStart();
395 matchEndPosition = match.getEnd();
396
397 if (collectPayloads && match.getPayloads() != null)
398 matchPayload.addAll(match.getPayloads());
399
400 setMatchFirstSpan(match);
401 }
402
Nils Diewaldbb33da22015-03-04 16:24:25 +0000403
Eliza Margarethac8d59202014-12-16 16:21:16 +0000404 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +0000405 public boolean skipTo (int target) throws IOException {
Eliza Margarethac8d59202014-12-16 16:21:16 +0000406 if (hasMoreSpans && firstSpans.doc() < target) {
407 if (!firstSpans.skipTo(target)) {
408 hasMoreSpans = false;
409 return false;
410 }
411 }
412 return advance();
413 }
414
Nils Diewaldbb33da22015-03-04 16:24:25 +0000415
Eliza Margarethac8d59202014-12-16 16:21:16 +0000416 @Override
Nils Diewaldbb33da22015-03-04 16:24:25 +0000417 public long cost () {
Eliza Margarethac8d59202014-12-16 16:21:16 +0000418 return elements.cost() + firstSpans.cost() + secondSpans.cost();
419 }
Eliza Margarethae335beb2014-02-27 12:56:14 +0000420
421}