blob: a2e4b86b4e1f1cf32caf8990b41c50f6c0d99e9d [file] [log] [blame]
Nils Diewaldf399a672013-11-18 17:55:22 +00001package de.ids_mannheim.korap.query.spans;
2
3import org.apache.lucene.index.AtomicReaderContext;
4import org.apache.lucene.index.Term;
5import org.apache.lucene.index.TermContext;
6import org.apache.lucene.util.ArrayUtil;
7import org.apache.lucene.util.Bits;
8import org.apache.lucene.search.spans.Spans;
9import org.apache.lucene.search.spans.SpanQuery;
10import org.apache.lucene.search.DocIdSetIterator;
11
12import java.nio.ByteBuffer;
13
14import java.io.IOException;
15import java.util.ArrayList;
16import java.util.Comparator;
17import java.util.HashSet;
18import java.util.LinkedList;
19import java.util.List;
20import java.util.Collection;
21import java.util.Map;
22import java.util.Set;
23
24import de.ids_mannheim.korap.query.SpanWithinQuery;
25import de.ids_mannheim.korap.query.spans.KorapLongSpan;
26
27import org.slf4j.Logger;
28import org.slf4j.LoggerFactory;
29
30import java.util.*;
31import java.io.*;
32
33/*
34<a>x<b>y<c>z[1]</c>z[2]</b>z[3]</a>
35
36Erst:
37a mit ?
38-> fetch empty
39-> [].next -> 1
40=> match!
41-> speicher [1]
42a mit ?
43-> fetch gecheckt! (WIE MERKE ICH MIR DAS?)
44-> [].next -> 1
45=> match!
46-> speicher [2]
47a mit [3]
48=> match!
49-> speicher [3]
50b mit ?
51-> fetch [1]
52=> match!
53-> wenn [1].end <= b.end
54 -> speicher [1]
55b mit ?
56-> fetch [2]
57=> match!
58-> wenn [2].end <= b.end
59 -> speicher [2]
60
61Speicher: start, end, payload, wrapStart, wrapEnd
62
63Problem: Es ist nicht garantiert, dass bei
64<a><b><c>x</c>y</b>z</a>
65die Wrapreihenfolge a,b,c rauskommt!
66*/
67
68public class WithinSpans extends Spans {
69 private boolean firstTime = true;
70 private boolean more = false;
71
72 // Initialize as invalid
73 private int matchDoc = -1;
74 private int matchStart = -1;
75 private int matchEnd = -1;
76
77 /** Indicates that the wrap and the embedded spans are in the same doc */
78 private boolean inSameDoc = false;
79 private int wrapDoc;
80 private int embeddedDoc;
81 private int wrapStart, wrapEnd, embeddedStart, embeddedEnd;
82 private Collection<byte[]> embeddedPayload;
83
84 // Wrap span
85 private final Spans wrapSpans;
86 private final Spans wrapSpansByDoc; // Necessary?
87
88 // Embedded span
89 private final Spans embeddedSpans;
90 private final Spans embeddedSpansByDoc;
91
92 private SpanWithinQuery query;
93
94 private LinkedList<KorapLongSpan> spanStore1, spanStore2;
95
96 private List<byte[]> matchPayload;
97
98 private short flag;
99
100 private final Logger log = LoggerFactory.getLogger(WithinSpans.class);
101
102 // Constructor
103 public WithinSpans (SpanWithinQuery spanWithinQuery,
104 AtomicReaderContext context,
105 Bits acceptDocs,
106 Map<Term,TermContext> termContexts,
107 short flag) throws IOException {
108
109 log.trace("Init WithinSpans");
110
111 // Init copies
112 this.matchPayload = new LinkedList<byte[]>();
113
114 // Get span
115 this.wrapSpans = spanWithinQuery.wrap().getSpans(context, acceptDocs, termContexts);
116 this.wrapSpansByDoc = wrapSpans; // used in toSameDoc()
117
118 this.embeddedSpans = spanWithinQuery.embedded().getSpans(context, acceptDocs, termContexts);
119 this.embeddedSpansByDoc = embeddedSpans; // used in toSameDoc()
120
121 this.flag = flag;
122
123 this.spanStore1 = new LinkedList<KorapLongSpan>();
124 this.spanStore2 = new LinkedList<KorapLongSpan>();
125
126 this.query = spanWithinQuery; // kept for toString() only.
127 };
128
129
130 /** Move to the next match, returning true iff any such exists. */
131 @Override
132 public boolean next () throws IOException {
133 log.trace("Next with doc {}", matchDoc);
134
135 // Check for init next
136 if (firstTime) {
137 firstTime = false;
138 if (!embeddedSpans.next() || !wrapSpans.next()) {
139 log.trace("No next in firstSpan nor in secondSpan 1");
140 more = false;
141 return false;
142 };
143 log.trace("Spans are initialized");
144 more = true;
145 wrapStart = wrapSpans.start();
146 wrapEnd = wrapSpans.end();
147 wrapDoc = matchDoc = wrapSpans.doc();
148
149 embeddedStart = embeddedSpans.start();
150 embeddedEnd = embeddedSpans.end();
151 embeddedDoc = embeddedSpans.doc();
152
153 if (embeddedSpans.isPayloadAvailable()) {
154 Collection<byte[]> payload = embeddedSpans.getPayload();
155 embeddedPayload = new ArrayList<byte[]>(payload.size());
156 embeddedPayload.addAll(payload);
157 };
158
159 log.trace("Init spans: {}", _actPos());
160
161 if (embeddedDoc == matchDoc) {
162 inSameDoc = true;
163 log.trace("Is now inSameDoc");
164 }
165 else {
166 log.trace("Is not inSameDoc");
167 };
168 log.trace("Next with doc {} (wrap) and {} (embedded)", wrapDoc, embeddedDoc);
169 };
170
171 matchPayload.clear();
172 return advanceAfterCheck();
173 };
174
175
176 /** Advances the subSpans to just after a within match.
177 * @return true iff there is such a match.
178 */
179 private boolean advanceAfterCheck() throws IOException {
180 log.trace("advanceAfterChecked inSameDoc: {} and more: {}", inSameDoc, more);
181 log.trace("advanceAfterCheck with doc {} (wrap) and {} (embedded)", wrapDoc, embeddedDoc);
182
183 // There are more spans, and both spans are either in the
184 // same doc or can be forwarded to the same doc.
185 while (more && (inSameDoc || toSameDoc())) {
186
187 log.trace("There are more spans in doc {}", embeddedDoc);
188
189 /* spans are in the same doc */
190 if (within()) {
191 return true;
192 }
193 else {
194 log.trace("No within");
195 };
196 };
197
198 log.trace("No more matches");
199
200 return false; // no more matches
201 };
202
203
204 /** Advance the subSpans to the same document */
205 private boolean toSameDoc () throws IOException {
206 log.trace("toSameDoc");
207
208 /*
209 wrapDoc = wrapSpansByDoc.doc();
210 embeddedDoc = embeddedSpansByDoc.doc();
211
212 */
213
214 if (wrapDoc != embeddedDoc) {
215 log.trace("Docs not identical: {} vs {}", wrapDoc, embeddedDoc);
216
217 spanStore1.clear(); // = new LinkedList<KorapLongSpan>();
218 spanStore2.clear(); // = new LinkedList<KorapLongSpan>();
219
220 if (wrapDoc < embeddedDoc) {
221 log.trace("Skip wrap from {} to {}", wrapDoc, embeddedDoc);
222 if (!wrapSpansByDoc.skipTo(embeddedDoc)) {
223 more = false;
224 inSameDoc = false;
225 return false;
226 };
227 wrapDoc = wrapSpans.doc();
228 }
229 else if (wrapDoc > embeddedDoc) {
230 log.trace("Skip embedded from {} to {}", embeddedSpans.doc(), wrapDoc);
231 // if (!embeddedSpansByDoc.skipTo( wrapDoc )) {
232 if (wrapDoc != embeddedSpans.doc()) {
233 if (embeddedSpans.doc() == DocIdSetIterator.NO_MORE_DOCS || !embeddedSpans.skipTo( wrapDoc )) {
234 more = false;
235 inSameDoc = false;
236 return false;
237 };
238 }
239 else {
240 _add_current();
241 // embeddedDoc = embeddedSpans.doc();
242 };
243 };
244 }
245 else {
246 log.trace("Docs identical");
247 };
248 embeddedStart = embeddedSpans.start();
249 embeddedEnd = embeddedSpans.end();
250 log.trace("The new embedded start is {}-{}", embeddedStart, embeddedEnd);
251 inSameDoc = true;
252 return true;
253 };
254
255
256 /** Skips to the first match beyond the current, whose document number is
257 * greater than or equal to <i>target</i>. <p>Returns true iff there is such
258 * a match. <p>Behaves as if written: <pre class="prettyprint">
259 * boolean skipTo(int target) {
260 * do {
261 * if (!next())
262 * return false;
263 * } while (target > doc());
264 * return true;
265 * }
266 * </pre>
267 * Most implementations are considerably more efficient than that.
268 */
269 public boolean skipTo (int target) throws IOException {
270 log.trace("skipTo {}", target);
271
272 // Check for init next
273 if (firstTime) {
274 firstTime = false;
275 if (!embeddedSpans.next() || !wrapSpans.next()) {
276 log.trace("No next in firstSpan nor in secondSpan 2");
277 more = false;
278 return false;
279 };
280 more = true;
281 wrapStart = wrapSpans.start();
282 wrapEnd = wrapSpans.end();
283 wrapDoc = embeddedSpans.doc();
284 embeddedStart = embeddedSpans.start();
285 embeddedEnd = embeddedSpans.end();
286 embeddedDoc = embeddedSpans.doc();
287 }
288
289 /*
290 See NextSpans for the same problem!
291 This should be dealt with in toSameDoc!!!
292 */
293 else if (more && (embeddedSpans.doc() < target)) {
294 if (embeddedSpans.skipTo(target)) {
295 inSameDoc = false;
296 embeddedDoc = target;
297 }
298
299 // Can't be skipped to target
300 else {
301 more = false;
302 return false;
303 };
304 };
305
306 matchPayload.clear();
307 return advanceAfterCheck();
308 };
309
310 private String _actPos () {
311 StringBuilder sb = new StringBuilder();
312 sb.append("<").append(wrapStart).append('-').append(wrapEnd).append('>');
313 sb.append(embeddedStart).append('-').append(embeddedEnd);
314 sb.append("</>");
315 return sb.toString();
316 };
317
318
319 private boolean within () throws IOException {
320 log.trace("within");
321
322 while (more && inSameDoc) {
323
324 // Case 1-5
325 // Case 1
326 // |---|
327 // |-|
328 // Case 2
329 // |---|
330 // |-|
331 // Case 3
332 // |---|
333 // |---|
334 // Case 4
335 // |-|
336 // |---|
337 // Case 5
338 // |-|"
339 // |---|
340 if (wrapStart > embeddedStart) {
341 log.trace("[Case] 1-5 with {}", _actPos());
342
343 if (this.fetchNext()) {
344 continue;
345 };
346
347 // Forward wrapSpan
348 if (wrapSpans.next()) {
349 wrapDoc = wrapSpans.doc();
350 if (this.toSameDoc()) {
351 wrapStart = wrapSpans.start();
352 wrapEnd = wrapSpans.end();
353 continue;
354 };
355 };
356
357 this.more = false;
358 this.inSameDoc = false;
359 return false;
360 };
361
362 // Get wrapEnd
363 // wrapEnd = wrapSpans.end();
364
365 KorapLongSpan embedded = new KorapLongSpan();
366 embedded.start = embeddedStart;
367 embedded.end = embeddedEnd;
368 embedded.doc = embeddedDoc;
369 if (embeddedPayload != null)
370 embedded.payload = embeddedPayload;
371
372 this.spanStore1.add(embedded);
373 log.trace("pushed to spanStore1: {}", embedded.toString());
374
375
376 // Case 12
377 // |---|
378 // |-|
379 // Case 13
380 // |---|
381 // |-|
382 if (wrapEnd <= embeddedStart) {
383 log.trace("[Case] 12-13 with {}", _actPos());
384
385 // Copy content of spanStores
386 if (!spanStore1.isEmpty()) {
387 log.trace("First store is not empty - copy to second store!");
388 spanStore2.addAll(0, (LinkedList<KorapLongSpan>) spanStore1.clone());
389 spanStore1.clear();
390 log.trace("Second store is now: {}", spanStore2.toString());
391 };
392
393 // Forward wrapSpan
394 log.trace("Try to forward wrapspan");
395
396 if (wrapSpans.next()) {
397 wrapDoc = wrapSpans.doc();
398 log.trace("wrapDoc is now {} while embeddedDoc is {}", wrapDoc, embeddedDoc);
399 if (this.toSameDoc()) {
400 wrapStart = wrapSpans.start();
401 wrapEnd = wrapSpans.end();
402 if (fetchTwoNext())
403 continue;
404 };
405 }
406 else {
407 log.trace("Unable to forward wrapspan");
408 };
409
410 this.inSameDoc = false;
411 this.more = false;
412 return false;
413 }
414
415
416 // Case 6 - 8
417 else if (wrapStart == embeddedStart) {
418
419 // Case 6
420 // |---|
421 // |-|
422 if (wrapEnd > embeddedEnd) {
423 log.trace("[Case] 6 with {}", _actPos());
424
425 // neither match nor endWith
426 if (this.flag < (short) 2) {
427 _setMatch(embedded);
428 log.trace("MATCH1!! with {}", _actPos());
429 fetchTwoNext();
430 return true;
431 };
432
433 fetchTwoNext();
434 continue;
435 }
436
437 // Case 7
438 // |---|
439 // |---|
440 else if (wrapEnd == embeddedEnd) {
441 log.trace("[Case] 7 with {}", _actPos());
442
443 _setMatch(embedded);
444 log.trace("MATCH2!! with {}", _actPos());
445 fetchTwoNext();
446 return true;
447 };
448
449 // Case 8
450 // |-|
451 // |---|
452 // wrapEnd < embeddedEnd
453 log.trace("[Case] 8 with {}", _actPos());
454 fetchTwoNext();
455 continue;
456 };
457
458 // Case 9-11
459 // wrapStart < wrapEnd
460
461 // Case 9
462 // |---|
463 // |-|
464 if (wrapEnd > embeddedEnd) {
465 log.trace("[Case] 9 with {}", _actPos());
466
467 // neither match nor endWith
468 if (this.flag == (short) 0) {
469 _setMatch(embedded);
470 log.trace("MATCH3!! with {}", _actPos());
471 fetchTwoNext();
472 return true;
473 };
474
475 fetchTwoNext();
476 continue;
477 }
478 // Case 10
479 // |---|
480 // |-|
481 else if (wrapEnd == embeddedEnd) {
482 log.trace("[Case] 10 with {}", _actPos());
483
484 // neither match nor endWith
485 if (this.flag == (short) 0 || this.flag == (short) 2) {
486 _setMatch(embedded);
487 log.trace("MATCH4!! with {}", _actPos());
488 fetchTwoNext();
489 return true;
490 };
491
492 fetchTwoNext();
493 continue;
494 };
495
496 // Case 11
497 // |---|
498 // |---|
499 // wrapEnd < embeddedEnd
500 log.trace("[Case] 11 with {}", _actPos());
501 fetchTwoNext();
502 continue;
503 };
504
505 this.more = false;
506 return false;
507 };
508
509
510 private boolean fetchNext () throws IOException {
511
512 // Fetch span from first store
513 if (spanStore1.isEmpty()) {
514 log.trace("First store is empty");
515 return fetchTwoNext();
516 };
517
518 KorapLongSpan current = spanStore1.removeFirst();
519 log.trace("Fetch from first store: {}", current.toString());
520
521 embeddedStart = current.start;
522 embeddedEnd = current.end;
523 embeddedDoc = current.doc;
524 if (current.payload != null)
525 embeddedPayload = current.payload;
526
527 return true;
528 };
529
530
531 private boolean fetchTwoNext () throws IOException {
532
533 // Fetch span from second store
534 if (spanStore2.isEmpty()) {
535 log.trace("Second store is empty");
536
537 // Forward spans
538 if (this.embeddedSpans.next()) {
539 log.trace("Forwarded embeddedSpans");
540
541 if (this.embeddedSpans.doc() != wrapDoc && !spanStore1.isEmpty()) {
542
543 log.trace("No docmatch and still stuff in store");
544 log.trace("First store is not empty - copy to second store!");
545 spanStore2.addAll(0, (LinkedList<KorapLongSpan>) spanStore1.clone());
546 spanStore1.clear();
547
548 _add_current();
549
550 log.trace("Second store is now: {}", spanStore2.toString());
551 }
552 else {
553 embeddedStart = embeddedSpans.start();
554 embeddedEnd = embeddedSpans.end();
555 embeddedDoc = embeddedSpans.doc();
556
557 if (embeddedSpans.isPayloadAvailable()) {
558 Collection<byte[]> payload = embeddedSpans.getPayload();
559 // Maybe just clear
560 embeddedPayload = new ArrayList<byte[]>(payload.size());
561 embeddedPayload.addAll(payload);
562 };
563
564 return this.toSameDoc();
565 };
566 }
567 else {
568 log.trace("Forwarded embeddedSpans failed");
569 };
570
571 log.trace("EmbeddedDoc: " + embeddedDoc);
572
573 // Forward wrapSpan
574 log.trace("Try to forward wrapspan");
575 if (wrapSpans.next()) {
576 wrapDoc = wrapSpans.doc();
577 if (this.toSameDoc()) {
578 wrapStart = wrapSpans.start();
579 wrapEnd = wrapSpans.end();
580
581 log.trace("WrapSpan forwarded");
582
583 // Copy content of spanStores
584 if (!spanStore1.isEmpty()) {
585 log.trace("First store is not empty - copy to second store!");
586 spanStore2.addAll(0, (LinkedList<KorapLongSpan>) spanStore1.clone());
587 spanStore1.clear();
588 log.trace("Second store is now: {}", spanStore2.toString());
589 };
590
591 return this.fetchTwoNext();
592 };
593 };
594
595 // Don't know.
596 log.trace("No more fetchNext()");
597
598 more = false;
599 return false;
600 };
601
602 KorapLongSpan current = spanStore2.removeFirst();
603 log.trace("Fetch from second store: {}", current.toString());
604
605 embeddedStart = current.start;
606 embeddedEnd = current.end;
607 embeddedDoc = current.doc;
608 embeddedPayload = current.payload;
609
610 return true;
611 };
612
613
614 /*
615TODO: Maybe ignore "embedded" parameter and use embeddedPayload directly
616 */
617 private void _setMatch (KorapLongSpan embedded) throws IOException {
618 matchStart = wrapStart;
619 matchEnd = wrapEnd;
620 matchDoc = embeddedDoc;
621 matchPayload.clear();
622
623 if (embedded.payload != null)
624 matchPayload.addAll(embedded.payload);
625
626 if (wrapSpans.isPayloadAvailable()) {
627 Collection<byte[]> payload = wrapSpans.getPayload();
628 matchPayload.addAll(payload);
629 };
630 };
631
632
633 private void _add_current () throws IOException{
634 KorapLongSpan embedded = new KorapLongSpan();
635 embedded.start = embeddedSpans.start();
636 embedded.end = embeddedSpans.end();
637 embedded.doc = embeddedSpans.doc();
638
639 if (embeddedSpans.isPayloadAvailable()) {
640 Collection<byte[]> payload = embeddedSpans.getPayload();
641 embedded.payload = new ArrayList<byte[]>(payload.size());
642 embedded.payload.addAll(payload);
643 };
644
645 this.spanStore2.add(embedded);
646 log.trace("pushed to spanStore2: {}", embedded.toString());
647 };
648
649
650 /** Returns the document number of the current match. Initially invalid. */
651 @Override
652 public int doc () {
653 return matchDoc;
654 };
655
656 /** Returns the start position of the embedding wrap. Initially invalid. */
657 @Override
658 public int start () {
659 return matchStart;
660 };
661
662 /** Returns the end position of the embedding wrap. Initially invalid. */
663 @Override
664 public int end () {
665 return matchEnd;
666 };
667
668 /**
669 * Returns the payload data for the current span.
670 * This is invalid until {@link #next()} is called for
671 * the first time.
672 * This method must not be called more than once after each call
673 * of {@link #next()}. However, most payloads are loaded lazily,
674 * so if the payload data for the current position is not needed,
675 * this method may not be called at all for performance reasons. An ordered
676 * SpanQuery does not lazy load, so if you have payloads in your index and
677 * you do not want ordered SpanNearQuerys to collect payloads, you can
678 * disable collection with a constructor option.<br>
679 * <br>
680 * Note that the return type is a collection, thus the ordering should not be relied upon.
681 * <br/>
682 * @lucene.experimental
683 *
684 * @return a List of byte arrays containing the data of this payload, otherwise null if isPayloadAvailable is false
685 * @throws IOException if there is a low-level I/O error
686 */
687 // public abstract Collection<byte[]> getPayload() throws IOException;
688 @Override
689 public Collection<byte[]> getPayload() throws IOException {
690 return matchPayload;
691 };
692
693
694 /**
695 * Checks if a payload can be loaded at this position.
696 * <p/>
697 * Payloads can only be loaded once per call to
698 * {@link #next()}.
699 *
700 * @return true if there is a payload available at this position that can be loaded
701 */
702 @Override
703 public boolean isPayloadAvailable() {
704 return matchPayload.isEmpty() == false;
705 };
706
707 // Todo: This may be in the wrong version
708 @Override
709 public long cost() {
710 return Math.min(wrapSpans.cost(), embeddedSpans.cost());
711 };
712
713 @Override
714 public String toString() {
715 return getClass().getName() + "("+query.toString()+")@"+
716 (firstTime?"START":(more?(doc()+":"+start()+"-"+end()):"END"));
717 };
718};