blob: c2a6da17d1267094821b9ca3735211f54db7c7bd [file] [log] [blame]
Nils Diewaldf399a672013-11-18 17:55:22 +00001package de.ids_mannheim.korap.query.spans;
2
Nils Diewald82a4b862014-02-20 21:17:41 +00003import de.ids_mannheim.korap.query.spans.KorapTermSpan;
4
Nils Diewaldf399a672013-11-18 17:55:22 +00005import org.apache.lucene.index.Term;
6import org.apache.lucene.index.DocsAndPositionsEnum;
7import org.apache.lucene.search.DocIdSetIterator;
8import org.apache.lucene.search.spans.Spans;
9import org.apache.lucene.util.BytesRef;
10
11import java.nio.ByteBuffer;
12
13import org.slf4j.Logger;
14import org.slf4j.LoggerFactory;
15
16import java.io.IOException;
17import java.util.Collections;
18import java.util.Collection;
19import java.util.LinkedList;
20import java.util.ArrayList;
21import java.util.List;
22
// TODO: Store payloads in 12 bytes instead of the complicated ByteBuffer stuff!
// TODO: Use copyFrom() instead of clone()
Nils Diewaldf399a672013-11-18 17:55:22 +000025
Eliza Margaretha1c3bf272014-06-11 11:50:39 +000026/**
27 * @author Nils Diewald, margaretha
Nils Diewald6802acd2014-03-18 18:29:30 +000028 *
Nils Diewald20607ab2014-03-20 23:28:36 +000029 * Use copyFrom instead of clone
Nils Diewald82a4b862014-02-20 21:17:41 +000030 */
Nils Diewaldf399a672013-11-18 17:55:22 +000031public class ElementSpans extends Spans {
Nils Diewald82a4b862014-02-20 21:17:41 +000032
Nils Diewald20607ab2014-03-20 23:28:36 +000033 private byte[] payloadByte;
Nils Diewaldf399a672013-11-18 17:55:22 +000034 private ByteBuffer bb = ByteBuffer.allocate(4);
35
36 protected final DocsAndPositionsEnum postings;
37 protected final Term term;
Nils Diewald20607ab2014-03-20 23:28:36 +000038 private int freq = 0, count = 0;
39
Nils Diewald6802acd2014-03-18 18:29:30 +000040 private LinkedList<KorapTermSpan> memory;
Nils Diewald20607ab2014-03-20 23:28:36 +000041 private KorapTermSpan overflow, current, temp;
42
Eliza Margaretha1c3bf272014-06-11 11:50:39 +000043 public boolean isElementRef = false; // A dummy flag for
44
Nils Diewald20607ab2014-03-20 23:28:36 +000045 public static final ElementSpans EMPTY_ELEMENT_SPANS
46 = new EmptyElementSpans();
Nils Diewaldf399a672013-11-18 17:55:22 +000047
48 private final static Logger log = LoggerFactory.getLogger(ElementSpans.class);
Nils Diewaldc025a232014-02-28 19:01:14 +000049 // This advices the java compiler to ignore all loggings
Nils Diewaldb5b7b8d2014-06-06 18:41:54 +000050 public static final boolean DEBUG = false;
Nils Diewaldf399a672013-11-18 17:55:22 +000051
Eliza Margarethafb25cef2014-06-06 14:19:07 +000052
Nils Diewald8c543432014-02-27 18:25:38 +000053 /**
54 * The constructor.
55 */
Nils Diewaldf399a672013-11-18 17:55:22 +000056 public ElementSpans(DocsAndPositionsEnum postings, Term term) {
57 this.postings = postings;
58 this.term = term;
Nils Diewald20607ab2014-03-20 23:28:36 +000059
Nils Diewaldf399a672013-11-18 17:55:22 +000060 // storedPayload = null;
Nils Diewald20607ab2014-03-20 23:28:36 +000061 this.memory = new LinkedList<KorapTermSpan>();
62
63 // Overflow span
64 this.overflow = new KorapTermSpan();
65
66 // Current span
67 this.current = new KorapTermSpan();
68
69 // Temporary span
70 this.temp = new KorapTermSpan();
Nils Diewaldf399a672013-11-18 17:55:22 +000071 };
Nils Diewald20607ab2014-03-20 23:28:36 +000072
Nils Diewaldf399a672013-11-18 17:55:22 +000073 // only for EmptyElementSpans (below)
Nils Diewald8c543432014-02-27 18:25:38 +000074 public ElementSpans() {
Nils Diewald20607ab2014-03-20 23:28:36 +000075 this.term = null;
76 this.postings = null;
Nils Diewaldf399a672013-11-18 17:55:22 +000077 };
78
79 @Override
80 public boolean next() throws IOException {
Nils Diewald20607ab2014-03-20 23:28:36 +000081
82 // There is a memory
83 if (this.memory.size() > 0) {
84 this.setToCurrent(memory.removeFirst(), 1);
Nils Diewaldf399a672013-11-18 17:55:22 +000085
Nils Diewald82a4b862014-02-20 21:17:41 +000086 if (DEBUG)
Nils Diewald20607ab2014-03-20 23:28:36 +000087 log.trace(" --- MATCH --- Fetch from memory {}",
88 this.current.toString());
89
Nils Diewaldf399a672013-11-18 17:55:22 +000090 return true;
91 };
92
Nils Diewald20607ab2014-03-20 23:28:36 +000093 // Last element in document is reached
94 if (this.count == this.freq) {
Nils Diewaldf399a672013-11-18 17:55:22 +000095
Nils Diewald20607ab2014-03-20 23:28:36 +000096 if (this.postings == null)
97 return false;
Nils Diewald82a4b862014-02-20 21:17:41 +000098
Nils Diewaldf399a672013-11-18 17:55:22 +000099
Nils Diewald20607ab2014-03-20 23:28:36 +0000100 // There is an overflow
101 if (this.overflow.doc != -1) {
102 if (DEBUG)
103 log.trace("Fetch from overflow");
104
105 this.setToCurrent(this.overflow, 2);
106
107 // Reset overflow
108 this.overflow.reset();
Nils Diewald82a4b862014-02-20 21:17:41 +0000109
110 if (DEBUG)
Nils Diewald20607ab2014-03-20 23:28:36 +0000111 log.trace(" --- MATCH --- Fetch from memory {}",
112 this.current.toString());
113
Nils Diewaldf399a672013-11-18 17:55:22 +0000114 return true;
115 };
116
Nils Diewald20607ab2014-03-20 23:28:36 +0000117 // There is no next document
118 if (!this.nextDoc())
Nils Diewaldf399a672013-11-18 17:55:22 +0000119 return false;
Nils Diewaldf399a672013-11-18 17:55:22 +0000120 };
121
Nils Diewald20607ab2014-03-20 23:28:36 +0000122 // overflow is not empty - let's treat this as current
123 if (this.overflow.doc != -1) {
124
125 if (DEBUG)
126 log.trace("Overflow is not empty");
127
128 this.setToCurrent(this.overflow, 3);
129
130 // TODO: newOverflow() ???
131 this.overflow.reset();
132 }
133 else {
134 if (DEBUG)
135 log.trace("Overflow is empty");
136
137 // Get next posting - count is still < freq
138 this.setToCurrent(4);
139
140 if (this.count == this.freq) {
141 if (DEBUG)
142 log.trace(" --- MATCH --- Direct {}",
143 this.current.toString());
144 return true;
Nils Diewald82a4b862014-02-20 21:17:41 +0000145 };
Nils Diewald20607ab2014-03-20 23:28:36 +0000146 };
Nils Diewaldf399a672013-11-18 17:55:22 +0000147
Nils Diewald20607ab2014-03-20 23:28:36 +0000148 while (this.count < this.freq) {
Nils Diewaldf399a672013-11-18 17:55:22 +0000149
Nils Diewald20607ab2014-03-20 23:28:36 +0000150 // Temp is now the old current
151 this.setCurrentToTemp();
Nils Diewaldf399a672013-11-18 17:55:22 +0000152
Nils Diewald20607ab2014-03-20 23:28:36 +0000153 // Get new current
154 this.setToCurrent(5);
155
156 if (DEBUG)
157 log.trace("Compare {} with {}",
158 this.current.toString(),
159 this.temp.toString());
160
161 // The next span is not at the same position
162 if (this.current.start != this.temp.start) {
163
164 // Add this to memory
165 if (this.memory.size() > 0) {
Nils Diewald82a4b862014-02-20 21:17:41 +0000166 if (DEBUG)
Nils Diewald20607ab2014-03-20 23:28:36 +0000167 log.trace("[1] Add to memory {}", this.temp.toString());
168 this.memory.add((KorapTermSpan) this.temp.clone());
169 this.overflow = this.current;
Nils Diewaldf399a672013-11-18 17:55:22 +0000170 break;
171 };
172
Nils Diewald20607ab2014-03-20 23:28:36 +0000173 // There is no reason to start a memory
174 this.overflow = this.current;
175 this.current = this.temp;
Nils Diewaldf399a672013-11-18 17:55:22 +0000176
Nils Diewald82a4b862014-02-20 21:17:41 +0000177 if (DEBUG)
Nils Diewald20607ab2014-03-20 23:28:36 +0000178 log.trace(" --- MATCH --- Fetch from memory {}",
179 this.current.toString());
Nils Diewaldf399a672013-11-18 17:55:22 +0000180
Nils Diewald20607ab2014-03-20 23:28:36 +0000181 return true;
182 }
Nils Diewaldf399a672013-11-18 17:55:22 +0000183
Nils Diewald20607ab2014-03-20 23:28:36 +0000184 // The positions are equal
185 else {
186 if (DEBUG)
187 log.trace("[2] Add to memory {}", this.temp.toString());
188 this.memory.add((KorapTermSpan) this.temp.clone());
Nils Diewaldf399a672013-11-18 17:55:22 +0000189 };
190 };
Nils Diewald82a4b862014-02-20 21:17:41 +0000191
Nils Diewald20607ab2014-03-20 23:28:36 +0000192 if (this.temp.doc == this.current.doc &&
193 this.temp.start == this.current.start) {
Nils Diewald82a4b862014-02-20 21:17:41 +0000194 if (DEBUG)
Nils Diewald20607ab2014-03-20 23:28:36 +0000195 log.trace("[3] Add to memory {}", this.current.toString());
196 this.memory.add((KorapTermSpan) this.current.clone());
Nils Diewaldf399a672013-11-18 17:55:22 +0000197 };
Nils Diewald82a4b862014-02-20 21:17:41 +0000198
Nils Diewald20607ab2014-03-20 23:28:36 +0000199 // Sort the memory
200 Collections.sort(memory);
Nils Diewald82a4b862014-02-20 21:17:41 +0000201
Nils Diewald20607ab2014-03-20 23:28:36 +0000202 // There is now a memory
203 return this.next();
Nils Diewaldf399a672013-11-18 17:55:22 +0000204 };
Nils Diewald20607ab2014-03-20 23:28:36 +0000205
Nils Diewaldf399a672013-11-18 17:55:22 +0000206
Nils Diewald20607ab2014-03-20 23:28:36 +0000207 // get next doc
208 private boolean nextDoc () throws IOException {
Nils Diewaldf399a672013-11-18 17:55:22 +0000209
Nils Diewald20607ab2014-03-20 23:28:36 +0000210 // Check if this doc is the last
211 if (this.current.doc == DocIdSetIterator.NO_MORE_DOCS)
Nils Diewaldf399a672013-11-18 17:55:22 +0000212 return false;
Nils Diewaldf399a672013-11-18 17:55:22 +0000213
Nils Diewald20607ab2014-03-20 23:28:36 +0000214 if (DEBUG)
215 log.trace("Go to next document");
216
217 this.current.reset();
218
219 // Advance to next doc
220 this.current.doc = this.postings.nextDoc();
221
222 // Check if this doc is the last
223 if (this.current.doc == DocIdSetIterator.NO_MORE_DOCS)
224 return false;
225
226 // check frequencies
227 this.freq = this.postings.freq();
228
229 if (DEBUG)
230 log.trace("Document <{}> has {} occurrences",
231 this.current.doc,
232 this.freq);
233
234
235 this.count = 0;
Nils Diewaldf399a672013-11-18 17:55:22 +0000236 return true;
237 };
238
Nils Diewald20607ab2014-03-20 23:28:36 +0000239
240 @Override
241 public boolean skipTo(int target) throws IOException {
242
243 assert target > this.current.doc;
244
245 // Get this doc
246 this.current.doc = postings.advance(target);
247
248 if (this.current.doc == DocIdSetIterator.NO_MORE_DOCS)
249 return false;
250
251 if (this.memory != null)
252 this.memory.clear();
253
254 this.overflow.reset();
255
256
257 this.freq = this.postings.freq();
258
259 if (DEBUG)
260 log.trace("Document {} has {} occurrences", this.current.doc, this.freq);
261
262
263 this.count = 0;
264
265 if (this.next())
266 return true;
267
268 return false;
269 };
270
271
Nils Diewaldf399a672013-11-18 17:55:22 +0000272 @Override
273 public int doc() {
Nils Diewald20607ab2014-03-20 23:28:36 +0000274 return this.current.doc;
Nils Diewaldf399a672013-11-18 17:55:22 +0000275 };
276
Nils Diewald20607ab2014-03-20 23:28:36 +0000277
Nils Diewaldf399a672013-11-18 17:55:22 +0000278 @Override
279 public int start() {
Nils Diewald20607ab2014-03-20 23:28:36 +0000280 return this.current.start;
Nils Diewaldf399a672013-11-18 17:55:22 +0000281 };
282
Nils Diewald20607ab2014-03-20 23:28:36 +0000283
Nils Diewaldf399a672013-11-18 17:55:22 +0000284 @Override
285 public int end() {
Eliza Margarethafb25cef2014-06-06 14:19:07 +0000286 if (!this.current.isPayloadRead){
287 try {
288 readPayload();
289 } catch (IOException e) {
290 e.printStackTrace();
291 }
292 }
293 return this.current.end;
Nils Diewaldf399a672013-11-18 17:55:22 +0000294 };
295
Eliza Margarethafb25cef2014-06-06 14:19:07 +0000296 public short getElementRef() throws IOException{
297 if (!this.current.isPayloadRead){
298 readPayload();
299 }
300 return this.current.elementRef;
301 }
Nils Diewald20607ab2014-03-20 23:28:36 +0000302
Eliza Margarethafb25cef2014-06-06 14:19:07 +0000303 private void readPayload() throws IOException {
304
305 this.current.clearPayload();
306 BytesRef payload = postings.getPayload();
307
308 if (payload != null) {
309 //System.out.println(payload.bytes.length);
310
311 // Copy some payloads like start character and end character
312 this.current.payload.put(payload.bytes, payload.offset, 8);
Eliza Margarethafb25cef2014-06-06 14:19:07 +0000313
Eliza Margaretha1c3bf272014-06-11 11:50:39 +0000314 this.current.end = readEndPostion(payload);
315
316 if (isElementRef ){
317 // Copy rest of payloads after the end position and elementref
318 this.current.payload.put(payload.bytes, payload.offset + 14, payload.length - 14);
319 this.current.elementRef = readElementRef(payload);
320 }
321 else{
322 // Copy rest of payloads after the end position
323 this.current.payload.put(payload.bytes, payload.offset + 12, payload.length - 12);
Eliza Margaretha669e7a82014-06-26 12:57:18 +0000324 this.current.elementRef = -1;
Eliza Margaretha1c3bf272014-06-11 11:50:39 +0000325 }
Eliza Margarethafb25cef2014-06-06 14:19:07 +0000326 }
327 else {
328 this.current.end = this.current.start;
329 this.current.elementRef = -1;
330 };
331
332 this.current.isPayloadRead = true;
333
334 }
335
336 private short readElementRef(BytesRef payload) {
337 byte[] b = new byte[2];
338 System.arraycopy(payload.bytes, payload.offset + 12, b, 0, 2);
339 ByteBuffer wrapper = ByteBuffer.wrap(b);
340 return wrapper.getShort();
341 }
342
343
344
345 private int readEndPostion(BytesRef payload) {
346
347 this.payloadByte = new byte[4];
348 // Copy end position integer to payloadByte
349 System.arraycopy(payload.bytes, payload.offset + 8, this.payloadByte, 0, 4);
350
351 bb.clear();
352 int t = bb.wrap(payloadByte).getInt();
353
354 if (DEBUG)
355 log.trace("Get Endposition and payload: {}-{} with end position {} in doc {}",
356 this.current.payload.getInt(0),
357 this.current.payload.getInt(4),
358 t,
359 this.current.doc);
360
361 return t;
362 }
363
364 @Override
Nils Diewaldf399a672013-11-18 17:55:22 +0000365 public long cost() {
Nils Diewald20607ab2014-03-20 23:28:36 +0000366 // ???
367 return this.postings.cost();
Nils Diewaldf399a672013-11-18 17:55:22 +0000368 };
369
Nils Diewald20607ab2014-03-20 23:28:36 +0000370
Nils Diewaldf399a672013-11-18 17:55:22 +0000371 @Override
372 public Collection<byte[]> getPayload() throws IOException {
373 byte[] offsetCharacters = new byte[8];
Eliza Margarethafb25cef2014-06-06 14:19:07 +0000374 if (!this.current.isPayloadRead)
375 readPayload();
Nils Diewaldf399a672013-11-18 17:55:22 +0000376
Nils Diewald20607ab2014-03-20 23:28:36 +0000377 System.arraycopy(this.current.payload.array(), 0, offsetCharacters, 0, 8);
Nils Diewaldf399a672013-11-18 17:55:22 +0000378
Nils Diewaldf399a672013-11-18 17:55:22 +0000379 return Collections.singletonList(offsetCharacters);
380 };
381
Nils Diewald20607ab2014-03-20 23:28:36 +0000382
383 /**
384 * Sets KorapTermSpan to current element
385 */
386 private void setToCurrent (KorapTermSpan act, int debugNumber) {
387
388 if (DEBUG)
389 log.trace(
390 "[{}] Set to current with {}",
391 debugNumber,
392 act.toString()
393 );
394
395 this.current = (KorapTermSpan) act.clone();
Nils Diewaldf399a672013-11-18 17:55:22 +0000396 };
397
Nils Diewald20607ab2014-03-20 23:28:36 +0000398 /**
399 * Sets KorapTermSpan to current element
400 */
401 private void setToCurrent (int debugNumber) throws IOException {
402
403 this.current.start = this.postings.nextPosition();
Nils Diewald20607ab2014-03-20 23:28:36 +0000404 // This will directly save stored payloads
Eliza Margarethafb25cef2014-06-06 14:19:07 +0000405 //this.current.end = this.getPayloadEndPosition();
406 readPayload();
Nils Diewald20607ab2014-03-20 23:28:36 +0000407
408 if (DEBUG)
409 log.trace(
410 "[{}] Set new to current with {}",
411 debugNumber,
412 this.current.toString()
413 );
414
415 this.count++;
Nils Diewaldf399a672013-11-18 17:55:22 +0000416 };
417
Nils Diewald20607ab2014-03-20 23:28:36 +0000418 private void setCurrentToTemp () {
419 this.temp = (KorapTermSpan) this.current.clone();
Nils Diewaldaa5c1d32014-03-20 23:46:55 +0000420 // this.temp.copyFrom(this.current);
Nils Diewaldf399a672013-11-18 17:55:22 +0000421 };
422
Nils Diewald20607ab2014-03-20 23:28:36 +0000423
Nils Diewald20607ab2014-03-20 23:28:36 +0000424 @Override
425 public boolean isPayloadAvailable() throws IOException {
426
427 if (current.payload != null)
428 return true;
429
430 return false;
431 };
432
433
434 @Override
435 public String toString() {
436 return "spans(" + this.term.toString() + ")@" +
437 (this.current.doc == -1 ? "START" : (this.current.doc == Integer.MAX_VALUE) ? "END" : this.current.doc + "-" + this.current.start);
438 };
439
440 public DocsAndPositionsEnum getPostings() {
441 return postings;
442 };
443
Nils Diewaldf399a672013-11-18 17:55:22 +0000444 private static final class EmptyElementSpans extends ElementSpans {
445
446 @Override
447 public boolean next() { return false; };
448
449 @Override
450 public boolean skipTo(int target) { return false; };
451
452 @Override
453 public int doc() { return DocIdSetIterator.NO_MORE_DOCS; };
454
455 @Override
456 public int start() { return -1; };
457
458 @Override
459 public int end() { return -1; };
460
461 @Override
462 public Collection<byte[]> getPayload() { return null; };
463
464 @Override
465 public boolean isPayloadAvailable() { return false; };
466
467 @Override
468 public long cost() { return 0; };
469 };
Nils Diewaldf399a672013-11-18 17:55:22 +0000470};