Bugfixing in ElementSpans and KorapMatch
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/ElementSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/ElementSpans.java
index d32c520..9f39290 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/ElementSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/ElementSpans.java
@@ -21,27 +21,27 @@
import java.util.List;
// TODO: Store payloads in 12 byte instead of the complicated ByteBuffer stuff!
+// TODO: Use copyFrom() instead of clone()
/**
* @author Nils Diewald
*
- * TODO: Support lazy loading for .end()
+ * TODO: Use copyFrom() instead of clone()
*/
public class ElementSpans extends Spans {
- private byte[] payloadByte = new byte[4];
+ private byte[] payloadByte;
private ByteBuffer bb = ByteBuffer.allocate(4);
protected final DocsAndPositionsEnum postings;
protected final Term term;
- private int doc, freq, count, position, end;
- protected boolean readPayload;
-
+ private int freq = 0, count = 0;
+
private LinkedList<KorapTermSpan> memory;
- private ByteBuffer storedPayload = ByteBuffer.allocate(128);
- boolean hasStoredPayload = false;
-
- private KorapTermSpan overflow, tempSpan;
+ private KorapTermSpan overflow, current, temp;
+
+ public static final ElementSpans EMPTY_ELEMENT_SPANS
+ = new EmptyElementSpans();
private final static Logger log = LoggerFactory.getLogger(ElementSpans.class);
// This advices the java compiler to ignore all loggings
@@ -54,523 +54,379 @@
public ElementSpans(DocsAndPositionsEnum postings, Term term) {
this.postings = postings;
this.term = term;
- this.doc = -1;
- this.end = -1;
- storedPayload.clear();
- hasStoredPayload = false;
+
// storedPayload = null;
- memory = new LinkedList<KorapTermSpan>();
- overflow = new KorapTermSpan();
- tempSpan = new KorapTermSpan();
+ this.memory = new LinkedList<KorapTermSpan>();
+
+ // Overflow span
+ this.overflow = new KorapTermSpan();
+
+ // Current span
+ this.current = new KorapTermSpan();
+
+ // Temporary span
+ this.temp = new KorapTermSpan();
};
+
// only for EmptyElementSpans (below)
public ElementSpans() {
- term = null;
- postings = null;
+ this.term = null;
+ this.postings = null;
};
@Override
public boolean next() throws IOException {
- end = -1;
-
- if (memory.size() > 0) {
- if (DEBUG)
- log.trace("There is a memory entry");
-
- _setToCurrent(memory.removeFirst());
+
+ // There is a memory
+ if (this.memory.size() > 0) {
+ this.setToCurrent(memory.removeFirst(), 1);
if (DEBUG)
- log.trace("Current1: [{}-{}]", position, end);
-
+ log.trace(" --- MATCH --- Fetch from memory {}",
+ this.current.toString());
+
return true;
};
- if (DEBUG)
- log.trace("There is no memory entry");
+ // Last element in document is reached
+ if (this.count == this.freq) {
- if (count == freq) {
+ if (this.postings == null)
+ return false;
- if (DEBUG)
- log.trace("last position in document");
- // Check for overflow on document boundary
- if (overflow.start != -1) {
+ // There is an overflow
+ if (this.overflow.doc != -1) {
+ if (DEBUG)
+ log.trace("Fetch from overflow");
+
+ this.setToCurrent(this.overflow, 2);
+
+ // Reset overflow
+ this.overflow.reset();
if (DEBUG)
- log.trace(" but there is an overflow");
-
- _setToCurrent(overflow).clear();
-
- if (DEBUG)
- log.trace("Current2: [{}-{}]", position, end);
-
+ log.trace(" --- MATCH --- Fetch from memory {}",
+ this.current.toString());
+
return true;
};
- if (postings == null) {
- if (DEBUG)
- log.trace("no more postings");
+ // There is no next document
+ if (!this.nextDoc())
return false;
- };
-
- if (DEBUG)
- log.trace("Go to next doc");
-
- doc = postings.nextDoc();
-
- if (doc == DocIdSetIterator.NO_MORE_DOCS) {
- if (DEBUG)
- log.trace("no more docs");
- return false;
- };
-
- // New doc!
- end = -1;
- storedPayload.clear();
- hasStoredPayload = false;
-
- freq = postings.freq();
- count = 0;
};
- int pos = overflow.start;
-
- while (true) {
- if (DEBUG) {
- log.trace("pos is {}", pos);
- _log_payloads(1);
+ // overflow is not empty - let's treat this as current
+ if (this.overflow.doc != -1) {
+
+ if (DEBUG)
+ log.trace("Overflow is not empty");
+
+ this.setToCurrent(this.overflow, 3);
+
+ // TODO: newOverflow() ???
+ this.overflow.reset();
+ }
+ else {
+ if (DEBUG)
+ log.trace("Overflow is empty");
+
+ // Get next posting - count is still < freq
+ this.setToCurrent(4);
+
+ if (this.count == this.freq) {
+ if (DEBUG)
+ log.trace(" --- MATCH --- Direct {}",
+ this.current.toString());
+ return true;
};
+ };
- if (count == freq) {
- if (DEBUG)
- log.trace("last position in document");
+ while (this.count < this.freq) {
- if (postings == null) {
+ // Temp is now the old current
+ this.setCurrentToTemp();
+ // Get new current
+ this.setToCurrent(5);
+
+ if (DEBUG)
+ log.trace("Compare {} with {}",
+ this.current.toString(),
+ this.temp.toString());
+
+ // The next span is not at the same position
+ if (this.current.start != this.temp.start) {
+
+ // Add this to memory
+ if (this.memory.size() > 0) {
if (DEBUG)
- log.trace("no more postings");
-
- // Check for overflow on document boundary
- if (overflow.start != -1) {
- if (DEBUG)
- log.trace(" but there is an overflow");
-
- _setToCurrent(overflow).clear();
- if (DEBUG)
- log.trace("Current3: [{}-{}]", position, end);
-
- return true;
- };
-
- return false;
- };
-
- if (DEBUG) {
- log.trace("go to next doc");
- _log_payloads(2);
- };
-
- if (overflow.start != -1) {
- if (DEBUG) {
- log.trace("Storing overflow {} ...", overflow.toString());
- log.trace("... in memory with {}-{}", overflow.startChar(), overflow.endChar());
- };
- memory.add((KorapTermSpan) overflow.clone());
- overflow.clear();
- };
- if (DEBUG)
- _log_payloads(3);
-
- if (memory.size() > 0) {
- if (DEBUG) {
- log.trace("sort and return first");
- _log_payloads(4);
- };
-
- Collections.sort(memory);
-
- if (DEBUG)
- _log_payloads(5);
-
- _setToCurrent(memory.removeFirst());
-
- if (DEBUG)
- _log_payloads(6);
-
- if (DEBUG)
- log.trace("Current4: [{}-{}]]", position, end);
+ log.trace("[1] Add to memory {}", this.temp.toString());
+ this.memory.add((KorapTermSpan) this.temp.clone());
+ this.overflow = this.current;
break;
};
- doc = postings.nextDoc();
- // New doc
- end = pos = -1;
-
- if (doc == DocIdSetIterator.NO_MORE_DOCS) {
- if (DEBUG)
- log.trace("no more docs");
- return false;
- };
-
- freq = postings.freq();
- count = 0;
- };
-
-
- if (DEBUG)
- log.trace("Forward postings");
-
- position = postings.nextPosition();
- // New pos!
- end = -1;
-
- if (DEBUG) {
- _log_payloads(9);
- log.trace("CLEAR PAYLOAD");
- };
-
- storedPayload.clear();
- hasStoredPayload = false;
-
- if (DEBUG) {
- _log_payloads(10);
- log.trace("next position is {}", position);
- };
-
- count++;
-
- // There was no overflow
- if (pos == -1 || pos == position) {
- if (pos == position) {
- if (DEBUG)
- log.trace("Add overflow to memory");
-
- memory.add((KorapTermSpan) overflow.clone());
- }
-
- else {
- if (DEBUG)
- log.trace("There was no overflow");
- pos = position;
- };
-
- if (DEBUG) {
- _log_payloads(8);
- log.trace("*****************************");
- };
-
- _setCurrentTo(overflow);
-
- if (DEBUG) {
- log.trace("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
- log.trace("Set overflow and continue: {} ...", overflow.toString());
- log.trace("... with {}-{}", overflow.startChar(), overflow.endChar());
- };
-
- continue;
- }
-
- // overflow was older
- else if (pos != position) {
+ // There is no reason to start a memory
+ this.overflow = this.current;
+ this.current = this.temp;
if (DEBUG)
- log.trace("Overflow was older");
+ log.trace(" --- MATCH --- Fetch from memory {}",
+ this.current.toString());
- // Use memory
- if (memory.size() > 0) {
+ return true;
+ }
- if (DEBUG)
- log.trace("Add overflow to memory");
-
- memory.add((KorapTermSpan) overflow.clone());
-
- if (DEBUG)
- log.trace("Sort memory");
-
- // Sort by end position
- Collections.sort(memory);
-
- // Store current information in overflow
- _setCurrentTo(overflow);
-
- if (DEBUG) {
- log.trace("Set new overflow: {}", overflow.toString());
- log.trace("Get first element from sorted memory");
- };
-
- _setToCurrent(memory.removeFirst());
- }
-
- // Nothing in memory - use overflow!
- else {
-
- if (DEBUG)
- log.trace("There is nothing in memory");
-
- /* Make overflow active and store last position in overflow */
- _setCurrentTo(tempSpan);
-
- if (DEBUG)
- log.trace("Temp is now {}", overflow.toString());
-
- _setToCurrent(overflow);
-
- // Store current information in overflow
- overflow.copyFrom(tempSpan);
-
- if (DEBUG)
- log.trace("Overflow is now {}", overflow.toString());
-
- };
- break;
+ // The positions are equal
+ else {
+ if (DEBUG)
+ log.trace("[2] Add to memory {}", this.temp.toString());
+ this.memory.add((KorapTermSpan) this.temp.clone());
};
};
- if (DEBUG)
- log.trace("Current4: [{}-{}]", position, end);
-
- readPayload = false;
- return true;
- };
-
- private KorapTermSpan _setToCurrent (KorapTermSpan act) {
- if (act.payload != null)
- act.payload.rewind();
-
- if (DEBUG)
- log.trace("Set to current with {}, meaning {} - {}",
- act.toString(),
- act.payload.getInt(0),
- act.payload.getInt(4)
- );
-
- if (act.payload != null)
- act.payload.rewind();
-
- position = act.start;
- end = act.end;
- storedPayload.clear();
- hasStoredPayload = false;
-
- if (act.payload != null) {
+ if (this.temp.doc == this.current.doc &&
+ this.temp.start == this.current.start) {
if (DEBUG)
- log.trace("Payload is not null");
-
- act.payload.rewind();
- storedPayload.put(act.payload);
- hasStoredPayload = true;
- }
- else if (DEBUG)
- log.trace("Payload is null");
-
- return act;
- };
-
- private void _log_payloads (int nr) {
- if (!DEBUG)
- return;
-
- if (hasStoredPayload)
- log.trace(
- "[{}] payload offsets are {}-{}",
- nr,
- storedPayload.getInt(0),
- storedPayload.getInt(4)
- );
- else
- log.trace("[{}] payload is empty", nr);
- };
-
- private void _setCurrentTo () {
- overflow.start = position;
- overflow.end = this.end();
- overflow.payload.clear();
-
- if (hasStoredPayload)
- overflow.payload.put(storedPayload);
-
- if (DEBUG)
- log.trace("Set current to Overflow {} with {}-{}", overflow.toString(), overflow.startChar(), overflow.endChar());
- };
-
- private void _setCurrentTo (KorapTermSpan o) {
-
- if (DEBUG)
- _log_payloads(7);
-
- o.start = position;
- o.end = this.end();
- o.payload.clear();
-
- if (hasStoredPayload) {
- storedPayload.rewind();
- o.payload.put(storedPayload);
-
- if (DEBUG)
- log.trace("Object now has offset {}-{}", o.payload.getInt(0), o.payload.getInt(4));
-
- // Import:
- o.payload.rewind();
+ log.trace("[3] Add to memory {}", this.current.toString());
+ this.memory.add((KorapTermSpan) this.current.clone());
};
- if (DEBUG)
- log.trace("Set current to object {} ...", o.toString());
-
- if (hasStoredPayload) {
- if (DEBUG)
- log.trace("with {}-{} from {}-{}", o.startChar(), o.endChar(), storedPayload.getInt(0), storedPayload.getInt(4));
+ // Sort the memory
+ Collections.sort(memory);
- storedPayload.rewind();
- };
+ // There is now a memory
+ return this.next();
};
+
+ // get next doc
+ private boolean nextDoc () throws IOException {
- @Override
- public boolean skipTo(int target) throws IOException {
- assert target > doc;
- doc = postings.advance(target);
-
- end = -1;
- overflow.clear();
- storedPayload.clear();
- hasStoredPayload = false;
-
- if (memory != null)
- memory.clear();
-
- if (doc == DocIdSetIterator.NO_MORE_DOCS)
+ // Check if this doc is the last
+ if (this.current.doc == DocIdSetIterator.NO_MORE_DOCS)
return false;
- freq = postings.freq();
- count = 0;
- position = postings.nextPosition();
- count++;
- readPayload = false;
+ if (DEBUG)
+ log.trace("Go to next document");
+
+ this.current.reset();
+
+ // Advance to next doc
+ this.current.doc = this.postings.nextDoc();
+
+ // Check if this doc is the last
+ if (this.current.doc == DocIdSetIterator.NO_MORE_DOCS)
+ return false;
+
+ // check frequencies
+ this.freq = this.postings.freq();
+
+ if (DEBUG)
+ log.trace("Document <{}> has {} occurrences",
+ this.current.doc,
+ this.freq);
+
+
+ this.count = 0;
return true;
};
+
+ @Override
+ public boolean skipTo(int target) throws IOException {
+
+ assert target > this.current.doc;
+
+ // Get this doc
+ this.current.doc = postings.advance(target);
+
+ if (this.current.doc == DocIdSetIterator.NO_MORE_DOCS)
+ return false;
+
+ if (this.memory != null)
+ this.memory.clear();
+
+ this.overflow.reset();
+
+
+ this.freq = this.postings.freq();
+
+ if (DEBUG)
+ log.trace("Document {} has {} occurrences", this.current.doc, this.freq);
+
+
+ this.count = 0;
+
+ if (this.next())
+ return true;
+
+ return false;
+ };
+
+
@Override
public int doc() {
- return doc;
+ return this.current.doc;
};
+
@Override
public int start() {
- return position;
+ return this.current.start;
};
+
@Override
public int end() {
- if (end >= 0)
- return end;
+ if (this.current.end >= 0)
+ return this.current.end;
try {
- end = this.getPayloadEndPosition();
+ this.current.end = this.getPayloadEndPosition();
}
catch (Exception e) {
- end = position;
+ this.current.end = this.current.start;
};
- return end;
+ return this.current.end;
};
+
@Override
public long cost() {
- return postings.cost();
+ // ???
+ return this.postings.cost();
};
+
@Override
public Collection<byte[]> getPayload() throws IOException {
byte[] offsetCharacters = new byte[8];
-
- if (storedPayload.position() <= 0)
+ if (this.current.end <= 0)
this.getPayloadEndPosition();
- if (DEBUG) {
- if (hasStoredPayload)
- log.trace("storedPayload: {} - {}",
- storedPayload.getInt(0),
- storedPayload.getInt(4));
- else
- log.trace("storedPayload is empty");
- };
-
- System.arraycopy(storedPayload.array(), 0, offsetCharacters, 0, 8);
+ System.arraycopy(this.current.payload.array(), 0, offsetCharacters, 0, 8);
return Collections.singletonList(offsetCharacters);
};
- @Override
- public boolean isPayloadAvailable() throws IOException {
- return readPayload == false && postings.getPayload() != null;
+
+ /**
+ * Sets the current span to a copy (clone) of the given KorapTermSpan
+ */
+ private void setToCurrent (KorapTermSpan act, int debugNumber) {
+
+ if (DEBUG)
+ log.trace(
+ "[{}] Set to current with {}",
+ debugNumber,
+ act.toString()
+ );
+
+ this.current = (KorapTermSpan) act.clone();
};
- @Override
- public String toString() {
- return "spans(" + term.toString() + ")@" +
- (doc == -1 ? "START" : (doc == Integer.MAX_VALUE) ? "END" : doc + "-" + position);
+ /**
+ * Reads the next position and its payload end position from the postings into the current span and increments count
+ */
+ private void setToCurrent (int debugNumber) throws IOException {
+
+ this.current.start = this.postings.nextPosition();
+
+ // This will directly save stored payloads
+ this.current.end = this.getPayloadEndPosition();
+
+ if (DEBUG)
+ log.trace(
+ "[{}] Set new to current with {}",
+ debugNumber,
+ this.current.toString()
+ );
+
+ this.count++;
};
- public DocsAndPositionsEnum getPostings() {
- return postings;
+ private void setCurrentToTemp () {
+ this.temp = (KorapTermSpan) this.current.clone();
};
+
private int getPayloadEndPosition () {
- if (DEBUG)
- log.trace("getPayloadEndPosition of element ...");
-
try {
BytesRef payload = postings.getPayload();
- if (DEBUG)
- log.trace(" BytesRef: {}", payload.toString());
+ this.current.clearPayload();
- readPayload = true;
- storedPayload.clear();
- hasStoredPayload = false;
-
if (payload != null) {
- if (DEBUG)
- log.trace("Do bit magic");
-
- storedPayload.put(payload.bytes, payload.offset, 8);
- storedPayload.put(payload.bytes, payload.offset + 12, payload.length - 12);
- System.arraycopy(payload.bytes, payload.offset + 8, payloadByte, 0, 4);
- hasStoredPayload = true;
- if (DEBUG)
- log.trace("~~ Bytes: {}-{}-{}",
- storedPayload.getInt(0),
- storedPayload.getInt(4),
- payloadByte);
+ this.payloadByte = new byte[4];
+
+ // Copy some payloads like start character and end character
+ this.current.payload.put(payload.bytes, payload.offset, 8);
+ this.current.payload.put(payload.bytes, payload.offset + 12, payload.length - 12);
+
+ // Copy end position integer to payloadByte
+ System.arraycopy(payload.bytes, payload.offset + 8, this.payloadByte, 0, 4);
}
- else {
- if (DEBUG)
- log.trace("There's no payload available");
-
- payloadByte = null;
+ else {
+ this.payloadByte = null;
};
- if (payloadByte != null) {
+ // Todo: REWRITE!
+ if (this.payloadByte != null) {
+
+ // TODO: ByteBuffer.wrap() is static - it returns a new buffer, so bb itself is never read here
+
bb.clear();
int t = bb.wrap(payloadByte).getInt();
+
if (DEBUG)
- log.trace(" |-> {}", t);
+ log.trace("Get Endposition and payload: {}-{} with end position {} in doc {}",
+ this.current.payload.getInt(0),
+ this.current.payload.getInt(4),
+ t,
+ this.current.doc);
return t;
+ }
+ else if (DEBUG) {
+ log.trace("Get Endposition and payload: None found");
};
-
}
catch (IOException e) {
if (DEBUG)
log.trace("IOException {}", e);
};
+
return -1;
};
+ @Override
+ public boolean isPayloadAvailable() throws IOException {
+
+ if (current.payload != null)
+ return true;
+
+ return false;
+ };
+
+
+ @Override
+ public String toString() {
+ return "spans(" + this.term.toString() + ")@" +
+ (this.current.doc == -1 ? "START" : (this.current.doc == Integer.MAX_VALUE) ? "END" : this.current.doc + "-" + this.current.start);
+ };
+
+ public DocsAndPositionsEnum getPostings() {
+ return postings;
+ };
+
private static final class EmptyElementSpans extends ElementSpans {
@Override
@@ -597,6 +453,4 @@
@Override
public long cost() { return 0; };
};
-
- public static final ElementSpans EMPTY_ELEMENT_SPANS = new EmptyElementSpans();
};