Fix bugs in ElementSpans and KorapMatch
diff --git a/src/main/java/de/ids_mannheim/korap/KorapMatch.java b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
index 3467b53..dc28449 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapMatch.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
@@ -42,7 +42,7 @@
private final static Logger log = LoggerFactory.getLogger(KorapMatch.class);
// This advices the java compiler to ignore all loggings
- public static final boolean DEBUG = true;
+ public static final boolean DEBUG = false;
// Mapper for JSON serialization
ObjectMapper mapper = new ObjectMapper();
@@ -841,13 +841,23 @@
HighlightCombinatorElement lastComb;
this.tempStack.clear();
- StringBuilder sb = new StringBuilder("Stack for checking with ");
- sb.append(number).append(" is ");
- for (int s : this.balanceStack) {
- sb.append('[').append(s).append(']');
+ // Shouldn't happen
+ if (this.balanceStack.size() == 0) {
+ if (DEBUG)
+ log.trace("The balance stack is empty");
+ return;
};
- if (DEBUG)
+
+ if (DEBUG) {
+ StringBuilder sb = new StringBuilder(
+ "Stack for checking with class "
+ );
+ sb.append(number).append(" is ");
+ for (int s : this.balanceStack) {
+ sb.append('[').append(s).append(']');
+ };
log.trace(sb.toString());
+ };
// class number of the last element
int eold = this.balanceStack.removeLast();
@@ -1091,6 +1101,11 @@
if (openList.isEmpty()) {
stack.addAll(closeList);
break;
+ }
+
+ // closeList can run empty while openList is not - TODO: verify that stopping here is correct
+ else if (closeList.isEmpty()) {
+ break;
};
if (openList.peekFirst()[0] < closeList.peekFirst()[1]) {
@@ -1129,6 +1144,9 @@
// Match position
startPosChar = this.positionsToOffset.start(ldid, this.startPos);
+ if (DEBUG)
+ log.trace("Unaltered startPosChar is {}", startPosChar);
+
// Check potential differing start characters
// e.g. from element spans
if (potentialStartPosChar != -1 &&
@@ -1136,27 +1154,22 @@
startPosChar = potentialStartPosChar;
endPosChar = this.positionsToOffset.end(ldid, this.endPos - 1);
-
+
if (DEBUG)
- log.trace("Match offset is pos {}-{} (chars {}-{})",
+ log.trace("Unaltered endPosChar is {}", endPosChar);
+
+ // Potential end characters may come from spans with
+ // defined character offsets like sentences including .", ... etc.
+ if (endPosChar < potentialEndPosChar)
+ endPosChar = potentialEndPosChar;
+
+ if (DEBUG)
+ log.trace("Refined: Match offset is pos {}-{} (chars {}-{})",
this.startPos,
this.endPos,
startPosChar,
endPosChar);
- // Potential end characters may come from spans with
- // defined character offsets like sentences including .", ... etc.
- if (endPosChar < potentialEndPosChar) {
- endPosChar = potentialEndPosChar;
-
- if (DEBUG)
- log.trace("Refined: Match offset is pos {}-{} (chars {}-{})",
- this.startPos,
- this.endPos,
- startPosChar,
- endPosChar);
- };
-
// left context
if (leftTokenContext) {
if (DEBUG)
@@ -1222,6 +1235,7 @@
}
else {
this.tempSnippet = this.getPrimaryData(startOffsetChar);
+ // endPosChar = this.tempSnippet.length() - 1 + startOffsetChar;
endMore = false;
};
@@ -1241,12 +1255,23 @@
-1,
0};
+ if (DEBUG)
+ log.trace("The match entry is {}-{} ({}-{}) with startOffsetChar {}",
+ startPosChar - startOffsetChar,
+ endPosChar - startOffsetChar,
+ startPosChar,
+ endPosChar,
+ startOffsetChar);
+
// Add match span
this.span.add(intArray);
// highlights
// -- I'm not sure about this.
if (this.highlight != null) {
+ if (DEBUG)
+ log.trace("There are highlights!");
+
for (Highlight highlight : this.highlight) {
int start = this.positionsToOffset.start(
ldid, highlight.start
diff --git a/src/main/java/de/ids_mannheim/korap/index/PositionsToOffset.java b/src/main/java/de/ids_mannheim/korap/index/PositionsToOffset.java
index cab5410..eae79c1 100644
--- a/src/main/java/de/ids_mannheim/korap/index/PositionsToOffset.java
+++ b/src/main/java/de/ids_mannheim/korap/index/PositionsToOffset.java
@@ -76,6 +76,9 @@
public void add (PositionsToOffsetArray ptoa) {
if (DEBUG)
log.trace("Add positionsToOffsetArray {}/{}", ptoa.docID, ptoa.pos);
+ if (ptoa.pos < 0)
+ return;
+
if (this.processed && this.exists(ptoa))
return;
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/ElementSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/ElementSpans.java
index d32c520..9f39290 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/ElementSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/ElementSpans.java
@@ -21,27 +21,27 @@
import java.util.List;
// TODO: Store payloads in 12 byte instead of the complicated ByteBuffer stuff!
+// TODO: Use copyFrom() instead of clone()
/**
* @author Nils Diewald
*
- * TODO: Support lazy loading for .end()
+ * TODO: Use copyFrom() instead of clone()
*/
public class ElementSpans extends Spans {
- private byte[] payloadByte = new byte[4];
+ private byte[] payloadByte;
private ByteBuffer bb = ByteBuffer.allocate(4);
protected final DocsAndPositionsEnum postings;
protected final Term term;
- private int doc, freq, count, position, end;
- protected boolean readPayload;
-
+ private int freq = 0, count = 0;
+
private LinkedList<KorapTermSpan> memory;
- private ByteBuffer storedPayload = ByteBuffer.allocate(128);
- boolean hasStoredPayload = false;
-
- private KorapTermSpan overflow, tempSpan;
+ private KorapTermSpan overflow, current, temp;
+
+ public static final ElementSpans EMPTY_ELEMENT_SPANS
+ = new EmptyElementSpans();
private final static Logger log = LoggerFactory.getLogger(ElementSpans.class);
// This advices the java compiler to ignore all loggings
@@ -54,523 +54,379 @@
public ElementSpans(DocsAndPositionsEnum postings, Term term) {
this.postings = postings;
this.term = term;
- this.doc = -1;
- this.end = -1;
- storedPayload.clear();
- hasStoredPayload = false;
+
// storedPayload = null;
- memory = new LinkedList<KorapTermSpan>();
- overflow = new KorapTermSpan();
- tempSpan = new KorapTermSpan();
+ this.memory = new LinkedList<KorapTermSpan>();
+
+ // Overflow span
+ this.overflow = new KorapTermSpan();
+
+ // Current span
+ this.current = new KorapTermSpan();
+
+ // Temporary span
+ this.temp = new KorapTermSpan();
};
+
// only for EmptyElementSpans (below)
public ElementSpans() {
- term = null;
- postings = null;
+ this.term = null;
+ this.postings = null;
};
@Override
public boolean next() throws IOException {
- end = -1;
-
- if (memory.size() > 0) {
- if (DEBUG)
- log.trace("There is a memory entry");
-
- _setToCurrent(memory.removeFirst());
+
+ // There is a memory
+ if (this.memory.size() > 0) {
+ this.setToCurrent(memory.removeFirst(), 1);
if (DEBUG)
- log.trace("Current1: [{}-{}]", position, end);
-
+ log.trace(" --- MATCH --- Fetch from memory {}",
+ this.current.toString());
+
return true;
};
- if (DEBUG)
- log.trace("There is no memory entry");
+ // Last element in document is reached
+ if (this.count == this.freq) {
- if (count == freq) {
+ if (this.postings == null)
+ return false;
- if (DEBUG)
- log.trace("last position in document");
- // Check for overflow on document boundary
- if (overflow.start != -1) {
+ // There is an overflow
+ if (this.overflow.doc != -1) {
+ if (DEBUG)
+ log.trace("Fetch from overflow");
+
+ this.setToCurrent(this.overflow, 2);
+
+ // Reset overflow
+ this.overflow.reset();
if (DEBUG)
- log.trace(" but there is an overflow");
-
- _setToCurrent(overflow).clear();
-
- if (DEBUG)
- log.trace("Current2: [{}-{}]", position, end);
-
+ log.trace(" --- MATCH --- Fetch from memory {}",
+ this.current.toString());
+
return true;
};
- if (postings == null) {
- if (DEBUG)
- log.trace("no more postings");
+ // There is no next document
+ if (!this.nextDoc())
return false;
- };
-
- if (DEBUG)
- log.trace("Go to next doc");
-
- doc = postings.nextDoc();
-
- if (doc == DocIdSetIterator.NO_MORE_DOCS) {
- if (DEBUG)
- log.trace("no more docs");
- return false;
- };
-
- // New doc!
- end = -1;
- storedPayload.clear();
- hasStoredPayload = false;
-
- freq = postings.freq();
- count = 0;
};
- int pos = overflow.start;
-
- while (true) {
- if (DEBUG) {
- log.trace("pos is {}", pos);
- _log_payloads(1);
+ // overflow is not empty - let's treat this as current
+ if (this.overflow.doc != -1) {
+
+ if (DEBUG)
+ log.trace("Overflow is not empty");
+
+ this.setToCurrent(this.overflow, 3);
+
+ // TODO: newOverflow() ???
+ this.overflow.reset();
+ }
+ else {
+ if (DEBUG)
+ log.trace("Overflow is empty");
+
+ // Get next posting - count is still < freq
+ this.setToCurrent(4);
+
+ if (this.count == this.freq) {
+ if (DEBUG)
+ log.trace(" --- MATCH --- Direct {}",
+ this.current.toString());
+ return true;
};
+ };
- if (count == freq) {
- if (DEBUG)
- log.trace("last position in document");
+ while (this.count < this.freq) {
- if (postings == null) {
+ // Temp is now the old current
+ this.setCurrentToTemp();
+ // Get new current
+ this.setToCurrent(5);
+
+ if (DEBUG)
+ log.trace("Compare {} with {}",
+ this.current.toString(),
+ this.temp.toString());
+
+ // The next span is not at the same position
+ if (this.current.start != this.temp.start) {
+
+ // Add this to memory
+ if (this.memory.size() > 0) {
if (DEBUG)
- log.trace("no more postings");
-
- // Check for overflow on document boundary
- if (overflow.start != -1) {
- if (DEBUG)
- log.trace(" but there is an overflow");
-
- _setToCurrent(overflow).clear();
- if (DEBUG)
- log.trace("Current3: [{}-{}]", position, end);
-
- return true;
- };
-
- return false;
- };
-
- if (DEBUG) {
- log.trace("go to next doc");
- _log_payloads(2);
- };
-
- if (overflow.start != -1) {
- if (DEBUG) {
- log.trace("Storing overflow {} ...", overflow.toString());
- log.trace("... in memory with {}-{}", overflow.startChar(), overflow.endChar());
- };
- memory.add((KorapTermSpan) overflow.clone());
- overflow.clear();
- };
- if (DEBUG)
- _log_payloads(3);
-
- if (memory.size() > 0) {
- if (DEBUG) {
- log.trace("sort and return first");
- _log_payloads(4);
- };
-
- Collections.sort(memory);
-
- if (DEBUG)
- _log_payloads(5);
-
- _setToCurrent(memory.removeFirst());
-
- if (DEBUG)
- _log_payloads(6);
-
- if (DEBUG)
- log.trace("Current4: [{}-{}]]", position, end);
+ log.trace("[1] Add to memory {}", this.temp.toString());
+ this.memory.add((KorapTermSpan) this.temp.clone());
+ this.overflow = this.current;
break;
};
- doc = postings.nextDoc();
- // New doc
- end = pos = -1;
-
- if (doc == DocIdSetIterator.NO_MORE_DOCS) {
- if (DEBUG)
- log.trace("no more docs");
- return false;
- };
-
- freq = postings.freq();
- count = 0;
- };
-
-
- if (DEBUG)
- log.trace("Forward postings");
-
- position = postings.nextPosition();
- // New pos!
- end = -1;
-
- if (DEBUG) {
- _log_payloads(9);
- log.trace("CLEAR PAYLOAD");
- };
-
- storedPayload.clear();
- hasStoredPayload = false;
-
- if (DEBUG) {
- _log_payloads(10);
- log.trace("next position is {}", position);
- };
-
- count++;
-
- // There was no overflow
- if (pos == -1 || pos == position) {
- if (pos == position) {
- if (DEBUG)
- log.trace("Add overflow to memory");
-
- memory.add((KorapTermSpan) overflow.clone());
- }
-
- else {
- if (DEBUG)
- log.trace("There was no overflow");
- pos = position;
- };
-
- if (DEBUG) {
- _log_payloads(8);
- log.trace("*****************************");
- };
-
- _setCurrentTo(overflow);
-
- if (DEBUG) {
- log.trace("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
- log.trace("Set overflow and continue: {} ...", overflow.toString());
- log.trace("... with {}-{}", overflow.startChar(), overflow.endChar());
- };
-
- continue;
- }
-
- // overflow was older
- else if (pos != position) {
+ // There is no reason to start a memory
+ this.overflow = this.current;
+ this.current = this.temp;
if (DEBUG)
- log.trace("Overflow was older");
+ log.trace(" --- MATCH --- Fetch from memory {}",
+ this.current.toString());
- // Use memory
- if (memory.size() > 0) {
+ return true;
+ }
- if (DEBUG)
- log.trace("Add overflow to memory");
-
- memory.add((KorapTermSpan) overflow.clone());
-
- if (DEBUG)
- log.trace("Sort memory");
-
- // Sort by end position
- Collections.sort(memory);
-
- // Store current information in overflow
- _setCurrentTo(overflow);
-
- if (DEBUG) {
- log.trace("Set new overflow: {}", overflow.toString());
- log.trace("Get first element from sorted memory");
- };
-
- _setToCurrent(memory.removeFirst());
- }
-
- // Nothing in memory - use overflow!
- else {
-
- if (DEBUG)
- log.trace("There is nothing in memory");
-
- /* Make overflow active and store last position in overflow */
- _setCurrentTo(tempSpan);
-
- if (DEBUG)
- log.trace("Temp is now {}", overflow.toString());
-
- _setToCurrent(overflow);
-
- // Store current information in overflow
- overflow.copyFrom(tempSpan);
-
- if (DEBUG)
- log.trace("Overflow is now {}", overflow.toString());
-
- };
- break;
+ // The positions are equal
+ else {
+ if (DEBUG)
+ log.trace("[2] Add to memory {}", this.temp.toString());
+ this.memory.add((KorapTermSpan) this.temp.clone());
};
};
- if (DEBUG)
- log.trace("Current4: [{}-{}]", position, end);
-
- readPayload = false;
- return true;
- };
-
- private KorapTermSpan _setToCurrent (KorapTermSpan act) {
- if (act.payload != null)
- act.payload.rewind();
-
- if (DEBUG)
- log.trace("Set to current with {}, meaning {} - {}",
- act.toString(),
- act.payload.getInt(0),
- act.payload.getInt(4)
- );
-
- if (act.payload != null)
- act.payload.rewind();
-
- position = act.start;
- end = act.end;
- storedPayload.clear();
- hasStoredPayload = false;
-
- if (act.payload != null) {
+ if (this.temp.doc == this.current.doc &&
+ this.temp.start == this.current.start) {
if (DEBUG)
- log.trace("Payload is not null");
-
- act.payload.rewind();
- storedPayload.put(act.payload);
- hasStoredPayload = true;
- }
- else if (DEBUG)
- log.trace("Payload is null");
-
- return act;
- };
-
- private void _log_payloads (int nr) {
- if (!DEBUG)
- return;
-
- if (hasStoredPayload)
- log.trace(
- "[{}] payload offsets are {}-{}",
- nr,
- storedPayload.getInt(0),
- storedPayload.getInt(4)
- );
- else
- log.trace("[{}] payload is empty", nr);
- };
-
- private void _setCurrentTo () {
- overflow.start = position;
- overflow.end = this.end();
- overflow.payload.clear();
-
- if (hasStoredPayload)
- overflow.payload.put(storedPayload);
-
- if (DEBUG)
- log.trace("Set current to Overflow {} with {}-{}", overflow.toString(), overflow.startChar(), overflow.endChar());
- };
-
- private void _setCurrentTo (KorapTermSpan o) {
-
- if (DEBUG)
- _log_payloads(7);
-
- o.start = position;
- o.end = this.end();
- o.payload.clear();
-
- if (hasStoredPayload) {
- storedPayload.rewind();
- o.payload.put(storedPayload);
-
- if (DEBUG)
- log.trace("Object now has offset {}-{}", o.payload.getInt(0), o.payload.getInt(4));
-
- // Import:
- o.payload.rewind();
+ log.trace("[3] Add to memory {}", this.current.toString());
+ this.memory.add((KorapTermSpan) this.current.clone());
};
- if (DEBUG)
- log.trace("Set current to object {} ...", o.toString());
-
- if (hasStoredPayload) {
- if (DEBUG)
- log.trace("with {}-{} from {}-{}", o.startChar(), o.endChar(), storedPayload.getInt(0), storedPayload.getInt(4));
+ // Sort the memory
+ Collections.sort(memory);
- storedPayload.rewind();
- };
+ // There is now a memory
+ return this.next();
};
+
+ // get next doc
+ private boolean nextDoc () throws IOException {
- @Override
- public boolean skipTo(int target) throws IOException {
- assert target > doc;
- doc = postings.advance(target);
-
- end = -1;
- overflow.clear();
- storedPayload.clear();
- hasStoredPayload = false;
-
- if (memory != null)
- memory.clear();
-
- if (doc == DocIdSetIterator.NO_MORE_DOCS)
+ // Check if this doc is the last
+ if (this.current.doc == DocIdSetIterator.NO_MORE_DOCS)
return false;
- freq = postings.freq();
- count = 0;
- position = postings.nextPosition();
- count++;
- readPayload = false;
+ if (DEBUG)
+ log.trace("Go to next document");
+
+ this.current.reset();
+
+ // Advance to next doc
+ this.current.doc = this.postings.nextDoc();
+
+ // Check if this doc is the last
+ if (this.current.doc == DocIdSetIterator.NO_MORE_DOCS)
+ return false;
+
+ // check frequencies
+ this.freq = this.postings.freq();
+
+ if (DEBUG)
+ log.trace("Document <{}> has {} occurrences",
+ this.current.doc,
+ this.freq);
+
+
+ this.count = 0;
return true;
};
+
+ @Override
+ public boolean skipTo(int target) throws IOException {
+
+ assert target > this.current.doc;
+
+ // Get this doc
+ this.current.doc = postings.advance(target);
+
+ if (this.current.doc == DocIdSetIterator.NO_MORE_DOCS)
+ return false;
+
+ if (this.memory != null)
+ this.memory.clear();
+
+ this.overflow.reset();
+
+
+ this.freq = this.postings.freq();
+
+ if (DEBUG)
+ log.trace("Document {} has {} occurrences", this.current.doc, this.freq);
+
+
+ this.count = 0;
+
+ if (this.next())
+ return true;
+
+ return false;
+ };
+
+
@Override
public int doc() {
- return doc;
+ return this.current.doc;
};
+
@Override
public int start() {
- return position;
+ return this.current.start;
};
+
@Override
public int end() {
- if (end >= 0)
- return end;
+ if (this.current.end >= 0)
+ return this.current.end;
try {
- end = this.getPayloadEndPosition();
+ this.current.end = this.getPayloadEndPosition();
}
catch (Exception e) {
- end = position;
+ this.current.end = this.current.start;
};
- return end;
+ return this.current.end;
};
+
@Override
public long cost() {
- return postings.cost();
+ // TODO: verify that delegating to postings.cost() is appropriate here
+ return this.postings.cost();
};
+
@Override
public Collection<byte[]> getPayload() throws IOException {
byte[] offsetCharacters = new byte[8];
-
- if (storedPayload.position() <= 0)
+ if (this.current.end <= 0)
this.getPayloadEndPosition();
- if (DEBUG) {
- if (hasStoredPayload)
- log.trace("storedPayload: {} - {}",
- storedPayload.getInt(0),
- storedPayload.getInt(4));
- else
- log.trace("storedPayload is empty");
- };
-
- System.arraycopy(storedPayload.array(), 0, offsetCharacters, 0, 8);
+ System.arraycopy(this.current.payload.array(), 0, offsetCharacters, 0, 8);
return Collections.singletonList(offsetCharacters);
};
- @Override
- public boolean isPayloadAvailable() throws IOException {
- return readPayload == false && postings.getPayload() != null;
+
+ /**
+ * Makes a clone of the given KorapTermSpan the current span
+ */
+ private void setToCurrent (KorapTermSpan act, int debugNumber) {
+
+ if (DEBUG)
+ log.trace(
+ "[{}] Set to current with {}",
+ debugNumber,
+ act.toString()
+ );
+
+ this.current = (KorapTermSpan) act.clone();
};
- @Override
- public String toString() {
- return "spans(" + term.toString() + ")@" +
- (doc == -1 ? "START" : (doc == Integer.MAX_VALUE) ? "END" : doc + "-" + position);
+ /**
+ * Reads the next posting position and its payload into the current span
+ */
+ private void setToCurrent (int debugNumber) throws IOException {
+
+ this.current.start = this.postings.nextPosition();
+
+ // This will directly save stored payloads
+ this.current.end = this.getPayloadEndPosition();
+
+ if (DEBUG)
+ log.trace(
+ "[{}] Set new to current with {}",
+ debugNumber,
+ this.current.toString()
+ );
+
+ this.count++;
};
- public DocsAndPositionsEnum getPostings() {
- return postings;
+ private void setCurrentToTemp () {
+ this.temp = (KorapTermSpan) this.current.clone();
};
+
private int getPayloadEndPosition () {
- if (DEBUG)
- log.trace("getPayloadEndPosition of element ...");
-
try {
BytesRef payload = postings.getPayload();
- if (DEBUG)
- log.trace(" BytesRef: {}", payload.toString());
+ this.current.clearPayload();
- readPayload = true;
- storedPayload.clear();
- hasStoredPayload = false;
-
if (payload != null) {
- if (DEBUG)
- log.trace("Do bit magic");
-
- storedPayload.put(payload.bytes, payload.offset, 8);
- storedPayload.put(payload.bytes, payload.offset + 12, payload.length - 12);
- System.arraycopy(payload.bytes, payload.offset + 8, payloadByte, 0, 4);
- hasStoredPayload = true;
- if (DEBUG)
- log.trace("~~ Bytes: {}-{}-{}",
- storedPayload.getInt(0),
- storedPayload.getInt(4),
- payloadByte);
+ this.payloadByte = new byte[4];
+
+ // Copy some payloads like start character and end character
+ this.current.payload.put(payload.bytes, payload.offset, 8);
+ this.current.payload.put(payload.bytes, payload.offset + 12, payload.length - 12);
+
+ // Copy end position integer to payloadByte
+ System.arraycopy(payload.bytes, payload.offset + 8, this.payloadByte, 0, 4);
}
- else {
- if (DEBUG)
- log.trace("There's no payload available");
-
- payloadByte = null;
+ else {
+ this.payloadByte = null;
};
- if (payloadByte != null) {
+ // TODO: REWRITE!
+ if (this.payloadByte != null) {
+
+ // TODO: This is weird - ByteBuffer.wrap() is static, so bb itself is never used
+
bb.clear();
int t = bb.wrap(payloadByte).getInt();
+
if (DEBUG)
- log.trace(" |-> {}", t);
+ log.trace("Get Endposition and payload: {}-{} with end position {} in doc {}",
+ this.current.payload.getInt(0),
+ this.current.payload.getInt(4),
+ t,
+ this.current.doc);
return t;
+ }
+ else if (DEBUG) {
+ log.trace("Get Endposition and payload: None found");
};
-
}
catch (IOException e) {
if (DEBUG)
log.trace("IOException {}", e);
};
+
return -1;
};
+ @Override
+ public boolean isPayloadAvailable() throws IOException {
+
+ if (current.payload != null)
+ return true;
+
+ return false;
+ };
+
+
+ @Override
+ public String toString() {
+ return "spans(" + this.term.toString() + ")@" +
+ (this.current.doc == -1 ? "START" : (this.current.doc == Integer.MAX_VALUE) ? "END" : this.current.doc + "-" + this.current.start);
+ };
+
+ public DocsAndPositionsEnum getPostings() {
+ return postings;
+ };
+
private static final class EmptyElementSpans extends ElementSpans {
@Override
@@ -597,6 +453,4 @@
@Override
public long cost() { return 0; };
};
-
- public static final ElementSpans EMPTY_ELEMENT_SPANS = new EmptyElementSpans();
};
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/KorapSpan.java b/src/main/java/de/ids_mannheim/korap/query/spans/KorapSpan.java
index a865ca4..098573c 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/KorapSpan.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/KorapSpan.java
@@ -23,7 +23,7 @@
this.start = o.start;
this.end = o.end;
this.doc = o.doc;
- clearPayload();
+ this.clearPayload();
return this;
};
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/KorapTermSpan.java b/src/main/java/de/ids_mannheim/korap/query/spans/KorapTermSpan.java
index 4d50e14..c1fe75e 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/KorapTermSpan.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/KorapTermSpan.java
@@ -21,21 +21,23 @@
public Object clone() {
KorapTermSpan span = new KorapTermSpan();
span.start = this.start;
- span.end = this.end;
- span.doc = this.doc;
+ span.end = this.end;
+ span.doc = this.doc;
- this.payload.rewind();
- span.payload.put(this.payload);
+ if (this.payload != null) {
+ this.payload.rewind();
+ span.payload.put(this.payload);
- if (DEBUG) {
- log.trace("Clone payload {} to payload {} ...",
- this.payload.toString(),
- span.payload.toString());
- log.trace("... from {}-{} to {}-{}",
- this.startChar(),
- this.endChar(),
- span.startChar(),
- span.endChar());
+ if (DEBUG) {
+ log.trace("[TS] Clone payload {} to payload {} ...",
+ this.payload.toString(),
+ span.payload.toString());
+ log.trace("[TS] ... from {}-{} to {}-{}",
+ this.startChar(),
+ this.endChar(),
+ span.startChar(),
+ span.endChar());
+ };
};
return span;
@@ -47,11 +49,18 @@
return this;
};
+ public KorapSpan shallowCopyFrom (KorapTermSpan o) {
+ super.copyFrom((KorapSpan) o);
+ this.payload = o.payload;
+ return this;
+ };
+
+
@Override
public void clearPayload () {
if (this.payload != null) {
this.payload.clear();
- this.payload.rewind();
+ // this.payload.rewind();
};
};
@@ -60,12 +69,13 @@
this.payload = ByteBuffer.allocate(128);
};
-
@Override
public String toString () {
StringBuilder sb = new StringBuilder("[");
return sb.append(this.start).append('-')
.append(this.end)
+ .append("#")
+ .append(this.startChar()).append('-').append(this.endChar())
.append('(').append(this.doc).append(')')
.append('$').append(this.payload.toString())
.append(']')
@@ -79,4 +89,11 @@
public int endChar () {
return this.payload.getInt(4);
};
+
+ public void reset () {
+ this.clearPayload();
+ this.start = -1;
+ this.end = -1;
+ this.doc = -1;
+ };
};
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/WithinSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/WithinSpans.java
index 6a1b93a..94413ed 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/WithinSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/WithinSpans.java
@@ -337,14 +337,15 @@
this.wrapStart = -1;
this.wrapEnd = -1;
+ // Retrieve doc information
+ this.wrapDoc = this.wrapSpans.doc();
+
if (DEBUG)
log.trace(
" Forward wrap span to {}",
_currentWrap().toString()
);
- // Retrieve doc information
- this.wrapDoc = this.wrapSpans.doc();
if (this.embeddedDoc != this.wrapDoc) {
if (DEBUG)
diff --git a/src/main/resources/log4j.properties b/src/main/resources/log4j.properties
index ac757b7..f48fe94 100644
--- a/src/main/resources/log4j.properties
+++ b/src/main/resources/log4j.properties
@@ -1,11 +1,11 @@
## logger file can be used with
-#log4j.rootLogger = DEBUG, stdout
+log4j.rootLogger = DEBUG, stdout
# Spans:
-# log4j.logger.de.ids_mannheim.korap.query.spans.ElementSpans = TRACE, stdout
+#log4j.logger.de.ids_mannheim.korap.query.spans.ElementSpans = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.query.spans.KorapTermSpan = TRACE, stdout
-# log4j.logger.de.ids_mannheim.korap.query.spans.WithinSpans = TRACE, stdout
+#log4j.logger.de.ids_mannheim.korap.query.spans.WithinSpans = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.query.SpanNextQuery = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.query.spans.NextSpans = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.query.spans.SimpleSpans = TRACE, stdout
@@ -19,8 +19,8 @@
# Results:
# log4j.logger.de.ids_mannheim.korap.KorapIndex = TRACE, stdout
-// log4j.logger.de.ids_mannheim.korap.KorapMatch = TRACE, stdout
-# log4j.logger.de.ids_mannheim.korap.index.PositionsToOffset = TRACE, stdout
+#log4j.logger.de.ids_mannheim.korap.KorapMatch = TRACE, stdout
+#log4j.logger.de.ids_mannheim.korap.index.PositionsToOffset = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.index.TestSegmentIndex = TRACE, stdout
diff --git a/src/test/java/de/ids_mannheim/korap/benchmark/TestBenchmarkSpans.java b/src/test/java/de/ids_mannheim/korap/benchmark/TestBenchmarkSpans.java
index 831d91f..4d8c191 100644
--- a/src/test/java/de/ids_mannheim/korap/benchmark/TestBenchmarkSpans.java
+++ b/src/test/java/de/ids_mannheim/korap/benchmark/TestBenchmarkSpans.java
@@ -73,8 +73,6 @@
// After refactoring
// 100 times
// 273.58114372 seconds
-
-
};
@@ -148,7 +146,7 @@
};
t2 = System.nanoTime();
- System.err.println(kr.getMatch(0).toJSON());
+ // System.err.println(kr.getMatch(0).toJSON());
assertEquals("TotalResults1", 4116282, kr.getTotalResults());
assertEquals("TotalResults2", kr.getTotalResults(), ki.numberOf("sentences"));
@@ -156,6 +154,8 @@
double seconds = (double)(t2-t1) / 1000000000.0;
System.out.println("It took " + seconds + " seconds");
+ // 100 rounds
+ // 56.253 secs
};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestElementIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestElementIndex.java
index ac84b6a..a42131f 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestElementIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestElementIndex.java
@@ -20,6 +20,7 @@
import de.ids_mannheim.korap.index.FieldDocument;
import de.ids_mannheim.korap.analysis.MultiTermTokenStream;
import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.index.Term;
@@ -324,4 +325,89 @@
assertEquals("... ccc[222222]fff ...", kr.match(1).getSnippetBrackets());
assertEquals("... fff[333333]iii ...", kr.match(2).getSnippetBrackets());
};
+
+
+ @Test
+ public void indexExample6 () throws IOException {
+
+ KorapIndex ki = new KorapIndex();
+
+ // <a>x<a>y<a>zhij</a>hij</a>hij</a>
+ FieldDocument fd = new FieldDocument();
+ fd.addTV("base",
+ "x y z h i j h i j h i j ",
+ "[(0-3)s:x|_0#0-3|<>:a#0-36$<i>12]" + // 1
+ "[(3-6)s:y|_1#3-6|<>:a#3-27$<i>9]" + // 2
+ "[(6-9)s:z|_2#6-9|<>:a#6-18$<i>6]" + // 3
+ "[(9-12)s:h|_3#9-12]" + // 4
+ "[(12-15)s:i|_4#12-15]" + // 5
+ "[(15-18)s:j|_5#15-18]" + // 6
+ "[(18-21)s:h|_6#18-21]" + // 7
+ "[(21-24)s:i|_7#21-24]" + // 8
+ "[(24-27)s:j|_8#24-27]" + // 9
+ "[(27-30)s:h|_9#27-30]" + // 10
+ "[(30-33)s:i|_10#30-33]" + // 11
+ "[(33-36)s:j|_11#33-36]"); // 12
+ ki.addDoc(fd);
+
+ fd = new FieldDocument();
+ fd.addTV("base",
+ "x y z h ",
+ "[(0-3)s:x|_0#0-3]" + // 1
+ "[(3-6)s:y|_1#3-6]" + // 2
+ "[(6-9)s:z|_2#6-9]" + // 3
+ "[(9-12)s:h|_3#9-12]"); // 4
+ ki.addDoc(fd);
+
+ // Here is a larger offset than expected
+ fd = new FieldDocument();
+ fd.addTV("base",
+ "x y z h ",
+ "[(0-3)s:x|_0#0-3|<>:a#0-36$<i>12]" + // 1
+ "[(3-6)s:y|_1#3-6]" + // 2
+ "[(6-9)s:z|_2#6-9]" + // 3
+ "[(9-12)s:h|_3#9-12]"); // 4
+ ki.addDoc(fd);
+
+ // <a>x<a>y<a>zabc</a>abc</a>abc</a>
+ fd = new FieldDocument();
+ fd.addTV("base",
+ "x y z a b c a b c a b c ",
+ "[(0-3)s:x|_0#0-3|<>:a#0-36$<i>12]" + // 1
+ "[(3-6)s:y|_1#3-6|<>:a#3-27$<i>9]" + // 2
+ "[(6-9)s:z|_2#6-9|<>:a#6-18$<i>6]" + // 3
+ "[(9-12)s:a|_3#9-12]" + // 4
+ "[(12-15)s:b|_4#12-15]" + // 5
+ "[(15-18)s:c|_5#15-18]" + // 6
+ "[(18-21)s:a|_6#18-21]" + // 7
+ "[(21-24)s:b|_7#21-24]" + // 8
+ "[(24-27)s:c|_8#24-27]" + // 9
+ "[(27-30)s:a|_9#27-30]" + // 10
+ "[(30-33)s:b|_10#30-33]" + // 11
+ "[(33-36)s:c|_11#33-36]"); // 12
+ ki.addDoc(fd);
+
+ fd = new FieldDocument();
+ fd.addTV("base",
+ "x y z h ",
+ "[(0-3)s:x|_0#0-3]" + // 1
+ "[(3-6)s:y|_1#3-6]" + // 2
+ "[(6-9)s:z|_2#6-9]" + // 3
+ "[(9-12)s:h|_3#9-12]"); // 4
+ ki.addDoc(fd);
+
+ // Save documents
+ ki.commit();
+
+ SpanQuery sq;
+ KorapResult kr;
+
+ sq = new SpanElementQuery("base", "a");
+ kr = ki.search(sq, (short) 15);
+
+ // System.err.println(kr.toJSON());
+
+ assertEquals(5, ki.numberOf("documents"));
+ assertEquals("totalResults", 7, kr.totalResults());
+ };
};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIndex.java
index 48d6815..dbad545 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIndex.java
@@ -73,9 +73,8 @@
assertEquals("SnippetBrackets (0)", "... bcabca[b{a}]c", kr.match(0).snippetBrackets());
assertEquals("Test no 'more' context", "<span class=\"context-left\"><span class=\"more\"></span>bcabca</span><span class=\"match\">b<em class=\"class-0 level-0\">a</em></span><span class=\"context-right\">c</span>", kr.match(0).snippetHTML());
-
sq = new SpanMatchModifyClassQuery(
- new SpanNextQuery(
+ new SpanNextQuery(
new SpanTermQuery(new Term("base", "s:b")),
new SpanClassQuery(
new SpanTermQuery(new Term("base", "s:a"))
@@ -88,7 +87,6 @@
assertEquals("StartPos (0)", 8, kr.match(0).startPos);
assertEquals("EndPos (0)", 9, kr.match(0).endPos);
assertEquals("SnippetBrackets (0)", "... cabcab[a]c", kr.match(0).snippetBrackets());
-
sq = new SpanMatchModifyClassQuery(
new SpanNextQuery(
new SpanClassQuery(new SpanTermQuery(new Term("base", "s:a")), (byte) 2),
@@ -243,7 +241,6 @@
assertEquals("SnippetBrackets (6)", "... abcaba[c]", kr.match(6).snippetBrackets());
assertEquals("SnippetBrackets (6)", "<span class=\"context-left\"><span class=\"more\"></span>abcaba</span><span class=\"match\">c</span><span class=\"context-right\"></span>", kr.match(6).snippetHTML());
-
kr = ki.search(sq, 0, (short) 20, true, (short) 0, true, (short) 0);
assertEquals("totalResults", 7, kr.totalResults());
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestWithinIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestWithinIndex.java
index ad35733..970b30f 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestWithinIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestWithinIndex.java
@@ -277,6 +277,102 @@
@Test
+ public void indexExample1d () throws IOException {
+ // Cases 9, 12, 13 (NOTE(review): case numbering is defined outside this test - confirm): s:h within nested <a> elements
+ KorapIndex ki = new KorapIndex();
+
+ // First document, three nested <a> elements around s:h terms: <a>x<a>y<a>zhij</a>hij</a>hij</a>
+ FieldDocument fd = new FieldDocument();
+ fd.addTV("base",
+ "x y z h i j h i j h i j ",
+ "[(0-3)s:x|<>:a#0-36$<i>12]" + // 1
+ "[(3-6)s:y|<>:a#3-27$<i>9]" + // 2
+ "[(6-9)s:z|<>:a#6-18$<i>6]" + // 3
+ "[(9-12)s:h]" + // 4
+ "[(12-15)s:i]" + // 5
+ "[(15-18)s:j]" + // 6
+ "[(18-21)s:h]" + // 7
+ "[(21-24)s:i]" + // 8
+ "[(24-27)s:j]" + // 9
+ "[(27-30)s:h]" + // 10
+ "[(30-33)s:i]" + // 11
+ "[(33-36)s:j]"); // 12
+ ki.addDoc(fd);
+ // Second document: contains an s:h term but no <a> element
+ fd = new FieldDocument();
+ fd.addTV("base",
+ "x y z h ",
+ "[(0-3)s:x]" + // 1
+ "[(3-6)s:y]" + // 2
+ "[(6-9)s:z]" + // 3
+ "[(9-12)s:h]"); // 4
+ ki.addDoc(fd);
+
+ // Third document, same <a> nesting but without any s:h term: <a>x<a>y<a>zabc</a>abc</a>abc</a>
+ fd = new FieldDocument();
+ fd.addTV("base",
+ "x y z a b c a b c a b c ",
+ "[(0-3)s:x|<>:a#0-36$<i>12]" + // 1
+ "[(3-6)s:y|<>:a#3-27$<i>9]" + // 2
+ "[(6-9)s:z|<>:a#6-18$<i>6]" + // 3
+ "[(9-12)s:a]" + // 4
+ "[(12-15)s:b]" + // 5
+ "[(15-18)s:c]" + // 6
+ "[(18-21)s:a]" + // 7
+ "[(21-24)s:b]" + // 8
+ "[(24-27)s:c]" + // 9
+ "[(27-30)s:a]" + // 10
+ "[(30-33)s:b]" + // 11
+ "[(33-36)s:c]"); // 12
+ ki.addDoc(fd);
+
+ // Commit the three documents added above
+ ki.commit();
+
+ SpanQuery sq;
+ KorapResult kr;
+ // NOTE(review): the result of this plain element search is overwritten below without being asserted - confirm it is only a smoke test
+ sq = new SpanElementQuery("base", "a");
+ kr = ki.search(sq, (short) 15);
+ // Query under test: <a> element combined with the term s:h via SpanWithinQuery
+ sq = new SpanWithinQuery(
+ new SpanElementQuery("base", "a"),
+ new SpanTermQuery(new Term("base", "s:h"))
+ );
+
+ kr = ki.search(sq, (short) 15);
+
+ // System.err.println(kr.toJSON());
+ // Expected: span 0-12 three times, span 1-9 twice, span 2-6 once (3 + 2 + 1 = 6), all with internal doc id 0
+ assertEquals("totalResults", 6, kr.totalResults());
+
+ assertEquals("StartPos (0)", 0, kr.match(0).startPos);
+ assertEquals("EndPos (0)", 12, kr.match(0).endPos);
+ assertEquals("Doc (0)", 0, kr.match(0).internalDocID);
+ assertEquals("StartPos (1)", 0, kr.match(1).startPos);
+ assertEquals("EndPos (1)", 12, kr.match(1).endPos);
+ assertEquals("Doc (1)", 0, kr.match(1).internalDocID);
+ assertEquals("StartPos (2)", 0, kr.match(2).startPos);
+ assertEquals("EndPos (2)", 12, kr.match(2).endPos);
+ assertEquals("Doc (2)", 0, kr.match(2).internalDocID);
+ assertEquals("StartPos (3)", 1, kr.match(3).startPos);
+ assertEquals("EndPos (3)", 9, kr.match(3).endPos);
+ assertEquals("Doc (3)", 0, kr.match(3).internalDocID);
+ assertEquals("StartPos (4)", 1, kr.match(4).startPos);
+ assertEquals("EndPos (4)", 9, kr.match(4).endPos);
+ assertEquals("Doc (4)", 0, kr.match(4).internalDocID);
+ assertEquals("StartPos (5)", 2, kr.match(5).startPos);
+ assertEquals("EndPos (5)", 6, kr.match(5).endPos);
+ assertEquals("Doc (5)", 0, kr.match(5).internalDocID);
+
+ assertEquals(3, ki.numberOf("documents"));
+ };
+
+
+
+
+
+ @Test
public void indexExample2a () throws IOException {
KorapIndex ki = new KorapIndex();
@@ -798,12 +894,12 @@
FieldDocument fd = new FieldDocument();
fd.addTV("base",
"x y x b c x ",
- "[(0-3)s:x]" +
- "[(3-6)s:y]" +
- "[(6-9)s:x|<>:a#6-15$<i>5|<>:a#6-9$<i>3]" +
- "[(9-12)s:b]" +
- "[(12-15)s:c|<>:a#12-15$<i>5]" +
- "[(15-18)s:x]");
+ "[(0-3)s:x|_0#0-3]" +
+ "[(3-6)s:y|_1#3-6]" +
+ "[(6-9)s:x|_2#6-9|<>:a#6-15$<i>5|<>:a#6-9$<i>3]" +
+ "[(9-12)s:b|_3#9-12]" +
+ "[(12-15)s:c|_4#12-15|<>:a#12-15$<i>5]" +
+ "[(15-18)s:x|_5#15-18]");
ki.addDoc(fd);
// Save documents
@@ -818,6 +914,7 @@
KorapResult kr = ki.search(sq, (short) 10);
+ // System.err.println(kr.toJSON());
assertEquals("totalResults", 2, kr.totalResults());
assertEquals("StartPos (0)", 2, kr.match(0).startPos);
assertEquals("EndPos (0)", 3, kr.match(0).endPos);
@@ -954,27 +1051,24 @@
assertEquals("totalResults", 0, kr.totalResults());
};
- /** SpanElementQueries
- * */
- @Test
- public void indexExample8() throws IOException{
- KorapIndex ki = new KorapIndex();
- FieldDocument fd = new FieldDocument();
- // <a>xx <e>hi j <e>hi j</e></e></a>
- fd.addTV("base",
- "xx hi j hi j",
- "[(0-1)s:x|i:x|_0#0-1|<>:a#1-12$<i>8]" +
- "[(1-2)s:x|i:x|_1#1-2]" +
- "[(3-4)s:h|i:h|_2#3-4|<>:e#3-12$<i>8]" +
- "[(4-5)s:i|i:i|_3#4-5]" +
- "[(6-7)s:j|i:j|_4#6-7]" +
- "[(8-9)s:h|i:h|_5#8-9|<>:e#8-9$<i>8]" +
- "[(9-10)s:i|i:i|_6#9-10]" +
- "[(11-12)s:j|i:j|_7#11-12]");
- ki.addDoc(fd);
-
-
- }
-
+ /** SpanElementQueries: builds an index holding one document with nested a and e elements.
+ * NOTE(review): nothing is committed, searched or asserted after addDoc - confirm this stub is intentional. */
+ @Test
+ public void indexExample8() throws IOException{
+ KorapIndex ki = new KorapIndex();
+ FieldDocument fd = new FieldDocument();
+ // Intended structure: <a>xx <e>hi j <e>hi j</e></e></a> (NOTE(review): the inner e is annotated #8-9, not #8-12 - verify offsets)
+ fd.addTV("base",
+ "xx hi j hi j",
+ "[(0-1)s:x|i:x|_0#0-1|<>:a#1-12$<i>8]" +
+ "[(1-2)s:x|i:x|_1#1-2]" +
+ "[(3-4)s:h|i:h|_2#3-4|<>:e#3-12$<i>8]" +
+ "[(4-5)s:i|i:i|_3#4-5]" +
+ "[(6-7)s:j|i:j|_4#6-7]" +
+ "[(8-9)s:h|i:h|_5#8-9|<>:e#8-9$<i>8]" +
+ "[(9-10)s:i|i:i|_6#9-10]" +
+ "[(11-12)s:j|i:j|_7#11-12]");
+ ki.addDoc(fd);
+ };
};