Added sorting to Subspans.
Change-Id: If93a1e97eee01fa30e9aeb7586704888733ead22
diff --git a/src/main/java/de/ids_mannheim/korap/query/SpanFocusQuery.java b/src/main/java/de/ids_mannheim/korap/query/SpanFocusQuery.java
index ace6183..5f18ff1 100644
--- a/src/main/java/de/ids_mannheim/korap/query/SpanFocusQuery.java
+++ b/src/main/java/de/ids_mannheim/korap/query/SpanFocusQuery.java
@@ -37,7 +37,7 @@
private boolean isSorted = true;
private boolean matchTemporaryClass = false;
private boolean removeTemporaryClasses = false;
-
+ private int windowSize = 10;
/**
* Construct a new SpanFocusQuery.
@@ -60,7 +60,6 @@
this.classNumbers = classNumbers;
};
-
/**
* Construct a new SpanFocusQuery. The class to focus on defaults
* to
@@ -209,4 +208,14 @@
this.removeTemporaryClasses = rem;
}
+
+ public int getWindowSize() {
+ return windowSize;
+ }
+
+
+ public void setWindowSize(int windowSize) {
+ this.windowSize = windowSize;
+ }
+
};
diff --git a/src/main/java/de/ids_mannheim/korap/query/SpanSubspanQuery.java b/src/main/java/de/ids_mannheim/korap/query/SpanSubspanQuery.java
index 1a1ab0a..c547edc 100644
--- a/src/main/java/de/ids_mannheim/korap/query/SpanSubspanQuery.java
+++ b/src/main/java/de/ids_mannheim/korap/query/SpanSubspanQuery.java
@@ -10,7 +10,6 @@
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.Bits;
-import de.ids_mannheim.korap.query.spans.ElementSpans;
import de.ids_mannheim.korap.query.spans.SubSpans;
/**
@@ -41,6 +40,7 @@
public class SpanSubspanQuery extends SimpleSpanQuery {
private int startOffset, length;
+ private int windowSize = 10;
/**
@@ -138,4 +138,13 @@
public void setLength (int length) {
this.length = length;
}
+
+
+ public int getWindowSize() {
+ return windowSize;
+ }
+
+ public void setWindowSize(int windowSize) {
+ this.windowSize = windowSize;
+ }
}
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/CandidateSpanComparator.java b/src/main/java/de/ids_mannheim/korap/query/spans/CandidateSpanComparator.java
new file mode 100644
index 0000000..0ba3c29
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/CandidateSpanComparator.java
@@ -0,0 +1,27 @@
+package de.ids_mannheim.korap.query.spans;
+
+import java.util.Comparator;
+
+public class CandidateSpanComparator implements Comparator<CandidateSpan> {
+
+ @Override
+ public int compare(CandidateSpan o1, CandidateSpan o2) {
+ if (o1.doc == o2.doc) {
+ if (o1.getStart() == o2.getStart()) {
+ if (o1.getEnd() == o2.getEnd()) return 0;
+ if (o1.getEnd() > o2.getEnd())
+ return 1;
+ else
+ return -1;
+ }
+ else if (o1.getStart() < o2.getStart())
+ return -1;
+ else
+ return 1;
+ }
+ else if (o1.doc < o2.doc)
+ return -1;
+ else
+ return 1;
+ }
+}
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/FocusSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/FocusSpans.java
index a23240d..202d3d4 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/FocusSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/FocusSpans.java
@@ -54,12 +54,12 @@
private boolean isSorted, matchTemporaryClass, removeTemporaryClasses;
// private List<CandidateSpan> candidateSpans;
- private int windowSize = 10;
+ private int windowSize;
private int currentDoc;
private int prevStart;
private int prevDoc;
private PriorityQueue<CandidateSpan> candidates;
- private FocusSpanComparator comparator;
+ private CandidateSpanComparator comparator;
/**
* Construct a FocusSpan for the given {@link SpanQuery}.
@@ -86,10 +86,11 @@
"At least one class number must be specified.");
}
classNumbers = query.getClassNumbers();
+ windowSize = query.getWindowSize();
isSorted = query.isSorted();
matchTemporaryClass = query.matchTemporaryClass();
removeTemporaryClasses = query.removeTemporaryClasses();
- // candidateSpans = new ArrayList<CandidateSpan>();
+
candidates = new PriorityQueue<>(windowSize, comparator);
hasMoreSpans = firstSpans.next();
currentDoc = firstSpans.doc();
@@ -284,28 +285,4 @@
public long cost () {
return firstSpans.cost();
};
-
- class FocusSpanComparator implements Comparator<CandidateSpan> {
-
- @Override
- public int compare(CandidateSpan o1, CandidateSpan o2) {
- if (o1.doc == o2.doc) {
- if (o1.getStart() == o2.getStart()) {
- if (o1.getEnd() == o2.getEnd()) return 0;
- if (o1.getEnd() > o2.getEnd())
- return 1;
- else
- return -1;
- }
- else if (o1.getStart() < o2.getStart())
- return -1;
- else
- return 1;
- }
- else if (o1.doc < o2.doc)
- return -1;
- else
- return 1;
- }
- }
};
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/SubSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/SubSpans.java
index 3b85685..5570b8f 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/SubSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/SubSpans.java
@@ -3,6 +3,7 @@
import java.io.IOException;
import java.util.Map;
import java.util.ArrayList;
+import java.util.PriorityQueue;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
@@ -33,6 +34,12 @@
public static final boolean DEBUG = false;
private int startOffset, length;
+ private int windowSize;
+ private int currentDoc;
+ private int prevStart;
+ private int prevDoc;
+ private PriorityQueue<CandidateSpan> candidates;
+ private CandidateSpanComparator comparator;
/**
* Constructs SubSpans for the given {@link SpanSubspanQuery}
@@ -52,6 +59,8 @@
this.startOffset = subspanQuery.getStartOffset();
this.length = subspanQuery.getLength();
this.matchPayload = new ArrayList<byte[]>(6);
+ this.windowSize = subspanQuery.getWindowSize();
+ candidates = new PriorityQueue<>(windowSize, comparator);
if (DEBUG) {
log.trace("Init SubSpan at {} with length {}", this.startOffset, this.length);
@@ -75,82 +84,128 @@
* @throws IOException
*/
private boolean advance () throws IOException {
- while (hasMoreSpans) {
- if (findMatch()) {
+ while (hasMoreSpans || candidates.size() > 0) {
+ CandidateSpan cs = new CandidateSpan(firstSpans);
+ if (startOffset > 0) {
+ if (findMatch(cs)) {
+ setMatch(cs);
+ hasMoreSpans = firstSpans.next();
+ return true;
+ }
hasMoreSpans = firstSpans.next();
+ }
+ else if (candidates.isEmpty()) {
+ currentDoc = firstSpans.doc();
+ collectCandidates();
+ }
+ else {
+ setMatch(candidates.poll());
+ collectCandidates();
return true;
}
- hasMoreSpans = firstSpans.next();
}
return false;
}
+ private void collectCandidates() throws IOException {
+
+ while (hasMoreSpans && candidates.size() < windowSize
+ && firstSpans.doc() == currentDoc) {
+ CandidateSpan cs;
+ if (findMatch(cs = new CandidateSpan(firstSpans))) {
+ if (cs.getDoc() == prevDoc && cs.getStart() < prevStart) {
+ log.warn("Span (" + cs.getStart() + ", " + cs.getEnd()
+ + ") is out of order and skipped.");
+ }
+ else {
+ candidates.add(cs);
+ }
+ }
+ hasMoreSpans = firstSpans.next();
+ }
+ }
/**
* Sets the properties of the current match/subspan.
*
* @throws IOException
*/
- public boolean findMatch () throws IOException {
+ public boolean findMatch(CandidateSpan cs) throws IOException {
// Check at span ending
if (this.startOffset < 0) {
- matchStartPosition = firstSpans.end() + startOffset;
- if (matchStartPosition < firstSpans.start()) {
- matchStartPosition = firstSpans.start();
+ cs.setStart(firstSpans.end() + startOffset);
+ if (cs.getStart() < firstSpans.start()) {
+ cs.setStart(firstSpans.start());
};
}
// Check at span beginning
else {
- matchStartPosition = firstSpans.start() + startOffset;
- if (matchStartPosition >= firstSpans.end()) {
+ cs.setStart(firstSpans.start() + startOffset);
+ if (cs.getStart() >= firstSpans.end()) {
return false;
}
}
// Find end position of span
if (this.length > 0) {
- matchEndPosition = matchStartPosition + this.length;
- if (matchEndPosition > firstSpans.end()) {
- matchEndPosition = firstSpans.end();
+ cs.setEnd(cs.getStart() + this.length);
+ if (cs.getEnd() > firstSpans.end()) {
+ cs.setEnd(firstSpans.end());
}
}
else {
- matchEndPosition = firstSpans.end();
+ cs.setEnd(firstSpans.end());
}
- matchPayload.clear();
+ // matchPayload.clear();
// Remove element payloads
for (byte[] payload : firstSpans.getPayload()) {
if ((payload[0] & ((byte) 64)) != 0) {
continue;
};
- matchPayload.add(payload.clone());
+ cs.getPayloads().add(payload.clone());
};
- matchDocNumber = firstSpans.doc();
+ cs.setDoc(firstSpans.doc());
if (DEBUG) {
log.trace("Start at absolute position {} " +
"and end at absolute position {}",
- matchStartPosition,
- matchEndPosition);
+ cs.getStart(),
+ cs.getEnd());
};
return true;
}
+ private void setMatch(CandidateSpan cs) {
+ matchStartPosition = cs.getStart();
+ prevStart = matchStartPosition;
+ matchEndPosition = cs.getEnd();
+ matchDocNumber = cs.getDoc();
+ prevDoc = matchDocNumber;
+ matchPayload.clear();
+ matchPayload.addAll(cs.getPayloads());
+ }
@Override
public boolean skipTo (int target) throws IOException {
- if (hasMoreSpans && (firstSpans.doc() < target)) {
- if (!firstSpans.skipTo(target)) {
- hasMoreSpans = false;
- return false;
+ if (candidates.size() > 0) {
+ CandidateSpan cs;
+ while ((cs = candidates.poll()) != null) {
+ if (cs.getDoc() == target) {
+ return next();
+ }
}
}
- matchPayload.clear();
+ if (firstSpans.doc() == target) {
+ return next();
+ }
+ if (firstSpans.doc() < target && firstSpans.skipTo(target)) {
+ return next();
+ }
return advance();
}
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestSubSpanIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestSubSpanIndex.java
index f09301f..eb719b1 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestSubSpanIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestSubSpanIndex.java
@@ -9,10 +9,13 @@
import org.junit.Test;
import de.ids_mannheim.korap.KrillIndex;
+import de.ids_mannheim.korap.response.Match;
import de.ids_mannheim.korap.response.Result;
import de.ids_mannheim.korap.query.DistanceConstraint;
+import de.ids_mannheim.korap.query.SpanClassQuery;
import de.ids_mannheim.korap.query.SpanDistanceQuery;
import de.ids_mannheim.korap.query.SpanElementQuery;
+import de.ids_mannheim.korap.query.SpanFocusQuery;
import de.ids_mannheim.korap.query.SpanSubspanQuery;
/*
@@ -139,6 +142,10 @@
SpanSubspanQuery ssq = new SpanSubspanQuery(new SpanElementQuery("base", "x"), -1, 1, true);
kr = ki.search(ssq, (short) 10);
+ for (Match km : kr.getMatches()) {
+ System.out.println(km.getStartPos() + "," + km.getEndPos()
+ + km.getSnippetBrackets());
+ }
assertEquals(2, kr.getTotalResults());
assertEquals("a [b ]c ", kr.getMatch(0).getSnippetBrackets());
assertEquals("a b [c ]", kr.getMatch(1).getSnippetBrackets());