Implemented dynamic window for sorting in FocusSpans.
Change-Id: Ie59bae8a116e6377f4fe9f7d400cdc55207785c1
diff --git a/Changes b/Changes
index ffad231..fa5c8d6 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,5 @@
+0.55.5 2016-04-26
+ - [performance] Changed to a dynamic window for sorting in FocusSpans (margaretha)
0.55.5 2016-04-25
- [bugfix] store skipped spans in Repetitionspans as candidates
(margaretha)
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/FocusSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/FocusSpans.java
index 244c79d..873aaf2 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/FocusSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/FocusSpans.java
@@ -3,10 +3,10 @@
import static de.ids_mannheim.korap.util.KrillByte.byte2int;
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
+import java.util.Comparator;
import java.util.List;
import java.util.Map;
+import java.util.PriorityQueue;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
@@ -53,10 +53,13 @@
public static final boolean DEBUG = false;
private boolean isSorted, matchTemporaryClass, removeTemporaryClasses;
- private List<CandidateSpan> candidateSpans;
+ // private List<CandidateSpan> candidateSpans;
private int windowSize = 10;
private int currentDoc;
-
+ private int prevStart;
+ private int prevDoc;
+ private PriorityQueue<CandidateSpan> candidates;
+ private FocusSpanComparator comparator;
/**
* Construct a FocusSpan for the given {@link SpanQuery}.
@@ -86,7 +89,8 @@
isSorted = query.isSorted();
matchTemporaryClass = query.matchTemporaryClass();
removeTemporaryClasses = query.removeTemporaryClasses();
- candidateSpans = new ArrayList<CandidateSpan>();
+ // candidateSpans = new ArrayList<CandidateSpan>();
+ candidates = new PriorityQueue<>(windowSize, comparator);
hasMoreSpans = firstSpans.next();
currentDoc = firstSpans.doc();
@@ -99,7 +103,7 @@
matchPayload.clear();
spanId = 0;
CandidateSpan cs;
- while (hasMoreSpans || candidateSpans.size() > 0) {
+ while (hasMoreSpans || candidates.size() > 0) {
if (isSorted) {
if (firstSpans.isPayloadAvailable()
@@ -111,14 +115,13 @@
}
hasMoreSpans = firstSpans.next();
}
- else if (candidateSpans.isEmpty()) {
+ else if (candidates.isEmpty()) {
currentDoc = firstSpans.doc();
collectCandidates();
- Collections.sort(candidateSpans);
}
else {
- setMatch(candidateSpans.get(0));
- candidateSpans.remove(0);
+ setMatch(candidates.poll());
+ collectCandidates();
return true;
}
}
@@ -129,22 +132,29 @@
private void collectCandidates () throws IOException {
CandidateSpan cs = null;
- while (hasMoreSpans && candidateSpans.size() < windowSize
+ while (hasMoreSpans && candidates.size() < windowSize
&& firstSpans.doc() == currentDoc) {
if (firstSpans.isPayloadAvailable()
&& updateSpanPositions(cs = new CandidateSpan(firstSpans))) {
- candidateSpans.add(cs);
+ if (cs.getDoc() == prevDoc && cs.getStart() < prevStart) {
+ log.warn("Span (" + cs.getStart() + ", "
+ + cs.getEnd() + ") is out of order and skipped.");
+ }
+ else {
+ candidates.add(cs);
+ }
}
hasMoreSpans = firstSpans.next();
}
}
-
private void setMatch (CandidateSpan cs) {
matchStartPosition = cs.getStart();
+ prevStart = matchStartPosition;
matchEndPosition = cs.getEnd();
matchDocNumber = cs.getDoc();
+ prevDoc = matchDocNumber;
matchPayload.addAll(cs.getPayloads());
if (firstSpans instanceof RelationSpans && classNumbers.size() == 1) {
@@ -244,7 +254,18 @@
// Todo: Check for this on document boundaries!
@Override
public boolean skipTo (int target) throws IOException {
- if (this.doc() < target && firstSpans.skipTo(target)) {
+ if (candidates.size() > 0) {
+ CandidateSpan cs;
+ while ((cs = candidates.poll()) != null) {
+ if (cs.getDoc() == target) {
+ return next();
+ }
+ }
+ }
+ if (firstSpans.doc() == target) {
+ return next();
+ }
+ if (firstSpans.doc() < target && firstSpans.skipTo(target)) {
return next();
}
return false;
@@ -262,4 +283,28 @@
public long cost () {
return firstSpans.cost();
};
+
+ class FocusSpanComparator implements Comparator<CandidateSpan> {
+
+ @Override
+ public int compare(CandidateSpan o1, CandidateSpan o2) {
+ if (o1.doc == o2.doc) {
+ if (o1.getStart() == o2.getStart()) {
+ if (o1.getEnd() == o2.getEnd()) return 0;
+ if (o1.getEnd() > o2.getEnd())
+ return 1;
+ else
+ return -1;
+ }
+ else if (o1.getStart() < o2.getStart())
+ return -1;
+ else
+ return 1;
+ }
+ else if (o1.doc < o2.doc)
+ return -1;
+ else
+ return 1;
+ }
+ }
};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestFocusIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestFocusIndex.java
new file mode 100644
index 0000000..2f668e2
--- /dev/null
+++ b/src/test/java/de/ids_mannheim/korap/index/TestFocusIndex.java
@@ -0,0 +1,46 @@
+package de.ids_mannheim.korap.index;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.junit.Test;
+
+import de.ids_mannheim.korap.KrillIndex;
+import de.ids_mannheim.korap.query.SpanFocusQuery;
+import de.ids_mannheim.korap.query.SpanNextQuery;
+import de.ids_mannheim.korap.query.SpanRelationQuery;
+import de.ids_mannheim.korap.response.Match;
+import de.ids_mannheim.korap.response.Result;
+
+public class TestFocusIndex {
+ private KrillIndex ki;
+ private Result kr;
+
+ public TestFocusIndex () throws IOException {
+ ki = new KrillIndex();
+ }
+
+ /**
+ * Check Skipto focus spans
+ * */
+ @Test
+ public void testCase12() throws IOException {
+ ki.addDoc(TestRelationIndex.createFieldDoc0());
+ ki.addDoc(TestRelationIndex.createFieldDoc1());
+ ki.commit();
+ SpanRelationQuery sq = new SpanRelationQuery(new SpanTermQuery(
+ new Term("base", ">:xip/syntax-dep_rel")), true);
+ sq.setSourceClass((byte) 1);
+
+ SpanFocusQuery sfq = new SpanFocusQuery(sq, (byte) 1);
+ sfq.setSorted(false);
+ SpanTermQuery stq = new SpanTermQuery(new Term("base", "s:c"));
+ SpanNextQuery snq = new SpanNextQuery(stq, sfq);
+
+ kr = ki.search(snq, (short) 20);
+ for (Match m : kr.getMatches()) {
+ System.out.println(m.getStartPos() + " " + m.getEndPos());
+ }
+ }
+}
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
index 23674b6..0465585 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
@@ -13,9 +13,12 @@
import org.junit.runners.JUnit4;
import de.ids_mannheim.korap.KrillIndex;
+import de.ids_mannheim.korap.query.SpanClassQuery;
import de.ids_mannheim.korap.query.SpanElementQuery;
+import de.ids_mannheim.korap.query.SpanFocusQuery;
import de.ids_mannheim.korap.query.SpanNextQuery;
import de.ids_mannheim.korap.query.wrap.SpanSequenceQueryWrapper;
+import de.ids_mannheim.korap.response.Match;
import de.ids_mannheim.korap.response.Result;
@RunWith(JUnit4.class)
@@ -262,6 +265,23 @@
assertEquals("StartPos", 0, kr.getMatch(0).startPos);
assertEquals("EndPos", 3, kr.getMatch(0).endPos);
+ sq = new SpanNextQuery(new SpanTermQuery(new Term("base", "s:c")),
+ new SpanNextQuery(new SpanFocusQuery(new SpanClassQuery(
+ new SpanTermQuery(new Term("base", "s:d")), (byte) 1),
+ (byte) 1), new SpanFocusQuery(new SpanClassQuery(
+ new SpanTermQuery(new Term("base", "s:b")), (byte) 2),
+ (byte) 2)));
+
+ kr = ki.search(sq, (short) 10);
+ assertEquals("doc-number", 2, kr.getMatch(0).getLocalDocID());
+ assertEquals("StartPos", 0, kr.getMatch(0).startPos);
+ assertEquals("EndPos", 3, kr.getMatch(0).endPos);
+
+ // for (Match km : kr.getMatches()) {
+ // System.out.println(km.getStartPos() + "," + km.getEndPos()
+ // + " "
+ // + km.getSnippetBrackets());
+ // }
}
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestRelationIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestRelationIndex.java
index d8f79a7..7df32ea 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestRelationIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestRelationIndex.java
@@ -14,6 +14,7 @@
import de.ids_mannheim.korap.query.SpanClassQuery;
import de.ids_mannheim.korap.query.SpanElementQuery;
import de.ids_mannheim.korap.query.SpanFocusQuery;
+import de.ids_mannheim.korap.query.SpanNextQuery;
import de.ids_mannheim.korap.query.SpanRelationMatchQuery;
import de.ids_mannheim.korap.query.SpanRelationQuery;
import de.ids_mannheim.korap.query.SpanSegmentQuery;
@@ -71,7 +72,7 @@
}
- private FieldDocument createFieldDoc0 () {
+ public static FieldDocument createFieldDoc0() {
FieldDocument fd = new FieldDocument();
fd.addString("ID", "doc-0");
fd.addTV(
@@ -96,7 +97,7 @@
}
- private FieldDocument createFieldDoc1 () {
+ public static FieldDocument createFieldDoc1() {
FieldDocument fd = new FieldDocument();
fd.addString("ID", "doc-1");
fd.addTV(
@@ -903,9 +904,6 @@
assertEquals((long) 1, kr.getTotalResults());
assertEquals(2, kr.getMatch(0).getStartPos());
assertEquals(7, kr.getMatch(0).getEndPos());
-
- // for (Match m : kr.getMatches()) {
- // System.out.println(m.getStartPos() + " " + m.getEndPos());
- // }
}
+
}