Fixed left expansion match order.
Change-Id: I7755bce59042b9bf35d31a11a252630e8b965db1
diff --git a/Changes b/Changes
index 96b0ae5..a23a214 100644
--- a/Changes
+++ b/Changes
@@ -10,7 +10,8 @@
- [bugfix] Fixed skipTo in NextSpans, see de.ids_mannheim.korap.index.
TestRepetitionIndex.testRepetitionSnippetBug3() (margaretha)
- [bugfix] Fixed the candidate list in NextSpans, see de.ids_mannheim.
- korap.index.TestNextIndex.testNextExpansionBug() (margaretha)
+ korap.index.TestNextIndex.testNextExpansionBug() (margaretha)
+ - [bugfix] Fixed left expansion match order (margaretha)
0.58.0 2018-09-03
- [feature] Implemented referencing cached collection (margaretha)
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/ExpandedSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/ExpandedSpans.java
index 129ef82..eb3641f 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/ExpandedSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/ExpandedSpans.java
@@ -3,8 +3,9 @@
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
-import java.util.List;
import java.util.Map;
+import java.util.SortedSet;
+import java.util.TreeSet;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
@@ -15,16 +16,13 @@
/**
* Enumeration of spans expanded with minimum <code>m</code> and
- * maximum
- * <code>n</code> token positions to either left or right direction
- * from the
- * original spans. See examples in {@link SpanExpansionQuery}.
+ * maximum <code>n</code> token positions to either left (negative)
+ * or right direction from the original spans. See examples in
+ * {@link SpanExpansionQuery}.
*
* The expansion offsets, namely the start and end position of an
- * expansion
- * part, can be stored in payloads. A class number is assigned to the
- * offsets
- * grouping them altogether.
+ * expansion part, can be stored in payloads. A class number is
+ * assigned to the offsets grouping them altogether.
*
* @author margaretha
*/
@@ -33,7 +31,7 @@
private int min, max;
private byte classNumber;
private int direction;
- private List<CandidateSpan> candidateSpans;
+ private SortedSet<CandidateSpan> candidateSpans;
private long matchCost;
@@ -58,8 +56,11 @@
this.direction = spanExpansionQuery.getDirection();
this.classNumber = spanExpansionQuery.getClassNumber();
- candidateSpans = new ArrayList<CandidateSpan>();
+ candidateSpans = new TreeSet<CandidateSpan>();
hasMoreSpans = true;
+ if (direction < 0){
+ hasMoreSpans = firstSpans.next();
+ }
}
@@ -67,8 +68,9 @@
public boolean next () throws IOException {
matchPayload.clear();
isStartEnumeration = false;
- if (candidateSpans.size() == 0 && hasMoreSpans)
+ if (candidateSpans.size() == 0 && hasMoreSpans && direction >= 0) {
hasMoreSpans = firstSpans.next();
+ }
return advance();
}
@@ -87,12 +89,13 @@
private boolean advance () throws IOException {
while (candidateSpans.size() > 0 || hasMoreSpans) {
if (candidateSpans.size() > 0) {
- setMatch(candidateSpans.get(0));
- candidateSpans.remove(0);
+ CandidateSpan cs = candidateSpans.first();
+ setMatch(cs);
+ candidateSpans.remove(cs);
return true;
}
else {
- setCandidateList();
+ setCandidateList(firstSpans.end());
}
}
return false;
@@ -101,18 +104,21 @@
/**
* Sets the candidateList by adding new candidate match spans for
- * all
- * possible expansion with respect to the expansion length
- * (min,max)
- * variables.
+ * all possible expansion with respect to the expansion length
+ * (min,max) variables.
+ *
+ * @param lastPosition
+ * is used in left expansion. The start position of
+ * the candidates to collect must not beyond this
+ * position.
*
* @throws IOException
*/
- private void setCandidateList () throws IOException {
+ private void setCandidateList (int lastPosition) throws IOException {
CandidateSpan cs;
int counter, start, end;
- if (direction < 0) {
+ if (direction < 0) { // left
counter = max;
while (counter >= min) {
start = Math.max(0, firstSpans.start() - counter);
@@ -123,6 +129,13 @@
candidateSpans.add(cs);
counter--;
}
+
+ if (hasMoreSpans && (hasMoreSpans = firstSpans.next()) ) {
+ start = Math.max(0, firstSpans.start() - max);
+ if (start <= lastPosition) {
+ setCandidateList(lastPosition);
+ }
+ }
}
else {
counter = min;
@@ -141,13 +154,11 @@
/**
* Prepares the payloads for a candidate match (ExpandedSpans). If
- * the class
- * number is set, the extension offsets with the given start and
- * end
- * positions are to be stored in the payloads.
+ * the class number is set, the extension offsets with the given
+ * start and end positions are to be stored in the payloads.
*
- * @param start
- * @param end
+ * @param start start position
+ * @param end end position
* @return the payloads for a candidaete match
* @throws IOException
*/
@@ -168,11 +179,11 @@
/**
* Prepares a byte array of extension offsets with the given start
- * and end
- * positions and the class number, to be stored in payloads.
+ * and end positions and the class number, to be stored in
+ * payloads.
*
- * @param start
- * @param end
+ * @param start start position
+ * @param end end position
* @return a byte array of extension offsets and the class number
*/
private byte[] createExtensionPayloads (int start, int end) {
@@ -188,8 +199,7 @@
/**
* Sets the properties of the given candidate match span as the
- * current
- * match (state of ExpandedSpans).
+ * current match (state of ExpandedSpans).
*
* @param candidateSpan
*/
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
index 7a19b66..ddd539c 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
@@ -123,6 +123,43 @@
}
@Test
+ public void testNextRightExpansion () throws IOException {
+ KrillIndex ki = new KrillIndex();
+ ki.addDoc(simpleFieldDoc("F u G S h A d F ü d T F u d m", " "));
+ ki.commit();
+
+ SpanTermQuery stq = new SpanTermQuery(new Term("base", "s:d"));
+ SpanExpansionQuery seq = new SpanExpansionQuery(stq, 0, 6, 0, true);
+ SpanNextQuery snq = new SpanNextQuery(seq,stq);
+ assertEquals("spanNext(spanExpansion(base:s:d, []{0, 6}, right), base:s:d)", snq.toString());
+ Result kr = ki.search(snq, (short) 10);
+
+// 6,10 FuGShA[[dFüd]]TFudm
+// 6,14 FuGShA[[dFüdTFud]]m
+// 9,14 ... ShAdFü[[dTFud]]
+ assertEquals(3, kr.getTotalResults());
+ }
+
+ @Test
+ public void testNextLeftExpansion () throws IOException {
+ KrillIndex ki = new KrillIndex();
+ ki.addDoc(simpleFieldDoc("F u G S h A d F ü d T F u d m", " "));
+ ki.commit();
+
+ SpanTermQuery stq = new SpanTermQuery(new Term("base", "s:d"));
+ SpanExpansionQuery seq = new SpanExpansionQuery(stq, 0, 6, -1, true);
+ Result kr = ki.search(seq, (short) 20);
+
+ assertEquals(21, kr.getTotalResults());
+
+ SpanNextQuery snq = new SpanNextQuery(stq,seq);
+ assertEquals("spanNext(base:s:d, spanExpansion(base:s:d, []{0, 6}, left))", snq.toString());
+ kr = ki.search(snq, (short) 10);
+
+ assertEquals(3, kr.getTotalResults());
+ }
+
+ @Test
public void indexExample1 () throws IOException {
KrillIndex ki = new KrillIndex();
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestSpanExpansionIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestSpanExpansionIndex.java
index 0b4c4b8..d396815 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestSpanExpansionIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestSpanExpansionIndex.java
@@ -16,7 +16,6 @@
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.automaton.RegExp;
import org.junit.Test;
-import org.junit.Ignore;
import de.ids_mannheim.korap.Krill;
import de.ids_mannheim.korap.KrillIndex;
@@ -28,6 +27,7 @@
import de.ids_mannheim.korap.query.SpanNextQuery;
import de.ids_mannheim.korap.query.SpanRepetitionQuery;
import de.ids_mannheim.korap.query.wrap.SpanQueryWrapper;
+import de.ids_mannheim.korap.response.Match;
import de.ids_mannheim.korap.response.Result;
import de.ids_mannheim.korap.util.QueryException;
@@ -571,7 +571,6 @@
@Test
- @Ignore
public void indexExpansionLeftWithWrongSorting () throws IOException {
KrillIndex ki = new KrillIndex();
ki.addDoc(simpleFieldDoc("abcc"));
@@ -582,19 +581,37 @@
assertEquals("spanExpansion(base:s:c, []{0, 2}, left)", seq.toString());
Result kr = ki.search(seq, (short) 10);
- /*
- for (Match km : kr.getMatches()) {
- System.out.println(km.getStartPos() + "," + km.getEndPos() + " " +
- km.getSnippetBrackets());
- };
- */
-
- assertEquals(kr.getMatch(1).getSnippetBrackets(), "a[[bc]]c");
- assertEquals(kr.getMatch(2).getSnippetBrackets(), "a[[bcc]]");
+ assertEquals("a[[bc]]c", kr.getMatch(1).getSnippetBrackets());
+ assertEquals(1, kr.getMatch(1).getStartPos());
+ assertEquals(3, kr.getMatch(1).getEndPos());
+ assertEquals("a[[bcc]]", kr.getMatch(2).getSnippetBrackets());
+ assertEquals(1, kr.getMatch(2).getStartPos());
+ assertEquals(4, kr.getMatch(2).getEndPos());
assertEquals(6, kr.getTotalResults());
}
+ /** Tests left expansion over start doc boundary. Redundant matches should
+ * be omitted.
+ * @throws IOException
+ */
+ @Test
+ public void testLeftExpansionRedundantMatches () throws IOException {
+ KrillIndex ki = new KrillIndex();
+ ki.addDoc(simpleFieldDoc("A d F ü d T F u d m", " "));
+ ki.commit();
+
+ SpanTermQuery stq = new SpanTermQuery(new Term("base", "s:d"));
+ SpanExpansionQuery seq = new SpanExpansionQuery(stq, 0, 6, -1, true);
+ Result kr = ki.search(seq, (short) 20);
+
+ Match m = kr.getMatch(5);
+ assertEquals(2, m.getStartPos());
+ assertEquals(9, m.getEndPos());
+ assertEquals(14, kr.getTotalResults());
+
+ }
+
private FieldDocument createFieldDoc6 () {
FieldDocument fd = new FieldDocument();
fd.addString("ID", "doc-6");