Fixed right expansion match order & expansion over start.
Change-Id: Id0cf29522e38cf39483e3d01bb75367fb4a13da3
diff --git a/Changes b/Changes
index a23a214..aa15d42 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.58.1 2018-11-27
+0.58.1 2018-11-28
- [bugfix] Security upgrade of Jackson for CVE-2017-17485 and
CVE-2018-7489 (diewald)
- [bugfix] Span expansion with negation (margaretha)
@@ -11,7 +11,8 @@
TestRepetitionIndex.testRepetitionSnippetBug3() (margaretha)
- [bugfix] Fixed the candidate list in NextSpans, see de.ids_mannheim.
korap.index.TestNextIndex.testNextExpansionBug() (margaretha)
- - [bugfix] Fixed left expansion match order (margaretha)
+ - [bugfix] Fixed left expansion match order (margaretha)
+ - [bugfix] Fixed right expansion match order & expansion over start (margaretha)
0.58.0 2018-09-03
- [feature] Implemented referencing cached collection (margaretha)
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/ExpandedSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/ExpandedSpans.java
index ecdafc0..cb2bbd7 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/ExpandedSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/ExpandedSpans.java
@@ -3,8 +3,9 @@
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
import java.util.Map;
-import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.lucene.index.LeafReaderContext;
@@ -12,6 +13,9 @@
import org.apache.lucene.index.TermContext;
import org.apache.lucene.util.Bits;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import de.ids_mannheim.korap.query.SpanExpansionQuery;
/**
@@ -21,7 +25,7 @@
* {@link SpanExpansionQuery}.
*
* The expansion offsets, namely the start and end position of an
- * expansion part, can be stored in payloads. A class number is
+ * expansion part, can be stored in payloads. A class number is
* assigned to the offsets grouping them altogether.
*
* @author margaretha
@@ -31,9 +35,15 @@
private int min, max;
private byte classNumber;
private int direction;
- private SortedSet<CandidateSpan> candidateSpans;
+ private List<CandidateSpan> candidateSpans;
private long matchCost;
+ // Logger
+ private final Logger log = LoggerFactory.getLogger(ExpandedSpans.class);
+
+ // This advices the java compiler to ignore all loggings
+ public static final boolean DEBUG = false;
+
/**
* Constructs ExpandedSpans from the given
@@ -56,25 +66,17 @@
this.direction = spanExpansionQuery.getDirection();
this.classNumber = spanExpansionQuery.getClassNumber();
- candidateSpans = new TreeSet<CandidateSpan>();
- hasMoreSpans = true;
- if (direction < 0){
- hasMoreSpans = firstSpans.next();
- }
+ candidateSpans = new ArrayList<CandidateSpan>();
+ hasMoreSpans = firstSpans.next();
}
-
@Override
public boolean next () throws IOException {
matchPayload.clear();
isStartEnumeration = false;
- if (candidateSpans.size() == 0 && hasMoreSpans && direction >= 0) {
- hasMoreSpans = firstSpans.next();
- }
return advance();
}
-
/**
* Advances the ExpandedSpans to the next match by setting the
* first element
@@ -89,20 +91,22 @@
private boolean advance () throws IOException {
while (candidateSpans.size() > 0 || hasMoreSpans) {
if (candidateSpans.size() > 0) {
- CandidateSpan cs = candidateSpans.first();
+ CandidateSpan cs = candidateSpans.get(0);
setMatch(cs);
candidateSpans.remove(cs);
return true;
}
else {
setCandidateList();
-// log.debug(candidateSpans.toString());
+ Collections.sort(candidateSpans);
+ if (DEBUG) {
+ log.debug(candidateSpans.toString());
+ };
}
}
return false;
}
-
/**
* Sets the candidateList by adding new candidate match spans for
* all possible expansion with respect to the expansion length
@@ -112,24 +116,29 @@
*/
private void setCandidateList () throws IOException {
CandidateSpan cs;
- int counter, start, end;
+ int counter, start, end = 0;
if (direction < 0) { // left
counter = max;
while (counter >= min) {
- start = Math.max(0, firstSpans.start() - counter);
- cs = new CandidateSpan(start, firstSpans.end(),
- firstSpans.doc(), firstSpans.cost(),
- createPayloads(start, firstSpans.start()));
+ start = firstSpans.start() - counter;
+ if (start >= 0) {
+ cs = new CandidateSpan(start, firstSpans.end(),
+ firstSpans.doc(), firstSpans.cost(),
+ createPayloads(start, firstSpans.start()));
- candidateSpans.add(cs);
+ candidateSpans.add(cs);
+ }
counter--;
}
-
+
int lastPosition = firstSpans.start();
- if (hasMoreSpans && (hasMoreSpans = firstSpans.next()) ) {
+ if (hasMoreSpans && (hasMoreSpans = firstSpans.next())) {
start = Math.max(0, firstSpans.start() - max);
- log.debug("next candidate start: "+start+", lastPosition "+lastPosition);
+ if (DEBUG) {
+ log.debug("next candidate start: " + start + ", lastPosition "
+ + lastPosition);
+ };
if (start <= lastPosition) {
setCandidateList();
}
@@ -138,7 +147,8 @@
else {
counter = min;
while (counter <= max) {
- // TODO: How do I know if the end is already too far (over the end of the doc)?
+ // TODO: How do I know if the end is already too far
+ // (over the end of the doc)?
end = firstSpans.end() + counter;
cs = new CandidateSpan(firstSpans.start(), end,
firstSpans.doc(), firstSpans.cost(),
@@ -146,17 +156,29 @@
candidateSpans.add(cs);
counter++;
}
+
+ int lastPosition = end;
+ if (hasMoreSpans && (hasMoreSpans = firstSpans.next())) {
+ if (DEBUG) {
+ log.debug("next candidate start: " + firstSpans.start()
+ + ", lastPosition " + lastPosition);
+ };
+ if (firstSpans.start() <= lastPosition) {
+ setCandidateList();
+ }
+ }
}
}
-
/**
* Prepares the payloads for a candidate match (ExpandedSpans). If
* the class number is set, the extension offsets with the given
* start and end positions are to be stored in the payloads.
*
- * @param start start position
- * @param end end position
+ * @param start
+ * start position
+ * @param end
+ * end position
* @return the payloads for a candidaete match
* @throws IOException
*/
@@ -168,20 +190,21 @@
payload.addAll(firstSpans.getPayload());
}
if (classNumber > 0) {
- //System.out.println("Extension offsets "+start+","+end);
+ // System.out.println("Extension offsets "+start+","+end);
payload.add(createExtensionPayloads(start, end));
}
return payload;
}
-
/**
* Prepares a byte array of extension offsets with the given start
* and end positions and the class number, to be stored in
* payloads.
*
- * @param start start position
- * @param end end position
+ * @param start
+ * start position
+ * @param end
+ * end position
* @return a byte array of extension offsets and the class number
*/
private byte[] createExtensionPayloads (int start, int end) {
@@ -194,7 +217,6 @@
return buffer.array();
}
-
/**
* Sets the properties of the given candidate match span as the
* current match (state of ExpandedSpans).
@@ -209,7 +231,6 @@
matchCost = candidateSpan.getCost();
}
-
@Override
public boolean skipTo (int target) throws IOException {
if (hasMoreSpans && (firstSpans.doc() < target)) {
@@ -222,7 +243,6 @@
return advance();
}
-
@Override
public long cost () {
return matchCost;
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
index 5e21d39..ab5672b 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
@@ -118,7 +118,9 @@
// Forward firstspan
hasMoreFirstSpan = firstSpans.next();
if (hasMoreFirstSpan){
- log.debug("FirstSpan "+firstSpans.start() +","+firstSpans.end());
+ if (DEBUG) {
+ log.debug("FirstSpan "+firstSpans.start() +","+firstSpans.end());
+ }
setMatchList();
}
else {
@@ -172,7 +174,9 @@
* @throws IOException
*/
private void searchCandidates () throws IOException {
- log.debug(candidateList.toString());
+ if (DEBUG) {
+ log.debug(candidateList.toString());
+ };
Iterator<CandidateSpan> i = candidateList.iterator();
CandidateSpan cs;
while (i.hasNext()) {
@@ -203,7 +207,9 @@
private void searchMatches () throws IOException {
while (hasMoreSpans && candidateListDocNum == secondSpans.doc()) {
- log.debug("SecondSpan " +secondSpans.start() + "," + secondSpans.end());
+ if (DEBUG) {
+ log.debug("SecondSpan " +secondSpans.start() + "," + secondSpans.end());
+ };
if (secondSpans.start() > firstSpans.end()) {
break;
}
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestSpanExpansionIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestSpanExpansionIndex.java
index 9ab5c62..71bd0a7 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestSpanExpansionIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestSpanExpansionIndex.java
@@ -97,16 +97,10 @@
assertEquals(7, kr.getMatch(2).getStartPos());
assertEquals(8, kr.getMatch(2).getEndPos());
- /*
- for (Match km : kr.getMatches()) {
- System.out.println(km.getStartPos() + "," + km.getEndPos() + " " +
- km.getSnippetBrackets()); }
- */
-
// right
seq = new SpanExpansionQuery(stq, 3, 4, 0, true);
kr = ki.search(seq, (short) 10);
-
+
assertEquals(7, kr.getMatch(0).getStartPos());
assertEquals(11, kr.getMatch(0).getEndPos());
assertEquals(7, kr.getMatch(1).getStartPos());
@@ -267,9 +261,9 @@
SpanExpansionQuery seq = new SpanExpansionQuery(stq, 2, 2, -1, true);
kr = ki.search(seq, (short) 10);
- assertEquals((long) 4, kr.getTotalResults());
- assertEquals(0, kr.getMatch(0).getStartPos());
- assertEquals(2, kr.getMatch(0).getEndPos());
+ assertEquals((long) 3, kr.getTotalResults());
+ assertEquals(2, kr.getMatch(0).getStartPos());
+ assertEquals(5, kr.getMatch(0).getEndPos());
// right expansion exceeds end position
seq = new SpanExpansionQuery(stq, 3, 3, 0, true);
@@ -592,8 +586,7 @@
}
@Test
- @Ignore
- public void indexExpansionMultipleStartsWithWrongSorting () throws IOException {
+ public void indexExpansionMultipleStartsWithCorrectSorting () throws IOException {
KrillIndex ki = new KrillIndex();
ki.addDoc(simpleFieldDoc("abccef"));
ki.commit();
@@ -606,42 +599,56 @@
seqR.toString());
Result kr = ki.search(seqR, (short) 20);
- /*
- for (Match km : kr.getMatches()) {
- System.out.println(km.getStartPos() + "," + km.getEndPos() + " " +
- km.getSnippetBrackets());
- };
- */
+// for (Match km : kr.getMatches()) {
+// System.out.println(km.getStartPos() + "," + km.getEndPos() + " " +
+// km.getSnippetBrackets());
+// };
// TODO: These are duplicate results that may be restricted with a wrapper
assertEquals("a[[bcc]]ef", kr.getMatch(3).getSnippetBrackets());
assertEquals("a[[bcc]]ef", kr.getMatch(4).getSnippetBrackets());
assertEquals(12, kr.getTotalResults());
+ }
- stq = new SpanTermQuery(new Term("base", "s:c"));
- seqL = new SpanExpansionQuery(stq, 0, 2, -1, true);
- seqR = new SpanExpansionQuery(seqL, 0, 2, 0, true);
+ @Test
+ public void testRightExpansionWithWrongSorting ()
+ throws IOException {
+ KrillIndex ki = new KrillIndex();
+ ki.addDoc(simpleFieldDoc("abccef"));
+ ki.commit();
+
+ SpanTermQuery stq = new SpanTermQuery(new Term("base", "s:c"));
+ SpanExpansionQuery seqL = new SpanExpansionQuery(stq, 0, 2, -1, true);
+ kr = ki.search(seqL, (short) 20);
+// for (Match km : kr.getMatches()) {
+// System.out.println(km.getStartPos() + "," + km.getEndPos() + " " +
+// km.getSnippetBrackets());
+// };
+
+ SpanExpansionQuery seqR = new SpanExpansionQuery(seqL, 0, 2, 0, true);
assertEquals(
"spanExpansion(spanExpansion(base:s:c, []{0, 2}, left), []{0, 2}, right)",
seqR.toString());
kr = ki.search(seqR, (short) 20);
- /*
- for (Match km : kr.getMatches()) {
- System.out.println(km.getStartPos() + "," + km.getEndPos() + " " +
- km.getSnippetBrackets());
- };
- */
+
+// for (Match km : kr.getMatches()) {
+// System.out.println(km.getStartPos() + "," + km.getEndPos() + " " +
+// km.getSnippetBrackets());
+// };
+
assertEquals("a[[bcc]]ef", kr.getMatch(5).getSnippetBrackets());
assertEquals("a[[bcce]]f", kr.getMatch(6).getSnippetBrackets());
assertEquals(18, kr.getTotalResults());
}
+ @Test
public void testLeftExpansionWrongSorting () throws IOException {
KrillIndex ki = new KrillIndex();
ki.addDoc(simpleFieldDoc("B u d B R a d m d Z z s B d v", " "));
ki.commit();
+ // d positions: 2-3, 6-7, 8-9, 13-14
SpanTermQuery stq = new SpanTermQuery(new Term("base", "s:d"));
SpanExpansionQuery seq = new SpanExpansionQuery(stq, 0, 8, -1, true);
@@ -649,7 +656,7 @@
// for (Match km : kr.getMatches()){
// System.out.println(km.getStartPos() +","+km.getEndPos()+" "
// +km.getSnippetBrackets()); }
- // 2-3, 6-7, 8-9, 13-14
+
assertEquals("BudBR[[admdZzsBd]]v", kr.getMatch(15).getSnippetBrackets());
assertEquals(28, kr.getTotalResults());
}
@@ -668,6 +675,11 @@
SpanExpansionQuery seq = new SpanExpansionQuery(stq, 0, 6, -1, true);
Result kr = ki.search(seq, (short) 20);
+// for (Match km : kr.getMatches()) {
+// System.out.println(km.getStartPos() + "," + km.getEndPos() + " " +
+// km.getSnippetBrackets());
+// };
+
Match m = kr.getMatch(5);
assertEquals(2, m.getStartPos());
assertEquals(9, m.getEndPos());