Fix bug in NextSpans that accepts entries in matchList in a different document
Change-Id: I7f6ee346efc2180e042bcc6d8395ef3aafd17554
diff --git a/Changes b/Changes
index 9b1885f..1137eda 100644
--- a/Changes
+++ b/Changes
@@ -1,9 +1,12 @@
-0.58.1 2018-10-31
+0.58.1 2018-11-02
- [bugfix] Security upgrade of Jackson for CVE-2017-17485 and
CVE-2018-7489 (diewald)
- [bugfix] Span expansion with negation (margaretha)
- [bugfix] OpenJDK8u181-workaround (see Debian Bug report #911925)
(diewald)
+ - [feature] Helper methods for fuzzing (diewald)
+ - [bugfix] Remove entries from matchList that are not in the same
+ document in NextSpans (diewald)
0.58.0 2018-09-03
- [feature] Implemented referencing cached collection (margaretha)
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
index 96eeceb..9f71154 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
@@ -11,6 +11,9 @@
import org.apache.lucene.index.TermContext;
import org.apache.lucene.util.Bits;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import de.ids_mannheim.korap.query.SpanNextQuery;
/**
@@ -26,6 +29,12 @@
*/
public class NextSpans extends SimpleSpans {
+ // Logger
+ private final Logger log = LoggerFactory.getLogger(NextSpans.class);
+
+ // This advices the java compiler to ignore all loggings
+ public static final boolean DEBUG = false;
+
private List<CandidateSpan> matchList;
private List<CandidateSpan> candidateList;
private int candidateListDocNum;
@@ -71,13 +80,24 @@
* @throws IOException
*/
private boolean advance () throws IOException {
-
+
while (hasMoreSpans || !matchList.isEmpty()
|| !candidateList.isEmpty()) {
- if (!matchList.isEmpty()) {
+
+ // Check, if the matchlist is fine
+ // It may be enough to clear it though
+ while (!matchList.isEmpty() && matchList.get(0).getDoc() != firstSpans.doc()) {
+ matchList.remove(0);
+ if (DEBUG) {
+ log.debug("Remove first entry from matchlist because it's not in the same doc");
+ };
+ };
+
+ if (!matchList.isEmpty()) {
matchDocNumber = firstSpans.doc();
matchStartPosition = firstSpans.start();
matchEndPosition = matchList.get(0).getEnd();
+
spanId = matchList.get(0).getSpanId();
if (collectPayloads)
matchPayload.addAll(matchList.get(0).getPayloads());
@@ -112,6 +132,16 @@
else {
candidateList.clear();
if (hasMoreSpans && ensureSameDoc(firstSpans, secondSpans)) {
+ if (DEBUG) {
+ log.debug("First and second span now in same doc: {}-{} and {}-{} in {}={}",
+ firstSpans.start(),
+ firstSpans.end(),
+ secondSpans.start(),
+ secondSpans.end(),
+ firstSpans.doc(),
+ secondSpans.doc()
+ );
+ }
candidateListDocNum = firstSpans.doc();
searchMatches();
}
@@ -133,6 +163,7 @@
CandidateSpan cs;
while (i.hasNext()) {
cs = i.next();
+
if (cs.getStart() == firstSpans.end()) {
addMatch(cs);
}
@@ -161,6 +192,14 @@
break;
}
if (secondSpans.start() == firstSpans.end()) {
+
+ if (DEBUG) {
+ log.debug("Check adjacency at {}-{}|{}-{} in {}={}={}",
+ firstSpans.start(), firstSpans.end(),
+ secondSpans.start(), secondSpans.end(),
+ firstSpans.doc(), secondSpans.doc(), candidateListDocNum);
+ };
+
candidateList.add(new CandidateSpan(secondSpans));
addMatch(new CandidateSpan(secondSpans));
}
@@ -201,10 +240,23 @@
@Override
public boolean skipTo (int target) throws IOException {
if (hasMoreSpans && (firstSpans.doc() < target)) {
+
if (!firstSpans.skipTo(target)) {
hasMoreSpans = false;
return false;
- }
+ };
+
+ if (DEBUG) {
+ log.debug("Skip firstSpans to {}={} succeed with positions {}-{}",
+ target,
+ firstSpans.doc(),
+ firstSpans.start(),
+ firstSpans.end());
+ log.debug("secondSpans is at positions {}-{} at {}",
+ secondSpans.start(),
+ secondSpans.end(),
+ secondSpans.doc());
+ };
}
matchPayload.clear();
return advance();
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestRepetitionIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestRepetitionIndex.java
index 7c553e9..7fde2f9 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestRepetitionIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestRepetitionIndex.java
@@ -298,7 +298,7 @@
};
@Test
- public void testRepetitionSnippetBug () throws IOException, QueryException {
+ public void testRepetitionSnippetBug1 () throws IOException, QueryException {
// Construct index
Pattern p = Pattern.compile("bccc?d");
@@ -331,24 +331,14 @@
// First fuzzed failure (0 vs 1)
ki = new KrillIndex();
- ki.addDoc(simpleFieldDoc("cccd"));
- ki.addDoc(simpleFieldDoc("bccccccaeae"));
- ki.addDoc(simpleFieldDoc("cbcedb"));
+ ki.addDoc(simpleFieldDoc("cccd")); // 0
+ ki.addDoc(simpleFieldDoc("bccccccaeae")); // 1
+ ki.addDoc(simpleFieldDoc("cbcedb")); // 2
ki.commit();
kr = ks.apply(ki);
assertEquals(0,kr.getTotalResults());
- // Second fuzzed failure (1 vs 0)
- ki = new KrillIndex();
- ki.addDoc(simpleFieldDoc("cdddbc"));
- ki.addDoc(simpleFieldDoc("bccc"));
- ki.addDoc(simpleFieldDoc("cbcccd"));
-
- ki.commit();
- kr = ks.apply(ki);
- assertEquals(1,kr.getTotalResults());
-
// Third fuzzed failure (1 vs 2)
ki = new KrillIndex();
ki.addDoc(simpleFieldDoc("bccdcb"));
@@ -360,6 +350,41 @@
assertEquals(1,kr.getTotalResults());
};
+ @Test
+ public void testRepetitionSnippetBug2 () throws IOException, QueryException {
+ // Construct index
+ Pattern p = Pattern.compile("bccc?d");
+
+ QueryBuilder qb = new QueryBuilder("base");
+
+ // b c{2,3} d
+ SpanQuery sq = qb.seq(
+ qb.seg("s:b")
+ ).append(
+ qb.repeat(qb.seg("s:c"),2,3)
+ ).append(
+ qb.seg("s:d")
+ ).toQuery();
+
+ Krill ks = new Krill(sq);
+
+ assertEquals(ks.getSpanQuery().toString(),
+ "spanNext(spanNext(base:s:b, spanRepetition(base:s:c{2,3})), base:s:d)");
+
+ // fuzzingRepetitionBug();
+
+ // Second fuzzed failure (1 vs 0)
+ ki = new KrillIndex();
+ ki.addDoc(simpleFieldDoc("cdddbc"));
+ ki.addDoc(simpleFieldDoc("bccc"));
+ ki.addDoc(simpleFieldDoc("cbcccd"));
+
+ ki.commit();
+ kr = ks.apply(ki);
+ assertEquals(1,kr.getTotalResults());
+ };
+
+
/**
* This method creates a corpus using fuzzing to