Fix bug in NextSpans that accepts entries in matchList in a different document

Change-Id: I7f6ee346efc2180e042bcc6d8395ef3aafd17554
diff --git a/Changes b/Changes
index 9b1885f..1137eda 100644
--- a/Changes
+++ b/Changes
@@ -1,9 +1,12 @@
-0.58.1 2018-10-31
+0.58.1 2018-11-02
     - [bugfix] Security upgrade of Jackson for CVE-2017-17485 and
       CVE-2018-7489 (diewald)
     - [bugfix] Span expansion with negation (margaretha)
     - [bugfix] OpenJDK8u181-workaround (see Debian Bug report #911925)
       (diewald)
+    - [feature] Helper methods for fuzzing (diewald)
+    - [bugfix] Remove entries from matchList that are not in the same
+      document in NextSpans (diewald)
 
 0.58.0 2018-09-03
     - [feature] Implemented referencing cached collection (margaretha)
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
index 96eeceb..9f71154 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
@@ -11,6 +11,9 @@
 import org.apache.lucene.index.TermContext;
 import org.apache.lucene.util.Bits;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import de.ids_mannheim.korap.query.SpanNextQuery;
 
 /**
@@ -26,6 +29,12 @@
  */
 public class NextSpans extends SimpleSpans {
 
+    // Logger
+    private final Logger log = LoggerFactory.getLogger(NextSpans.class);
+
+    // This advices the java compiler to ignore all loggings
+    public static final boolean DEBUG = false;
+
     private List<CandidateSpan> matchList;
     private List<CandidateSpan> candidateList;
     private int candidateListDocNum;
@@ -71,13 +80,24 @@
      * @throws IOException
      */
     private boolean advance () throws IOException {
-
+       
         while (hasMoreSpans || !matchList.isEmpty()
                 || !candidateList.isEmpty()) {
-            if (!matchList.isEmpty()) {
+
+            // Check, if the matchlist is fine
+            // It may be enough to clear it though
+            while (!matchList.isEmpty() && matchList.get(0).getDoc() != firstSpans.doc()) {
+                matchList.remove(0);
+                if (DEBUG) {
+                    log.debug("Remove first entry from matchlist because it's not in the same doc");
+                };
+            };
+
+            if (!matchList.isEmpty()) {                
                 matchDocNumber = firstSpans.doc();
                 matchStartPosition = firstSpans.start();
                 matchEndPosition = matchList.get(0).getEnd();
+                
                 spanId = matchList.get(0).getSpanId();
                 if (collectPayloads)
                     matchPayload.addAll(matchList.get(0).getPayloads());
@@ -112,6 +132,16 @@
         else {
             candidateList.clear();
             if (hasMoreSpans && ensureSameDoc(firstSpans, secondSpans)) {
+                if (DEBUG) {
+                    log.debug("First and second span now in same doc: {}-{} and {}-{} in {}={}",
+                              firstSpans.start(),
+                              firstSpans.end(),
+                              secondSpans.start(),
+                              secondSpans.end(),
+                              firstSpans.doc(),
+                              secondSpans.doc()
+                        );
+                }
                 candidateListDocNum = firstSpans.doc();
                 searchMatches();
             }
@@ -133,6 +163,7 @@
         CandidateSpan cs;
         while (i.hasNext()) {
             cs = i.next();
+
             if (cs.getStart() == firstSpans.end()) {
                 addMatch(cs);
             }
@@ -161,6 +192,14 @@
                 break;
             }
             if (secondSpans.start() == firstSpans.end()) {
+                
+                if (DEBUG) {
+                    log.debug("Check adjacency at {}-{}|{}-{} in {}={}={}",
+                              firstSpans.start(), firstSpans.end(),
+                              secondSpans.start(), secondSpans.end(),
+                              firstSpans.doc(), secondSpans.doc(), candidateListDocNum);
+                };
+                
                 candidateList.add(new CandidateSpan(secondSpans));
                 addMatch(new CandidateSpan(secondSpans));
             }
@@ -201,10 +240,23 @@
     @Override
     public boolean skipTo (int target) throws IOException {
         if (hasMoreSpans && (firstSpans.doc() < target)) {
+
             if (!firstSpans.skipTo(target)) {
                 hasMoreSpans = false;
                 return false;
-            }
+            };
+
+            if (DEBUG) {
+                log.debug("Skip firstSpans to {}={} succeed with positions {}-{}",
+                          target,
+                          firstSpans.doc(),
+                          firstSpans.start(),
+                          firstSpans.end());
+                log.debug("secondSpans is at positions {}-{} at {}",
+                          secondSpans.start(),
+                          secondSpans.end(),
+                          secondSpans.doc());
+            };
         }
         matchPayload.clear();
         return advance();
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestRepetitionIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestRepetitionIndex.java
index 7c553e9..7fde2f9 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestRepetitionIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestRepetitionIndex.java
@@ -298,7 +298,7 @@
     };
 
     @Test
-    public void testRepetitionSnippetBug () throws IOException, QueryException {
+    public void testRepetitionSnippetBug1 () throws IOException, QueryException {
         // Construct index
         Pattern p = Pattern.compile("bccc?d");
         
@@ -331,24 +331,14 @@
 
         // First fuzzed failure (0 vs 1)
         ki = new KrillIndex();
-        ki.addDoc(simpleFieldDoc("cccd"));
-        ki.addDoc(simpleFieldDoc("bccccccaeae"));
-        ki.addDoc(simpleFieldDoc("cbcedb"));
+        ki.addDoc(simpleFieldDoc("cccd"));        // 0
+        ki.addDoc(simpleFieldDoc("bccccccaeae")); // 1
+        ki.addDoc(simpleFieldDoc("cbcedb"));      // 2
 
         ki.commit();
         kr = ks.apply(ki);
         assertEquals(0,kr.getTotalResults());
 
-        // Second fuzzed failure (1 vs 0)
-        ki = new KrillIndex();
-        ki.addDoc(simpleFieldDoc("cdddbc"));
-        ki.addDoc(simpleFieldDoc("bccc"));
-        ki.addDoc(simpleFieldDoc("cbcccd"));
-
-        ki.commit();
-        kr = ks.apply(ki);
-        assertEquals(1,kr.getTotalResults());
-            
         // Third fuzzed failure (1 vs 2)
         ki = new KrillIndex();
         ki.addDoc(simpleFieldDoc("bccdcb"));
@@ -360,6 +350,41 @@
         assertEquals(1,kr.getTotalResults());
     };
 
+    @Test
+    public void testRepetitionSnippetBug2 () throws IOException, QueryException {
+        // Construct index
+        Pattern p = Pattern.compile("bccc?d");
+        
+        QueryBuilder qb = new QueryBuilder("base");
+
+        // b c{2,3} d
+        SpanQuery sq = qb.seq(
+            qb.seg("s:b")
+            ).append(
+                qb.repeat(qb.seg("s:c"),2,3)
+                ).append(
+                    qb.seg("s:d")
+                    ).toQuery();
+        
+        Krill ks = new Krill(sq);
+
+        assertEquals(ks.getSpanQuery().toString(),
+                     "spanNext(spanNext(base:s:b, spanRepetition(base:s:c{2,3})), base:s:d)");
+
+        // fuzzingRepetitionBug();
+
+        // Second fuzzed failure (1 vs 0)
+        ki = new KrillIndex();
+        ki.addDoc(simpleFieldDoc("cdddbc"));
+        ki.addDoc(simpleFieldDoc("bccc"));
+        ki.addDoc(simpleFieldDoc("cbcccd"));
+
+        ki.commit();
+        kr = ks.apply(ki);
+        assertEquals(1,kr.getTotalResults());
+    };
+
+    
 
     /**
      * This method creates a corpus using fuzzing to