Fix pagebreak retrieval when the pagebreak's doc number is lower than the match's doc number

Change-Id: I6255b44c523c7fea1656c78a1aa18db1febfc4c3
diff --git a/Changes b/Changes
index 74018eb..9d4d610 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+0.61.2 2023-03-28
+    - [bugfix] Fix pagebreak retrieval (margaretha, diewald)
+
 0.61.1 2023-02-14
     - [bugfix] Fixed ensuring same documents of spans (solved #87, 
     margaretha)
diff --git a/pom.xml b/pom.xml
index 3198f40..04925b9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -35,7 +35,7 @@
 
   <groupId>de.ids_mannheim.korap</groupId>
   <artifactId>Krill</artifactId>
-  <version>0.61.1</version>
+  <version>0.61.2</version>
   <packaging>jar</packaging>
 
   <name>Krill</name>
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index 8c1498e..a40a7a6 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -501,7 +501,6 @@
 		};
     };
 
-
 	public void addPagebreak (int start, int pagenumber) {
 		this.addHighlight(new Highlight(start, pagenumber));
 	};
@@ -864,11 +863,13 @@
 
 		int charOffset = 0, pagenumber = 0, start = 0;
 
-		if (DEBUG)
+		if (DEBUG) {
+            log.debug("=================================");
 			log.debug("Retrieve pagebreaks between {}-{}",
 					  this.getStartPos(),
 					  this.getEndPos());
-
+        };
+        
 		try {
 
             // Store character offsets in ByteBuffer
@@ -890,19 +891,26 @@
 			while (pagebreakSpans.next() == true) {
 
 				if (DEBUG) {
-					log.debug("There is a pagebreak at {}/{}",
+					log.debug("There is a pagebreak at {}/{} and we are at {}",
 							  pagebreakSpans.doc(),
-							  pagebreakSpans.start());
+							  pagebreakSpans.start(),
+                              this.localDocID);
 				};
 				
 				// Current pagebreak is not in the correct document
-				if (pagebreakSpans.doc() != this.localDocID) {
-					pagebreakSpans.skipTo(this.localDocID);
-
-					// No pagebreaks in this document
-					if (pagebreakSpans.doc() != this.localDocID)
-						break;
-				};
+                if (pagebreakSpans.doc() != this.localDocID) {
+                    if (pagebreakSpans.doc() < this.localDocID) {
+                        pagebreakSpans.skipTo(this.localDocID);
+                        
+                        // No pagebreaks in this document
+                        if (pagebreakSpans.doc() != this.localDocID)
+                            break;
+                    }
+                    else {
+                        break;
+                    };
+                    continue;
+                };
 
 				if (DEBUG)
 					log.debug("The pagebreak occurs in the document");
@@ -911,16 +919,18 @@
 				// if it is in the correct area
 				if (pagebreakSpans.start() <= this.getStartPos()) {
 
-					if (DEBUG)
-						log.debug("PB start position is before match at {}",
-								  pagebreakSpans.start());
-					
 					// Only the first payload is relevant
 					b = pagebreakSpans.getPayload().iterator().next();
 					start = pagebreakSpans.start();
+
+                    if (DEBUG)
+						log.debug("PB start position is before match at {}:{}",
+								  pagebreakSpans.start(),
+                                  b);
+					
 				}
 
-				// This is the first pagebreak!
+				// This is the first pagebreak inside the match!
 				else {
 
 					// b is already defined!
@@ -937,6 +947,7 @@
 						
 						// This is the first pagebreak!
 						pagebreaks.add(new int[]{charOffset, pagenumber});
+                        
 						if (start >= this.getStartPos()) {
 
 							if (DEBUG)
@@ -945,6 +956,7 @@
 										  pagenumber);
 							this.addPagebreak(charOffset, pagenumber);
 						};
+                        b = null;
 					}
 
 					// b wasn't used yet
@@ -963,17 +975,41 @@
 						// This is the first pagebreak!
 						pagebreaks.add(new int[]{charOffset, pagenumber});
 						this.addPagebreak(charOffset,pagenumber);
+                        b = null;
 					}
 
 					// Pagebreak beyond the current position
 					else {
 						break;
 					};
-
-					// Reset byte
-					b = null;
 				};
 			};
+
+            if (b != null) {
+                bb.rewind();
+                bb.put(b);
+                bb.rewind();
+
+                pagenumber = bb.getInt();
+                charOffset = bb.getInt();
+
+                if (DEBUG)
+                    log.debug("Add pagebreak to list: {}-{}", charOffset, pagenumber);
+						
+                // This is a remembered pagebreak!
+                pagebreaks.add(new int[]{charOffset, pagenumber});
+
+                if (start >= this.getStartPos()) {
+                                            
+                    if (DEBUG)
+                        log.debug("Add pagebreak to rendering: {}-{}",
+                                  charOffset,
+                                  pagenumber);
+                    this.addPagebreak(charOffset, pagenumber);
+                };
+
+                b = null;
+            };
 		}
 		catch (Exception e) {
 			log.warn("Some problems with ByteBuffer: {}", e.getMessage());
@@ -988,7 +1024,6 @@
 		return pagebreaks;
 	};
 
-
     // Expand the context to a span
     public int[] expandContextToSpan (String element) {
 
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java
index 9c303da..93d7925 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java
@@ -1,33 +1,19 @@
 package de.ids_mannheim.korap.index;
 
 import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.fail;
 
 import java.io.IOException;
 
 import org.apache.lucene.index.Term;
-import org.apache.lucene.search.spans.SpanOrQuery;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
-import org.junit.Ignore;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.JUnit4;
 
-import de.ids_mannheim.korap.KrillCollection;
-import de.ids_mannheim.korap.Krill;
 import de.ids_mannheim.korap.KrillIndex;
 import de.ids_mannheim.korap.query.QueryBuilder;
-import de.ids_mannheim.korap.query.SpanClassQuery;
-import de.ids_mannheim.korap.query.SpanElementQuery;
-import de.ids_mannheim.korap.query.SpanFocusQuery;
-import de.ids_mannheim.korap.query.SpanNextQuery;
-import de.ids_mannheim.korap.query.SpanWithinQuery;
-import de.ids_mannheim.korap.query.QueryBuilder;
-import de.ids_mannheim.korap.query.wrap.SpanQueryWrapper;
-import de.ids_mannheim.korap.response.Match;
 import de.ids_mannheim.korap.response.Result;
-import de.ids_mannheim.korap.response.SearchContext;
 
 /*
  * Retrieve pagebreak annotations
@@ -35,25 +21,75 @@
 
 @RunWith(JUnit4.class)
 public class TestPagebreakIndex {
+    
+    private FieldDocument createFieldDoc0 () {
+        // Fixture with primary text "abcde": five tokens, none carrying a pagebreak (~:base/s:pb) annotation; the s: terms are a b c a b, so s:c occurs once (token 2)
+           FieldDocument fd = new FieldDocument();
+           fd.addTV("tokens", "abcde",
+                    "[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>5]" +
+                    "[(1-2)s:b|i:b|_1$<i>1<i>2]" +
+                    "[(2-3)s:c|i:c|_2$<i>2<i>3]" +
+                    "[(3-4)s:a|i:d|_3$<i>3<i>4]" +
+                    "[(4-5)s:b|i:e|_4$<i>4<i>5]"
+                    );
+           return fd;
+       }
+    
+    private FieldDocument createFieldDoc1 () {
+     // Fixture with primary text "abcabcabac": ten tokens; pagebreak annotations (~:base/s:pb) sit on tokens 0, 5 and 8 with payloads 528/0, 529/5 and 530/8 (presumably page number / character offset -- TODO confirm)
+        FieldDocument fd = new FieldDocument();
+        fd.addTV("tokens", "abcabcabac",
+                 "[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>10|~:base/s:pb$<i>528<i>0]" +
+                 "[(1-2)s:b|i:b|_1$<i>1<i>2]" +
+                 "[(2-3)s:c|i:c|_2$<i>2<i>3]" +
+                 "[(3-4)s:a|i:a|_3$<i>3<i>4]" +
+                 "[(4-5)s:b|i:b|_4$<i>4<i>5]" +
+                 "[(5-6)s:c|i:c|_5$<i>5<i>6|~:base/s:pb$<i>529<i>5]" +
+                 "[(6-7)s:a|i:a|_6$<i>6<i>7]" +
+                 "[(7-8)s:b|i:b|_7$<i>7<i>8]" +
+                 "[(8-9)s:a|i:a|_8$<i>8<i>9|~:base/s:pb$<i>530<i>8]" +
+                 "[(9-10)s:c|i:c|_9$<i>9<i>10]");
+        return fd;
+    }
+    
+    @Test
+    public void testPageBreakDocLowerThanLocalDocId () throws IOException {
+        KrillIndex ki = new KrillIndex();
+        ki.addDoc(createFieldDoc0());
+        ki.addDoc(createFieldDoc1());
+        ki.commit();
+        // s:c occurs once in the first document (token 2) and three times in the second (tokens 2, 5 and 9)
+        SpanTermQuery sq = new SpanTermQuery(new Term("tokens", "s:c"));
+        Result kr = ki.search(sq, (short) 10);
+        assertEquals(4, kr.getMatches().size());
+        // Match 0 is expected in the first document, which has no pagebreak annotation: both pages are expected to be -1
+        assertEquals(2, kr.getMatch(0).getStartPos());
+		assertEquals(3, kr.getMatch(0).getEndPos());
+		assertEquals(-1, kr.getMatch(0).getStartPage());
+		assertEquals(-1, kr.getMatch(0).getEndPage());
+        // Match 1 is the first hit in the second document: the pagebreak annotated at token 0 (528) precedes it
+        assertEquals(2, kr.getMatch(1).getStartPos());
+		assertEquals(3, kr.getMatch(1).getEndPos());
+		assertEquals(528, kr.getMatch(1).getStartPage());
+		assertEquals(-1, kr.getMatch(1).getEndPage());
+        // Match 2 starts at token 5, the token on which pagebreak 529 is annotated
+        assertEquals(5, kr.getMatch(2).getStartPos());
+		assertEquals(6, kr.getMatch(2).getEndPos());
+		assertEquals(529, kr.getMatch(2).getStartPage());
+		assertEquals(-1, kr.getMatch(2).getEndPage());
+        // Match 3 starts at token 9, after the pagebreak 530 annotated at token 8
+        assertEquals(9, kr.getMatch(3).getStartPos());
+		assertEquals(10, kr.getMatch(3).getEndPos());
+		assertEquals(530, kr.getMatch(3).getStartPage());
+		assertEquals(-1, kr.getMatch(3).getEndPage());
+    };
 
     @Test
     public void indexExample1 () throws Exception {
 		KrillIndex ki = new KrillIndex();
 
 		// abcabcabac
-		FieldDocument fd = new FieldDocument();
-		fd.addTV("tokens", "abcabcabac",
-				 "[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>10|~:base/s:pb$<i>528<i>0]" +
-				 "[(1-2)s:b|i:b|_1$<i>1<i>2]" +
-				 "[(2-3)s:c|i:c|_2$<i>2<i>3]" +
-				 "[(3-4)s:a|i:a|_3$<i>3<i>4]" +
-				 "[(4-5)s:b|i:b|_4$<i>4<i>5]" +
-				 "[(5-6)s:c|i:c|_5$<i>5<i>6|~:base/s:pb$<i>529<i>5]" +
-				 "[(6-7)s:a|i:a|_6$<i>6<i>7]" +
-				 "[(7-8)s:b|i:b|_7$<i>7<i>8]" +
-				 "[(8-9)s:a|i:a|_8$<i>8<i>9|~:base/s:pb$<i>530<i>8]" +
-				 "[(9-10)s:c|i:c|_9$<i>9<i>10]");
-        ki.addDoc(fd);
+        ki.addDoc(createFieldDoc1());
         ki.commit();
 
 		SpanQuery sq;
@@ -88,8 +124,6 @@
 			"</span>",
 			kr.getMatch(0).getSnippetHTML());
 
-				/*
-
 		QueryBuilder qb = new QueryBuilder("tokens");
 		sq = qb.seq().append(
 			qb.repeat(
@@ -106,6 +140,7 @@
 		
 		assertEquals(528, kr.getMatch(0).getStartPage());
 		assertEquals(529, kr.getMatch(0).getEndPage());
+
 		assertEquals(
 			"snippetHTML",
 			"<span class=\"context-left\"></span>"+
@@ -121,6 +156,5 @@
 			"bac"+
 			"</span>",
 			kr.getMatch(0).getSnippetHTML());
-		*/
 	};
 };