Fix pagebreak retrieval when doc is lower than match doc number
Change-Id: I6255b44c523c7fea1656c78a1aa18db1febfc4c3
diff --git a/Changes b/Changes
index 74018eb..9d4d610 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+0.61.2 2023-03-28
+ - [bugfix] Fix pagebreak retrieval (margaretha, diewald)
+
0.61.1 2023-02-14
- [bugfix] Fixed ensuring same documents of spans (solved #87,
margaretha)
diff --git a/pom.xml b/pom.xml
index 3198f40..04925b9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -35,7 +35,7 @@
<groupId>de.ids_mannheim.korap</groupId>
<artifactId>Krill</artifactId>
- <version>0.61.1</version>
+ <version>0.61.2</version>
<packaging>jar</packaging>
<name>Krill</name>
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index 8c1498e..a40a7a6 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -501,7 +501,6 @@
};
};
-
public void addPagebreak (int start, int pagenumber) {
this.addHighlight(new Highlight(start, pagenumber));
};
@@ -864,11 +863,13 @@
int charOffset = 0, pagenumber = 0, start = 0;
- if (DEBUG)
+ if (DEBUG) {
+ log.debug("=================================");
log.debug("Retrieve pagebreaks between {}-{}",
this.getStartPos(),
this.getEndPos());
-
+ };
+
try {
// Store character offsets in ByteBuffer
@@ -890,19 +891,26 @@
while (pagebreakSpans.next() == true) {
if (DEBUG) {
- log.debug("There is a pagebreak at {}/{}",
+ log.debug("There is a pagebreak at {}/{} and we are at {}",
pagebreakSpans.doc(),
- pagebreakSpans.start());
+ pagebreakSpans.start(),
+ this.localDocID);
};
// Current pagebreak is not in the correct document
- if (pagebreakSpans.doc() != this.localDocID) {
- pagebreakSpans.skipTo(this.localDocID);
-
- // No pagebreaks in this document
- if (pagebreakSpans.doc() != this.localDocID)
- break;
- };
+ if (pagebreakSpans.doc() != this.localDocID) {
+ if (pagebreakSpans.doc() < this.localDocID) {
+ pagebreakSpans.skipTo(this.localDocID);
+
+ // No pagebreaks in this document
+ if (pagebreakSpans.doc() != this.localDocID)
+ break;
+ }
+ else {
+ break;
+ };
+ continue;
+ };
if (DEBUG)
log.debug("The pagebreak occurs in the document");
@@ -911,16 +919,18 @@
// if it is in the correct area
if (pagebreakSpans.start() <= this.getStartPos()) {
- if (DEBUG)
- log.debug("PB start position is before match at {}",
- pagebreakSpans.start());
-
// Only the first payload is relevant
b = pagebreakSpans.getPayload().iterator().next();
start = pagebreakSpans.start();
+
+ if (DEBUG)
+ log.debug("PB start position is before match at {}:{}",
+ pagebreakSpans.start(),
+ b);
+
}
- // This is the first pagebreak!
+ // This is the first pagebreak inside the match!
else {
// b is already defined!
@@ -937,6 +947,7 @@
// This is the first pagebreak!
pagebreaks.add(new int[]{charOffset, pagenumber});
+
if (start >= this.getStartPos()) {
if (DEBUG)
@@ -945,6 +956,7 @@
pagenumber);
this.addPagebreak(charOffset, pagenumber);
};
+ b = null;
}
// b wasn't used yet
@@ -963,17 +975,41 @@
// This is the first pagebreak!
pagebreaks.add(new int[]{charOffset, pagenumber});
this.addPagebreak(charOffset,pagenumber);
+ b = null;
}
// Pagebreak beyond the current position
else {
break;
};
-
- // Reset byte
- b = null;
};
};
+
+ if (b != null) {
+ bb.rewind();
+ bb.put(b);
+ bb.rewind();
+
+ pagenumber = bb.getInt();
+ charOffset = bb.getInt();
+
+ if (DEBUG)
+ log.debug("Add pagebreak to list: {}-{}", charOffset, pagenumber);
+
+ // This is a remembered pagebreak!
+ pagebreaks.add(new int[]{charOffset, pagenumber});
+
+ if (start >= this.getStartPos()) {
+
+ if (DEBUG)
+ log.debug("Add pagebreak to rendering: {}-{}",
+ charOffset,
+ pagenumber);
+ this.addPagebreak(charOffset, pagenumber);
+ };
+
+ b = null;
+ };
}
catch (Exception e) {
log.warn("Some problems with ByteBuffer: {}", e.getMessage());
@@ -988,7 +1024,6 @@
return pagebreaks;
};
-
// Expand the context to a span
public int[] expandContextToSpan (String element) {
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java
index 9c303da..93d7925 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java
@@ -1,33 +1,19 @@
package de.ids_mannheim.korap.index;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.fail;
import java.io.IOException;
import org.apache.lucene.index.Term;
-import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
-import org.junit.Ignore;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
-import de.ids_mannheim.korap.KrillCollection;
-import de.ids_mannheim.korap.Krill;
import de.ids_mannheim.korap.KrillIndex;
import de.ids_mannheim.korap.query.QueryBuilder;
-import de.ids_mannheim.korap.query.SpanClassQuery;
-import de.ids_mannheim.korap.query.SpanElementQuery;
-import de.ids_mannheim.korap.query.SpanFocusQuery;
-import de.ids_mannheim.korap.query.SpanNextQuery;
-import de.ids_mannheim.korap.query.SpanWithinQuery;
-import de.ids_mannheim.korap.query.QueryBuilder;
-import de.ids_mannheim.korap.query.wrap.SpanQueryWrapper;
-import de.ids_mannheim.korap.response.Match;
import de.ids_mannheim.korap.response.Result;
-import de.ids_mannheim.korap.response.SearchContext;
/*
* Retrieve pagebreak annotations
@@ -35,25 +21,75 @@
@RunWith(JUnit4.class)
public class TestPagebreakIndex {
+
+ private FieldDocument createFieldDoc0 () {
+ // abcde
+ FieldDocument fd = new FieldDocument();
+ fd.addTV("tokens", "abcde",
+ "[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>5]" +
+ "[(1-2)s:b|i:b|_1$<i>1<i>2]" +
+ "[(2-3)s:c|i:c|_2$<i>2<i>3]" +
+ "[(3-4)s:a|i:d|_3$<i>3<i>4]" +
+ "[(4-5)s:b|i:e|_4$<i>4<i>5]"
+ );
+ return fd;
+ }
+
+ private FieldDocument createFieldDoc1 () {
+ // abcabcabac
+ FieldDocument fd = new FieldDocument();
+ fd.addTV("tokens", "abcabcabac",
+ "[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>10|~:base/s:pb$<i>528<i>0]" +
+ "[(1-2)s:b|i:b|_1$<i>1<i>2]" +
+ "[(2-3)s:c|i:c|_2$<i>2<i>3]" +
+ "[(3-4)s:a|i:a|_3$<i>3<i>4]" +
+ "[(4-5)s:b|i:b|_4$<i>4<i>5]" +
+ "[(5-6)s:c|i:c|_5$<i>5<i>6|~:base/s:pb$<i>529<i>5]" +
+ "[(6-7)s:a|i:a|_6$<i>6<i>7]" +
+ "[(7-8)s:b|i:b|_7$<i>7<i>8]" +
+ "[(8-9)s:a|i:a|_8$<i>8<i>9|~:base/s:pb$<i>530<i>8]" +
+ "[(9-10)s:c|i:c|_9$<i>9<i>10]");
+ return fd;
+ }
+
+ @Test
+ public void testPageBreakDocLowerThanLocalDocId () throws IOException {
+ KrillIndex ki = new KrillIndex();
+ ki.addDoc(createFieldDoc0());
+ ki.addDoc(createFieldDoc1());
+ ki.commit();
+
+ SpanTermQuery sq = new SpanTermQuery(new Term("tokens", "s:c"));
+ Result kr = ki.search(sq, (short) 10);
+ assertEquals(4, kr.getMatches().size());
+
+ assertEquals(2, kr.getMatch(0).getStartPos());
+ assertEquals(3, kr.getMatch(0).getEndPos());
+ assertEquals(-1, kr.getMatch(0).getStartPage());
+ assertEquals(-1, kr.getMatch(0).getEndPage());
+
+ assertEquals(2, kr.getMatch(1).getStartPos());
+ assertEquals(3, kr.getMatch(1).getEndPos());
+ assertEquals(528, kr.getMatch(1).getStartPage());
+ assertEquals(-1, kr.getMatch(1).getEndPage());
+
+ assertEquals(5, kr.getMatch(2).getStartPos());
+ assertEquals(6, kr.getMatch(2).getEndPos());
+ assertEquals(529, kr.getMatch(2).getStartPage());
+ assertEquals(-1, kr.getMatch(2).getEndPage());
+
+ assertEquals(9, kr.getMatch(3).getStartPos());
+ assertEquals(10, kr.getMatch(3).getEndPos());
+ assertEquals(530, kr.getMatch(3).getStartPage());
+ assertEquals(-1, kr.getMatch(3).getEndPage());
+ };
@Test
public void indexExample1 () throws Exception {
KrillIndex ki = new KrillIndex();
// abcabcabac
- FieldDocument fd = new FieldDocument();
- fd.addTV("tokens", "abcabcabac",
- "[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>10|~:base/s:pb$<i>528<i>0]" +
- "[(1-2)s:b|i:b|_1$<i>1<i>2]" +
- "[(2-3)s:c|i:c|_2$<i>2<i>3]" +
- "[(3-4)s:a|i:a|_3$<i>3<i>4]" +
- "[(4-5)s:b|i:b|_4$<i>4<i>5]" +
- "[(5-6)s:c|i:c|_5$<i>5<i>6|~:base/s:pb$<i>529<i>5]" +
- "[(6-7)s:a|i:a|_6$<i>6<i>7]" +
- "[(7-8)s:b|i:b|_7$<i>7<i>8]" +
- "[(8-9)s:a|i:a|_8$<i>8<i>9|~:base/s:pb$<i>530<i>8]" +
- "[(9-10)s:c|i:c|_9$<i>9<i>10]");
- ki.addDoc(fd);
+ ki.addDoc(createFieldDoc1());
ki.commit();
SpanQuery sq;
@@ -88,8 +124,6 @@
"</span>",
kr.getMatch(0).getSnippetHTML());
- /*
-
QueryBuilder qb = new QueryBuilder("tokens");
sq = qb.seq().append(
qb.repeat(
@@ -106,6 +140,7 @@
assertEquals(528, kr.getMatch(0).getStartPage());
assertEquals(529, kr.getMatch(0).getEndPage());
+
assertEquals(
"snippetHTML",
"<span class=\"context-left\"></span>"+
@@ -121,6 +156,5 @@
"bac"+
"</span>",
kr.getMatch(0).getSnippetHTML());
- */
};
};