Added pagebreak handling
Change-Id: I5478cc5f6f168cf96a2685076a3958242e57a9b0
diff --git a/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java b/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java
index 5303876..90ae2a2 100644
--- a/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java
+++ b/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java
@@ -57,7 +57,7 @@
private static short i, l;
// This advices the java compiler to ignore all loggings
- public static final boolean DEBUG = false;
+ public static final boolean DEBUG = false; // leave disabled; true switches on the logging guarded by this flag
private final Logger log = LoggerFactory
.getLogger(MultiTermTokenStream.class);
diff --git a/src/main/java/de/ids_mannheim/korap/index/MultiTermToken.java b/src/main/java/de/ids_mannheim/korap/index/MultiTermToken.java
index 68ba827..d4c9230 100644
--- a/src/main/java/de/ids_mannheim/korap/index/MultiTermToken.java
+++ b/src/main/java/de/ids_mannheim/korap/index/MultiTermToken.java
@@ -30,7 +30,7 @@
private boolean sorted = false;
// This advices the java compiler to ignore all loggings
- public static final boolean DEBUG = false;
+ public static final boolean DEBUG = false; // leave disabled; true switches on the logging guarded by this flag
private final Logger log = LoggerFactory
.getLogger(MultiTermTokenStream.class);
diff --git a/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java b/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java
index 50e39aa..458c536 100644
--- a/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java
+++ b/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java
@@ -48,7 +48,7 @@
.compile("\\[(?:\\([0-9]+-[0-9]+\\))?([^\\]]+?)\\]");
// This advices the java compiler to ignore all loggings
- public static final boolean DEBUG = false;
+ public static final boolean DEBUG = false; // leave disabled; true switches on the logging guarded by this flag
private final Logger log = LoggerFactory
.getLogger(MultiTermTokenStream.class);
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index fc46b62..c8a5ebd 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -40,7 +40,6 @@
The number based Highlighttype is ugly - UGLY!
substrings may be out of range - e.g. if snippets are not lifted!
-
*/
/**
@@ -92,6 +91,9 @@
int relationNumberCounter = 2048;
int identifierNumberCounter = -2;
+ private int startPage = -1;
+ private int endPage = -1;
+
private String tempSnippet, snippetHTML, snippetBrackets, identifier;
private HighlightCombinator snippetArray;
@@ -402,6 +404,24 @@
};
+    /**
+     * Get start page: first pagebreak collected by retrievePagebreaks, -1 if none.
+     */
+    @JsonIgnore
+    public int getStartPage () {
+        return startPage;
+    };
+
+
+    /**
+     * Get end page: last of several pagebreaks collected by retrievePagebreaks, else -1.
+     */
+    @JsonIgnore
+    public int getEndPage () {
+        return endPage;
+    };
+
+
/**
* Set document id.
*
@@ -645,12 +665,13 @@
// Retrieve pagebreaks in a certain area
+ // Convenience variant: delegates to the four-argument overload with the
+ // leaf reader of positionsToOffset, a null Bits argument and the "tokens" field.
public List<int[]> retrievePagebreaks (String pb) {
- if (this.positionsToOffset != null)
+ if (this.positionsToOffset != null) {
return this.retrievePagebreaks(
this.positionsToOffset.getLeafReader(),
(Bits) null,
"tokens", pb
);
+ };
+ // positionsToOffset is not set - nothing can be looked up, signalled by null
return null;
};
@@ -668,18 +689,27 @@
try {
// Store character offsets in ByteBuffer
- ByteBuffer bb = ByteBuffer.allocate(4);
+ ByteBuffer bb = ByteBuffer.allocate(16);
// Store last relevant pagebreak in byte array
byte[] b = null;
-
- Spans pagebreakSpans = new SpanTermQuery(new Term(field, pb)).getSpans(
+
+ SpanTermQuery stq = new SpanTermQuery(new Term(field, pb));
+
+ if (DEBUG)
+ log.trace("Check pagebreaks with {}", stq.toString());
+
+ Spans pagebreakSpans = stq.getSpans(
atomic, bitset, new HashMap<Term, TermContext>()
);
// Iterate over all pagebreaks
- while (pagebreakSpans.next()) {
+ while (pagebreakSpans.next() == true) {
+ if (DEBUG) {
+ log.debug("There is a pagebreak at {}", pagebreakSpans.doc());
+ };
+
// Current pagebreak is not in the correct document
if (pagebreakSpans.doc() != this.localDocID) {
pagebreakSpans.skipTo(this.localDocID);
@@ -689,10 +719,16 @@
break;
};
+ if (DEBUG)
+ log.debug("The pagebreak occurs in the document");
+
// There is a pagebreak found - check,
// if it is in the correct area
if (pagebreakSpans.start() <= this.getStartPos()) {
+ if (DEBUG)
+ log.debug("PB start position is before at {}", pagebreakSpans.start());
+
// Only the first payload is relevant
b = pagebreakSpans.getPayload().iterator().next();
}
@@ -704,6 +740,28 @@
if (b != null) {
bb.rewind();
bb.put(b);
+ bb.rewind();
+
+ if (DEBUG)
+ log.debug("Add pagebreak to list");
+
+ // This is the first pagebreak!
+ pagebreaks.add(
+ new int[]{
+ bb.getInt(),
+ bb.getInt()
+ });
+ }
+
+ // b wasn't used yet
+ else if (pagebreakSpans.start() <= this.getEndPos()) {
+
+ // Set new pagebreak
+ // Only the first payload is relevant
+ b = pagebreakSpans.getPayload().iterator().next();
+ bb.rewind();
+ bb.put(b);
+ bb.rewind();
// This is the first pagebreak!
pagebreaks.add(
@@ -711,27 +769,16 @@
bb.getInt(),
bb.getInt()
});
- };
- // Set new pagebreak
- // Only the first payload is relevant
- b = pagebreakSpans.getPayload().iterator().next();
- bb.rewind();
- bb.put(b);
-
- // This is the first pagebreak!
- pagebreaks.add(
- new int[]{
- bb.getInt(),
- bb.getInt()
- });
+ }
+
+ // Pagebreak beyond the current position
+ else {
+ break;
+ };
// Reset byte
b = null;
-
- // Pagebreak beyond the current position
- if (pagebreakSpans.start() > this.getEndPos())
- break;
};
};
}
@@ -739,6 +786,12 @@
log.warn("Some problems with ByteBuffer: {}", e.getMessage());
};
+ if (pagebreaks.size() > 0) {
+ this.startPage = pagebreaks.get(0)[0];
+ if (pagebreaks.size() > 1 && pagebreaks.get(pagebreaks.size()-1) != null)
+ this.endPage = pagebreaks.get(pagebreaks.size()-1)[0];
+ }
+
return pagebreaks;
};
@@ -1488,6 +1541,15 @@
if (this.version != null)
json.put("version", this.getVersion());
+ if (this.startPage != -1) {
+ ArrayNode pages = mapper.createArrayNode();
+ pages.add(this.startPage);
+ if (this.endPage != -1 && this.endPage != this.startPage)
+ pages.add(this.endPage);
+
+ json.put("pages", pages);
+ };
+
return json;
};
diff --git a/src/test/java/de/ids_mannheim/korap/TestIndexer.java b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
index ea2aea9..8b39d8f 100644
--- a/src/test/java/de/ids_mannheim/korap/TestIndexer.java
+++ b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
@@ -52,7 +52,7 @@
"-i",
"src/test/resources/bzk;src/test/resources/goe;src/test/resources/sgbr",
"-o", "test-index" });
- assertEquals("Indexed 3 files.", outputStream.toString());
+ assertEquals("Indexed 4 files.", outputStream.toString());
}
@Test
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java
new file mode 100644
index 0000000..ca4905b
--- /dev/null
+++ b/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java
@@ -0,0 +1,85 @@
+package de.ids_mannheim.korap.index;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import de.ids_mannheim.korap.KrillCollection;
+import de.ids_mannheim.korap.Krill;
+import de.ids_mannheim.korap.KrillIndex;
+import de.ids_mannheim.korap.query.QueryBuilder;
+import de.ids_mannheim.korap.query.SpanClassQuery;
+import de.ids_mannheim.korap.query.SpanElementQuery;
+import de.ids_mannheim.korap.query.SpanFocusQuery;
+import de.ids_mannheim.korap.query.SpanNextQuery;
+import de.ids_mannheim.korap.query.SpanWithinQuery;
+import de.ids_mannheim.korap.response.Match;
+import de.ids_mannheim.korap.response.Result;
+import de.ids_mannheim.korap.response.SearchContext;
+
+/**
+ * Checks the page information of matches in a document with pagebreaks.
+ */
+
+@RunWith(JUnit4.class)
+public class TestPagebreakIndex {
+
+    @Test
+    public void indexExample1 () throws IOException {
+        KrillIndex index = new KrillIndex();
+        // Text "abcabcabac"; tokens 0, 5 and 8 carry a "~:base/s:pb" term
+        String annotations =
+            "[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>10|~:base/s:pb$<i>528<i>0]" +
+            "[(1-2)s:b|i:b|_1$<i>1<i>2]" +
+            "[(2-3)s:c|i:c|_2$<i>2<i>3]" +
+            "[(3-4)s:a|i:a|_3$<i>3<i>4]" +
+            "[(4-5)s:b|i:b|_4$<i>4<i>5]" +
+            "[(5-6)s:c|i:c|_5$<i>5<i>6|~:base/s:pb$<i>529<i>5]" +
+            "[(6-7)s:a|i:a|_6$<i>6<i>7]" +
+            "[(7-8)s:b|i:b|_7$<i>7<i>8]" +
+            "[(8-9)s:a|i:a|_8$<i>8<i>9|~:base/s:pb$<i>530<i>8]" +
+            "[(9-10)s:c|i:c|_9$<i>9<i>10]";
+        FieldDocument document = new FieldDocument();
+        document.addTV("tokens", "abcabcabac", annotations);
+        index.addDoc(document);
+        index.commit();
+
+        SpanQuery query = new SpanTermQuery(new Term("tokens", "s:c"));
+        Result result = index.search(query, (short) 10);
+
+        // NOTE(review): all three markers expect data-after="528" although
+        // the pb payloads start with 528, 529 and 530 - confirm this is intended
+        String expectedSnippet =
+            "<span class=\"context-left\">"+
+            "<span class=\"pb\" data-after=\"528\"></span>"+
+            "ab"+
+            "</span>"+
+            "<span class=\"match\">"+
+            "<mark>"+
+            "c"+
+            "</mark>"+
+            "</span>"+
+            "<span class=\"context-right\">"+
+            "ab"+
+            "<span class=\"pb\" data-after=\"528\"></span>"+
+            "cab"+
+            "<span class=\"pb\" data-after=\"528\"></span>"+
+            "a"+
+            "<span class=\"more\">"+
+            "</span>"+
+            "</span>";
+        assertEquals(528, result.getMatch(0).getStartPage());
+        assertEquals(-1, result.getMatch(0).getEndPage());
+        assertEquals("snippetHTML", expectedSnippet, result.getMatch(0).getSnippetHTML());
+    };
+};
diff --git a/src/test/resources/goe/AGA-03828-pb.json.gz b/src/test/resources/goe/AGA-03828-pb.json.gz
new file mode 100644
index 0000000..4cd1a74
--- /dev/null
+++ b/src/test/resources/goe/AGA-03828-pb.json.gz
Binary files differ