Added pagebreak handling

Change-Id: I5478cc5f6f168cf96a2685076a3958242e57a9b0
diff --git a/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java b/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java
index 5303876..90ae2a2 100644
--- a/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java
+++ b/src/main/java/de/ids_mannheim/korap/index/MultiTerm.java
@@ -57,7 +57,7 @@
     private static short i, l;
 
     // This advices the java compiler to ignore all loggings
-    public static final boolean DEBUG = false;
+    public static final boolean DEBUG = false; // keep off in committed code
     private final Logger log = LoggerFactory
             .getLogger(MultiTermTokenStream.class);
 
diff --git a/src/main/java/de/ids_mannheim/korap/index/MultiTermToken.java b/src/main/java/de/ids_mannheim/korap/index/MultiTermToken.java
index 68ba827..d4c9230 100644
--- a/src/main/java/de/ids_mannheim/korap/index/MultiTermToken.java
+++ b/src/main/java/de/ids_mannheim/korap/index/MultiTermToken.java
@@ -30,7 +30,7 @@
     private boolean sorted = false;
 
     // This advices the java compiler to ignore all loggings
-    public static final boolean DEBUG = false;
+    public static final boolean DEBUG = false; // keep off in committed code
     private final Logger log = LoggerFactory
             .getLogger(MultiTermTokenStream.class);
 
diff --git a/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java b/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java
index 50e39aa..458c536 100644
--- a/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java
+++ b/src/main/java/de/ids_mannheim/korap/index/MultiTermTokenStream.java
@@ -48,7 +48,7 @@
             .compile("\\[(?:\\([0-9]+-[0-9]+\\))?([^\\]]+?)\\]");
 
     // This advices the java compiler to ignore all loggings
-    public static final boolean DEBUG = false;
+    public static final boolean DEBUG = false; // keep off in committed code
     private final Logger log = LoggerFactory
             .getLogger(MultiTermTokenStream.class);
 
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index fc46b62..c8a5ebd 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -40,7 +40,6 @@
   The number based Highlighttype is ugly - UGLY!
 
   substrings may be out of range - e.g. if snippets are not lifted!
-
 */
 
 /**
@@ -92,6 +91,9 @@
     int relationNumberCounter = 2048;
     int identifierNumberCounter = -2;
 
+	private int startPage = -1;
+	private int endPage = -1;
+	
     private String tempSnippet, snippetHTML, snippetBrackets, identifier;
 
     private HighlightCombinator snippetArray;
@@ -402,6 +404,24 @@
     };
 
 
+	/**
+	 * Get the start page: initially -1, assigned from the first retrieved pagebreak.
+	 */
+    @JsonIgnore
+	public int getStartPage () {
+		return this.startPage;
+	};
+
+	
+	/**
+	 * Get the end page: initially -1, assigned from the last pagebreak if more than one was retrieved.
+	 */
+    @JsonIgnore
+	public int getEndPage () {
+		return this.endPage;
+	};
+
+	
     /**
      * Set document id.
      * 
@@ -645,12 +665,13 @@
 	
 	// Retrieve pagebreaks in a certain area
 	public List<int[]> retrievePagebreaks (String pb) {
-		if (this.positionsToOffset != null)
+		if (this.positionsToOffset != null) {
 			return this.retrievePagebreaks(
 				this.positionsToOffset.getLeafReader(),
 				(Bits) null,
 				"tokens", pb
 				);
+		};
 
 		return null;
 	};
@@ -668,18 +689,27 @@
 		try {
 
             // Store character offsets in ByteBuffer
-            ByteBuffer bb = ByteBuffer.allocate(4);
+            ByteBuffer bb = ByteBuffer.allocate(16);
 
 			// Store last relevant pagebreak in byte array
 			byte[] b = null;
-			
-			Spans pagebreakSpans = new SpanTermQuery(new Term(field, pb)).getSpans(
+
+			SpanTermQuery stq = new SpanTermQuery(new Term(field, pb));
+
+			if (DEBUG)
+				log.trace("Check pagebreaks with {}", stq.toString());
+
+			Spans pagebreakSpans = stq.getSpans(
 				atomic, bitset, new HashMap<Term, TermContext>()
 				);
 
 			// Iterate over all pagebreaks
-			while (pagebreakSpans.next()) {
+			while (pagebreakSpans.next() == true) {
 
+				if (DEBUG) {
+					log.debug("There is a pagebreak at {}", pagebreakSpans.doc());
+				};
+				
 				// Current pagebreak is not in the correct document
 				if (pagebreakSpans.doc() != this.localDocID) {
 					pagebreakSpans.skipTo(this.localDocID);
@@ -689,10 +719,16 @@
 						break;
 				};
 
+				if (DEBUG)
+					log.debug("The pagebreak occurs in the document");
+				
 				// There is a pagebreak found - check,
 				// if it is in the correct area
 				if (pagebreakSpans.start() <= this.getStartPos()) {
 
+					if (DEBUG)
+						log.debug("PB start position is before at {}", pagebreakSpans.start());
+					
 					// Only the first payload is relevant
 					b = pagebreakSpans.getPayload().iterator().next();
 				}
@@ -704,6 +740,28 @@
 					if (b != null) {
 						bb.rewind();
 						bb.put(b);
+						bb.rewind();
+
+						if (DEBUG)
+							log.debug("Add pagebreak to list");
+						
+						// This is the first pagebreak!
+						pagebreaks.add(
+							new int[]{
+								bb.getInt(),
+								bb.getInt()
+							});
+					}
+
+					// No earlier pagebreak is pending (b is null) - check the current one
+					else if (pagebreakSpans.start() <= this.getEndPos()) {
+
+						// Set new pagebreak
+						// Only the first payload is relevant
+						b = pagebreakSpans.getPayload().iterator().next();
+						bb.rewind();
+						bb.put(b);
+						bb.rewind();
 							
 						// This is the first pagebreak!
 						pagebreaks.add(
@@ -711,27 +769,16 @@
 								bb.getInt(),
 								bb.getInt()
 							});
-					};
 
-					// Set new pagebreak
-					// Only the first payload is relevant
-					b = pagebreakSpans.getPayload().iterator().next();
-					bb.rewind();
-					bb.put(b);
-							
-					// This is the first pagebreak!
-					pagebreaks.add(
-						new int[]{
-							bb.getInt(),
-							bb.getInt()
-						});
+					}
+
+					// Pagebreak beyond the current position
+					else {
+						break;
+					};
 
 					// Reset byte
 					b = null;
-
-					// Pagebreak beyond the current position
-					if (pagebreakSpans.start() > this.getEndPos())
-						break;
 				};
 			};
 		}
@@ -739,6 +786,12 @@
 			log.warn("Some problems with ByteBuffer: {}", e.getMessage());
 		};
 
+		if (pagebreaks.size() > 0) {
+			this.startPage = pagebreaks.get(0)[0];
+			if (pagebreaks.size() > 1 && pagebreaks.get(pagebreaks.size()-1) != null)
+				this.endPage = pagebreaks.get(pagebreaks.size()-1)[0];
+		}
+		
 		return pagebreaks;
 	};
 
@@ -1488,6 +1541,15 @@
         if (this.version != null)
             json.put("version", this.getVersion());
 
+		if (this.startPage != -1) {
+			ArrayNode pages = mapper.createArrayNode();
+			pages.add(this.startPage);
+			if (this.endPage != -1 && this.endPage != this.startPage)
+				pages.add(this.endPage);
+
+			json.put("pages", pages);
+		};
+
         return json;
     };
 
diff --git a/src/test/java/de/ids_mannheim/korap/TestIndexer.java b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
index ea2aea9..8b39d8f 100644
--- a/src/test/java/de/ids_mannheim/korap/TestIndexer.java
+++ b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
@@ -52,7 +52,7 @@
                 "-i",

                 "src/test/resources/bzk;src/test/resources/goe;src/test/resources/sgbr",

                 "-o", "test-index" });

-        assertEquals("Indexed 3 files.", outputStream.toString());

+        assertEquals("Indexed 4 files.", outputStream.toString());

     }

 

     @Test

diff --git a/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java
new file mode 100644
index 0000000..ca4905b
--- /dev/null
+++ b/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java
@@ -0,0 +1,85 @@
+package de.ids_mannheim.korap.index;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import de.ids_mannheim.korap.KrillCollection;
+import de.ids_mannheim.korap.Krill;
+import de.ids_mannheim.korap.KrillIndex;
+import de.ids_mannheim.korap.query.QueryBuilder;
+import de.ids_mannheim.korap.query.SpanClassQuery;
+import de.ids_mannheim.korap.query.SpanElementQuery;
+import de.ids_mannheim.korap.query.SpanFocusQuery;
+import de.ids_mannheim.korap.query.SpanNextQuery;
+import de.ids_mannheim.korap.query.SpanWithinQuery;
+import de.ids_mannheim.korap.response.Match;
+import de.ids_mannheim.korap.response.Result;
+import de.ids_mannheim.korap.response.SearchContext;
+
+/*
+ * Retrieve pagebreak annotations
+ */
+
+@RunWith(JUnit4.class)
+public class TestPagebreakIndex {
+    // Indexes one document carrying three pagebreak terms and checks the pages reported for the first match.
+    @Test
+    public void indexExample1 () throws IOException {
+		KrillIndex ki = new KrillIndex();
+
+		// Text "abcabcabac" as ten token entries; entries 0, 5 and 8 carry a "~:base/s:pb" term whose payload starts with 528, 529 and 530
+		FieldDocument fd = new FieldDocument();
+		fd.addTV("tokens", "abcabcabac",
+				 "[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>10|~:base/s:pb$<i>528<i>0]" +
+				 "[(1-2)s:b|i:b|_1$<i>1<i>2]" +
+				 "[(2-3)s:c|i:c|_2$<i>2<i>3]" +
+				 "[(3-4)s:a|i:a|_3$<i>3<i>4]" +
+				 "[(4-5)s:b|i:b|_4$<i>4<i>5]" +
+				 "[(5-6)s:c|i:c|_5$<i>5<i>6|~:base/s:pb$<i>529<i>5]" +
+				 "[(6-7)s:a|i:a|_6$<i>6<i>7]" +
+				 "[(7-8)s:b|i:b|_7$<i>7<i>8]" +
+				 "[(8-9)s:a|i:a|_8$<i>8<i>9|~:base/s:pb$<i>530<i>8]" +
+				 "[(9-10)s:c|i:c|_9$<i>9<i>10]");
+        ki.addDoc(fd);
+        ki.commit();
+		// Query the term "s:c" in the "tokens" field
+		SpanQuery sq = new SpanTermQuery(new Term("tokens", "s:c"));
+
+        Result kr = ki.search(sq, (short) 10);
+		// NOTE(review): the expected snippet shows data-after="528" for all three pagebreaks although the indexed payloads are 528, 529 and 530 - confirm this is intended
+		assertEquals(528, kr.getMatch(0).getStartPage());
+		assertEquals(-1, kr.getMatch(0).getEndPage());
+		assertEquals(
+			"snippetHTML",
+			"<span class=\"context-left\">"+
+			"<span class=\"pb\" data-after=\"528\"></span>"+
+			"ab"+
+			"</span>"+
+			"<span class=\"match\">"+
+			"<mark>"+
+			"c"+
+			"</mark>"+
+			"</span>"+
+			"<span class=\"context-right\">"+
+			"ab"+
+			"<span class=\"pb\" data-after=\"528\"></span>"+
+			"cab"+
+			"<span class=\"pb\" data-after=\"528\"></span>"+
+			"a"+
+			"<span class=\"more\">"+
+			"</span>"+
+			"</span>",
+			kr.getMatch(0).getSnippetHTML());
+	};
+};
diff --git a/src/test/resources/goe/AGA-03828-pb.json.gz b/src/test/resources/goe/AGA-03828-pb.json.gz
new file mode 100644
index 0000000..4cd1a74
--- /dev/null
+++ b/src/test/resources/goe/AGA-03828-pb.json.gz
Binary files differ