Improved pagebreak retrieval

Change-Id: Id2a3ab41525d415f99dd0c4ff304d0136afd731c
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index 890b2a8..29ac699 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -50,12 +50,15 @@
  *    closing tags (pretty much clones of the initial span list),
  *    sorted for opening resp. closing, and processed in parallel
  *    to form an open/close stack. The new structure on the stack is
- *    [startchar, endchar, highlightclass, open=1/close=0]
+ *    [startchar, endchar, highlightclass, close=0/open=1/empty=2]
  *    (processHighlightStack)
  *    3.1. If the element is a relation with an identifier, this may
  *         be removed if duplicate (filterMultipleIdentifiers)
  * 4. Based on the stack and the primary data the snippet is created.
  *    (processHighlightSnippet)
+ *    4.1. To avoid unbalanced elements, all open/close/empty tags
+ *         are balanced (i.e. closed and reopened if overlaps occur).
+ *         (HighlightCombinator)
  */
 
 /*
@@ -785,11 +788,11 @@
 						bb.put(b);
 						bb.rewind();
 
-						if (DEBUG)
-							log.debug("Add pagebreak to list");
-
 						pagenumber = bb.getInt();
 						charOffset = bb.getInt();
+
+						if (DEBUG)
+							log.debug("Add pagebreak to list: {}-{}", charOffset, pagenumber);
 						
 						// This is the first pagebreak!
 						pagebreaks.add(new int[]{charOffset, pagenumber});
@@ -1140,7 +1143,8 @@
         // Iterate over all elements of the stack
         for (int[] element : stack) {
 
-            // The position
+            // The position is the start position for opening and
+			// empty elements and the end position for closing elements
             pos = element[3] != 0 ? element[0] : element[1];
 
 			// The new position is behind the old position
@@ -1163,9 +1167,18 @@
 
 			// close tag
             if (element[3] == 0) {
+
+				// Add close
                 snippetArray.addClose(element[2]);
             }
 
+			else if (element[3] == 2) {
+
+				// Add Empty (pagebreak)
+                snippetArray.addEmpty(element[2]);
+			}
+			
+
 			// open tag
             else {
                 snippetArray.addOpen(element[2]);
@@ -1341,8 +1354,19 @@
 
 			// Nothing more to open -- close all
             if (openList.isEmpty()) {
-                stack.addAll(closeList);
-                break;
+
+				if (DEBUG)
+					log.debug("No more open tags -- close all non pagebreaks");
+
+				if (closeList.peekFirst()[1] != PB_MARKER) {
+					stack.add(closeList.removeFirst());
+				}
+				else if (DEBUG) {
+					if (DEBUG)
+						log.debug("Close is pagebreak -- ignore (1)");
+				};
+
+                continue;
             }
 
             // Not sure about this, but it can happen
@@ -1350,9 +1374,37 @@
                 break;
             };
 
+			// Closener is pagebreak
+			if (closeList.peekFirst()[1] == PB_MARKER) {
+
+				if (DEBUG)
+					log.debug("Close is pagebreak -- ignore (2)");
+
+				// Remove closing pagebreak
+				closeList.removeFirst();
+			}
+
+			// Opener is pagebreak
+			else if (openList.peekFirst()[1] == PB_MARKER) {
+				int[] e = openList.removeFirst().clone();
+
+				if (DEBUG)
+					log.debug("Open is pagebreak");
+
+				// Mark as empty
+                e[1] = e[0]; // Remove pagebreak marker
+                e[3] = 2;
+
+				// Add empty pagebreak
+				stack.add(e);
+			}
+
 			// check if the opener is smaller than the closener
-            if (openList.peekFirst()[0] < closeList.peekFirst()[1]) {
-				
+			else if (openList.peekFirst()[0] < closeList.peekFirst()[1]) {
+
+				if (DEBUG)
+					log.debug("Open starts before close ends");
+
                 int[] e = openList.removeFirst().clone();
 
 				// Mark as opener
@@ -1361,7 +1413,11 @@
 				// Add opener to stack
                 stack.add(e);
             }
+
 			else {
+				if (DEBUG)
+					log.debug("Close ends before open");
+
 				// Add closener to stack
                 stack.add(closeList.removeFirst());
             };
@@ -1471,6 +1527,7 @@
 					// In pagebreak highlights
 					// there is already a character
 					start = highlight.start;
+					end = highlight.end;
 				};
 
                 if (DEBUG)
@@ -1482,6 +1539,9 @@
 				// Keep end equal -1
 				if (end != PB_MARKER) {
 					end -= startOffsetChar;
+				}
+				else if (DEBUG) {
+					log.debug("Pagebreak keeps end position");
 				};
 
                 if (start < 0 || (end < 0 && end != PB_MARKER))
diff --git a/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinator.java b/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinator.java
index f73017f..7286436 100644
--- a/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinator.java
+++ b/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinator.java
@@ -17,7 +17,7 @@
     private final static Logger log = LoggerFactory.getLogger(Match.class);
 
     // This advices the java compiler to ignore all loggings
-    public static final boolean DEBUG = false;
+    public static final boolean DEBUG = false;
 
     private LinkedList<HighlightCombinatorElement> combine;
     private Stack<Integer> balanceStack = new Stack<>();
@@ -63,6 +63,9 @@
     // Add primary data to the stack
     public void addString (String characters) {
         this.combine.add(new HighlightCombinatorElement(characters));
+		if (DEBUG) {
+			log.trace("Add string \"{}\" to stack", characters);
+		};
     };
 
 
@@ -70,6 +73,13 @@
     public void addOpen (int number) {
         this.combine.add(new HighlightCombinatorElement((byte) 1, number));
         this.balanceStack.push(number);
+		if (DEBUG)
+			log.trace("Add opening {} to stack", number);
+    };
+
+	// Add empty element (e.g. a pagebreak) to the stack; balanceStack is untouched
+    public void addEmpty (int pagenumber) {
+        this.combine.add(new HighlightCombinatorElement((byte) 3, pagenumber));
     };
 
     // Add closing highlight combinator to the stack
@@ -111,7 +121,10 @@
                 log.trace(
                         "Closing element is unbalanced - {} "
                                 + "!= {} with lastComb {}|{}|{}",
-                        eold, number, lastComb.type, lastComb.number,
+                        eold,
+						number,
+						lastComb.type,
+						lastComb.number,
                         lastComb.characters);
 
             // combinator is opening and the number is not equal to the last
@@ -122,6 +135,11 @@
                 this.combine.removeLast();
             }
 
+			// Last element is empty
+			else if (lastComb.type == 3) {
+				System.err.println("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
+			}
+
             // combinator is either closing (??) or another opener
             else {
 
@@ -144,8 +162,6 @@
         lastComb = this.combine.peekLast();
 
         if (DEBUG) {
-            log.trace("LastComb: " + lastComb.type + '|' + lastComb.number + '|'
-                    + lastComb.characters + " for " + number);
             log.trace("Stack for checking 2: {}|{}|{}|{}", lastComb.type,
                     lastComb.number, lastComb.characters, number);
         };
@@ -157,6 +173,9 @@
                 lastComb = this.combine.peekLast();
             };
         }
+		else if (lastComb.type == 3) {
+			System.err.println("öööööööööööööööööööööööö");
+		}
         else {
             if (DEBUG)
                 log.trace("close element b) {}", number);
diff --git a/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinatorElement.java b/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinatorElement.java
index ffc9638..4f43656 100644
--- a/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinatorElement.java
+++ b/src/main/java/de/ids_mannheim/korap/response/match/HighlightCombinatorElement.java
@@ -15,6 +15,7 @@
     // Type 0: Textual data
     // Type 1: Opening
     // Type 2: Closing
+	// Type 3: Empty
     public byte type;
 
     public int number = 0;
@@ -50,6 +51,7 @@
 
     // Return html fragment for this combinator element
     public String toHTML (Match match, FixedBitSet level, byte[] levelCache) {
+
         // Opening
         if (this.type == 1) {
             StringBuilder sb = new StringBuilder();
@@ -102,6 +104,7 @@
             };
             return sb.toString();
         }
+
         // Closing
         else if (this.type == 2) {
             if (this.number < -1 || this.number >= 256)
@@ -113,7 +116,12 @@
             if (this.terminal)
                 level.set((int) levelCache[this.number]);
             return "</mark>";
-        };
+        }
+
+		// Empty element
+		else if (this.type == 3) {
+			return "<span class=\"pb\" data-after=\"" + number + "\"></span>";
+		};
 
         // HTML encode primary data
         return escapeHTML(this.characters);
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java
index dae3aa9..288e672 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestPagebreakIndex.java
@@ -56,8 +56,11 @@
         ki.addDoc(fd);
         ki.commit();
 
-		SpanQuery sq = new SpanTermQuery(new Term("tokens", "s:c"));
-        Result kr = ki.search(sq, (short) 10);
+		SpanQuery sq;
+		Result kr;
+		/*
+		  sq = new SpanTermQuery(new Term("tokens", "s:c"));
+        kr = ki.search(sq, (short) 10);
 		
 		assertEquals(528, kr.getMatch(0).getStartPage());
 		assertEquals(-1, kr.getMatch(0).getEndPage());
@@ -82,7 +85,7 @@
 			"</span>"+
 			"</span>",
 			kr.getMatch(0).getSnippetHTML());
-
+*/
 
 		QueryBuilder qb = new QueryBuilder("tokens");
 		sq = qb.seq().append(
@@ -94,5 +97,26 @@
 			.toQuery();
 
 		assertEquals(sq.toString(), "spanNext(spanRepetition(spanNext(spanNext(tokens:s:a, tokens:s:b), tokens:s:c){2,2}), tokens:s:a)");
+
+
+		kr = ki.search(sq, (short) 10);
+		
+		assertEquals(528, kr.getMatch(0).getStartPage());
+		assertEquals(-1, kr.getMatch(0).getEndPage());
+		assertEquals(
+			"snippetHTML",
+			"<span class=\"context-left\"></span>"+
+			"<span class=\"match\">"+
+			"<mark>"+
+			"<span class=\"pb\" data-after=\"528\"></span>"+
+			"abcab"+
+			// "<span class=\"pb\" data-after=\"529\"></span>"+
+			"ca"+
+			"</mark>"+
+			"</span>"+
+			"<span class=\"context-right\">"+
+			"bac"+
+			"</span>",
+			kr.getMatch(0).getSnippetHTML());
 	};
 };