Fix character offset bug in snippet generation

Change-Id: Ic4a6435de3b4b6561ebef71a6e07901fcabe662a
diff --git a/Changes b/Changes
index 6a6b75d..9063748 100644
--- a/Changes
+++ b/Changes
@@ -1,7 +1,9 @@
-0.55.9 2017-09-12
+0.55.9 2017-11-15
         - [bugfix] Serialize token identifier correctly for
           new corpora with text siglen (diewald)
         - [bugfix] Extend bytebuffer for relation payloads (diewald)
+        - [bugfix] Fix wrong char offsets in snippet generation when
+          positions exceeding the string are repositioned (diewald)
 
 0.55.8 2017-09-05
         - [feature] Retrieve and display pagebreaks (diewald)
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index 92a1307..39c18cb 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -433,13 +433,14 @@
 							 String annotation) {
 
 		if (DEBUG)
-			log.trace("Add relation {}: {}-{}->{}-{}",
+			log.trace("Add relation {}: {}-{}>>{}-{}",
 					  annotation, srcStart, srcEnd, targetStart, targetEnd);
 
+		// Add source token
 		if (srcEnd == -1) {
-			// Add source token
 			this.addHighlight(new Highlight(srcStart, srcStart, annotation, targetStart, targetEnd));
 		}
+		// Add source span
 		else {
 			this.addHighlight(new Highlight(srcStart, srcEnd, annotation, targetStart, targetEnd));
 		};
@@ -447,9 +448,12 @@
         int id = identifierNumberCounter--;
         identifierNumber.put(id, targetStart);
 
+		// Add target token
 		if (targetEnd == -1) {
 			this.addHighlight(new Highlight(targetStart, targetStart, id));
 		}
+
+		// Add target span
 		else {
 			this.addHighlight(new Highlight(targetStart, targetEnd, id));
 		};
@@ -1200,8 +1204,10 @@
     private void _processHighlightSnippet (String clean,
             ArrayList<int[]> stack) {
 
-        if (DEBUG)
+        if (DEBUG) {
             log.trace("--- Process Highlight snippet");
+            log.trace("--- Snippet: {}", clean);
+		};
 
         int pos = 0, oldPos = 0;
 
@@ -1214,6 +1220,12 @@
 			// empty elements and the end position for closing elements
             pos = element[3] != 0 ? element[0] : element[1];
 
+			if (DEBUG)
+				log.trace("Add tag at position {} (was {})",
+						  pos,
+						  oldPos);
+
+			
 			// The new position is behind the old position
             if (pos > oldPos) {
 
@@ -1222,7 +1234,12 @@
                 if (pos > clean.length()) {
 
 					// Reposition to the end
-                    pos = clean.length() - 1;
+                    pos = clean.length();
+
+					if (DEBUG)
+						log.trace("Position exceeds string, now {}",
+								  pos);
+
                 };
 
 				// Add partial string
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
index 9742ab0..518b1af 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
@@ -659,7 +659,10 @@
         FieldDocument fd = ki.addDoc(2, getClass().getResourceAsStream("/goe/AGA-03828-new.json.gz"), true);
 		
         ki.commit();
-        Match km = ki.getMatchInfo("match-c1!d5-p0-4", "tokens", null, null,
+
+		Match km;
+
+        km = ki.getMatchInfo("match-c1!d5-p0-4", "tokens", null, null,
                 true, false);
 
 		assertEquals("SnippetBrackets (with Spans)",
@@ -678,199 +681,7 @@
 
 		km = ki.getMatchInfo("match-GOE/AGA/03828-p0-10", "tokens", "malt", null,
 							 true, false);
-
-		// Autobiographische einzelheiten Selbstschilderung (1) immer tätiger, nach innen und außen fortwirkender
-		/*
-		  [
-		  "-:base/paragraphs$<i>14",
-		  "-:base/sentences$<i>215",
-		  "-:corenlp/sentences$<i>212",
-		  "-:opennlp/sentences$<i>203",
-		  "-:tokens$<i>5234",
-		  "<>:dereko/s:front$<b>65<i>0<i>0<i>0<b>1",
-		  "<>:dereko/s:pb$<b>65<i>0<i>0<i>0<b>3<s>2",
-		  "<>:base/s:s$<b>64<i>0<i>30<i>2<b>2",
-		  "<>:dereko/s:head$<b>64<i>0<i>30<i>2<b>3<s>3",
-		  "<>:dereko/s:s$<b>64<i>0<i>30<i>2<b>4",
-		  "<>:corenlp/c:ROOT$<b>64<i>0<i>254<i>32<b>0",
-		  "<>:corenlp/s:s$<b>64<i>0<i>254<i>32<b>0",
-		  "<>:opennlp/s:s$<b>64<i>0<i>254<i>32<b>0",
-		  "<>:corenlp/c:NUR$<b>64<i>0<i>254<i>32<b>1",
-		  "<>:corenlp/c:NP$<b>64<i>0<i>253<i>32<b>2",
-		  "<>:base/s:t$<b>64<i>0<i>35242<i>5233<b>0",
-		  "<>:dereko/s:text$<b>64<i>0<i>35242<i>5233<b>0",
-		  "<>:dereko/s:body$<b>64<i>0<i>35242<i>5233<b>1",
-		  "<>:dereko/s:div$<b>64<i>0<i>35242<i>5233<b>2<s>1",
-		  ">:malt/d:ATTR$<b>32<i>2",
-		  "<:malt/d:ROOT$<b>34<i>0<i>179<i>21<i>2",
-		  "<:malt/d:ROOT$<b>34<i>0<i>179<i>21<i>13",
-		  "@:dereko/s:n:0$<b>17<s>1<i>5233",
-		  "@:dereko/s:type:Autobiographie$<b>17<s>1<i>5233",
-		  "@:dereko/s:complete:y$<b>17<s>1<i>5233",
-		  "@:dereko/s:n:529$<b>17<s>2",
-		  "@:dereko/s:id:aga.03828-529-pb529$<b>17<s>2",
-		  "@:dereko/s:TEIform:pb$<b>17<s>2",
-		  "@:dereko/s:type:main$<b>17<s>3<i>2",
-		  "_0$<i>0<i>17",
-		  "corenlp/p:ADJA",
-		  "i:autobiographische",
-		  "marmot/m:case:nom",
-		  "marmot/m:degree:pos",
-		  "marmot/m:gender:fem",
-		  "marmot/m:number:pl",
-		  "marmot/p:ADJA",
-		  "opennlp/p:ADJA",
-		  "s:Autobiographische",
-		  "tt/l:Autobiographische$<b>129<b>32",
-		  "tt/l:autobiographisch$<b>129<b>222",
-		  "tt/p:ADJA$<b>129<b>222",
-		  "tt/p:NN$<b>129<b>32",
-		  "~:base/s:pb$<i>529<i>0"
-		  ],[
-		  ">:malt/d:ATTR$<b>32<i>2",
-		  "_1$<i>18<i>30",
-		  "corenlp/p:ADJA",
-		  "i:einzelheiten",
-		  "marmot/m:case:nom",
-		  "marmot/m:gender:fem",
-		  "marmot/m:number:pl",
-		  "marmot/p:NN",
-		  "opennlp/p:ADJA",
-		  "s:einzelheiten",
-		  "tt/p:ADJA$<b>129<b>253"
-		  ],[
-		  "<:malt/d:ATTR$<b>32<i>0",
-		  "<>:base/s:s$<b>64<i>31<i>52<i>4<b>2",
-		  "<>:dereko/s:head$<b>64<i>31<i>52<i>4<b>4<s>2",
-		  "<>:dereko/s:s$<b>64<i>31<i>52<i>4<b>5",
-		  "<>:corenlp/c:S$<b>64<i>31<i>253<i>32<b>3",
-		  "<>:dereko/s:div$<b>64<i>31<i>3299<i>504<b>3<s>1",
-		  ">:malt/d:ROOT$<b>33<i>0<i>179<i>0<i>21",
-		  "<:malt/d:ATTR$<b>32<i>1",
-		  "<:malt/d:APP$<b>32<i>3",
-		  "<:malt/d:ATTR$<b>32<i>5",
-		  "@:dereko/s:complete:y$<b>17<s>1<i>504",
-		  "@:dereko/s:n:1$<b>17<s>1<i>504",
-		  "@:dereko/s:type:section$<b>17<s>1<i>504",
-		  "@:dereko/s:type:cross$<b>17<s>2<i>4",
-		  "_2$<i>31<i>48",
-		  "corenlp/p:NN",
-		  "i:selbstschilderung",
-		  "marmot/m:case:nom",
-		  "marmot/m:gender:fem",
-		  "marmot/m:number:sg",
-		  "marmot/p:NN",
-		  "opennlp/p:NN",
-		  "s:Selbstschilderung",
-		  "tt/l:Selbstschilderung$<b>129<b>255",
-		  "tt/p:NN$<b>129<b>255"
-		  ],[
-		  "<>:corenlp/c:NM$<b>64<i>50<i>52<i>4<b>6",
-		  "<>:corenlp/c:AVP$<b>64<i>50<i>58<i>5<b>5",
-		  "<>:corenlp/c:AP$<b>64<i>50<i>66<i>6<b>4",
-		  ">:malt/d:APP$<b>32<i>2",
-		  "_3$<i>50<i>51",
-		  "corenlp/p:CARD",
-		  "i:1",
-		  "marmot/p:CARD",
-		  "opennlp/p:CARD",
-		  "s:1",
-		  "tt/l:1$<b>129<b>255",
-		  "tt/p:CARD$<b>129<b>255"
-		  ],[
-		  "<>:base/s:s$<b>64<i>53<i>254<i>32<b>2",
-		  "<>:dereko/s:s$<b>64<i>53<i>254<i>32<b>5<s>1",
-		  "<>:base/s:p$<b>64<i>53<i>3299<i>504<b>1",
-		  "<>:dereko/s:p$<b>64<i>53<i>3299<i>504<b>4",
-		  ">:malt/d:ADV$<b>32<i>5",
-		  "@:dereko/s:type:manual$<b>17<s>1<i>32",
-		  "_4$<i>53<i>58",
-		  "corenlp/p:ADV",
-		  "i:immer",
-		  "marmot/p:ADV",
-		  "opennlp/p:ADV",
-		  "s:immer",
-		  "tt/l:immer$<b>129<b>255",
-		  "tt/p:ADV$<b>129<b>255"
-		  ],[
-		  ">:malt/d:ATTR$<b>32<i>2",
-		  "<:malt/d:ADV$<b>32<i>4",
-		  "_5$<i>59<i>66",
-		  "corenlp/p:ADJD",
-		  "i:tätiger",
-		  "marmot/m:degree:comp",
-		  "marmot/p:ADJD",
-		  "opennlp/p:ADJD",
-		  "s:tätiger",
-		  "tt/l:tätig$<b>129<b>233",
-		  "tt/p:ADJD$<b>129<b>233"
-		  ],[
-		  "<:malt/d:PN$<b>32<i>7",
-		  "<>:corenlp/c:PP$<b>64<i>68<i>127<i>13<b>4",
-		  ">:malt/d:PP$<b>32<i>13",
-		  "_6$<i>68<i>72",
-		  "corenlp/p:APPR",
-		  "i:nach",
-		  "marmot/p:APPR",
-		  "opennlp/p:APPR",
-		  "s:nach",
-		  "tt/l:nach$<b>129<b>172",
-		  "tt/l:nach$<b>129<b>82",
-		  "tt/p:ADV$<b>129<b>82",
-		  "tt/p:APPR$<b>129<b>172"
-		  ],[
-		  "<:malt/d:KON$<b>32<i>8",
-		  "<>:corenlp/c:CAVP$<b>64<i>73<i>88<i>10<b>5",
-		  ">:malt/d:PN$<b>32<i>6",
-		  "_7$<i>73<i>78",
-		  "corenlp/p:TRUNC",
-		  "i:innen",
-		  "marmot/p:ADV",
-		  "opennlp/p:ADV",
-		  "s:innen",
-		  "tt/l:innen$<b>129<b>173",
-		  "tt/l:innen$<b>129<b>81",
-		  "tt/p:ADJD$<b>129<b>173",
-		  "tt/p:ADV$<b>129<b>81"
-		  ],[
-		  ">:malt/d:KON$<b>32<i>7",
-		  "<:malt/d:CJ$<b>32<i>12",
-		  "_8$<i>79<i>82",
-		  "corenlp/p:KON",
-		  "i:und",
-		  "marmot/p:KON",
-		  "opennlp/p:KON",
-		  "s:und",
-		  "tt/l:und$<b>129<b>255",
-		  "tt/p:KON$<b>129<b>255"
-		  ],[
-		  ">:malt/d:ADV$<b>32<i>11",
-		  "_9$<i>83<i>88",
-		  "corenlp/p:ADV",
-		  "i:aussen",
-		  "marmot/p:ADV",
-		  "opennlp/p:ADV",
-		  "s:außen",
-		  "tt/l:außen$<b>129<b>253",
-		  "tt/p:ADV$<b>129<b>253"
-		  ],[
-		  ">:malt/d:ADV$<b>32<i>11",
-		  "_10$<i>89<i>102",
-		  "corenlp/p:ADJA",
-		  "i:fortwirkender",
-		  "marmot/m:case:nom",
-		  "marmot/m:degree:pos",
-		  "marmot/m:gender:masc",
-		  "marmot/m:number:sg",
-		  "marmot/p:ADJA",
-		  "opennlp/p:ADJA",
-		  "s:fortwirkender",
-		  "tt/l:fortwirkend$<b>129<b>158",
-		  "tt/l:fortwirkend$<b>129<b>96",
-		  "tt/p:ADJA$<b>129<b>96",
-		  "tt/p:ADJD$<b>129<b>158"
-		  ]
-		 */
+		
 		assertEquals("SnippetBrackets (with Spans)",
 					 "[[{malt/d:ATTR>1:Autobiographische} "+
 					 "{#1:{malt/d:ATTR>1:einzelheiten}} "+
@@ -881,7 +692,7 @@
 					 "{#6:{malt/d:PP>12:nach}} "+
 					 "{malt/d:PN>5:innen} "+
 					 "{malt/d:KON>6:und} "+
-					 "{malt/d:ADV>10:außen}]]n ...",
+					 "{malt/d:ADV>10:außen}]] ...",
 					 km.getSnippetBrackets());
 	};