Introduce prefix cutting with expandToContext to avoid missing matches in matchinfo view

Change-Id: I997439e3f621470d4d96e108cca25ae3692d6de9
diff --git a/Changes b/Changes
index f967ed6..94c2480 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,8 @@
 0.58.6 2019-05-28
     - [bugfix] Updated cache loading (fixed #55) (diewald, margaretha)
+    - [bugfix] Introduce left match cutting so that
+      in matchinfo with expandToContext cutting won't
+      remove the actual match (diewald; reported by CoRoLa)
 
 0.58.5 2019-03-18
     - [bugfix] Fix bug where duplicate keys occured in
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 738748b..0b32c8b 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -1273,6 +1273,7 @@
 
                 // Search for minimal surrounding sentences
                 if (extendToSentence) {
+                    
                     String element = "base/s:s";
                     int[] spanContext = match.expandContextToSpan(element);
 
@@ -1281,6 +1282,22 @@
 
                     if (spanContext[0] >= 0
                             && spanContext[0] < spanContext[1]) {
+
+                        // Expanded span exceeds the maximum match length - it needs to be cut!
+                        if ((spanContext[1] - spanContext[0]) > match.getMaxMatchTokens()) {
+                            int contextLength = match.getMaxMatchTokens() - match.getLength();
+                            int halfContext = contextLength / 2;
+
+                            // Number of tokens the expansion adds to the left of the match
+                            int realLeftLength = match.getStartPos() - spanContext[0];
+
+                            // The length is too large - cut!
+                            if (realLeftLength > halfContext) {
+                                match.startCutted = true;
+                                spanContext[0] = match.getStartPos() - halfContext;
+                            }
+                        }
+
                         match.setStartPos(spanContext[0]);
                         match.setEndPos(spanContext[1]);
 						match.potentialStartPosChar = spanContext[2];
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index f520865..bf18bb5 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -113,7 +113,8 @@
     public int potentialStartPosChar = -1, potentialEndPosChar = -1;
 
 	@JsonIgnore
-	public boolean cutted = false;
+	public boolean startCutted = false;
+	@JsonIgnore public boolean endCutted = false; // internal flag like startCutted; annotated separately as the @JsonIgnore above only covers startCutted
 
     private String version;
 
@@ -219,7 +220,6 @@
         };
     };
 
-
     /**
      * Private class of highlights.
 	 * TODO: This should probably be renamed, as it not only contains highlights
@@ -345,7 +345,7 @@
                             this.potentialStartPosChar = bb.getInt(1);
                     };
 
-                    if (bb.getInt(4) > this.potentialEndPosChar && !this.cutted)
+                    if (bb.getInt(4) > this.potentialEndPosChar && !this.endCutted)
                         this.potentialEndPosChar = bb.getInt(5);
 
                     if (DEBUG)
@@ -500,6 +500,11 @@
 		this.addHighlight(new Highlight(start, pagenumber));
 	};
 
+    @JsonIgnore
+    public int getMaxMatchTokens () { // exposes MAX_MATCH_TOKENS, the limit setStartPos/setEndPos apply to (endPos - startPos)
+        return MAX_MATCH_TOKENS;
+    }
+    
     /**
      * Get document id.
      */
@@ -568,7 +573,7 @@
         this.startPos = pos;
 		if (this.endPos != -1 && (this.endPos - pos) > MAX_MATCH_TOKENS) {
 			this.endPos = pos + MAX_MATCH_TOKENS;
-			this.cutted = true;
+			this.endCutted = true;
 		};
     };
 
@@ -615,7 +620,7 @@
     public void setEndPos (int pos) {
 		if (this.startPos != -1 && (pos - this.startPos) > MAX_MATCH_TOKENS) {
 			pos = this.startPos + MAX_MATCH_TOKENS;
-			this.cutted = true;
+			this.endCutted = true;
 		};
         this.endPos = pos;
     };
@@ -821,6 +826,10 @@
         return this.context;
     };
 
+    @JsonIgnore
+    public int getLength () { // length of the match span
+        return this.getEndPos() - this.getStartPos(); // difference between end and start position
+    };  
 
 	
 	// Retrieve pagebreaks in a certain area
@@ -1413,6 +1422,11 @@
 
         // Iterate through all remaining elements
         sb.append("<span class=\"match\">");
+
+		if (this.startCutted) {
+			sb.append("<span class=\"cutted\"></span>");
+		};
+        
         for (short i = start; i <= end; i++) {
 
 			elem = this.snippetArray.get(i);
@@ -1427,7 +1441,7 @@
 				sb.append(elemString);
 			}
         };
-		if (this.cutted) {
+		if (this.endCutted) {
 			sb.append("<span class=\"cutted\"></span>");
 		};
         sb.append("</span>");
@@ -1465,6 +1479,10 @@
 
         sb.append("[");
 
+		if (this.startCutted) {
+			sb.append("<!>");
+		};
+        
         // Last element of sorted array
         elem = this.snippetArray.getLast();
         StringBuilder rightContext = new StringBuilder();
@@ -1480,7 +1498,7 @@
             sb.append(this.snippetArray.get(i).toBrackets(this));
         };
 
-		if (this.cutted) {
+		if (this.endCutted) {
 			sb.append("<!>");
 		};
         sb.append("]");
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
index a346f79..69e339c 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
@@ -15,6 +15,11 @@
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
 
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import de.ids_mannheim.korap.response.Result;
+
 import de.ids_mannheim.korap.Krill;
 import de.ids_mannheim.korap.KrillIndex;
 import de.ids_mannheim.korap.query.QueryBuilder;
@@ -1099,6 +1104,31 @@
 		assertEquals(km.getFieldValue("availability"), "CC-BY-SA");
     };
 
+    @Test
+    public void indexCorolaTokensBugReplicated () throws IOException, QueryException {
+        KrillIndex ki = new KrillIndex();
+        // Fixture: tokens _0 to _100, all "s:a" except a single "s:b" at _70
+        ki.addDoc(getClass().getResourceAsStream("/others/corola-bug.json"), false);
+        ki.commit();
+
+        SpanQuery sq = new SpanTermQuery(new Term("tokens", "s:b"));
+
+        Result kr = ki.search(sq, (short) 10);
+        // The plain search has to find exactly one hit, the token at position 70
+        assertEquals(70, kr.getMatch(0).getStartPos());
+        assertEquals(71, kr.getMatch(0).getEndPos());
+        assertEquals("totalResults", 1, kr.getTotalResults());
+        assertEquals("... a a a a a a [[b]] a a a a a a ...", kr.getMatch(0).getSnippetBrackets());
+            
+        // see TestNextIndex#corolaNextTest
+        Match km = ki.getMatchInfo("match-Corola-blog/BlogPost/370281_a_371610-p70-71", "tokens", null, null,false, false, true);
+
+        // The match needs to be cut on both sides, marked by "<!>" in the snippet!
+        String str = km.getSnippetBrackets();
+        assertTrue(str.contains("[<!>a"));
+        assertTrue(str.contains("a}<!>]"));
+    };
+    
 
     private FieldDocument createSimpleFieldDoc () {
         FieldDocument fd = new FieldDocument();
diff --git a/src/test/resources/others/corola-bug.json b/src/test/resources/others/corola-bug.json
new file mode 100644
index 0000000..37c3c29
--- /dev/null
+++ b/src/test/resources/others/corola-bug.json
@@ -0,0 +1,20 @@
+{
+  "data":{
+    "stream":[["<>:base\/s:s$<b>64<i>0<i>0<i>100<b>2","_0$<i>0<i>1","s:a"],["_1$<i>2<i>3","s:a","drukola\/l:au"],["_2$<i>4<i>5","s:a","drukola\/l:au"],["_3$<i>6<i>7","s:a","drukola\/l:au"],["_4$<i>8<i>9","s:a","drukola\/l:au"],["_5$<i>10<i>11","s:a","drukola\/l:au"],["_6$<i>12<i>13","s:a","drukola\/l:au"],["_7$<i>14<i>15","s:a","drukola\/l:au"],["_8$<i>16<i>17","s:a","drukola\/l:au"],["_9$<i>18<i>19","s:a","drukola\/l:au"],["_10$<i>20<i>21","s:a","drukola\/l:au"],["_11$<i>22<i>23","s:a","drukola\/l:au"],["_12$<i>24<i>25","s:a","drukola\/l:au"],["_13$<i>26<i>27","s:a","drukola\/l:au"],["_14$<i>28<i>29","s:a","drukola\/l:au"],["_15$<i>30<i>31","s:a","drukola\/l:au"],["_16$<i>32<i>33","s:a","drukola\/l:au"],["_17$<i>34<i>35","s:a","drukola\/l:au"],["_18$<i>36<i>37","s:a","drukola\/l:au"],["_19$<i>38<i>39","s:a","drukola\/l:au"],["_20$<i>40<i>41","s:a","drukola\/l:au"],["_21$<i>42<i>43","s:a","drukola\/l:au"],["_22$<i>44<i>45","s:a","drukola\/l:au"],["_23$<i>46<i>47","s:a","drukola\/l:au"],["_24$<i>48<i>49","s:a","drukola\/l:au"],["_25$<i>50<i>51","s:a","drukola\/l:au"],["_26$<i>52<i>53","s:a","drukola\/l:au"],["_27$<i>54<i>55","s:a","drukola\/l:au"],["_28$<i>56<i>57","s:a","drukola\/l:au"],["_29$<i>58<i>59","s:a","drukola\/l:au"],["_30$<i>60<i>61","s:a","drukola\/l:au"],["_31$<i>62<i>63","s:a","drukola\/l:au"],["_32$<i>64<i>65","s:a","drukola\/l:au"],["_33$<i>66<i>67","s:a","drukola\/l:au"],["_34$<i>68<i>69","s:a","drukola\/l:au"],["_35$<i>70<i>71","s:a","drukola\/l:au"],["_36$<i>72<i>73","s:a","drukola\/l:au"],["_37$<i>74<i>75","s:a","drukola\/l:au"],["_38$<i>76<i>77","s:a","drukola\/l:au"],["_39$<i>78<i>79","s:a","drukola\/l:au"],["_40$<i>80<i>81","s:a","drukola\/l:au"],["_41$<i>82<i>83","s:a","drukola\/l:au"],["_42$<i>84<i>85","s:a","drukola\/l:au"],["_43$<i>86<i>87","s:a","drukola\/l:au"],["_44$<i>88<i>89","s:a","drukola\/l:au"],["_45$<i>90<i>91","s:a","drukola\/l:au"],["_46$<i>92<i>93","s:a","drukola\/l:au"],["_47$<i>94<i>95","s:a","drukola\/l:au"],["_48$<i>96<i>
97","s:a","drukola\/l:au"],["_49$<i>98<i>99","s:a","drukola\/l:au"],["_50$<i>100<i>101","s:a","drukola\/l:au"],["_51$<i>102<i>103","s:a","drukola\/l:au"],["_52$<i>104<i>105","s:a","drukola\/l:au"],["_53$<i>106<i>107","s:a","drukola\/l:au"],["_54$<i>108<i>109","s:a","drukola\/l:au"],["_55$<i>110<i>111","s:a","drukola\/l:au"],["_56$<i>112<i>113","s:a","drukola\/l:au"],["_57$<i>114<i>115","s:a","drukola\/l:au"],["_58$<i>116<i>117","s:a","drukola\/l:au"],["_59$<i>118<i>119","s:a","drukola\/l:au"],["_60$<i>120<i>121","s:a","drukola\/l:au"],["_61$<i>122<i>123","s:a","drukola\/l:au"],["_62$<i>124<i>125","s:a","drukola\/l:au"],["_63$<i>126<i>127","s:a","drukola\/l:au"],["_64$<i>128<i>129","s:a","drukola\/l:au"],["_65$<i>130<i>131","s:a","drukola\/l:au"],["_66$<i>132<i>133","s:a","drukola\/l:au"],["_67$<i>134<i>135","s:a","drukola\/l:au"],["_68$<i>136<i>137","s:a","drukola\/l:au"],["_69$<i>138<i>139","s:a","drukola\/l:au"],["_70$<i>140<i>141","s:b","drukola\/l:bu"],["_71$<i>142<i>143","s:a","drukola\/l:au"],["_72$<i>144<i>145","s:a","drukola\/l:au"],["_73$<i>146<i>147","s:a","drukola\/l:au"],["_74$<i>148<i>149","s:a","drukola\/l:au"],["_75$<i>150<i>151","s:a","drukola\/l:au"],["_76$<i>152<i>153","s:a","drukola\/l:au"],["_77$<i>154<i>155","s:a","drukola\/l:au"],["_78$<i>156<i>157","s:a","drukola\/l:au"],["_79$<i>158<i>159","s:a","drukola\/l:au"],["_80$<i>160<i>161","s:a","drukola\/l:au"],["_81$<i>162<i>163","s:a","drukola\/l:au"],["_82$<i>164<i>165","s:a","drukola\/l:au"],["_83$<i>166<i>167","s:a","drukola\/l:au"],["_84$<i>168<i>169","s:a","drukola\/l:au"],["_85$<i>170<i>171","s:a","drukola\/l:au"],["_86$<i>172<i>173","s:a","drukola\/l:au"],["_87$<i>174<i>175","s:a","drukola\/l:au"],["_88$<i>176<i>177","s:a","drukola\/l:au"],["_89$<i>178<i>179","s:a","drukola\/l:au"],["_90$<i>180<i>181","s:a","drukola\/l:au"],["_91$<i>182<i>183","s:a","drukola\/l:au"],["_92$<i>184<i>185","s:a","drukola\/l:au"],["_93$<i>186<i>187","s:a","drukola\/l:au"],["_94$<i>188<i>189","s:a","drukola\/l:au
"],["_95$<i>190<i>191","s:a","drukola\/l:au"],["_96$<i>192<i>193","s:a","drukola\/l:au"],["_97$<i>194<i>195","s:a","drukola\/l:au"],["_98$<i>196<i>197","s:a","drukola\/l:au"],["_99$<i>198<i>199","s:a","drukola\/l:au"],["_100$<i>200<i>201","s:a","drukola\/l:au"]],
+    "tokenSource":"drukola#morpho",
+    "layerInfos":"dereko/s=spans drukola/l=tokens drukola/m=tokens drukola/p=tokens",
+    "foundries":"dereko dereko/structure dereko/structure/base-sentences-paragraphs drukola drukola/morpho",
+    "name":"tokens",
+    "text":"a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a b a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a "
+  },
+  "title":"Example",
+  "version":"0.03",
+  "language":"ro",
+  "corpusSigle":"Corola-blog",
+  "textSigle":"Corola-blog/BlogPost/370281_a_371610",
+  "textClass":"Other",
+  "author":"http://confluente.ro/marian_malciu_1409419962.html",
+  "availability":"QAO-NC",
+  "docTitle":"BlogPost",
+  "docSigle":"Corola-blog/BlogPost"
+}