Introduce prefix cutting with expandToContext to avoid missing matches in matchinfo view
Change-Id: I997439e3f621470d4d96e108cca25ae3692d6de9
diff --git a/Changes b/Changes
index f967ed6..94c2480 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,8 @@
0.58.6 2019-05-28
- [bugfix] Updated cache loading (fixed #55) (diewald, margaretha)
+ - [bugfix] Introduce left match cutting, so that cutting
+ in matchinfo with expandToContext won't remove
+ the actual match (diewald; reported by CoRoLa)
0.58.5 2019-03-18
- [bugfix] Fix bug where duplicate keys occured in
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 738748b..0b32c8b 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -1273,6 +1273,7 @@
// Search for minimal surrounding sentences
if (extendToSentence) {
+
String element = "base/s:s";
int[] spanContext = match.expandContextToSpan(element);
@@ -1281,6 +1282,22 @@
if (spanContext[0] >= 0
&& spanContext[0] < spanContext[1]) {
+
+ // Match needs to be cut!
+ if ((spanContext[1] - spanContext[0]) > match.getMaxMatchTokens()) {
+ int contextLength = match.getMaxMatchTokens() - match.getLength();
+ int halfContext = contextLength / 2;
+
+ // Number of tokens between the expanded span start and the match start
+ int realLeftLength = match.getStartPos() - spanContext[0];
+
+ // The length is too large - cut!
+ if (realLeftLength > halfContext) {
+ match.startCutted = true;
+ spanContext[0] = match.getStartPos() - halfContext;
+ }
+ }
+
match.setStartPos(spanContext[0]);
match.setEndPos(spanContext[1]);
match.potentialStartPosChar = spanContext[2];
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index f520865..bf18bb5 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -113,7 +113,8 @@
public int potentialStartPosChar = -1, potentialEndPosChar = -1;
@JsonIgnore
- public boolean cutted = false;
+ public boolean startCutted = false;
+ public boolean endCutted = false;
private String version;
@@ -219,7 +220,6 @@
};
};
-
/**
* Private class of highlights.
* TODO: This should probably be renamed, as it not only contains highlights
@@ -345,7 +345,7 @@
this.potentialStartPosChar = bb.getInt(1);
};
- if (bb.getInt(4) > this.potentialEndPosChar && !this.cutted)
+ if (bb.getInt(4) > this.potentialEndPosChar && !this.endCutted)
this.potentialEndPosChar = bb.getInt(5);
if (DEBUG)
@@ -500,6 +500,11 @@
this.addHighlight(new Highlight(start, pagenumber));
};
+ @JsonIgnore
+ public int getMaxMatchTokens () {
+ return MAX_MATCH_TOKENS;
+ }
+
/**
* Get document id.
*/
@@ -568,7 +573,7 @@
this.startPos = pos;
if (this.endPos != -1 && (this.endPos - pos) > MAX_MATCH_TOKENS) {
this.endPos = pos + MAX_MATCH_TOKENS;
- this.cutted = true;
+ this.endCutted = true;
};
};
@@ -615,7 +620,7 @@
public void setEndPos (int pos) {
if (this.startPos != -1 && (pos - this.startPos) > MAX_MATCH_TOKENS) {
pos = this.startPos + MAX_MATCH_TOKENS;
- this.cutted = true;
+ this.endCutted = true;
};
this.endPos = pos;
};
@@ -821,6 +826,10 @@
return this.context;
};
+ @JsonIgnore
+ public int getLength () {
+ return this.getEndPos() - this.getStartPos();
+ };
// Retrieve pagebreaks in a certain area
@@ -1413,6 +1422,11 @@
// Iterate through all remaining elements
sb.append("<span class=\"match\">");
+
+ if (this.startCutted) {
+ sb.append("<span class=\"cutted\"></span>");
+ };
+
for (short i = start; i <= end; i++) {
elem = this.snippetArray.get(i);
@@ -1427,7 +1441,7 @@
sb.append(elemString);
}
};
- if (this.cutted) {
+ if (this.endCutted) {
sb.append("<span class=\"cutted\"></span>");
};
sb.append("</span>");
@@ -1465,6 +1479,10 @@
sb.append("[");
+ if (this.startCutted) {
+ sb.append("<!>");
+ };
+
// Last element of sorted array
elem = this.snippetArray.getLast();
StringBuilder rightContext = new StringBuilder();
@@ -1480,7 +1498,7 @@
sb.append(this.snippetArray.get(i).toBrackets(this));
};
- if (this.cutted) {
+ if (this.endCutted) {
sb.append("<!>");
};
sb.append("]");
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
index a346f79..69e339c 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
@@ -15,6 +15,11 @@
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import de.ids_mannheim.korap.response.Result;
+
import de.ids_mannheim.korap.Krill;
import de.ids_mannheim.korap.KrillIndex;
import de.ids_mannheim.korap.query.QueryBuilder;
@@ -1099,6 +1104,31 @@
assertEquals(km.getFieldValue("availability"), "CC-BY-SA");
};
+ @Test
+ public void indexCorolaTokensBugReplicated () throws IOException, QueryException {
+ KrillIndex ki = new KrillIndex();
+
+ ki.addDoc(getClass().getResourceAsStream("/others/corola-bug.json"), false);
+ ki.commit();
+
+ SpanQuery sq = new SpanTermQuery(new Term("tokens", "s:b"));
+
+ Result kr = ki.search(sq, (short) 10);
+
+ assertEquals(70, kr.getMatch(0).getStartPos());
+ assertEquals(71, kr.getMatch(0).getEndPos());
+ assertEquals("totalResults", kr.getTotalResults(), 1);
+ assertEquals("... a a a a a a [[b]] a a a a a a ...", kr.getMatch(0).getSnippetBrackets());
+
+ // see TestNextIndex#corolaNextTest
+ Match km = ki.getMatchInfo("match-Corola-blog/BlogPost/370281_a_371610-p70-71", "tokens", null, null,false, false, true);
+
+ // The match needs to be cut on both sides!
+ String str = km.getSnippetBrackets();
+ assertTrue(str.contains("[<!>a"));
+ assertTrue(str.contains("a}<!>]"));
+ };
+
private FieldDocument createSimpleFieldDoc () {
FieldDocument fd = new FieldDocument();
diff --git a/src/test/resources/others/corola-bug.json b/src/test/resources/others/corola-bug.json
new file mode 100644
index 0000000..37c3c29
--- /dev/null
+++ b/src/test/resources/others/corola-bug.json
@@ -0,0 +1,20 @@
+{
+ "data":{
+ "stream":[["<>:base\/s:s$<b>64<i>0<i>0<i>100<b>2","_0$<i>0<i>1","s:a"],["_1$<i>2<i>3","s:a","drukola\/l:au"],["_2$<i>4<i>5","s:a","drukola\/l:au"],["_3$<i>6<i>7","s:a","drukola\/l:au"],["_4$<i>8<i>9","s:a","drukola\/l:au"],["_5$<i>10<i>11","s:a","drukola\/l:au"],["_6$<i>12<i>13","s:a","drukola\/l:au"],["_7$<i>14<i>15","s:a","drukola\/l:au"],["_8$<i>16<i>17","s:a","drukola\/l:au"],["_9$<i>18<i>19","s:a","drukola\/l:au"],["_10$<i>20<i>21","s:a","drukola\/l:au"],["_11$<i>22<i>23","s:a","drukola\/l:au"],["_12$<i>24<i>25","s:a","drukola\/l:au"],["_13$<i>26<i>27","s:a","drukola\/l:au"],["_14$<i>28<i>29","s:a","drukola\/l:au"],["_15$<i>30<i>31","s:a","drukola\/l:au"],["_16$<i>32<i>33","s:a","drukola\/l:au"],["_17$<i>34<i>35","s:a","drukola\/l:au"],["_18$<i>36<i>37","s:a","drukola\/l:au"],["_19$<i>38<i>39","s:a","drukola\/l:au"],["_20$<i>40<i>41","s:a","drukola\/l:au"],["_21$<i>42<i>43","s:a","drukola\/l:au"],["_22$<i>44<i>45","s:a","drukola\/l:au"],["_23$<i>46<i>47","s:a","drukola\/l:au"],["_24$<i>48<i>49","s:a","drukola\/l:au"],["_25$<i>50<i>51","s:a","drukola\/l:au"],["_26$<i>52<i>53","s:a","drukola\/l:au"],["_27$<i>54<i>55","s:a","drukola\/l:au"],["_28$<i>56<i>57","s:a","drukola\/l:au"],["_29$<i>58<i>59","s:a","drukola\/l:au"],["_30$<i>60<i>61","s:a","drukola\/l:au"],["_31$<i>62<i>63","s:a","drukola\/l:au"],["_32$<i>64<i>65","s:a","drukola\/l:au"],["_33$<i>66<i>67","s:a","drukola\/l:au"],["_34$<i>68<i>69","s:a","drukola\/l:au"],["_35$<i>70<i>71","s:a","drukola\/l:au"],["_36$<i>72<i>73","s:a","drukola\/l:au"],["_37$<i>74<i>75","s:a","drukola\/l:au"],["_38$<i>76<i>77","s:a","drukola\/l:au"],["_39$<i>78<i>79","s:a","drukola\/l:au"],["_40$<i>80<i>81","s:a","drukola\/l:au"],["_41$<i>82<i>83","s:a","drukola\/l:au"],["_42$<i>84<i>85","s:a","drukola\/l:au"],["_43$<i>86<i>87","s:a","drukola\/l:au"],["_44$<i>88<i>89","s:a","drukola\/l:au"],["_45$<i>90<i>91","s:a","drukola\/l:au"],["_46$<i>92<i>93","s:a","drukola\/l:au"],["_47$<i>94<i>95","s:a","drukola\/l:au"],["_48$<i>96<i>97"
,"s:a","drukola\/l:au"],["_49$<i>98<i>99","s:a","drukola\/l:au"],["_50$<i>100<i>101","s:a","drukola\/l:au"],["_51$<i>102<i>103","s:a","drukola\/l:au"],["_52$<i>104<i>105","s:a","drukola\/l:au"],["_53$<i>106<i>107","s:a","drukola\/l:au"],["_54$<i>108<i>109","s:a","drukola\/l:au"],["_55$<i>110<i>111","s:a","drukola\/l:au"],["_56$<i>112<i>113","s:a","drukola\/l:au"],["_57$<i>114<i>115","s:a","drukola\/l:au"],["_58$<i>116<i>117","s:a","drukola\/l:au"],["_59$<i>118<i>119","s:a","drukola\/l:au"],["_60$<i>120<i>121","s:a","drukola\/l:au"],["_61$<i>122<i>123","s:a","drukola\/l:au"],["_62$<i>124<i>125","s:a","drukola\/l:au"],["_63$<i>126<i>127","s:a","drukola\/l:au"],["_64$<i>128<i>129","s:a","drukola\/l:au"],["_65$<i>130<i>131","s:a","drukola\/l:au"],["_66$<i>132<i>133","s:a","drukola\/l:au"],["_67$<i>134<i>135","s:a","drukola\/l:au"],["_68$<i>136<i>137","s:a","drukola\/l:au"],["_69$<i>138<i>139","s:a","drukola\/l:au"],["_70$<i>140<i>141","s:b","drukola\/l:bu"],["_71$<i>142<i>143","s:a","drukola\/l:au"],["_72$<i>144<i>145","s:a","drukola\/l:au"],["_73$<i>146<i>147","s:a","drukola\/l:au"],["_74$<i>148<i>149","s:a","drukola\/l:au"],["_75$<i>150<i>151","s:a","drukola\/l:au"],["_76$<i>152<i>153","s:a","drukola\/l:au"],["_77$<i>154<i>155","s:a","drukola\/l:au"],["_78$<i>156<i>157","s:a","drukola\/l:au"],["_79$<i>158<i>159","s:a","drukola\/l:au"],["_80$<i>160<i>161","s:a","drukola\/l:au"],["_81$<i>162<i>163","s:a","drukola\/l:au"],["_82$<i>164<i>165","s:a","drukola\/l:au"],["_83$<i>166<i>167","s:a","drukola\/l:au"],["_84$<i>168<i>169","s:a","drukola\/l:au"],["_85$<i>170<i>171","s:a","drukola\/l:au"],["_86$<i>172<i>173","s:a","drukola\/l:au"],["_87$<i>174<i>175","s:a","drukola\/l:au"],["_88$<i>176<i>177","s:a","drukola\/l:au"],["_89$<i>178<i>179","s:a","drukola\/l:au"],["_90$<i>180<i>181","s:a","drukola\/l:au"],["_91$<i>182<i>183","s:a","drukola\/l:au"],["_92$<i>184<i>185","s:a","drukola\/l:au"],["_93$<i>186<i>187","s:a","drukola\/l:au"],["_94$<i>188<i>189","s:a","drukola\/l:au"],
["_95$<i>190<i>191","s:a","drukola\/l:au"],["_96$<i>192<i>193","s:a","drukola\/l:au"],["_97$<i>194<i>195","s:a","drukola\/l:au"],["_98$<i>196<i>197","s:a","drukola\/l:au"],["_99$<i>198<i>199","s:a","drukola\/l:au"],["_100$<i>200<i>201","s:a","drukola\/l:au"]],
+ "tokenSource":"drukola#morpho",
+ "layerInfos":"dereko/s=spans drukola/l=tokens drukola/m=tokens drukola/p=tokens",
+ "foundries":"dereko dereko/structure dereko/structure/base-sentences-paragraphs drukola drukola/morpho",
+ "name":"tokens",
+ "text":"a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a b a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a "
+ },
+ "title":"Example",
+ "version":"0.03",
+ "language":"ro",
+ "corpusSigle":"Corola-blog",
+ "textSigle":"Corola-blog/BlogPost/370281_a_371610",
+ "textClass":"Other",
+ "author":"http://confluente.ro/marian_malciu_1409419962.html",
+ "availability":"QAO-NC",
+ "docTitle":"BlogPost",
+ "docSigle":"Corola-blog/BlogPost"
+}