blob: 44782954a8bbf197af723b9a18c9343c2fc328b5 [file] [log] [blame]
Eliza Margarethad28469f2014-03-10 12:42:21 +00001package de.ids_mannheim.korap.index;
2
margaretha4f995582015-12-14 14:14:34 +01003import static org.junit.Assert.assertEquals;
Eliza Margarethad28469f2014-03-10 12:42:21 +00004
5import java.io.IOException;
Akronad006992018-11-02 09:03:39 +01006import java.util.*;
7import java.util.regex.*;
Eliza Margarethad28469f2014-03-10 12:42:21 +00008
Akronad006992018-11-02 09:03:39 +01009import static de.ids_mannheim.korap.TestSimple.getJsonString;
10import static de.ids_mannheim.korap.TestSimple.simpleFieldDoc;
11import static de.ids_mannheim.korap.TestSimple.simpleFuzzyFieldDoc;
Eliza Margarethad28469f2014-03-10 12:42:21 +000012import org.apache.lucene.index.Term;
13import org.apache.lucene.search.spans.SpanOrQuery;
14import org.apache.lucene.search.spans.SpanQuery;
15import org.apache.lucene.search.spans.SpanTermQuery;
16import org.junit.Test;
Akronad006992018-11-02 09:03:39 +010017import org.junit.Ignore;
Eliza Margarethad28469f2014-03-10 12:42:21 +000018
Akronad006992018-11-02 09:03:39 +010019import de.ids_mannheim.korap.query.QueryBuilder;
20import de.ids_mannheim.korap.Krill;
Nils Diewalda14ecd62015-02-26 21:00:20 +000021import de.ids_mannheim.korap.KrillIndex;
Eliza Margarethad28469f2014-03-10 12:42:21 +000022import de.ids_mannheim.korap.query.SpanNextQuery;
Eliza Margarethad4693462014-03-17 13:16:18 +000023import de.ids_mannheim.korap.query.SpanRepetitionQuery;
Eliza Margaretha6517c502016-10-10 18:19:24 +020024import de.ids_mannheim.korap.response.Match;
margaretha4f995582015-12-14 14:14:34 +010025import de.ids_mannheim.korap.response.Result;
Akronad006992018-11-02 09:03:39 +010026import de.ids_mannheim.korap.util.QueryException;
Eliza Margarethad28469f2014-03-10 12:42:21 +000027
Eliza Margarethad4693462014-03-17 13:16:18 +000028public class TestRepetitionIndex {
Eliza Margarethad28469f2014-03-10 12:42:21 +000029
Nils Diewaldbb33da22015-03-04 16:24:25 +000030 private KrillIndex ki;
31 private Result kr;
Akronad006992018-11-02 09:03:39 +010032 private FieldDocument fd;
Nils Diewaldbb33da22015-03-04 16:24:25 +000033
34 private FieldDocument createFieldDoc0 () {
35 FieldDocument fd = new FieldDocument();
Eliza Margarethad28469f2014-03-10 12:42:21 +000036 fd.addString("ID", "doc-0");
Eliza Margaretha6f989202016-10-14 21:48:29 +020037 fd.addTV("base", "text",
38 "[(0-1)s:c|_1$<i>0<i>1]" + "[(1-2)s:e|_2$<i>1<i>2]"
39 + "[(2-3)s:c|_3$<i>2<i>3|<>:y$<b>64<i>2<i>4<i>4<b>0]"
40 + "[(3-4)s:c|s:b|_4$<i>3<i>4|<>:x$<b>64<i>3<i>7<i>7<b>0]"
41 + "[(4-5)s:e|s:d|_5$<i>4<i>5|<>:y$<b>64<i>4<i>6<i>6<b>0]"
42 + "[(5-6)s:c|_6$<i>5<i>6|<>:y$<b>64<i>5<i>8<i>8]"
43 + "[(6-7)s:d|_7$<i>6<i>7<b>0]"
44 + "[(7-8)s:e|_8$<i>7<i>8|<>:x$<b>64<i>7<i>9<i>9<b>0]"
45 + "[(8-9)s:e|s:b|_9$<i>8<i>9|<>:x$<b>64<i>8<i>10<i>10<b>0]"
46 + "[(9-10)s:d|_10$<i>9<i>10]");
Eliza Margarethad28469f2014-03-10 12:42:21 +000047 return fd;
48 }
Eliza Margaretha7788a982014-08-29 16:10:52 +000049
Nils Diewaldbb33da22015-03-04 16:24:25 +000050
51 private FieldDocument createFieldDoc1 () {
52 FieldDocument fd = new FieldDocument();
53 fd.addString("ID", "doc-1");
margaretha4f995582015-12-14 14:14:34 +010054 fd.addTV("base", "text", "[(0-1)s:b|_1$<i>0<i>1]"
55 + "[(1-2)s:e|_2$<i>1<i>2]" + "[(2-3)s:c|_3$<i>2<i>3]"
56 + "[(3-4)s:c|s:d]" + "[(4-5)s:d|s:c|_5$<i>4<i>5]"
57 + "[(5-6)s:e|s:c|_6$<i>5<i>6]" + "[(6-7)s:e|_7$<i>6<i>7]"
58 + "[(7-8)s:c|_8$<i>7<i>8]" + "[(8-9)s:d|_9$<i>8<i>9]"
59 + "[(9-10)s:d|_10$<i>9<i>10]");
Nils Diewaldbb33da22015-03-04 16:24:25 +000060 return fd;
61 }
62
63
64 private FieldDocument createFieldDoc2 () {
65 FieldDocument fd = new FieldDocument();
66 fd.addString("ID", "doc-2");
margaretha4f995582015-12-14 14:14:34 +010067 fd.addTV("base", "text",
68 "[(0-1)s:b|s:c|_1$<i>0<i>1|<>:s$<b>64<i>0<i>2<i>1<b>0]"
69 + "[(1-2)s:c|_2$<i>1<i>2]"
70 + "[(2-3)s:b|_3$<i>2<i>3|<>:s$<b>64<i>2<i>3<i>3<b>0]"
71 + "[(3-4)s:c|_4$<i>3<i>4|<>:s$<b>64<i>3<i>4<i>4<b>0]"
72 + "[(4-5)s:c|_5$<i>4<i>5|<>:s$<b>64<i>4<i>5<i>5]"
73 + "[(5-6)s:b|_6$<i>5<i>6<b>0]"
74 + "[(6-7)s:c|_7$<i>6<i>7|<>:s$<b>64<i>6<i>7<i>7<b>0]");
Nils Diewaldbb33da22015-03-04 16:24:25 +000075 return fd;
76 }
77
78
79 private FieldDocument createFieldDoc3 () {
80 FieldDocument fd = new FieldDocument();
81 fd.addString("ID", "doc-3");
margaretha4f995582015-12-14 14:14:34 +010082 fd.addTV("base", "text",
83 "[(0-1)s:a|_1$<i>0<i>1|<>:s$<b>64<i>0<i>2<i>1<b>0]"
84 + "[(1-2)s:d|_2$<i>1<i>2|<>:s$<b>64<i>1<i>2<i>3]"
85 + "[(2-3)s:e|_3$<i>2<i>3<b>0]");
Nils Diewaldbb33da22015-03-04 16:24:25 +000086 return fd;
87 }
88
Eliza Margaretha6f989202016-10-14 21:48:29 +020089
90 @Test
Eliza Margaretha6517c502016-10-10 18:19:24 +020091 public void testTermQuery () throws IOException {
92 ki = new KrillIndex();
93 ki.addDoc(createFieldDoc0());
94 ki.commit();
Eliza Margaretha6f989202016-10-14 21:48:29 +020095
Eliza Margaretha6517c502016-10-10 18:19:24 +020096 // Quantifier only
97 // c{1,2}
98 SpanQuery sq = new SpanRepetitionQuery(
99 new SpanTermQuery(new Term("base", "s:c")), 1, 2, true);
100 kr = ki.search(sq, (short) 10);
101 // 0-1, 2-3, 2-4, 3-4, 5-6
102 assertEquals((long) 5, kr.getTotalResults());
103 assertEquals(0, kr.getMatch(0).getStartPos());
104 assertEquals(1, kr.getMatch(0).getEndPos());
105 assertEquals(2, kr.getMatch(1).getStartPos());
106 assertEquals(3, kr.getMatch(1).getEndPos());
107 assertEquals(2, kr.getMatch(2).getStartPos());
108 assertEquals(4, kr.getMatch(2).getEndPos());
109 assertEquals(3, kr.getMatch(3).getStartPos());
110 assertEquals(4, kr.getMatch(3).getEndPos());
111 assertEquals(5, kr.getMatch(4).getStartPos());
112 assertEquals(6, kr.getMatch(4).getEndPos());
113 }
Eliza Margaretha6f989202016-10-14 21:48:29 +0200114
115
Nils Diewaldbb33da22015-03-04 16:24:25 +0000116 @Test
Eliza Margaretha6f989202016-10-14 21:48:29 +0200117 public void testRepetitionInSequences () throws IOException {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000118 ki = new KrillIndex();
Eliza Margarethad28469f2014-03-10 12:42:21 +0000119 ki.addDoc(createFieldDoc0());
120 ki.commit();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000121
Eliza Margarethad28469f2014-03-10 12:42:21 +0000122 SpanQuery sq, sq2;
Eliza Margarethad28469f2014-03-10 12:42:21 +0000123 // ec{1,2}
Nils Diewaldbb33da22015-03-04 16:24:25 +0000124 sq = new SpanNextQuery(new SpanTermQuery(new Term("base", "s:e")),
Eliza Margaretha6f989202016-10-14 21:48:29 +0200125 new SpanRepetitionQuery(
126 new SpanTermQuery(new Term("base", "s:c")), 1, 2,
127 true));
Nils Diewaldbb33da22015-03-04 16:24:25 +0000128
Eliza Margarethad28469f2014-03-10 12:42:21 +0000129 kr = ki.search(sq, (short) 10);
130 // 1-3, 1-4, 4-6
Nils Diewaldbb33da22015-03-04 16:24:25 +0000131 assertEquals((long) 3, kr.getTotalResults());
Eliza Margaretha6517c502016-10-10 18:19:24 +0200132 assertEquals(1, kr.getMatch(0).getStartPos());
133 assertEquals(3, kr.getMatch(0).getEndPos());
134 assertEquals(1, kr.getMatch(1).getStartPos());
135 assertEquals(4, kr.getMatch(1).getEndPos());
136 assertEquals(4, kr.getMatch(2).getStartPos());
137 assertEquals(6, kr.getMatch(2).getEndPos());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000138
Eliza Margarethad28469f2014-03-10 12:42:21 +0000139 // ec{1,2}d
Nils Diewaldbb33da22015-03-04 16:24:25 +0000140 sq2 = new SpanNextQuery(sq, new SpanTermQuery(new Term("base", "s:d")));
141 kr = ki.search(sq2, (short) 10);
142 assertEquals((long) 2, kr.getTotalResults());
Eliza Margarethad28469f2014-03-10 12:42:21 +0000143 assertEquals(1, kr.getMatch(0).startPos);
144 assertEquals(5, kr.getMatch(0).endPos);
145 assertEquals(4, kr.getMatch(1).startPos);
146 assertEquals(7, kr.getMatch(1).endPos);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000147
Eliza Margarethad28469f2014-03-10 12:42:21 +0000148 // Multiple documents
149 ki.addDoc(createFieldDoc1());
150 ki.commit();
151 kr = ki.search(sq2, (short) 10);
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000152 assertEquals((long) 5, kr.getTotalResults());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000153 }
154
Eliza Margaretha6f989202016-10-14 21:48:29 +0200155
Eliza Margaretha6517c502016-10-10 18:19:24 +0200156 @Test
Eliza Margaretha6f989202016-10-14 21:48:29 +0200157 public void testMinZeroRepetition () throws IOException {
158 ki = new KrillIndex();
Eliza Margaretha6517c502016-10-10 18:19:24 +0200159 ki.addDoc(createFieldDoc0());
160 ki.commit();
Eliza Margaretha6f989202016-10-14 21:48:29 +0200161
162 SpanQuery sq, sq2;
163 sq = new SpanTermQuery(new Term("base", "s:e"));
164 kr = ki.search(sq, (short) 10);
165
166 assertEquals((long) 4, kr.getTotalResults());
Eliza Margaretha6517c502016-10-10 18:19:24 +0200167 assertEquals(1, kr.getMatch(0).getStartPos());
168 assertEquals(2, kr.getMatch(0).getEndPos());
169 assertEquals(4, kr.getMatch(1).getStartPos());
170 assertEquals(5, kr.getMatch(1).getEndPos());
171 assertEquals(7, kr.getMatch(2).getStartPos());
172 assertEquals(8, kr.getMatch(2).getEndPos());
173 assertEquals(8, kr.getMatch(3).getStartPos());
174 assertEquals(9, kr.getMatch(3).getEndPos());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200175 try {
176 sq2 = new SpanNextQuery(sq, new SpanRepetitionQuery(
177 new SpanTermQuery(new Term("base", "s:c")), 0, 1, true));
178 }
179 catch (IllegalArgumentException e) {
180 assertEquals("Minimum repetition must not lower than 1.",
181 e.getMessage());
182 }
Eliza Margaretha6517c502016-10-10 18:19:24 +0200183 }
Nils Diewaldbb33da22015-03-04 16:24:25 +0000184
Eliza Margaretha6f989202016-10-14 21:48:29 +0200185
Nils Diewaldbb33da22015-03-04 16:24:25 +0000186 /** Skip to */
187 @Test
188 public void testCase2 () throws IOException {
189 ki = new KrillIndex();
Eliza Margarethad28469f2014-03-10 12:42:21 +0000190 ki.addDoc(createFieldDoc0());
191 ki.addDoc(createFieldDoc3());
192 ki.addDoc(createFieldDoc2());
193 ki.addDoc(createFieldDoc1());
194 ki.commit();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000195
Eliza Margarethad28469f2014-03-10 12:42:21 +0000196 SpanQuery sq;
Eliza Margaretha7788a982014-08-29 16:10:52 +0000197 // c{2,2}
margaretha4cfc89e2016-04-25 18:01:14 +0200198 // sq = new SpanRepetitionQuery(
199 // new SpanTermQuery(new Term("base", "s:c")), 2, 2, true);
200 // kr = ki.search(sq, (short) 10);
201 // // doc1 2-4, 3-5, 4-6
202 // assertEquals((long) 6, kr.getTotalResults());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000203
Eliza Margaretha7788a982014-08-29 16:10:52 +0000204 // ec{2,2}
Nils Diewaldbb33da22015-03-04 16:24:25 +0000205 sq = new SpanNextQuery(new SpanTermQuery(new Term("base", "s:e")),
Eliza Margaretha6f989202016-10-14 21:48:29 +0200206 new SpanRepetitionQuery(
207 new SpanTermQuery(new Term("base", "s:c")), 2, 2,
208 true));
Nils Diewaldbb33da22015-03-04 16:24:25 +0000209
210 kr = ki.search(sq, (short) 10);
211 assertEquals((long) 2, kr.getTotalResults());
212 assertEquals(3, kr.getMatch(1).getLocalDocID());
213
214 }
215
216
217 /** OR */
218 @Test
219 public void testCase3 () throws IOException {
220 ki = new KrillIndex();
Eliza Margarethad28469f2014-03-10 12:42:21 +0000221 ki.addDoc(createFieldDoc0());
222 ki.commit();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000223
224 SpanQuery sq, sq2;
225 // ec{1,2}
226 sq = new SpanNextQuery(new SpanTermQuery(new Term("base", "s:e")),
Eliza Margaretha6f989202016-10-14 21:48:29 +0200227 new SpanOrQuery(new SpanRepetitionQuery(
228 new SpanTermQuery(new Term("base", "s:c")), 1, 1, true),
229 new SpanRepetitionQuery(
230 new SpanTermQuery(new Term("base", "s:b")), 1,
231 1, true)));
Nils Diewaldbb33da22015-03-04 16:24:25 +0000232 kr = ki.search(sq, (short) 10);
233 assertEquals((long) 3, kr.getTotalResults());
Eliza Margarethad28469f2014-03-10 12:42:21 +0000234 assertEquals(1, kr.getMatch(0).startPos);
235 assertEquals(3, kr.getMatch(0).endPos);
236 assertEquals(4, kr.getMatch(1).startPos);
237 assertEquals(6, kr.getMatch(1).endPos);
238 assertEquals(7, kr.getMatch(2).startPos);
239 assertEquals(9, kr.getMatch(2).endPos);
Eliza Margaretha7788a982014-08-29 16:10:52 +0000240
Nils Diewaldbb33da22015-03-04 16:24:25 +0000241 }
242
243
244 @Test
245 public void testCase4 () throws IOException {
246 ki = new KrillIndex();
Eliza Margaretha7788a982014-08-29 16:10:52 +0000247 ki.addDoc(createFieldDoc1());
248 ki.commit();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000249
250 SpanQuery sq;
Eliza Margaretha7788a982014-08-29 16:10:52 +0000251 // c{2,2}
Eliza Margaretha6f989202016-10-14 21:48:29 +0200252 sq = new SpanRepetitionQuery(new SpanTermQuery(new Term("base", "s:c")),
253 1, 3, true);
Eliza Margaretha7788a982014-08-29 16:10:52 +0000254 kr = ki.search(sq, (short) 10);
255 // 2-3, 2-4, 2-5, 3-4, 3-5, 3-6, 4-5, 4-6, 5-6, 7-8
Nils Diewaldbb33da22015-03-04 16:24:25 +0000256 assertEquals((long) 10, kr.getTotalResults());
257
Eliza Margaretha6f989202016-10-14 21:48:29 +0200258 sq = new SpanRepetitionQuery(new SpanTermQuery(new Term("base", "s:c")),
259 2, 3, true);
Eliza Margaretha7788a982014-08-29 16:10:52 +0000260 kr = ki.search(sq, (short) 10);
261 // 2-4, 2-5, 3-5, 3-6, 4-6
Nils Diewaldbb33da22015-03-04 16:24:25 +0000262 assertEquals((long) 5, kr.getTotalResults());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000263 }
264
Akronf9def5e2016-10-10 21:26:46 +0200265
Nils Diewaldbb33da22015-03-04 16:24:25 +0000266 @Test
267 public void testCase5 () throws IOException {
268 ki = new KrillIndex();
269 ki.addDoc(getClass().getResourceAsStream("/wiki/00001.json.gz"), true);
270 ki.commit();
271
272 SpanQuery sq0, sq1, sq2;
273 sq0 = new SpanTermQuery(new Term("tokens", "tt/p:NN"));
Eliza Margaretha6f989202016-10-14 21:48:29 +0200274 sq1 = new SpanRepetitionQuery(
275 new SpanTermQuery(new Term("tokens", "tt/p:ADJA")), 2, 3, true);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000276 sq2 = new SpanNextQuery(sq1, sq0);
Eliza Margaretha7788a982014-08-29 16:10:52 +0000277 kr = ki.search(sq2, (short) 10);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000278
279 assertEquals((long) 2, kr.getTotalResults());
Eliza Margarethafaa548f2014-09-30 17:22:11 +0000280 assertEquals(73, kr.getMatch(0).getStartPos());
281 assertEquals(77, kr.getMatch(0).getEndPos());
282 assertEquals(74, kr.getMatch(1).getStartPos());
283 assertEquals(77, kr.getMatch(1).getEndPos());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000284
Eliza Margaretha6f989202016-10-14 21:48:29 +0200285
286 sq2 = new SpanNextQuery(
287 new SpanTermQuery(new Term("tokens", "s:offenen")), sq2);
Eliza Margaretha7788a982014-08-29 16:10:52 +0000288 kr = ki.search(sq2, (short) 10);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000289
290 assertEquals((long) 1, kr.getTotalResults());
Eliza Margarethafaa548f2014-09-30 17:22:11 +0000291 assertEquals(73, kr.getMatch(0).getStartPos());
292 assertEquals(77, kr.getMatch(0).getEndPos());
293 /*
Nils Diewald392bcf32015-02-26 20:01:17 +0000294 for (Match km : kr.getMatches()){
Eliza Margaretha7788a982014-08-29 16:10:52 +0000295 System.out.println(km.getSnippetBrackets());
296 System.out.println(km.getStartPos() +","+km.getEndPos());
297 }*/
Akrondfc93572016-08-10 19:01:34 +0200298 };
Akronad006992018-11-02 09:03:39 +0100299
300 @Test
Akron1f88f402018-11-02 14:32:09 +0100301 public void testRepetitionSnippetBug1 () throws IOException, QueryException {
Akronad006992018-11-02 09:03:39 +0100302 // Construct index
303 Pattern p = Pattern.compile("bccc?d");
304
305 // Der [corenlp/p=ADJA]{2,3} Baum
306
307 QueryBuilder qb = new QueryBuilder("base");
308
309 // b c{2,3} d
310 SpanQuery sq = qb.seq(
311 qb.seg("s:b")
312 ).append(
313 qb.repeat(qb.seg("s:c"),2,3)
314 ).append(
315 qb.seg("s:d")
316 ).toQuery();
317
318 Krill ks = new Krill(sq);
319
320 assertEquals(ks.getSpanQuery().toString(),
321 "spanNext(spanNext(base:s:b, spanRepetition(base:s:c{2,3})), base:s:d)");
322
323 // simpleDocTest
324 KrillIndex ki = new KrillIndex();
325 ki.addDoc(simpleFieldDoc("abccde"));
326 ki.commit();
327 Result kr = ks.apply(ki);
328 assertEquals(1,kr.getTotalResults());
329
330 // fuzzingRepetitionBug();
331
332 // First fuzzed failure (0 vs 1)
333 ki = new KrillIndex();
Akron1f88f402018-11-02 14:32:09 +0100334 ki.addDoc(simpleFieldDoc("cccd")); // 0
335 ki.addDoc(simpleFieldDoc("bccccccaeae")); // 1
336 ki.addDoc(simpleFieldDoc("cbcedb")); // 2
Akronad006992018-11-02 09:03:39 +0100337
338 ki.commit();
339 kr = ks.apply(ki);
340 assertEquals(0,kr.getTotalResults());
341
Akronad006992018-11-02 09:03:39 +0100342 // Third fuzzed failure (1 vs 2)
343 ki = new KrillIndex();
344 ki.addDoc(simpleFieldDoc("bccdcb"));
345 ki.addDoc(simpleFieldDoc("ebccce"));
346 ki.addDoc(simpleFieldDoc("adbdcd"));
347
348 ki.commit();
349 kr = ks.apply(ki);
350 assertEquals(1,kr.getTotalResults());
351 };
352
Akron1f88f402018-11-02 14:32:09 +0100353 @Test
354 public void testRepetitionSnippetBug2 () throws IOException, QueryException {
355 // Construct index
356 Pattern p = Pattern.compile("bccc?d");
357
358 QueryBuilder qb = new QueryBuilder("base");
359
360 // b c{2,3} d
361 SpanQuery sq = qb.seq(
362 qb.seg("s:b")
363 ).append(
364 qb.repeat(qb.seg("s:c"),2,3)
365 ).append(
366 qb.seg("s:d")
367 ).toQuery();
368
369 Krill ks = new Krill(sq);
370
371 assertEquals(ks.getSpanQuery().toString(),
372 "spanNext(spanNext(base:s:b, spanRepetition(base:s:c{2,3})), base:s:d)");
373
374 // fuzzingRepetitionBug();
375
376 // Second fuzzed failure (1 vs 0)
377 ki = new KrillIndex();
378 ki.addDoc(simpleFieldDoc("cdddbc"));
379 ki.addDoc(simpleFieldDoc("bccc"));
380 ki.addDoc(simpleFieldDoc("cbcccd"));
381
382 ki.commit();
383 kr = ks.apply(ki);
384 assertEquals(1,kr.getTotalResults());
385 };
386
Akron27867e22018-11-05 09:12:50 +0100387 @Test
Akron27867e22018-11-05 09:12:50 +0100388 public void testRepetitionSnippetBug3 () throws IOException, QueryException {
389 // Construct index
390 Pattern p = Pattern.compile("bccc?d");
391
392 QueryBuilder qb = new QueryBuilder("base");
393
394 // b c{2,3} d
395 SpanQuery sq = qb.seq(
396 qb.seg("s:b")
397 ).append(
398 qb.repeat(qb.seg("s:c"),2,3)
399 ).append(
400 qb.seg("s:d")
401 ).toQuery();
402
403 Krill ks = new Krill(sq);
404
405 assertEquals(ks.getSpanQuery().toString(),
406 "spanNext(spanNext(base:s:b, spanRepetition(base:s:c{2,3})), base:s:d)");
407
408 // fuzzingRepetitionBug();
409
410 // Fourth fuzzed failure (1 vs 0)
411 ki = new KrillIndex();
412 ki.addDoc(simpleFieldDoc("cdcd"));
413 ki.addDoc(simpleFieldDoc("bcebccac"));
414 ki.addDoc(simpleFieldDoc("bccdcecc")); // !
415
416 ki.commit();
417 kr = ks.apply(ki);
418 assertEquals(1,kr.getTotalResults());
419 };
420
Akronad006992018-11-02 09:03:39 +0100421
422 /**
423 * This method creates a corpus using fuzzing to
424 * check for unexpected, failing constellations
425 * regarding repetition queries.
426 * By shrinking the accepted result length, it tries
427 * to minimize the complexity of the constellations.
428 */
429 public void fuzzingRepetitionBug () throws IOException, QueryException {
430
431 List<String> chars = Arrays.asList("a", "b", "c", "c", "d", "e");
432
433 // Construct index
434 Pattern p = Pattern.compile("bccc?d");
435 QueryBuilder qb = new QueryBuilder("base");
436
437 // b c{2,3} d
438 SpanQuery sq = qb.seq(
439 qb.seg("s:b")
440 ).append(
441 qb.repeat(qb.seg("s:c"),2,3)
442 ).append(
443 qb.seg("s:d")
444 ).toQuery();
445
446 Krill ks = new Krill(sq);
447
448 assertEquals(ks.getSpanQuery().toString(),
449 "spanNext(spanNext(base:s:b, spanRepetition(base:s:c{2,3})), base:s:d)");
450
451 String lastFailureConf = "";
452
453 int minLength = 6;
454 int maxLength = 22;
455 int maxDocs = 8;
456
457 // Create fuzzy corpora (1000 trials)
458 for (int x = 0; x < 100000; x++) {
459 KrillIndex ki = new KrillIndex();
460 ArrayList<String> list = new ArrayList<String>();
461 int c = 0;
462
463 // Create a corpus of 8 fuzzy docs
464 for (int i = 0; i < (int)(Math.random() * maxDocs); i++) {
465 FieldDocument testDoc = simpleFuzzyFieldDoc(chars, minLength, maxLength);
466 String testString = testDoc.doc.getField("base").stringValue();
467 Matcher m = p.matcher(testString);
468 list.add(testString);
469 while (m.find())
470 c++;
471 ki.addDoc(testDoc);
472 };
473
474 ki.commit();
475
476 Result kr = ks.apply(ki);
477
478 // Check if the regex-calculated matches are correct, otherwise
479 // spit out the corpus configurations
480 if (c != kr.getTotalResults()) {
481 String failureConf = c + ":" + kr.getTotalResults() + " " + list.toString();
482 if (lastFailureConf.length() == 0 ||
483 failureConf.length() < lastFailureConf.length()) {
484 System.err.println(failureConf);
485 lastFailureConf = failureConf;
486 minLength--;
487 maxDocs--;
488 };
489 };
490 };
491 };
Eliza Margarethad28469f2014-03-10 12:42:21 +0000492}