blob: 8471b2039d18f7da405a50c4549d1af54a378291 [file] [log] [blame]
Eliza Margarethabefc23f2014-01-20 14:34:15 +00001package de.ids_mannheim.korap.index;
2
Nils Diewaldf399a672013-11-18 17:55:22 +00003import java.util.*;
4import java.io.*;
5
6import org.apache.lucene.util.Version;
7import org.apache.lucene.util.BytesRef;
8import org.apache.lucene.util.Bits;
9
10import static org.junit.Assert.*;
Eliza Margarethabefc23f2014-01-20 14:34:15 +000011
Nils Diewaldf399a672013-11-18 17:55:22 +000012import org.junit.Test;
13import org.junit.Ignore;
14import org.junit.runner.RunWith;
15import org.junit.runners.JUnit4;
16
17import de.ids_mannheim.korap.KorapIndex;
Eliza Margarethaa8491712014-07-25 13:27:54 +000018import de.ids_mannheim.korap.KorapMatch;
Nils Diewaldf399a672013-11-18 17:55:22 +000019import de.ids_mannheim.korap.KorapQuery;
20import de.ids_mannheim.korap.KorapResult;
21import de.ids_mannheim.korap.query.SpanNextQuery;
22import de.ids_mannheim.korap.index.FieldDocument;
23import de.ids_mannheim.korap.analysis.MultiTermTokenStream;
Eliza Margarethaa8491712014-07-25 13:27:54 +000024
25import org.apache.lucene.search.spans.SpanOrQuery;
Nils Diewaldf399a672013-11-18 17:55:22 +000026import org.apache.lucene.search.spans.SpanQuery;
27import org.apache.lucene.search.spans.SpanTermQuery;
28
29import de.ids_mannheim.korap.query.SpanElementQuery;
Eliza Margarethabefc23f2014-01-20 14:34:15 +000030import de.ids_mannheim.korap.query.SpanSegmentQuery;
Nils Diewaldf399a672013-11-18 17:55:22 +000031import de.ids_mannheim.korap.query.SpanWithinQuery;
32
Nils Diewaldf5f29ff2014-02-14 12:24:34 +000033import de.ids_mannheim.korap.query.wrap.SpanSequenceQueryWrapper;
34
Nils Diewaldf399a672013-11-18 17:55:22 +000035import org.apache.lucene.index.Term;
36
Nils Diewaldf399a672013-11-18 17:55:22 +000037@RunWith(JUnit4.class)
38public class TestNextIndex {
39
// TODO: Store the primary data separately, as a non-indexed field.
41
42 @Test
43 public void indexExample1 () throws IOException {
44 KorapIndex ki = new KorapIndex();
45
46 // abcabcabac
47 FieldDocument fd = new FieldDocument();
48 fd.addTV("base",
49 "abcabcabac",
50 "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]" +
51 "[(1-2)s:b|i:b|_1#1-2]" +
52 "[(2-3)s:c|i:c|_2#2-3]" +
53 "[(3-4)s:a|i:a|_3#3-4]" +
54 "[(4-5)s:b|i:b|_4#4-5]" +
55 "[(5-6)s:c|i:c|_5#5-6]" +
56 "[(6-7)s:a|i:a|_6#6-7]" +
57 "[(7-8)s:b|i:b|_7#7-8]" +
58 "[(8-9)s:a|i:a|_8#8-9]" +
59 "[(9-10)s:c|i:c|_9#9-10]");
60 ki.addDoc(fd);
61
62 ki.commit();
63
64 SpanQuery sq;
65 KorapResult kr;
66
67 sq = new SpanNextQuery(
68 new SpanTermQuery(new Term("base", "s:a")),
69 new SpanTermQuery(new Term("base", "s:b"))
70 );
71
72 kr = ki.search(sq, (short) 10);
73
74 assertEquals("totalResults", 3, kr.totalResults());
75 assertEquals("StartPos (0)", 0, kr.match(0).startPos);
76 assertEquals("EndPos (0)", 2, kr.match(0).endPos);
77 assertEquals("StartPos (1)", 3, kr.match(1).startPos);
78 assertEquals("EndPos (1)", 5, kr.match(1).endPos);
79 assertEquals("StartPos (2)", 6, kr.match(2).startPos);
80 assertEquals("EndPos (2)", 8, kr.match(2).endPos);
Eliza Margarethabefc23f2014-01-20 14:34:15 +000081
Nils Diewaldf399a672013-11-18 17:55:22 +000082 sq = new SpanNextQuery(
83 new SpanTermQuery(new Term("base", "s:b")),
84 new SpanTermQuery(new Term("base", "s:c"))
85 );
86
87 kr = ki.search(sq, (short) 10);
88
89 assertEquals("totalResults", 2, kr.totalResults());
90 assertEquals("StartPos (0)", 1, kr.match(0).startPos);
91 assertEquals("EndPos (0)", 3, kr.match(0).endPos);
92 assertEquals("StartPos (1)", 4, kr.match(1).startPos);
93 assertEquals("EndPos (1)", 6, kr.match(1).endPos);
94
Nils Diewaldbaf68c52013-11-20 13:22:19 +000095 assertEquals(1, ki.numberOf("base", "documents"));
96 assertEquals(10, ki.numberOf("base", "t"));
Nils Diewaldf399a672013-11-18 17:55:22 +000097
98
99 sq = new SpanNextQuery(
100 new SpanTermQuery(new Term("base", "s:a")),
101 new SpanNextQuery(
102 new SpanTermQuery(new Term("base", "s:b")),
103 new SpanTermQuery(new Term("base", "s:c"))
104 )
105 );
106
107 kr = ki.search(sq, (short) 2);
108
109 assertEquals("totalResults", 2, kr.totalResults());
110 assertEquals("StartPos (0)", 0, kr.match(0).startPos);
111 assertEquals("EndPos (0)", 3, kr.match(0).endPos);
112 assertEquals("StartPos (1)", 3, kr.match(1).startPos);
113 assertEquals("EndPos (1)", 6, kr.match(1).endPos);
114
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000115 assertEquals(1, ki.numberOf("base", "documents"));
116 assertEquals(10, ki.numberOf("base", "t"));
Nils Diewaldf399a672013-11-18 17:55:22 +0000117
118 };
119
120 @Test
121 public void indexExample2 () throws IOException {
122 KorapIndex ki = new KorapIndex();
123
124 // abcabcabac
125 FieldDocument fd = new FieldDocument();
126 fd.addTV("base",
127 "abcabcabac",
128 "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]" +
129 "[(1-2)s:b|i:b|_1#1-2]" +
130 "[(2-3)s:c|i:c|_2#2-3]" +
Eliza Margarethad28469f2014-03-10 12:42:21 +0000131 "[(3-4)s:a|i:a|_3#3-4|<>:x#3-4$<i>4|<>:x#3-7$<i>7]" +
Nils Diewaldf399a672013-11-18 17:55:22 +0000132 "[(4-5)s:b|i:b|_4#4-5]" +
133 "[(5-6)s:c|i:c|_5#5-6]" +
134 "[(6-7)s:a|i:a|_6#6-7]" +
135 "[(7-8)s:b|i:b|_7#7-8]" +
136 "[(8-9)s:a|i:a|_8#8-9]" +
137 "[(9-10)s:c|i:c|_9#9-10]");
138 ki.addDoc(fd);
139
140 ki.commit();
141
142 SpanQuery sq;
143 KorapResult kr;
144
145 sq = new SpanNextQuery(
146 new SpanTermQuery(new Term("base", "s:c")),
147 new SpanElementQuery("base", "x")
148 );
149
150 kr = ki.search(sq, (short) 10);
Eliza Margarethad28469f2014-03-10 12:42:21 +0000151 assertEquals("ab[cabca]bac", kr.match(1).getSnippetBrackets());
Nils Diewaldf399a672013-11-18 17:55:22 +0000152
153 };
154
155 @Test
156 public void indexExample3 () throws IOException {
157 KorapIndex ki = new KorapIndex();
158
159 // abcabcabac
160 FieldDocument fd = new FieldDocument();
161 fd.addTV("base",
162 "abcabcabac",
163 "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]" +
164 "[(1-2)s:b|i:b|_1#1-2]" +
165 "[(2-3)s:c|i:c|_2#2-3]" +
166 "[(3-4)s:a|i:a|_3#3-4|<>:x#3-7$<i>7]" +
167 "[(4-5)s:b|i:b|_4#4-5]" +
168 "[(5-6)s:c|i:c|_5#5-6]" +
169 "[(6-7)s:a|i:a|_6#6-7]" +
170 "[(7-8)s:b|i:b|_7#7-8]" +
171 "[(8-9)s:a|i:a|_8#8-9]" +
172 "[(9-10)s:c|i:c|_9#9-10]");
173 ki.addDoc(fd);
174
175 ki.commit();
176
177 SpanQuery sq;
178 KorapResult kr;
179
180 sq = new SpanNextQuery(
181 new SpanElementQuery("base", "x"),
182 new SpanTermQuery(new Term("base", "s:b"))
183 );
184
185 kr = ki.search(sq, (short) 10);
186 assertEquals("abc[abcab]ac", kr.match(0).getSnippetBrackets());
187 };
188
189 @Test
190 public void indexExample4 () throws IOException {
191 KorapIndex ki = new KorapIndex();
192
193 // abcabcabac
Eliza Margarethabefc23f2014-01-20 14:34:15 +0000194 // abc<x>abc<x>a</x>b</x>ac
Nils Diewaldf399a672013-11-18 17:55:22 +0000195 FieldDocument fd = new FieldDocument();
196 fd.addString("ID", "doc-1");
197 fd.addTV("base",
198 "abcabcabac",
199 "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]" +
200 "[(1-2)s:b|i:b|_1#1-2]" +
201 "[(2-3)s:c|i:c|_2#2-3]" +
202 "[(3-4)s:a|i:a|_3#3-4|<>:x#3-7$<i>7]" +
203 "[(4-5)s:b|i:b|_4#4-5]" +
204 "[(5-6)s:c|i:c|_5#5-6]" +
205 "[(6-7)s:a|i:a|_6#6-7]<>:x#6-8$<i>8]" +
206 "[(7-8)s:b|i:b|_7#7-8]" +
207 "[(8-9)s:a|i:a|_8#8-9]" +
208 "[(9-10)s:c|i:c|_9#9-10]");
209 ki.addDoc(fd);
210
Eliza Margarethabefc23f2014-01-20 14:34:15 +0000211 // xbz<x>xbzx</x>bxz
Nils Diewaldf399a672013-11-18 17:55:22 +0000212 fd = new FieldDocument();
213 fd.addString("ID", "doc-2");
214 fd.addTV("base",
215 "xbzxbzxbxz",
216 "[(0-1)s:x|i:x|_0#0-1|-:t$<i>10]" +
217 "[(1-2)s:b|i:b|_1#1-2]" +
218 "[(2-3)s:z|i:z|_2#2-3]" +
219 "[(3-4)s:x|i:x|_3#3-4|<>:x#3-7$<i>7]" +
220 "[(4-5)s:b|i:b|_4#4-5]" +
221 "[(5-6)s:z|i:z|_5#5-6]" +
222 "[(6-7)s:x|i:x|_6#6-7]" +
223 "[(7-8)s:b|i:b|_7#7-8]" +
224 "[(8-9)s:x|i:x|_8#8-9]" +
225 "[(9-10)s:z|i:z|_9#9-10]");
226 ki.addDoc(fd);
227
228
229 ki.commit();
230
231 SpanQuery sq;
232 KorapResult kr;
233
234 sq = new SpanNextQuery(
235 new SpanElementQuery("base", "x"),
236 new SpanTermQuery(new Term("base", "s:b"))
237 );
238
239 kr = ki.search(sq, (short) 10);
240 assertEquals(2, kr.totalResults());
241 assertEquals("abc[abcab]ac", kr.match(0).getSnippetBrackets());
242 assertEquals("xbz[xbzxb]xz", kr.match(1).getSnippetBrackets());
243
244 sq = new SpanNextQuery(
245 new SpanTermQuery(new Term("base", "s:c")),
246 new SpanElementQuery("base", "x")
247 );
248
249 kr = ki.search(sq, (short) 10);
250 assertEquals(1, kr.totalResults());
251 assertEquals("ab[cabca]bac", kr.match(0).getSnippetBrackets());
252
253 sq = new SpanNextQuery(
254 new SpanTermQuery(new Term("base", "s:z")),
255 new SpanElementQuery("base", "x")
256 );
257
258 kr = ki.search(sq, (short) 10);
259 assertEquals(1, kr.totalResults());
260 assertEquals("xb[zxbzx]bxz", kr.match(0).getSnippetBrackets());
261 };
Eliza Margarethabefc23f2014-01-20 14:34:15 +0000262
263 /**
264 * Multiple atomic indices
265 * Skip to a greater doc#
266 * */
267 @Test
268 public void indexExample5 () throws IOException {
269 KorapIndex ki = new KorapIndex();
270 ki.addDoc(createFieldDoc1());
271 ki.addDoc(createFieldDoc2());
272 ki.commit();
273 ki.addDoc(createFieldDoc3());
274 ki.commit();
275
276 SpanQuery sq = new SpanNextQuery(
277 new SpanTermQuery(new Term("base","s:d")),
278 new SpanTermQuery(new Term("base","s:b"))
279 );
280 KorapResult kr = ki.search(sq, (short) 10);
281
282 assertEquals("totalResults", 2, kr.totalResults());
283 // Match #0
284 assertEquals("doc-number", 0, kr.match(0).getLocalDocID());
285 assertEquals("StartPos", 4, kr.match(0).startPos);
286 assertEquals("EndPos", 6, kr.match(0).endPos);
287 // Match #1
288 assertEquals("doc-number", 0, kr.match(1).getLocalDocID());
289 assertEquals("StartPos", 1, kr.match(1).startPos);
290 assertEquals("EndPos", 3, kr.match(1).endPos);
291
292 sq = new SpanNextQuery(
293 new SpanTermQuery(new Term("base","s:b")),
294 new SpanTermQuery(new Term("base","s:d"))
295 );
296 kr = ki.search(sq, (short) 10);
297
298 assertEquals("totalResults", 1, kr.totalResults());
299 assertEquals("doc-number", 0, kr.match(0).getLocalDocID());
300 assertEquals("StartPos", 2, kr.match(0).startPos);
301 assertEquals("EndPos", 4, kr.match(0).endPos);
302 }
303
304 /** Skip to NextSpan */
305 @Test
306 public void indexExample6() throws IOException{
307 KorapIndex ki = new KorapIndex();
308 ki.addDoc(createFieldDoc1());
309 ki.addDoc(createFieldDoc2());
310 ki.addDoc(createFieldDoc3());
311 ki.commit();
312
313 SpanQuery sq = new SpanNextQuery(
314 new SpanTermQuery(new Term("base","s:c")),
315 new SpanNextQuery(
316 new SpanTermQuery(new Term("base","s:d")),
317 new SpanTermQuery(new Term("base","s:b"))
318 )
319 );
320
321 KorapResult kr = ki.search(sq, (short) 10);
322 assertEquals("totalResults", 1, kr.totalResults());
323 assertEquals("doc-number", 2, kr.match(0).getLocalDocID());
324 assertEquals("StartPos", 0, kr.match(0).startPos);
325 assertEquals("EndPos", 3, kr.match(0).endPos);
326
327 }
Nils Diewaldf5f29ff2014-02-14 12:24:34 +0000328
329 @Test
Nils Diewaldbe5943e2014-10-21 19:35:34 +0000330 public void indexExample7Distances () throws Exception{
Nils Diewaldf5f29ff2014-02-14 12:24:34 +0000331 KorapIndex ki = new KorapIndex();
332 ki.addDoc(createFieldDoc1());
333 ki.addDoc(createFieldDoc2());
334 ki.addDoc(createFieldDoc3());
335 ki.addDoc(createFieldDoc4());
336 ki.commit();
337
338 SpanSequenceQueryWrapper sq = new SpanSequenceQueryWrapper("base");
339 sq.append("i:b").append("i:d").withConstraint(1,3);
340
341 KorapResult kr = ki.search(sq.toQuery(), (short) 10);
342
343 assertEquals("totalResults", 3, kr.totalResults());
344 assertEquals("doc-number", "match-doc-0-p2-5", kr.match(0).getID());
345 assertEquals("doc-number", "match-doc-2-p2-4", kr.match(1).getID());
346 assertEquals("doc-number", "match-doc-3-p2-5", kr.match(2).getID());
347 };
348
349 @Test
Nils Diewaldbe5943e2014-10-21 19:35:34 +0000350 public void indexExample8Distances () throws Exception{
Nils Diewaldf5f29ff2014-02-14 12:24:34 +0000351 KorapIndex ki = new KorapIndex();
352 ki.addDoc(createFieldDoc1());
353 ki.addDoc(createFieldDoc2());
354 ki.addDoc(createFieldDoc3());
355 ki.addDoc(createFieldDoc4());
356 ki.commit();
357
358 SpanSequenceQueryWrapper sq = new SpanSequenceQueryWrapper("base");
359 sq.append("i:a").append("i:b").withConstraint(0, 3, "e");
360
361 KorapResult kr = ki.search(sq.toQuery(), (short) 10);
362
363 assertEquals("totalResults", 3, kr.totalResults());
364 assertEquals("doc-number", "match-doc-0-p3-6", kr.match(0).getID());
365 assertEquals("doc-number", "match-doc-1-p1-3", kr.match(1).getID());
366 assertEquals("doc-number", "match-doc-3-p3-6", kr.match(2).getID());
367 };
368
Eliza Margarethaa8491712014-07-25 13:27:54 +0000369 @Test
370 public void indexExample9() throws IOException{
371 KorapIndex ki = new KorapIndex();
372 ki.addDoc(createFieldDoc1());
373 ki.commit();
374
375 SpanQuery sq = new SpanNextQuery(
376 new SpanOrQuery(
377 new SpanTermQuery(new Term("base","s:a")),
378 new SpanTermQuery(new Term("base","s:b"))),
379 new SpanTermQuery(new Term("base","s:c"))
380 );
381
382 KorapResult kr = ki.search(sq, (short) 10);
383
384 assertEquals(0, kr.match(0).getStartPos());
385 assertEquals(2, kr.match(0).getEndPos());
386 assertEquals(3, kr.match(1).getStartPos());
387 assertEquals(5, kr.match(1).getEndPos());
388
389// for (KorapMatch m : kr.getMatches()){
390// System.out.println(m.getStartPos() +" "+ m.getEndPos());
391// }
392 }
393
Eliza Margarethabefc23f2014-01-20 14:34:15 +0000394
395 private FieldDocument createFieldDoc1(){
396 FieldDocument fd = new FieldDocument();
397 fd.addString("ID", "doc-0");
398 fd.addTV("base",
Nils Diewaldf5f29ff2014-02-14 12:24:34 +0000399 "bcbadb",
400 "[(0-1)s:b|i:b|_0#0-1]" +
401 "[(1-2)s:c|i:c|s:b|_1#1-2]" +
402 "[(2-3)s:b|i:b|_2#2-3]" +
403 "[(3-4)s:a|i:a|_3#3-4|<>:e#3-6$<i>6]" +
404 "[(4-5)s:d|i:d|s:c|_4#4-5]" +
405 "[(5-6)s:b|i:b|_5#5-6]");
Eliza Margarethabefc23f2014-01-20 14:34:15 +0000406 return fd;
407 }
408
409 private FieldDocument createFieldDoc2(){
410 FieldDocument fd = new FieldDocument();
411 fd.addString("ID", "doc-1");
412 fd.addTV("base",
Nils Diewaldf5f29ff2014-02-14 12:24:34 +0000413 "caba",
414 "[(0-1)s:c|i:c|_0#0-1]" +
415 "[(1-2)s:a|i:a|s:c|_1#1-2|<>:e#1-3$<i>3]" +
416 "[(2-3)s:b|i:b|s:a|_2#2-3]" +
417 "[(3-4)s:a|i:a|_3#3-4]");
Eliza Margarethabefc23f2014-01-20 14:34:15 +0000418 return fd;
419 }
420
421 private FieldDocument createFieldDoc3(){
422 FieldDocument fd = new FieldDocument();
423 fd.addString("ID", "doc-2");
424 fd.addTV("base",
Nils Diewaldf5f29ff2014-02-14 12:24:34 +0000425 "cdbd",
426 "[(0-1)s:c|i:c|_0#0-1]" +
427 "[(1-2)s:d|i:d|_1#1-2]"+
428 "[(2-3)s:b|i:b|s:a|_2#2-3]"+
429 "[(3-4)s:d|i:d|_3#3-4]");
Eliza Margarethabefc23f2014-01-20 14:34:15 +0000430
431 return fd;
432 }
Nils Diewaldf5f29ff2014-02-14 12:24:34 +0000433
434 private FieldDocument createFieldDoc4(){
435 FieldDocument fd = new FieldDocument();
436 fd.addString("ID", "doc-3");
437 fd.addTV("base",
438 "bcbadb",
439 "[(0-1)s:b|i:b|_0#0-1]" +
440 "[(1-2)s:c|i:c|s:b|<>:s#1-3$<i>3|_1#1-2]" +
441 "[(2-3)s:b|i:b|_2#2-3]" +
442 "[(3-4)s:a|i:a|_3#3-4|<>:e#3-6$<i>6]" +
443 "[(4-5)s:d|i:d|s:c|_4#4-5]" +
444 "[(5-6)s:b|i:b|_5#5-6]");
445 return fd;
446 }
Eliza Margarethabefc23f2014-01-20 14:34:15 +0000447
448
Nils Diewaldf399a672013-11-18 17:55:22 +0000449};