blob: 3f14ab3cc1ef6f26842c9446728df0ed85153ec5 [file] [log] [blame]
Eliza Margaretha45b5be12014-02-04 11:22:46 +00001package de.ids_mannheim.korap.index;
2
Nils Diewaldf399a672013-11-18 17:55:22 +00003import java.util.*;
4import java.io.*;
5
6import org.apache.lucene.util.Version;
7import org.apache.lucene.util.BytesRef;
8import org.apache.lucene.util.Bits;
9
10import static org.junit.Assert.*;
11import org.junit.Test;
12import org.junit.Ignore;
13import org.junit.runner.RunWith;
14import org.junit.runners.JUnit4;
15
16import de.ids_mannheim.korap.KorapIndex;
17import de.ids_mannheim.korap.KorapQuery;
18import de.ids_mannheim.korap.KorapResult;
19import de.ids_mannheim.korap.query.SpanElementQuery;
20import de.ids_mannheim.korap.index.FieldDocument;
21import de.ids_mannheim.korap.analysis.MultiTermTokenStream;
22import org.apache.lucene.search.spans.SpanQuery;
Nils Diewald20607ab2014-03-20 23:28:36 +000023import org.apache.lucene.search.spans.SpanTermQuery;
Nils Diewaldf399a672013-11-18 17:55:22 +000024import org.apache.lucene.index.Term;
25
26
27@RunWith(JUnit4.class)
28public class TestElementIndex {
29
// TODO: Keep the primary data separate, as a non-indexed field.
31
32 @Test
33 public void indexExample1 () throws IOException {
34 KorapIndex ki = new KorapIndex();
35
36 // <a>x<a>y<a>zhij</a>hij</a>hij</a>hij</a>
37 FieldDocument fd = new FieldDocument();
38 fd.addTV("base",
39 "x y z h i j h i j h i j ",
40 "[(0-3)s:x|<>:a#0-3$<i>12]" +
41 "[(3-6)s:y|<>:a#3-6$<i>9]" +
42 "[(6-9)s:z|<>:a#6-9$<i>6]" +
43 "[(9-12)s:h]" +
44 "[(12-15)s:i]" +
45 "[(15-18)s:j]" +
46 "[(18-21)s:h]" +
47 "[(21-24)s:i]" +
48 "[(24-27)s:j]" +
49 "[(27-30)s:h]" +
50 "[(30-33)s:i]" +
51 "[(33-36)s:j]");
52 ki.addDoc(fd);
53
54 // <a>x<a>y<a>zcde</a>cde</a>cde</a>cde</a>
55 fd = new FieldDocument();
56 fd.addTV("base",
57 "x y z c d e c d e c d e ",
58 "[(0-3)s:x|<>:a#0-3$<i>12]" +
59 "[(3-6)s:y|<>:a#3-6$<i>9]" +
60 "[(6-9)s:z|<>:a#6-9$<i>6]" +
61 "[(9-12)s:c]" +
62 "[(12-15)s:d]" +
63 "[(15-18)s:e]" +
64 "[(18-21)s:c]" +
65 "[(21-24)s:d]" +
66 "[(24-27)s:e]" +
67 "[(27-30)s:c]" +
68 "[(30-33)s:d]" +
69 "[(33-36)s:e]");
70 ki.addDoc(fd);
71
72 // Save documents
73 ki.commit();
74
75 assertEquals(2, ki.numberOf("documents"));
76
77 SpanQuery sq = new SpanElementQuery("base", "a");
78
79 KorapResult kr = ki.search(sq, (short) 10);
80
Nils Diewalde1ecd5e2014-11-27 02:17:24 +000081 assertEquals("totalResults", kr.getTotalResults(), 6);
Nils Diewaldf399a672013-11-18 17:55:22 +000082
Nils Diewalde1ecd5e2014-11-27 02:17:24 +000083 assertEquals("StartPos (0)", 0, kr.getMatch(0).startPos);
84 assertEquals("EndPos (0)", 12, kr.getMatch(0).endPos);
85 assertEquals("StartPos (1)", 1, kr.getMatch(1).startPos);
86 assertEquals("EndPos (1)", 9, kr.getMatch(1).endPos);
87 assertEquals("StartPos (2)", 2, kr.getMatch(2).startPos);
88 assertEquals("EndPos (2)", 6, kr.getMatch(2).endPos);
Nils Diewaldf399a672013-11-18 17:55:22 +000089
Nils Diewalde1ecd5e2014-11-27 02:17:24 +000090 assertEquals("StartPos (0)", 0, kr.getMatch(3).startPos);
91 assertEquals("EndPos (0)", 12, kr.getMatch(3).endPos);
92 assertEquals("StartPos (1)", 1, kr.getMatch(4).startPos);
93 assertEquals("EndPos (1)", 9, kr.getMatch(4).endPos);
94 assertEquals("StartPos (2)", 2, kr.getMatch(5).startPos);
95 assertEquals("EndPos (2)", 6, kr.getMatch(5).endPos);
Nils Diewaldf399a672013-11-18 17:55:22 +000096
97 // System.err.println(kr.toJSON());
98 };
99
100 @Test
101 public void indexExample2 () throws IOException {
102 KorapIndex ki = new KorapIndex();
103
104 // <a><a><a>h</a>hhij</a>hij</a>hij</a>
105 FieldDocument fd = new FieldDocument();
106 fd.addTV("base",
107 "h h i j h i j h i j ",
108 "[(0-3)s:h|<>:a#0-27$<i>6|<>:a#0-18$<i>3|<>:a#0-36$<i>9]" +
109 "[(3-6)s:h]" +
110 "[(12-15)s:i]" +
111 "[(15-18)s:j]" +
112 "[(18-21)s:h]" +
113 "[(21-24)s:i]" +
114 "[(24-27)s:j]" +
115 "[(27-30)s:h]" +
116 "[(30-33)s:i]" +
117 "[(33-36)s:j]");
118 ki.addDoc(fd);
119
120 // Save documents
121 ki.commit();
122
123 assertEquals(1, ki.numberOf("documents"));
124
125 SpanQuery sq = new SpanElementQuery("base", "a");
126
127 KorapResult kr = ki.search(sq, (short) 10);
128
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000129 assertEquals("totalResults", kr.getTotalResults(), 3);
130 assertEquals("StartPos (0)", 0, kr.getMatch(0).startPos);
131 assertEquals("EndPos (0)", 3, kr.getMatch(0).endPos);
132 assertEquals("StartPos (1)", 0, kr.getMatch(1).startPos);
133 assertEquals("EndPos (1)", 6, kr.getMatch(1).endPos);
134 assertEquals("StartPos (2)", 0, kr.getMatch(2).startPos);
135 assertEquals("EndPos (2)", 9, kr.getMatch(2).endPos);
Nils Diewaldf399a672013-11-18 17:55:22 +0000136 };
137
138 @Test
139 public void indexExample3 () throws IOException {
140 KorapIndex ki = new KorapIndex();
141
142 // <a><a><a>u</a></a></a>
143 FieldDocument fd = new FieldDocument();
144 fd.addTV("base",
145 "xyz",
146 "[(0-3)s:xyz|<>:a#0-3$<i>0|<>:a#0-3$<i>0|<>:a#0-3$<i>0|<>:b#0-3$<i>0]");
147 ki.addDoc(fd);
148
149 // <a><b>x<a>y<a>zcde</a>cde</a>cde</b></a>
150 fd = new FieldDocument();
151 fd.addTV("base",
152 "x y z c d e c d e c d e ",
153 "[(0-3)s:x|<>:a#0-36$<i>12|<>:b#0-36$<i>12]" +
154 "[(3-6)s:y|<>:a#3-27$<i>9]" +
155 "[(6-9)s:z|<>:a#6-18$<i>6]" +
156 "[(9-12)s:c]" +
157 "[(12-15)s:d]" +
158 "[(15-18)s:e]" +
159 "[(18-21)s:c]" +
160 "[(21-24)s:d]" +
161 "[(24-27)s:e]" +
162 "[(27-30)s:c]" +
163 "[(30-33)s:d]" +
164 "[(33-36)s:e]");
165 ki.addDoc(fd);
166
167 // xyz
168 fd = new FieldDocument();
169 fd.addTV("base",
170 "x y z ",
171 "[(0-3)s:x]" +
172 "[(3-6)s:y]" +
173 "[(6-9)s:z]");
174 ki.addDoc(fd);
175
176 // <a>x<a><b>y<a>zcde</a>cde</b></a>cde</a>
177 fd = new FieldDocument();
178 fd.addTV("base",
179 "x y z k l m k l m k l m ",
180 "[(0-3)s:x|<>:a#0-3$<i>12]" +
181 "[(3-6)s:y|<>:a#3-6$<i>9|<>:b#3-6$<i>9]" +
182 "[(6-9)s:z|<>:a#6-9$<i>6]" +
183 "[(9-12)s:k]" +
184 "[(12-15)s:l]" +
185 "[(15-18)s:m]" +
186 "[(18-21)s:k]" +
187 "[(21-24)s:l]" +
188 "[(24-27)s:m]" +
189 "[(27-30)s:k]" +
190 "[(30-33)s:l]" +
191 "[(33-36)s:m]");
192 ki.addDoc(fd);
193
194 // <a><a><a>h</a>hhij</a>hij</a>hij</a>
195 fd = new FieldDocument();
196 fd.addTV("base",
197 "h h i j h i j h i j ",
198 "[(0-3)s:h|<>:a#0-27$<i>6|<>:a#0-18$<i>3|<>:a#0-36$<i>9]" +
199 "[(3-6)s:h]" +
200 "[(12-15)s:i]" +
201 "[(15-18)s:j]" +
202 "[(18-21)s:h]" +
203 "[(21-24)s:i]" +
204 "[(24-27)s:j]" +
205 "[(27-30)s:h]" +
206 "[(30-33)s:i]" +
207 "[(33-36)s:j]");
208 ki.addDoc(fd);
209
210 // xyz
211 fd = new FieldDocument();
212 fd.addTV("base",
213 "a b c ",
214 "[(0-3)s:a]" +
215 "[(3-6)s:b]" +
216 "[(6-9)s:c]");
217 ki.addDoc(fd);
218
219
220 // Save documents
221 ki.commit();
222
223 assertEquals(6, ki.numberOf("documents"));
224
225 SpanQuery sq = new SpanElementQuery("base", "a");
226
227 KorapResult kr = ki.search(sq, (short) 15);
228
229 // System.err.println(kr.toJSON());
230
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000231 assertEquals("totalResults", kr.getTotalResults(), 12);
Nils Diewaldf399a672013-11-18 17:55:22 +0000232
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000233 assertEquals("StartPos (0)", 0, kr.getMatch(0).startPos);
234 assertEquals("EndPos (0)", 0, kr.getMatch(0).endPos);
235 assertEquals("StartPos (1)", 0, kr.getMatch(1).startPos);
236 assertEquals("EndPos (1)", 0, kr.getMatch(1).endPos);
237 assertEquals("StartPos (2)", 0, kr.getMatch(2).startPos);
238 assertEquals("EndPos (2)", 0, kr.getMatch(2).endPos);
Nils Diewaldf399a672013-11-18 17:55:22 +0000239
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000240 assertEquals("StartPos (3)", 0, kr.getMatch(3).startPos);
241 assertEquals("EndPos (3)", 12, kr.getMatch(3).endPos);
242 assertEquals("StartPos (4)", 1, kr.getMatch(4).startPos);
243 assertEquals("EndPos (4)", 9, kr.getMatch(4).endPos);
244 assertEquals("StartPos (5)", 2, kr.getMatch(5).startPos);
245 assertEquals("EndPos (5)", 6, kr.getMatch(5).endPos);
Nils Diewaldf399a672013-11-18 17:55:22 +0000246
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000247 assertEquals("StartPos (6)", 0, kr.getMatch(6).startPos);
248 assertEquals("EndPos (6)", 12, kr.getMatch(6).endPos);
249 assertEquals("StartPos (7)", 1, kr.getMatch(7).startPos);
250 assertEquals("EndPos (7)", 9, kr.getMatch(7).endPos);
251 assertEquals("StartPos (8)", 2, kr.getMatch(8).startPos);
252 assertEquals("EndPos (8)", 6, kr.getMatch(8).endPos);
Nils Diewaldf399a672013-11-18 17:55:22 +0000253
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000254 assertEquals("StartPos (9)", 0, kr.getMatch(9).startPos);
255 assertEquals("EndPos (9)", 3, kr.getMatch(9).endPos);
256 assertEquals("StartPos (10)", 0, kr.getMatch(10).startPos);
257 assertEquals("EndPos (10)", 6, kr.getMatch(10).endPos);
258 assertEquals("StartPos (11)", 0, kr.getMatch(11).startPos);
259 assertEquals("EndPos (11)", 9, kr.getMatch(11).endPos);
Nils Diewaldf399a672013-11-18 17:55:22 +0000260 };
261
262
263 @Test
264 public void indexExample4 () throws IOException {
265 KorapIndex ki = new KorapIndex();
266
267 FieldDocument fd = new FieldDocument();
268 fd.addTV("base",
269 "111111ccc222222fff333333iiijjj",
270 "[(0-3)s:a|_0#0-3]" +
271 "[(3-6)s:b|_1#3-6]" +
272 "[(6-9)s:c|_2#6-9]" +
273 "[(9-12)s:d|_3#9-12|<>:a#9-15$<i>4]" +
274 "[(12-15)s:e|_4#12-15]" +
275 "[(15-18)s:f|_5#15-18]" +
276 "[(18-21)s:g|_6#18-21|<>:a#18-24$<i>8]" +
277 "[(21-24)s:h|_7#21-24]" +
278 "[(24-27)s:i|_8#24-27]" +
279 "[(27-30)s:j|_9#27-30]");
280 ki.addDoc(fd);
281
282 // Save documents
283 ki.commit();
284
285 assertEquals(1, ki.numberOf("documents"));
286
287 SpanQuery sq = new SpanElementQuery("base", "a");
288
289 KorapResult kr = ki.search(sq, 0, (short) 15, false, (short) 3, false, (short) 3);
290
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000291 assertEquals("... ccc[222222]fff ...", kr.getMatch(0).getSnippetBrackets());
292 assertEquals("... fff[333333]iii ...", kr.getMatch(1).getSnippetBrackets());
Nils Diewaldf399a672013-11-18 17:55:22 +0000293 };
294
295
296 @Test
297 public void indexExample5 () throws IOException {
298 KorapIndex ki = new KorapIndex();
299
300 FieldDocument fd = new FieldDocument();
301 fd.addTV("base",
302 "111111ccc222222fff333333iiijjj",
303 "[(0-3)s:a|_0#0-3|<>:a#0-6$<i>1]" +
304 "[(3-6)s:b|_1#3-6]" +
305 "[(6-9)s:c|_2#6-9]" +
306 "[(9-12)s:d|_3#9-12|<>:a#9-15$<i>4]" +
307 "[(12-15)s:e|_4#12-15]" +
308 "[(15-18)s:f|_5#15-18]" +
309 "[(18-21)s:g|_6#18-21|<>:a#18-24$<i>8]" +
310 "[(21-24)s:h|_7#21-24]" +
311 "[(24-27)s:i|_8#24-27]" +
312 "[(27-30)s:j|_9#27-30]");
313 ki.addDoc(fd);
314
315 // Save documents
316 ki.commit();
317
318 assertEquals(1, ki.numberOf("documents"));
319
320 SpanQuery sq = new SpanElementQuery("base", "a");
321
322 KorapResult kr = ki.search(sq, 0, (short) 15, false, (short) 3, false, (short) 3);
323
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000324 assertEquals("[111111]ccc ...", kr.getMatch(0).getSnippetBrackets());
325 assertEquals("... ccc[222222]fff ...", kr.getMatch(1).getSnippetBrackets());
326 assertEquals("... fff[333333]iii ...", kr.getMatch(2).getSnippetBrackets());
Nils Diewaldf399a672013-11-18 17:55:22 +0000327 };
Nils Diewald20607ab2014-03-20 23:28:36 +0000328
329
330 @Test
331 public void indexExample6 () throws IOException {
332
333 KorapIndex ki = new KorapIndex();
334
335 // <a>x<a>y<a>zhij</a>hij</a>hij</a>
336 FieldDocument fd = new FieldDocument();
337 fd.addTV("base",
338 "x y z h i j h i j h i j ",
339 "[(0-3)s:x|_0#0-3|<>:a#0-36$<i>12]" + // 1
340 "[(3-6)s:y|_1#3-6|<>:a#3-27$<i>9]" + // 2
341 "[(6-9)s:z|_2#6-9|<>:a#6-18$<i>6]" + // 3
342 "[(9-12)s:h|_3#9-12]" + // 4
343 "[(12-15)s:i|_4#12-15]" + // 5
344 "[(15-18)s:j|_5#15-18]" + // 6
345 "[(18-21)s:h|_6#18-21]" + // 7
346 "[(21-24)s:i|_7#21-24]" + // 8
347 "[(24-27)s:j|_8#24-27]" + // 9
348 "[(27-30)s:h|_9#27-30]" + // 10
349 "[(30-33)s:i|_10#30-33]" + // 11
350 "[(33-36)s:j|_11#33-36]"); // 12
351 ki.addDoc(fd);
352
353 fd = new FieldDocument();
354 fd.addTV("base",
355 "x y z h ",
356 "[(0-3)s:x|_0#0-3]" + // 1
357 "[(3-6)s:y|_1#3-6]" + // 2
358 "[(6-9)s:z|_2#6-9]" + // 3
359 "[(9-12)s:h|_3#9-12]"); // 4
360 ki.addDoc(fd);
361
362 // Here is a larger offset than expected
363 fd = new FieldDocument();
364 fd.addTV("base",
365 "x y z h ",
366 "[(0-3)s:x|_0#0-3|<>:a#0-36$<i>12]" + // 1
367 "[(3-6)s:y|_1#3-6]" + // 2
368 "[(6-9)s:z|_2#6-9]" + // 3
369 "[(9-12)s:h|_3#9-12]"); // 4
370 ki.addDoc(fd);
371
372 // <a>x<a>y<a>zabc</a>abc</a>abc</a>
373 fd = new FieldDocument();
374 fd.addTV("base",
375 "x y z a b c a b c a b c ",
376 "[(0-3)s:x|_0#0-3|<>:a#0-36$<i>12]" + // 1
377 "[(3-6)s:y|_1#3-6|<>:a#3-27$<i>9]" + // 2
378 "[(6-9)s:z|_2#6-9|<>:a#6-18$<i>6]" + // 3
379 "[(9-12)s:a|_3#9-12]" + // 4
380 "[(12-15)s:b|_4#12-15]" + // 5
381 "[(15-18)s:c|_5#15-18]" + // 6
382 "[(18-21)s:a|_6#18-21]" + // 7
383 "[(21-24)s:b|_7#21-24]" + // 8
384 "[(24-27)s:c|_8#24-27]" + // 9
385 "[(27-30)s:a|_9#27-30]" + // 10
386 "[(30-33)s:b|_10#30-33]" + // 11
387 "[(33-36)s:c|_11#33-36]"); // 12
388 ki.addDoc(fd);
389
390 fd = new FieldDocument();
391 fd.addTV("base",
392 "x y z h ",
393 "[(0-3)s:x|_0#0-3]" + // 1
394 "[(3-6)s:y|_1#3-6]" + // 2
395 "[(6-9)s:z|_2#6-9]" + // 3
396 "[(9-12)s:h|_3#9-12]"); // 4
397 ki.addDoc(fd);
398
399 // Save documents
400 ki.commit();
401
402 SpanQuery sq;
403 KorapResult kr;
404
405 sq = new SpanElementQuery("base", "a");
406 kr = ki.search(sq, (short) 15);
407
408 // System.err.println(kr.toJSON());
409
410 assertEquals(5, ki.numberOf("documents"));
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000411 assertEquals("totalResults", kr.getTotalResults(), 7);
Nils Diewald20607ab2014-03-20 23:28:36 +0000412 };
Nils Diewaldf399a672013-11-18 17:55:22 +0000413};