blob: bc113ef65085f56a4c2f2c54c6d4a1c3c16086bf [file] [log] [blame]
Akron3ba74f22015-07-24 18:46:17 +02001package de.ids_mannheim.korap.collection;
Akron40550172015-08-04 03:06:12 +02002
margaretha2a30bd42021-02-01 16:56:36 +01003import static de.ids_mannheim.korap.TestSimple.getJsonString;
4import static org.junit.Assert.assertEquals;
margaretha2a30bd42021-02-01 16:56:36 +01005import static org.junit.Assert.assertTrue;
Akron3ba74f22015-07-24 18:46:17 +02006
margaretha2a30bd42021-02-01 16:56:36 +01007import java.io.IOException;
Akronb59f40e2018-08-23 17:15:43 +02008
Akronfd05f502015-07-30 18:34:26 +02009import org.apache.lucene.index.Term;
Akronfd05f502015-07-30 18:34:26 +020010import org.apache.lucene.search.spans.SpanQuery;
11import org.apache.lucene.search.spans.SpanTermQuery;
margaretha2a30bd42021-02-01 16:56:36 +010012import org.junit.Test;
Akron3ba74f22015-07-24 18:46:17 +020013import org.junit.runner.RunWith;
14import org.junit.runners.JUnit4;
15
margaretha2a30bd42021-02-01 16:56:36 +010016import de.ids_mannheim.korap.Krill;
17import de.ids_mannheim.korap.KrillCollection;
18import de.ids_mannheim.korap.KrillIndex;
19import de.ids_mannheim.korap.index.FieldDocument;
20import de.ids_mannheim.korap.query.QueryBuilder;
21import de.ids_mannheim.korap.response.Result;
22import de.ids_mannheim.korap.response.SearchContext;
margaretha2a30bd42021-02-01 16:56:36 +010023import de.ids_mannheim.korap.util.StatusCodes;
Akronb59f40e2018-08-23 17:15:43 +020024
25
Akron3ba74f22015-07-24 18:46:17 +020026@RunWith(JUnit4.class)
27public class TestKrillCollectionIndex {
28 private KrillIndex ki;
29
Akronb59f40e2018-08-23 17:15:43 +020030 final String path = "/queries/collections/";
Akron40550172015-08-04 03:06:12 +020031
Akron3ba74f22015-07-24 18:46:17 +020032 @Test
margarethaee683ff2017-07-03 12:27:28 +020033 public void testKrillCollectionWithWrongJson () throws IOException {
34 ki = new KrillIndex();
35 ki.addDoc(createDoc1());
36 ki.addDoc(createDoc2());
37 ki.addDoc(createDoc3());
38 ki.commit();
39
40 KrillCollection kc = new KrillCollection("{lalala}");
Akron5e3436f2017-07-04 15:28:03 +020041 assertEquals("Unable to parse JSON", kc.getError(0).getMessage());
margarethaee683ff2017-07-03 12:27:28 +020042 kc.setIndex(ki);
Akron5e3436f2017-07-04 15:28:03 +020043
margarethaee683ff2017-07-03 12:27:28 +020044 long docs = 0, tokens = 0, sentences = 0, paragraphs = 0;
45 try {
46 docs = kc.numberOf("documents");
47 tokens = kc.numberOf("tokens");
48 sentences = kc.numberOf("sentences");
49 paragraphs = kc.numberOf("paragraphs");
50 }
51 catch (IOException e) {
52 e.printStackTrace();
53 }
54 assertEquals(0, docs);
55 assertEquals(0, tokens);
56 assertEquals(0, sentences);
57 assertEquals(0, paragraphs);
margaretha30a03ae2017-07-11 19:04:09 +020058
59 assertEquals(1, kc.getErrors().size());
60 assertEquals(StatusCodes.UNABLE_TO_PARSE_JSON, kc.getErrors().get(0).getCode());
margarethaee683ff2017-07-03 12:27:28 +020061 }
62
63
64 @Test
Akron3ba74f22015-07-24 18:46:17 +020065 public void testIndexWithCollectionBuilder () throws IOException {
66 ki = new KrillIndex();
67 ki.addDoc(createDoc1());
68 ki.addDoc(createDoc2());
69 ki.addDoc(createDoc3());
70 ki.commit();
Akron176c9b12015-07-29 19:53:40 +020071 CollectionBuilder cb = new CollectionBuilder();
72 KrillCollection kcn = new KrillCollection(ki);
Akron3ba74f22015-07-24 18:46:17 +020073
74 // Simple string tests
75 kcn.fromBuilder(cb.term("author", "Frank"));
76 assertEquals(1, kcn.docCount());
77
78 kcn.fromBuilder(cb.term("author", "Peter"));
79 assertEquals(1, kcn.docCount());
80
81 kcn.fromBuilder(cb.term("author", "Sebastian"));
82 assertEquals(1, kcn.docCount());
83
84 kcn.fromBuilder(cb.term("author", "Michael"));
85 assertEquals(0, kcn.docCount());
86
margarethaee683ff2017-07-03 12:27:28 +020087 kcn.fromBuilder(cb.term("nothing", "nothing"));
Akronc346ce42017-07-02 19:14:07 +020088 assertEquals(0, kcn.docCount());
89
Akron3ba74f22015-07-24 18:46:17 +020090 kcn.fromBuilder(cb.term("textClass", "reisen"));
91 assertEquals(3, kcn.docCount());
92
93 kcn.fromBuilder(cb.term("textClass", "kultur"));
94 assertEquals(2, kcn.docCount());
95
96 kcn.fromBuilder(cb.term("textClass", "finanzen"));
97 assertEquals(1, kcn.docCount());
98
99 // Simple orGroup tests
Akron40550172015-08-04 03:06:12 +0200100 kcn.fromBuilder(cb.orGroup().with(cb.term("author", "Frank"))
101 .with(cb.term("author", "Michael")));
Akron3ba74f22015-07-24 18:46:17 +0200102 assertEquals(1, kcn.docCount());
103
Akron40550172015-08-04 03:06:12 +0200104 kcn.fromBuilder(cb.orGroup().with(cb.term("author", "Frank"))
105 .with(cb.term("author", "Sebastian")));
Akron3ba74f22015-07-24 18:46:17 +0200106 assertEquals(2, kcn.docCount());
107
Akron176c9b12015-07-29 19:53:40 +0200108 kcn.fromBuilder(cb.orGroup().with(cb.term("author", "Frank"))
Akron40550172015-08-04 03:06:12 +0200109 .with(cb.term("author", "Sebastian"))
110 .with(cb.term("author", "Peter")));
Akron3ba74f22015-07-24 18:46:17 +0200111 assertEquals(3, kcn.docCount());
112
Akron176c9b12015-07-29 19:53:40 +0200113 kcn.fromBuilder(cb.orGroup().with(cb.term("author", "Huhu"))
Akron40550172015-08-04 03:06:12 +0200114 .with(cb.term("author", "Haha"))
115 .with(cb.term("author", "Hehe")));
Akron3ba74f22015-07-24 18:46:17 +0200116 assertEquals(0, kcn.docCount());
117
118 // Multi field orGroup tests
Akron40550172015-08-04 03:06:12 +0200119 kcn.fromBuilder(cb.orGroup().with(cb.term("ID", "doc-1"))
120 .with(cb.term("author", "Peter")));
Akron3ba74f22015-07-24 18:46:17 +0200121 assertEquals(2, kcn.docCount());
122
Akron40550172015-08-04 03:06:12 +0200123 kcn.fromBuilder(cb.orGroup().with(cb.term("ID", "doc-1"))
124 .with(cb.term("author", "Frank")));
Akron3ba74f22015-07-24 18:46:17 +0200125 assertEquals(1, kcn.docCount());
126
Akron40550172015-08-04 03:06:12 +0200127 kcn.fromBuilder(cb.orGroup().with(cb.term("ID", "doc-1"))
128 .with(cb.term("author", "Michael")));
Akron3ba74f22015-07-24 18:46:17 +0200129 assertEquals(1, kcn.docCount());
130
131 // Simple andGroup tests
Akron40550172015-08-04 03:06:12 +0200132 kcn.fromBuilder(cb.andGroup().with(cb.term("author", "Frank"))
133 .with(cb.term("author", "Michael")));
Akron3ba74f22015-07-24 18:46:17 +0200134 assertEquals(0, kcn.docCount());
135
Akron40550172015-08-04 03:06:12 +0200136 kcn.fromBuilder(cb.andGroup().with(cb.term("ID", "doc-1"))
137 .with(cb.term("author", "Frank")));
Akron3ba74f22015-07-24 18:46:17 +0200138 assertEquals(1, kcn.docCount());
139
140 // andGroup in keyword field test
Akron40550172015-08-04 03:06:12 +0200141 kcn.fromBuilder(cb.andGroup().with(cb.term("textClass", "reisen"))
142 .with(cb.term("textClass", "finanzen")));
Akron3ba74f22015-07-24 18:46:17 +0200143 assertEquals(1, kcn.docCount());
144
Akron40550172015-08-04 03:06:12 +0200145 kcn.fromBuilder(cb.andGroup().with(cb.term("textClass", "reisen"))
146 .with(cb.term("textClass", "kultur")));
Akron3ba74f22015-07-24 18:46:17 +0200147 assertEquals(2, kcn.docCount());
148
Akron40550172015-08-04 03:06:12 +0200149 kcn.fromBuilder(cb.andGroup().with(cb.term("textClass", "finanzen"))
150 .with(cb.term("textClass", "kultur")));
Akron3ba74f22015-07-24 18:46:17 +0200151 assertEquals(0, kcn.docCount());
Akron80cba8d2015-07-27 17:27:46 +0200152
Akron26207572018-04-04 20:21:42 +0200153 kcn.fromBuilder(cb.term("text", "mann"));
Akron80cba8d2015-07-27 17:27:46 +0200154 assertEquals(3, kcn.docCount());
155
Akron26207572018-04-04 20:21:42 +0200156 kcn.fromBuilder(cb.term("text", "frau"));
Akron80cba8d2015-07-27 17:27:46 +0200157 assertEquals(1, kcn.docCount());
Akron3ba74f22015-07-24 18:46:17 +0200158 };
159
Akron40550172015-08-04 03:06:12 +0200160
Akron5a44f2b2018-01-09 23:26:41 +0100161 @Test
162 public void testIndexWithRegex () throws IOException {
163 ki = new KrillIndex();
164 ki.addDoc(createDoc1());
165 ki.addDoc(createDoc2());
166 ki.addDoc(createDoc3());
167 ki.commit();
168 CollectionBuilder cb = new CollectionBuilder();
169 KrillCollection kcn = new KrillCollection(ki);
170
171 // Frank, Sebastian
172 kcn.fromBuilder(cb.re("author", ".*an.*"));
173 assertEquals(2, kcn.docCount());
174
175 // Kultur & Reisen,
176 // Reisen & Finanzen,
177 // Nachricht & Kultur & Reisen
178 kcn.fromBuilder(cb.re("textClass", ".*(ult|eis).*"));
179 assertEquals(3, kcn.docCount());
180
181 // Test in group
182 kcn.fromBuilder(
183 cb.andGroup().with(cb.term("textClass", "reisen")).with(cb.term("textClass", "kultur"))
184 );
185 assertEquals(2, kcn.docCount());
186
187 kcn.fromBuilder(
188 cb.andGroup().with(
189 cb.re("textClass", ".*eis.*")
190 ).with(
191 cb.re("textClass", ".*ult.*")
192 )
193 );
194 assertEquals(2, kcn.docCount());
195
196 kcn.fromBuilder(
197 cb.andGroup().with(
198 cb.re("textClass", ".*eis.*")
199 ).with(
200 cb.orGroup().with(
201 cb.re("textClass", ".*ult.*")
202 ).with(
203 cb.re("textClass", ".*nan.*")
204 )
205 )
206 );
207 assertEquals(3, kcn.docCount());
208 };
209
210
Akron3ba74f22015-07-24 18:46:17 +0200211 @Test
Akron6b0be132019-09-16 19:01:59 +0200212 public void testIndexWithNegation1 () throws IOException {
Akron3ba74f22015-07-24 18:46:17 +0200213 ki = new KrillIndex();
214 ki.addDoc(createDoc1());
215 ki.addDoc(createDoc2());
216 ki.addDoc(createDoc3());
217 ki.commit();
Akron176c9b12015-07-29 19:53:40 +0200218 CollectionBuilder cb = new CollectionBuilder();
219 KrillCollection kcn = new KrillCollection(ki);
Akron3ba74f22015-07-24 18:46:17 +0200220
221 // Simple negation tests
222 kcn.fromBuilder(cb.term("author", "Frank").not());
223 assertEquals(2, kcn.docCount());
224
225 kcn.fromBuilder(cb.term("textClass", "reisen").not());
226 assertEquals(0, kcn.docCount());
227
228 kcn.fromBuilder(cb.term("textClass", "kultur").not());
229 assertEquals(1, kcn.docCount());
230
231 // orGroup with simple Negation
Akron40550172015-08-04 03:06:12 +0200232 kcn.fromBuilder(cb.orGroup().with(cb.term("textClass", "kultur").not())
233 .with(cb.term("author", "Peter")));
Akron3ba74f22015-07-24 18:46:17 +0200234 assertEquals(2, kcn.docCount());
235
Akron40550172015-08-04 03:06:12 +0200236 kcn.fromBuilder(cb.orGroup().with(cb.term("textClass", "kultur").not())
237 .with(cb.term("author", "Sebastian")));
Akron3ba74f22015-07-24 18:46:17 +0200238 assertEquals(1, kcn.docCount());
Akron6b0be132019-09-16 19:01:59 +0200239
240 kcn.fromBuilder(
241 cb.andGroup().with(
242 cb.term("author", "Frank").not()
243 )
244 .with(
245 cb.term("author", "Sebastian").not()
246 )
247 );
248 assertEquals("AndGroup(-author:Frank -author:Sebastian)", kcn.toString());
249 assertEquals(1, kcn.docCount());
250
251
252 kcn.fromBuilder(
253 cb.andGroup().with(
254 cb.term("author", "Peter")
255 )
256 .with(
257 cb.andGroup().with(
258 cb.term("author", "Frank").not()
259 )
260 .with(
261 cb.term("author", "Sebastian").not()
262 )
263 )
264 );
265 assertEquals("AndGroup(author:Peter AndGroup(-author:Frank -author:Sebastian))", kcn.toString());
266 assertEquals(1, kcn.docCount());
267
268 kcn.fromBuilder(
269 cb.andGroup().with(
270 cb.re("textClass", "reis.*")
271 )
272 .with(
273 cb.andGroup().with(
274 cb.term("author", "Frank").not()
275 )
276 .with(
277 cb.term("author", "Sebastian").not()
278 )
279 )
280 );
281 assertEquals("AndGroup(QueryWrapperFilter(textClass:/reis.*/) AndGroup(-author:Frank -author:Sebastian))", kcn.toString());
282 assertEquals(1, kcn.docCount());
Akron3ba74f22015-07-24 18:46:17 +0200283 };
284
Akron40550172015-08-04 03:06:12 +0200285
Akron3ba74f22015-07-24 18:46:17 +0200286 @Test
Akron6b0be132019-09-16 19:01:59 +0200287 public void testIndexWithNegation2 () throws IOException {
288 ki = new KrillIndex();
289 ki.addDoc(createDoc1());
290 ki.commit();
291 ki.addDoc(createDoc2());
292 ki.commit();
293 ki.addDoc(createDoc3());
294 ki.commit();
295 CollectionBuilder cb = new CollectionBuilder();
296 KrillCollection kcn = new KrillCollection(ki);
297
298 // Simple negation tests
299 kcn.fromBuilder(cb.term("author", "Frank").not());
300 assertEquals(2, kcn.docCount());
301
302 kcn.fromBuilder(cb.term("textClass", "reisen").not());
303 assertEquals(0, kcn.docCount());
304
305 kcn.fromBuilder(cb.term("textClass", "kultur").not());
306 assertEquals(1, kcn.docCount());
307
308 // orGroup with simple Negation
309 kcn.fromBuilder(cb.orGroup().with(cb.term("textClass", "kultur").not())
310 .with(cb.term("author", "Peter")));
311 assertEquals(2, kcn.docCount());
312
313 kcn.fromBuilder(cb.orGroup().with(cb.term("textClass", "kultur").not())
314 .with(cb.term("author", "Sebastian")));
315 assertEquals(1, kcn.docCount());
316
317 kcn.fromBuilder(
318 cb.andGroup().with(
319 cb.term("author", "Frank").not()
320 )
321 .with(
322 cb.term("author", "Sebastian").not()
323 )
324 );
325 assertEquals("AndGroup(-author:Frank -author:Sebastian)", kcn.toString());
326 assertEquals(1, kcn.docCount());
327
328 kcn.fromBuilder(
329 cb.andGroup().with(
330 cb.term("author", "Peter")
331 )
332 .with(
333 cb.andGroup().with(
334 cb.term("author", "Frank").not()
335 )
336 .with(
337 cb.term("author", "Sebastian").not()
338 )
339 )
340 );
341 assertEquals("AndGroup(author:Peter AndGroup(-author:Frank -author:Sebastian))", kcn.toString());
342 assertEquals(1, kcn.docCount());
343
344 kcn.fromBuilder(
345 cb.andGroup().with(
346 cb.re("textClass", "reis..")
347 )
348 .with(
349 cb.andGroup().with(
350 cb.term("author", "Frank").not()
351 )
352 .with(
353 cb.term("author", "Sebastian").not()
354 )
355 )
356 );
357 assertEquals("AndGroup(QueryWrapperFilter(textClass:/reis../) AndGroup(-author:Frank -author:Sebastian))", kcn.toString());
358 assertEquals(1, kcn.docCount());
359 };
360
361 @Test
362 public void testIndexWithNegation3 () throws IOException {
363
364 // This is identical to above but the operands are switched
365 ki = new KrillIndex();
366 ki.addDoc(createDoc1());
367 ki.commit();
368 ki.addDoc(createDoc2());
369 ki.commit();
370 ki.addDoc(createDoc3());
371 ki.commit();
372 CollectionBuilder cb = new CollectionBuilder();
373 KrillCollection kcn = new KrillCollection(ki);
374
375 // orGroup with simple Negation
376 kcn.fromBuilder(
377 cb.orGroup().with(cb.term("author", "Peter"))
378 .with(cb.term("textClass", "kultur").not()));
379 assertEquals(2, kcn.docCount());
380
381 kcn.fromBuilder(cb.orGroup().with(cb.term("author", "Sebastian"))
382 .with(cb.term("textClass", "kultur").not()));
383 assertEquals(1, kcn.docCount());
384
385 kcn.fromBuilder(
386 cb.andGroup().with(
387 cb.term("author", "Sebastian").not()
388 )
389 .with(
390 cb.term("author", "Frank").not()
391 )
392 );
393 assertEquals("AndGroup(-author:Sebastian -author:Frank)", kcn.toString());
394 assertEquals(1, kcn.docCount());
395
396 kcn.fromBuilder(
397 cb.andGroup().with(
398 cb.andGroup().with(
399 cb.term("author", "Sebastian").not()
400 )
401 .with(
402 cb.term("author", "Frank").not()
403 )
404 )
405 .with(
406 cb.term("author", "Peter")
407 )
408 );
409 assertEquals("AndGroup(AndGroup(-author:Sebastian -author:Frank) author:Peter)", kcn.toString());
410 assertEquals(1, kcn.docCount());
411
412 kcn.fromBuilder(
413 cb.andGroup().with(
414 cb.andGroup().with(
415 cb.term("author", "Sebastian").not()
416 )
417 .with(
418 cb.term("author", "Frank").not()
419 )
420 )
421 .with(
422 cb.re("textClass", "reis..")
423 )
424 );
425 assertEquals("AndGroup(AndGroup(-author:Sebastian -author:Frank) QueryWrapperFilter(textClass:/reis../))", kcn.toString());
426 assertEquals(1, kcn.docCount());
427 };
428
429
430 @Test
Akron80cba8d2015-07-27 17:27:46 +0200431 public void testIndexWithMultipleCommitsAndDeletes () throws IOException {
Akron3ba74f22015-07-24 18:46:17 +0200432 ki = new KrillIndex();
433 ki.addDoc(createDoc1());
434 ki.addDoc(createDoc2());
435 ki.commit();
Akron176c9b12015-07-29 19:53:40 +0200436 CollectionBuilder cb = new CollectionBuilder();
437 KrillCollection kcn = new KrillCollection(ki);
Akron3ba74f22015-07-24 18:46:17 +0200438
439 kcn.fromBuilder(cb.term("author", "Frank"));
440 assertEquals(1, kcn.docCount());
441 kcn.fromBuilder(cb.term("author", "Peter"));
442 assertEquals(1, kcn.docCount());
443 kcn.fromBuilder(cb.term("author", "Sebastian"));
444 assertEquals(0, kcn.docCount());
445 kcn.fromBuilder(cb.term("author", "Michael").not());
446 assertEquals(2, kcn.docCount());
447
448 // Add Sebastians doc
449 ki.addDoc(createDoc3());
450 ki.commit();
451
452 kcn.fromBuilder(cb.term("author", "Frank"));
453 assertEquals(1, kcn.docCount());
454 kcn.fromBuilder(cb.term("author", "Peter"));
455 assertEquals(1, kcn.docCount());
456 kcn.fromBuilder(cb.term("author", "Sebastian"));
457 assertEquals(1, kcn.docCount());
458 kcn.fromBuilder(cb.term("author", "Michael").not());
459 assertEquals(3, kcn.docCount());
460
461 // Remove one document
462 ki.delDocs("author", "Peter");
463 ki.commit();
464
465 kcn.fromBuilder(cb.term("author", "Frank"));
466 assertEquals(1, kcn.docCount());
467 kcn.fromBuilder(cb.term("author", "Peter"));
468 assertEquals(0, kcn.docCount());
469 kcn.fromBuilder(cb.term("author", "Sebastian"));
470 assertEquals(1, kcn.docCount());
471 kcn.fromBuilder(cb.term("author", "Michael").not());
472 assertEquals(2, kcn.docCount());
Akron80cba8d2015-07-27 17:27:46 +0200473
474 // Readd Peter's doc
475 ki.addDoc(createDoc2());
476 ki.commit();
477
478 kcn.fromBuilder(cb.term("author", "Frank"));
479 assertEquals(1, kcn.docCount());
480 kcn.fromBuilder(cb.term("author", "Peter"));
481 assertEquals(1, kcn.docCount());
482 kcn.fromBuilder(cb.term("author", "Sebastian"));
483 assertEquals(1, kcn.docCount());
484 kcn.fromBuilder(cb.term("author", "Michael").not());
485 assertEquals(3, kcn.docCount());
Akron3ba74f22015-07-24 18:46:17 +0200486 };
487
Akron40550172015-08-04 03:06:12 +0200488
Akron80cba8d2015-07-27 17:27:46 +0200489 @Test
490 public void testIndexWithDateRanges () throws IOException {
491 ki = new KrillIndex();
492 ki.addDoc(createDoc1());
493 ki.addDoc(createDoc2());
494 ki.addDoc(createDoc3());
495 ki.commit();
Akron176c9b12015-07-29 19:53:40 +0200496 CollectionBuilder cb = new CollectionBuilder();
497 KrillCollection kcn = new KrillCollection(ki);
Akron80cba8d2015-07-27 17:27:46 +0200498
499 kcn.fromBuilder(cb.date("pubDate", "2005"));
500 assertEquals(3, kcn.docCount());
501 kcn.fromBuilder(cb.date("pubDate", "2005-12"));
502 assertEquals(3, kcn.docCount());
503
504 kcn.fromBuilder(cb.date("pubDate", "2005-12-10"));
505 assertEquals(1, kcn.docCount());
506 kcn.fromBuilder(cb.date("pubDate", "2005-12-16"));
507 assertEquals(1, kcn.docCount());
508 kcn.fromBuilder(cb.date("pubDate", "2005-12-07"));
509 assertEquals(1, kcn.docCount());
510
511 kcn.fromBuilder(cb.since("pubDate", "2005-12-07"));
512 assertEquals(3, kcn.docCount());
513 kcn.fromBuilder(cb.since("pubDate", "2005-12-10"));
514 assertEquals(2, kcn.docCount());
515 kcn.fromBuilder(cb.since("pubDate", "2005-12-16"));
516 assertEquals(1, kcn.docCount());
517
518 kcn.fromBuilder(cb.till("pubDate", "2005-12-16"));
519 assertEquals(3, kcn.docCount());
520 kcn.fromBuilder(cb.till("pubDate", "2005-12-10"));
521 assertEquals(2, kcn.docCount());
522 kcn.fromBuilder(cb.till("pubDate", "2005-12-07"));
523 assertEquals(1, kcn.docCount());
524
525 kcn.fromBuilder(cb.date("pubDate", "2005-12-10").not());
526 assertEquals(2, kcn.docCount());
527 kcn.fromBuilder(cb.date("pubDate", "2005-12-16").not());
528 assertEquals(2, kcn.docCount());
529 kcn.fromBuilder(cb.date("pubDate", "2005-12-07").not());
530 assertEquals(2, kcn.docCount());
531 kcn.fromBuilder(cb.date("pubDate", "2005-12-09").not());
532 assertEquals(3, kcn.docCount());
533
534
535 kcn.fromBuilder(cb.till("pubDate", "2005-12-16").not());
536 assertEquals(0, kcn.docCount());
537 kcn.fromBuilder(cb.till("pubDate", "2005-12-15").not());
538 assertEquals(1, kcn.docCount());
539 kcn.fromBuilder(cb.till("pubDate", "2005-12-10").not());
540 assertEquals(1, kcn.docCount());
541 kcn.fromBuilder(cb.till("pubDate", "2005-12-09").not());
542 assertEquals(2, kcn.docCount());
543 kcn.fromBuilder(cb.till("pubDate", "2005-12-07").not());
544 assertEquals(2, kcn.docCount());
545 kcn.fromBuilder(cb.till("pubDate", "2005-12-06").not());
546 assertEquals(3, kcn.docCount());
547 };
548
549
550 @Test
551 public void testIndexWithRegexes () throws IOException {
552 ki = new KrillIndex();
553
Akron26207572018-04-04 20:21:42 +0200554 FieldDocument fd = ki.addDoc(createDoc1());
Akron80cba8d2015-07-27 17:27:46 +0200555 ki.addDoc(createDoc2());
556 ki.addDoc(createDoc3());
557 ki.commit();
558
Akron176c9b12015-07-29 19:53:40 +0200559 CollectionBuilder cb = new CollectionBuilder();
560 KrillCollection kcn = new KrillCollection(ki);
Akron80cba8d2015-07-27 17:27:46 +0200561
562 kcn.fromBuilder(cb.re("author", "Fran.*"));
563 assertEquals(1, kcn.docCount());
564 kcn.fromBuilder(cb.re("author", "Blin.*"));
565 assertEquals(0, kcn.docCount());
566 kcn.fromBuilder(cb.re("author", "Frank|Peter"));
567 assertEquals(2, kcn.docCount());
568
Akron2e5897b2018-03-29 12:07:11 +0200569 // "Frau" requires text request!
570 kcn.fromBuilder(cb.text("text", "Frau"));
571 assertEquals(1, kcn.docCount());
572
Akron26207572018-04-04 20:21:42 +0200573 kcn.fromBuilder(cb.term("text", "frau"));
Akron80cba8d2015-07-27 17:27:46 +0200574 assertEquals(1, kcn.docCount());
575
Akron26207572018-04-04 20:21:42 +0200576 kcn.fromBuilder(cb.re("text", "fra."));
Akron80cba8d2015-07-27 17:27:46 +0200577 assertEquals(1, kcn.docCount());
578
Akron26207572018-04-04 20:21:42 +0200579 kcn.fromBuilder(cb.re("text", "fra.|ma.n"));
Akron80cba8d2015-07-27 17:27:46 +0200580 assertEquals(3, kcn.docCount());
Akron26207572018-04-04 20:21:42 +0200581
582 String sv = fd.doc.getField("text").stringValue();
583 assertEquals("Der alte Mann ging über die Straße", sv);
584
585 kcn.fromBuilder(cb.term("text", sv));
586 assertEquals(1, kcn.docCount());
Akron26207572018-04-04 20:21:42 +0200587 };
Akron80cba8d2015-07-27 17:27:46 +0200588
Akron604bf362024-12-13 12:27:37 +0100589 @Test
590 public void testIndexWithIntegers () throws IOException {
591 ki = new KrillIndex();
592
593 FieldDocument fd = ki.addDoc(createDoc1());
594 ki.addDoc(createDoc2());
595 ki.addDoc(createDoc5001());
596 ki.commit();
597
598 CollectionBuilder cb = new CollectionBuilder();
599 KrillCollection kcn = new KrillCollection(ki);
600
601 assertEquals("toks:[2000.0 TO 4000.0]", cb.between("toks", 2000, 4000).toString());
602
603 kcn.fromBuilder(cb.between("toks", 2000, 4000));
604 assertEquals(1, kcn.docCount());
605
606 kcn.fromBuilder(cb.geq("toks", 2000));
607 assertEquals(1, kcn.docCount());
608
609 kcn.fromBuilder(cb.leq("toks", 4000));
610 assertEquals(1, kcn.docCount());
611
612 kcn.fromBuilder(cb.leq("toks", 2000));
613 assertEquals(0, kcn.docCount());
614
615 kcn.fromBuilder(cb.geq("toks", 4000));
616 assertEquals(0, kcn.docCount());
617
618 kcn.fromBuilder(cb.lt("toks", 3000));
619 assertEquals(0, kcn.docCount());
620
621 kcn.fromBuilder(cb.lt("toks", 3001));
622 assertEquals(1, kcn.docCount());
623
624 kcn.fromBuilder(cb.gt("toks", 3000));
625 assertEquals(0, kcn.docCount());
626
627 kcn.fromBuilder(cb.gt("toks", 2999));
628 assertEquals(1, kcn.docCount());
629
630 kcn.fromBuilder(cb.eq("toks", 3000));
631 assertEquals(1, kcn.docCount());
632
633 kcn.fromBuilder(cb.eq("toks", 3001));
634 assertEquals(0, kcn.docCount());
635 };
636
Akron408ae352018-03-28 16:47:41 +0200637 @Test
638 public void testIndexWithTextStringQueries () throws IOException {
639 ki = new KrillIndex();
640 ki.addDoc(createDoc1());
641 ki.commit();
642
643 CollectionBuilder cb = new CollectionBuilder();
644 KrillCollection kcn = new KrillCollection(ki);
645
Akron26207572018-04-04 20:21:42 +0200646 kcn.fromBuilder(cb.term("text", "mann"));
Akron2e5897b2018-03-29 12:07:11 +0200647 assertEquals(1, kcn.docCount());
648
Akron26207572018-04-04 20:21:42 +0200649 kcn.fromBuilder(cb.term("text", "Der alte Mann ging über die Straße"));
Akron2e5897b2018-03-29 12:07:11 +0200650 assertEquals(1, kcn.docCount());
651
Akron26207572018-04-04 20:21:42 +0200652 kcn.fromBuilder(cb.text("text", "Der alte Mann"));
Marc Kupietzbef0a272025-09-24 13:59:27 +0200653 assertEquals("QueryWrapperFilter(text:\"der alte mann\")", kcn.toString());
Akron26207572018-04-04 20:21:42 +0200654 assertEquals(1, kcn.docCount());
Akron408ae352018-03-28 16:47:41 +0200655 };
656
Akronb59f40e2018-08-23 17:15:43 +0200657 @Test
658 public void testUnknownVC () throws IOException {
659 ki = new KrillIndex();
660 ki.addDoc(createDoc1());
661 ki.commit();
662
663 // This test was adopted from TestVCCaching,
664 // But does not fail anymore for deserialization
margaretha5a8abea2021-11-08 16:57:51 +0100665 String json = _getJSONString("vc-ref/unknown-vc-ref.jsonld");
Akronb59f40e2018-08-23 17:15:43 +0200666
667 KrillCollection kc = new KrillCollection(json);
668 assertEquals("referTo(https://korap.ids-mannheim.de/@ndiewald/MyCorpus)", kc.getBuilder().toString());
669
margarethad9a46af2022-01-04 09:10:04 +0100670 assertEquals("vcFilter(https://korap.ids-mannheim.de/@ndiewald/MyCorpus)",kc.toString());
Akronb59f40e2018-08-23 17:15:43 +0200671
672 QueryBuilder kq = new QueryBuilder("field");
673
674 Krill krill = new Krill(kq.seg("a").with("b"));
675 krill.setCollection(kc);
676
677 Result result = krill.apply(ki);
678
679 assertEquals(StatusCodes.MISSING_COLLECTION, result.getError(0).getCode());
margaretha24a8da62025-08-19 09:35:18 +0200680 assertTrue(result.getError(0).getMessage().startsWith("VC is not found"));
Akronb59f40e2018-08-23 17:15:43 +0200681 };
margarethad9a46af2022-01-04 09:10:04 +0100682
Akronb59f40e2018-08-23 17:15:43 +0200683 @Test
margaretha10ecded2024-08-02 10:08:58 +0200684 public void testEmptyDocIdSetIterator () throws IOException {
685 KrillIndex ki = new KrillIndex();
686 // Indexing test files
687 for (String i : new String[] { "00001", "00002", "00003", "00004",
688 "00005", "00006", "02439" }) {
689 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
690 true);
691 };
692 ki.commit();
693
694 String filename = "/queries/collections/vc-ref/query-with-vc-ref-klznkz66.jsonld";
695 String json = getJsonString(getClass().getResource(filename).getFile());
696 KrillCollection kc = new KrillCollection(json);
697 kc.setIndex(ki);
698 assertEquals(0, kc.numberOf("documents"));
699 }
700
701 @Test
Akronfd05f502015-07-30 18:34:26 +0200702 public void filterExampleFromLegacy () throws Exception {
703
704 // Construct index
705 KrillIndex ki = new KrillIndex();
706 // Indexing test files
707 for (String i : new String[] { "00001", "00002", "00003", "00004",
708 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200709 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Akronfd05f502015-07-30 18:34:26 +0200710 true);
711 };
712 ki.commit();
713
714 // Create Virtual collections:
715 KrillCollection kc = new KrillCollection(ki);
716
717 assertEquals("Documents", 7, kc.numberOf("documents"));
718
719 // The virtual collection consists of all documents that have
720 // the textClass "reisen" and "freizeit"
721
722 /* kc.filter(kf.and("textClass", "reisen").and("textClass",
723 "freizeit-unterhaltung"));
724 */
725
Akron40550172015-08-04 03:06:12 +0200726 kc.fromBuilder(kc.build().andGroup()
727 .with(kc.build().term("textClass", "reisen"))
728 .with(kc.build().term("textClass", "freizeit-unterhaltung")));
Akronfd05f502015-07-30 18:34:26 +0200729
730 assertEquals("Documents", 5, kc.numberOf("documents"));
731 assertEquals("Tokens", 1678, kc.numberOf("tokens"));
732 assertEquals("Sentences", 194, kc.numberOf("sentences"));
733 assertEquals("Paragraphs", 139, kc.numberOf("paragraphs"));
734
735
736 // Subset this to all documents that have also the text
737 // kc.filter(kf.and("textClass", "kultur"));
738 /*
739 kc.fromBuilder(
740 kc.build().andGroup().with(
741 kc.getBuilder()
742 ).with(
743 kc.build().term("textClass", "kultur")
744 )
745 );
746 */
747
748 kc.filter(kc.build().term("textClass", "kultur"));
749
750 assertEquals("Documents", 1, kc.numberOf("documents"));
751 assertEquals("Tokens", 405, kc.numberOf("tokens"));
752 assertEquals("Sentences", 75, kc.numberOf("sentences"));
753 assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));
754
755
756 // kc.filter(kf.and("corpusID", "WPD"));
757 kc.filter(kc.build().term("corpusID", "WPD"));
758
759 assertEquals("Documents", 1, kc.numberOf("documents"));
760 assertEquals("Tokens", 405, kc.numberOf("tokens"));
761 assertEquals("Sentences", 75, kc.numberOf("sentences"));
762 assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));
763
764 // Create a query
Eliza Margaretha6f989202016-10-14 21:48:29 +0200765 Krill ks = new Krill(
766 new QueryBuilder("tokens").seg("opennlp/p:NN").with("tt/p:NN"));
767 ks.setCollection(kc).getMeta().setStartIndex(0).setCount((short) 20)
768 .setContext(
769 new SearchContext(true, (short) 5, true, (short) 5));
Akronfd05f502015-07-30 18:34:26 +0200770
Akron60971692016-06-08 12:56:21 +0200771 Result kr = ks.apply(ki);
772
773 /*
Akron40550172015-08-04 03:06:12 +0200774 Result kr = ki.search(kc, query, 0, (short) 20, true, (short) 5, true,
775 (short) 5);
Akron60971692016-06-08 12:56:21 +0200776 */
Marc Kupietzbef0a272025-09-24 13:59:27 +0200777 assertEquals(70, kr.getTotalResults());
Akronfd05f502015-07-30 18:34:26 +0200778
779
780 kc.extend(kc.build().term("textClass", "uninteresting"));
781 assertEquals("Documents", 1, kc.numberOf("documents"));
782
783 kc.extend(kc.build().term("textClass", "wissenschaft"));
784
785 assertEquals("Documents", 3, kc.numberOf("documents"));
786 assertEquals("Tokens", 1669, kc.numberOf("tokens"));
787 assertEquals("Sentences", 188, kc.numberOf("sentences"));
788 assertEquals("Paragraphs", 130, kc.numberOf("paragraphs"));
Akronfd05f502015-07-30 18:34:26 +0200789 };
790
margarethaee683ff2017-07-03 12:27:28 +0200791
792 @Test
Akronc346ce42017-07-02 19:14:07 +0200793 public void filterExampleWithNullresult () throws Exception {
794
795 // Construct index
796 KrillIndex ki = new KrillIndex();
797 // Indexing test files
798 for (String i : new String[] { "00001", "00002" }) {
799 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
margarethaee683ff2017-07-03 12:27:28 +0200800 true);
Akronc346ce42017-07-02 19:14:07 +0200801 };
802 ki.commit();
803
804 // Create Virtual collections:
805 KrillCollection kc = new KrillCollection(ki);
806
807 assertEquals("Documents", 2, kc.numberOf("documents"));
808
809 kc.fromBuilder(kc.build().term("textClass", "nichts"));
810
811 assertEquals("Documents", 0, kc.numberOf("documents"));
812 assertEquals("Tokens", 0, kc.numberOf("tokens"));
813 assertEquals("Sentences", 0, kc.numberOf("sentences"));
814 assertEquals("Paragraphs", 0, kc.numberOf("paragraphs"));
margarethaee683ff2017-07-03 12:27:28 +0200815 };
816
Akronfd05f502015-07-30 18:34:26 +0200817
818 @Test
819 public void filterExampleAtomicLegacy () throws Exception {
820
821 // That's exactly the same test class, but with multiple atomic indices
822
823 // Construct index
824 KrillIndex ki = new KrillIndex();
825 // Indexing test files
826 for (String i : new String[] { "00001", "00002", "00003", "00004",
827 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200828 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Akronfd05f502015-07-30 18:34:26 +0200829 true);
830 ki.commit();
831 };
832
833 CollectionBuilder kf = new CollectionBuilder();
834
835 // Create Virtual collections:
836 KrillCollection kc = new KrillCollection(ki);
837
838 assertEquals("Documents", 7, kc.numberOf("documents"));
839
840 // If this is set - everything is fine automatically ...
841 kc.filter(kc.build().term("corpusID", "WPD"));
842
843 assertEquals("Documents", 7, kc.numberOf("documents"));
844
845 // The virtual collection consists of all documents that have the textClass "reisen" and "freizeit"
846
847 /*
848 kc.filter(kf.and("textClass", "reisen").and("textClass",
849 "freizeit-unterhaltung"));
850 */
Akron40550172015-08-04 03:06:12 +0200851 kc.filter(kc.build().andGroup()
852 .with(kc.build().term("textClass", "reisen"))
853 .with(kc.build().term("textClass", "freizeit-unterhaltung")));
Akronfd05f502015-07-30 18:34:26 +0200854
855 assertEquals("Documents", 5, kc.numberOf("documents"));
856 assertEquals("Tokens", 1678, kc.numberOf("tokens"));
857 assertEquals("Sentences", 194, kc.numberOf("sentences"));
858 assertEquals("Paragraphs", 139, kc.numberOf("paragraphs"));
859
860 // Subset this to all documents that have also the text
861 // kc.filter(kf.and("textClass", "kultur"));
862
863 kc.filter(kc.build().term("textClass", "kultur"));
864
865 assertEquals("Documents", 1, kc.numberOf("documents"));
866 assertEquals("Tokens", 405, kc.numberOf("tokens"));
867 assertEquals("Sentences", 75, kc.numberOf("sentences"));
868 assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));
869
870 // This is already filtered though ...
871 // kc.filter(kf.and("corpusID", "WPD"));
872 kc.filter(kc.build().term("corpusID", "WPD"));
873
874 assertEquals("Documents", 1, kc.numberOf("documents"));
875 assertEquals("Tokens", 405, kc.numberOf("tokens"));
876 assertEquals("Sentences", 75, kc.numberOf("sentences"));
877 assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));
878
879 // Create a query
Eliza Margaretha6f989202016-10-14 21:48:29 +0200880 Krill ks = new Krill(
881 new QueryBuilder("tokens").seg("opennlp/p:NN").with("tt/p:NN"));
882 ks.setCollection(kc).getMeta().setStartIndex(0).setCount((short) 20)
883 .setContext(
884 new SearchContext(true, (short) 5, true, (short) 5));
Akronfd05f502015-07-30 18:34:26 +0200885
Akron60971692016-06-08 12:56:21 +0200886 Result kr = ks.apply(ki);
887 /*
Akron40550172015-08-04 03:06:12 +0200888 Result kr = ki.search(kc, query, 0, (short) 20, true, (short) 5, true,
889 (short) 5);
Akron60971692016-06-08 12:56:21 +0200890 */
Marc Kupietzbef0a272025-09-24 13:59:27 +0200891 assertEquals(70, kr.getTotalResults());
Akronfd05f502015-07-30 18:34:26 +0200892
893 // kc.extend(kf.and("textClass", "uninteresting"));
894 kc.extend(kc.build().term("textClass", "uninteresting"));
895
Akronfd05f502015-07-30 18:34:26 +0200896 assertEquals("Documents", 1, kc.numberOf("documents"));
897
Akronaa74ec62015-07-31 17:22:55 +0200898 kc.extend(kc.build().term("textClass", "wissenschaft"));
Akronfd05f502015-07-30 18:34:26 +0200899
900 assertEquals("Documents", 3, kc.numberOf("documents"));
901 assertEquals("Tokens", 1669, kc.numberOf("tokens"));
902 assertEquals("Sentences", 188, kc.numberOf("sentences"));
903 assertEquals("Paragraphs", 130, kc.numberOf("paragraphs"));
Akronaa74ec62015-07-31 17:22:55 +0200904
905 // System.err.println(kc.toString());
906 // Test collectionbuilder simplifier!
907 /*
908 OrGroup(
909 AndGroup(
910 corpusID:WPD
911 textClass:reisen
912 textClass:freizeit-unterhaltung
913 textClass:kultur
914 corpusID:WPD
915 )
916 textClass:uninteresting
917 textClass:wissenschaft
918 )
Akronfd05f502015-07-30 18:34:26 +0200919 */
Akronaa74ec62015-07-31 17:22:55 +0200920
921 assertTrue(ki.delDocs("textClass", "wissenschaft"));
922 ki.commit();
923
924 assertEquals("Documents", 1, kc.numberOf("documents"));
925 assertEquals("Tokens", 405, kc.numberOf("tokens"));
926 assertEquals("Sentences", 75, kc.numberOf("sentences"));
927 assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));
928 };
929
Akron40550172015-08-04 03:06:12 +0200930
Akronaa74ec62015-07-31 17:22:55 +0200931 @Test
932 public void filterExample2Legacy () throws Exception {
933
934 // Construct index
935 KrillIndex ki = new KrillIndex();
936 // Indexing test files
937 for (String i : new String[] { "00001", "00002", "00003", "00004",
938 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200939 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Akronaa74ec62015-07-31 17:22:55 +0200940 true);
941 };
942 ki.commit();
943
Eliza Margaretha6f989202016-10-14 21:48:29 +0200944 ki.addDoc(
945 getClass().getResourceAsStream("/wiki/00012-fakemeta.json.gz"),
946 true);
Akronaa74ec62015-07-31 17:22:55 +0200947
948 ki.commit();
949
950 /*
951 CollectionBuilderLegacy kf = new CollectionBuilderLegacy();
Eliza Margaretha6f989202016-10-14 21:48:29 +0200952
Akronaa74ec62015-07-31 17:22:55 +0200953 // Create Virtual collections:
954 KrillCollectionLegacy kc = new KrillCollectionLegacy(ki);
955 kc.filter(kf.and("textClass", "reisen").and("textClass",
956 "freizeit-unterhaltung"));
957 */
958
959 KrillCollection kc = new KrillCollection(ki);
960 CollectionBuilder cb = kc.build();
Akron40550172015-08-04 03:06:12 +0200961 kc.filter(cb.andGroup().with(cb.term("textClass", "reisen"))
962 .with(cb.term("textClass", "freizeit-unterhaltung")));
Akronaa74ec62015-07-31 17:22:55 +0200963
964 assertEquals("Documents", 5, kc.numberOf("documents"));
965 assertEquals("Tokens", 1678, kc.numberOf("tokens"));
966 assertEquals("Sentences", 194, kc.numberOf("sentences"));
967 assertEquals("Paragraphs", 139, kc.numberOf("paragraphs"));
968
969
970 // Create a query
Eliza Margaretha6f989202016-10-14 21:48:29 +0200971 Krill ks = new Krill(
972 new QueryBuilder("tokens").seg("opennlp/p:NN").with("tt/p:NN"));
973 ks.setCollection(kc).getMeta().setStartIndex(0).setCount((short) 20)
974 .setContext(
975 new SearchContext(true, (short) 5, true, (short) 5));
Akronaa74ec62015-07-31 17:22:55 +0200976
Akron60971692016-06-08 12:56:21 +0200977 Result kr = ks.apply(ki);
Akronaa74ec62015-07-31 17:22:55 +0200978
Marc Kupietzbef0a272025-09-24 13:59:27 +0200979 assertEquals(369, kr.getTotalResults());
Akronaa74ec62015-07-31 17:22:55 +0200980
981 // kc.filter(kf.and("corpusID", "QQQ"));
982 kc.filter(cb.term("corpusID", "QQQ"));
983
984 assertEquals("Documents", 0, kc.numberOf("documents"));
985 assertEquals("Tokens", 0, kc.numberOf("tokens"));
986 assertEquals("Sentences", 0, kc.numberOf("sentences"));
987 assertEquals("Paragraphs", 0, kc.numberOf("paragraphs"));
988
Akron60971692016-06-08 12:56:21 +0200989 ks.setCollection(kc);
990
991 // Create a query
992 kr = ks.apply(ki);
993 /*
Akron40550172015-08-04 03:06:12 +0200994 kr = ki.search(kc, query, 0, (short) 20, true, (short) 5, true,
995 (short) 5);
Akron60971692016-06-08 12:56:21 +0200996 */
Marc Kupietzbef0a272025-09-24 13:59:27 +0200997 assertEquals(0, kr.getTotalResults());
Akronaa74ec62015-07-31 17:22:55 +0200998 };
999
1000
1001 @Test
1002 public void uidCollectionLegacy () throws IOException {
1003
1004 // Construct index
1005 KrillIndex ki = new KrillIndex();
1006 // Indexing test files
1007 int uid = 1;
1008 for (String i : new String[] { "00001", "00002", "00003", "00004",
1009 "00005", "00006", "02439" }) {
1010 FieldDocument fd = ki.addDoc(uid++,
1011 getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
1012 true);
1013 };
1014 ki.commit();
1015
1016 assertEquals("Documents", 7, ki.numberOf("documents"));
1017 assertEquals("Paragraphs", 174, ki.numberOf("paragraphs"));
1018 assertEquals("Sentences", 281, ki.numberOf("sentences"));
1019 assertEquals("Tokens", 2661, ki.numberOf("tokens"));
1020
1021 SpanQuery sq = new SpanTermQuery(new Term("tokens", "s:der"));
1022 Result kr = ki.search(sq, (short) 10);
1023 assertEquals(86, kr.getTotalResults());
1024
1025 // Create Virtual collections:
1026 KrillCollection kc = new KrillCollection();
1027 kc.filterUIDs(new String[] { "2", "3", "4" });
1028 kc.setIndex(ki);
1029 assertEquals("Documents", 3, kc.numberOf("documents"));
1030
1031 assertEquals("Paragraphs", 46, kc.numberOf("paragraphs"));
1032 assertEquals("Sentences", 103, kc.numberOf("sentences"));
1033 assertEquals("Tokens", 1229, kc.numberOf("tokens"));
1034
Akron08f4ceb2016-08-03 23:53:32 +02001035
Akron60971692016-06-08 12:56:21 +02001036 Krill ks = new Krill(sq);
Eliza Margaretha6f989202016-10-14 21:48:29 +02001037 ks.setCollection(kc).getMeta().setStartIndex(0).setCount((short) 20)
1038 .setContext(
1039 new SearchContext(true, (short) 5, true, (short) 5));
Akron60971692016-06-08 12:56:21 +02001040 kr = ks.apply(ki);
1041
1042 // kr = ki.search(kc, sq, 0, (short) 20, true, (short) 5, true, (short) 5);
Akronaa74ec62015-07-31 17:22:55 +02001043
1044 assertEquals((long) 39, kr.getTotalResults());
1045 };
1046
Akron40550172015-08-04 03:06:12 +02001047
Akronaa74ec62015-07-31 17:22:55 +02001048 @Test
1049 public void uidCollectionWithDeletions () throws IOException {
1050
1051 // Construct index
1052 KrillIndex ki = new KrillIndex();
1053 // Indexing test files
1054 int uid = 1;
1055 for (String i : new String[] { "00001", "00002", "00003", "00004",
1056 "00005", "00006", "02439" }) {
1057 FieldDocument fd = ki.addDoc(uid++,
1058 getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
1059 true);
1060 };
1061 ki.commit();
1062
1063
1064 assertEquals("Documents", 7, ki.numberOf("documents"));
1065 assertEquals("Paragraphs", 174, ki.numberOf("paragraphs"));
1066 assertEquals("Sentences", 281, ki.numberOf("sentences"));
1067 assertEquals("Tokens", 2661, ki.numberOf("tokens"));
1068
1069 assertTrue(ki.delDoc(3));
1070 ki.commit();
1071
1072 assertEquals("Documents", 6, ki.numberOf("documents"));
1073
1074 assertEquals("Paragraphs", 146, ki.numberOf("paragraphs"));
1075 assertEquals("Sentences", 212, ki.numberOf("sentences"));
1076 assertEquals("Tokens", 2019, ki.numberOf("tokens"));
1077
1078 assertTrue(ki.delDoc(2));
1079 assertTrue(ki.delDoc(3));
1080 assertTrue(ki.delDoc(4));
1081 assertTrue(ki.delDoc(5));
1082 assertTrue(ki.delDoc(6));
1083 assertTrue(ki.delDoc(7));
1084 ki.commit();
1085
1086 assertEquals("Documents", 1, ki.numberOf("documents"));
1087 assertEquals("Paragraphs", 75, ki.numberOf("paragraphs"));
Akronfd05f502015-07-30 18:34:26 +02001088 };
1089
Akron451b7ae2018-08-15 13:21:27 +02001090 @Test
1091 public void testKrillCollectionWithNonexistingNegation () throws IOException {
1092 ki = new KrillIndex();
1093 ki.addDoc(createDoc1()); // nachricht kultur reisen
1094 ki.addDoc(createDoc3()); // reisen finanzen
1095 ki.commit();
1096
1097 KrillCollection kc = new KrillCollection(ki);
1098 CollectionBuilder cb = kc.build();
1099
1100 kc.fromBuilder(cb.term("textClass","reisen"));
Marc Kupietzbef0a272025-09-24 13:59:27 +02001101 assertEquals("textClass:reisen", kc.toString());
Akron451b7ae2018-08-15 13:21:27 +02001102 assertEquals("Documents", 2, kc.numberOf("documents"));
1103
1104 kc.fromBuilder(cb.andGroup().with(
1105 cb.term("textClass","reisen")
1106 ).with(
1107 cb.term("textClass","nachricht").not()
1108 ));
Marc Kupietzbef0a272025-09-24 13:59:27 +02001109 assertEquals("AndGroup(textClass:reisen -textClass:nachricht)", kc.toString());
Akron451b7ae2018-08-15 13:21:27 +02001110 assertEquals("Documents", 1, kc.numberOf("documents"));
1111
1112
1113 kc.fromBuilder(cb.andGroup().with(
1114 cb.term("textClass","reisen")
1115 ).with(
1116 cb.term("textClass","reisen").not()
1117 ));
Marc Kupietzbef0a272025-09-24 13:59:27 +02001118 assertEquals("AndGroup(textClass:reisen -textClass:reisen)", kc.toString());
Akron451b7ae2018-08-15 13:21:27 +02001119 assertEquals("Documents", 0, kc.numberOf("documents"));
1120
1121 kc.fromBuilder(cb.andGroup().with(
1122 cb.term("textClass","kultur")
1123 ).with(
1124 cb.term("textClass","finanzen").not()
1125 ));
Marc Kupietzbef0a272025-09-24 13:59:27 +02001126 assertEquals("AndGroup(textClass:kultur -textClass:finanzen)", kc.toString());
Akron451b7ae2018-08-15 13:21:27 +02001127 assertEquals("Documents", 1, kc.numberOf("documents"));
1128
1129 kc.fromBuilder(cb.andGroup().with(
1130 cb.term("textClass","reisen")
1131 ).with(
1132 cb.term("textClass","Blabla").not()
1133 ));
Marc Kupietzbef0a272025-09-24 13:59:27 +02001134 assertEquals("AndGroup(textClass:reisen -textClass:Blabla)", kc.toString());
Akron451b7ae2018-08-15 13:21:27 +02001135 assertEquals("Documents", 2, kc.numberOf("documents"));
1136 }
1137
Akronfd05f502015-07-30 18:34:26 +02001138
Akron1f531262018-08-24 14:27:00 +02001139 @Test
1140 public void testKrillCollectionWithValueVectorNe () throws IOException {
1141 ki = new KrillIndex();
1142 ki.addDoc(createDoc1()); // nachricht kultur reisen
1143 ki.addDoc(createDoc2()); // kultur reisen
1144 ki.addDoc(createDoc3()); // reisen finanzen
1145 ki.commit();
1146
1147 KrillCollection kc = new KrillCollection();
1148 kc.setIndex(ki);
1149
1150 CollectionBuilder cb = kc.build();
1151 kc.fromBuilder(cb.orGroup().with(cb.term("textClass", "nachricht")).with(cb.term("textClass","finanzen")));
1152 assertEquals("OrGroup(textClass:nachricht textClass:finanzen)", kc.toString());
1153 assertEquals("Documents", 2, kc.numberOf("documents"));
1154
1155 kc.fromBuilder(cb.term("textClass", "nachricht").not());
1156 assertEquals("-textClass:nachricht", kc.toString());
1157 assertEquals("Documents", 2, kc.numberOf("documents"));
1158
1159 kc.fromBuilder(cb.orGroup().with(cb.term("textClass", "nachricht").not()).with(cb.term("textClass","finanzen").not()));
1160 assertEquals("OrGroup(-textClass:nachricht -textClass:finanzen)", kc.toString());
1161 assertEquals("Documents", 3, kc.numberOf("documents"));
1162
1163 kc.fromBuilder(cb.orGroup().with(cb.term("textClass", "nachricht")).with(cb.term("textClass","finanzen")).not());
1164 assertEquals("-OrGroup(textClass:nachricht textClass:finanzen)", kc.toString());
1165 assertEquals("Documents", 1, kc.numberOf("documents"));
1166
1167 Krill ks = new Krill(new QueryBuilder("tokens").seg("i:a"));
1168 ks.setCollection(kc);
1169
1170 // Create a query
1171 Result kr = ks.apply(ki);
1172 assertEquals(1, kr.getTotalResults());
1173 assertEquals("[[a]] c d", kr.getMatch(0).getSnippetBrackets());
1174
1175 String json = _getJSONString("collection_with_vector_ne.jsonld");
1176 ks = new Krill(json);
1177
1178 kc = ks.getCollection();
1179 kc.setIndex(ki);
1180
1181 assertEquals("-OrGroup(textClass:nachricht textClass:finanzen)", kc.toString());
1182 assertEquals("Documents", 1, kc.numberOf("documents"));
1183
1184 kr = ks.apply(ki);
1185 assertEquals("[[a]] c d", kr.getMatch(0).getSnippetBrackets());
1186 assertEquals(1, kr.getTotalResults());
1187 };
Akron2423bba2018-09-03 15:11:10 +02001188
1189 @Test
1190 public void testKrillCollectionWithLargeVector () throws IOException {
1191 ki = new KrillIndex();
1192 ki.addDoc(createDoc1());
1193 ki.addDoc(createDoc2());
1194 ki.addDoc(createDoc3());
1195 ki.commit();
1196 ki.addDoc(createDoc5000());
1197 ki.commit();
1198
1199 String json = _getJSONString("collection_large_vector.jsonld");
1200 KrillCollection kc = new KrillCollection(json);
1201
1202 Krill ks = new Krill(new QueryBuilder("tokens").seg("i:a"));
1203 ks.setCollection(kc);
1204 kc.setIndex(ki);
1205
1206 assertEquals("Documents", 4, kc.numberOf("documents"));
1207
1208 Result kr = ks.apply(ki);
1209 assertEquals("[[a]] b c", kr.getMatch(0).getSnippetBrackets());
1210 assertEquals("[[a]] c d", kr.getMatch(1).getSnippetBrackets());
1211 assertEquals("[[a]] d e", kr.getMatch(2).getSnippetBrackets());
1212 assertEquals("[[a]] d e", kr.getMatch(3).getSnippetBrackets());
1213 };
1214
Akronfd966c52018-09-03 15:29:37 +02001215 @Test
1216 public void testKrillCollectionWithLargeVectorAndLargeIndex () throws IOException {
1217 ki = new KrillIndex();
1218 for (int i = 0; i < 6000; i++) {
1219 FieldDocument fd = new FieldDocument();
1220 fd.addString("UID", Integer.toString(i));
1221 ki.addDoc(fd);
1222 if (i == 4500)
1223 ki.commit();
1224 };
1225
1226 ki.commit();
1227
1228 String json = _getJSONString("collection_large_vector.jsonld");
1229 KrillCollection kc = new KrillCollection(json);
1230 kc.setIndex(ki);
1231
1232 assertEquals("Documents", 5000, kc.numberOf("documents"));
1233 };
1234
1235
Akron2423bba2018-09-03 15:11:10 +02001236
Akron1f531262018-08-24 14:27:00 +02001237
margarethaa0d88f62018-09-03 18:03:52 +02001238 public static FieldDocument createDoc1 () {
Akron3ba74f22015-07-24 18:46:17 +02001239 FieldDocument fd = new FieldDocument();
Akronb59f40e2018-08-23 17:15:43 +02001240 fd.addString("UID", "1");
Akron3ba74f22015-07-24 18:46:17 +02001241 fd.addString("ID", "doc-1");
1242 fd.addString("author", "Frank");
Akrona6dabb72019-01-09 13:09:41 +01001243 fd.addKeywords("textClass", "Nachricht Kultur Reisen");
Akronc7a2abc2019-01-17 14:21:34 +01001244 fd.addDate("pubDate", 20051210);
Akron22d319e2018-04-01 17:13:49 +02001245 fd.addText("text", "Der alte Mann ging über die Straße");
Akronb59f40e2018-08-23 17:15:43 +02001246 fd.addTV("tokens", "a b c", "[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>3]"
1247 + "[(2-3)s:b|i:b|_1$<i>2<i>3]" + "[(4-5)s:c|i:c|_2$<i>4<i>5]");
Akron3ba74f22015-07-24 18:46:17 +02001248 return fd;
1249 };
1250
Akron40550172015-08-04 03:06:12 +02001251
margarethaa0d88f62018-09-03 18:03:52 +02001252 public static FieldDocument createDoc2 () {
Akron3ba74f22015-07-24 18:46:17 +02001253 FieldDocument fd = new FieldDocument();
Akronb59f40e2018-08-23 17:15:43 +02001254 fd.addString("UID", "2");
1255 fd.addString("ID", "doc-2");
Akron3ba74f22015-07-24 18:46:17 +02001256 fd.addString("author", "Peter");
Akrona6dabb72019-01-09 13:09:41 +01001257 fd.addKeywords("textClass", "Kultur Reisen");
Akronc7a2abc2019-01-17 14:21:34 +01001258 fd.addDate("pubDate", 20051207);
Akron3ba74f22015-07-24 18:46:17 +02001259 fd.addText("text", "Der junge Mann hatte keine andere Wahl");
Akronb59f40e2018-08-23 17:15:43 +02001260 fd.addTV("tokens", "a c d", "[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>3]"
1261 + "[(2-3)s:c|i:c|_1$<i>2<i>3]" + "[(4-5)s:d|i:d|_2$<i>4<i>5]");
Akron3ba74f22015-07-24 18:46:17 +02001262 return fd;
1263 };
1264
Akron40550172015-08-04 03:06:12 +02001265
margarethaa0d88f62018-09-03 18:03:52 +02001266 public static FieldDocument createDoc3 () {
Akron3ba74f22015-07-24 18:46:17 +02001267 FieldDocument fd = new FieldDocument();
Akronb59f40e2018-08-23 17:15:43 +02001268 fd.addString("UID", "3");
1269 fd.addString("ID", "doc-3");
Akron3ba74f22015-07-24 18:46:17 +02001270 fd.addString("author", "Sebastian");
Akrona6dabb72019-01-09 13:09:41 +01001271 fd.addKeywords("textClass", "Reisen Finanzen");
Akronc7a2abc2019-01-17 14:21:34 +01001272 fd.addDate("pubDate", 20051216);
Akron3ba74f22015-07-24 18:46:17 +02001273 fd.addText("text", "Die Frau und der Mann küssten sich");
Akronb59f40e2018-08-23 17:15:43 +02001274 fd.addTV("tokens", "a d e", "[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>3]"
1275 + "[(2-3)s:d|i:d|_1$<i>2<i>3]" + "[(4-5)s:e|i:e|_2$<i>4<i>5]");
Akron3ba74f22015-07-24 18:46:17 +02001276 return fd;
1277 };
Akronb59f40e2018-08-23 17:15:43 +02001278
margarethad9a46af2022-01-04 09:10:04 +01001279 public static FieldDocument createDoc5000 () {
Akron2423bba2018-09-03 15:11:10 +02001280 FieldDocument fd = new FieldDocument();
1281 fd.addString("UID", "5000");
1282 fd.addString("ID", "doc-5000");
1283 fd.addString("author", "Sebastian");
Akrona6dabb72019-01-09 13:09:41 +01001284 fd.addKeywords("textClass", "Kultur Finanzen");
Akronc7a2abc2019-01-17 14:21:34 +01001285 fd.addDate("pubDate", 20180202);
Akron2423bba2018-09-03 15:11:10 +02001286 fd.addText("text", "Die Frau und der Mann küssten sich");
1287 fd.addTV("tokens", "a d e", "[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>3]"
1288 + "[(2-3)s:d|i:d|_1$<i>2<i>3]" + "[(4-5)s:e|i:e|_2$<i>4<i>5]");
1289 return fd;
1290 };
1291
Akron604bf362024-12-13 12:27:37 +01001292 public static FieldDocument createDoc5001 () {
1293 FieldDocument fd = new FieldDocument();
1294 fd.addString("UID", "5001");
1295 fd.addString("ID", "doc-5001");
1296 fd.addInt("toks", 3000);
1297 fd.addDate("pubDate", 20180202);
1298 fd.addText("text", "Der alte Mann ging über die Straße");
1299 fd.addTV("tokens", "a b c", "[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>3]"
1300 + "[(2-3)s:b|i:b|_1$<i>2<i>3]" + "[(4-5)s:c|i:c|_2$<i>4<i>5]");
1301 return fd;
1302 };
1303
Akronb59f40e2018-08-23 17:15:43 +02001304 private String _getJSONString (String file) {
1305 return getJsonString(getClass().getResource(path + file).getFile());
1306 };
Akron3ba74f22015-07-24 18:46:17 +02001307};