blob: 24494522964a18cf7c8a66daf6ac2303a29ede05 [file] [log] [blame]
Akron3ba74f22015-07-24 18:46:17 +02001package de.ids_mannheim.korap.collection;
Akron40550172015-08-04 03:06:12 +02002
Akron3ba74f22015-07-24 18:46:17 +02003import java.io.IOException;
4
5import de.ids_mannheim.korap.KrillIndex;
Akron176c9b12015-07-29 19:53:40 +02006import de.ids_mannheim.korap.KrillCollection;
7import de.ids_mannheim.korap.collection.CollectionBuilder;
Akron3ba74f22015-07-24 18:46:17 +02008import de.ids_mannheim.korap.index.FieldDocument;
Akron1d63f272015-07-28 12:19:49 +02009import de.ids_mannheim.korap.index.TextAnalyzer;
Akronfd05f502015-07-30 18:34:26 +020010import de.ids_mannheim.korap.response.Result;
11import de.ids_mannheim.korap.KrillQuery;
12import de.ids_mannheim.korap.query.QueryBuilder;
13
Akron1d63f272015-07-28 12:19:49 +020014import org.apache.lucene.analysis.Analyzer;
15import org.apache.lucene.analysis.TokenStream;
16import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
Akronfd05f502015-07-30 18:34:26 +020017import org.apache.lucene.index.Term;
18import org.apache.lucene.search.spans.SpanOrQuery;
19import org.apache.lucene.search.spans.SpanQuery;
20import org.apache.lucene.search.spans.SpanTermQuery;
21import org.apache.lucene.search.spans.SpanQuery;
22
Akron3ba74f22015-07-24 18:46:17 +020023import static org.junit.Assert.*;
24import org.junit.Test;
25import org.junit.Ignore;
26import org.junit.runner.RunWith;
27import org.junit.runners.JUnit4;
28
29@RunWith(JUnit4.class)
30public class TestKrillCollectionIndex {
31 private KrillIndex ki;
32
Akron40550172015-08-04 03:06:12 +020033
Akron3ba74f22015-07-24 18:46:17 +020034 @Test
35 public void testIndexWithCollectionBuilder () throws IOException {
36 ki = new KrillIndex();
37 ki.addDoc(createDoc1());
38 ki.addDoc(createDoc2());
39 ki.addDoc(createDoc3());
40 ki.commit();
Akron176c9b12015-07-29 19:53:40 +020041 CollectionBuilder cb = new CollectionBuilder();
42 KrillCollection kcn = new KrillCollection(ki);
Akron3ba74f22015-07-24 18:46:17 +020043
44 // Simple string tests
45 kcn.fromBuilder(cb.term("author", "Frank"));
46 assertEquals(1, kcn.docCount());
47
48 kcn.fromBuilder(cb.term("author", "Peter"));
49 assertEquals(1, kcn.docCount());
50
51 kcn.fromBuilder(cb.term("author", "Sebastian"));
52 assertEquals(1, kcn.docCount());
53
54 kcn.fromBuilder(cb.term("author", "Michael"));
55 assertEquals(0, kcn.docCount());
56
57 kcn.fromBuilder(cb.term("textClass", "reisen"));
58 assertEquals(3, kcn.docCount());
59
60 kcn.fromBuilder(cb.term("textClass", "kultur"));
61 assertEquals(2, kcn.docCount());
62
63 kcn.fromBuilder(cb.term("textClass", "finanzen"));
64 assertEquals(1, kcn.docCount());
65
66 // Simple orGroup tests
Akron40550172015-08-04 03:06:12 +020067 kcn.fromBuilder(cb.orGroup().with(cb.term("author", "Frank"))
68 .with(cb.term("author", "Michael")));
Akron3ba74f22015-07-24 18:46:17 +020069 assertEquals(1, kcn.docCount());
70
Akron40550172015-08-04 03:06:12 +020071 kcn.fromBuilder(cb.orGroup().with(cb.term("author", "Frank"))
72 .with(cb.term("author", "Sebastian")));
Akron3ba74f22015-07-24 18:46:17 +020073 assertEquals(2, kcn.docCount());
74
Akron176c9b12015-07-29 19:53:40 +020075 kcn.fromBuilder(cb.orGroup().with(cb.term("author", "Frank"))
Akron40550172015-08-04 03:06:12 +020076 .with(cb.term("author", "Sebastian"))
77 .with(cb.term("author", "Peter")));
Akron3ba74f22015-07-24 18:46:17 +020078 assertEquals(3, kcn.docCount());
79
Akron176c9b12015-07-29 19:53:40 +020080 kcn.fromBuilder(cb.orGroup().with(cb.term("author", "Huhu"))
Akron40550172015-08-04 03:06:12 +020081 .with(cb.term("author", "Haha"))
82 .with(cb.term("author", "Hehe")));
Akron3ba74f22015-07-24 18:46:17 +020083 assertEquals(0, kcn.docCount());
84
85 // Multi field orGroup tests
Akron40550172015-08-04 03:06:12 +020086 kcn.fromBuilder(cb.orGroup().with(cb.term("ID", "doc-1"))
87 .with(cb.term("author", "Peter")));
Akron3ba74f22015-07-24 18:46:17 +020088 assertEquals(2, kcn.docCount());
89
Akron40550172015-08-04 03:06:12 +020090 kcn.fromBuilder(cb.orGroup().with(cb.term("ID", "doc-1"))
91 .with(cb.term("author", "Frank")));
Akron3ba74f22015-07-24 18:46:17 +020092 assertEquals(1, kcn.docCount());
93
Akron40550172015-08-04 03:06:12 +020094 kcn.fromBuilder(cb.orGroup().with(cb.term("ID", "doc-1"))
95 .with(cb.term("author", "Michael")));
Akron3ba74f22015-07-24 18:46:17 +020096 assertEquals(1, kcn.docCount());
97
98 // Simple andGroup tests
Akron40550172015-08-04 03:06:12 +020099 kcn.fromBuilder(cb.andGroup().with(cb.term("author", "Frank"))
100 .with(cb.term("author", "Michael")));
Akron3ba74f22015-07-24 18:46:17 +0200101 assertEquals(0, kcn.docCount());
102
Akron40550172015-08-04 03:06:12 +0200103 kcn.fromBuilder(cb.andGroup().with(cb.term("ID", "doc-1"))
104 .with(cb.term("author", "Frank")));
Akron3ba74f22015-07-24 18:46:17 +0200105 assertEquals(1, kcn.docCount());
106
107 // andGroup in keyword field test
Akron40550172015-08-04 03:06:12 +0200108 kcn.fromBuilder(cb.andGroup().with(cb.term("textClass", "reisen"))
109 .with(cb.term("textClass", "finanzen")));
Akron3ba74f22015-07-24 18:46:17 +0200110 assertEquals(1, kcn.docCount());
111
Akron40550172015-08-04 03:06:12 +0200112 kcn.fromBuilder(cb.andGroup().with(cb.term("textClass", "reisen"))
113 .with(cb.term("textClass", "kultur")));
Akron3ba74f22015-07-24 18:46:17 +0200114 assertEquals(2, kcn.docCount());
115
Akron40550172015-08-04 03:06:12 +0200116 kcn.fromBuilder(cb.andGroup().with(cb.term("textClass", "finanzen"))
117 .with(cb.term("textClass", "kultur")));
Akron3ba74f22015-07-24 18:46:17 +0200118 assertEquals(0, kcn.docCount());
Akron80cba8d2015-07-27 17:27:46 +0200119
Akron1d63f272015-07-28 12:19:49 +0200120 kcn.fromBuilder(cb.term("text", "mann"));
Akron80cba8d2015-07-27 17:27:46 +0200121 assertEquals(3, kcn.docCount());
122
Akron1d63f272015-07-28 12:19:49 +0200123 kcn.fromBuilder(cb.term("text", "frau"));
Akron80cba8d2015-07-27 17:27:46 +0200124 assertEquals(1, kcn.docCount());
Akron3ba74f22015-07-24 18:46:17 +0200125 };
126
Akron40550172015-08-04 03:06:12 +0200127
Akron3ba74f22015-07-24 18:46:17 +0200128 @Test
129 public void testIndexWithNegation () throws IOException {
130 ki = new KrillIndex();
131 ki.addDoc(createDoc1());
132 ki.addDoc(createDoc2());
133 ki.addDoc(createDoc3());
134 ki.commit();
Akron176c9b12015-07-29 19:53:40 +0200135 CollectionBuilder cb = new CollectionBuilder();
136 KrillCollection kcn = new KrillCollection(ki);
Akron3ba74f22015-07-24 18:46:17 +0200137
138 // Simple negation tests
139 kcn.fromBuilder(cb.term("author", "Frank").not());
140 assertEquals(2, kcn.docCount());
141
142 kcn.fromBuilder(cb.term("textClass", "reisen").not());
143 assertEquals(0, kcn.docCount());
144
145 kcn.fromBuilder(cb.term("textClass", "kultur").not());
146 assertEquals(1, kcn.docCount());
147
148 // orGroup with simple Negation
Akron40550172015-08-04 03:06:12 +0200149 kcn.fromBuilder(cb.orGroup().with(cb.term("textClass", "kultur").not())
150 .with(cb.term("author", "Peter")));
Akron3ba74f22015-07-24 18:46:17 +0200151 assertEquals(2, kcn.docCount());
152
Akron40550172015-08-04 03:06:12 +0200153 kcn.fromBuilder(cb.orGroup().with(cb.term("textClass", "kultur").not())
154 .with(cb.term("author", "Sebastian")));
Akron3ba74f22015-07-24 18:46:17 +0200155 assertEquals(1, kcn.docCount());
Akron3ba74f22015-07-24 18:46:17 +0200156 };
157
Akron40550172015-08-04 03:06:12 +0200158
Akron3ba74f22015-07-24 18:46:17 +0200159 @Test
Akron80cba8d2015-07-27 17:27:46 +0200160 public void testIndexWithMultipleCommitsAndDeletes () throws IOException {
Akron3ba74f22015-07-24 18:46:17 +0200161 ki = new KrillIndex();
162 ki.addDoc(createDoc1());
163 ki.addDoc(createDoc2());
164 ki.commit();
Akron176c9b12015-07-29 19:53:40 +0200165 CollectionBuilder cb = new CollectionBuilder();
166 KrillCollection kcn = new KrillCollection(ki);
Akron3ba74f22015-07-24 18:46:17 +0200167
168 kcn.fromBuilder(cb.term("author", "Frank"));
169 assertEquals(1, kcn.docCount());
170 kcn.fromBuilder(cb.term("author", "Peter"));
171 assertEquals(1, kcn.docCount());
172 kcn.fromBuilder(cb.term("author", "Sebastian"));
173 assertEquals(0, kcn.docCount());
174 kcn.fromBuilder(cb.term("author", "Michael").not());
175 assertEquals(2, kcn.docCount());
176
177 // Add Sebastians doc
178 ki.addDoc(createDoc3());
179 ki.commit();
180
181 kcn.fromBuilder(cb.term("author", "Frank"));
182 assertEquals(1, kcn.docCount());
183 kcn.fromBuilder(cb.term("author", "Peter"));
184 assertEquals(1, kcn.docCount());
185 kcn.fromBuilder(cb.term("author", "Sebastian"));
186 assertEquals(1, kcn.docCount());
187 kcn.fromBuilder(cb.term("author", "Michael").not());
188 assertEquals(3, kcn.docCount());
189
190 // Remove one document
191 ki.delDocs("author", "Peter");
192 ki.commit();
193
194 kcn.fromBuilder(cb.term("author", "Frank"));
195 assertEquals(1, kcn.docCount());
196 kcn.fromBuilder(cb.term("author", "Peter"));
197 assertEquals(0, kcn.docCount());
198 kcn.fromBuilder(cb.term("author", "Sebastian"));
199 assertEquals(1, kcn.docCount());
200 kcn.fromBuilder(cb.term("author", "Michael").not());
201 assertEquals(2, kcn.docCount());
Akron80cba8d2015-07-27 17:27:46 +0200202
203 // Readd Peter's doc
204 ki.addDoc(createDoc2());
205 ki.commit();
206
207 kcn.fromBuilder(cb.term("author", "Frank"));
208 assertEquals(1, kcn.docCount());
209 kcn.fromBuilder(cb.term("author", "Peter"));
210 assertEquals(1, kcn.docCount());
211 kcn.fromBuilder(cb.term("author", "Sebastian"));
212 assertEquals(1, kcn.docCount());
213 kcn.fromBuilder(cb.term("author", "Michael").not());
214 assertEquals(3, kcn.docCount());
Akron3ba74f22015-07-24 18:46:17 +0200215 };
216
Akron40550172015-08-04 03:06:12 +0200217
Akron80cba8d2015-07-27 17:27:46 +0200218 @Test
Akron1d63f272015-07-28 12:19:49 +0200219 public void testIndexStream () throws IOException {
220 ki = new KrillIndex();
221 FieldDocument fd = ki.addDoc(createDoc1());
222 ki.commit();
223
224 Analyzer ana = new TextAnalyzer();
225 TokenStream ts = fd.doc.getField("text").tokenStream(ana, null);
226
Akron40550172015-08-04 03:06:12 +0200227 CharTermAttribute charTermAttribute = ts
228 .addAttribute(CharTermAttribute.class);
Akron1d63f272015-07-28 12:19:49 +0200229 ts.reset();
230
231 ts.incrementToken();
232 assertEquals("der", charTermAttribute.toString());
233 ts.incrementToken();
234 assertEquals("alte", charTermAttribute.toString());
235 ts.incrementToken();
236 assertEquals("mann", charTermAttribute.toString());
237 ts.incrementToken();
238 assertEquals("ging", charTermAttribute.toString());
239 ts.incrementToken();
240 assertEquals("über", charTermAttribute.toString());
241 ts.incrementToken();
242 assertEquals("die", charTermAttribute.toString());
243 ts.incrementToken();
244 assertEquals("straße", charTermAttribute.toString());
245 };
246
Akron40550172015-08-04 03:06:12 +0200247
Akron1d63f272015-07-28 12:19:49 +0200248 @Test
Akron80cba8d2015-07-27 17:27:46 +0200249 public void testIndexWithDateRanges () throws IOException {
250 ki = new KrillIndex();
251 ki.addDoc(createDoc1());
252 ki.addDoc(createDoc2());
253 ki.addDoc(createDoc3());
254 ki.commit();
Akron176c9b12015-07-29 19:53:40 +0200255 CollectionBuilder cb = new CollectionBuilder();
256 KrillCollection kcn = new KrillCollection(ki);
Akron80cba8d2015-07-27 17:27:46 +0200257
258 kcn.fromBuilder(cb.date("pubDate", "2005"));
259 assertEquals(3, kcn.docCount());
260 kcn.fromBuilder(cb.date("pubDate", "2005-12"));
261 assertEquals(3, kcn.docCount());
262
263 kcn.fromBuilder(cb.date("pubDate", "2005-12-10"));
264 assertEquals(1, kcn.docCount());
265 kcn.fromBuilder(cb.date("pubDate", "2005-12-16"));
266 assertEquals(1, kcn.docCount());
267 kcn.fromBuilder(cb.date("pubDate", "2005-12-07"));
268 assertEquals(1, kcn.docCount());
269
270 kcn.fromBuilder(cb.since("pubDate", "2005-12-07"));
271 assertEquals(3, kcn.docCount());
272 kcn.fromBuilder(cb.since("pubDate", "2005-12-10"));
273 assertEquals(2, kcn.docCount());
274 kcn.fromBuilder(cb.since("pubDate", "2005-12-16"));
275 assertEquals(1, kcn.docCount());
276
277 kcn.fromBuilder(cb.till("pubDate", "2005-12-16"));
278 assertEquals(3, kcn.docCount());
279 kcn.fromBuilder(cb.till("pubDate", "2005-12-10"));
280 assertEquals(2, kcn.docCount());
281 kcn.fromBuilder(cb.till("pubDate", "2005-12-07"));
282 assertEquals(1, kcn.docCount());
283
284 kcn.fromBuilder(cb.date("pubDate", "2005-12-10").not());
285 assertEquals(2, kcn.docCount());
286 kcn.fromBuilder(cb.date("pubDate", "2005-12-16").not());
287 assertEquals(2, kcn.docCount());
288 kcn.fromBuilder(cb.date("pubDate", "2005-12-07").not());
289 assertEquals(2, kcn.docCount());
290 kcn.fromBuilder(cb.date("pubDate", "2005-12-09").not());
291 assertEquals(3, kcn.docCount());
292
293
294 kcn.fromBuilder(cb.till("pubDate", "2005-12-16").not());
295 assertEquals(0, kcn.docCount());
296 kcn.fromBuilder(cb.till("pubDate", "2005-12-15").not());
297 assertEquals(1, kcn.docCount());
298 kcn.fromBuilder(cb.till("pubDate", "2005-12-10").not());
299 assertEquals(1, kcn.docCount());
300 kcn.fromBuilder(cb.till("pubDate", "2005-12-09").not());
301 assertEquals(2, kcn.docCount());
302 kcn.fromBuilder(cb.till("pubDate", "2005-12-07").not());
303 assertEquals(2, kcn.docCount());
304 kcn.fromBuilder(cb.till("pubDate", "2005-12-06").not());
305 assertEquals(3, kcn.docCount());
306 };
307
308
309 @Test
310 public void testIndexWithRegexes () throws IOException {
311 ki = new KrillIndex();
312
313 ki.addDoc(createDoc1());
314 ki.addDoc(createDoc2());
315 ki.addDoc(createDoc3());
316 ki.commit();
317
Akron176c9b12015-07-29 19:53:40 +0200318 CollectionBuilder cb = new CollectionBuilder();
319 KrillCollection kcn = new KrillCollection(ki);
Akron80cba8d2015-07-27 17:27:46 +0200320
321 kcn.fromBuilder(cb.re("author", "Fran.*"));
322 assertEquals(1, kcn.docCount());
323 kcn.fromBuilder(cb.re("author", "Blin.*"));
324 assertEquals(0, kcn.docCount());
325 kcn.fromBuilder(cb.re("author", "Frank|Peter"));
326 assertEquals(2, kcn.docCount());
327
Akron1d63f272015-07-28 12:19:49 +0200328 // "Frau" doesn't work!
329 kcn.fromBuilder(cb.term("text", "frau"));
Akron80cba8d2015-07-27 17:27:46 +0200330 assertEquals(1, kcn.docCount());
331
Akron1d63f272015-07-28 12:19:49 +0200332 kcn.fromBuilder(cb.re("text", "frau"));
Akron80cba8d2015-07-27 17:27:46 +0200333 assertEquals(1, kcn.docCount());
334
Akron1d63f272015-07-28 12:19:49 +0200335 kcn.fromBuilder(cb.re("text", "frau|mann"));
Akron80cba8d2015-07-27 17:27:46 +0200336 assertEquals(3, kcn.docCount());
337 };
338
Akron3ba74f22015-07-24 18:46:17 +0200339
Akronfd05f502015-07-30 18:34:26 +0200340 @Test
341 public void filterExampleFromLegacy () throws Exception {
342
343 // Construct index
344 KrillIndex ki = new KrillIndex();
345 // Indexing test files
346 for (String i : new String[] { "00001", "00002", "00003", "00004",
347 "00005", "00006", "02439" }) {
348 ki.addDoc(
349 getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
350 true);
351 };
352 ki.commit();
353
354 // Create Virtual collections:
355 KrillCollection kc = new KrillCollection(ki);
356
357 assertEquals("Documents", 7, kc.numberOf("documents"));
358
359 // The virtual collection consists of all documents that have
360 // the textClass "reisen" and "freizeit"
361
362 /* kc.filter(kf.and("textClass", "reisen").and("textClass",
363 "freizeit-unterhaltung"));
364 */
365
Akron40550172015-08-04 03:06:12 +0200366 kc.fromBuilder(kc.build().andGroup()
367 .with(kc.build().term("textClass", "reisen"))
368 .with(kc.build().term("textClass", "freizeit-unterhaltung")));
Akronfd05f502015-07-30 18:34:26 +0200369
370 assertEquals("Documents", 5, kc.numberOf("documents"));
371 assertEquals("Tokens", 1678, kc.numberOf("tokens"));
372 assertEquals("Sentences", 194, kc.numberOf("sentences"));
373 assertEquals("Paragraphs", 139, kc.numberOf("paragraphs"));
374
375
376 // Subset this to all documents that have also the text
377 // kc.filter(kf.and("textClass", "kultur"));
378 /*
379 kc.fromBuilder(
380 kc.build().andGroup().with(
381 kc.getBuilder()
382 ).with(
383 kc.build().term("textClass", "kultur")
384 )
385 );
386 */
387
388 kc.filter(kc.build().term("textClass", "kultur"));
389
390 assertEquals("Documents", 1, kc.numberOf("documents"));
391 assertEquals("Tokens", 405, kc.numberOf("tokens"));
392 assertEquals("Sentences", 75, kc.numberOf("sentences"));
393 assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));
394
395
396 // kc.filter(kf.and("corpusID", "WPD"));
397 kc.filter(kc.build().term("corpusID", "WPD"));
398
399 assertEquals("Documents", 1, kc.numberOf("documents"));
400 assertEquals("Tokens", 405, kc.numberOf("tokens"));
401 assertEquals("Sentences", 75, kc.numberOf("sentences"));
402 assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));
403
404 // Create a query
405 QueryBuilder kq = new QueryBuilder("tokens");
406 SpanQuery query = kq.seg("opennlp/p:NN").with("tt/p:NN").toQuery();
407
Akron40550172015-08-04 03:06:12 +0200408 Result kr = ki.search(kc, query, 0, (short) 20, true, (short) 5, true,
409 (short) 5);
Akronfd05f502015-07-30 18:34:26 +0200410 assertEquals(kr.getTotalResults(), 70);
411
412
413 kc.extend(kc.build().term("textClass", "uninteresting"));
414 assertEquals("Documents", 1, kc.numberOf("documents"));
415
416 kc.extend(kc.build().term("textClass", "wissenschaft"));
417
418 assertEquals("Documents", 3, kc.numberOf("documents"));
419 assertEquals("Tokens", 1669, kc.numberOf("tokens"));
420 assertEquals("Sentences", 188, kc.numberOf("sentences"));
421 assertEquals("Paragraphs", 130, kc.numberOf("paragraphs"));
422 // System.err.println(kr.toJSON());
423 };
424
425
426 @Test
427 public void filterExampleAtomicLegacy () throws Exception {
428
429 // That's exactly the same test class, but with multiple atomic indices
430
431 // Construct index
432 KrillIndex ki = new KrillIndex();
433 // Indexing test files
434 for (String i : new String[] { "00001", "00002", "00003", "00004",
435 "00005", "00006", "02439" }) {
436 ki.addDoc(
437 getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
438 true);
439 ki.commit();
440 };
441
442 CollectionBuilder kf = new CollectionBuilder();
443
444 // Create Virtual collections:
445 KrillCollection kc = new KrillCollection(ki);
446
447 assertEquals("Documents", 7, kc.numberOf("documents"));
448
449 // If this is set - everything is fine automatically ...
450 kc.filter(kc.build().term("corpusID", "WPD"));
451
452 assertEquals("Documents", 7, kc.numberOf("documents"));
453
454 // The virtual collection consists of all documents that have the textClass "reisen" and "freizeit"
455
456 /*
457 kc.filter(kf.and("textClass", "reisen").and("textClass",
458 "freizeit-unterhaltung"));
459 */
Akron40550172015-08-04 03:06:12 +0200460 kc.filter(kc.build().andGroup()
461 .with(kc.build().term("textClass", "reisen"))
462 .with(kc.build().term("textClass", "freizeit-unterhaltung")));
Akronfd05f502015-07-30 18:34:26 +0200463
464 assertEquals("Documents", 5, kc.numberOf("documents"));
465 assertEquals("Tokens", 1678, kc.numberOf("tokens"));
466 assertEquals("Sentences", 194, kc.numberOf("sentences"));
467 assertEquals("Paragraphs", 139, kc.numberOf("paragraphs"));
468
469 // Subset this to all documents that have also the text
470 // kc.filter(kf.and("textClass", "kultur"));
471
472 kc.filter(kc.build().term("textClass", "kultur"));
473
474 assertEquals("Documents", 1, kc.numberOf("documents"));
475 assertEquals("Tokens", 405, kc.numberOf("tokens"));
476 assertEquals("Sentences", 75, kc.numberOf("sentences"));
477 assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));
478
479 // This is already filtered though ...
480 // kc.filter(kf.and("corpusID", "WPD"));
481 kc.filter(kc.build().term("corpusID", "WPD"));
482
483 assertEquals("Documents", 1, kc.numberOf("documents"));
484 assertEquals("Tokens", 405, kc.numberOf("tokens"));
485 assertEquals("Sentences", 75, kc.numberOf("sentences"));
486 assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));
487
488 // Create a query
489 QueryBuilder kq = new QueryBuilder("tokens");
490 SpanQuery query = kq.seg("opennlp/p:NN").with("tt/p:NN").toQuery();
491
Akron40550172015-08-04 03:06:12 +0200492 Result kr = ki.search(kc, query, 0, (short) 20, true, (short) 5, true,
493 (short) 5);
Akronfd05f502015-07-30 18:34:26 +0200494 assertEquals(kr.getTotalResults(), 70);
495
496 // kc.extend(kf.and("textClass", "uninteresting"));
497 kc.extend(kc.build().term("textClass", "uninteresting"));
498
Akronfd05f502015-07-30 18:34:26 +0200499 assertEquals("Documents", 1, kc.numberOf("documents"));
500
Akronaa74ec62015-07-31 17:22:55 +0200501 kc.extend(kc.build().term("textClass", "wissenschaft"));
Akronfd05f502015-07-30 18:34:26 +0200502
503 assertEquals("Documents", 3, kc.numberOf("documents"));
504 assertEquals("Tokens", 1669, kc.numberOf("tokens"));
505 assertEquals("Sentences", 188, kc.numberOf("sentences"));
506 assertEquals("Paragraphs", 130, kc.numberOf("paragraphs"));
Akronaa74ec62015-07-31 17:22:55 +0200507
508 // System.err.println(kc.toString());
509 // Test collectionbuilder simplifier!
510 /*
511 OrGroup(
512 AndGroup(
513 corpusID:WPD
514 textClass:reisen
515 textClass:freizeit-unterhaltung
516 textClass:kultur
517 corpusID:WPD
518 )
519 textClass:uninteresting
520 textClass:wissenschaft
521 )
Akronfd05f502015-07-30 18:34:26 +0200522 */
Akronaa74ec62015-07-31 17:22:55 +0200523
524 assertTrue(ki.delDocs("textClass", "wissenschaft"));
525 ki.commit();
526
527 assertEquals("Documents", 1, kc.numberOf("documents"));
528 assertEquals("Tokens", 405, kc.numberOf("tokens"));
529 assertEquals("Sentences", 75, kc.numberOf("sentences"));
530 assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));
531 };
532
Akron40550172015-08-04 03:06:12 +0200533
Akronaa74ec62015-07-31 17:22:55 +0200534 @Test
535 public void filterExample2Legacy () throws Exception {
536
537 // Construct index
538 KrillIndex ki = new KrillIndex();
539 // Indexing test files
540 for (String i : new String[] { "00001", "00002", "00003", "00004",
541 "00005", "00006", "02439" }) {
542 ki.addDoc(
543 getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
544 true);
545 };
546 ki.commit();
547
548 ki.addDoc(getClass()
549 .getResourceAsStream("/wiki/00012-fakemeta.json.gz"), true);
550
551 ki.commit();
552
553 /*
554 CollectionBuilderLegacy kf = new CollectionBuilderLegacy();
555
556 // Create Virtual collections:
557 KrillCollectionLegacy kc = new KrillCollectionLegacy(ki);
558 kc.filter(kf.and("textClass", "reisen").and("textClass",
559 "freizeit-unterhaltung"));
560 */
561
562 KrillCollection kc = new KrillCollection(ki);
563 CollectionBuilder cb = kc.build();
Akron40550172015-08-04 03:06:12 +0200564 kc.filter(cb.andGroup().with(cb.term("textClass", "reisen"))
565 .with(cb.term("textClass", "freizeit-unterhaltung")));
Akronaa74ec62015-07-31 17:22:55 +0200566
567 assertEquals("Documents", 5, kc.numberOf("documents"));
568 assertEquals("Tokens", 1678, kc.numberOf("tokens"));
569 assertEquals("Sentences", 194, kc.numberOf("sentences"));
570 assertEquals("Paragraphs", 139, kc.numberOf("paragraphs"));
571
572
573 // Create a query
574 QueryBuilder kq = new QueryBuilder("tokens");
575 SpanQuery query = kq.seg("opennlp/p:NN").with("tt/p:NN").toQuery();
576
577
Akron40550172015-08-04 03:06:12 +0200578 Result kr = ki.search(kc, query, 0, (short) 20, true, (short) 5, true,
579 (short) 5);
Akronaa74ec62015-07-31 17:22:55 +0200580 assertEquals(kr.getTotalResults(), 369);
581
582 // kc.filter(kf.and("corpusID", "QQQ"));
583 kc.filter(cb.term("corpusID", "QQQ"));
584
585 assertEquals("Documents", 0, kc.numberOf("documents"));
586 assertEquals("Tokens", 0, kc.numberOf("tokens"));
587 assertEquals("Sentences", 0, kc.numberOf("sentences"));
588 assertEquals("Paragraphs", 0, kc.numberOf("paragraphs"));
589
Akron40550172015-08-04 03:06:12 +0200590 kr = ki.search(kc, query, 0, (short) 20, true, (short) 5, true,
591 (short) 5);
Akronaa74ec62015-07-31 17:22:55 +0200592 assertEquals(kr.getTotalResults(), 0);
593 };
594
595
596 @Test
597 public void uidCollectionLegacy () throws IOException {
598
599 // Construct index
600 KrillIndex ki = new KrillIndex();
601 // Indexing test files
602 int uid = 1;
603 for (String i : new String[] { "00001", "00002", "00003", "00004",
604 "00005", "00006", "02439" }) {
605 FieldDocument fd = ki.addDoc(uid++,
606 getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
607 true);
608 };
609 ki.commit();
610
611 assertEquals("Documents", 7, ki.numberOf("documents"));
612 assertEquals("Paragraphs", 174, ki.numberOf("paragraphs"));
613 assertEquals("Sentences", 281, ki.numberOf("sentences"));
614 assertEquals("Tokens", 2661, ki.numberOf("tokens"));
615
616 SpanQuery sq = new SpanTermQuery(new Term("tokens", "s:der"));
617 Result kr = ki.search(sq, (short) 10);
618 assertEquals(86, kr.getTotalResults());
619
620 // Create Virtual collections:
621 KrillCollection kc = new KrillCollection();
622 kc.filterUIDs(new String[] { "2", "3", "4" });
623 kc.setIndex(ki);
624 assertEquals("Documents", 3, kc.numberOf("documents"));
625
626 assertEquals("Paragraphs", 46, kc.numberOf("paragraphs"));
627 assertEquals("Sentences", 103, kc.numberOf("sentences"));
628 assertEquals("Tokens", 1229, kc.numberOf("tokens"));
629
630 kr = ki.search(kc, sq, 0, (short) 20, true, (short) 5, true, (short) 5);
631
632 assertEquals((long) 39, kr.getTotalResults());
633 };
634
Akron40550172015-08-04 03:06:12 +0200635
Akronaa74ec62015-07-31 17:22:55 +0200636 @Test
637 public void uidCollectionWithDeletions () throws IOException {
638
639 // Construct index
640 KrillIndex ki = new KrillIndex();
641 // Indexing test files
642 int uid = 1;
643 for (String i : new String[] { "00001", "00002", "00003", "00004",
644 "00005", "00006", "02439" }) {
645 FieldDocument fd = ki.addDoc(uid++,
646 getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
647 true);
648 };
649 ki.commit();
650
651
652 assertEquals("Documents", 7, ki.numberOf("documents"));
653 assertEquals("Paragraphs", 174, ki.numberOf("paragraphs"));
654 assertEquals("Sentences", 281, ki.numberOf("sentences"));
655 assertEquals("Tokens", 2661, ki.numberOf("tokens"));
656
657 assertTrue(ki.delDoc(3));
658 ki.commit();
659
660 assertEquals("Documents", 6, ki.numberOf("documents"));
661
662 assertEquals("Paragraphs", 146, ki.numberOf("paragraphs"));
663 assertEquals("Sentences", 212, ki.numberOf("sentences"));
664 assertEquals("Tokens", 2019, ki.numberOf("tokens"));
665
666 assertTrue(ki.delDoc(2));
667 assertTrue(ki.delDoc(3));
668 assertTrue(ki.delDoc(4));
669 assertTrue(ki.delDoc(5));
670 assertTrue(ki.delDoc(6));
671 assertTrue(ki.delDoc(7));
672 ki.commit();
673
674 assertEquals("Documents", 1, ki.numberOf("documents"));
675 assertEquals("Paragraphs", 75, ki.numberOf("paragraphs"));
Akronfd05f502015-07-30 18:34:26 +0200676 };
677
678
Akron3ba74f22015-07-24 18:46:17 +0200679 private FieldDocument createDoc1 () {
680 FieldDocument fd = new FieldDocument();
681 fd.addString("ID", "doc-1");
682 fd.addString("author", "Frank");
683 fd.addKeyword("textClass", "Nachricht Kultur Reisen");
684 fd.addInt("pubDate", 20051210);
685 fd.addText("text", "Der alte Mann ging über die Straße");
686 return fd;
687 };
688
Akron40550172015-08-04 03:06:12 +0200689
Akron3ba74f22015-07-24 18:46:17 +0200690 private FieldDocument createDoc2 () {
691 FieldDocument fd = new FieldDocument();
692 fd.addString("ID", "doc-2");
693 fd.addString("author", "Peter");
694 fd.addKeyword("textClass", "Kultur Reisen");
695 fd.addInt("pubDate", 20051207);
696 fd.addText("text", "Der junge Mann hatte keine andere Wahl");
697 return fd;
698 };
699
Akron40550172015-08-04 03:06:12 +0200700
Akron3ba74f22015-07-24 18:46:17 +0200701 private FieldDocument createDoc3 () {
702 FieldDocument fd = new FieldDocument();
703 fd.addString("ID", "doc-3");
704 fd.addString("author", "Sebastian");
705 fd.addKeyword("textClass", "Reisen Finanzen");
706 fd.addInt("pubDate", 20051216);
707 fd.addText("text", "Die Frau und der Mann küssten sich");
708 return fd;
709 };
710};