blob: b5f3dcfd15c3b270bce282afa2b5c30509a5714f [file] [log] [blame]
Eliza Margaretha6a780692014-01-15 09:45:42 +00001package de.ids_mannheim.korap.search;
2
Nils Diewaldc925b492013-12-03 23:56:10 +00003import java.util.*;
4import java.io.*;
5
Nils Diewald56dc2582014-11-04 21:33:46 +00006import static de.ids_mannheim.korap.TestSimple.*;
7
Nils Diewaldc925b492013-12-03 23:56:10 +00008import de.ids_mannheim.korap.KorapSearch;
Nils Diewald2276e1c2014-04-10 15:01:59 +00009import de.ids_mannheim.korap.KorapCollection;
Nils Diewaldc925b492013-12-03 23:56:10 +000010import de.ids_mannheim.korap.KorapQuery;
11import de.ids_mannheim.korap.KorapIndex;
Nils Diewald2276e1c2014-04-10 15:01:59 +000012import de.ids_mannheim.korap.index.FieldDocument;
Nils Diewald1e5d5942014-05-20 13:29:53 +000013import de.ids_mannheim.korap.index.SearchContext;
Nils Diewaldea969502015-02-16 21:10:54 +000014import de.ids_mannheim.korap.collection.CollectionBuilder;
Nils Diewaldc925b492013-12-03 23:56:10 +000015import de.ids_mannheim.korap.KorapResult;
16import java.nio.file.Files;
17import java.nio.file.FileSystem;
18import java.nio.file.Path;
19import java.nio.charset.StandardCharsets;
20import java.nio.ByteBuffer;
21
Nils Diewald277e9ce2014-11-06 03:42:11 +000022import com.fasterxml.jackson.databind.ObjectMapper;
23import com.fasterxml.jackson.databind.JsonNode;
24
Nils Diewaldc925b492013-12-03 23:56:10 +000025import static org.junit.Assert.*;
26import org.junit.Test;
27import org.junit.Ignore;
28import org.junit.runner.RunWith;
29import org.junit.runners.JUnit4;
30
31@RunWith(JUnit4.class)
32public class TestKorapSearch {
33 @Test
34 public void searchCount () {
Nils Diewaldafab8f32015-01-26 19:11:32 +000035 KorapSearch ks = new KorapSearch(
36 new KorapQuery("field1").seg("a").with("b")
Nils Diewaldc925b492013-12-03 23:56:10 +000037 );
Nils Diewaldafab8f32015-01-26 19:11:32 +000038 // Count:
39 ks.setCount(30);
40 assertEquals(ks.getCount(), 30);
41 ks.setCount(20);
42 assertEquals(ks.getCount(), 20);
43 ks.setCount(-50);
44 assertEquals(ks.getCount(), 20);
45 ks.setCount(500);
46 assertEquals(ks.getCount(), ks.getCountMax());
Nils Diewaldc925b492013-12-03 23:56:10 +000047 };
48
49 @Test
50 public void searchStartIndex () {
Nils Diewaldafab8f32015-01-26 19:11:32 +000051 KorapSearch ks = new KorapSearch(
52 new KorapQuery("field1").seg("a").with("b")
Nils Diewaldc925b492013-12-03 23:56:10 +000053 );
Nils Diewaldafab8f32015-01-26 19:11:32 +000054 // startIndex
55 ks.setStartIndex(5);
56 assertEquals(ks.getStartIndex(), 5);
57 ks.setStartIndex(1);
58 assertEquals(ks.getStartIndex(), 1);
59 ks.setStartIndex(0);
60 assertEquals(ks.getStartIndex(), 0);
61 ks.setStartIndex(70);
62 assertEquals(ks.getStartIndex(), 70);
63 ks.setStartIndex(-5);
64 assertEquals(ks.getStartIndex(), 0);
Nils Diewaldc925b492013-12-03 23:56:10 +000065 };
66
67 @Test
68 public void searchQuery () {
Nils Diewaldafab8f32015-01-26 19:11:32 +000069 KorapSearch ks = new KorapSearch(
70 new KorapQuery("field1").seg("a").with("b")
Nils Diewaldc925b492013-12-03 23:56:10 +000071 );
Nils Diewaldafab8f32015-01-26 19:11:32 +000072 // query
73 assertEquals(ks.getQuery().toString(), "spanSegment(field1:a, field1:b)");
Nils Diewaldc925b492013-12-03 23:56:10 +000074 };
75
Nils Diewaldafab8f32015-01-26 19:11:32 +000076
Nils Diewaldc925b492013-12-03 23:56:10 +000077 @Test
78 public void searchIndex () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +000079 // Construct index
80 KorapIndex ki = new KorapIndex();
81 // Indexing test files
82 for (String i : new String[] {"00001",
83 "00002",
84 "00003",
85 "00004",
86 "00005",
87 "00006",
88 "02439"}) {
89 ki.addDocFile(
90 getClass().getResource("/wiki/" + i + ".json.gz").getFile(),
91 true
Nils Diewaldc925b492013-12-03 23:56:10 +000092 );
Nils Diewaldafab8f32015-01-26 19:11:32 +000093 };
94 ki.commit();
Nils Diewaldc925b492013-12-03 23:56:10 +000095
Nils Diewaldafab8f32015-01-26 19:11:32 +000096 KorapSearch ks = new KorapSearch(
97 new KorapQuery("tokens").seg("s:Buchstaben")
98 );
99 ks.getCollection().filter(
Nils Diewaldea969502015-02-16 21:10:54 +0000100 new CollectionBuilder().and("textClass", "reisen")
Nils Diewaldc925b492013-12-03 23:56:10 +0000101 );
Nils Diewaldafab8f32015-01-26 19:11:32 +0000102 ks.setCount(3);
103 ks.setStartIndex(5);
104 ks.context.left.setLength(1);
105 ks.context.right.setLength(1);
106 KorapResult kr = ks.run(ki);
107 assertEquals(kr.getTotalResults(), 6);
108 assertEquals(
109 kr.getMatch(0).getSnippetBrackets(),
110 "... dem [Buchstaben] A ..."
111 );
Nils Diewaldc925b492013-12-03 23:56:10 +0000112 };
Nils Diewaldc6b78752013-12-05 19:05:12 +0000113
Nils Diewaldafab8f32015-01-26 19:11:32 +0000114
Nils Diewaldc6b78752013-12-05 19:05:12 +0000115 @Test
116 public void searchJSON () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000117 // Construct index
118 KorapIndex ki = new KorapIndex();
119 // Indexing test files
120 for (String i : new String[] {"00001",
121 "00002",
122 "00003",
123 "00004",
124 "00005",
125 "00006",
126 "02439"}) {
127 ki.addDocFile(
128 getClass().getResource("/wiki/" + i + ".json.gz").getFile(),
129 true
Nils Diewaldc6b78752013-12-05 19:05:12 +0000130 );
Nils Diewaldafab8f32015-01-26 19:11:32 +0000131 };
132 ki.commit();
Nils Diewaldc6b78752013-12-05 19:05:12 +0000133
Nils Diewaldafab8f32015-01-26 19:11:32 +0000134 String json = getString(
135 getClass().getResource("/queries/metaquery3.jsonld").getFile()
136 );
Nils Diewaldc6b78752013-12-05 19:05:12 +0000137
Nils Diewaldafab8f32015-01-26 19:11:32 +0000138 KorapSearch ks = new KorapSearch(json);
139 KorapResult kr = ks.run(ki);
140 assertEquals(kr.getTotalResults(), 66);
141 assertEquals(5, kr.getItemsPerPage());
142 assertEquals(5, kr.getStartIndex());
143 assertEquals(
144 "... a: A ist [der klangreichste] der V ...",
145 kr.getMatch(0).getSnippetBrackets()
146 );
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000147 };
Nils Diewald01b4ce32013-12-05 22:39:25 +0000148
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000149 @Test
150 public void searchJSON2 () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000151 // Construct index
152 KorapIndex ki = new KorapIndex();
153 // Indexing test files
154 for (String i : new String[] {"00001",
155 "00002",
156 "00003",
157 "00004",
158 "00005",
159 "00006",
160 "02439",
161 "00012-fakemeta",
162 "00030-fakemeta",
163 /*
164 "02035-substring",
165 "05663-unbalanced",
166 "07452-deep"
167 */
168 }) {
169 ki.addDocFile(
170 getClass().getResource("/wiki/" + i + ".json.gz").getFile(),
171 true
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000172 );
Nils Diewaldafab8f32015-01-26 19:11:32 +0000173 };
174 ki.commit();
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000175
Nils Diewaldafab8f32015-01-26 19:11:32 +0000176 String json = getString(
177 getClass().getResource("/queries/metaquery4.jsonld").getFile()
178 );
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000179
Nils Diewaldafab8f32015-01-26 19:11:32 +0000180 KorapSearch ks = new KorapSearch(json);
181 KorapResult kr = ks.run(ki);
Nils Diewaldc86aa482014-02-12 16:58:05 +0000182
Nils Diewaldafab8f32015-01-26 19:11:32 +0000183 assertEquals(kr.getTotalResults(), 1);
Nils Diewald979b2fe2014-09-29 16:21:41 +0000184
Nils Diewaldafab8f32015-01-26 19:11:32 +0000185 ks = new KorapSearch(json);
186 // Ignore the collection part of the query!
187 ks.setCollection(new KorapCollection());
188 kr = ks.run(ki);
Nils Diewald979b2fe2014-09-29 16:21:41 +0000189
Nils Diewaldafab8f32015-01-26 19:11:32 +0000190 assertEquals(kr.getTotalResults(), 5);
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000191
Nils Diewaldafab8f32015-01-26 19:11:32 +0000192 json = getString(
193 getClass().getResource("/queries/metaquery5.jsonld").getFile()
194 );
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000195
Nils Diewaldafab8f32015-01-26 19:11:32 +0000196 ks = new KorapSearch(json);
197 kr = ks.run(ki);
198 assertEquals(kr.getTotalResults(), 1);
199
200 json = getString(
201 getClass().getResource("/queries/metaquery6.jsonld").getFile()
202 );
203 ks = new KorapSearch(json);
204 kr = ks.run(ki);
205 assertEquals(kr.getTotalResults(), 1);
Nils Diewaldc6b78752013-12-05 19:05:12 +0000206 };
207
208
209 @Test
210 public void searchJSONFailure () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000211 // Construct index
212 KorapIndex ki = new KorapIndex();
213 // Indexing test files
214 for (String i : new String[] {"00001",
215 "00002",
216 "00003",
217 "00004",
218 "00005",
219 "00006",
220 "02439"
221 }) {
222 ki.addDocFile(
223 getClass().getResource("/wiki/" + i + ".json.gz").getFile(),
224 true
Nils Diewaldc6b78752013-12-05 19:05:12 +0000225 );
Nils Diewaldafab8f32015-01-26 19:11:32 +0000226 };
227 ki.commit();
228 KorapResult kr = new KorapSearch("{ query").run(ki);
229 assertEquals(kr.getTotalResults(), 0);
230 assertEquals(kr.getError(0).getMessage(), "Unable to parse JSON");
Nils Diewaldc6b78752013-12-05 19:05:12 +0000231 };
232
233
Nils Diewald9f310832013-12-06 22:38:55 +0000234 @Test
235 public void searchJSONindexboundary () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000236 // Construct index
237 KorapIndex ki = new KorapIndex();
238 // Indexing test files
239 for (String i : new String[] {"00001",
240 "00002",
241 "00003",
242 "00004",
243 "00005",
244 "00006",
245 "02439"}) {
246 ki.addDocFile(
247 getClass().getResource("/wiki/" + i + ".json.gz").getFile(),
248 true
Nils Diewald9f310832013-12-06 22:38:55 +0000249 );
Nils Diewaldafab8f32015-01-26 19:11:32 +0000250 };
251 ki.commit();
Nils Diewald9f310832013-12-06 22:38:55 +0000252
Nils Diewaldafab8f32015-01-26 19:11:32 +0000253 String json = getString(
254 getClass().getResource("/queries/bsp-fail1.jsonld").getFile()
255 );
Nils Diewald9f310832013-12-06 22:38:55 +0000256
Nils Diewaldafab8f32015-01-26 19:11:32 +0000257 KorapResult kr = new KorapSearch(json).run(ki);
258 assertEquals(0, kr.getStartIndex());
259 assertEquals(kr.getTotalResults(), 0);
260 assertEquals(25, kr.getItemsPerPage());
Nils Diewald9f310832013-12-06 22:38:55 +0000261 };
262
Nils Diewaldafab8f32015-01-26 19:11:32 +0000263
Nils Diewald9f310832013-12-06 22:38:55 +0000264 @Test
265 public void searchJSONindexboundary2 () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000266 // Construct index
267 KorapIndex ki = new KorapIndex();
268 // Indexing test files
269 for (String i : new String[] {"00001",
270 "00002",
271 "00003",
272 "00004",
273 "00005",
274 "00006",
275 "02439"}) {
276 ki.addDocFile(
277 getClass().getResource("/wiki/" + i + ".json.gz").getFile(),
278 true
Nils Diewald9f310832013-12-06 22:38:55 +0000279 );
Nils Diewaldafab8f32015-01-26 19:11:32 +0000280 };
281 ki.commit();
Nils Diewald9f310832013-12-06 22:38:55 +0000282
Nils Diewaldafab8f32015-01-26 19:11:32 +0000283 String json = getString(
284 getClass().getResource("/queries/bsp-fail2.jsonld").getFile()
285 );
Nils Diewald9f310832013-12-06 22:38:55 +0000286
Nils Diewaldafab8f32015-01-26 19:11:32 +0000287 KorapResult kr = new KorapSearch(json).run(ki);
288 assertEquals(50, kr.getItemsPerPage());
289 assertEquals(49950, kr.getStartIndex());
290 assertEquals(kr.getTotalResults(), 0);
Nils Diewald9f310832013-12-06 22:38:55 +0000291 };
292
Nils Diewaldc6b78752013-12-05 19:05:12 +0000293
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000294 @Test
295 public void searchJSONcontext () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000296 // Construct index
297 KorapIndex ki = new KorapIndex();
298 // Indexing test files
299 for (String i : new String[] {"00001",
300 "00002",
301 "00003",
302 "00004",
303 "00005",
304 "00006",
305 "02439"}) {
306 ki.addDocFile(
307 getClass().getResource("/wiki/" + i + ".json.gz").getFile(),
308 true
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000309 );
Nils Diewaldafab8f32015-01-26 19:11:32 +0000310 };
311 ki.commit();
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000312
Nils Diewaldafab8f32015-01-26 19:11:32 +0000313 String json = getString(
314 getClass().getResource("/queries/bsp-context.jsonld").getFile()
315 );
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000316
Nils Diewaldafab8f32015-01-26 19:11:32 +0000317 KorapSearch ks = new KorapSearch(json);
318 KorapResult kr = ks.run(ki);
319 assertEquals(kr.getTotalResults(), 10);
320 assertEquals("A bzw. a ist der erste Buchstabe des" +
321 " lateinischen [Alphabets] und ein Vokal." +
322 " Der Buchstabe A hat in deutschen Texten" +
323 " eine durchschnittliche Häufigkeit ...",
324 kr.getMatch(0).getSnippetBrackets());
Nils Diewaldb3a09db2013-12-21 00:22:02 +0000325
Nils Diewaldafab8f32015-01-26 19:11:32 +0000326 ks.setCount(5);
327 ks.setStartPage(2);
328 kr = ks.run(ki);
329 assertEquals(kr.getTotalResults(), 10);
330 assertEquals(5, kr.getStartIndex());
331 assertEquals(5, kr.getItemsPerPage());
Nils Diewald891c53c2013-12-23 16:37:46 +0000332
Nils Diewaldafab8f32015-01-26 19:11:32 +0000333 json = getString(
334 getClass().getResource("/queries/bsp-context-2.jsonld").getFile()
335 );
Nils Diewald891c53c2013-12-23 16:37:46 +0000336
Nils Diewaldafab8f32015-01-26 19:11:32 +0000337 kr = new KorapSearch(json).run(ki);
338 assertEquals(kr.getTotalResults(), -1);
339 assertEquals("... lls seit den Griechen beibehalten worden." +
340 " 3. Bedeutungen in der Biologie steht A für"+
341 " das Nukleosid Adenosin steht A die Base"+
342 " Adenin steht A für die Aminosäure Alanin"+
343 " in der Informatik steht a für den dezimalen"+
344 " [Wert] 97 sowohl im ASCII- als auch im"+
345 " Unicode-Zeichensatz steht A für den dezimalen"+
346 " Wert 65 sowohl im ASCII- als auch im"+
347 " Unicode-Zeichensatz als Kfz-Kennzeichen"+
348 " steht A in Deutschland für Augsburg."+
349 " in Österreich auf ...",
350 kr.getMatch(0).getSnippetBrackets());
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000351 };
352
Nils Diewald364eb642013-12-22 15:03:01 +0000353 @Test
354 public void searchJSONstartPage () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000355 // Construct index
356 KorapIndex ki = new KorapIndex();
357 // Indexing test files
358 for (String i : new String[] {"00001",
359 "00002",
360 "00003",
361 "00004",
362 "00005",
363 "00006",
364 "02439"}) {
365 ki.addDocFile(
366 getClass().getResource("/wiki/" + i + ".json.gz").getFile(),
367 true
Nils Diewald364eb642013-12-22 15:03:01 +0000368 );
Nils Diewaldafab8f32015-01-26 19:11:32 +0000369 };
370 ki.commit();
Nils Diewald364eb642013-12-22 15:03:01 +0000371
Nils Diewaldafab8f32015-01-26 19:11:32 +0000372 String json = getString(
373 getClass().getResource("/queries/bsp-paging.jsonld").getFile()
374 );
Nils Diewald364eb642013-12-22 15:03:01 +0000375
Nils Diewaldafab8f32015-01-26 19:11:32 +0000376 KorapSearch ks = new KorapSearch(json);
377 KorapResult kr = ks.run(ki);
378 assertEquals(kr.getTotalResults(), 10);
379 assertEquals(5, kr.getStartIndex());
380 assertEquals(5, kr.getItemsPerPage());
Nils Diewald364eb642013-12-22 15:03:01 +0000381
Nils Diewaldafab8f32015-01-26 19:11:32 +0000382 json = getString(
383 getClass().getResource("/queries/bsp-cutoff.jsonld").getFile()
384 );
385 ks = ks = new KorapSearch(json);
386 kr = ks.run(ki);
387 assertEquals(kr.getTotalResults(), -1);
388 assertEquals(2, kr.getStartIndex());
389 assertEquals(2, kr.getItemsPerPage());
Nils Diewald364eb642013-12-22 15:03:01 +0000390
Nils Diewaldafab8f32015-01-26 19:11:32 +0000391 json = getString(
392 getClass().getResource("/queries/metaquery9.jsonld").getFile()
393 );
394 KorapCollection kc = new KorapCollection(json);
395 kc.setIndex(ki);
396 assertEquals(7, kc.numberOf("documents"));
Nils Diewald364eb642013-12-22 15:03:01 +0000397 };
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000398
Nils Diewaldafab8f32015-01-26 19:11:32 +0000399
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000400 @Test
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000401 public void searchJSONitemsPerResource () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000402 // Construct index
403 KorapIndex ki = new KorapIndex();
404 // Indexing test files
405 for (String i : new String[] {"00001",
406 "00002",
407 "00003",
408 "00004",
409 "00005",
410 "00006",
411 "02439"}) {
412 ki.addDocFile(
413 getClass().getResource("/wiki/" + i + ".json.gz").getFile(),
414 true
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000415 );
Nils Diewaldafab8f32015-01-26 19:11:32 +0000416 };
417 ki.commit();
418 String json = getString(
419 getClass().
420 getResource("/queries/bsp-itemsPerResource.jsonld").
421 getFile()
422 );
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000423
Nils Diewaldafab8f32015-01-26 19:11:32 +0000424 KorapSearch ks = new KorapSearch(json);
425 KorapResult kr = ks.run(ki);
426 assertEquals(kr.getTotalResults(), 10);
427 assertEquals(0, kr.getStartIndex());
428 assertEquals(20, kr.getItemsPerPage());
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000429
Nils Diewaldafab8f32015-01-26 19:11:32 +0000430 assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID());
431 assertEquals("WPD_AAA.00001", kr.getMatch(1).getDocID());
432 assertEquals("WPD_AAA.00001", kr.getMatch(6).getDocID());
433 assertEquals("WPD_AAA.00002", kr.getMatch(7).getDocID());
434 assertEquals("WPD_AAA.00002", kr.getMatch(8).getDocID());
435 assertEquals("WPD_AAA.00004", kr.getMatch(9).getDocID());
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000436
Nils Diewaldafab8f32015-01-26 19:11:32 +0000437 ks = new KorapSearch(json);
438 ks.setItemsPerResource(1);
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000439
Nils Diewaldafab8f32015-01-26 19:11:32 +0000440 kr = ks.run(ki);
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000441
Nils Diewaldafab8f32015-01-26 19:11:32 +0000442 assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID());
443 assertEquals("WPD_AAA.00002", kr.getMatch(1).getDocID());
444 assertEquals("WPD_AAA.00004", kr.getMatch(2).getDocID());
445
446 assertEquals(kr.getTotalResults(), 3);
447 assertEquals(0, kr.getStartIndex());
448 assertEquals(20, kr.getItemsPerPage());
449
450 ks = new KorapSearch(json);
451 ks.setItemsPerResource(2);
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000452
Nils Diewaldafab8f32015-01-26 19:11:32 +0000453 kr = ks.run(ki);
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000454
Nils Diewaldafab8f32015-01-26 19:11:32 +0000455 assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID());
456 assertEquals("WPD_AAA.00001", kr.getMatch(1).getDocID());
457 assertEquals("WPD_AAA.00002", kr.getMatch(2).getDocID());
458 assertEquals("WPD_AAA.00002", kr.getMatch(3).getDocID());
459 assertEquals("WPD_AAA.00004", kr.getMatch(4).getDocID());
460
461 assertEquals(kr.getTotalResults(), 5);
462 assertEquals(0, kr.getStartIndex());
463 assertEquals(20, kr.getItemsPerPage());
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000464
Nils Diewaldafab8f32015-01-26 19:11:32 +0000465 ks = new KorapSearch(json);
466 ks.setItemsPerResource(1);
467 ks.setStartIndex(1);
468 ks.setCount(1);
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000469
Nils Diewaldafab8f32015-01-26 19:11:32 +0000470 kr = ks.run(ki);
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000471
Nils Diewaldafab8f32015-01-26 19:11:32 +0000472 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000473
Nils Diewaldafab8f32015-01-26 19:11:32 +0000474 assertEquals(kr.getTotalResults(), 3);
475 assertEquals(1, kr.getStartIndex());
476 assertEquals(1, kr.getItemsPerPage());
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000477
Nils Diewaldafab8f32015-01-26 19:11:32 +0000478 assertEquals((short) 1, kr.getItemsPerResource());
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000479 };
480
Nils Diewaldafab8f32015-01-26 19:11:32 +0000481
Nils Diewaldd723d812014-09-23 18:50:52 +0000482 @Test
483 public void searchJSONitemsPerResourceServer () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000484 /*
485 * This test is a server-only implementation of
486 * TestResource#testCollection
487 */
488 // Construct index
489 KorapIndex ki = new KorapIndex();
490 // Indexing test files
491 int uid = 1;
492 for (String i : new String[] {"00001",
493 "00002",
494 "00003",
495 "00004",
496 "00005",
497 "00006",
498 "02439"}) {
499 ki.addDocFile(
500 uid++,
501 getClass().getResource("/wiki/" + i + ".json.gz").getFile(),
502 true
Nils Diewaldd723d812014-09-23 18:50:52 +0000503 );
Nils Diewaldafab8f32015-01-26 19:11:32 +0000504 };
505 ki.commit();
Nils Diewaldd723d812014-09-23 18:50:52 +0000506
Nils Diewaldafab8f32015-01-26 19:11:32 +0000507 String json = getString(
508 getClass().
509 getResource("/queries/bsp-uid-example.jsonld").
510 getFile()
511 );
Nils Diewaldd723d812014-09-23 18:50:52 +0000512
Nils Diewaldafab8f32015-01-26 19:11:32 +0000513 KorapSearch ks = new KorapSearch(json);
514 ks.setItemsPerResource(1);
515 KorapCollection kc = new KorapCollection();
516 kc.filterUIDs(new String[]{"1", "4"});
517 kc.setIndex(ki);
518 ks.setCollection(kc);
Nils Diewaldd723d812014-09-23 18:50:52 +0000519
Nils Diewaldafab8f32015-01-26 19:11:32 +0000520 KorapResult kr = ks.run(ki);
Nils Diewaldd723d812014-09-23 18:50:52 +0000521
Nils Diewaldafab8f32015-01-26 19:11:32 +0000522 assertEquals(kr.getTotalResults(), 2);
523 assertEquals(0, kr.getStartIndex());
524 assertEquals(25, kr.getItemsPerPage());
Nils Diewaldd723d812014-09-23 18:50:52 +0000525 };
Nils Diewaldba197f22014-11-01 17:21:46 +0000526
Nils Diewaldafab8f32015-01-26 19:11:32 +0000527
Nils Diewaldba197f22014-11-01 17:21:46 +0000528 @Test
529 public void searchJSONnewJSON () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000530 // Construct index
531 KorapIndex ki = new KorapIndex();
532 // Indexing test files
533 FieldDocument fd = ki.addDocFile(
534 1,
535 getClass().
536 getResource("/goe/AGA-03828.json.gz").
537 getFile(),
538 true
Nils Diewaldba197f22014-11-01 17:21:46 +0000539 );
Nils Diewaldafab8f32015-01-26 19:11:32 +0000540 ki.commit();
Nils Diewaldba197f22014-11-01 17:21:46 +0000541
Nils Diewaldafab8f32015-01-26 19:11:32 +0000542 assertEquals(fd.getUID(), 1);
543 assertEquals(fd.getTextSigle(), "GOE_AGA.03828");
544 assertEquals(fd.getDocSigle(), "GOE_AGA");
545 assertEquals(fd.getCorpusSigle(), "GOE");
546 assertEquals(fd.getTitle() , "Autobiographische Einzelheiten");
547 assertNull(fd.getSubTitle());
548 assertEquals(fd.getTextType(), "Autobiographie");
549 assertNull(fd.getTextTypeArt());
550 assertNull(fd.getTextTypeRef());
551 assertNull(fd.getTextColumn());
552 assertNull(fd.getTextDomain());
553 assertEquals(fd.getPages(), "529-547");
554 assertEquals(fd.getLicense(), "QAO-NC");
555 assertEquals(fd.getCreationDate().toString(), "18200000");
556 assertEquals(fd.getPubDate().toString(), "19820000");
557 assertEquals(fd.getAuthor(), "Goethe, Johann Wolfgang von");
558 assertNull(fd.getTextClass());
559 assertEquals(fd.getLanguage(), "de");
560 assertEquals(fd.getPubPlace(), "München");
561 assertEquals(fd.getReference(),
562 "Goethe, Johann Wolfgang von:"+
563 " Autobiographische Einzelheiten,"+
564 " (Geschrieben bis 1832), In: Goethe,"+
565 " Johann Wolfgang von: Goethes Werke,"+
566 " Bd. 10, Autobiographische Schriften"+
567 " II, Hrsg.: Trunz, Erich. München: "+
568 "Verlag C. H. Beck, 1982, S. 529-547");
569 assertEquals(fd.getPublisher(), "Verlag C. H. Beck");
570 assertNull(fd.getEditor());
571 assertNull(fd.getFileEditionStatement());
572 assertNull(fd.getBiblEditionStatement());
573 assertNull(fd.getKeywords());
574
575 assertEquals(fd.getTokenSource(), "opennlp#tokens");
576 assertEquals(fd.getFoundries(),
577 "base base/paragraphs base/sentences corenlp "+
578 "corenlp/constituency corenlp/morpho "+
579 "corenlp/namedentities corenlp/sentences "+
580 "glemm glemm/morpho mate mate/morpho"+
581 " opennlp opennlp/morpho opennlp/sentences"+
582 " treetagger treetagger/morpho "+
583 "treetagger/sentences");
584 assertEquals(fd.getLayerInfos(),
585 "base/s=spans corenlp/c=spans corenlp/ne=tokens"+
586 " corenlp/p=tokens corenlp/s=spans glemm/l=tokens"+
587 " mate/l=tokens mate/m=tokens mate/p=tokens"+
588 " opennlp/p=tokens opennlp/s=spans tt/l=tokens"+
589 " tt/p=tokens tt/s=spans");
590
591 assertEquals(fd.getCorpusTitle(), "Goethes Werke");
592 assertNull(fd.getCorpusSubTitle());
593 assertEquals(fd.getCorpusAuthor(), "Goethe, Johann Wolfgang von");
594 assertEquals(fd.getCorpusEditor(), "Trunz, Erich");
595 assertEquals(fd.getDocTitle(),
596 "Goethe: Autobiographische Schriften II, (1817-1825, 1832)"
597 );
598 assertNull(fd.getDocSubTitle());
599 assertNull(fd.getDocEditor());
600 assertNull(fd.getDocAuthor());
601
602 KorapSearch ks = new KorapSearch(
603 new KorapQuery("tokens").
604 seg("mate/m:case:nom").
605 with("mate/m:number:pl")
606 );
607 KorapResult kr = ks.run(ki);
608
609 assertEquals(kr.getTotalResults(), 148);
610 assertEquals(0, kr.getStartIndex());
611 assertEquals(25, kr.getItemsPerPage());
Nils Diewaldba197f22014-11-01 17:21:46 +0000612 };
Nils Diewald06368ba2014-11-03 20:53:27 +0000613
Nils Diewaldafab8f32015-01-26 19:11:32 +0000614
Nils Diewald06368ba2014-11-03 20:53:27 +0000615 @Test
616 public void searchJSONnewJSON2 () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000617 // Construct index
618 KorapIndex ki = new KorapIndex();
619 // Indexing test files
620 FieldDocument fd = ki.addDocFile(
621 1,
622 getClass().
623 getResource("/bzk/D59-00089.json.gz").
624 getFile(),
625 true
Nils Diewald06368ba2014-11-03 20:53:27 +0000626 );
Nils Diewaldafab8f32015-01-26 19:11:32 +0000627 ki.commit();
Nils Diewald06368ba2014-11-03 20:53:27 +0000628
Nils Diewaldafab8f32015-01-26 19:11:32 +0000629 assertEquals(fd.getUID(), 1);
630 assertEquals(fd.getTextSigle(), "BZK_D59.00089");
631 assertEquals(fd.getDocSigle(), "BZK_D59");
632 assertEquals(fd.getCorpusSigle(), "BZK");
633 assertEquals(fd.getTitle() , "Saragat-Partei zerfällt");
634 assertEquals(fd.getPubDate().toString(), "19590219");
635
636 assertNull(fd.getSubTitle());
637 assertNull(fd.getAuthor());
638 assertNull(fd.getEditor());
639 assertEquals(fd.getPubPlace(), "Berlin");
640 assertNull(fd.getPublisher());
641 assertEquals(fd.getTextType(), "Zeitung: Tageszeitung");
642 assertNull(fd.getTextTypeArt());
643 assertEquals(fd.getTextTypeRef(), "Tageszeitung");
644 assertEquals(fd.getTextDomain(), "Politik");
645 assertEquals(fd.getCreationDate().toString(), "19590219");
646 assertEquals(fd.getLicense(), "ACA-NC-LC");
647 assertEquals(fd.getTextColumn(), "POLITIK");
648 assertNull(fd.getPages());
649 assertEquals(fd.getTextClass(), "politik ausland");
650 assertNull(fd.getFileEditionStatement());
651 assertNull(fd.getBiblEditionStatement());
652
653 assertEquals(fd.getLanguage(), "de");
654 assertEquals(
655 fd.getReference(),
656 "Neues Deutschland, [Tageszeitung], 19.02.1959, Jg. 14,"+
657 " Berliner Ausgabe, S. 7. - Sachgebiet: Politik, "+
658 "Originalressort: POLITIK; Saragat-Partei zerfällt");
659 assertNull(fd.getPublisher());
660 assertNull(fd.getKeywords());
661
662 assertEquals(fd.getTokenSource(), "opennlp#tokens");
663
664 assertEquals(
665 fd.getFoundries(),
666 "base base/paragraphs base/sentences corenlp "+
667 "corenlp/constituency corenlp/morpho corenlp/namedentities"+
668 " corenlp/sentences glemm glemm/morpho mate mate/morpho"+
669 " opennlp opennlp/morpho opennlp/sentences treetagger"+
670 " treetagger/morpho treetagger/sentences");
671
672 assertEquals(
673 fd.getLayerInfos(),
674 "base/s=spans corenlp/c=spans corenlp/ne=tokens"+
675 " corenlp/p=tokens corenlp/s=spans glemm/l=tokens"+
676 " mate/l=tokens mate/m=tokens mate/p=tokens"+
677 " opennlp/p=tokens opennlp/s=spans tt/l=tokens"+
678 " tt/p=tokens tt/s=spans");
679
680 assertEquals(fd.getCorpusTitle(), "Bonner Zeitungskorpus");
681 assertNull(fd.getCorpusSubTitle());
682 assertNull(fd.getCorpusAuthor());
683 assertNull(fd.getCorpusEditor());
684
685 assertEquals(fd.getDocTitle(), "Neues Deutschland");
686 assertEquals(
687 fd.getDocSubTitle(),
688 "Organ des Zentralkomitees der Sozialistischen "+
689 "Einheitspartei Deutschlands");
690 assertNull(fd.getDocEditor());
691 assertNull(fd.getDocAuthor());
692
693 KorapSearch ks = new KorapSearch(
694 new KorapQuery("tokens").
695 seg("mate/m:case:nom").
696 with("mate/m:number:sg")
697 );
698 KorapResult kr = ks.run(ki);
699
700 assertEquals(kr.getTotalResults(), 6);
701 assertEquals(0, kr.getStartIndex());
702 assertEquals(25, kr.getItemsPerPage());
Nils Diewald06368ba2014-11-03 20:53:27 +0000703 };
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000704
Nils Diewaldafab8f32015-01-26 19:11:32 +0000705
/**
 * Regression test around focus queries with class numbers on the
 * document {@code /bzk/D59-00089.json.gz}.
 *
 * <p>Three searches are run: a programmatic focus query on class 1
 * (the expected snippet carries a class 1 highlight), the same query on
 * class 129 (the expected snippet carries no class highlight), and the
 * serialized query from {@code /queries/bugs/cosmas_boundary.jsonld},
 * which is expected to yield exactly one result.
 *
 * @throws IOException declared because indexing the test file may fail
 */
@Test
public void searchJSONcosmasBoundaryBug () throws IOException {
    // Construct index with a single document; the returned
    // FieldDocument is not needed here.
    KorapIndex ki = new KorapIndex();
    ki.addDocFile(
        1,
        getClass().
        getResource("/bzk/D59-00089.json.gz").
        getFile(),
        true
    );
    ki.commit();

    String json = getString(
        getClass().
        getResource("/queries/bugs/cosmas_boundary.jsonld").
        getFile()
    );

    // NOTE(review): "_" as a method name is part of the KorapQuery API
    // used here; it is no longer a legal identifier from Java 9 on.
    KorapQuery kq = new KorapQuery("tokens");
    KorapSearch ks = new KorapSearch(
        kq.focus(
            1,
            kq.contains(kq.tag("base/s:s"), kq._(1, kq.seg("s:Leben")))
        )
    );

    KorapResult kr = ks.run(ki);

    // Expected value first, actual value second (JUnit convention)
    assertEquals(
        "focus(1: spanContain(<tokens:base/s:s />, {1: tokens:s:Leben}))",
        kr.getQuery()
    );
    assertEquals(
        "... Initiative\" eine neue politische Gruppierung ins " +
        "[{1:Leben}] gerufen hatten. Pressemeldungen zufolge haben sich ...",
        kr.getMatch(0).getSnippetBrackets()
    );

    // Try with high class - don't highlight
    ks = new KorapSearch(
        kq.focus(
            129,
            kq.contains(kq.tag("base/s:s"), kq._(129, kq.seg("s:Leben")))
        )
    );

    kr = ks.run(ki);
    assertEquals(
        "focus(129: spanContain(<tokens:base/s:s />, {129: tokens:s:Leben}))",
        kr.getQuery()
    );
    assertEquals(
        "... Initiative\" eine neue politische Gruppierung ins " +
        "[Leben] gerufen hatten. Pressemeldungen zufolge haben sich ...",
        kr.getMatch(0).getSnippetBrackets()
    );

    // Same kind of query, but deserialized from JSON
    ks = new KorapSearch(json);
    kr = ks.run(ki);
    assertEquals(
        "focus(129: spanElementDistance({129: tokens:s:Namen}, " +
        "{129: tokens:s:Leben}, [(base/s:s[0:1], notOrdered, notExcluded)]))",
        kr.getQuery()
    );
    assertEquals(
        "... ihren Austritt erklärt und unter dem [Namen \"Einheitsbewegung " +
        "der sozialistischen Initiative\" eine neue politische Gruppierung " +
        "ins Leben] gerufen hatten. Pressemeldungen zufolge haben sich ...",
        kr.getMatch(0).getSnippetBrackets()
    );
    assertEquals(1, kr.getTotalResults());
    assertEquals(0, kr.getStartIndex());
}
780
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000781 @Test
782 public void searchJSONmultipleClassesBug () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000783 // Construct index
784 KorapIndex ki = new KorapIndex();
785 // Indexing test files
786 ki.addDocFile(
787 1,
788 getClass().
789 getResource("/bzk/D59-00089.json.gz").
790 getFile(),
791 true
792 );
793 ki.addDocFile(
794 2,
795 getClass().
796 getResource("/bzk/D59-00089.json.gz").
797 getFile(),
798 true
799 );
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000800
Nils Diewaldafab8f32015-01-26 19:11:32 +0000801 ki.commit();
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000802
Nils Diewaldafab8f32015-01-26 19:11:32 +0000803 String json = getString(
804 getClass().
805 getResource("/queries/bugs/multiple_classes.jsonld").
806 getFile()
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000807 );
808
Nils Diewaldafab8f32015-01-26 19:11:32 +0000809 KorapSearch ks = new KorapSearch(json);
810 KorapResult kr = ks.run(ki);
811 assertEquals(
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000812 kr.getQuery(),
Nils Diewaldafab8f32015-01-26 19:11:32 +0000813 "{4: spanNext({1: spanNext({2: tokens:s:ins}, "+
814 "{3: tokens:s:Leben})}, tokens:s:gerufen)}"
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000815 );
Nils Diewaldafab8f32015-01-26 19:11:32 +0000816 assertEquals(
817 kr.getMatch(0).getSnippetBrackets(),
818 "... sozialistischen Initiative\" eine neue politische"+
819 " Gruppierung [{4:{1:{2:ins} {3:Leben}} gerufen}] hatten. " +
820 "Pressemeldungen zufolge haben sich in ..."
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000821 );
Nils Diewaldafab8f32015-01-26 19:11:32 +0000822 assertEquals(kr.getTotalResults(), 2);
823 assertEquals(0, kr.getStartIndex());
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000824 };
825
Nils Diewald277e9ce2014-11-06 03:42:11 +0000826 @Test
827 public void searchJSONmultipleClassesBugTokenList () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000828 // Construct index
829 KorapIndex ki = new KorapIndex();
830 // Indexing test files
831 ki.addDocFile(
832 1,
833 getClass().
834 getResource("/goe/AGA-03828.json.gz").
835 getFile(),
836 true
837 );
838 ki.addDocFile(
839 2,
840 getClass().
841 getResource("/bzk/D59-00089.json.gz").
842 getFile(),
843 true
844 );
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000845
Nils Diewaldafab8f32015-01-26 19:11:32 +0000846 ki.commit();
Nils Diewald277e9ce2014-11-06 03:42:11 +0000847
Nils Diewaldafab8f32015-01-26 19:11:32 +0000848 String json = getString(
849 getClass().
850 getResource("/queries/bugs/multiple_classes.jsonld").
851 getFile()
Nils Diewald277e9ce2014-11-06 03:42:11 +0000852 );
853
Nils Diewaldafab8f32015-01-26 19:11:32 +0000854 KorapSearch ks = new KorapSearch(json);
855 KorapResult kr = ks.run(ki);
Nils Diewald277e9ce2014-11-06 03:42:11 +0000856
Nils Diewaldafab8f32015-01-26 19:11:32 +0000857 ObjectMapper mapper = new ObjectMapper();
858 JsonNode res = mapper.readTree(kr.toTokenListJsonString());
Nils Diewald277e9ce2014-11-06 03:42:11 +0000859
Nils Diewaldafab8f32015-01-26 19:11:32 +0000860 assertEquals(1, res.at("/totalResults").asInt());
861 assertEquals(
862 "{4: spanNext({1: spanNext({2: tokens:s:ins}, " +
863 "{3: tokens:s:Leben})}, tokens:s:gerufen)}",
864 res.at("/query").asText());
865 assertEquals(0, res.at("/startIndex").asInt());
866 assertEquals(25, res.at("/itemsPerPage").asInt());
Nils Diewald277e9ce2014-11-06 03:42:11 +0000867
Nils Diewaldafab8f32015-01-26 19:11:32 +0000868 assertEquals("BZK_D59.00089", res.at("/matches/0/textSigle").asText());
869 assertEquals(328, res.at("/matches/0/tokens/0/0").asInt());
870 assertEquals(331, res.at("/matches/0/tokens/0/1").asInt());
871 assertEquals(332, res.at("/matches/0/tokens/1/0").asInt());
872 assertEquals(337, res.at("/matches/0/tokens/1/1").asInt());
873 assertEquals(338, res.at("/matches/0/tokens/2/0").asInt());
874 assertEquals(345, res.at("/matches/0/tokens/2/1").asInt());
Nils Diewald277e9ce2014-11-06 03:42:11 +0000875 };
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000876
Nils Diewaldafab8f32015-01-26 19:11:32 +0000877
Nils Diewaldb84e7272014-11-07 01:27:38 +0000878 @Test
879 public void searchJSONmultitermRewriteBug () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000880 // Construct index
881 KorapIndex ki = new KorapIndex();
Nils Diewaldb84e7272014-11-07 01:27:38 +0000882
Nils Diewaldafab8f32015-01-26 19:11:32 +0000883 assertEquals(ki.numberOf("documents"), 0);
884
885 // Indexing test files
886 FieldDocument fd = ki.addDocFile(
887 1,
888 getClass().
889 getResource("/bzk/D59-00089.json.gz").
890 getFile(),
891 true
892 );
893 ki.commit();
894
895 assertEquals(ki.numberOf("documents"), 1);
896 assertEquals("BZK", fd.getCorpusSigle());
897
898 // [tt/p="A.*"]{0,3}[tt/p="N.*"]
899 String json = getString(
900 getClass().
901 getResource("/queries/bugs/multiterm_rewrite.jsonld").
902 getFile()
Nils Diewaldb84e7272014-11-07 01:27:38 +0000903 );
904
Nils Diewaldafab8f32015-01-26 19:11:32 +0000905 KorapSearch ks = new KorapSearch(json);
906 KorapCollection kc = ks.getCollection();
Nils Diewaldc471b182014-11-19 22:51:15 +0000907
Nils Diewaldafab8f32015-01-26 19:11:32 +0000908 // No index was set
909 assertEquals(-1, kc.numberOf("documents"));
910 kc.setIndex(ki);
Nils Diewaldc471b182014-11-19 22:51:15 +0000911
Nils Diewaldafab8f32015-01-26 19:11:32 +0000912 // Index was set but vc restricted to WPD
913 assertEquals(0, kc.numberOf("documents"));
Nils Diewaldc471b182014-11-19 22:51:15 +0000914
Nils Diewaldafab8f32015-01-26 19:11:32 +0000915 kc.extend(
Nils Diewaldea969502015-02-16 21:10:54 +0000916 new CollectionBuilder().or("corpusSigle", "BZK")
Nils Diewaldc471b182014-11-19 22:51:15 +0000917 );
Nils Diewaldafab8f32015-01-26 19:11:32 +0000918 ks.setCollection(kc);
919 assertEquals(1, kc.numberOf("documents"));
Nils Diewald1220e3e2014-11-08 03:18:58 +0000920
Nils Diewaldafab8f32015-01-26 19:11:32 +0000921 KorapResult kr = ks.run(ki);
922
923 assertEquals(
Nils Diewaldb84e7272014-11-07 01:27:38 +0000924 kr.getQuery(),
Nils Diewaldafab8f32015-01-26 19:11:32 +0000925 "spanOr([SpanMultiTermQueryWrapper(tokens:/tt/p:N.*/), " +
926 "spanNext(spanRepetition(SpanMultiTermQueryWrapper"+
927 "(tokens:/tt/p:A.*/){1,3}), " +
928 "SpanMultiTermQueryWrapper(tokens:/tt/p:N.*/))])"
Nils Diewaldb84e7272014-11-07 01:27:38 +0000929 );
930
Nils Diewaldafab8f32015-01-26 19:11:32 +0000931 assertEquals(kr.getTotalResults(), 58);
932 assertEquals(0, kr.getStartIndex());
Nils Diewald5871e4d2014-11-07 03:48:25 +0000933
Nils Diewaldafab8f32015-01-26 19:11:32 +0000934 assertEquals(
935 kr.getMatch(0).getSnippetBrackets(),
936 "[Saragat-Partei] zerfällt Rom (ADN) die von dem"
Nils Diewald5871e4d2014-11-07 03:48:25 +0000937 );
Nils Diewaldafab8f32015-01-26 19:11:32 +0000938 assertEquals(
939 kr.getMatch(1).getSnippetBrackets(),
940 "[Saragat-Partei] zerfällt Rom (ADN) die von dem"
Nils Diewald5871e4d2014-11-07 03:48:25 +0000941 );
Nils Diewaldafab8f32015-01-26 19:11:32 +0000942 assertEquals(
943 kr.getMatch(2).getSnippetBrackets(),
944 "Saragat-Partei zerfällt [Rom] (ADN) "+
945 "die von dem Rechtssozialisten Saragat"
Nils Diewald5871e4d2014-11-07 03:48:25 +0000946 );
Nils Diewaldafab8f32015-01-26 19:11:32 +0000947 assertEquals(
948 kr.getMatch(3).getSnippetBrackets(),
949 "Saragat-Partei zerfällt Rom ([ADN]) "+
950 "die von dem Rechtssozialisten Saragat geführte"
Nils Diewald5871e4d2014-11-07 03:48:25 +0000951 );
952
Nils Diewaldafab8f32015-01-26 19:11:32 +0000953 assertEquals(
954 kr.getMatch(23).getSnippetBrackets(),
955 "dem Namen \"Einheitsbewegung der sozialistischen "+
956 "Initiative\" [eine neue politische Gruppierung] "+
957 "ins Leben gerufen hatten. Pressemeldungen zufolge"
Nils Diewald5871e4d2014-11-07 03:48:25 +0000958 );
Nils Diewaldb84e7272014-11-07 01:27:38 +0000959 };
960
961
Nils Diewald56dc2582014-11-04 21:33:46 +0000962 @Test
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000963 public void searchJSONCollection () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000964 // Construct index
965 KorapIndex ki = new KorapIndex();
966 // Indexing test files
967 for (String i : new String[] {"00001",
968 "00002",
969 "00003",
970 "00004",
971 "00005",
972 "00006",
973 "02439"}) {
974 ki.addDocFile(
975 getClass().
976 getResource("/wiki/" + i + ".json.gz").
977 getFile(),
978 true
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000979 );
Nils Diewaldafab8f32015-01-26 19:11:32 +0000980 };
981 ki.commit();
982 String json = getString(
983 getClass().
984 getResource("/queries/metaquery8-nocollection.jsonld").
985 getFile()
Nils Diewald56dc2582014-11-04 21:33:46 +0000986 );
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000987
Nils Diewaldafab8f32015-01-26 19:11:32 +0000988 KorapSearch ks = new KorapSearch(json);
989 KorapResult kr = ks.run(ki);
990 assertEquals(kr.getTotalResults(), 276);
991 assertEquals(0, kr.getStartIndex());
992 assertEquals(10, kr.getItemsPerPage());
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000993
Nils Diewaldafab8f32015-01-26 19:11:32 +0000994 json = getString(
995 getClass().
996 getResource("/queries/metaquery8.jsonld").
997 getFile()
998 );
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000999
Nils Diewaldafab8f32015-01-26 19:11:32 +00001000 ks = new KorapSearch(json);
1001 kr = ks.run(ki);
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001002
Nils Diewaldafab8f32015-01-26 19:11:32 +00001003 assertEquals(kr.getTotalResults(), 147);
1004 assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID());
1005 assertEquals(0, kr.getStartIndex());
1006 assertEquals(10, kr.getItemsPerPage());
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001007
Nils Diewaldafab8f32015-01-26 19:11:32 +00001008 json = getString(
1009 getClass().
1010 getResource("/queries/metaquery8-filtered.jsonld").
1011 getFile()
1012 );
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001013
Nils Diewaldafab8f32015-01-26 19:11:32 +00001014 ks = new KorapSearch(json);
1015 kr = ks.run(ki);
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001016
Nils Diewaldafab8f32015-01-26 19:11:32 +00001017 assertEquals(kr.getTotalResults(), 28);
1018 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
1019 assertEquals(0, kr.getStartIndex());
1020 assertEquals(10, kr.getItemsPerPage());
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001021
Nils Diewaldafab8f32015-01-26 19:11:32 +00001022 json = getString(
1023 getClass().
1024 getResource("/queries/metaquery8-filtered-further.jsonld").
1025 getFile()
1026 );
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001027
Nils Diewaldafab8f32015-01-26 19:11:32 +00001028 ks = new KorapSearch(json);
1029 kr = ks.run(ki);
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001030
Nils Diewaldafab8f32015-01-26 19:11:32 +00001031 assertEquals(kr.getTotalResults(), 0);
1032 assertEquals(0, kr.getStartIndex());
1033 assertEquals(10, kr.getItemsPerPage());
1034
1035 json = getString(
1036 getClass().
1037 getResource("/queries/metaquery8-filtered-nested.jsonld").
1038 getFile()
1039 );
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001040
Nils Diewaldafab8f32015-01-26 19:11:32 +00001041 ks = new KorapSearch(json);
1042 kr = ks.run(ki);
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001043
Nils Diewaldafab8f32015-01-26 19:11:32 +00001044 assertEquals("filter with QueryWrapperFilter("+
1045 "+(ID:WPD_AAA.00003 (+tokens:s:die"+
1046 " +tokens:s:Schriftzeichen)))",
1047 ks.getCollection().getFilter(1).toString());
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001048
Nils Diewaldafab8f32015-01-26 19:11:32 +00001049 assertEquals(kr.getTotalResults(), 119);
1050 assertEquals(0, kr.getStartIndex());
1051 assertEquals(10, kr.getItemsPerPage());
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001052 };
1053
Nils Diewald1e5d5942014-05-20 13:29:53 +00001054
1055 @Test
1056 public void searchJSONSentenceContext () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +00001057 // Construct index
1058 KorapIndex ki = new KorapIndex();
1059 // Indexing test files
1060 for (String i : new String[] {"00001",
1061 "00002",
1062 "00003",
1063 "00004",
1064 "00005",
1065 "00006",
1066 "02439"}) {
1067 ki.addDocFile(
1068 getClass().
1069 getResource("/wiki/" + i + ".json.gz").
1070 getFile(),
1071 true
Nils Diewald1e5d5942014-05-20 13:29:53 +00001072 );
Nils Diewaldafab8f32015-01-26 19:11:32 +00001073 };
1074 ki.commit();
Nils Diewald1e5d5942014-05-20 13:29:53 +00001075
Nils Diewaldafab8f32015-01-26 19:11:32 +00001076 String json = getString(
1077 getClass().
1078 getResource("/queries/bsp-context-2.jsonld").
1079 getFile()
1080 );
Nils Diewald1e5d5942014-05-20 13:29:53 +00001081
Nils Diewaldafab8f32015-01-26 19:11:32 +00001082 KorapSearch ks = new KorapSearch(json);
1083 ks.setCutOff(false);
1084 SearchContext sc = ks.getContext();
1085 sc.left.setLength((short) 10);
1086 sc.right.setLength((short) 10);
1087
1088 KorapResult kr = ks.run(ki);
1089 assertEquals(
1090 kr.getMatch(1).getSnippetBrackets(),
1091 "... dezimalen [Wert] 65 sowohl ..."
1092 );
1093 assertEquals(kr.getTotalResults(), 3);
1094 assertEquals(0, kr.getStartIndex());
1095 assertEquals(25, kr.getItemsPerPage());
1096 assertFalse(kr.getContext().toJsonNode().toString().equals("\"s\""));
Nils Diewald1e5d5942014-05-20 13:29:53 +00001097
Nils Diewaldafab8f32015-01-26 19:11:32 +00001098 json = getString(
1099 getClass().
1100 getResource("/queries/bsp-context-sentence.jsonld").
1101 getFile()
1102 );
Nils Diewald1e5d5942014-05-20 13:29:53 +00001103
Nils Diewaldafab8f32015-01-26 19:11:32 +00001104 kr = new KorapSearch(json).run(ki);
1105 assertEquals(
1106 kr.getMatch(0).getSnippetBrackets(),
1107 "steht a für den dezimalen [Wert] 97 sowohl im ASCII-"+
1108 " als auch im Unicode-Zeichensatz"
1109 );
1110 assertEquals(
1111 kr.getMatch(1).getSnippetBrackets(),
1112 "steht A für den dezimalen [Wert] 65 sowohl im ASCII-"+
1113 " als auch im Unicode-Zeichensatz"
1114 );
1115 assertEquals(
1116 kr.getMatch(2).getSnippetBrackets(),
1117 "In einem Zahlensystem mit einer Basis größer "+
1118 "als 10 steht A oder a häufig für den dezimalen"+
1119 " [Wert] 10, siehe auch Hexadezimalsystem."
1120 );
Nils Diewald1e5d5942014-05-20 13:29:53 +00001121
Nils Diewaldafab8f32015-01-26 19:11:32 +00001122 assertEquals(kr.getContext().toJsonNode().toString(), "\"s\"");
Nils Diewald1e5d5942014-05-20 13:29:53 +00001123 };
1124
1125
Nils Diewald2276e1c2014-04-10 15:01:59 +00001126 @Test
Nils Diewald54187632014-06-11 14:39:29 +00001127 public void searchJSONbug () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +00001128 // Construct index
1129 KorapIndex ki = new KorapIndex();
1130 // Indexing test files
1131 for (String i : new String[] {"00001",
1132 "00002",
1133 "00003",
1134 "00004",
1135 "00005",
1136 "00006",
1137 "02439"}) {
1138 ki.addDocFile(
1139 getClass().
1140 getResource("/wiki/" + i + ".json.gz").
1141 getFile(),
1142 true
Nils Diewald54187632014-06-11 14:39:29 +00001143 );
Nils Diewaldafab8f32015-01-26 19:11:32 +00001144 };
1145 ki.commit();
Nils Diewald54187632014-06-11 14:39:29 +00001146
Nils Diewaldafab8f32015-01-26 19:11:32 +00001147 String json = getString(
1148 getClass().
1149 getResource("/queries/bsp-bug.jsonld").
1150 getFile()
1151 );
Nils Diewald54187632014-06-11 14:39:29 +00001152
Nils Diewaldafab8f32015-01-26 19:11:32 +00001153 KorapResult kr = new KorapSearch(json).run(ki);
Nils Diewaldc471b182014-11-19 22:51:15 +00001154
Nils Diewaldafab8f32015-01-26 19:11:32 +00001155 assertEquals(
1156 kr.getError(0).getMessage(),
Nils Diewald93d6d1b2015-02-02 21:47:43 +00001157 "Operation needs operand list"
Nils Diewaldafab8f32015-01-26 19:11:32 +00001158 );
Nils Diewald54187632014-06-11 14:39:29 +00001159 };
1160
Nils Diewaldafab8f32015-01-26 19:11:32 +00001161
Nils Diewaldef7124e2014-11-12 20:08:13 +00001162 /**
1163 * This is a breaking test for #179
1164 */
1165 @Test
1166 public void searchJSONexpansionBug () throws IOException {
Eliza Margaretha8e200cd2014-11-13 16:00:38 +00001167 // Construct index
1168 KorapIndex ki = new KorapIndex();
1169 // Indexing test files
1170 ki.addDocFile(
Nils Diewaldafab8f32015-01-26 19:11:32 +00001171 getClass().
1172 getResource("/wiki/00002.json.gz").
1173 getFile(),
1174 true
1175 );
Eliza Margaretha8e200cd2014-11-13 16:00:38 +00001176 ki.commit();
1177
1178 // Expansion bug
1179 // der alte Digraph Aa durch Ã…
1180 String json = getString(
Nils Diewaldafab8f32015-01-26 19:11:32 +00001181 getClass().
1182 getResource("/queries/bugs/expansion_bug_2.jsonld").
1183 getFile()
1184 );
Eliza Margaretha8e200cd2014-11-13 16:00:38 +00001185
1186 KorapResult kr = new KorapSearch(json).run(ki);
1187 assertEquals("... Buchstabe des Alphabetes. In Dänemark ist " +
Nils Diewaldafab8f32015-01-26 19:11:32 +00001188 "[der alte Digraph Aa durch Ã…] ersetzt worden, " +
1189 "in Eigennamen und Ortsnamen ...",
1190 kr.getMatch(0).getSnippetBrackets());
Eliza Margaretha8e200cd2014-11-13 16:00:38 +00001191 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
Nils Diewalde1ecd5e2014-11-27 02:17:24 +00001192 assertEquals(kr.getTotalResults(), 1);
Nils Diewaldafab8f32015-01-26 19:11:32 +00001193
Eliza Margaretha8e200cd2014-11-13 16:00:38 +00001194 // der alte Digraph Aa durch []
1195 // Works with one document
1196 json = getString(
Nils Diewaldafab8f32015-01-26 19:11:32 +00001197 getClass().
1198 getResource("/queries/bugs/expansion_bug.jsonld").
1199 getFile()
1200 );
Eliza Margaretha8e200cd2014-11-13 16:00:38 +00001201
1202 kr = new KorapSearch(json).run(ki);
1203
1204 assertEquals("... Buchstabe des Alphabetes. In Dänemark ist " +
Nils Diewaldafab8f32015-01-26 19:11:32 +00001205 "[der alte Digraph Aa durch Ã…] ersetzt worden, " +
1206 "in Eigennamen und Ortsnamen ...",
1207 kr.getMatch(0).getSnippetBrackets());
Eliza Margaretha8e200cd2014-11-13 16:00:38 +00001208 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
Nils Diewalde1ecd5e2014-11-27 02:17:24 +00001209 assertEquals(kr.getTotalResults(), 1);
Eliza Margaretha8e200cd2014-11-13 16:00:38 +00001210
1211 // Now try with one file ahead
1212 ki = new KorapIndex();
1213 for (String i : new String[] {"00001",
Nils Diewaldafab8f32015-01-26 19:11:32 +00001214 "00002"}) {
Eliza Margaretha8e200cd2014-11-13 16:00:38 +00001215 ki.addDocFile(
Nils Diewaldafab8f32015-01-26 19:11:32 +00001216 getClass().
1217 getResource("/wiki/" + i + ".json.gz").
1218 getFile(),
1219 true
1220 );
Eliza Margaretha8e200cd2014-11-13 16:00:38 +00001221 };
1222 ki.commit();
1223
1224 // Expansion bug
1225 // der alte Digraph Aa durch Ã…
1226 json = getString(
Nils Diewaldafab8f32015-01-26 19:11:32 +00001227 getClass().
1228 getResource("/queries/bugs/expansion_bug_2.jsonld").
1229 getFile()
1230 );
Eliza Margaretha8e200cd2014-11-13 16:00:38 +00001231
1232 kr = new KorapSearch(json).run(ki);
1233
1234 assertEquals("... Buchstabe des Alphabetes. In Dänemark ist " +
Nils Diewaldafab8f32015-01-26 19:11:32 +00001235 "[der alte Digraph Aa durch Ã…] ersetzt worden, " +
1236 "in Eigennamen und Ortsnamen ...",
1237 kr.getMatch(0).getSnippetBrackets());
Eliza Margaretha8e200cd2014-11-13 16:00:38 +00001238 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
Nils Diewalde1ecd5e2014-11-27 02:17:24 +00001239 assertEquals(kr.getTotalResults(), 1);
Eliza Margaretha8e200cd2014-11-13 16:00:38 +00001240
1241 // der alte Digraph Aa durch []
1242 json = getString(
Nils Diewaldafab8f32015-01-26 19:11:32 +00001243 getClass().
1244 getResource("/queries/bugs/expansion_bug.jsonld").
1245 getFile()
1246 );
Eliza Margaretha8e200cd2014-11-13 16:00:38 +00001247
Nils Diewaldc471b182014-11-19 22:51:15 +00001248 kr = new KorapSearch(json).run(ki);
Eliza Margaretha8e200cd2014-11-13 16:00:38 +00001249 assertEquals("... Buchstabe des Alphabetes. In Dänemark ist " +
Nils Diewaldafab8f32015-01-26 19:11:32 +00001250 "[der alte Digraph Aa durch Ã…] ersetzt worden, " +
1251 "in Eigennamen und Ortsnamen ...",
1252 kr.getMatch(0).getSnippetBrackets());
Eliza Margaretha8e200cd2014-11-13 16:00:38 +00001253 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
Nils Diewalde1ecd5e2014-11-27 02:17:24 +00001254 assertEquals(kr.getTotalResults(), 1);
Nils Diewaldef7124e2014-11-12 20:08:13 +00001255 };
1256
1257
Nils Diewald979b2fe2014-09-29 16:21:41 +00001258 /*
1259 This test will crash soon - it's just here for nostalgic reasons!
Nils Diewaldafab8f32015-01-26 19:11:32 +00001260 */
Nils Diewald54187632014-06-11 14:39:29 +00001261 @Test
Nils Diewald2276e1c2014-04-10 15:01:59 +00001262 public void getFoundryDistribution () throws Exception {
Nils Diewaldafab8f32015-01-26 19:11:32 +00001263 // Construct index
1264 KorapIndex ki = new KorapIndex();
1265 // Indexing test files
1266 for (String i : new String[] {"00001",
1267 "00002",
1268 "00003",
1269 "00004",
1270 "00005",
1271 "00006",
1272 "02439"}) {
1273 ki.addDocFile(
1274 getClass().
1275 getResource("/wiki/" + i + ".json.gz").
1276 getFile(),
1277 true
Nils Diewald2276e1c2014-04-10 15:01:59 +00001278 );
Nils Diewaldafab8f32015-01-26 19:11:32 +00001279 };
1280 ki.commit();
Nils Diewald2276e1c2014-04-10 15:01:59 +00001281
Nils Diewaldafab8f32015-01-26 19:11:32 +00001282 KorapCollection kc = new KorapCollection(ki);
Nils Diewald2276e1c2014-04-10 15:01:59 +00001283
Nils Diewaldafab8f32015-01-26 19:11:32 +00001284 assertEquals(7, kc.numberOf("documents"));
Nils Diewald2276e1c2014-04-10 15:01:59 +00001285
1286 HashMap map = kc.getTermRelation("foundries");
Nils Diewaldafab8f32015-01-26 19:11:32 +00001287 assertEquals((long) 7, map.get("-docs"));
1288 assertEquals((long) 7, map.get("treetagger"));
1289 assertEquals((long) 6, map.get("opennlp/morpho"));
1290 assertEquals((long) 6, map.get("#__opennlp/morpho:###:treetagger"));
1291 assertEquals((long) 7, map.get("#__opennlp:###:treetagger"));
Nils Diewald2276e1c2014-04-10 15:01:59 +00001292 };
1293
Nils Diewaldafab8f32015-01-26 19:11:32 +00001294
Nils Diewald2276e1c2014-04-10 15:01:59 +00001295 @Test
1296 public void getTextClassDistribution () throws Exception {
Nils Diewaldafab8f32015-01-26 19:11:32 +00001297 KorapIndex ki = new KorapIndex();
1298 ki.addDoc(
Nils Diewald2276e1c2014-04-10 15:01:59 +00001299"{" +
1300" \"fields\" : [" +
1301" { \"primaryData\" : \"abc\" },{" +
1302" \"name\" : \"tokens\"," +
1303" \"data\" : [" +
1304" [ \"s:a\", \"i:a\", \"_0#0-1\", \"-:t$<i>3\"]," +
1305" [ \"s:b\", \"i:b\", \"_1#1-2\" ]," +
1306" [ \"s:c\", \"i:c\", \"_2#2-3\" ]]}]," +
1307" \"textClass\" : \"music entertainment\"" +
Nils Diewaldafab8f32015-01-26 19:11:32 +00001308"}"
1309 );
Nils Diewald2276e1c2014-04-10 15:01:59 +00001310
Nils Diewaldafab8f32015-01-26 19:11:32 +00001311 ki.addDoc(
Nils Diewald2276e1c2014-04-10 15:01:59 +00001312"{" +
1313" \"fields\" : [" +
1314" { \"primaryData\" : \"abc\" },{" +
1315" \"name\" : \"tokens\"," +
1316" \"data\" : [" +
1317" [ \"s:a\", \"i:a\", \"_0#0-1\", \"-:t$<i>3\"]," +
1318" [ \"s:b\", \"i:b\", \"_1#1-2\" ]," +
1319" [ \"s:c\", \"i:c\", \"_2#2-3\" ]]}]," +
1320" \"textClass\" : \"music singing\"" +
Nils Diewaldafab8f32015-01-26 19:11:32 +00001321"}"
1322 );
Nils Diewald2276e1c2014-04-10 15:01:59 +00001323
Nils Diewaldafab8f32015-01-26 19:11:32 +00001324 ki.addDoc(
Nils Diewald2276e1c2014-04-10 15:01:59 +00001325"{" +
1326" \"fields\" : [" +
1327" { \"primaryData\" : \"abc\" },{" +
1328" \"name\" : \"tokens\"," +
1329" \"data\" : [" +
1330" [ \"s:a\", \"i:a\", \"_0#0-1\", \"-:t$<i>3\"]," +
1331" [ \"s:b\", \"i:b\", \"_1#1-2\" ]," +
1332" [ \"s:c\", \"i:c\", \"_2#2-3\" ]]}]," +
1333" \"textClass\" : \"music entertainment jumping\"" +
Nils Diewaldafab8f32015-01-26 19:11:32 +00001334"}"
1335 );
1336 ki.commit();
Nils Diewald2276e1c2014-04-10 15:01:59 +00001337
Nils Diewaldafab8f32015-01-26 19:11:32 +00001338 KorapCollection kc = new KorapCollection(ki);
1339 assertEquals(3, kc.numberOf("documents"));
Nils Diewald2276e1c2014-04-10 15:01:59 +00001340
1341 HashMap map = kc.getTermRelation("textClass");
Nils Diewaldafab8f32015-01-26 19:11:32 +00001342 assertEquals((long) 1, map.get("singing"));
1343 assertEquals((long) 1, map.get("jumping"));
1344 assertEquals((long) 3, map.get("music"));
1345 assertEquals((long) 2, map.get("entertainment"));
1346 assertEquals((long) 3, map.get("-docs"));
1347 assertEquals((long) 2, map.get("#__entertainment:###:music"));
1348 assertEquals((long) 1, map.get("#__entertainment:###:jumping"));
1349 assertEquals((long) 0, map.get("#__entertainment:###:singing"));
1350 assertEquals((long) 0, map.get("#__jumping:###:singing"));
1351 assertEquals((long) 1, map.get("#__jumping:###:music"));
1352 assertEquals((long) 1, map.get("#__music:###:singing"));
1353 assertEquals(11, map.size());
1354
1355 // System.err.println(kc.getTermRelationJSON("textClass"));
Nils Diewald2276e1c2014-04-10 15:01:59 +00001356 };
1357
Nils Diewaldafab8f32015-01-26 19:11:32 +00001358
Nils Diewald2276e1c2014-04-10 15:01:59 +00001359 @Test
1360 public void getTextClassDistribution2 () throws Exception {
Nils Diewaldafab8f32015-01-26 19:11:32 +00001361 KorapIndex ki = new KorapIndex();
1362 ki.addDoc(
Nils Diewald2276e1c2014-04-10 15:01:59 +00001363"{" +
1364" \"fields\" : [" +
1365" { \"primaryData\" : \"abc\" },{" +
1366" \"name\" : \"tokens\"," +
1367" \"data\" : [" +
1368" [ \"s:a\", \"i:a\", \"_0#0-1\", \"-:t$<i>3\"]," +
1369" [ \"s:b\", \"i:b\", \"_1#1-2\" ]," +
1370" [ \"s:c\", \"i:c\", \"_2#2-3\" ]]}]," +
1371" \"textClass\" : \"\"" +
Nils Diewaldafab8f32015-01-26 19:11:32 +00001372"}"
1373 );
1374 ki.commit();
1375 ki.addDoc(
Nils Diewald2276e1c2014-04-10 15:01:59 +00001376"{" +
1377" \"fields\" : [" +
1378" { \"primaryData\" : \"abc\" },{" +
1379" \"name\" : \"tokens\"," +
1380" \"data\" : [" +
1381" [ \"s:a\", \"i:a\", \"_0#0-1\", \"-:t$<i>3\"]," +
1382" [ \"s:b\", \"i:b\", \"_1#1-2\" ]," +
1383" [ \"s:c\", \"i:c\", \"_2#2-3\" ]]}]," +
1384" \"textClass\" : \"music entertainment\"" +
Nils Diewaldafab8f32015-01-26 19:11:32 +00001385"}"
1386 );
Nils Diewald2276e1c2014-04-10 15:01:59 +00001387
Nils Diewaldafab8f32015-01-26 19:11:32 +00001388 ki.commit();
1389 ki.addDoc(
Nils Diewald2276e1c2014-04-10 15:01:59 +00001390"{" +
1391" \"fields\" : [" +
1392" { \"primaryData\" : \"abc\" },{" +
1393" \"name\" : \"tokens\"," +
1394" \"data\" : [" +
1395" [ \"s:a\", \"i:a\", \"_0#0-1\", \"-:t$<i>3\"]," +
1396" [ \"s:b\", \"i:b\", \"_1#1-2\" ]," +
1397" [ \"s:c\", \"i:c\", \"_2#2-3\" ]]}]," +
1398" \"textClass\" : \"music singing\"" +
Nils Diewaldafab8f32015-01-26 19:11:32 +00001399"}"
1400 );
Nils Diewald2276e1c2014-04-10 15:01:59 +00001401
Nils Diewaldafab8f32015-01-26 19:11:32 +00001402 ki.addDoc(
Nils Diewald2276e1c2014-04-10 15:01:59 +00001403"{" +
1404" \"fields\" : [" +
1405" { \"primaryData\" : \"abc\" },{" +
1406" \"name\" : \"tokens\"," +
1407" \"data\" : [" +
1408" [ \"s:a\", \"i:a\", \"_0#0-1\", \"-:t$<i>3\"]," +
1409" [ \"s:b\", \"i:b\", \"_1#1-2\" ]," +
1410" [ \"s:c\", \"i:c\", \"_2#2-3\" ]]}]," +
1411" \"textClass\" : \"music entertainment jumping\"" +
Nils Diewaldafab8f32015-01-26 19:11:32 +00001412"}"
1413 );
1414 ki.commit();
Nils Diewald2276e1c2014-04-10 15:01:59 +00001415
Nils Diewaldafab8f32015-01-26 19:11:32 +00001416 KorapCollection kc = new KorapCollection(ki);
1417 assertEquals(4, kc.numberOf("documents"));
Nils Diewald2276e1c2014-04-10 15:01:59 +00001418
1419 HashMap map = kc.getTermRelation("textClass");
Nils Diewaldafab8f32015-01-26 19:11:32 +00001420 assertEquals((long) 1, map.get("singing"));
1421 assertEquals((long) 1, map.get("jumping"));
1422 assertEquals((long) 3, map.get("music"));
1423 assertEquals((long) 2, map.get("entertainment"));
1424 assertEquals((long) 4, map.get("-docs"));
1425 assertEquals((long) 2, map.get("#__entertainment:###:music"));
1426 assertEquals((long) 1, map.get("#__entertainment:###:jumping"));
1427 assertEquals((long) 0, map.get("#__entertainment:###:singing"));
1428 assertEquals((long) 0, map.get("#__jumping:###:singing"));
1429 assertEquals((long) 1, map.get("#__jumping:###:music"));
1430 assertEquals((long) 1, map.get("#__music:###:singing"));
1431 assertEquals(11, map.size());
Nils Diewald2276e1c2014-04-10 15:01:59 +00001432 };
Nils Diewaldc925b492013-12-03 23:56:10 +00001433};