blob: d7f30a4d3ac2a1e8a82e90cfd4f7775794b76016 [file] [log] [blame]
Eliza Margaretha6a780692014-01-15 09:45:42 +00001package de.ids_mannheim.korap.search;
2
Nils Diewaldc925b492013-12-03 23:56:10 +00003import java.util.*;
4import java.io.*;
5
Nils Diewald56dc2582014-11-04 21:33:46 +00006import static de.ids_mannheim.korap.TestSimple.*;
7
Nils Diewaldc925b492013-12-03 23:56:10 +00008import de.ids_mannheim.korap.KorapSearch;
Nils Diewald2276e1c2014-04-10 15:01:59 +00009import de.ids_mannheim.korap.KorapCollection;
Nils Diewaldc925b492013-12-03 23:56:10 +000010import de.ids_mannheim.korap.KorapQuery;
11import de.ids_mannheim.korap.KorapIndex;
Nils Diewald2276e1c2014-04-10 15:01:59 +000012import de.ids_mannheim.korap.index.FieldDocument;
Nils Diewald1e5d5942014-05-20 13:29:53 +000013import de.ids_mannheim.korap.index.SearchContext;
Nils Diewaldc925b492013-12-03 23:56:10 +000014import de.ids_mannheim.korap.KorapFilter;
15import de.ids_mannheim.korap.KorapResult;
16import java.nio.file.Files;
17import java.nio.file.FileSystem;
18import java.nio.file.Path;
19import java.nio.charset.StandardCharsets;
20import java.nio.ByteBuffer;
21
Nils Diewald277e9ce2014-11-06 03:42:11 +000022import com.fasterxml.jackson.databind.ObjectMapper;
23import com.fasterxml.jackson.databind.JsonNode;
24
Nils Diewaldc925b492013-12-03 23:56:10 +000025import static org.junit.Assert.*;
26import org.junit.Test;
27import org.junit.Ignore;
28import org.junit.runner.RunWith;
29import org.junit.runners.JUnit4;
30
31@RunWith(JUnit4.class)
32public class TestKorapSearch {
33 @Test
34 public void searchCount () {
35 KorapSearch ks = new KorapSearch(
36 new KorapQuery("field1").seg("a").with("b")
37 );
38 // Count:
39 ks.setCount(30);
40 assertEquals(ks.getCount(), 30);
41 ks.setCount(20);
42 assertEquals(ks.getCount(), 20);
43 ks.setCount(-50);
44 assertEquals(ks.getCount(), 20);
45 ks.setCount(500);
46 assertEquals(ks.getCount(), ks.getCountMax());
47 };
48
49 @Test
50 public void searchStartIndex () {
51 KorapSearch ks = new KorapSearch(
52 new KorapQuery("field1").seg("a").with("b")
53 );
54 // startIndex
55 ks.setStartIndex(5);
56 assertEquals(ks.getStartIndex(), 5);
57 ks.setStartIndex(1);
58 assertEquals(ks.getStartIndex(), 1);
59 ks.setStartIndex(0);
60 assertEquals(ks.getStartIndex(), 0);
61 ks.setStartIndex(70);
62 assertEquals(ks.getStartIndex(), 70);
63 ks.setStartIndex(-5);
64 assertEquals(ks.getStartIndex(), 0);
65 };
66
67 @Test
68 public void searchQuery () {
69 KorapSearch ks = new KorapSearch(
70 new KorapQuery("field1").seg("a").with("b")
71 );
72 // query
Nils Diewald97b66382014-02-11 00:32:23 +000073 assertEquals(ks.getQuery().toString(), "spanSegment(field1:a, field1:b)");
Nils Diewaldc925b492013-12-03 23:56:10 +000074 };
75
76 @Test
77 public void searchIndex () throws IOException {
78
79 // Construct index
80 KorapIndex ki = new KorapIndex();
81 // Indexing test files
Nils Diewald979b2fe2014-09-29 16:21:41 +000082 for (String i : new String[] {"00001",
83 "00002",
84 "00003",
85 "00004",
86 "00005",
87 "00006",
88 "02439"}) {
Nils Diewaldc925b492013-12-03 23:56:10 +000089 ki.addDocFile(
90 getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
91 );
92 };
93 ki.commit();
94
95 KorapSearch ks = new KorapSearch(
96 new KorapQuery("tokens").seg("s:Buchstaben")
97 );
98 ks.getCollection().filter(
99 new KorapFilter().and("textClass", "reisen")
100 );
101 ks.setCount(3);
102 ks.setStartIndex(5);
Nils Diewald1e5d5942014-05-20 13:29:53 +0000103 ks.context.left.setLength(1);
104 ks.context.right.setLength(1);
Nils Diewaldc925b492013-12-03 23:56:10 +0000105 KorapResult kr = ks.run(ki);
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000106 assertEquals(kr.getTotalResults(), 6);
Nils Diewaldc925b492013-12-03 23:56:10 +0000107 assertEquals(kr.getMatch(0).getSnippetBrackets(), "... dem [Buchstaben] A ...");
108 };
Nils Diewaldc6b78752013-12-05 19:05:12 +0000109
110 @Test
111 public void searchJSON () throws IOException {
112
113 // Construct index
114 KorapIndex ki = new KorapIndex();
115 // Indexing test files
Nils Diewald979b2fe2014-09-29 16:21:41 +0000116 for (String i : new String[] {"00001",
117 "00002",
118 "00003",
119 "00004",
120 "00005",
121 "00006",
122 "02439"}) {
Nils Diewaldc6b78752013-12-05 19:05:12 +0000123 ki.addDocFile(
124 getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
125 );
126 };
127 ki.commit();
128
Nils Diewaldc86aa482014-02-12 16:58:05 +0000129 String json = getString(getClass().getResource("/queries/metaquery3.jsonld").getFile());
Nils Diewaldc6b78752013-12-05 19:05:12 +0000130
Nils Diewaldc86aa482014-02-12 16:58:05 +0000131 KorapSearch ks = new KorapSearch(json);
Eliza Margaretha6a780692014-01-15 09:45:42 +0000132
Nils Diewaldc86aa482014-02-12 16:58:05 +0000133 KorapResult kr = ks.run(ki);
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000134 assertEquals(kr.getTotalResults(), 66);
Nils Diewaldc6b78752013-12-05 19:05:12 +0000135 assertEquals(5, kr.getItemsPerPage());
136 assertEquals(5, kr.getStartIndex());
137 assertEquals("... a: A ist [der klangreichste] der V ...", kr.getMatch(0).getSnippetBrackets());
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000138 };
Nils Diewald01b4ce32013-12-05 22:39:25 +0000139
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000140 @Test
141 public void searchJSON2 () throws IOException {
Nils Diewald01b4ce32013-12-05 22:39:25 +0000142
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000143 // Construct index
144 KorapIndex ki = new KorapIndex();
145 // Indexing test files
Nils Diewald979b2fe2014-09-29 16:21:41 +0000146 for (String i : new String[] {"00001",
147 "00002",
148 "00003",
149 "00004",
150 "00005",
151 "00006",
152 "02439",
153 "00012-fakemeta",
154 "00030-fakemeta",
155 /*
156 "02035-substring",
157 "05663-unbalanced",
158 "07452-deep"
159 */
160 }) {
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000161 ki.addDocFile(
162 getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
163 );
164 };
165 ki.commit();
166
Nils Diewaldc86aa482014-02-12 16:58:05 +0000167 String json = getString(getClass().getResource("/queries/metaquery4.jsonld").getFile());
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000168
169 KorapSearch ks = new KorapSearch(json);
170 KorapResult kr = ks.run(ki);
Nils Diewaldc86aa482014-02-12 16:58:05 +0000171
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000172 assertEquals(kr.getTotalResults(), 1);
Nils Diewald979b2fe2014-09-29 16:21:41 +0000173
174 ks = new KorapSearch(json);
175 // Ignore the collection part of the query!
176 ks.setCollection(new KorapCollection());
177 kr = ks.run(ki);
178
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000179 assertEquals(kr.getTotalResults(), 5);
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000180
Nils Diewaldc86aa482014-02-12 16:58:05 +0000181 json = getString(getClass().getResource("/queries/metaquery5.jsonld").getFile());
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000182 ks = new KorapSearch(json);
183 kr = ks.run(ki);
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000184 assertEquals(kr.getTotalResults(), 1);
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000185
Nils Diewaldc86aa482014-02-12 16:58:05 +0000186 json = getString(getClass().getResource("/queries/metaquery6.jsonld").getFile());
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000187 ks = new KorapSearch(json);
188 kr = ks.run(ki);
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000189 assertEquals(kr.getTotalResults(), 1);
Nils Diewaldc6b78752013-12-05 19:05:12 +0000190 };
191
192
193 @Test
194 public void searchJSONFailure () throws IOException {
195
196 // Construct index
197 KorapIndex ki = new KorapIndex();
198 // Indexing test files
Nils Diewald979b2fe2014-09-29 16:21:41 +0000199 for (String i : new String[] {"00001",
200 "00002",
201 "00003",
202 "00004",
203 "00005",
204 "00006",
205 "02439"
206 }) {
Nils Diewaldc6b78752013-12-05 19:05:12 +0000207 ki.addDocFile(
208 getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
209 );
210 };
211 ki.commit();
212
213 KorapResult kr = new KorapSearch("{ query").run(ki);
214
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000215 assertEquals(kr.getTotalResults(), 0);
Nils Diewaldc471b182014-11-19 22:51:15 +0000216 assertEquals(kr.getError(0).getMessage(), "Unable to parse JSON");
Nils Diewaldc6b78752013-12-05 19:05:12 +0000217 };
218
219
220
Nils Diewald9f310832013-12-06 22:38:55 +0000221 @Test
222 public void searchJSONindexboundary () throws IOException {
223
224 // Construct index
225 KorapIndex ki = new KorapIndex();
226 // Indexing test files
Nils Diewald979b2fe2014-09-29 16:21:41 +0000227 for (String i : new String[] {"00001",
228 "00002",
229 "00003",
230 "00004",
231 "00005",
232 "00006",
233 "02439"}) {
Nils Diewald9f310832013-12-06 22:38:55 +0000234 ki.addDocFile(
235 getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
236 );
237 };
238 ki.commit();
239
Nils Diewaldc86aa482014-02-12 16:58:05 +0000240 String json = getString(getClass().getResource("/queries/bsp-fail1.jsonld").getFile());
Nils Diewald9f310832013-12-06 22:38:55 +0000241
242 KorapResult kr = new KorapSearch(json).run(ki);
243 assertEquals(0, kr.getStartIndex());
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000244 assertEquals(kr.getTotalResults(), 0);
Nils Diewald9f310832013-12-06 22:38:55 +0000245 assertEquals(25, kr.getItemsPerPage());
246 };
247
248 @Test
249 public void searchJSONindexboundary2 () throws IOException {
250
251 // Construct index
252 KorapIndex ki = new KorapIndex();
253 // Indexing test files
Nils Diewald979b2fe2014-09-29 16:21:41 +0000254 for (String i : new String[] {"00001",
255 "00002",
256 "00003",
257 "00004",
258 "00005",
259 "00006",
260 "02439"}) {
Nils Diewald9f310832013-12-06 22:38:55 +0000261 ki.addDocFile(
262 getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
263 );
264 };
265 ki.commit();
266
Nils Diewaldc86aa482014-02-12 16:58:05 +0000267 String json = getString(getClass().getResource("/queries/bsp-fail2.jsonld").getFile());
Nils Diewald9f310832013-12-06 22:38:55 +0000268
269 KorapResult kr = new KorapSearch(json).run(ki);
270 assertEquals(50, kr.getItemsPerPage());
271 assertEquals(49950, kr.getStartIndex());
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000272 assertEquals(kr.getTotalResults(), 0);
Nils Diewald9f310832013-12-06 22:38:55 +0000273 };
274
Nils Diewaldc6b78752013-12-05 19:05:12 +0000275
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000276 @Test
277 public void searchJSONcontext () throws IOException {
278
279 // Construct index
280 KorapIndex ki = new KorapIndex();
281 // Indexing test files
Nils Diewald979b2fe2014-09-29 16:21:41 +0000282 for (String i : new String[] {"00001",
283 "00002",
284 "00003",
285 "00004",
286 "00005",
287 "00006",
288 "02439"}) {
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000289 ki.addDocFile(
290 getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
291 );
292 };
293 ki.commit();
294
Nils Diewaldc86aa482014-02-12 16:58:05 +0000295 String json = getString(getClass().getResource("/queries/bsp-context.jsonld").getFile());
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000296
Nils Diewaldb3a09db2013-12-21 00:22:02 +0000297 KorapSearch ks = new KorapSearch(json);
298 KorapResult kr = ks.run(ki);
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000299 assertEquals(kr.getTotalResults(), 10);
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000300 assertEquals("A bzw. a ist der erste Buchstabe des lateinischen [Alphabets] und ein Vokal. Der Buchstabe A hat in deutschen Texten eine durchschnittliche Häufigkeit ...", kr.getMatch(0).getSnippetBrackets());
Nils Diewaldb3a09db2013-12-21 00:22:02 +0000301
302 ks.setCount(5);
303 ks.setStartPage(2);
304 kr = ks.run(ki);
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000305 assertEquals(kr.getTotalResults(), 10);
Nils Diewaldb3a09db2013-12-21 00:22:02 +0000306 assertEquals(5, kr.getStartIndex());
307 assertEquals(5, kr.getItemsPerPage());
Nils Diewald891c53c2013-12-23 16:37:46 +0000308
309
Nils Diewaldc86aa482014-02-12 16:58:05 +0000310 json = getString(getClass().getResource("/queries/bsp-context-2.jsonld").getFile());
Nils Diewald891c53c2013-12-23 16:37:46 +0000311
312 kr = new KorapSearch(json).run(ki);
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000313 assertEquals(kr.getTotalResults(), -1);
Nils Diewald891c53c2013-12-23 16:37:46 +0000314 assertEquals("... lls seit den Griechen beibehalten worden. 3. Bedeutungen in der Biologie steht A für das Nukleosid Adenosin steht A die Base Adenin steht A für die Aminosäure Alanin in der Informatik steht a für den dezimalen [Wert] 97 sowohl im ASCII- als auch im Unicode-Zeichensatz steht A für den dezimalen Wert 65 sowohl im ASCII- als auch im Unicode-Zeichensatz als Kfz-Kennzeichen steht A in Deutschland für Augsburg. in Österreich auf ...", kr.getMatch(0).getSnippetBrackets());
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000315 };
316
Nils Diewald364eb642013-12-22 15:03:01 +0000317 @Test
318 public void searchJSONstartPage () throws IOException {
319
320 // Construct index
321 KorapIndex ki = new KorapIndex();
322 // Indexing test files
Nils Diewald979b2fe2014-09-29 16:21:41 +0000323 for (String i : new String[] {"00001",
324 "00002",
325 "00003",
326 "00004",
327 "00005",
328 "00006",
329 "02439"}) {
Nils Diewald364eb642013-12-22 15:03:01 +0000330 ki.addDocFile(
331 getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
332 );
333 };
334 ki.commit();
335
Nils Diewaldc86aa482014-02-12 16:58:05 +0000336 String json = getString(getClass().getResource("/queries/bsp-paging.jsonld").getFile());
Nils Diewald364eb642013-12-22 15:03:01 +0000337
338 KorapSearch ks = new KorapSearch(json);
339 KorapResult kr = ks.run(ki);
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000340 assertEquals(kr.getTotalResults(), 10);
Nils Diewald364eb642013-12-22 15:03:01 +0000341 assertEquals(5, kr.getStartIndex());
342 assertEquals(5, kr.getItemsPerPage());
343
Nils Diewaldc86aa482014-02-12 16:58:05 +0000344 json = getString(getClass().getResource("/queries/bsp-cutoff.jsonld").getFile());
Nils Diewald364eb642013-12-22 15:03:01 +0000345 ks = ks = new KorapSearch(json);
346
347 kr = ks.run(ki);
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000348 assertEquals(kr.getTotalResults(), -1);
Nils Diewald364eb642013-12-22 15:03:01 +0000349 assertEquals(2, kr.getStartIndex());
350 assertEquals(2, kr.getItemsPerPage());
351
Nils Diewald50389b02014-04-11 16:27:52 +0000352
353 json = getString(getClass().getResource("/queries/metaquery9.jsonld").getFile());
354 KorapCollection kc = new KorapCollection(json);
355 kc.setIndex(ki);
356 assertEquals(7, kc.numberOf("documents"));
Nils Diewald364eb642013-12-22 15:03:01 +0000357 };
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000358
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000359 @Test
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000360 public void searchJSONitemsPerResource () throws IOException {
361
362 // Construct index
363 KorapIndex ki = new KorapIndex();
364 // Indexing test files
Nils Diewald979b2fe2014-09-29 16:21:41 +0000365 for (String i : new String[] {"00001",
366 "00002",
367 "00003",
368 "00004",
369 "00005",
370 "00006",
371 "02439"}) {
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000372 ki.addDocFile(
373 getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
374 );
375 };
376 ki.commit();
377
378 String json = getString(getClass().getResource("/queries/bsp-itemsPerResource.jsonld").getFile());
379
380 KorapSearch ks = new KorapSearch(json);
381 KorapResult kr = ks.run(ki);
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000382 assertEquals(kr.getTotalResults(), 10);
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000383 assertEquals(0, kr.getStartIndex());
384 assertEquals(20, kr.getItemsPerPage());
385
386 assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID());
387 assertEquals("WPD_AAA.00001", kr.getMatch(1).getDocID());
388 assertEquals("WPD_AAA.00001", kr.getMatch(6).getDocID());
389 assertEquals("WPD_AAA.00002", kr.getMatch(7).getDocID());
390 assertEquals("WPD_AAA.00002", kr.getMatch(8).getDocID());
391 assertEquals("WPD_AAA.00004", kr.getMatch(9).getDocID());
392
393 ks = new KorapSearch(json);
394 ks.setItemsPerResource(1);
395
396 kr = ks.run(ki);
397
398 assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID());
399 assertEquals("WPD_AAA.00002", kr.getMatch(1).getDocID());
400 assertEquals("WPD_AAA.00004", kr.getMatch(2).getDocID());
401
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000402 assertEquals(kr.getTotalResults(), 3);
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000403 assertEquals(0, kr.getStartIndex());
404 assertEquals(20, kr.getItemsPerPage());
405
406
407 ks = new KorapSearch(json);
408 ks.setItemsPerResource(2);
409
410 kr = ks.run(ki);
411
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000412 assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID());
413 assertEquals("WPD_AAA.00001", kr.getMatch(1).getDocID());
414 assertEquals("WPD_AAA.00002", kr.getMatch(2).getDocID());
415 assertEquals("WPD_AAA.00002", kr.getMatch(3).getDocID());
416 assertEquals("WPD_AAA.00004", kr.getMatch(4).getDocID());
417
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000418 assertEquals(kr.getTotalResults(), 5);
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000419 assertEquals(0, kr.getStartIndex());
420 assertEquals(20, kr.getItemsPerPage());
421
422
423 ks = new KorapSearch(json);
424 ks.setItemsPerResource(1);
425 ks.setStartIndex(1);
426 ks.setCount(1);
427
428 kr = ks.run(ki);
429
430 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
431
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000432 assertEquals(kr.getTotalResults(), 3);
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000433 assertEquals(1, kr.getStartIndex());
434 assertEquals(1, kr.getItemsPerPage());
435
436 assertEquals((short) 1, kr.getItemsPerResource());
437 };
438
Nils Diewaldd723d812014-09-23 18:50:52 +0000439 @Test
440 public void searchJSONitemsPerResourceServer () throws IOException {
441
442 /*
443 * This test is a server-only implementation of
444 * TestResource#testCollection
445 */
446
447
448 // Construct index
449 KorapIndex ki = new KorapIndex();
450 // Indexing test files
451 int uid = 1;
452 for (String i : new String[] {"00001",
453 "00002",
454 "00003",
455 "00004",
456 "00005",
457 "00006",
458 "02439"}) {
459 ki.addDocFile(
460 uid++,
461 getClass().getResource("/wiki/" + i + ".json.gz").getFile(),
462 true
463 );
464 };
465 ki.commit();
466
467 String json = getString(getClass().getResource("/queries/bsp-uid-example.jsonld").getFile());
468
469 KorapSearch ks = new KorapSearch(json);
470 ks.setItemsPerResource(1);
471 KorapCollection kc = new KorapCollection();
472 kc.filterUIDs(new String[]{"1", "4"});
473 kc.setIndex(ki);
474 ks.setCollection(kc);
475
476 KorapResult kr = ks.run(ki);
477
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000478 assertEquals(kr.getTotalResults(), 2);
Nils Diewaldd723d812014-09-23 18:50:52 +0000479 assertEquals(0, kr.getStartIndex());
480 assertEquals(25, kr.getItemsPerPage());
481 };
Nils Diewaldba197f22014-11-01 17:21:46 +0000482
483 @Test
484 public void searchJSONnewJSON () throws IOException {
485 // Construct index
486 KorapIndex ki = new KorapIndex();
487 // Indexing test files
488 FieldDocument fd = ki.addDocFile(
489 1,getClass().getResource("/goe/AGA-03828.json.gz").getFile(), true
490 );
491 ki.commit();
492
493 assertEquals(fd.getUID(), 1);
494 assertEquals(fd.getTextSigle(), "GOE_AGA.03828");
495 assertEquals(fd.getDocSigle(), "GOE_AGA");
496 assertEquals(fd.getCorpusSigle(), "GOE");
497 assertEquals(fd.getTitle() , "Autobiographische Einzelheiten");
498 assertNull(fd.getSubTitle());
499 assertEquals(fd.getTextType(), "Autobiographie");
500 assertNull(fd.getTextTypeArt());
501 assertNull(fd.getTextTypeRef());
502 assertNull(fd.getTextColumn());
503 assertNull(fd.getTextDomain());
504 assertEquals(fd.getPages(), "529-547");
505 assertEquals(fd.getLicense(), "QAO-NC");
506 assertEquals(fd.getCreationDate().toString(), "18200000");
507 assertEquals(fd.getPubDate().toString(), "19820000");
508 assertEquals(fd.getAuthor(), "Goethe, Johann Wolfgang von");
509 assertNull(fd.getTextClass());
510 assertEquals(fd.getLanguage(), "de");
511 assertEquals(fd.getPubPlace(), "München");
Nils Diewaldba197f22014-11-01 17:21:46 +0000512 assertEquals(fd.getReference(), "Goethe, Johann Wolfgang von: Autobiographische Einzelheiten, (Geschrieben bis 1832), In: Goethe, Johann Wolfgang von: Goethes Werke, Bd. 10, Autobiographische Schriften II, Hrsg.: Trunz, Erich. München: Verlag C. H. Beck, 1982, S. 529-547");
513 assertEquals(fd.getPublisher(), "Verlag C. H. Beck");
Nils Diewaldba197f22014-11-01 17:21:46 +0000514 assertNull(fd.getEditor());
515 assertNull(fd.getFileEditionStatement());
516 assertNull(fd.getBiblEditionStatement());
Nils Diewaldba197f22014-11-01 17:21:46 +0000517 assertNull(fd.getKeywords());
518
519 assertEquals(fd.getTokenSource(), "opennlp#tokens");
520 assertEquals(fd.getFoundries(), "base base/paragraphs base/sentences corenlp corenlp/constituency corenlp/morpho corenlp/namedentities corenlp/sentences glemm glemm/morpho mate mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences");
Nils Diewaldd66e2da2014-11-03 21:34:49 +0000521 assertEquals(fd.getLayerInfos(), "base/s=spans corenlp/c=spans corenlp/ne=tokens corenlp/p=tokens corenlp/s=spans glemm/l=tokens mate/l=tokens mate/m=tokens mate/p=tokens opennlp/p=tokens opennlp/s=spans tt/l=tokens tt/p=tokens tt/s=spans");
Nils Diewaldba197f22014-11-01 17:21:46 +0000522
Nils Diewald06368ba2014-11-03 20:53:27 +0000523
524 assertEquals(fd.getCorpusTitle(), "Goethes Werke");
525 assertNull(fd.getCorpusSubTitle());
526 assertEquals(fd.getCorpusAuthor(), "Goethe, Johann Wolfgang von");
527 assertEquals(fd.getCorpusEditor(), "Trunz, Erich");
528
529 assertEquals(fd.getDocTitle(), "Goethe: Autobiographische Schriften II, (1817-1825, 1832)");
530 assertNull(fd.getDocSubTitle());
531 assertNull(fd.getDocEditor());
532 assertNull(fd.getDocAuthor());
533
Nils Diewaldba197f22014-11-01 17:21:46 +0000534 KorapSearch ks = new KorapSearch(
535 new KorapQuery("tokens").seg("mate/m:case:nom").with("mate/m:number:pl")
536 );
537 KorapResult kr = ks.run(ki);
538
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000539 assertEquals(kr.getTotalResults(), 148);
Nils Diewaldba197f22014-11-01 17:21:46 +0000540 assertEquals(0, kr.getStartIndex());
541 assertEquals(25, kr.getItemsPerPage());
542 };
Nils Diewald06368ba2014-11-03 20:53:27 +0000543
544 @Test
545 public void searchJSONnewJSON2 () throws IOException {
546 // Construct index
547 KorapIndex ki = new KorapIndex();
548 // Indexing test files
549 FieldDocument fd = ki.addDocFile(
550 1,getClass().getResource("/bzk/D59-00089.json.gz").getFile(), true
551 );
552 ki.commit();
553
554 assertEquals(fd.getUID(), 1);
555 assertEquals(fd.getTextSigle(), "BZK_D59.00089");
556 assertEquals(fd.getDocSigle(), "BZK_D59");
557 assertEquals(fd.getCorpusSigle(), "BZK");
558 assertEquals(fd.getTitle() , "Saragat-Partei zerfällt");
559 assertEquals(fd.getPubDate().toString(), "19590219");
560
561 assertNull(fd.getSubTitle());
562 assertNull(fd.getAuthor());
563 assertNull(fd.getEditor());
564 assertEquals(fd.getPubPlace(), "Berlin");
565 assertNull(fd.getPublisher());
566 assertEquals(fd.getTextType(), "Zeitung: Tageszeitung");
567 assertNull(fd.getTextTypeArt());
568 assertEquals(fd.getTextTypeRef(), "Tageszeitung");
569 assertEquals(fd.getTextDomain(), "Politik");
570 assertEquals(fd.getCreationDate().toString(), "19590219");
571 assertEquals(fd.getLicense(), "ACA-NC-LC");
572 assertEquals(fd.getTextColumn(), "POLITIK");
573 assertNull(fd.getPages());
574 assertEquals(fd.getTextClass(), "politik ausland");
575 assertNull(fd.getFileEditionStatement());
576 assertNull(fd.getBiblEditionStatement());
577
578 assertEquals(fd.getLanguage(), "de");
579 assertEquals(fd.getReference(), "Neues Deutschland, [Tageszeitung], 19.02.1959, Jg. 14, Berliner Ausgabe, S. 7. - Sachgebiet: Politik, Originalressort: POLITIK; Saragat-Partei zerfällt");
580 assertNull(fd.getPublisher());
581 assertNull(fd.getKeywords());
582
583 assertEquals(fd.getTokenSource(), "opennlp#tokens");
584
585 assertEquals(fd.getFoundries(), "base base/paragraphs base/sentences corenlp corenlp/constituency corenlp/morpho corenlp/namedentities corenlp/sentences glemm glemm/morpho mate mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences");
586
587 assertEquals(fd.getLayerInfos(), "base/s=spans corenlp/c=spans corenlp/ne=tokens corenlp/p=tokens corenlp/s=spans glemm/l=tokens mate/l=tokens mate/m=tokens mate/p=tokens opennlp/p=tokens opennlp/s=spans tt/l=tokens tt/p=tokens tt/s=spans");
588
589 assertEquals(fd.getCorpusTitle(), "Bonner Zeitungskorpus");
590 assertNull(fd.getCorpusSubTitle());
591 assertNull(fd.getCorpusAuthor());
592 assertNull(fd.getCorpusEditor());
593
594 assertEquals(fd.getDocTitle(), "Neues Deutschland");
595 assertEquals(fd.getDocSubTitle(), "Organ des Zentralkomitees der Sozialistischen Einheitspartei Deutschlands");
596 assertNull(fd.getDocEditor());
597 assertNull(fd.getDocAuthor());
598
599 KorapSearch ks = new KorapSearch(
600 new KorapQuery("tokens").seg("mate/m:case:nom").with("mate/m:number:sg")
601 );
602 KorapResult kr = ks.run(ki);
603
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000604 assertEquals(kr.getTotalResults(), 6);
Nils Diewald06368ba2014-11-03 20:53:27 +0000605 assertEquals(0, kr.getStartIndex());
606 assertEquals(25, kr.getItemsPerPage());
607 };
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000608
/**
 * Regression test around class highlighting and element boundaries.
 * Runs a shrink/contains query with class 1 and with class 129, then
 * the query from {@code bugs/cosmas_boundary.jsonld}, checking the
 * serialized query and the bracketed snippet of the first match each
 * time. Per the expected snippets, class 1 appears as an inline
 * {@code {1:...}} marker while class 129 does not.
 *
 * @throws IOException declared by the test; see the calls below
 */
@Test
public void searchJSONcosmasBoundaryBug() throws IOException {
    // Construct index
    KorapIndex ki = new KorapIndex();
    ki.addDocFile(
        1, getClass().getResource("/bzk/D59-00089.json.gz").getFile(), true
    );
    ki.commit();

    String json = getString(
        getClass().getResource("/queries/bugs/cosmas_boundary.jsonld").getFile()
    );

    KorapQuery kq = new KorapQuery("tokens");

    KorapSearch ks = new KorapSearch(
        kq.shrink(1, kq.contains(kq.tag("base/s:s"), kq._(1, kq.seg("s:Leben"))))
    );

    KorapResult kr = ks.run(ki);
    assertEquals(
        "shrink(1: spanContain(<tokens:base/s:s />, {1: tokens:s:Leben}))",
        kr.getQuery()
    );
    assertEquals(
        "... Initiative\" eine neue politische Gruppierung ins " +
        "[{1:Leben}] gerufen hatten. Pressemeldungen zufolge haben sich ...",
        kr.getMatch(0).getSnippetBrackets()
    );

    // Try with high class - don't highlight
    ks = new KorapSearch(
        kq.shrink(129, kq.contains(kq.tag("base/s:s"), kq._(129, kq.seg("s:Leben"))))
    );

    kr = ks.run(ki);
    assertEquals(
        "shrink(129: spanContain(<tokens:base/s:s />, {129: tokens:s:Leben}))",
        kr.getQuery()
    );
    assertEquals(
        "... Initiative\" eine neue politische Gruppierung ins " +
        "[Leben] gerufen hatten. Pressemeldungen zufolge haben sich ...",
        kr.getMatch(0).getSnippetBrackets()
    );

    // Same scenario, this time with the query taken from the JSON file
    ks = new KorapSearch(json);
    kr = ks.run(ki);
    assertEquals(
        "shrink(129: spanElementDistance({129: tokens:s:Namen}, " +
        "{129: tokens:s:Leben}, [(base/s:s[0:1], notOrdered, notExcluded)]))",
        kr.getQuery()
    );
    assertEquals(
        "... ihren Austritt erklärt und unter dem [Namen \"Einheitsbewegung " +
        "der sozialistischen Initiative\" eine neue politische Gruppierung " +
        "ins Leben] gerufen hatten. Pressemeldungen zufolge haben sich ...",
        kr.getMatch(0).getSnippetBrackets()
    );

    assertEquals(1, kr.getTotalResults());
    assertEquals(0, kr.getStartIndex());
}
673
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000674 @Test
675 public void searchJSONmultipleClassesBug () throws IOException {
676 // Construct index
677 KorapIndex ki = new KorapIndex();
678 // Indexing test files
679 ki.addDocFile(
680 1,getClass().getResource("/bzk/D59-00089.json.gz").getFile(), true
681 );
682 ki.addDocFile(
683 2,getClass().getResource("/bzk/D59-00089.json.gz").getFile(), true
684 );
685
686 ki.commit();
687
688 String json = getString(
689 getClass().getResource("/queries/bugs/multiple_classes.jsonld").getFile()
690 );
691
692 KorapSearch ks = new KorapSearch(json);
693 KorapResult kr = ks.run(ki);
694 assertEquals(
695 kr.getQuery(),
696 "{4: spanNext({1: spanNext({2: tokens:s:ins}, {3: tokens:s:Leben})}, tokens:s:gerufen)}"
697 );
698 assertEquals(
699 kr.getMatch(0).getSnippetBrackets(),
700 "... sozialistischen Initiative\" eine neue politische Gruppierung " +
701 "[{4:{1:{2:ins} {3:Leben}} gerufen}] hatten. " +
702 "Pressemeldungen zufolge haben sich in ..."
703 );
704
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000705 assertEquals(kr.getTotalResults(), 2);
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000706 assertEquals(0, kr.getStartIndex());
707 };
708
Nils Diewald277e9ce2014-11-06 03:42:11 +0000709 @Test
710 public void searchJSONmultipleClassesBugTokenList () throws IOException {
711 // Construct index
712 KorapIndex ki = new KorapIndex();
713 // Indexing test files
714 ki.addDocFile(
715 1,getClass().getResource("/goe/AGA-03828.json.gz").getFile(), true
716 );
717 ki.addDocFile(
718 2,getClass().getResource("/bzk/D59-00089.json.gz").getFile(), true
719 );
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000720
Nils Diewald277e9ce2014-11-06 03:42:11 +0000721 ki.commit();
722
723 String json = getString(
724 getClass().getResource("/queries/bugs/multiple_classes.jsonld").getFile()
725 );
726
727 KorapSearch ks = new KorapSearch(json);
728 KorapResult kr = ks.run(ki);
729
730 ObjectMapper mapper = new ObjectMapper();
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000731 JsonNode res = mapper.readTree(kr.toTokenListJsonString());
Nils Diewald277e9ce2014-11-06 03:42:11 +0000732
733 assertEquals(1, res.at("/totalResults").asInt());
734 assertEquals("{4: spanNext({1: spanNext({2: tokens:s:ins}, " +
735 "{3: tokens:s:Leben})}, tokens:s:gerufen)}", res.at("/query").asText());
736 assertEquals(0, res.at("/startIndex").asInt());
737 assertEquals(25, res.at("/itemsPerPage").asInt());
738
739 assertEquals("BZK_D59.00089", res.at("/matches/0/textSigle").asText());
740 assertEquals(328, res.at("/matches/0/tokens/0/0").asInt());
741 assertEquals(331, res.at("/matches/0/tokens/0/1").asInt());
742 assertEquals(332, res.at("/matches/0/tokens/1/0").asInt());
743 assertEquals(337, res.at("/matches/0/tokens/1/1").asInt());
744 assertEquals(338, res.at("/matches/0/tokens/2/0").asInt());
745 assertEquals(345, res.at("/matches/0/tokens/2/1").asInt());
746 };
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000747
Nils Diewaldb84e7272014-11-07 01:27:38 +0000748 @Test
749 public void searchJSONmultitermRewriteBug () throws IOException {
750 // Construct index
751 KorapIndex ki = new KorapIndex();
752 // Indexing test files
753 ki.addDocFile(
754 1,getClass().getResource("/bzk/D59-00089.json.gz").getFile(), true
755 );
Nils Diewaldb84e7272014-11-07 01:27:38 +0000756 ki.commit();
757
Nils Diewald5871e4d2014-11-07 03:48:25 +0000758 // [tt/p="A.*"]{0,3}[tt/p="N.*"]
Nils Diewaldb84e7272014-11-07 01:27:38 +0000759 String json = getString(
760 getClass().getResource("/queries/bugs/multiterm_rewrite.jsonld").getFile()
761 );
762
763 KorapSearch ks = new KorapSearch(json);
Nils Diewaldc471b182014-11-19 22:51:15 +0000764 KorapCollection kc = ks.getCollection();
765
766 // No index was set
767 assertEquals(-1, kc.numberOf("documents"));
768 kc.setIndex(ki);
769
770 // Index was set but vc restricted to WPD
771 assertEquals(0, kc.numberOf("documents"));
772
773 kc.extend(
774 new KorapFilter().or("corpusSigle", "BZK")
775 );
776 /*
777 System.err.println(ks.getCollection().toString());
778 */
779 assertEquals("Known issue: ", 1, kc.numberOf("documents"));
Nils Diewald1220e3e2014-11-08 03:18:58 +0000780
Nils Diewaldb84e7272014-11-07 01:27:38 +0000781 KorapResult kr = ks.run(ki);
Nils Diewaldc471b182014-11-19 22:51:15 +0000782
Nils Diewaldb84e7272014-11-07 01:27:38 +0000783 assertEquals(
784 kr.getQuery(),
Nils Diewald5871e4d2014-11-07 03:48:25 +0000785 "spanOr([SpanMultiTermQueryWrapper(tokens:/tt/p:N.*/), " +
786 "spanNext(spanRepetition(SpanMultiTermQueryWrapper(tokens:/tt/p:A.*/){1,3}), " +
787 "SpanMultiTermQueryWrapper(tokens:/tt/p:N.*/))])"
Nils Diewaldb84e7272014-11-07 01:27:38 +0000788 );
789
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000790 assertEquals(kr.getTotalResults(), 58);
Nils Diewaldb84e7272014-11-07 01:27:38 +0000791 assertEquals(0, kr.getStartIndex());
Nils Diewald5871e4d2014-11-07 03:48:25 +0000792
793 assertEquals(
794 kr.getMatch(0).getSnippetBrackets(),
795 "[Saragat-Partei] zerfällt Rom (ADN) die von dem"
796 );
797 assertEquals(
798 kr.getMatch(1).getSnippetBrackets(),
799 "[Saragat-Partei] zerfällt Rom (ADN) die von dem"
800 );
801 assertEquals(
802 kr.getMatch(2).getSnippetBrackets(),
803 "Saragat-Partei zerfällt [Rom] (ADN) die von dem Rechtssozialisten Saragat"
804 );
805 assertEquals(
806 kr.getMatch(3).getSnippetBrackets(),
807 "Saragat-Partei zerfällt Rom ([ADN]) die von dem Rechtssozialisten Saragat geführte"
808 );
809
810 assertEquals(
811 kr.getMatch(23).getSnippetBrackets(),
812 "dem Namen \"Einheitsbewegung der sozialistischen Initiative\" [eine neue politische Gruppierung] ins Leben gerufen hatten. Pressemeldungen zufolge"
813 );
Nils Diewaldb84e7272014-11-07 01:27:38 +0000814 };
815
816
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000817
Nils Diewald56dc2582014-11-04 21:33:46 +0000818 @Test
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000819 public void searchJSONCollection () throws IOException {
820
821 // Construct index
822 KorapIndex ki = new KorapIndex();
823 // Indexing test files
Nils Diewald979b2fe2014-09-29 16:21:41 +0000824 for (String i : new String[] {"00001",
825 "00002",
826 "00003",
827 "00004",
828 "00005",
829 "00006",
830 "02439"}) {
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000831 ki.addDocFile(
832 getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
833 );
834 };
835 ki.commit();
836
Nils Diewald56dc2582014-11-04 21:33:46 +0000837 String json = getString(
838 getClass().getResource("/queries/metaquery8-nocollection.jsonld").getFile()
839 );
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000840
841 KorapSearch ks = new KorapSearch(json);
842 KorapResult kr = ks.run(ki);
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000843 assertEquals(kr.getTotalResults(), 276);
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000844 assertEquals(0, kr.getStartIndex());
845 assertEquals(10, kr.getItemsPerPage());
846
847 json = getString(getClass().getResource("/queries/metaquery8.jsonld").getFile());
848
849 ks = new KorapSearch(json);
850 kr = ks.run(ki);
851
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000852 assertEquals(kr.getTotalResults(), 147);
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000853 assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID());
854 assertEquals(0, kr.getStartIndex());
855 assertEquals(10, kr.getItemsPerPage());
856
857 json = getString(getClass().getResource("/queries/metaquery8-filtered.jsonld").getFile());
858
859 ks = new KorapSearch(json);
860 kr = ks.run(ki);
861
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000862 assertEquals(kr.getTotalResults(), 28);
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000863 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
864 assertEquals(0, kr.getStartIndex());
865 assertEquals(10, kr.getItemsPerPage());
866
867 json = getString(getClass().getResource("/queries/metaquery8-filtered-further.jsonld").getFile());
868
869 ks = new KorapSearch(json);
870 kr = ks.run(ki);
871
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000872 assertEquals(kr.getTotalResults(), 0);
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000873 assertEquals(0, kr.getStartIndex());
874 assertEquals(10, kr.getItemsPerPage());
875
876 json = getString(getClass().getResource("/queries/metaquery8-filtered-nested.jsonld").getFile());
877
878 ks = new KorapSearch(json);
879 kr = ks.run(ki);
880
881 assertEquals("filter with QueryWrapperFilter(+(ID:WPD_AAA.00003 (+tokens:s:die +tokens:s:Schriftzeichen)))", ks.getCollection().getFilter(1).toString());
882
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000883 assertEquals(kr.getTotalResults(), 119);
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000884 assertEquals(0, kr.getStartIndex());
885 assertEquals(10, kr.getItemsPerPage());
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000886 };
887
Nils Diewald1e5d5942014-05-20 13:29:53 +0000888
889 @Test
890 public void searchJSONSentenceContext () throws IOException {
891
892 // Construct index
893 KorapIndex ki = new KorapIndex();
894 // Indexing test files
895 for (String i : new String[] {"00001", "00002", "00003", "00004", "00005", "00006", "02439"}) {
896 ki.addDocFile(
897 getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
898 );
899 };
900 ki.commit();
901
902 String json = getString(getClass().getResource("/queries/bsp-context-2.jsonld").getFile());
903
904 KorapSearch ks = new KorapSearch(json);
905 ks.setCutOff(false);
906 SearchContext sc = ks.getContext();
907 sc.left.setLength((short) 10);
908 sc.right.setLength((short) 10);
909
910 KorapResult kr = ks.run(ki);
911 assertEquals(kr.getMatch(1).getSnippetBrackets(), "... dezimalen [Wert] 65 sowohl ...");
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000912 assertEquals(kr.getTotalResults(), 3);
Nils Diewald1e5d5942014-05-20 13:29:53 +0000913 assertEquals(0, kr.getStartIndex());
914 assertEquals(25, kr.getItemsPerPage());
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000915 assertFalse(kr.getContext().toJsonNode().toString().equals("\"s\""));
Nils Diewald1e5d5942014-05-20 13:29:53 +0000916
917 json = getString(getClass().getResource("/queries/bsp-context-sentence.jsonld").getFile());
918
919 kr = new KorapSearch(json).run(ki);
920 assertEquals(kr.getMatch(0).getSnippetBrackets(),
921 "steht a für den dezimalen [Wert] 97 sowohl im ASCII- als auch im Unicode-Zeichensatz");
922 assertEquals(kr.getMatch(1).getSnippetBrackets(),
923 "steht A für den dezimalen [Wert] 65 sowohl im ASCII- als auch im Unicode-Zeichensatz");
924 assertEquals(kr.getMatch(2).getSnippetBrackets(),
925 "In einem Zahlensystem mit einer Basis größer als 10 steht A oder a häufig für den dezimalen [Wert] 10, siehe auch Hexadezimalsystem.");
926
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000927 assertEquals(kr.getContext().toJsonNode().toString(), "\"s\"");
Nils Diewald1e5d5942014-05-20 13:29:53 +0000928 };
929
930
Nils Diewald2276e1c2014-04-10 15:01:59 +0000931 @Test
Nils Diewald54187632014-06-11 14:39:29 +0000932 public void searchJSONbug () throws IOException {
933
934 // Construct index
935 KorapIndex ki = new KorapIndex();
936 // Indexing test files
Nils Diewald979b2fe2014-09-29 16:21:41 +0000937 for (String i : new String[] {"00001",
938 "00002",
939 "00003",
940 "00004",
941 "00005",
942 "00006",
943 "02439"}) {
Nils Diewald54187632014-06-11 14:39:29 +0000944 ki.addDocFile(
945 getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
946 );
947 };
948 ki.commit();
949
950 String json = getString(getClass().getResource("/queries/bsp-bug.jsonld").getFile());
951
952 KorapResult kr = new KorapSearch(json).run(ki);
Nils Diewaldc471b182014-11-19 22:51:15 +0000953
954 assertEquals(kr.getError(0).getMessage(),
955 "Number of operands is not acceptable");
Nils Diewald54187632014-06-11 14:39:29 +0000956 };
957
Nils Diewaldef7124e2014-11-12 20:08:13 +0000958 /**
959 * This is a breaking test for #179
960 */
961 @Test
962 public void searchJSONexpansionBug () throws IOException {
Eliza Margaretha8e200cd2014-11-13 16:00:38 +0000963 // Construct index
964 KorapIndex ki = new KorapIndex();
965 // Indexing test files
966 ki.addDocFile(
967 getClass().getResource("/wiki/00002.json.gz").getFile(), true
968 );
969 ki.commit();
970
971 // Expansion bug
972 // der alte Digraph Aa durch Ã…
973 String json = getString(
974 getClass().getResource("/queries/bugs/expansion_bug_2.jsonld").getFile()
975 );
976
977 KorapResult kr = new KorapSearch(json).run(ki);
978 assertEquals("... Buchstabe des Alphabetes. In Dänemark ist " +
979 "[der alte Digraph Aa durch Ã…] ersetzt worden, " +
980 "in Eigennamen und Ortsnamen ...",
981 kr.getMatch(0).getSnippetBrackets());
982 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000983 assertEquals(kr.getTotalResults(), 1);
Eliza Margaretha8e200cd2014-11-13 16:00:38 +0000984
985 // der alte Digraph Aa durch []
986 // Works with one document
987 json = getString(
988 getClass().getResource("/queries/bugs/expansion_bug.jsonld").getFile()
989 );
990
991 kr = new KorapSearch(json).run(ki);
992
993 assertEquals("... Buchstabe des Alphabetes. In Dänemark ist " +
994 "[der alte Digraph Aa durch Ã…] ersetzt worden, " +
995 "in Eigennamen und Ortsnamen ...",
996 kr.getMatch(0).getSnippetBrackets());
997 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
Nils Diewalde1ecd5e2014-11-27 02:17:24 +0000998 assertEquals(kr.getTotalResults(), 1);
Eliza Margaretha8e200cd2014-11-13 16:00:38 +0000999
1000 // Now try with one file ahead
1001 ki = new KorapIndex();
1002 for (String i : new String[] {"00001",
1003 "00002"}) {
1004 ki.addDocFile(
1005 getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
1006 );
1007 };
1008 ki.commit();
1009
1010 // Expansion bug
1011 // der alte Digraph Aa durch Ã…
1012 json = getString(
1013 getClass().getResource("/queries/bugs/expansion_bug_2.jsonld").getFile()
1014 );
1015
1016 kr = new KorapSearch(json).run(ki);
1017
1018 assertEquals("... Buchstabe des Alphabetes. In Dänemark ist " +
1019 "[der alte Digraph Aa durch Ã…] ersetzt worden, " +
1020 "in Eigennamen und Ortsnamen ...",
1021 kr.getMatch(0).getSnippetBrackets());
1022 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
Nils Diewalde1ecd5e2014-11-27 02:17:24 +00001023 assertEquals(kr.getTotalResults(), 1);
Eliza Margaretha8e200cd2014-11-13 16:00:38 +00001024
1025 // der alte Digraph Aa durch []
1026 json = getString(
1027 getClass().getResource("/queries/bugs/expansion_bug.jsonld").getFile()
1028 );
1029
Nils Diewaldc471b182014-11-19 22:51:15 +00001030 kr = new KorapSearch(json).run(ki);
Eliza Margaretha8e200cd2014-11-13 16:00:38 +00001031 assertEquals("... Buchstabe des Alphabetes. In Dänemark ist " +
1032 "[der alte Digraph Aa durch Ã…] ersetzt worden, " +
1033 "in Eigennamen und Ortsnamen ...",
1034 kr.getMatch(0).getSnippetBrackets());
1035 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
Nils Diewalde1ecd5e2014-11-27 02:17:24 +00001036 assertEquals(kr.getTotalResults(), 1);
Nils Diewaldef7124e2014-11-12 20:08:13 +00001037 };
1038
1039
Nils Diewald979b2fe2014-09-29 16:21:41 +00001040 /*
1041 This test will crash soon - it's just here for nostalgic reasons!
1042 */
Nils Diewald54187632014-06-11 14:39:29 +00001043 @Test
Nils Diewald2276e1c2014-04-10 15:01:59 +00001044 public void getFoundryDistribution () throws Exception {
1045
1046 // Construct index
1047 KorapIndex ki = new KorapIndex();
1048 // Indexing test files
Nils Diewald979b2fe2014-09-29 16:21:41 +00001049 for (String i : new String[] {"00001",
1050 "00002",
1051 "00003",
1052 "00004",
1053 "00005",
1054 "00006",
1055 "02439"}) {
Nils Diewald2276e1c2014-04-10 15:01:59 +00001056 ki.addDocFile(
1057 getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
1058 );
1059 };
1060 ki.commit();
1061
1062 KorapCollection kc = new KorapCollection(ki);
1063
1064 assertEquals(7, kc.numberOf("documents"));
1065
1066 HashMap map = kc.getTermRelation("foundries");
1067 assertEquals((long) 7, map.get("-docs"));
1068 assertEquals((long) 7, map.get("treetagger"));
Nils Diewald979b2fe2014-09-29 16:21:41 +00001069 assertEquals((long) 6, map.get("opennlp/morpho"));
1070 assertEquals((long) 6, map.get("#__opennlp/morpho:###:treetagger"));
1071 assertEquals((long) 7, map.get("#__opennlp:###:treetagger"));
Nils Diewald2276e1c2014-04-10 15:01:59 +00001072 };
1073
1074 @Test
1075 public void getTextClassDistribution () throws Exception {
1076
1077 KorapIndex ki = new KorapIndex();
1078 ki.addDoc(
1079"{" +
1080" \"fields\" : [" +
1081" { \"primaryData\" : \"abc\" },{" +
1082" \"name\" : \"tokens\"," +
1083" \"data\" : [" +
1084" [ \"s:a\", \"i:a\", \"_0#0-1\", \"-:t$<i>3\"]," +
1085" [ \"s:b\", \"i:b\", \"_1#1-2\" ]," +
1086" [ \"s:c\", \"i:c\", \"_2#2-3\" ]]}]," +
1087" \"textClass\" : \"music entertainment\"" +
1088"}");
1089
1090 ki.addDoc(
1091"{" +
1092" \"fields\" : [" +
1093" { \"primaryData\" : \"abc\" },{" +
1094" \"name\" : \"tokens\"," +
1095" \"data\" : [" +
1096" [ \"s:a\", \"i:a\", \"_0#0-1\", \"-:t$<i>3\"]," +
1097" [ \"s:b\", \"i:b\", \"_1#1-2\" ]," +
1098" [ \"s:c\", \"i:c\", \"_2#2-3\" ]]}]," +
1099" \"textClass\" : \"music singing\"" +
1100"}");
1101
1102 ki.addDoc(
1103"{" +
1104" \"fields\" : [" +
1105" { \"primaryData\" : \"abc\" },{" +
1106" \"name\" : \"tokens\"," +
1107" \"data\" : [" +
1108" [ \"s:a\", \"i:a\", \"_0#0-1\", \"-:t$<i>3\"]," +
1109" [ \"s:b\", \"i:b\", \"_1#1-2\" ]," +
1110" [ \"s:c\", \"i:c\", \"_2#2-3\" ]]}]," +
1111" \"textClass\" : \"music entertainment jumping\"" +
1112"}");
1113 ki.commit();
1114
1115
1116 KorapCollection kc = new KorapCollection(ki);
1117 assertEquals(3, kc.numberOf("documents"));
1118
1119 HashMap map = kc.getTermRelation("textClass");
1120 assertEquals((long) 1, map.get("singing"));
1121 assertEquals((long) 1, map.get("jumping"));
1122 assertEquals((long) 3, map.get("music"));
1123 assertEquals((long) 2, map.get("entertainment"));
1124 assertEquals((long) 3, map.get("-docs"));
1125 assertEquals((long) 2, map.get("#__entertainment:###:music"));
1126 assertEquals((long) 1, map.get("#__entertainment:###:jumping"));
1127 assertEquals((long) 0, map.get("#__entertainment:###:singing"));
1128 assertEquals((long) 0, map.get("#__jumping:###:singing"));
1129 assertEquals((long) 1, map.get("#__jumping:###:music"));
1130 assertEquals((long) 1, map.get("#__music:###:singing"));
1131 assertEquals(11, map.size());
1132
1133 // System.err.println(kc.getTermRelationJSON("textClass"));
1134 };
1135
1136 @Test
1137 public void getTextClassDistribution2 () throws Exception {
1138
1139 KorapIndex ki = new KorapIndex();
1140 ki.addDoc(
1141"{" +
1142" \"fields\" : [" +
1143" { \"primaryData\" : \"abc\" },{" +
1144" \"name\" : \"tokens\"," +
1145" \"data\" : [" +
1146" [ \"s:a\", \"i:a\", \"_0#0-1\", \"-:t$<i>3\"]," +
1147" [ \"s:b\", \"i:b\", \"_1#1-2\" ]," +
1148" [ \"s:c\", \"i:c\", \"_2#2-3\" ]]}]," +
1149" \"textClass\" : \"\"" +
1150"}");
1151
1152 ki.commit();
1153 ki.addDoc(
1154"{" +
1155" \"fields\" : [" +
1156" { \"primaryData\" : \"abc\" },{" +
1157" \"name\" : \"tokens\"," +
1158" \"data\" : [" +
1159" [ \"s:a\", \"i:a\", \"_0#0-1\", \"-:t$<i>3\"]," +
1160" [ \"s:b\", \"i:b\", \"_1#1-2\" ]," +
1161" [ \"s:c\", \"i:c\", \"_2#2-3\" ]]}]," +
1162" \"textClass\" : \"music entertainment\"" +
1163"}");
1164
1165 ki.commit();
1166 ki.addDoc(
1167"{" +
1168" \"fields\" : [" +
1169" { \"primaryData\" : \"abc\" },{" +
1170" \"name\" : \"tokens\"," +
1171" \"data\" : [" +
1172" [ \"s:a\", \"i:a\", \"_0#0-1\", \"-:t$<i>3\"]," +
1173" [ \"s:b\", \"i:b\", \"_1#1-2\" ]," +
1174" [ \"s:c\", \"i:c\", \"_2#2-3\" ]]}]," +
1175" \"textClass\" : \"music singing\"" +
1176"}");
1177
1178 ki.addDoc(
1179"{" +
1180" \"fields\" : [" +
1181" { \"primaryData\" : \"abc\" },{" +
1182" \"name\" : \"tokens\"," +
1183" \"data\" : [" +
1184" [ \"s:a\", \"i:a\", \"_0#0-1\", \"-:t$<i>3\"]," +
1185" [ \"s:b\", \"i:b\", \"_1#1-2\" ]," +
1186" [ \"s:c\", \"i:c\", \"_2#2-3\" ]]}]," +
1187" \"textClass\" : \"music entertainment jumping\"" +
1188"}");
1189 ki.commit();
1190
1191
1192 KorapCollection kc = new KorapCollection(ki);
1193 assertEquals(4, kc.numberOf("documents"));
1194
1195 HashMap map = kc.getTermRelation("textClass");
1196 assertEquals((long) 1, map.get("singing"));
1197 assertEquals((long) 1, map.get("jumping"));
1198 assertEquals((long) 3, map.get("music"));
1199 assertEquals((long) 2, map.get("entertainment"));
1200 assertEquals((long) 4, map.get("-docs"));
1201 assertEquals((long) 2, map.get("#__entertainment:###:music"));
1202 assertEquals((long) 1, map.get("#__entertainment:###:jumping"));
1203 assertEquals((long) 0, map.get("#__entertainment:###:singing"));
1204 assertEquals((long) 0, map.get("#__jumping:###:singing"));
1205 assertEquals((long) 1, map.get("#__jumping:###:music"));
1206 assertEquals((long) 1, map.get("#__music:###:singing"));
1207 assertEquals(11, map.size());
1208 };
Nils Diewaldc925b492013-12-03 23:56:10 +00001209};