blob: fda521ef9654e95974ee4d21c644038d8611a253 [file] [log] [blame]
Eliza Margaretha6a780692014-01-15 09:45:42 +00001package de.ids_mannheim.korap.search;
2
Eliza Margaretha805e27f2016-10-14 21:39:42 +02003import static de.ids_mannheim.korap.TestSimple.getJsonString;
margarethaf70addb2015-04-27 13:17:18 +02004import static org.junit.Assert.assertEquals;
Akron001dab32015-07-02 12:30:15 +02005import static org.junit.Assert.assertTrue;
margarethaf70addb2015-04-27 13:17:18 +02006import static org.junit.Assert.assertFalse;
7import static org.junit.Assert.assertNull;
Nils Diewaldc925b492013-12-03 23:56:10 +00008
margarethaf70addb2015-04-27 13:17:18 +02009import java.io.IOException;
10import java.util.HashMap;
Nils Diewald56dc2582014-11-04 21:33:46 +000011
Nils Diewaldc925b492013-12-03 23:56:10 +000012import org.junit.Test;
Akron176c9b12015-07-29 19:53:40 +020013import org.junit.Ignore;
Nils Diewaldc925b492013-12-03 23:56:10 +000014import org.junit.runner.RunWith;
15import org.junit.runners.JUnit4;
16
margarethaf70addb2015-04-27 13:17:18 +020017import com.fasterxml.jackson.databind.JsonNode;
18import com.fasterxml.jackson.databind.ObjectMapper;
19
20import de.ids_mannheim.korap.Krill;
21import de.ids_mannheim.korap.KrillCollection;
22import de.ids_mannheim.korap.KrillIndex;
23import de.ids_mannheim.korap.KrillMeta;
24import de.ids_mannheim.korap.collection.CollectionBuilder;
25import de.ids_mannheim.korap.index.FieldDocument;
26import de.ids_mannheim.korap.query.QueryBuilder;
margarethaf70addb2015-04-27 13:17:18 +020027import de.ids_mannheim.korap.response.Result;
Akron69b958c2017-02-15 22:49:45 +010028import de.ids_mannheim.korap.response.Match;
margarethaf70addb2015-04-27 13:17:18 +020029import de.ids_mannheim.korap.response.SearchContext;
30
Nils Diewaldc925b492013-12-03 23:56:10 +000031@RunWith(JUnit4.class)
Nils Diewaldbbd39a52015-02-23 19:56:57 +000032public class TestKrill {
Nils Diewaldc925b492013-12-03 23:56:10 +000033 @Test
34 public void searchCount () {
Nils Diewaldbb33da22015-03-04 16:24:25 +000035 Krill k = new Krill(new QueryBuilder("field1").seg("a").with("b"));
Nils Diewaldf5ab4b22015-02-25 20:55:16 +000036
37 KrillMeta meta = k.getMeta();
38
Nils Diewaldafab8f32015-01-26 19:11:32 +000039 // Count:
Nils Diewaldf5ab4b22015-02-25 20:55:16 +000040 meta.setCount(30);
41 assertEquals(meta.getCount(), 30);
42 meta.setCount(20);
43 assertEquals(meta.getCount(), 20);
44 meta.setCount(-50);
45 assertEquals(meta.getCount(), 20);
46 meta.setCount(500);
47 assertEquals(meta.getCount(), meta.getCountMax());
Nils Diewaldc925b492013-12-03 23:56:10 +000048 };
49
Nils Diewaldbb33da22015-03-04 16:24:25 +000050
Nils Diewaldc925b492013-12-03 23:56:10 +000051 @Test
52 public void searchStartIndex () {
Nils Diewaldbb33da22015-03-04 16:24:25 +000053 Krill k = new Krill(new QueryBuilder("field1").seg("a").with("b"));
Nils Diewaldf5ab4b22015-02-25 20:55:16 +000054
55 KrillMeta meta = k.getMeta();
56
Nils Diewaldafab8f32015-01-26 19:11:32 +000057 // startIndex
Nils Diewaldf5ab4b22015-02-25 20:55:16 +000058 meta.setStartIndex(5);
59 assertEquals(meta.getStartIndex(), 5);
60 meta.setStartIndex(1);
61 assertEquals(meta.getStartIndex(), 1);
62 meta.setStartIndex(0);
63 assertEquals(meta.getStartIndex(), 0);
64 meta.setStartIndex(70);
65 assertEquals(meta.getStartIndex(), 70);
66 meta.setStartIndex(-5);
67 assertEquals(meta.getStartIndex(), 0);
Nils Diewaldc925b492013-12-03 23:56:10 +000068 };
69
Nils Diewaldbb33da22015-03-04 16:24:25 +000070
Nils Diewaldc925b492013-12-03 23:56:10 +000071 @Test
72 public void searchQuery () {
Nils Diewaldbb33da22015-03-04 16:24:25 +000073 Krill ks = new Krill(new QueryBuilder("field1").seg("a").with("b"));
Nils Diewaldafab8f32015-01-26 19:11:32 +000074 // query
Nils Diewaldbb33da22015-03-04 16:24:25 +000075 assertEquals(ks.getSpanQuery().toString(),
76 "spanSegment(field1:a, field1:b)");
Nils Diewaldc925b492013-12-03 23:56:10 +000077 };
78
Nils Diewaldafab8f32015-01-26 19:11:32 +000079
Nils Diewaldc925b492013-12-03 23:56:10 +000080 @Test
81 public void searchIndex () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +000082 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +000083 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +000084 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +000085 for (String i : new String[] { "00001", "00002", "00003", "00004",
86 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +020087 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +000088 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +000089 };
90 ki.commit();
Nils Diewaldc925b492013-12-03 23:56:10 +000091
Nils Diewaldbb33da22015-03-04 16:24:25 +000092 Krill ks = new Krill(new QueryBuilder("tokens").seg("s:Buchstaben"));
Nils Diewaldf5ab4b22015-02-25 20:55:16 +000093
Akron176c9b12015-07-29 19:53:40 +020094 CollectionBuilder cb = new CollectionBuilder();
95
96 ks.getCollection().fromBuilder(cb.term("textClass", "reisen"));
Nils Diewaldf5ab4b22015-02-25 20:55:16 +000097
98 KrillMeta meta = ks.getMeta();
99 meta.setCount(3);
100 meta.setStartIndex(5);
101 meta.getContext().left.setLength(1);
102 meta.getContext().right.setLength(1);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000103
Nils Diewald884dbcf2015-02-27 17:02:28 +0000104 Result kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000105 assertEquals(kr.getTotalResults(), 6);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000106 assertEquals(kr.getMatch(0).getSnippetBrackets(),
Akronf05fde62016-08-03 23:46:17 +0200107 "... dem [[Buchstaben]] A ...");
Nils Diewaldf5ab4b22015-02-25 20:55:16 +0000108
109 JsonNode res = ks.toJsonNode();
110 assertEquals(3, res.at("/meta/count").asInt());
111 assertEquals(5, res.at("/meta/startIndex").asInt());
112 assertEquals("token", res.at("/meta/context/left/0").asText());
113 assertEquals(1, res.at("/meta/context/left/1").asInt());
114 assertEquals("token", res.at("/meta/context/right/0").asText());
115 assertEquals(1, res.at("/meta/context/right/1").asInt());
Nils Diewaldc925b492013-12-03 23:56:10 +0000116 };
Nils Diewaldc6b78752013-12-05 19:05:12 +0000117
Nils Diewaldafab8f32015-01-26 19:11:32 +0000118
Nils Diewaldc6b78752013-12-05 19:05:12 +0000119 @Test
120 public void searchJSON () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000121 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000122 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000123 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000124 for (String i : new String[] { "00001", "00002", "00003", "00004",
125 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200126 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000127 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000128 };
129 ki.commit();
Nils Diewaldc6b78752013-12-05 19:05:12 +0000130
Eliza Margaretha6f989202016-10-14 21:48:29 +0200131 String json = getJsonString(
132 getClass().getResource("/queries/metaquery3.jsonld").getFile());
Nils Diewaldc6b78752013-12-05 19:05:12 +0000133
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000134 Krill ks = new Krill(json);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000135 Result kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000136 assertEquals(kr.getTotalResults(), 66);
137 assertEquals(5, kr.getItemsPerPage());
138 assertEquals(5, kr.getStartIndex());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200139 assertEquals("... a: A ist [[der klangreichste]] der V ...",
140 kr.getMatch(0).getSnippetBrackets());
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000141 };
Nils Diewald01b4ce32013-12-05 22:39:25 +0000142
Nils Diewaldbb33da22015-03-04 16:24:25 +0000143
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000144 @Test
145 public void searchJSON2 () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000146 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000147 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000148 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000149 for (String i : new String[] { "00001", "00002", "00003", "00004",
150 "00005", "00006", "02439", "00012-fakemeta", "00030-fakemeta",
Eliza Margaretha6f989202016-10-14 21:48:29 +0200151 /*
152 "02035-substring",
153 "05663-unbalanced",
154 "07452-deep"
155 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000156 }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200157 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000158 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000159 };
160 ki.commit();
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000161
Eliza Margaretha6f989202016-10-14 21:48:29 +0200162 String json = getJsonString(
163 getClass().getResource("/queries/metaquery4.jsonld").getFile());
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000164
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000165 Krill ks = new Krill(json);
Nils Diewald884dbcf2015-02-27 17:02:28 +0000166 Result kr = ks.apply(ki);
Nils Diewaldc86aa482014-02-12 16:58:05 +0000167
Nils Diewaldafab8f32015-01-26 19:11:32 +0000168 assertEquals(kr.getTotalResults(), 1);
Nils Diewald979b2fe2014-09-29 16:21:41 +0000169
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000170 ks = new Krill(json);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000171 // Ignore the collection part of the query!
Nils Diewald2d5f8102015-02-26 21:07:54 +0000172 ks.setCollection(new KrillCollection());
Nils Diewald3aa9e692015-02-20 22:20:11 +0000173 kr = ks.apply(ki);
Nils Diewald979b2fe2014-09-29 16:21:41 +0000174
Nils Diewaldafab8f32015-01-26 19:11:32 +0000175 assertEquals(kr.getTotalResults(), 5);
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000176
Eliza Margaretha6f989202016-10-14 21:48:29 +0200177 json = getJsonString(
178 getClass().getResource("/queries/metaquery5.jsonld").getFile());
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000179
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000180 ks = new Krill(json);
Nils Diewald3aa9e692015-02-20 22:20:11 +0000181 kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000182 assertEquals(kr.getTotalResults(), 1);
183
Eliza Margaretha6f989202016-10-14 21:48:29 +0200184 json = getJsonString(
185 getClass().getResource("/queries/metaquery6.jsonld").getFile());
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000186 ks = new Krill(json);
Nils Diewald3aa9e692015-02-20 22:20:11 +0000187 kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000188 assertEquals(kr.getTotalResults(), 1);
Nils Diewaldc6b78752013-12-05 19:05:12 +0000189 };
190
Akronbb5d1732015-06-22 01:22:40 +0200191
Akronc63697c2015-06-17 22:32:02 +0200192 // Todo: There SHOULD be a failure here, but Koral currently creates empty collections
193 @Test
194 public void queryJSONapiTest1 () {
Akronbb5d1732015-06-22 01:22:40 +0200195 Krill test = new Krill(
196 "{\"@context\":\"http://korap.ids-mannheim.de/ns/koral/0.3/context.jsonld\",\"errors\":[],\"warnings\":[],\"messages\":[],\"collection\":{},\"query\":{\"@type\":\"koral:token\",\"wrap\":{\"@type\":\"koral:term\",\"layer\":\"orth\",\"key\":\"Baum\",\"match\":\"match:eq\"}},\"meta\":{}}");
Akronc63697c2015-06-17 22:32:02 +0200197 assertFalse(test.hasErrors());
198 };
199
Nils Diewaldc6b78752013-12-05 19:05:12 +0000200
201 @Test
202 public void searchJSONFailure () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000203 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000204 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000205 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000206 for (String i : new String[] { "00001", "00002", "00003", "00004",
207 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200208 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000209 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000210 };
211 ki.commit();
Nils Diewald884dbcf2015-02-27 17:02:28 +0000212 Result kr = new Krill("{ query").apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000213 assertEquals(kr.getTotalResults(), 0);
214 assertEquals(kr.getError(0).getMessage(), "Unable to parse JSON");
Nils Diewaldc6b78752013-12-05 19:05:12 +0000215 };
216
217
Nils Diewald9f310832013-12-06 22:38:55 +0000218 @Test
219 public void searchJSONindexboundary () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000220 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000221 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000222 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000223 for (String i : new String[] { "00001", "00002", "00003", "00004",
224 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200225 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000226 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000227 };
228 ki.commit();
Nils Diewald9f310832013-12-06 22:38:55 +0000229
Eliza Margaretha6f989202016-10-14 21:48:29 +0200230 String json = getJsonString(
231 getClass().getResource("/queries/bsp-fail1.jsonld").getFile());
Nils Diewald9f310832013-12-06 22:38:55 +0000232
Nils Diewald884dbcf2015-02-27 17:02:28 +0000233 Result kr = new Krill(json).apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000234 assertEquals(0, kr.getStartIndex());
235 assertEquals(kr.getTotalResults(), 0);
236 assertEquals(25, kr.getItemsPerPage());
Nils Diewald9f310832013-12-06 22:38:55 +0000237 };
238
Nils Diewaldafab8f32015-01-26 19:11:32 +0000239
Nils Diewald9f310832013-12-06 22:38:55 +0000240 @Test
241 public void searchJSONindexboundary2 () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000242 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000243 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000244 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000245 for (String i : new String[] { "00001", "00002", "00003", "00004",
246 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200247 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000248 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000249 };
250 ki.commit();
Nils Diewald9f310832013-12-06 22:38:55 +0000251
Eliza Margaretha6f989202016-10-14 21:48:29 +0200252 String json = getJsonString(
253 getClass().getResource("/queries/bsp-fail2.jsonld").getFile());
Nils Diewald9f310832013-12-06 22:38:55 +0000254
Nils Diewald884dbcf2015-02-27 17:02:28 +0000255 Result kr = new Krill(json).apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000256 assertEquals(50, kr.getItemsPerPage());
257 assertEquals(49950, kr.getStartIndex());
258 assertEquals(kr.getTotalResults(), 0);
Nils Diewald9f310832013-12-06 22:38:55 +0000259 };
260
Akron40550172015-08-04 03:06:12 +0200261
Akron001dab32015-07-02 12:30:15 +0200262 /*
263 * Queries should be mirrored correctly for debugging reasons.
264 */
265 @Test
266 public void queryJSONmirrorTestBug () throws IOException {
267 // Construct index
268 KrillIndex ki = new KrillIndex();
Eliza Margaretha6f989202016-10-14 21:48:29 +0200269 String json = getJsonString(getClass()
270 .getResource("/queries/bugs/failing_mirror.jsonld").getFile());
Akron001dab32015-07-02 12:30:15 +0200271 Krill ks = new Krill(json);
272 Result kr = ks.apply(ki);
273
274 ObjectMapper mapper = new ObjectMapper();
275 JsonNode res = mapper.readTree(kr.toJsonString());
276
277 assertEquals("Unable to parse JSON", res.at("/errors/0/1").asText());
278
Eliza Margaretha6f989202016-10-14 21:48:29 +0200279 json = getJsonString(
280 getClass().getResource("/queries/bugs/failing_mirror_2.jsonld")
281 .getFile());
Akron001dab32015-07-02 12:30:15 +0200282 ks = new Krill(json);
283 kr = ks.apply(ki);
284
285 res = mapper.readTree(kr.toJsonString());
286
Akron40550172015-08-04 03:06:12 +0200287 assertEquals(23, res.at("/meta/count").asInt());
288 assertEquals(25, res.at("/meta/itemsPerPage").asInt());
Akron001dab32015-07-02 12:30:15 +0200289 assertEquals("base/s:p", res.at("/meta/context").asText());
290 assertFalse(res.at("/query").isMissingNode());
291 assertTrue(res.at("/query/@type").isMissingNode());
292 assertTrue(res.at("/collection/@type").isMissingNode());
293 };
294
295
Nils Diewaldc6b78752013-12-05 19:05:12 +0000296
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000297 @Test
298 public void searchJSONcontext () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000299 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000300 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000301 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000302 for (String i : new String[] { "00001", "00002", "00003", "00004",
303 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200304 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000305 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000306 };
307 ki.commit();
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000308
Eliza Margaretha6f989202016-10-14 21:48:29 +0200309 String json = getJsonString(getClass()
310 .getResource("/queries/bsp-context.jsonld").getFile());
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000311
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000312 Krill ks = new Krill(json);
Nils Diewald884dbcf2015-02-27 17:02:28 +0000313 Result kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000314 assertEquals(kr.getTotalResults(), 10);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200315 assertEquals(
316 "A bzw. a ist der erste Buchstabe des"
317 + " lateinischen [[Alphabets]] und ein Vokal."
318 + " Der Buchstabe A hat in deutschen Texten"
319 + " eine durchschnittliche Häufigkeit ...",
320 kr.getMatch(0).getSnippetBrackets());
Nils Diewaldb3a09db2013-12-21 00:22:02 +0000321
Nils Diewaldf5ab4b22015-02-25 20:55:16 +0000322 ks.getMeta().setCount(5);
323 ks.getMeta().setStartPage(2);
Nils Diewald3aa9e692015-02-20 22:20:11 +0000324 kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000325 assertEquals(kr.getTotalResults(), 10);
326 assertEquals(5, kr.getStartIndex());
327 assertEquals(5, kr.getItemsPerPage());
Nils Diewald891c53c2013-12-23 16:37:46 +0000328
Eliza Margaretha805e27f2016-10-14 21:39:42 +0200329 json = getJsonString(getClass()
Nils Diewaldbb33da22015-03-04 16:24:25 +0000330 .getResource("/queries/bsp-context-2.jsonld").getFile());
Nils Diewald891c53c2013-12-23 16:37:46 +0000331
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000332 kr = new Krill(json).apply(ki);
333
Nils Diewaldafab8f32015-01-26 19:11:32 +0000334 assertEquals(kr.getTotalResults(), -1);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200335 assertEquals(
336 "... lls seit den Griechen beibehalten worden."
337 + " 3. Bedeutungen in der Biologie steht A für"
338 + " das Nukleosid Adenosin steht A die Base"
339 + " Adenin steht A für die Aminosäure Alanin"
340 + " in der Informatik steht a für den dezimalen"
341 + " [[Wert]] 97 sowohl im ASCII- als auch im"
342 + " Unicode-Zeichensatz steht A für den dezimalen"
343 + " Wert 65 sowohl im ASCII- als auch im"
344 + " Unicode-Zeichensatz als Kfz-Kennzeichen"
345 + " steht A in Deutschland für Augsburg."
346 + " in Österreich auf ...",
347 kr.getMatch(0).getSnippetBrackets());
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000348 };
349
Nils Diewaldbb33da22015-03-04 16:24:25 +0000350
Nils Diewald364eb642013-12-22 15:03:01 +0000351 @Test
352 public void searchJSONstartPage () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000353 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000354 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000355 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000356 for (String i : new String[] { "00001", "00002", "00003", "00004",
357 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200358 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000359 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000360 };
361 ki.commit();
Nils Diewald364eb642013-12-22 15:03:01 +0000362
Eliza Margaretha6f989202016-10-14 21:48:29 +0200363 String json = getJsonString(
364 getClass().getResource("/queries/bsp-paging.jsonld").getFile());
Nils Diewald364eb642013-12-22 15:03:01 +0000365
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000366 Krill ks = new Krill(json);
Nils Diewald884dbcf2015-02-27 17:02:28 +0000367 Result kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000368 assertEquals(kr.getTotalResults(), 10);
369 assertEquals(5, kr.getStartIndex());
370 assertEquals(5, kr.getItemsPerPage());
Nils Diewald364eb642013-12-22 15:03:01 +0000371
Eliza Margaretha6f989202016-10-14 21:48:29 +0200372 json = getJsonString(
373 getClass().getResource("/queries/bsp-cutoff.jsonld").getFile());
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000374 ks = ks = new Krill(json);
Nils Diewald3aa9e692015-02-20 22:20:11 +0000375 kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000376 assertEquals(kr.getTotalResults(), -1);
377 assertEquals(2, kr.getStartIndex());
378 assertEquals(2, kr.getItemsPerPage());
Nils Diewald364eb642013-12-22 15:03:01 +0000379
Eliza Margaretha6f989202016-10-14 21:48:29 +0200380 json = getJsonString(
381 getClass().getResource("/queries/metaquery9.jsonld").getFile());
Nils Diewald2d5f8102015-02-26 21:07:54 +0000382 KrillCollection kc = new KrillCollection(json);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000383 kc.setIndex(ki);
384 assertEquals(7, kc.numberOf("documents"));
Nils Diewald364eb642013-12-22 15:03:01 +0000385 };
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000386
Nils Diewaldafab8f32015-01-26 19:11:32 +0000387
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000388 @Test
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000389 public void searchJSONitemsPerResource () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000390 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000391 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000392 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000393 for (String i : new String[] { "00001", "00002", "00003", "00004",
394 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200395 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000396 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000397 };
398 ki.commit();
Eliza Margaretha6f989202016-10-14 21:48:29 +0200399 String json = getJsonString(getClass()
400 .getResource("/queries/bsp-itemsPerResource.jsonld").getFile());
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000401
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000402 Krill ks = new Krill(json);
Nils Diewald884dbcf2015-02-27 17:02:28 +0000403 Result kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000404 assertEquals(kr.getTotalResults(), 10);
405 assertEquals(0, kr.getStartIndex());
406 assertEquals(20, kr.getItemsPerPage());
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000407
Nils Diewaldafab8f32015-01-26 19:11:32 +0000408 assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID());
409 assertEquals("WPD_AAA.00001", kr.getMatch(1).getDocID());
410 assertEquals("WPD_AAA.00001", kr.getMatch(6).getDocID());
411 assertEquals("WPD_AAA.00002", kr.getMatch(7).getDocID());
412 assertEquals("WPD_AAA.00002", kr.getMatch(8).getDocID());
413 assertEquals("WPD_AAA.00004", kr.getMatch(9).getDocID());
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000414
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000415 ks = new Krill(json);
Nils Diewaldf5ab4b22015-02-25 20:55:16 +0000416 ks.getMeta().setItemsPerResource(1);
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000417
Nils Diewald3aa9e692015-02-20 22:20:11 +0000418 kr = ks.apply(ki);
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000419
Nils Diewaldafab8f32015-01-26 19:11:32 +0000420 assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID());
421 assertEquals("WPD_AAA.00002", kr.getMatch(1).getDocID());
422 assertEquals("WPD_AAA.00004", kr.getMatch(2).getDocID());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000423
Nils Diewaldafab8f32015-01-26 19:11:32 +0000424 assertEquals(kr.getTotalResults(), 3);
425 assertEquals(0, kr.getStartIndex());
426 assertEquals(20, kr.getItemsPerPage());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000427
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000428 ks = new Krill(json);
Nils Diewaldf5ab4b22015-02-25 20:55:16 +0000429 ks.getMeta().setItemsPerResource(2);
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000430
Nils Diewald3aa9e692015-02-20 22:20:11 +0000431 kr = ks.apply(ki);
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000432
Nils Diewaldafab8f32015-01-26 19:11:32 +0000433 assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID());
434 assertEquals("WPD_AAA.00001", kr.getMatch(1).getDocID());
435 assertEquals("WPD_AAA.00002", kr.getMatch(2).getDocID());
436 assertEquals("WPD_AAA.00002", kr.getMatch(3).getDocID());
437 assertEquals("WPD_AAA.00004", kr.getMatch(4).getDocID());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000438
Nils Diewaldafab8f32015-01-26 19:11:32 +0000439 assertEquals(kr.getTotalResults(), 5);
440 assertEquals(0, kr.getStartIndex());
441 assertEquals(20, kr.getItemsPerPage());
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000442
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000443 ks = new Krill(json);
Nils Diewaldf5ab4b22015-02-25 20:55:16 +0000444 KrillMeta meta = ks.getMeta();
445 meta.setItemsPerResource(1);
446 meta.setStartIndex(1);
447 meta.setCount(1);
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000448
Nils Diewald3aa9e692015-02-20 22:20:11 +0000449 kr = ks.apply(ki);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000450
Nils Diewaldafab8f32015-01-26 19:11:32 +0000451 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000452
Nils Diewaldafab8f32015-01-26 19:11:32 +0000453 assertEquals(kr.getTotalResults(), 3);
454 assertEquals(1, kr.getStartIndex());
455 assertEquals(1, kr.getItemsPerPage());
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000456
Nils Diewaldafab8f32015-01-26 19:11:32 +0000457 assertEquals((short) 1, kr.getItemsPerResource());
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000458 };
459
Nils Diewaldafab8f32015-01-26 19:11:32 +0000460
Nils Diewaldd723d812014-09-23 18:50:52 +0000461 @Test
462 public void searchJSONitemsPerResourceServer () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000463 /*
464 * This test is a server-only implementation of
465 * TestResource#testCollection
466 */
467 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000468 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000469 // Indexing test files
470 int uid = 1;
Nils Diewaldbb33da22015-03-04 16:24:25 +0000471 for (String i : new String[] { "00001", "00002", "00003", "00004",
472 "00005", "00006", "02439" }) {
473 ki.addDoc(uid++,
474 getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
475 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000476 };
477 ki.commit();
Nils Diewaldd723d812014-09-23 18:50:52 +0000478
Eliza Margaretha6f989202016-10-14 21:48:29 +0200479 String json = getJsonString(getClass()
480 .getResource("/queries/bsp-uid-example.jsonld").getFile());
Nils Diewaldd723d812014-09-23 18:50:52 +0000481
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000482 Krill ks = new Krill(json);
Nils Diewaldf5ab4b22015-02-25 20:55:16 +0000483 ks.getMeta().setItemsPerResource(1);
484
Nils Diewald2d5f8102015-02-26 21:07:54 +0000485 KrillCollection kc = new KrillCollection();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000486 kc.filterUIDs(new String[] { "1", "4" });
Nils Diewaldafab8f32015-01-26 19:11:32 +0000487 kc.setIndex(ki);
488 ks.setCollection(kc);
Nils Diewaldd723d812014-09-23 18:50:52 +0000489
Nils Diewald884dbcf2015-02-27 17:02:28 +0000490 Result kr = ks.apply(ki);
Nils Diewaldd723d812014-09-23 18:50:52 +0000491
Nils Diewaldafab8f32015-01-26 19:11:32 +0000492 assertEquals(kr.getTotalResults(), 2);
493 assertEquals(0, kr.getStartIndex());
494 assertEquals(25, kr.getItemsPerPage());
Nils Diewaldd723d812014-09-23 18:50:52 +0000495 };
Nils Diewaldba197f22014-11-01 17:21:46 +0000496
Nils Diewaldafab8f32015-01-26 19:11:32 +0000497
Nils Diewaldba197f22014-11-01 17:21:46 +0000498 @Test
499 public void searchJSONnewJSON () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000500 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000501 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000502 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000503 FieldDocument fd = ki.addDoc(1,
504 getClass().getResourceAsStream("/goe/AGA-03828.json.gz"), true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000505 ki.commit();
Nils Diewaldba197f22014-11-01 17:21:46 +0000506
Nils Diewaldafab8f32015-01-26 19:11:32 +0000507 assertEquals(fd.getUID(), 1);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000508 assertEquals(fd.getTextSigle(), "GOE_AGA.03828");
509 assertEquals(fd.getDocSigle(), "GOE_AGA");
Nils Diewaldafab8f32015-01-26 19:11:32 +0000510 assertEquals(fd.getCorpusSigle(), "GOE");
Akron32b95192019-01-11 13:58:55 +0100511 assertEquals(fd.getFieldValue("title"), "Autobiographische Einzelheiten");
512 assertNull(fd.getFieldValue("subTitle"));
513 assertEquals(fd.getFieldValue("textType"), "Autobiographie");
514 assertNull(fd.getFieldValue("textTypeArt"));
515 assertNull(fd.getFieldValue("textTypeRef"));
516 assertNull(fd.getFieldValue("textColumn"));
517 assertNull(fd.getFieldValue("textDomain"));
Akron69b958c2017-02-15 22:49:45 +0100518 // assertEquals(fd.getPages(), "529-547");
Akron32b95192019-01-11 13:58:55 +0100519 assertEquals(fd.getFieldValue("availability"), "QAO-NC");
520 assertEquals(fd.getFieldValue("creationDate"), "1820");
521 assertEquals(fd.getFieldValue("pubDate"), "1982");
522 assertEquals(fd.getFieldValue("author"), "Goethe, Johann Wolfgang von");
523 assertNull(fd.getFieldValue("textClass"));
524 assertEquals(fd.getFieldValue("language"), "de");
525 assertEquals(fd.getFieldValue("pubPlace"), "München");
526 assertEquals(fd.getFieldValue("reference"),
Eliza Margaretha6f989202016-10-14 21:48:29 +0200527 "Goethe, Johann Wolfgang von:"
528 + " Autobiographische Einzelheiten,"
529 + " (Geschrieben bis 1832), In: Goethe,"
530 + " Johann Wolfgang von: Goethes Werke,"
531 + " Bd. 10, Autobiographische Schriften"
532 + " II, Hrsg.: Trunz, Erich. München: "
533 + "Verlag C. H. Beck, 1982, S. 529-547");
Akron32b95192019-01-11 13:58:55 +0100534 assertEquals(fd.getFieldValue("publisher"), "Verlag C. H. Beck");
535 assertNull(fd.getFieldValue("editor"));
536 assertNull(fd.getFieldValue("fileEditionStatement"));
537 assertNull(fd.getFieldValue("biblEditionStatement"));
538 assertNull(fd.getFieldValue("keywords"));
Nils Diewaldafab8f32015-01-26 19:11:32 +0000539
Akron32b95192019-01-11 13:58:55 +0100540 assertEquals(fd.getFieldValue("tokenSource"), "opennlp#tokens");
541 assertEquals(fd.getFieldValue("foundries"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000542 "base base/paragraphs base/sentences corenlp "
543 + "corenlp/constituency corenlp/morpho "
544 + "corenlp/namedentities corenlp/sentences "
545 + "glemm glemm/morpho mate mate/morpho"
546 + " opennlp opennlp/morpho opennlp/sentences"
547 + " treetagger treetagger/morpho "
548 + "treetagger/sentences");
Akron32b95192019-01-11 13:58:55 +0100549 assertEquals(fd.getFieldValue("layerInfos"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000550 "base/s=spans corenlp/c=spans corenlp/ne=tokens"
551 + " corenlp/p=tokens corenlp/s=spans glemm/l=tokens"
552 + " mate/l=tokens mate/m=tokens mate/p=tokens"
553 + " opennlp/p=tokens opennlp/s=spans tt/l=tokens"
554 + " tt/p=tokens tt/s=spans");
Nils Diewaldafab8f32015-01-26 19:11:32 +0000555
Akron32b95192019-01-11 13:58:55 +0100556 assertEquals(fd.getFieldValue("corpusTitle"), "Goethes Werke");
557 assertNull(fd.getFieldValue("corpusSubTitle"));
558 assertEquals(fd.getFieldValue("corpusAuthor"), "Goethe, Johann Wolfgang von");
559 assertEquals(fd.getFieldValue("corpusEditor"), "Trunz, Erich");
560 assertEquals(fd.getFieldValue("docTitle"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000561 "Goethe: Autobiographische Schriften II, (1817-1825, 1832)");
Akron32b95192019-01-11 13:58:55 +0100562 assertNull(fd.getFieldValue("docSubTitle"));
563 assertNull(fd.getFieldValue("docEditor"));
564 assertNull(fd.getFieldValue("docAuthor"));
Nils Diewaldafab8f32015-01-26 19:11:32 +0000565
Nils Diewaldbb33da22015-03-04 16:24:25 +0000566 Krill ks = new Krill(new QueryBuilder("tokens").seg("mate/m:case:nom")
567 .with("mate/m:number:pl"));
Nils Diewald884dbcf2015-02-27 17:02:28 +0000568 Result kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000569
570 assertEquals(kr.getTotalResults(), 148);
571 assertEquals(0, kr.getStartIndex());
572 assertEquals(25, kr.getItemsPerPage());
Nils Diewaldba197f22014-11-01 17:21:46 +0000573 };
Nils Diewald06368ba2014-11-03 20:53:27 +0000574
Akron69b958c2017-02-15 22:49:45 +0100575
576 @Test
577 public void searchJSONwithPagebreaks () throws IOException {
578 // Construct index
579 KrillIndex ki = new KrillIndex();
580 // Indexing test files
581 FieldDocument fd = ki.addDoc(1,
582 getClass().getResourceAsStream("/goe/AGA-03828-pb.json.gz"), true);
583 ki.commit();
584
585 assertEquals(fd.getUID(), 1);
586 assertEquals(fd.getTextSigle(), "GOE/AGA/03828");
587 assertEquals(fd.getDocSigle(), "GOE/AGA");
588 assertEquals(fd.getCorpusSigle(), "GOE");
Akron32b95192019-01-11 13:58:55 +0100589 assertEquals(fd.getFieldValue("title"), "Autobiographische Einzelheiten");
590 assertNull(fd.getFieldValue("subTitle"));
591 assertEquals(fd.getFieldValue("textType"), "Autobiographie");
592 assertNull(fd.getFieldValue("textTypeArt"));
593 assertNull(fd.getFieldValue("textTypeRef"));
594 assertNull(fd.getFieldValue("textColumn"));
595 assertNull(fd.getFieldValue("textDomain"));
Akron69b958c2017-02-15 22:49:45 +0100596 // assertEquals(fd.getPages(), "529-547");
Akron32b95192019-01-11 13:58:55 +0100597 // assertEquals(fd.getFieldValue("availability"), "QAO-NC");
598 assertEquals(fd.getFieldValue("creationDate"), "1820");
599 assertEquals(fd.getFieldValue("pubDate"), "1982");
600 assertEquals(fd.getFieldValue("author"), "Goethe, Johann Wolfgang von");
601 assertNull(fd.getFieldValue("textClass"));
602 assertEquals(fd.getFieldValue("language"), "de");
603 assertEquals(fd.getFieldValue("pubPlace"), "München");
604 assertEquals(fd.getFieldValue("reference"),
Akron69b958c2017-02-15 22:49:45 +0100605 "Goethe, Johann Wolfgang von:"
606 + " Autobiographische Einzelheiten,"
607 + " (Geschrieben bis 1832), In: Goethe,"
608 + " Johann Wolfgang von: Goethes Werke,"
609 + " Bd. 10, Autobiographische Schriften"
610 + " II, Hrsg.: Trunz, Erich. München: "
611 + "Verlag C. H. Beck, 1982, S. 529-547");
Akron32b95192019-01-11 13:58:55 +0100612 assertEquals(fd.getFieldValue("publisher"), "Verlag C. H. Beck");
613 assertNull(fd.getFieldValue("editor"));
614 assertNull(fd.getFieldValue("fileEditionStatement"));
615 assertNull(fd.getFieldValue("biblEditionStatement"));
616 assertNull(fd.getFieldValue("keywords"));
Akron69b958c2017-02-15 22:49:45 +0100617
Akron32b95192019-01-11 13:58:55 +0100618 assertEquals(fd.getFieldValue("tokenSource"), "base#tokens_aggr");
619 assertEquals(fd.getFieldValue("foundries"),
Akron69b958c2017-02-15 22:49:45 +0100620 "dereko dereko/structure "+
621 "dereko/structure/base-sentences-paragraphs-pagebreaks");
Akron32b95192019-01-11 13:58:55 +0100622 assertEquals(fd.getFieldValue("layerInfos"), "dereko/s=spans");
Akron69b958c2017-02-15 22:49:45 +0100623
Akron32b95192019-01-11 13:58:55 +0100624 assertEquals(fd.getFieldValue("corpusTitle"), "Goethes Werke");
625 assertNull(fd.getFieldValue("corpusSubTitle"));
626 assertEquals(fd.getFieldValue("corpusAuthor"), "Goethe, Johann Wolfgang von");
627 assertEquals(fd.getFieldValue("corpusEditor"), "Trunz, Erich");
628 assertEquals(fd.getFieldValue("docTitle"),
Akron69b958c2017-02-15 22:49:45 +0100629 "Goethe: Autobiographische Schriften II, (1817-1825, 1832)");
Akron32b95192019-01-11 13:58:55 +0100630 assertNull(fd.getFieldValue("docSubTitle"));
631 assertNull(fd.getFieldValue("docEditor"));
632 assertNull(fd.getFieldValue("docAuthor"));
Akron69b958c2017-02-15 22:49:45 +0100633
634 Krill ks = new Krill(new QueryBuilder("tokens").seg("s:der"));
635 Result kr = ks.apply(ki);
636
637 assertEquals(kr.getTotalResults(), 97);
638 assertEquals(0, kr.getStartIndex());
639 assertEquals(25, kr.getItemsPerPage());
640
641 Match m = kr.getMatch(5);
642 assertEquals("Start page", m.getStartPage(), 529);
643
644 ObjectMapper mapper = new ObjectMapper();
645 JsonNode res = mapper.readTree(m.toJsonString());
646 assertEquals(529, res.at("/pages/0").asInt());
647 };
648
Nils Diewaldafab8f32015-01-26 19:11:32 +0000649
Nils Diewald06368ba2014-11-03 20:53:27 +0000650 @Test
651 public void searchJSONnewJSON2 () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000652 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000653 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000654 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000655 FieldDocument fd = ki.addDoc(1,
656 getClass().getResourceAsStream("/bzk/D59-00089.json.gz"), true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000657 ki.commit();
Nils Diewald06368ba2014-11-03 20:53:27 +0000658
Nils Diewaldafab8f32015-01-26 19:11:32 +0000659 assertEquals(fd.getUID(), 1);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000660 assertEquals(fd.getTextSigle(), "BZK_D59.00089");
661 assertEquals(fd.getDocSigle(), "BZK_D59");
Nils Diewaldafab8f32015-01-26 19:11:32 +0000662 assertEquals(fd.getCorpusSigle(), "BZK");
Akron32b95192019-01-11 13:58:55 +0100663 assertEquals(fd.getFieldValue("title"), "Saragat-Partei zerfällt");
664 assertEquals(fd.getFieldValue("pubDate"), "1959-02-19");
Nils Diewaldafab8f32015-01-26 19:11:32 +0000665
Akron32b95192019-01-11 13:58:55 +0100666 assertNull(fd.getFieldValue("subTitle"));
667 assertNull(fd.getFieldValue("author"));
668 assertNull(fd.getFieldValue("editor"));
669 assertEquals(fd.getFieldValue("pubPlace"), "Berlin");
670 assertNull(fd.getFieldValue("publisher"));
671 assertEquals(fd.getFieldValue("textType"), "Zeitung: Tageszeitung");
672 assertNull(fd.getFieldValue("textTypeArt"));
673 assertEquals(fd.getFieldValue("textTypeRef"), "Tageszeitung");
674 assertEquals(fd.getFieldValue("textDomain"), "Politik");
675 assertEquals(fd.getFieldValue("creationDate"), "1959-02-19");
676 assertEquals(fd.getFieldValue("availability"), "ACA-NC-LC");
677 assertEquals(fd.getFieldValue("textColumn"), "POLITIK");
Akron69b958c2017-02-15 22:49:45 +0100678 // assertNull(fd.getPages());
Akron32b95192019-01-11 13:58:55 +0100679 assertEquals(fd.getFieldValue("textClass"), "politik ausland");
680 assertNull(fd.getFieldValue("fileEditionStatement"));
681 assertNull(fd.getFieldValue("biblEditionStatement"));
Nils Diewaldbb33da22015-03-04 16:24:25 +0000682
Akron32b95192019-01-11 13:58:55 +0100683 assertEquals(fd.getFieldValue("language"), "de");
684 assertEquals(fd.getFieldValue("reference"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000685 "Neues Deutschland, [Tageszeitung], 19.02.1959, Jg. 14,"
686 + " Berliner Ausgabe, S. 7. - Sachgebiet: Politik, "
687 + "Originalressort: POLITIK; Saragat-Partei zerfällt");
Akron32b95192019-01-11 13:58:55 +0100688 assertNull(fd.getFieldValue("publisher"));
689 assertNull(fd.getFieldValue("keywords"));
Nils Diewaldafab8f32015-01-26 19:11:32 +0000690
Akron32b95192019-01-11 13:58:55 +0100691 assertEquals(fd.getFieldValue("tokenSource"), "opennlp#tokens");
Nils Diewaldafab8f32015-01-26 19:11:32 +0000692
Akron32b95192019-01-11 13:58:55 +0100693 assertEquals(fd.getFieldValue("foundries"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000694 "base base/paragraphs base/sentences corenlp "
695 + "corenlp/constituency corenlp/morpho corenlp/namedentities"
696 + " corenlp/sentences glemm glemm/morpho mate mate/morpho"
697 + " opennlp opennlp/morpho opennlp/sentences treetagger"
698 + " treetagger/morpho treetagger/sentences");
Nils Diewaldafab8f32015-01-26 19:11:32 +0000699
Akron32b95192019-01-11 13:58:55 +0100700 assertEquals(fd.getFieldValue("layerInfos"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000701 "base/s=spans corenlp/c=spans corenlp/ne=tokens"
702 + " corenlp/p=tokens corenlp/s=spans glemm/l=tokens"
703 + " mate/l=tokens mate/m=tokens mate/p=tokens"
704 + " opennlp/p=tokens opennlp/s=spans tt/l=tokens"
705 + " tt/p=tokens tt/s=spans");
Nils Diewaldafab8f32015-01-26 19:11:32 +0000706
Akron32b95192019-01-11 13:58:55 +0100707 assertEquals(fd.getFieldValue("corpusTitle"), "Bonner Zeitungskorpus");
708 assertNull(fd.getFieldValue("corpusSubTitle"));
709 assertNull(fd.getFieldValue("corpusAuthor"));
710 assertNull(fd.getFieldValue("corpusEditor"));
Nils Diewaldafab8f32015-01-26 19:11:32 +0000711
Akron32b95192019-01-11 13:58:55 +0100712 assertEquals(fd.getFieldValue("docTitle"), "Neues Deutschland");
713 assertEquals(fd.getFieldValue("docSubTitle"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000714 "Organ des Zentralkomitees der Sozialistischen "
715 + "Einheitspartei Deutschlands");
Akron32b95192019-01-11 13:58:55 +0100716 assertNull(fd.getFieldValue("docEditor"));
717 assertNull(fd.getFieldValue("docAuthor"));
Nils Diewaldbb33da22015-03-04 16:24:25 +0000718
719 Krill ks = new Krill(new QueryBuilder("tokens").seg("mate/m:case:nom")
720 .with("mate/m:number:sg"));
Nils Diewald884dbcf2015-02-27 17:02:28 +0000721 Result kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000722
723 assertEquals(kr.getTotalResults(), 6);
724 assertEquals(0, kr.getStartIndex());
725 assertEquals(25, kr.getItemsPerPage());
Nils Diewald06368ba2014-11-03 20:53:27 +0000726 };
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000727
Nils Diewaldafab8f32015-01-26 19:11:32 +0000728
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000729 @Test
Nils Diewald56dc2582014-11-04 21:33:46 +0000730 public void searchJSONcosmasBoundaryBug () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000731 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000732 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000733 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000734 FieldDocument fd = ki.addDoc(1,
735 getClass().getResourceAsStream("/bzk/D59-00089.json.gz"), true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000736 ki.commit();
Nils Diewald56dc2582014-11-04 21:33:46 +0000737
Eliza Margaretha6f989202016-10-14 21:48:29 +0200738 String json = getJsonString(getClass()
739 .getResource("/queries/bugs/cosmas_boundary.jsonld").getFile());
Nils Diewald56dc2582014-11-04 21:33:46 +0000740
Nils Diewald8904c1d2015-02-26 16:13:18 +0000741 QueryBuilder kq = new QueryBuilder("tokens");
Nils Diewaldbb33da22015-03-04 16:24:25 +0000742 Krill ks = new Krill(kq.focus(1,
Akron4f52a632018-02-09 19:02:40 +0100743 kq.contains(kq.tag("base/s:s"), kq.nr(1, kq.seg("s:Leben")))));
Nils Diewald56dc2582014-11-04 21:33:46 +0000744
Nils Diewald884dbcf2015-02-27 17:02:28 +0000745 Result kr = ks.apply(ki);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000746 assertEquals(kr.getSerialQuery(),
Akrona26184e2018-12-05 15:37:34 +0100747 "focus(1: spanContain(<tokens:base/s:s />, {1: tokens:s:Leben}),sorting)");
margarethaf70addb2015-04-27 13:17:18 +0200748 assertEquals(40, kr.getMatch(0).getStartPos());
749 assertEquals(41, kr.getMatch(0).getEndPos());
750
Eliza Margaretha6f989202016-10-14 21:48:29 +0200751 assertEquals(kr.getMatch(0).getSnippetBrackets(),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000752 "... Initiative\" eine neue politische Gruppierung ins "
Akronf05fde62016-08-03 23:46:17 +0200753 + "[[{1:Leben}]] gerufen hatten. Pressemeldungen zufolge haben sich ...");
Nils Diewald56dc2582014-11-04 21:33:46 +0000754
Nils Diewaldafab8f32015-01-26 19:11:32 +0000755 // Try with high class - don't highlight
Nils Diewaldbb33da22015-03-04 16:24:25 +0000756 ks = new Krill(kq.focus(129,
Akron4f52a632018-02-09 19:02:40 +0100757 kq.contains(kq.tag("base/s:s"), kq.nr(129, kq.seg("s:Leben")))));
Nils Diewald56dc2582014-11-04 21:33:46 +0000758
Nils Diewald3aa9e692015-02-20 22:20:11 +0000759 kr = ks.apply(ki);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000760 assertEquals(kr.getSerialQuery(),
Akrona26184e2018-12-05 15:37:34 +0100761 "focus(129: spanContain(<tokens:base/s:s />, {129: tokens:s:Leben}),sorting)");
Eliza Margaretha6f989202016-10-14 21:48:29 +0200762 assertEquals(kr.getMatch(0).getSnippetBrackets(),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000763 "... Initiative\" eine neue politische Gruppierung ins "
Akronf05fde62016-08-03 23:46:17 +0200764 + "[[Leben]] gerufen hatten. Pressemeldungen zufolge haben sich ...");
Nils Diewald0fa2da22014-11-05 03:31:32 +0000765
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000766 ks = new Krill(json);
Nils Diewald3aa9e692015-02-20 22:20:11 +0000767 kr = ks.apply(ki);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200768 assertEquals(kr.getSerialQuery(),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000769 "focus(129: spanElementDistance({129: tokens:s:Namen}, "
Akrona26184e2018-12-05 15:37:34 +0100770 + "{129: tokens:s:Leben}, [(base/s:s[0:1], notOrdered, notExcluded)]),sorting)");
Eliza Margaretha6f989202016-10-14 21:48:29 +0200771 assertEquals(kr.getMatch(0).getSnippetBrackets(),
Akronf05fde62016-08-03 23:46:17 +0200772 "... ihren Austritt erklärt und unter dem [[Namen \"Einheitsbewegung "
Nils Diewaldbb33da22015-03-04 16:24:25 +0000773 + "der sozialistischen Initiative\" eine neue politische Gruppierung "
Akronf05fde62016-08-03 23:46:17 +0200774 + "ins Leben]] gerufen hatten. Pressemeldungen zufolge haben sich ...");
Nils Diewaldafab8f32015-01-26 19:11:32 +0000775 assertEquals(kr.getTotalResults(), 1);
776 assertEquals(0, kr.getStartIndex());
Nils Diewald56dc2582014-11-04 21:33:46 +0000777 };
778
Nils Diewaldbb33da22015-03-04 16:24:25 +0000779
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000780 @Test
781 public void searchJSONmultipleClassesBug () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000782 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000783 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000784 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000785 ki.addDoc(1, getClass().getResourceAsStream("/bzk/D59-00089.json.gz"),
786 true);
787 ki.addDoc(2, getClass().getResourceAsStream("/bzk/D59-00089.json.gz"),
788 true);
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000789
Nils Diewaldafab8f32015-01-26 19:11:32 +0000790 ki.commit();
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000791
Eliza Margaretha6f989202016-10-14 21:48:29 +0200792 String json = getJsonString(
793 getClass().getResource("/queries/bugs/multiple_classes.jsonld")
794 .getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000795
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000796 Krill ks = new Krill(json);
Nils Diewald884dbcf2015-02-27 17:02:28 +0000797 Result kr = ks.apply(ki);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000798 assertEquals(kr.getSerialQuery(),
799 "{4: spanNext({1: spanNext({2: tokens:s:ins}, "
800 + "{3: tokens:s:Leben})}, tokens:s:gerufen)}");
Eliza Margaretha6f989202016-10-14 21:48:29 +0200801 assertEquals(kr.getMatch(0).getSnippetBrackets(),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000802 "... sozialistischen Initiative\" eine neue politische"
Akronf05fde62016-08-03 23:46:17 +0200803 + " Gruppierung [[{4:{1:{2:ins} {3:Leben}} gerufen}]] hatten. "
Nils Diewaldbb33da22015-03-04 16:24:25 +0000804 + "Pressemeldungen zufolge haben sich in ...");
Nils Diewaldafab8f32015-01-26 19:11:32 +0000805 assertEquals(kr.getTotalResults(), 2);
806 assertEquals(0, kr.getStartIndex());
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000807 };
808
Nils Diewaldbb33da22015-03-04 16:24:25 +0000809
Nils Diewald277e9ce2014-11-06 03:42:11 +0000810 @Test
811 public void searchJSONmultipleClassesBugTokenList () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000812 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000813 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000814 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000815 ki.addDoc(1, getClass().getResourceAsStream("/goe/AGA-03828.json.gz"),
816 true);
817 ki.addDoc(2, getClass().getResourceAsStream("/bzk/D59-00089.json.gz"),
818 true);
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000819
Nils Diewaldafab8f32015-01-26 19:11:32 +0000820 ki.commit();
Nils Diewald277e9ce2014-11-06 03:42:11 +0000821
Eliza Margaretha6f989202016-10-14 21:48:29 +0200822 String json = getJsonString(
823 getClass().getResource("/queries/bugs/multiple_classes.jsonld")
824 .getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000825
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000826 Krill ks = new Krill(json);
Nils Diewald884dbcf2015-02-27 17:02:28 +0000827 Result kr = ks.apply(ki);
Nils Diewald277e9ce2014-11-06 03:42:11 +0000828
Nils Diewaldafab8f32015-01-26 19:11:32 +0000829 ObjectMapper mapper = new ObjectMapper();
830 JsonNode res = mapper.readTree(kr.toTokenListJsonString());
Nils Diewald277e9ce2014-11-06 03:42:11 +0000831
Akrond504f212015-06-20 00:27:54 +0200832 assertEquals(1, res.at("/meta/totalResults").asInt());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200833 assertEquals(
834 "{4: spanNext({1: spanNext({2: tokens:s:ins}, "
835 + "{3: tokens:s:Leben})}, tokens:s:gerufen)}",
Akrond504f212015-06-20 00:27:54 +0200836 res.at("/meta/serialQuery").asText());
837 assertEquals(0, res.at("/meta/startIndex").asInt());
838 assertEquals(25, res.at("/meta/itemsPerPage").asInt());
Nils Diewald277e9ce2014-11-06 03:42:11 +0000839
Nils Diewaldafab8f32015-01-26 19:11:32 +0000840 assertEquals("BZK_D59.00089", res.at("/matches/0/textSigle").asText());
841 assertEquals(328, res.at("/matches/0/tokens/0/0").asInt());
842 assertEquals(331, res.at("/matches/0/tokens/0/1").asInt());
843 assertEquals(332, res.at("/matches/0/tokens/1/0").asInt());
844 assertEquals(337, res.at("/matches/0/tokens/1/1").asInt());
845 assertEquals(338, res.at("/matches/0/tokens/2/0").asInt());
846 assertEquals(345, res.at("/matches/0/tokens/2/1").asInt());
Nils Diewald277e9ce2014-11-06 03:42:11 +0000847 };
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000848
Nils Diewaldafab8f32015-01-26 19:11:32 +0000849
Nils Diewaldb84e7272014-11-07 01:27:38 +0000850 @Test
851 public void searchJSONmultitermRewriteBug () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000852 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000853 KrillIndex ki = new KrillIndex();
Nils Diewaldb84e7272014-11-07 01:27:38 +0000854
Nils Diewaldafab8f32015-01-26 19:11:32 +0000855 assertEquals(ki.numberOf("documents"), 0);
856
857 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000858 FieldDocument fd = ki.addDoc(1,
859 getClass().getResourceAsStream("/bzk/D59-00089.json.gz"), true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000860 ki.commit();
861
862 assertEquals(ki.numberOf("documents"), 1);
863 assertEquals("BZK", fd.getCorpusSigle());
864
865 // [tt/p="A.*"]{0,3}[tt/p="N.*"]
Eliza Margaretha6f989202016-10-14 21:48:29 +0200866 String json = getJsonString(
867 getClass().getResource("/queries/bugs/multiterm_rewrite.jsonld")
868 .getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000869
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000870 Krill ks = new Krill(json);
Nils Diewald2d5f8102015-02-26 21:07:54 +0000871 KrillCollection kc = ks.getCollection();
Nils Diewaldc471b182014-11-19 22:51:15 +0000872
Nils Diewaldafab8f32015-01-26 19:11:32 +0000873 // No index was set
874 assertEquals(-1, kc.numberOf("documents"));
875 kc.setIndex(ki);
Nils Diewaldc471b182014-11-19 22:51:15 +0000876
Nils Diewaldafab8f32015-01-26 19:11:32 +0000877 // Index was set but vc restricted to WPD
878 assertEquals(0, kc.numberOf("documents"));
Nils Diewaldc471b182014-11-19 22:51:15 +0000879
Akron176c9b12015-07-29 19:53:40 +0200880 /*
Nils Diewaldbb33da22015-03-04 16:24:25 +0000881 kc.extend(new CollectionBuilder().or("corpusSigle", "BZK"));
Akron176c9b12015-07-29 19:53:40 +0200882 */
883 CollectionBuilder cb = new CollectionBuilder();
Akron40550172015-08-04 03:06:12 +0200884 kc.fromBuilder(cb.orGroup().with(kc.getBuilder())
885 .with(cb.term("corpusSigle", "BZK")));
Akron176c9b12015-07-29 19:53:40 +0200886
Nils Diewaldafab8f32015-01-26 19:11:32 +0000887 ks.setCollection(kc);
888 assertEquals(1, kc.numberOf("documents"));
Nils Diewald1220e3e2014-11-08 03:18:58 +0000889
Nils Diewald884dbcf2015-02-27 17:02:28 +0000890 Result kr = ks.apply(ki);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000891
892 assertEquals(kr.getSerialQuery(),
893 "spanOr([SpanMultiTermQueryWrapper(tokens:/tt/p:N.*/), "
894 + "spanNext(spanRepetition(SpanMultiTermQueryWrapper"
895 + "(tokens:/tt/p:A.*/){1,3}), "
896 + "SpanMultiTermQueryWrapper(tokens:/tt/p:N.*/))])");
Nils Diewaldb84e7272014-11-07 01:27:38 +0000897
margaretha7f4fd652018-11-22 18:00:02 +0100898 assertEquals(68,kr.getTotalResults());
Nils Diewaldafab8f32015-01-26 19:11:32 +0000899 assertEquals(0, kr.getStartIndex());
Nils Diewald5871e4d2014-11-07 03:48:25 +0000900
Nils Diewaldbb33da22015-03-04 16:24:25 +0000901 assertEquals(kr.getMatch(0).getSnippetBrackets(),
Akronf05fde62016-08-03 23:46:17 +0200902 "[[Saragat-Partei]] zerfällt Rom (ADN) die von dem ...");
Nils Diewaldbb33da22015-03-04 16:24:25 +0000903 assertEquals(kr.getMatch(1).getSnippetBrackets(),
Akronf05fde62016-08-03 23:46:17 +0200904 "[[Saragat-Partei]] zerfällt Rom (ADN) die von dem ...");
Nils Diewaldbb33da22015-03-04 16:24:25 +0000905 assertEquals(kr.getMatch(2).getSnippetBrackets(),
Akronf05fde62016-08-03 23:46:17 +0200906 "Saragat-Partei zerfällt [[Rom]] (ADN) "
Akron43cea662016-02-15 23:43:59 +0100907 + "die von dem Rechtssozialisten Saragat ...");
Nils Diewaldbb33da22015-03-04 16:24:25 +0000908 assertEquals(kr.getMatch(3).getSnippetBrackets(),
Akronf05fde62016-08-03 23:46:17 +0200909 "Saragat-Partei zerfällt Rom ([[ADN]]) "
Akron43cea662016-02-15 23:43:59 +0100910 + "die von dem Rechtssozialisten Saragat geführte ...");
margaretha7f4fd652018-11-22 18:00:02 +0100911 assertEquals("... auseinander, nachdem vor einiger Zeit mehrere "
912 + "[[prominente Mitglieder]] ihren Austritt erklärt "
913 + "und unter dem ...", kr.getMatch(23).getSnippetBrackets());
Nils Diewaldb84e7272014-11-07 01:27:38 +0000914 };
915
916
Nils Diewald56dc2582014-11-04 21:33:46 +0000917 @Test
Akrone4fdce42015-11-13 16:06:10 +0100918 public void searchJSONtokenDistanceSpanBug () throws IOException {
919 // Construct index
920 KrillIndex ki = new KrillIndex();
921 ki.addDoc(1, getClass().getResourceAsStream("/goe/AGX-00002.json"),
Akron42993552016-02-04 13:24:24 +0100922 false);
Akrone4fdce42015-11-13 16:06:10 +0100923 ki.addDoc(2, getClass().getResourceAsStream("/bzk/D59-00089.json.gz"),
Akron42993552016-02-04 13:24:24 +0100924 true);
Akrone4fdce42015-11-13 16:06:10 +0100925 ki.commit();
Akron42993552016-02-04 13:24:24 +0100926
Akrone4fdce42015-11-13 16:06:10 +0100927 // ({1:Sonne []* Erde} | {2: Erde []* Sonne})
Eliza Margaretha6f989202016-10-14 21:48:29 +0200928 String json = getJsonString(getClass()
929 .getResource("/queries/bugs/tokendistancespan_bug.jsonld")
930 .getFile());
Akrone4fdce42015-11-13 16:06:10 +0100931
932 Krill ks = new Krill(json);
933 Result kr = ks.apply(ki);
934 ObjectMapper mapper = new ObjectMapper();
935 JsonNode res = mapper.readTree(kr.toJsonString());
936 assertTrue(res.at("/errors").isMissingNode());
937 };
938
939
940 @Test
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000941 public void searchJSONCollection () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000942 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000943 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000944 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000945 for (String i : new String[] { "00001", "00002", "00003", "00004",
946 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200947 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000948 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000949 };
950 ki.commit();
Eliza Margaretha6f989202016-10-14 21:48:29 +0200951 String json = getJsonString(getClass()
952 .getResource("/queries/metaquery8-nocollection.jsonld")
953 .getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000954
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000955 Krill ks = new Krill(json);
Nils Diewald884dbcf2015-02-27 17:02:28 +0000956 Result kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000957 assertEquals(kr.getTotalResults(), 276);
958 assertEquals(0, kr.getStartIndex());
959 assertEquals(10, kr.getItemsPerPage());
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000960
Eliza Margaretha6f989202016-10-14 21:48:29 +0200961 json = getJsonString(
962 getClass().getResource("/queries/metaquery8.jsonld").getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000963
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000964 ks = new Krill(json);
Nils Diewald3aa9e692015-02-20 22:20:11 +0000965 kr = ks.apply(ki);
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000966
Nils Diewaldafab8f32015-01-26 19:11:32 +0000967 assertEquals(kr.getTotalResults(), 147);
968 assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID());
969 assertEquals(0, kr.getStartIndex());
970 assertEquals(10, kr.getItemsPerPage());
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000971
Eliza Margaretha6f989202016-10-14 21:48:29 +0200972 json = getJsonString(getClass()
973 .getResource("/queries/metaquery8-filtered.jsonld").getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000974
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000975 ks = new Krill(json);
Nils Diewald3aa9e692015-02-20 22:20:11 +0000976 kr = ks.apply(ki);
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000977
Nils Diewaldafab8f32015-01-26 19:11:32 +0000978 assertEquals(kr.getTotalResults(), 28);
979 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
980 assertEquals(0, kr.getStartIndex());
981 assertEquals(10, kr.getItemsPerPage());
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000982
Eliza Margaretha6f989202016-10-14 21:48:29 +0200983 json = getJsonString(getClass()
984 .getResource("/queries/metaquery8-filtered-further.jsonld")
985 .getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000986
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000987 ks = new Krill(json);
Nils Diewald3aa9e692015-02-20 22:20:11 +0000988 kr = ks.apply(ki);
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000989
Nils Diewaldafab8f32015-01-26 19:11:32 +0000990 assertEquals(kr.getTotalResults(), 0);
991 assertEquals(0, kr.getStartIndex());
992 assertEquals(10, kr.getItemsPerPage());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000993
Akron176c9b12015-07-29 19:53:40 +0200994
Eliza Margaretha6f989202016-10-14 21:48:29 +0200995 json = getJsonString(getClass()
996 .getResource("/queries/metaquery8-filtered-nested.jsonld")
997 .getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000998
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000999 ks = new Krill(json);
Nils Diewald3aa9e692015-02-20 22:20:11 +00001000 kr = ks.apply(ki);
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001001
Akron176c9b12015-07-29 19:53:40 +02001002 /*
Nils Diewaldbb33da22015-03-04 16:24:25 +00001003 assertEquals("filter with QueryWrapperFilter("
1004 + "+(ID:WPD_AAA.00003 (+tokens:s:die"
1005 + " +tokens:s:Schriftzeichen)))",
1006 ks.getCollection().getFilter(1).toString());
Akron176c9b12015-07-29 19:53:40 +02001007 */
Akron40550172015-08-04 03:06:12 +02001008 assertEquals(
1009 "AndGroup(OrGroup(ID:WPD_AAA.00001 ID:WPD_AAA.00002) OrGroup(ID:WPD_AAA.00003 AndGroup(tokens:s:die tokens:s:Schriftzeichen)))",
1010 ks.getCollection().toString());
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001011
Nils Diewaldafab8f32015-01-26 19:11:32 +00001012 assertEquals(kr.getTotalResults(), 119);
1013 assertEquals(0, kr.getStartIndex());
1014 assertEquals(10, kr.getItemsPerPage());
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001015 };
1016
Nils Diewald1e5d5942014-05-20 13:29:53 +00001017
1018 @Test
1019 public void searchJSONSentenceContext () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +00001020 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +00001021 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +00001022 // Indexing test files
Akron42993552016-02-04 13:24:24 +01001023 for (String i : new String[] { "00001", "00002", "00003", "00004",
Nils Diewaldbb33da22015-03-04 16:24:25 +00001024 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +02001025 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +00001026 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +00001027 };
1028 ki.commit();
Nils Diewald1e5d5942014-05-20 13:29:53 +00001029
Eliza Margaretha6f989202016-10-14 21:48:29 +02001030 String json = getJsonString(getClass()
1031 .getResource("/queries/bsp-context-2.jsonld").getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001032
Nils Diewaldbbd39a52015-02-23 19:56:57 +00001033 Krill ks = new Krill(json);
Nils Diewaldf5ab4b22015-02-25 20:55:16 +00001034 ks.getMeta().setCutOff(false);
1035 SearchContext sc = ks.getMeta().getContext();
Nils Diewaldafab8f32015-01-26 19:11:32 +00001036 sc.left.setLength((short) 10);
1037 sc.right.setLength((short) 10);
Nils Diewaldbb33da22015-03-04 16:24:25 +00001038
Nils Diewald884dbcf2015-02-27 17:02:28 +00001039 Result kr = ks.apply(ki);
Akronfd05f502015-07-30 18:34:26 +02001040
Nils Diewaldbb33da22015-03-04 16:24:25 +00001041 assertEquals(kr.getMatch(1).getSnippetBrackets(),
Akronf05fde62016-08-03 23:46:17 +02001042 "... dezimalen [[Wert]] 65 sowohl ...");
Nils Diewaldafab8f32015-01-26 19:11:32 +00001043 assertEquals(kr.getTotalResults(), 3);
1044 assertEquals(0, kr.getStartIndex());
1045 assertEquals(25, kr.getItemsPerPage());
Akron499c94c2016-02-04 13:13:43 +01001046
Eliza Margaretha6f989202016-10-14 21:48:29 +02001047 assertFalse(
1048 kr.getContext().toJsonNode().toString().equals("\"base/s:s\""));
Nils Diewald1e5d5942014-05-20 13:29:53 +00001049
Eliza Margaretha6f989202016-10-14 21:48:29 +02001050 json = getJsonString(getClass()
1051 .getResource("/queries/bsp-context-sentence.jsonld").getFile());
Nils Diewald1e5d5942014-05-20 13:29:53 +00001052
Nils Diewaldbbd39a52015-02-23 19:56:57 +00001053 kr = new Krill(json).apply(ki);
Akron43cea662016-02-15 23:43:59 +01001054 assertEquals(kr.getContext().toJsonNode().toString(), "\"base/s:s\"");
Akron499c94c2016-02-04 13:13:43 +01001055
Nils Diewaldbb33da22015-03-04 16:24:25 +00001056 assertEquals(kr.getMatch(0).getSnippetBrackets(),
Akronf05fde62016-08-03 23:46:17 +02001057 "steht a für den dezimalen [[Wert]] 97 sowohl im ASCII-"
Nils Diewaldbb33da22015-03-04 16:24:25 +00001058 + " als auch im Unicode-Zeichensatz");
1059 assertEquals(kr.getMatch(1).getSnippetBrackets(),
Akronf05fde62016-08-03 23:46:17 +02001060 "steht A für den dezimalen [[Wert]] 65 sowohl im ASCII-"
Nils Diewaldbb33da22015-03-04 16:24:25 +00001061 + " als auch im Unicode-Zeichensatz");
1062 assertEquals(kr.getMatch(2).getSnippetBrackets(),
1063 "In einem Zahlensystem mit einer Basis größer "
1064 + "als 10 steht A oder a häufig für den dezimalen"
Akronf05fde62016-08-03 23:46:17 +02001065 + " [[Wert]] 10, siehe auch Hexadezimalsystem.");
Nils Diewald1e5d5942014-05-20 13:29:53 +00001066 };
1067
1068
Nils Diewald2276e1c2014-04-10 15:01:59 +00001069 @Test
Nils Diewald54187632014-06-11 14:39:29 +00001070 public void searchJSONbug () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +00001071 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +00001072 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +00001073 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +00001074 for (String i : new String[] { "00001", "00002", "00003", "00004",
1075 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +02001076 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +00001077 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +00001078 };
1079 ki.commit();
Nils Diewald54187632014-06-11 14:39:29 +00001080
Eliza Margaretha6f989202016-10-14 21:48:29 +02001081 String json = getJsonString(
1082 getClass().getResource("/queries/bsp-bug.jsonld").getFile());
Nils Diewald54187632014-06-11 14:39:29 +00001083
Nils Diewald884dbcf2015-02-27 17:02:28 +00001084 Result kr = new Krill(json).apply(ki);
Nils Diewaldc471b182014-11-19 22:51:15 +00001085
Nils Diewaldbb33da22015-03-04 16:24:25 +00001086 assertEquals(kr.getError(0).getMessage(),
1087 "Operation needs operand list");
Nils Diewald54187632014-06-11 14:39:29 +00001088 };
1089
Akronf9def5e2016-10-10 21:26:46 +02001090
1091 @Test
Akronf785dae2016-08-10 17:12:40 +02001092 public void searchJSONdistanceWithRegexesBug () throws IOException {
1093 // Construct index
1094 KrillIndex ki = new KrillIndex();
1095 // Indexing test files
1096 for (String i : new String[] { "00001" }) {
Akronf9def5e2016-10-10 21:26:46 +02001097 // , "00002", "00003", "00004", "00005", "00006", "02439"
Eliza Margaretha6f989202016-10-14 21:48:29 +02001098 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Akronf785dae2016-08-10 17:12:40 +02001099 true);
1100 };
1101 ki.commit();
1102
Akronf9def5e2016-10-10 21:26:46 +02001103 // "der" []{2,3} [opennlp/p="NN"]
Eliza Margaretha6f989202016-10-14 21:48:29 +02001104 String json = getJsonString(getClass()
1105 .getResource("/queries/bugs/distances_with_regex_bug.jsonld")
1106 .getFile());
Akronf785dae2016-08-10 17:12:40 +02001107
1108 Result kr = new Krill(json).apply(ki);
1109
Eliza Margaretha6f989202016-10-14 21:48:29 +02001110 assertEquals(kr.getMatch(0).getSnippetBrackets(),
Akronf9def5e2016-10-10 21:26:46 +02001111 "Mit Ausnahme von Fremdwörtern und Namen ist das A der einzige Buchstabe im Deutschen, [[der zweifach am Anfang]] eines Wortes stehen darf, etwa im Wort Aal.");
Akronf785dae2016-08-10 17:12:40 +02001112
1113 };
1114
Nils Diewaldafab8f32015-01-26 19:11:32 +00001115
Nils Diewaldef7124e2014-11-12 20:08:13 +00001116 /**
1117 * This is a breaking test for #179
1118 */
1119 @Test
1120 public void searchJSONexpansionBug () throws IOException {
Nils Diewaldbb33da22015-03-04 16:24:25 +00001121 // Construct index
1122 KrillIndex ki = new KrillIndex();
1123 // Indexing test files
1124 ki.addDoc(getClass().getResourceAsStream("/wiki/00002.json.gz"), true);
1125 ki.commit();
1126
1127 // Expansion bug
1128 // der alte Digraph Aa durch Å
Eliza Margaretha6f989202016-10-14 21:48:29 +02001129 String json = getJsonString(getClass()
1130 .getResource("/queries/bugs/expansion_bug_2.jsonld").getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001131
1132 Result kr = new Krill(json).apply(ki);
Eliza Margaretha6f989202016-10-14 21:48:29 +02001133 assertEquals(
1134 "... Buchstabe des Alphabetes. In Dänemark ist "
1135 + "[[der alte Digraph Aa durch Å]] ersetzt worden, "
1136 + "in Eigennamen und Ortsnamen ...",
1137 kr.getMatch(0).getSnippetBrackets());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001138 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
1139 assertEquals(kr.getTotalResults(), 1);
1140
Akron9f1a55b2016-04-20 19:11:06 +02001141
1142 // TODO: base/s:t needs to be defined!!!
1143 QueryBuilder qb = new QueryBuilder("tokens");
1144 kr = new Krill(qb.tag("base/s:t")).apply(ki);
Akron9f1a55b2016-04-20 19:11:06 +02001145 assertEquals(kr.getTotalResults(), 1);
1146
1147
Nils Diewaldbb33da22015-03-04 16:24:25 +00001148 // der alte Digraph Aa durch []
1149 // Works with one document
Eliza Margaretha6f989202016-10-14 21:48:29 +02001150 json = getJsonString(getClass()
1151 .getResource("/queries/bugs/expansion_bug.jsonld").getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001152
1153 kr = new Krill(json).apply(ki);
1154
Akron9f1a55b2016-04-20 19:11:06 +02001155 // focus(254: spanContain(<tokens:base/s:t />, {254: spanNext(spanNext(spanNext(spanNext(tokens:s:der, tokens:s:alte), tokens:s:Digraph), tokens:s:Aa), spanExpansion(tokens:s:durch, []{1, 1}, right))}))
1156
Eliza Margaretha6f989202016-10-14 21:48:29 +02001157 assertEquals(
1158 "... Buchstabe des Alphabetes. In Dänemark ist "
1159 + "[[der alte Digraph Aa durch Å]] ersetzt worden, "
1160 + "in Eigennamen und Ortsnamen ...",
1161 kr.getMatch(0).getSnippetBrackets());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001162 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
1163 assertEquals(kr.getTotalResults(), 1);
1164
1165 // Now try with one file ahead
1166 ki = new KrillIndex();
1167 for (String i : new String[] { "00001", "00002" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +02001168 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +00001169 true);
1170 };
1171 ki.commit();
1172
1173 // Expansion bug
1174 // der alte Digraph Aa durch Å
Eliza Margaretha6f989202016-10-14 21:48:29 +02001175 json = getJsonString(getClass()
1176 .getResource("/queries/bugs/expansion_bug_2.jsonld").getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001177
1178 kr = new Krill(json).apply(ki);
1179
Eliza Margaretha6f989202016-10-14 21:48:29 +02001180 assertEquals(
1181 "... Buchstabe des Alphabetes. In Dänemark ist "
1182 + "[[der alte Digraph Aa durch Å]] ersetzt worden, "
1183 + "in Eigennamen und Ortsnamen ...",
1184 kr.getMatch(0).getSnippetBrackets());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001185 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
1186 assertEquals(kr.getTotalResults(), 1);
1187
1188 // der alte Digraph Aa durch []
Eliza Margaretha6f989202016-10-14 21:48:29 +02001189 json = getJsonString(getClass()
1190 .getResource("/queries/bugs/expansion_bug.jsonld").getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001191
1192 kr = new Krill(json).apply(ki);
Eliza Margaretha6f989202016-10-14 21:48:29 +02001193 assertEquals(
1194 "... Buchstabe des Alphabetes. In Dänemark ist "
1195 + "[[der alte Digraph Aa durch Å]] ersetzt worden, "
1196 + "in Eigennamen und Ortsnamen ...",
1197 kr.getMatch(0).getSnippetBrackets());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001198 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
1199 assertEquals(kr.getTotalResults(), 1);
Nils Diewaldef7124e2014-11-12 20:08:13 +00001200 };
Akron8abefa12016-02-13 05:35:42 +01001201
Akronf9def5e2016-10-10 21:26:46 +02001202
1203 @Test
Akrondfc93572016-08-10 19:01:34 +02001204 public void queryJSONzeroRepetitionBug () throws IOException {
Akronf9def5e2016-10-10 21:26:46 +02001205 // der{0}
1206 KrillIndex ki = new KrillIndex();
1207 ki.addDoc(getClass().getResourceAsStream("/wiki/00001.json.gz"), true);
1208 ki.commit();
Akrondfc93572016-08-10 19:01:34 +02001209
Eliza Margaretha6f989202016-10-14 21:48:29 +02001210 String json = getJsonString(getClass()
1211 .getResource("/queries/bugs/zero_repetition_bug.jsonld")
1212 .getFile());
Akrondfc93572016-08-10 19:01:34 +02001213
Akronf9def5e2016-10-10 21:26:46 +02001214 Result kr = new Krill(json).apply(ki);
1215
1216 assertEquals(783, kr.getError(0).getCode());
Eliza Margaretha6f989202016-10-14 21:48:29 +02001217 assertEquals("This query can't match anywhere",
1218 kr.getError(0).getMessage());
Akronf9def5e2016-10-10 21:26:46 +02001219 };
1220
Akron13db6152016-02-19 14:08:38 +01001221
Akron8abefa12016-02-13 05:35:42 +01001222 /**
Akron13db6152016-02-19 14:08:38 +01001223 * This is a Schreibgebrauch ressource that didn't work for
1224 * element queries.
Akron8abefa12016-02-13 05:35:42 +01001225 */
1226 @Test
1227 public void searchSchreibgebrauchData () throws IOException {
1228 // Construct index
1229 KrillIndex ki = new KrillIndex();
1230 // Indexing test files
Eliza Margaretha6f989202016-10-14 21:48:29 +02001231 ki.addDoc(
1232 getClass().getResourceAsStream("/sgbr/BSP-2013-01-32.json.gz"),
1233 true);
Akron8abefa12016-02-13 05:35:42 +01001234 ki.commit();
1235
1236 Krill k = new Krill(new QueryBuilder("tokens").tag("base/s:s"));
1237
Akron13db6152016-02-19 14:08:38 +01001238 assertEquals(k.getSpanQuery().toString(), "<tokens:base/s:s />");
Akron8abefa12016-02-13 05:35:42 +01001239
1240 Result kr = k.apply(ki);
1241 assertEquals(kr.getTotalResults(), 1);
1242 assertEquals(kr.getMatch(0).getSnippetBrackets(),
Akronf05fde62016-08-03 23:46:17 +02001243 "[[Selbst ist der Jeck]]");
Akron8abefa12016-02-13 05:35:42 +01001244
1245 assertEquals(kr.getMatch(0).getTextSigle(), "PRO-DUD_BSP-2013-01.32");
1246 };
1247
Akron2ea48e62017-04-28 20:23:30 +02001248
1249 /**
1250 * This is a Schreibgebrauch ressource that didn't work for
1251 * element queries.
1252 */
1253 @Test
1254 public void searchNewDeReKoData () throws IOException {
1255 // Construct index
1256 KrillIndex ki = new KrillIndex();
1257 // Indexing test files
1258 // Indexing test files
1259 FieldDocument fd = ki.addDoc(1,
1260 getClass().getResourceAsStream("/goe/AGA-03828-new.json.gz"),
1261 true);
1262 ki.commit();
1263
1264 assertEquals(fd.getUID(), 1);
1265 assertEquals(fd.getTextSigle(), "GOE/AGA/03828");
1266 assertEquals(fd.getDocSigle(), "GOE/AGA");
1267 assertEquals(fd.getCorpusSigle(), "GOE");
Akron32b95192019-01-11 13:58:55 +01001268 assertEquals(fd.getFieldValue("title"), "Autobiographische Einzelheiten");
1269 assertNull(fd.getFieldValue("subTitle"));
1270 assertEquals(fd.getFieldValue("textType"), "Autobiographie");
1271 assertNull(fd.getFieldValue("textTypeArt"));
1272 assertNull(fd.getFieldValue("textTypeRef"));
1273 assertNull(fd.getFieldValue("textColumn"));
1274 assertNull(fd.getFieldValue("textDomain"));
Akron2ea48e62017-04-28 20:23:30 +02001275 // assertEquals(fd.getPages(), "529-547");
Akron32b95192019-01-11 13:58:55 +01001276 assertEquals(fd.getFieldValue("availability"), "QAO-NC");
1277 assertEquals(fd.getFieldValue("creationDate"), "1820");
1278 assertEquals(fd.getFieldValue("pubDate"), "1982");
1279 assertEquals(fd.getFieldValue("author"), "Goethe, Johann Wolfgang von");
1280 assertNull(fd.getFieldValue("textClass"));
1281 assertEquals(fd.getFieldValue("language"), "de");
1282 assertEquals(fd.getFieldValue("pubPlace"), "München");
1283 assertEquals(fd.getFieldValue("reference"),
Akron2ea48e62017-04-28 20:23:30 +02001284 "Goethe, Johann Wolfgang von:"
1285 + " Autobiographische Einzelheiten,"
1286 + " (Geschrieben bis 1832), In: Goethe,"
1287 + " Johann Wolfgang von: Goethes Werke,"
1288 + " Bd. 10, Autobiographische Schriften"
1289 + " II, Hrsg.: Trunz, Erich. München: "
1290 + "Verlag C. H. Beck, 1982, S. 529-547");
Akron32b95192019-01-11 13:58:55 +01001291 assertEquals(fd.getFieldValue("publisher"), "Verlag C. H. Beck");
1292 assertNull(fd.getFieldValue("editor"));
1293 assertNull(fd.getFieldValue("fileEditionStatement"));
1294 assertNull(fd.getFieldValue("biblEditionStatement"));
1295 assertNull(fd.getFieldValue("keywords"));
Akron2ea48e62017-04-28 20:23:30 +02001296
Akron32b95192019-01-11 13:58:55 +01001297 assertEquals(fd.getFieldValue("tokenSource"), "base#tokens");
1298 assertEquals(fd.getFieldValue("foundries"),
Akron2ea48e62017-04-28 20:23:30 +02001299 "corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure dereko/structure/base-sentences-paragraphs-pagebreaks malt malt/dependency marmot marmot/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho");
Akron32b95192019-01-11 13:58:55 +01001300 assertEquals(fd.getFieldValue("layerInfos"),
Akron2ea48e62017-04-28 20:23:30 +02001301 "corenlp/c=spans corenlp/p=tokens corenlp/s=spans dereko/s=spans malt/d=rels marmot/m=tokens marmot/p=tokens opennlp/p=tokens opennlp/s=spans tt/l=tokens tt/p=tokens");
1302
Akron32b95192019-01-11 13:58:55 +01001303 assertEquals(fd.getFieldValue("corpusTitle"), "Goethes Werke");
1304 assertNull(fd.getFieldValue("corpusSubTitle"));
1305 assertEquals(fd.getFieldValue("corpusAuthor"), "Goethe, Johann Wolfgang von");
1306 assertEquals(fd.getFieldValue("corpusEditor"), "Trunz, Erich");
1307 assertEquals(fd.getFieldValue("docTitle"),
Akron2ea48e62017-04-28 20:23:30 +02001308 "Goethe: Autobiographische Schriften II, (1817-1825, 1832)");
Akron32b95192019-01-11 13:58:55 +01001309 assertNull(fd.getFieldValue("docSubTitle"));
1310 assertNull(fd.getFieldValue("docEditor"));
1311 assertNull(fd.getFieldValue("docAuthor"));
Akron2ea48e62017-04-28 20:23:30 +02001312
1313 Krill ks = new Krill(new QueryBuilder("tokens").seg("marmot/m:case:nom")
1314 .with("marmot/m:number:pl"));
1315 Result kr = ks.apply(ki);
1316
1317 assertEquals(kr.getTotalResults(), 141);
1318 assertEquals(0, kr.getStartIndex());
1319 assertEquals(25, kr.getItemsPerPage());
1320 };
1321
Akron70ce0c02018-05-25 23:44:26 +02001322 @Test
1323 public void searchLongMatch () throws IOException {
1324
1325 // Construct index
1326 KrillIndex ki = new KrillIndex();
1327 // Indexing test files
1328 ki.addDoc(
1329 getClass().getResourceAsStream("/goe/AGX-00002.json"),
1330 false);
1331 ki.commit();
1332
1333 Krill k = new Krill(new QueryBuilder("tokens").tag("xy/z:long"));
1334
1335 assertEquals(k.getSpanQuery().toString(), "<tokens:xy/z:long />");
1336
1337 Result kr = k.apply(ki);
1338 assertEquals(kr.getTotalResults(), 1);
1339 assertEquals(2, kr.getMatch(0).getStartPos());
1340 assertEquals(52, kr.getMatch(0).getEndPos());
1341 assertEquals(kr.getMatch(0).getSnippetBrackets(),
1342 "Maximen und [[Reflexionen Religion und Christentum. wir sind naturforschend Pantheisten, dichtend Polytheisten, sittlich Monotheisten. Gott, wenn wir hoch stehen, ist alles; stehen wir niedrig, so ist er ein Supplement unsrer Armseligkeit. die Kreatur ist sehr schwach; denn sucht sie etwas, findet sie's nicht. stark aber ist Gott; denn sucht er die Kreatur]<!>], so hat er sie gleich in ...");
1343 assertEquals(kr.getMatch(0).getSnippetHTML(),
1344 "<span class=\"context-left\">Maximen und </span><span class=\"match\"><mark>Reflexionen Religion und Christentum. wir sind naturforschend Pantheisten, dichtend Polytheisten, sittlich Monotheisten. Gott, wenn wir hoch stehen, ist alles; stehen wir niedrig, so ist er ein Supplement unsrer Armseligkeit. die Kreatur ist sehr schwach; denn sucht sie etwas, findet sie's nicht. stark aber ist Gott; denn sucht er die Kreatur</mark><span class=\"cutted\"></span></span><span class=\"context-right\">, so hat er sie gleich in<span class=\"more\"></span></span>");
1345 assertEquals(kr.getMatch(0).getTextSigle(), "GOE_AGX.00002");
1346 };
1347
Nils Diewaldc925b492013-12-03 23:56:10 +00001348};