blob: fe4adcc6451233e3bbb3b843ca295edd8bbafb6f [file] [log] [blame]
Eliza Margaretha6a780692014-01-15 09:45:42 +00001package de.ids_mannheim.korap.search;
2
Eliza Margaretha805e27f2016-10-14 21:39:42 +02003import static de.ids_mannheim.korap.TestSimple.getJsonString;
margarethaf70addb2015-04-27 13:17:18 +02004import static org.junit.Assert.assertEquals;
Akron001dab32015-07-02 12:30:15 +02005import static org.junit.Assert.assertTrue;
margarethaf70addb2015-04-27 13:17:18 +02006import static org.junit.Assert.assertFalse;
7import static org.junit.Assert.assertNull;
Nils Diewaldc925b492013-12-03 23:56:10 +00008
margarethaf70addb2015-04-27 13:17:18 +02009import java.io.IOException;
10import java.util.HashMap;
Nils Diewald56dc2582014-11-04 21:33:46 +000011
Nils Diewaldc925b492013-12-03 23:56:10 +000012import org.junit.Test;
Akron176c9b12015-07-29 19:53:40 +020013import org.junit.Ignore;
Nils Diewaldc925b492013-12-03 23:56:10 +000014import org.junit.runner.RunWith;
15import org.junit.runners.JUnit4;
16
margarethaf70addb2015-04-27 13:17:18 +020017import com.fasterxml.jackson.databind.JsonNode;
18import com.fasterxml.jackson.databind.ObjectMapper;
19
20import de.ids_mannheim.korap.Krill;
21import de.ids_mannheim.korap.KrillCollection;
22import de.ids_mannheim.korap.KrillIndex;
23import de.ids_mannheim.korap.KrillMeta;
24import de.ids_mannheim.korap.collection.CollectionBuilder;
25import de.ids_mannheim.korap.index.FieldDocument;
26import de.ids_mannheim.korap.query.QueryBuilder;
margarethaf70addb2015-04-27 13:17:18 +020027import de.ids_mannheim.korap.response.Result;
Akron69b958c2017-02-15 22:49:45 +010028import de.ids_mannheim.korap.response.Match;
margarethaf70addb2015-04-27 13:17:18 +020029import de.ids_mannheim.korap.response.SearchContext;
30
Nils Diewaldc925b492013-12-03 23:56:10 +000031@RunWith(JUnit4.class)
Nils Diewaldbbd39a52015-02-23 19:56:57 +000032public class TestKrill {
Akronb82cf892019-08-28 11:00:02 +020033
Nils Diewaldc925b492013-12-03 23:56:10 +000034 @Test
35 public void searchCount () {
Nils Diewaldbb33da22015-03-04 16:24:25 +000036 Krill k = new Krill(new QueryBuilder("field1").seg("a").with("b"));
Nils Diewaldf5ab4b22015-02-25 20:55:16 +000037
38 KrillMeta meta = k.getMeta();
39
Nils Diewaldafab8f32015-01-26 19:11:32 +000040 // Count:
Nils Diewaldf5ab4b22015-02-25 20:55:16 +000041 meta.setCount(30);
42 assertEquals(meta.getCount(), 30);
43 meta.setCount(20);
44 assertEquals(meta.getCount(), 20);
45 meta.setCount(-50);
46 assertEquals(meta.getCount(), 20);
47 meta.setCount(500);
48 assertEquals(meta.getCount(), meta.getCountMax());
Akronb82cf892019-08-28 11:00:02 +020049 meta.setCount(0);
50 assertEquals(meta.getCount(), 0);
Nils Diewaldc925b492013-12-03 23:56:10 +000051 };
52
53 @Test
54 public void searchStartIndex () {
Nils Diewaldbb33da22015-03-04 16:24:25 +000055 Krill k = new Krill(new QueryBuilder("field1").seg("a").with("b"));
Nils Diewaldf5ab4b22015-02-25 20:55:16 +000056
57 KrillMeta meta = k.getMeta();
58
Nils Diewaldafab8f32015-01-26 19:11:32 +000059 // startIndex
Nils Diewaldf5ab4b22015-02-25 20:55:16 +000060 meta.setStartIndex(5);
61 assertEquals(meta.getStartIndex(), 5);
62 meta.setStartIndex(1);
63 assertEquals(meta.getStartIndex(), 1);
64 meta.setStartIndex(0);
65 assertEquals(meta.getStartIndex(), 0);
66 meta.setStartIndex(70);
67 assertEquals(meta.getStartIndex(), 70);
68 meta.setStartIndex(-5);
69 assertEquals(meta.getStartIndex(), 0);
Nils Diewaldc925b492013-12-03 23:56:10 +000070 };
71
Nils Diewaldbb33da22015-03-04 16:24:25 +000072
Nils Diewaldc925b492013-12-03 23:56:10 +000073 @Test
74 public void searchQuery () {
Nils Diewaldbb33da22015-03-04 16:24:25 +000075 Krill ks = new Krill(new QueryBuilder("field1").seg("a").with("b"));
Nils Diewaldafab8f32015-01-26 19:11:32 +000076 // query
Nils Diewaldbb33da22015-03-04 16:24:25 +000077 assertEquals(ks.getSpanQuery().toString(),
78 "spanSegment(field1:a, field1:b)");
Nils Diewaldc925b492013-12-03 23:56:10 +000079 };
80
Nils Diewaldafab8f32015-01-26 19:11:32 +000081
Nils Diewaldc925b492013-12-03 23:56:10 +000082 @Test
83 public void searchIndex () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +000084 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +000085 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +000086 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +000087 for (String i : new String[] { "00001", "00002", "00003", "00004",
88 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +020089 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +000090 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +000091 };
92 ki.commit();
Nils Diewaldc925b492013-12-03 23:56:10 +000093
Nils Diewaldbb33da22015-03-04 16:24:25 +000094 Krill ks = new Krill(new QueryBuilder("tokens").seg("s:Buchstaben"));
Nils Diewaldf5ab4b22015-02-25 20:55:16 +000095
Akron176c9b12015-07-29 19:53:40 +020096 CollectionBuilder cb = new CollectionBuilder();
97
98 ks.getCollection().fromBuilder(cb.term("textClass", "reisen"));
Nils Diewaldf5ab4b22015-02-25 20:55:16 +000099
100 KrillMeta meta = ks.getMeta();
101 meta.setCount(3);
102 meta.setStartIndex(5);
103 meta.getContext().left.setLength(1);
104 meta.getContext().right.setLength(1);
Akrond475d992021-11-23 18:39:47 +0100105 assertTrue(meta.hasSnippets());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000106
Nils Diewald884dbcf2015-02-27 17:02:28 +0000107 Result kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000108 assertEquals(kr.getTotalResults(), 6);
Akronb82cf892019-08-28 11:00:02 +0200109 assertEquals(kr.getMatches().size(), 1);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000110 assertEquals(kr.getMatch(0).getSnippetBrackets(),
Akronf05fde62016-08-03 23:46:17 +0200111 "... dem [[Buchstaben]] A ...");
Nils Diewaldf5ab4b22015-02-25 20:55:16 +0000112
113 JsonNode res = ks.toJsonNode();
Akrond475d992021-11-23 18:39:47 +0100114
Nils Diewaldf5ab4b22015-02-25 20:55:16 +0000115 assertEquals(3, res.at("/meta/count").asInt());
116 assertEquals(5, res.at("/meta/startIndex").asInt());
117 assertEquals("token", res.at("/meta/context/left/0").asText());
118 assertEquals(1, res.at("/meta/context/left/1").asInt());
119 assertEquals("token", res.at("/meta/context/right/0").asText());
120 assertEquals(1, res.at("/meta/context/right/1").asInt());
Akrond475d992021-11-23 18:39:47 +0100121 assertTrue(res.at("/matches/0/snippet").isMissingNode());
122 assertTrue(res.at("/matches/0/tokens").isMissingNode());
123
124 res = kr.toJsonNode();
125
126 assertFalse(res.at("/matches/0/snippet").isMissingNode());
127 assertTrue(res.at("/matches/0/tokens").isMissingNode());
128
Akronb82cf892019-08-28 11:00:02 +0200129
130 // Handle count=0 correctly
131 meta = ks.getMeta();
132 meta.setCount(0);
Akrond475d992021-11-23 18:39:47 +0100133
Akronb82cf892019-08-28 11:00:02 +0200134 kr = ks.apply(ki);
135 assertEquals(kr.getTotalResults(), 6);
136 assertEquals(kr.getItemsPerPage(), 0);
137 assertEquals(kr.getMatches().size(), 0);
Akrond475d992021-11-23 18:39:47 +0100138
Akron5a748962024-09-24 16:34:14 +0200139 // Handle count=0 correctly
140 meta = ks.getMeta();
141 meta.setCount(0);
142 meta.setCutOff(true);
143
144 kr = ks.apply(ki);
145 assertEquals(kr.getTotalResults(), -1);
146 assertEquals(kr.getItemsPerPage(), 0);
147 assertEquals(kr.getMatches().size(), 0);
148
Akrond475d992021-11-23 18:39:47 +0100149 // Handle tokens=true and
150 // snippet=false correctly
151 meta = ks.getMeta();
Akron5a748962024-09-24 16:34:14 +0200152 meta.setCutOff(false);
Akrond475d992021-11-23 18:39:47 +0100153 meta.setCount(1);
154 meta.setTokens(true);
155 meta.setSnippets(false);
156
157 kr = ks.apply(ki);
158 assertEquals(kr.getTotalResults(), 6);
159 assertEquals(kr.getMatches().size(), 1);
160
161 res = kr.toJsonNode();
162
163 assertFalse(res.at("/matches/0/hasSnippet").asBoolean());
164 assertTrue(res.at("/matches/0/hasTokens").asBoolean());
165 assertTrue(res.at("/matches/0/snippet").isMissingNode());
166 assertEquals("dem", res.at("/matches/0/tokens/left/0").asText());
167 assertEquals("Buchstaben", res.at("/matches/0/tokens/match/0").asText());
Akron4bc5c462023-07-17 16:36:17 +0200168
169 // The test-data is old and therefore precedes the correct testfolding.
170 // However, we can check the correct behaviour nonetheless.
171 String json = "{\"query\":{\"@type\":\"koral:token\",\"wrap\":{\"@type\":\"koral:term\",\"flags\": [\"flags:caseInsensitive\"],\"key\": \"Grรถsstenteils\",\"layer\":\"orth\",\"match\": \"match:eq\"}}}";
172
173 ObjectMapper mapper = new ObjectMapper();
174
175 ks = new Krill(json);
176 kr = ks.apply(ki);
177 assertEquals(kr.getTotalResults(), 0);
178 assertEquals(kr.getItemsPerPage(), 25);
179 assertEquals(kr.getMatches().size(), 0);
180
181 res = mapper.readTree(kr.toJsonString());
182 assertEquals(res.at("/meta/serialQuery").asText(),"tokens:i:grรถsstenteils");
183
184 json = "{\"query\":{\"@type\":\"koral:token\",\"wrap\":{\"@type\":\"koral:term\",\"flags\": [\"flags:caseInsensitive\"],\"key\": \"GrรถรŸtenteils\",\"layer\":\"orth\",\"match\": \"match:eq\"}}}";
185
186 ks = new Krill(json);
187 kr = ks.apply(ki);
188
189 assertEquals(kr.getTotalResults(), 2);
190 assertEquals(kr.getItemsPerPage(), 25);
191 assertEquals(kr.getMatches().size(), 2);
192
193 res = mapper.readTree(kr.toJsonString());
194 assertEquals(res.at("/meta/serialQuery").asText(),
195 "spanOr([tokens:i:grรถsstenteils, tokens:i:grรถรŸtenteils])");
Nils Diewaldc925b492013-12-03 23:56:10 +0000196 };
Nils Diewaldc6b78752013-12-05 19:05:12 +0000197
Nils Diewaldafab8f32015-01-26 19:11:32 +0000198
Nils Diewaldc6b78752013-12-05 19:05:12 +0000199 @Test
200 public void searchJSON () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000201 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000202 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000203 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000204 for (String i : new String[] { "00001", "00002", "00003", "00004",
205 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200206 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000207 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000208 };
209 ki.commit();
Nils Diewaldc6b78752013-12-05 19:05:12 +0000210
Eliza Margaretha6f989202016-10-14 21:48:29 +0200211 String json = getJsonString(
212 getClass().getResource("/queries/metaquery3.jsonld").getFile());
Nils Diewaldc6b78752013-12-05 19:05:12 +0000213
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000214 Krill ks = new Krill(json);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000215 Result kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000216 assertEquals(kr.getTotalResults(), 66);
217 assertEquals(5, kr.getItemsPerPage());
218 assertEquals(5, kr.getStartIndex());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200219 assertEquals("... a: A ist [[der klangreichste]] der V ...",
220 kr.getMatch(0).getSnippetBrackets());
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000221 };
Nils Diewald01b4ce32013-12-05 22:39:25 +0000222
Nils Diewaldbb33da22015-03-04 16:24:25 +0000223
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000224 @Test
225 public void searchJSON2 () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000226 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000227 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000228 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000229 for (String i : new String[] { "00001", "00002", "00003", "00004",
230 "00005", "00006", "02439", "00012-fakemeta", "00030-fakemeta",
Eliza Margaretha6f989202016-10-14 21:48:29 +0200231 /*
232 "02035-substring",
233 "05663-unbalanced",
234 "07452-deep"
235 */
Nils Diewaldbb33da22015-03-04 16:24:25 +0000236 }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200237 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000238 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000239 };
240 ki.commit();
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000241
Eliza Margaretha6f989202016-10-14 21:48:29 +0200242 String json = getJsonString(
243 getClass().getResource("/queries/metaquery4.jsonld").getFile());
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000244
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000245 Krill ks = new Krill(json);
Nils Diewald884dbcf2015-02-27 17:02:28 +0000246 Result kr = ks.apply(ki);
Nils Diewaldc86aa482014-02-12 16:58:05 +0000247
Nils Diewaldafab8f32015-01-26 19:11:32 +0000248 assertEquals(kr.getTotalResults(), 1);
Nils Diewald979b2fe2014-09-29 16:21:41 +0000249
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000250 ks = new Krill(json);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000251 // Ignore the collection part of the query!
Nils Diewald2d5f8102015-02-26 21:07:54 +0000252 ks.setCollection(new KrillCollection());
Nils Diewald3aa9e692015-02-20 22:20:11 +0000253 kr = ks.apply(ki);
Nils Diewald979b2fe2014-09-29 16:21:41 +0000254
Nils Diewaldafab8f32015-01-26 19:11:32 +0000255 assertEquals(kr.getTotalResults(), 5);
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000256
Eliza Margaretha6f989202016-10-14 21:48:29 +0200257 json = getJsonString(
258 getClass().getResource("/queries/metaquery5.jsonld").getFile());
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000259
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000260 ks = new Krill(json);
Nils Diewald3aa9e692015-02-20 22:20:11 +0000261 kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000262 assertEquals(kr.getTotalResults(), 1);
263
Eliza Margaretha6f989202016-10-14 21:48:29 +0200264 json = getJsonString(
265 getClass().getResource("/queries/metaquery6.jsonld").getFile());
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000266 ks = new Krill(json);
Nils Diewald3aa9e692015-02-20 22:20:11 +0000267 kr = ks.apply(ki);
Akronb82cf892019-08-28 11:00:02 +0200268 assertEquals(kr.getTotalResults(), 1);
Nils Diewaldc6b78752013-12-05 19:05:12 +0000269 };
270
Akronbb5d1732015-06-22 01:22:40 +0200271
Akronc63697c2015-06-17 22:32:02 +0200272 // Todo: There SHOULD be a failure here, but Koral currently creates empty collections
273 @Test
274 public void queryJSONapiTest1 () {
Akronbb5d1732015-06-22 01:22:40 +0200275 Krill test = new Krill(
276 "{\"@context\":\"http://korap.ids-mannheim.de/ns/koral/0.3/context.jsonld\",\"errors\":[],\"warnings\":[],\"messages\":[],\"collection\":{},\"query\":{\"@type\":\"koral:token\",\"wrap\":{\"@type\":\"koral:term\",\"layer\":\"orth\",\"key\":\"Baum\",\"match\":\"match:eq\"}},\"meta\":{}}");
Akronc63697c2015-06-17 22:32:02 +0200277 assertFalse(test.hasErrors());
278 };
279
Nils Diewaldc6b78752013-12-05 19:05:12 +0000280
281 @Test
282 public void searchJSONFailure () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000283 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000284 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000285 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000286 for (String i : new String[] { "00001", "00002", "00003", "00004",
287 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200288 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000289 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000290 };
291 ki.commit();
Nils Diewald884dbcf2015-02-27 17:02:28 +0000292 Result kr = new Krill("{ query").apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000293 assertEquals(kr.getTotalResults(), 0);
294 assertEquals(kr.getError(0).getMessage(), "Unable to parse JSON");
Nils Diewaldc6b78752013-12-05 19:05:12 +0000295 };
296
297
Nils Diewald9f310832013-12-06 22:38:55 +0000298 @Test
299 public void searchJSONindexboundary () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000300 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000301 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000302 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000303 for (String i : new String[] { "00001", "00002", "00003", "00004",
304 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200305 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000306 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000307 };
308 ki.commit();
Nils Diewald9f310832013-12-06 22:38:55 +0000309
Eliza Margaretha6f989202016-10-14 21:48:29 +0200310 String json = getJsonString(
311 getClass().getResource("/queries/bsp-fail1.jsonld").getFile());
Nils Diewald9f310832013-12-06 22:38:55 +0000312
Nils Diewald884dbcf2015-02-27 17:02:28 +0000313 Result kr = new Krill(json).apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000314 assertEquals(0, kr.getStartIndex());
315 assertEquals(kr.getTotalResults(), 0);
316 assertEquals(25, kr.getItemsPerPage());
Nils Diewald9f310832013-12-06 22:38:55 +0000317 };
318
Nils Diewaldafab8f32015-01-26 19:11:32 +0000319
Nils Diewald9f310832013-12-06 22:38:55 +0000320 @Test
321 public void searchJSONindexboundary2 () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000322 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000323 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000324 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000325 for (String i : new String[] { "00001", "00002", "00003", "00004",
326 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200327 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000328 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000329 };
330 ki.commit();
Nils Diewald9f310832013-12-06 22:38:55 +0000331
Eliza Margaretha6f989202016-10-14 21:48:29 +0200332 String json = getJsonString(
333 getClass().getResource("/queries/bsp-fail2.jsonld").getFile());
Nils Diewald9f310832013-12-06 22:38:55 +0000334
Nils Diewald884dbcf2015-02-27 17:02:28 +0000335 Result kr = new Krill(json).apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000336 assertEquals(50, kr.getItemsPerPage());
337 assertEquals(49950, kr.getStartIndex());
338 assertEquals(kr.getTotalResults(), 0);
Nils Diewald9f310832013-12-06 22:38:55 +0000339 };
340
Akron40550172015-08-04 03:06:12 +0200341
Akron001dab32015-07-02 12:30:15 +0200342 /*
343 * Queries should be mirrored correctly for debugging reasons.
344 */
345 @Test
346 public void queryJSONmirrorTestBug () throws IOException {
347 // Construct index
348 KrillIndex ki = new KrillIndex();
Eliza Margaretha6f989202016-10-14 21:48:29 +0200349 String json = getJsonString(getClass()
350 .getResource("/queries/bugs/failing_mirror.jsonld").getFile());
Akron001dab32015-07-02 12:30:15 +0200351 Krill ks = new Krill(json);
352 Result kr = ks.apply(ki);
353
354 ObjectMapper mapper = new ObjectMapper();
355 JsonNode res = mapper.readTree(kr.toJsonString());
356
357 assertEquals("Unable to parse JSON", res.at("/errors/0/1").asText());
358
Eliza Margaretha6f989202016-10-14 21:48:29 +0200359 json = getJsonString(
360 getClass().getResource("/queries/bugs/failing_mirror_2.jsonld")
361 .getFile());
Akron001dab32015-07-02 12:30:15 +0200362 ks = new Krill(json);
363 kr = ks.apply(ki);
364
365 res = mapper.readTree(kr.toJsonString());
366
Akron40550172015-08-04 03:06:12 +0200367 assertEquals(23, res.at("/meta/count").asInt());
368 assertEquals(25, res.at("/meta/itemsPerPage").asInt());
Akron001dab32015-07-02 12:30:15 +0200369 assertEquals("base/s:p", res.at("/meta/context").asText());
370 assertFalse(res.at("/query").isMissingNode());
371 assertTrue(res.at("/query/@type").isMissingNode());
372 assertTrue(res.at("/collection/@type").isMissingNode());
373 };
374
375
Nils Diewaldc6b78752013-12-05 19:05:12 +0000376
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000377 @Test
378 public void searchJSONcontext () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000379 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000380 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000381 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000382 for (String i : new String[] { "00001", "00002", "00003", "00004",
383 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200384 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000385 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000386 };
387 ki.commit();
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000388
Eliza Margaretha6f989202016-10-14 21:48:29 +0200389 String json = getJsonString(getClass()
390 .getResource("/queries/bsp-context.jsonld").getFile());
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000391
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000392 Krill ks = new Krill(json);
Nils Diewald884dbcf2015-02-27 17:02:28 +0000393 Result kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000394 assertEquals(kr.getTotalResults(), 10);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200395 assertEquals(
396 "A bzw. a ist der erste Buchstabe des"
397 + " lateinischen [[Alphabets]] und ein Vokal."
398 + " Der Buchstabe A hat in deutschen Texten"
399 + " eine durchschnittliche Hรคufigkeit ...",
400 kr.getMatch(0).getSnippetBrackets());
Nils Diewaldb3a09db2013-12-21 00:22:02 +0000401
Nils Diewaldf5ab4b22015-02-25 20:55:16 +0000402 ks.getMeta().setCount(5);
403 ks.getMeta().setStartPage(2);
Nils Diewald3aa9e692015-02-20 22:20:11 +0000404 kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000405 assertEquals(kr.getTotalResults(), 10);
406 assertEquals(5, kr.getStartIndex());
407 assertEquals(5, kr.getItemsPerPage());
Nils Diewald891c53c2013-12-23 16:37:46 +0000408
Eliza Margaretha805e27f2016-10-14 21:39:42 +0200409 json = getJsonString(getClass()
Nils Diewaldbb33da22015-03-04 16:24:25 +0000410 .getResource("/queries/bsp-context-2.jsonld").getFile());
Nils Diewald891c53c2013-12-23 16:37:46 +0000411
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000412 kr = new Krill(json).apply(ki);
413
Nils Diewaldafab8f32015-01-26 19:11:32 +0000414 assertEquals(kr.getTotalResults(), -1);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200415 assertEquals(
416 "... lls seit den Griechen beibehalten worden."
417 + " 3. Bedeutungen in der Biologie steht A fรผr"
418 + " das Nukleosid Adenosin steht A die Base"
419 + " Adenin steht A fรผr die Aminosรคure Alanin"
420 + " in der Informatik steht a fรผr den dezimalen"
421 + " [[Wert]] 97 sowohl im ASCII- als auch im"
422 + " Unicode-Zeichensatz steht A fรผr den dezimalen"
423 + " Wert 65 sowohl im ASCII- als auch im"
424 + " Unicode-Zeichensatz als Kfz-Kennzeichen"
425 + " steht A in Deutschland fรผr Augsburg."
426 + " in ร–sterreich auf ...",
427 kr.getMatch(0).getSnippetBrackets());
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000428 };
429
Nils Diewaldbb33da22015-03-04 16:24:25 +0000430
Nils Diewald364eb642013-12-22 15:03:01 +0000431 @Test
432 public void searchJSONstartPage () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000433 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000434 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000435 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000436 for (String i : new String[] { "00001", "00002", "00003", "00004",
437 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200438 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000439 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000440 };
441 ki.commit();
Nils Diewald364eb642013-12-22 15:03:01 +0000442
Eliza Margaretha6f989202016-10-14 21:48:29 +0200443 String json = getJsonString(
444 getClass().getResource("/queries/bsp-paging.jsonld").getFile());
Nils Diewald364eb642013-12-22 15:03:01 +0000445
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000446 Krill ks = new Krill(json);
Nils Diewald884dbcf2015-02-27 17:02:28 +0000447 Result kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000448 assertEquals(kr.getTotalResults(), 10);
449 assertEquals(5, kr.getStartIndex());
450 assertEquals(5, kr.getItemsPerPage());
Nils Diewald364eb642013-12-22 15:03:01 +0000451
Eliza Margaretha6f989202016-10-14 21:48:29 +0200452 json = getJsonString(
453 getClass().getResource("/queries/bsp-cutoff.jsonld").getFile());
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000454 ks = ks = new Krill(json);
Nils Diewald3aa9e692015-02-20 22:20:11 +0000455 kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000456 assertEquals(kr.getTotalResults(), -1);
457 assertEquals(2, kr.getStartIndex());
458 assertEquals(2, kr.getItemsPerPage());
Nils Diewald364eb642013-12-22 15:03:01 +0000459
Eliza Margaretha6f989202016-10-14 21:48:29 +0200460 json = getJsonString(
461 getClass().getResource("/queries/metaquery9.jsonld").getFile());
Nils Diewald2d5f8102015-02-26 21:07:54 +0000462 KrillCollection kc = new KrillCollection(json);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000463 kc.setIndex(ki);
464 assertEquals(7, kc.numberOf("documents"));
Nils Diewald364eb642013-12-22 15:03:01 +0000465 };
Nils Diewaldeabed8b2013-12-17 16:46:43 +0000466
Nils Diewaldafab8f32015-01-26 19:11:32 +0000467
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000468 @Test
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000469 public void searchJSONitemsPerResource () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000470 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000471 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000472 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000473 for (String i : new String[] { "00001", "00002", "00003", "00004",
474 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200475 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000476 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000477 };
478 ki.commit();
Eliza Margaretha6f989202016-10-14 21:48:29 +0200479 String json = getJsonString(getClass()
480 .getResource("/queries/bsp-itemsPerResource.jsonld").getFile());
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000481
Akron7f08b352023-07-11 10:56:00 +0200482 Krill ks;
483 Result kr;
484 KrillMeta meta;
485
486 ks = new Krill(json);
487 kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000488 assertEquals(kr.getTotalResults(), 10);
489 assertEquals(0, kr.getStartIndex());
490 assertEquals(20, kr.getItemsPerPage());
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000491
Nils Diewaldafab8f32015-01-26 19:11:32 +0000492 assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID());
493 assertEquals("WPD_AAA.00001", kr.getMatch(1).getDocID());
494 assertEquals("WPD_AAA.00001", kr.getMatch(6).getDocID());
495 assertEquals("WPD_AAA.00002", kr.getMatch(7).getDocID());
496 assertEquals("WPD_AAA.00002", kr.getMatch(8).getDocID());
497 assertEquals("WPD_AAA.00004", kr.getMatch(9).getDocID());
Akron7f08b352023-07-11 10:56:00 +0200498 assertEquals(kr.getTotalResources(), 3);
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000499
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000500 ks = new Krill(json);
Nils Diewaldf5ab4b22015-02-25 20:55:16 +0000501 ks.getMeta().setItemsPerResource(1);
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000502
Nils Diewald3aa9e692015-02-20 22:20:11 +0000503 kr = ks.apply(ki);
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000504
Nils Diewaldafab8f32015-01-26 19:11:32 +0000505 assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID());
506 assertEquals("WPD_AAA.00002", kr.getMatch(1).getDocID());
507 assertEquals("WPD_AAA.00004", kr.getMatch(2).getDocID());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000508
Nils Diewaldafab8f32015-01-26 19:11:32 +0000509 assertEquals(kr.getTotalResults(), 3);
Akron7f08b352023-07-11 10:56:00 +0200510 assertEquals(kr.getTotalResources(), 3);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000511 assertEquals(0, kr.getStartIndex());
512 assertEquals(20, kr.getItemsPerPage());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000513
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000514 ks = new Krill(json);
Nils Diewaldf5ab4b22015-02-25 20:55:16 +0000515 ks.getMeta().setItemsPerResource(2);
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000516
Nils Diewald3aa9e692015-02-20 22:20:11 +0000517 kr = ks.apply(ki);
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000518
Nils Diewaldafab8f32015-01-26 19:11:32 +0000519 assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID());
520 assertEquals("WPD_AAA.00001", kr.getMatch(1).getDocID());
521 assertEquals("WPD_AAA.00002", kr.getMatch(2).getDocID());
522 assertEquals("WPD_AAA.00002", kr.getMatch(3).getDocID());
523 assertEquals("WPD_AAA.00004", kr.getMatch(4).getDocID());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000524
Nils Diewaldafab8f32015-01-26 19:11:32 +0000525 assertEquals(kr.getTotalResults(), 5);
Akron7f08b352023-07-11 10:56:00 +0200526 assertEquals(kr.getTotalResources(), 3);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000527 assertEquals(0, kr.getStartIndex());
528 assertEquals(20, kr.getItemsPerPage());
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000529
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000530 ks = new Krill(json);
Akron7f08b352023-07-11 10:56:00 +0200531 meta = ks.getMeta();
Nils Diewaldf5ab4b22015-02-25 20:55:16 +0000532 meta.setItemsPerResource(1);
533 meta.setStartIndex(1);
534 meta.setCount(1);
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000535
Nils Diewald3aa9e692015-02-20 22:20:11 +0000536 kr = ks.apply(ki);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000537
Nils Diewaldafab8f32015-01-26 19:11:32 +0000538 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000539
Nils Diewaldafab8f32015-01-26 19:11:32 +0000540 assertEquals(kr.getTotalResults(), 3);
Akron7f08b352023-07-11 10:56:00 +0200541 assertEquals(kr.getTotalResources(), 3);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000542 assertEquals(1, kr.getStartIndex());
543 assertEquals(1, kr.getItemsPerPage());
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000544
Nils Diewaldafab8f32015-01-26 19:11:32 +0000545 assertEquals((short) 1, kr.getItemsPerResource());
Akron7f08b352023-07-11 10:56:00 +0200546
547 ks = new Krill(json);
548 meta = ks.getMeta();
549 meta.setItemsPerResource(2);
550 meta.setStartIndex(2);
551 meta.setCount(1);
552
553 kr = ks.apply(ki);
554
555 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
556
557 assertEquals(kr.getTotalResults(), 5);
558 assertEquals(kr.getTotalResources(), 3);
559 assertEquals(2, kr.getStartIndex());
560 assertEquals(1, kr.getItemsPerPage());
561
562
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000563 };
564
Nils Diewaldafab8f32015-01-26 19:11:32 +0000565
Nils Diewaldd723d812014-09-23 18:50:52 +0000566 @Test
567 public void searchJSONitemsPerResourceServer () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000568 /*
569 * This test is a server-only implementation of
570 * TestResource#testCollection
571 */
572 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000573 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000574 // Indexing test files
575 int uid = 1;
Nils Diewaldbb33da22015-03-04 16:24:25 +0000576 for (String i : new String[] { "00001", "00002", "00003", "00004",
577 "00005", "00006", "02439" }) {
578 ki.addDoc(uid++,
579 getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
580 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000581 };
582 ki.commit();
Nils Diewaldd723d812014-09-23 18:50:52 +0000583
Eliza Margaretha6f989202016-10-14 21:48:29 +0200584 String json = getJsonString(getClass()
585 .getResource("/queries/bsp-uid-example.jsonld").getFile());
Nils Diewaldd723d812014-09-23 18:50:52 +0000586
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000587 Krill ks = new Krill(json);
Nils Diewaldf5ab4b22015-02-25 20:55:16 +0000588 ks.getMeta().setItemsPerResource(1);
589
Nils Diewald2d5f8102015-02-26 21:07:54 +0000590 KrillCollection kc = new KrillCollection();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000591 kc.filterUIDs(new String[] { "1", "4" });
Nils Diewaldafab8f32015-01-26 19:11:32 +0000592 kc.setIndex(ki);
593 ks.setCollection(kc);
Nils Diewaldd723d812014-09-23 18:50:52 +0000594
Nils Diewald884dbcf2015-02-27 17:02:28 +0000595 Result kr = ks.apply(ki);
Nils Diewaldd723d812014-09-23 18:50:52 +0000596
Nils Diewaldafab8f32015-01-26 19:11:32 +0000597 assertEquals(kr.getTotalResults(), 2);
598 assertEquals(0, kr.getStartIndex());
599 assertEquals(25, kr.getItemsPerPage());
Nils Diewaldd723d812014-09-23 18:50:52 +0000600 };
Nils Diewaldba197f22014-11-01 17:21:46 +0000601
Nils Diewaldafab8f32015-01-26 19:11:32 +0000602
Nils Diewaldba197f22014-11-01 17:21:46 +0000603 @Test
604 public void searchJSONnewJSON () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000605 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000606 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000607 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000608 FieldDocument fd = ki.addDoc(1,
609 getClass().getResourceAsStream("/goe/AGA-03828.json.gz"), true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000610 ki.commit();
Nils Diewaldba197f22014-11-01 17:21:46 +0000611
Nils Diewaldafab8f32015-01-26 19:11:32 +0000612 assertEquals(fd.getUID(), 1);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000613 assertEquals(fd.getTextSigle(), "GOE_AGA.03828");
614 assertEquals(fd.getDocSigle(), "GOE_AGA");
Nils Diewaldafab8f32015-01-26 19:11:32 +0000615 assertEquals(fd.getCorpusSigle(), "GOE");
Akron32b95192019-01-11 13:58:55 +0100616 assertEquals(fd.getFieldValue("title"), "Autobiographische Einzelheiten");
617 assertNull(fd.getFieldValue("subTitle"));
618 assertEquals(fd.getFieldValue("textType"), "Autobiographie");
619 assertNull(fd.getFieldValue("textTypeArt"));
620 assertNull(fd.getFieldValue("textTypeRef"));
621 assertNull(fd.getFieldValue("textColumn"));
622 assertNull(fd.getFieldValue("textDomain"));
Akron69b958c2017-02-15 22:49:45 +0100623 // assertEquals(fd.getPages(), "529-547");
Akron32b95192019-01-11 13:58:55 +0100624 assertEquals(fd.getFieldValue("availability"), "QAO-NC");
625 assertEquals(fd.getFieldValue("creationDate"), "1820");
626 assertEquals(fd.getFieldValue("pubDate"), "1982");
627 assertEquals(fd.getFieldValue("author"), "Goethe, Johann Wolfgang von");
628 assertNull(fd.getFieldValue("textClass"));
629 assertEquals(fd.getFieldValue("language"), "de");
630 assertEquals(fd.getFieldValue("pubPlace"), "Mรผnchen");
631 assertEquals(fd.getFieldValue("reference"),
Eliza Margaretha6f989202016-10-14 21:48:29 +0200632 "Goethe, Johann Wolfgang von:"
633 + " Autobiographische Einzelheiten,"
634 + " (Geschrieben bis 1832), In: Goethe,"
635 + " Johann Wolfgang von: Goethes Werke,"
636 + " Bd. 10, Autobiographische Schriften"
637 + " II, Hrsg.: Trunz, Erich. Mรผnchen: "
638 + "Verlag C. H. Beck, 1982, S. 529-547");
Akron32b95192019-01-11 13:58:55 +0100639 assertEquals(fd.getFieldValue("publisher"), "Verlag C. H. Beck");
640 assertNull(fd.getFieldValue("editor"));
641 assertNull(fd.getFieldValue("fileEditionStatement"));
642 assertNull(fd.getFieldValue("biblEditionStatement"));
643 assertNull(fd.getFieldValue("keywords"));
Nils Diewaldafab8f32015-01-26 19:11:32 +0000644
Akron32b95192019-01-11 13:58:55 +0100645 assertEquals(fd.getFieldValue("tokenSource"), "opennlp#tokens");
646 assertEquals(fd.getFieldValue("foundries"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000647 "base base/paragraphs base/sentences corenlp "
648 + "corenlp/constituency corenlp/morpho "
649 + "corenlp/namedentities corenlp/sentences "
650 + "glemm glemm/morpho mate mate/morpho"
651 + " opennlp opennlp/morpho opennlp/sentences"
652 + " treetagger treetagger/morpho "
653 + "treetagger/sentences");
Akron32b95192019-01-11 13:58:55 +0100654 assertEquals(fd.getFieldValue("layerInfos"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000655 "base/s=spans corenlp/c=spans corenlp/ne=tokens"
656 + " corenlp/p=tokens corenlp/s=spans glemm/l=tokens"
657 + " mate/l=tokens mate/m=tokens mate/p=tokens"
658 + " opennlp/p=tokens opennlp/s=spans tt/l=tokens"
659 + " tt/p=tokens tt/s=spans");
Nils Diewaldafab8f32015-01-26 19:11:32 +0000660
Akron32b95192019-01-11 13:58:55 +0100661 assertEquals(fd.getFieldValue("corpusTitle"), "Goethes Werke");
662 assertNull(fd.getFieldValue("corpusSubTitle"));
663 assertEquals(fd.getFieldValue("corpusAuthor"), "Goethe, Johann Wolfgang von");
664 assertEquals(fd.getFieldValue("corpusEditor"), "Trunz, Erich");
665 assertEquals(fd.getFieldValue("docTitle"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000666 "Goethe: Autobiographische Schriften II, (1817-1825, 1832)");
Akron32b95192019-01-11 13:58:55 +0100667 assertNull(fd.getFieldValue("docSubTitle"));
668 assertNull(fd.getFieldValue("docEditor"));
669 assertNull(fd.getFieldValue("docAuthor"));
Nils Diewaldafab8f32015-01-26 19:11:32 +0000670
Nils Diewaldbb33da22015-03-04 16:24:25 +0000671 Krill ks = new Krill(new QueryBuilder("tokens").seg("mate/m:case:nom")
672 .with("mate/m:number:pl"));
Nils Diewald884dbcf2015-02-27 17:02:28 +0000673 Result kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000674
675 assertEquals(kr.getTotalResults(), 148);
676 assertEquals(0, kr.getStartIndex());
677 assertEquals(25, kr.getItemsPerPage());
Nils Diewaldba197f22014-11-01 17:21:46 +0000678 };
Nils Diewald06368ba2014-11-03 20:53:27 +0000679
Akron69b958c2017-02-15 22:49:45 +0100680
681 @Test
682 public void searchJSONwithPagebreaks () throws IOException {
683 // Construct index
684 KrillIndex ki = new KrillIndex();
685 // Indexing test files
686 FieldDocument fd = ki.addDoc(1,
687 getClass().getResourceAsStream("/goe/AGA-03828-pb.json.gz"), true);
688 ki.commit();
689
690 assertEquals(fd.getUID(), 1);
691 assertEquals(fd.getTextSigle(), "GOE/AGA/03828");
692 assertEquals(fd.getDocSigle(), "GOE/AGA");
693 assertEquals(fd.getCorpusSigle(), "GOE");
Akron32b95192019-01-11 13:58:55 +0100694 assertEquals(fd.getFieldValue("title"), "Autobiographische Einzelheiten");
695 assertNull(fd.getFieldValue("subTitle"));
696 assertEquals(fd.getFieldValue("textType"), "Autobiographie");
697 assertNull(fd.getFieldValue("textTypeArt"));
698 assertNull(fd.getFieldValue("textTypeRef"));
699 assertNull(fd.getFieldValue("textColumn"));
700 assertNull(fd.getFieldValue("textDomain"));
Akron69b958c2017-02-15 22:49:45 +0100701 // assertEquals(fd.getPages(), "529-547");
Akron32b95192019-01-11 13:58:55 +0100702 // assertEquals(fd.getFieldValue("availability"), "QAO-NC");
703 assertEquals(fd.getFieldValue("creationDate"), "1820");
704 assertEquals(fd.getFieldValue("pubDate"), "1982");
705 assertEquals(fd.getFieldValue("author"), "Goethe, Johann Wolfgang von");
706 assertNull(fd.getFieldValue("textClass"));
707 assertEquals(fd.getFieldValue("language"), "de");
708 assertEquals(fd.getFieldValue("pubPlace"), "Mรผnchen");
709 assertEquals(fd.getFieldValue("reference"),
Akron69b958c2017-02-15 22:49:45 +0100710 "Goethe, Johann Wolfgang von:"
711 + " Autobiographische Einzelheiten,"
712 + " (Geschrieben bis 1832), In: Goethe,"
713 + " Johann Wolfgang von: Goethes Werke,"
714 + " Bd. 10, Autobiographische Schriften"
715 + " II, Hrsg.: Trunz, Erich. Mรผnchen: "
716 + "Verlag C. H. Beck, 1982, S. 529-547");
Akron32b95192019-01-11 13:58:55 +0100717 assertEquals(fd.getFieldValue("publisher"), "Verlag C. H. Beck");
718 assertNull(fd.getFieldValue("editor"));
719 assertNull(fd.getFieldValue("fileEditionStatement"));
720 assertNull(fd.getFieldValue("biblEditionStatement"));
721 assertNull(fd.getFieldValue("keywords"));
Akron69b958c2017-02-15 22:49:45 +0100722
Akron32b95192019-01-11 13:58:55 +0100723 assertEquals(fd.getFieldValue("tokenSource"), "base#tokens_aggr");
724 assertEquals(fd.getFieldValue("foundries"),
Akron69b958c2017-02-15 22:49:45 +0100725 "dereko dereko/structure "+
726 "dereko/structure/base-sentences-paragraphs-pagebreaks");
Akron32b95192019-01-11 13:58:55 +0100727 assertEquals(fd.getFieldValue("layerInfos"), "dereko/s=spans");
Akron69b958c2017-02-15 22:49:45 +0100728
Akron32b95192019-01-11 13:58:55 +0100729 assertEquals(fd.getFieldValue("corpusTitle"), "Goethes Werke");
730 assertNull(fd.getFieldValue("corpusSubTitle"));
731 assertEquals(fd.getFieldValue("corpusAuthor"), "Goethe, Johann Wolfgang von");
732 assertEquals(fd.getFieldValue("corpusEditor"), "Trunz, Erich");
733 assertEquals(fd.getFieldValue("docTitle"),
Akron69b958c2017-02-15 22:49:45 +0100734 "Goethe: Autobiographische Schriften II, (1817-1825, 1832)");
Akron32b95192019-01-11 13:58:55 +0100735 assertNull(fd.getFieldValue("docSubTitle"));
736 assertNull(fd.getFieldValue("docEditor"));
737 assertNull(fd.getFieldValue("docAuthor"));
Akron69b958c2017-02-15 22:49:45 +0100738
739 Krill ks = new Krill(new QueryBuilder("tokens").seg("s:der"));
740 Result kr = ks.apply(ki);
741
742 assertEquals(kr.getTotalResults(), 97);
743 assertEquals(0, kr.getStartIndex());
744 assertEquals(25, kr.getItemsPerPage());
745
746 Match m = kr.getMatch(5);
747 assertEquals("Start page", m.getStartPage(), 529);
748
749 ObjectMapper mapper = new ObjectMapper();
750 JsonNode res = mapper.readTree(m.toJsonString());
751 assertEquals(529, res.at("/pages/0").asInt());
752 };
753
Akron26e54172024-05-23 17:03:03 +0200754 @Test
755 public void searchJSONwithUtteranceAttributes () throws IOException {
756 // Construct index
757 KrillIndex ki = new KrillIndex();
758 // Indexing test files
759 FieldDocument fd = ki.addDoc(1,
760 getClass().getResourceAsStream("/others/kokokom-example.json.gz"), true);
761 ki.commit();
762
763 assertEquals(fd.getUID(), 1);
764 assertEquals(fd.getTextSigle(), "KTC/001/000001");
765
766 Krill ks = new Krill(new QueryBuilder("tokens").seg("s:Rรคuspern"));
767 Result kr = ks.apply(ki);
768
769 assertEquals(1, kr.getTotalResults());
770 assertEquals(0, kr.getStartIndex());
771 assertEquals(25, kr.getItemsPerPage());
772 Match m = kr.getMatch(0);
Akronc2517492024-06-13 14:54:30 +0200773 assertEquals(
774 "<span class=\"context-left\"><span class=\"inline-marker\" data-key=\"who\" data-value=\"Mai Thi Nguyen-Kim\"></span><span class=\"inline-marker\" data-key=\"start\" data-value=\"0:00\"></span><span class=\"inline-marker\" data-key=\"end\" data-value=\"01:20\"></span>(</span><span class=\"match\"><mark>Rรคuspern</mark></span><span class=\"context-right\">) Wie viele Geschlechter gibt es? Wenn<span class=\"more\"></span></span>",
775 m.getSnippetHTML());
776
777 assertEquals(
778 "{*who=Mai Thi Nguyen-Kim}{*start=0:00}{*end=01:20}([[Rรคuspern]]) Wie viele Geschlechter gibt es? Wenn ...",
779 m.getSnippetBrackets());
780
781 ks = new Krill(new QueryBuilder("tokens").seg("s:Geschlechter"));
782 kr = ks.apply(ki);
783
784 assertEquals(5, kr.getTotalResults());
785 assertEquals(0, kr.getStartIndex());
786 assertEquals(25, kr.getItemsPerPage());
787 m = kr.getMatch(0);
788 assertEquals("<span class=\"context-left\"><span class=\"inline-marker\" data-key=\"who\" data-value=\"Mai Thi Nguyen-Kim\"></span><span class=\"inline-marker\" data-key=\"start\" data-value=\"0:00\"></span><span class=\"inline-marker\" data-key=\"end\" data-value=\"01:20\"></span>(Rรคuspern) Wie viele </span><span class=\"match\"><mark>Geschlechter</mark></span><span class=\"context-right\"> gibt es? Wenn man hierzu รถffentliche<span class=\"more\"></span></span>", m.getSnippetHTML());
789
790 assertEquals(
791 "{*who=Mai Thi Nguyen-Kim}{*start=0:00}{*end=01:20}(Rรคuspern) Wie viele [[Geschlechter]] gibt es? Wenn man hierzu รถffentliche ...",
792 m.getSnippetBrackets());
793
794 ks = new Krill(new QueryBuilder("tokens").seg("s:Zunรคchst"));
795 kr = ks.apply(ki);
796
797 assertEquals(1, kr.getTotalResults());
798 assertEquals(0, kr.getStartIndex());
799 assertEquals(25, kr.getItemsPerPage());
800 m = kr.getMatch(0);
801 assertEquals("<span class=\"context-left\"><span class=\"more\"></span>Perspektiven, die dazu einladen, aneinander vorbeizureden </span><span class=\"match\"><mark><span class=\"inline-marker\" data-key=\"who\" data-value=\"Mai Thi Nguyen-Kim\"></span><span class=\"inline-marker\" data-key=\"start\" data-value=\"0:00\"></span><span class=\"inline-marker\" data-key=\"end\" data-value=\"01:20\"></span>Zunรคchst</mark></span><span class=\"context-right\"> einmal bezeichnet Geschlecht eine Rolle bei<span class=\"more\"></span></span>", m.getSnippetHTML());
802
803 assertEquals(
804 "... Perspektiven, die dazu einladen, aneinander vorbeizureden [[{*who=Mai Thi Nguyen-Kim}{*start=0:00}{*end=01:20}Zunรคchst]] einmal bezeichnet Geschlecht eine Rolle bei ...",
805 m.getSnippetBrackets());
806
807
Akron26e54172024-05-23 17:03:03 +0200808 };
809
810
Nils Diewaldafab8f32015-01-26 19:11:32 +0000811
Nils Diewald06368ba2014-11-03 20:53:27 +0000812 @Test
813 public void searchJSONnewJSON2 () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000814 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000815 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000816 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000817 FieldDocument fd = ki.addDoc(1,
818 getClass().getResourceAsStream("/bzk/D59-00089.json.gz"), true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000819 ki.commit();
Nils Diewald06368ba2014-11-03 20:53:27 +0000820
Nils Diewaldafab8f32015-01-26 19:11:32 +0000821 assertEquals(fd.getUID(), 1);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000822 assertEquals(fd.getTextSigle(), "BZK_D59.00089");
823 assertEquals(fd.getDocSigle(), "BZK_D59");
Nils Diewaldafab8f32015-01-26 19:11:32 +0000824 assertEquals(fd.getCorpusSigle(), "BZK");
Akron32b95192019-01-11 13:58:55 +0100825 assertEquals(fd.getFieldValue("title"), "Saragat-Partei zerfรคllt");
826 assertEquals(fd.getFieldValue("pubDate"), "1959-02-19");
Nils Diewaldafab8f32015-01-26 19:11:32 +0000827
Akron32b95192019-01-11 13:58:55 +0100828 assertNull(fd.getFieldValue("subTitle"));
829 assertNull(fd.getFieldValue("author"));
830 assertNull(fd.getFieldValue("editor"));
831 assertEquals(fd.getFieldValue("pubPlace"), "Berlin");
832 assertNull(fd.getFieldValue("publisher"));
833 assertEquals(fd.getFieldValue("textType"), "Zeitung: Tageszeitung");
834 assertNull(fd.getFieldValue("textTypeArt"));
835 assertEquals(fd.getFieldValue("textTypeRef"), "Tageszeitung");
836 assertEquals(fd.getFieldValue("textDomain"), "Politik");
837 assertEquals(fd.getFieldValue("creationDate"), "1959-02-19");
838 assertEquals(fd.getFieldValue("availability"), "ACA-NC-LC");
839 assertEquals(fd.getFieldValue("textColumn"), "POLITIK");
Akron69b958c2017-02-15 22:49:45 +0100840 // assertNull(fd.getPages());
Akron32b95192019-01-11 13:58:55 +0100841 assertEquals(fd.getFieldValue("textClass"), "politik ausland");
842 assertNull(fd.getFieldValue("fileEditionStatement"));
843 assertNull(fd.getFieldValue("biblEditionStatement"));
Nils Diewaldbb33da22015-03-04 16:24:25 +0000844
Akron32b95192019-01-11 13:58:55 +0100845 assertEquals(fd.getFieldValue("language"), "de");
846 assertEquals(fd.getFieldValue("reference"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000847 "Neues Deutschland, [Tageszeitung], 19.02.1959, Jg. 14,"
848 + " Berliner Ausgabe, S. 7. - Sachgebiet: Politik, "
849 + "Originalressort: POLITIK; Saragat-Partei zerfรคllt");
Akron32b95192019-01-11 13:58:55 +0100850 assertNull(fd.getFieldValue("publisher"));
851 assertNull(fd.getFieldValue("keywords"));
Nils Diewaldafab8f32015-01-26 19:11:32 +0000852
Akron32b95192019-01-11 13:58:55 +0100853 assertEquals(fd.getFieldValue("tokenSource"), "opennlp#tokens");
Nils Diewaldafab8f32015-01-26 19:11:32 +0000854
Akron32b95192019-01-11 13:58:55 +0100855 assertEquals(fd.getFieldValue("foundries"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000856 "base base/paragraphs base/sentences corenlp "
857 + "corenlp/constituency corenlp/morpho corenlp/namedentities"
858 + " corenlp/sentences glemm glemm/morpho mate mate/morpho"
859 + " opennlp opennlp/morpho opennlp/sentences treetagger"
860 + " treetagger/morpho treetagger/sentences");
Nils Diewaldafab8f32015-01-26 19:11:32 +0000861
Akron32b95192019-01-11 13:58:55 +0100862 assertEquals(fd.getFieldValue("layerInfos"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000863 "base/s=spans corenlp/c=spans corenlp/ne=tokens"
864 + " corenlp/p=tokens corenlp/s=spans glemm/l=tokens"
865 + " mate/l=tokens mate/m=tokens mate/p=tokens"
866 + " opennlp/p=tokens opennlp/s=spans tt/l=tokens"
867 + " tt/p=tokens tt/s=spans");
Nils Diewaldafab8f32015-01-26 19:11:32 +0000868
Akron32b95192019-01-11 13:58:55 +0100869 assertEquals(fd.getFieldValue("corpusTitle"), "Bonner Zeitungskorpus");
870 assertNull(fd.getFieldValue("corpusSubTitle"));
871 assertNull(fd.getFieldValue("corpusAuthor"));
872 assertNull(fd.getFieldValue("corpusEditor"));
Nils Diewaldafab8f32015-01-26 19:11:32 +0000873
Akron32b95192019-01-11 13:58:55 +0100874 assertEquals(fd.getFieldValue("docTitle"), "Neues Deutschland");
875 assertEquals(fd.getFieldValue("docSubTitle"),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000876 "Organ des Zentralkomitees der Sozialistischen "
877 + "Einheitspartei Deutschlands");
Akron32b95192019-01-11 13:58:55 +0100878 assertNull(fd.getFieldValue("docEditor"));
879 assertNull(fd.getFieldValue("docAuthor"));
Nils Diewaldbb33da22015-03-04 16:24:25 +0000880
881 Krill ks = new Krill(new QueryBuilder("tokens").seg("mate/m:case:nom")
882 .with("mate/m:number:sg"));
Nils Diewald884dbcf2015-02-27 17:02:28 +0000883 Result kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000884
885 assertEquals(kr.getTotalResults(), 6);
886 assertEquals(0, kr.getStartIndex());
887 assertEquals(25, kr.getItemsPerPage());
Nils Diewald06368ba2014-11-03 20:53:27 +0000888 };
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000889
Nils Diewaldafab8f32015-01-26 19:11:32 +0000890
Nils Diewald7cf8c6d2014-05-28 18:37:38 +0000891 @Test
Nils Diewald56dc2582014-11-04 21:33:46 +0000892 public void searchJSONcosmasBoundaryBug () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000893 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000894 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000895 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000896 FieldDocument fd = ki.addDoc(1,
897 getClass().getResourceAsStream("/bzk/D59-00089.json.gz"), true);
Nils Diewaldafab8f32015-01-26 19:11:32 +0000898 ki.commit();
Nils Diewald56dc2582014-11-04 21:33:46 +0000899
Eliza Margaretha6f989202016-10-14 21:48:29 +0200900 String json = getJsonString(getClass()
901 .getResource("/queries/bugs/cosmas_boundary.jsonld").getFile());
Nils Diewald56dc2582014-11-04 21:33:46 +0000902
Nils Diewald8904c1d2015-02-26 16:13:18 +0000903 QueryBuilder kq = new QueryBuilder("tokens");
Nils Diewaldbb33da22015-03-04 16:24:25 +0000904 Krill ks = new Krill(kq.focus(1,
Akron4f52a632018-02-09 19:02:40 +0100905 kq.contains(kq.tag("base/s:s"), kq.nr(1, kq.seg("s:Leben")))));
Nils Diewald56dc2582014-11-04 21:33:46 +0000906
Nils Diewald884dbcf2015-02-27 17:02:28 +0000907 Result kr = ks.apply(ki);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000908 assertEquals(kr.getSerialQuery(),
Akrona26184e2018-12-05 15:37:34 +0100909 "focus(1: spanContain(<tokens:base/s:s />, {1: tokens:s:Leben}),sorting)");
margarethaf70addb2015-04-27 13:17:18 +0200910 assertEquals(40, kr.getMatch(0).getStartPos());
911 assertEquals(41, kr.getMatch(0).getEndPos());
912
Eliza Margaretha6f989202016-10-14 21:48:29 +0200913 assertEquals(kr.getMatch(0).getSnippetBrackets(),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000914 "... Initiative\" eine neue politische Gruppierung ins "
Akronf05fde62016-08-03 23:46:17 +0200915 + "[[{1:Leben}]] gerufen hatten. Pressemeldungen zufolge haben sich ...");
Nils Diewald56dc2582014-11-04 21:33:46 +0000916
Nils Diewaldafab8f32015-01-26 19:11:32 +0000917 // Try with high class - don't highlight
Nils Diewaldbb33da22015-03-04 16:24:25 +0000918 ks = new Krill(kq.focus(129,
Akron4f52a632018-02-09 19:02:40 +0100919 kq.contains(kq.tag("base/s:s"), kq.nr(129, kq.seg("s:Leben")))));
Nils Diewald56dc2582014-11-04 21:33:46 +0000920
Nils Diewald3aa9e692015-02-20 22:20:11 +0000921 kr = ks.apply(ki);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000922 assertEquals(kr.getSerialQuery(),
Akrona26184e2018-12-05 15:37:34 +0100923 "focus(129: spanContain(<tokens:base/s:s />, {129: tokens:s:Leben}),sorting)");
Eliza Margaretha6f989202016-10-14 21:48:29 +0200924 assertEquals(kr.getMatch(0).getSnippetBrackets(),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000925 "... Initiative\" eine neue politische Gruppierung ins "
Akronf05fde62016-08-03 23:46:17 +0200926 + "[[Leben]] gerufen hatten. Pressemeldungen zufolge haben sich ...");
Nils Diewald0fa2da22014-11-05 03:31:32 +0000927
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000928 ks = new Krill(json);
Nils Diewald3aa9e692015-02-20 22:20:11 +0000929 kr = ks.apply(ki);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200930 assertEquals(kr.getSerialQuery(),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000931 "focus(129: spanElementDistance({129: tokens:s:Namen}, "
Akrona26184e2018-12-05 15:37:34 +0100932 + "{129: tokens:s:Leben}, [(base/s:s[0:1], notOrdered, notExcluded)]),sorting)");
Eliza Margaretha6f989202016-10-14 21:48:29 +0200933 assertEquals(kr.getMatch(0).getSnippetBrackets(),
Akronf05fde62016-08-03 23:46:17 +0200934 "... ihren Austritt erklรคrt und unter dem [[Namen \"Einheitsbewegung "
Nils Diewaldbb33da22015-03-04 16:24:25 +0000935 + "der sozialistischen Initiative\" eine neue politische Gruppierung "
Akronf05fde62016-08-03 23:46:17 +0200936 + "ins Leben]] gerufen hatten. Pressemeldungen zufolge haben sich ...");
Nils Diewaldafab8f32015-01-26 19:11:32 +0000937 assertEquals(kr.getTotalResults(), 1);
938 assertEquals(0, kr.getStartIndex());
Nils Diewald56dc2582014-11-04 21:33:46 +0000939 };
940
Nils Diewaldbb33da22015-03-04 16:24:25 +0000941
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000942 @Test
943 public void searchJSONmultipleClassesBug () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000944 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000945 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000946 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000947 ki.addDoc(1, getClass().getResourceAsStream("/bzk/D59-00089.json.gz"),
948 true);
949 ki.addDoc(2, getClass().getResourceAsStream("/bzk/D59-00089.json.gz"),
950 true);
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000951
Nils Diewaldafab8f32015-01-26 19:11:32 +0000952 ki.commit();
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000953
Eliza Margaretha6f989202016-10-14 21:48:29 +0200954 String json = getJsonString(
955 getClass().getResource("/queries/bugs/multiple_classes.jsonld")
956 .getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000957
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000958 Krill ks = new Krill(json);
Nils Diewald884dbcf2015-02-27 17:02:28 +0000959 Result kr = ks.apply(ki);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000960 assertEquals(kr.getSerialQuery(),
961 "{4: spanNext({1: spanNext({2: tokens:s:ins}, "
962 + "{3: tokens:s:Leben})}, tokens:s:gerufen)}");
Eliza Margaretha6f989202016-10-14 21:48:29 +0200963 assertEquals(kr.getMatch(0).getSnippetBrackets(),
Nils Diewaldbb33da22015-03-04 16:24:25 +0000964 "... sozialistischen Initiative\" eine neue politische"
Akronf05fde62016-08-03 23:46:17 +0200965 + " Gruppierung [[{4:{1:{2:ins} {3:Leben}} gerufen}]] hatten. "
Nils Diewaldbb33da22015-03-04 16:24:25 +0000966 + "Pressemeldungen zufolge haben sich in ...");
Nils Diewaldafab8f32015-01-26 19:11:32 +0000967 assertEquals(kr.getTotalResults(), 2);
968 assertEquals(0, kr.getStartIndex());
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000969 };
970
Nils Diewaldbb33da22015-03-04 16:24:25 +0000971
Nils Diewald277e9ce2014-11-06 03:42:11 +0000972 @Test
973 public void searchJSONmultipleClassesBugTokenList () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000974 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +0000975 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +0000976 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +0000977 ki.addDoc(1, getClass().getResourceAsStream("/goe/AGA-03828.json.gz"),
978 true);
979 ki.addDoc(2, getClass().getResourceAsStream("/bzk/D59-00089.json.gz"),
980 true);
Nils Diewaldc7d08d92014-11-05 21:30:05 +0000981
Nils Diewaldafab8f32015-01-26 19:11:32 +0000982 ki.commit();
Nils Diewald277e9ce2014-11-06 03:42:11 +0000983
Eliza Margaretha6f989202016-10-14 21:48:29 +0200984 String json = getJsonString(
985 getClass().getResource("/queries/bugs/multiple_classes.jsonld")
986 .getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000987
Nils Diewaldbbd39a52015-02-23 19:56:57 +0000988 Krill ks = new Krill(json);
Nils Diewald884dbcf2015-02-27 17:02:28 +0000989 Result kr = ks.apply(ki);
Nils Diewald277e9ce2014-11-06 03:42:11 +0000990
Nils Diewaldafab8f32015-01-26 19:11:32 +0000991 ObjectMapper mapper = new ObjectMapper();
992 JsonNode res = mapper.readTree(kr.toTokenListJsonString());
Nils Diewald277e9ce2014-11-06 03:42:11 +0000993
Akrond504f212015-06-20 00:27:54 +0200994 assertEquals(1, res.at("/meta/totalResults").asInt());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200995 assertEquals(
996 "{4: spanNext({1: spanNext({2: tokens:s:ins}, "
997 + "{3: tokens:s:Leben})}, tokens:s:gerufen)}",
Akrond504f212015-06-20 00:27:54 +0200998 res.at("/meta/serialQuery").asText());
999 assertEquals(0, res.at("/meta/startIndex").asInt());
1000 assertEquals(25, res.at("/meta/itemsPerPage").asInt());
Nils Diewald277e9ce2014-11-06 03:42:11 +00001001
Nils Diewaldafab8f32015-01-26 19:11:32 +00001002 assertEquals("BZK_D59.00089", res.at("/matches/0/textSigle").asText());
1003 assertEquals(328, res.at("/matches/0/tokens/0/0").asInt());
1004 assertEquals(331, res.at("/matches/0/tokens/0/1").asInt());
1005 assertEquals(332, res.at("/matches/0/tokens/1/0").asInt());
1006 assertEquals(337, res.at("/matches/0/tokens/1/1").asInt());
1007 assertEquals(338, res.at("/matches/0/tokens/2/0").asInt());
1008 assertEquals(345, res.at("/matches/0/tokens/2/1").asInt());
Nils Diewald277e9ce2014-11-06 03:42:11 +00001009 };
Nils Diewaldc7d08d92014-11-05 21:30:05 +00001010
Nils Diewaldafab8f32015-01-26 19:11:32 +00001011
Nils Diewaldb84e7272014-11-07 01:27:38 +00001012 @Test
1013 public void searchJSONmultitermRewriteBug () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +00001014 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +00001015 KrillIndex ki = new KrillIndex();
Nils Diewaldb84e7272014-11-07 01:27:38 +00001016
Nils Diewaldafab8f32015-01-26 19:11:32 +00001017 assertEquals(ki.numberOf("documents"), 0);
1018
1019 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +00001020 FieldDocument fd = ki.addDoc(1,
1021 getClass().getResourceAsStream("/bzk/D59-00089.json.gz"), true);
Nils Diewaldafab8f32015-01-26 19:11:32 +00001022 ki.commit();
1023
1024 assertEquals(ki.numberOf("documents"), 1);
1025 assertEquals("BZK", fd.getCorpusSigle());
1026
1027 // [tt/p="A.*"]{0,3}[tt/p="N.*"]
Eliza Margaretha6f989202016-10-14 21:48:29 +02001028 String json = getJsonString(
1029 getClass().getResource("/queries/bugs/multiterm_rewrite.jsonld")
1030 .getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001031
Nils Diewaldbbd39a52015-02-23 19:56:57 +00001032 Krill ks = new Krill(json);
Nils Diewald2d5f8102015-02-26 21:07:54 +00001033 KrillCollection kc = ks.getCollection();
Nils Diewaldc471b182014-11-19 22:51:15 +00001034
Nils Diewaldafab8f32015-01-26 19:11:32 +00001035 // No index was set
1036 assertEquals(-1, kc.numberOf("documents"));
1037 kc.setIndex(ki);
Nils Diewaldc471b182014-11-19 22:51:15 +00001038
Nils Diewaldafab8f32015-01-26 19:11:32 +00001039 // Index was set but vc restricted to WPD
1040 assertEquals(0, kc.numberOf("documents"));
Nils Diewaldc471b182014-11-19 22:51:15 +00001041
Akron176c9b12015-07-29 19:53:40 +02001042 /*
Nils Diewaldbb33da22015-03-04 16:24:25 +00001043 kc.extend(new CollectionBuilder().or("corpusSigle", "BZK"));
Akron176c9b12015-07-29 19:53:40 +02001044 */
1045 CollectionBuilder cb = new CollectionBuilder();
Akron40550172015-08-04 03:06:12 +02001046 kc.fromBuilder(cb.orGroup().with(kc.getBuilder())
1047 .with(cb.term("corpusSigle", "BZK")));
Akron176c9b12015-07-29 19:53:40 +02001048
Nils Diewaldafab8f32015-01-26 19:11:32 +00001049 ks.setCollection(kc);
1050 assertEquals(1, kc.numberOf("documents"));
Nils Diewald1220e3e2014-11-08 03:18:58 +00001051
Nils Diewald884dbcf2015-02-27 17:02:28 +00001052 Result kr = ks.apply(ki);
Nils Diewaldbb33da22015-03-04 16:24:25 +00001053
1054 assertEquals(kr.getSerialQuery(),
1055 "spanOr([SpanMultiTermQueryWrapper(tokens:/tt/p:N.*/), "
1056 + "spanNext(spanRepetition(SpanMultiTermQueryWrapper"
1057 + "(tokens:/tt/p:A.*/){1,3}), "
1058 + "SpanMultiTermQueryWrapper(tokens:/tt/p:N.*/))])");
Nils Diewaldb84e7272014-11-07 01:27:38 +00001059
margaretha7f4fd652018-11-22 18:00:02 +01001060 assertEquals(68,kr.getTotalResults());
Nils Diewaldafab8f32015-01-26 19:11:32 +00001061 assertEquals(0, kr.getStartIndex());
Nils Diewald5871e4d2014-11-07 03:48:25 +00001062
Nils Diewaldbb33da22015-03-04 16:24:25 +00001063 assertEquals(kr.getMatch(0).getSnippetBrackets(),
Akronf05fde62016-08-03 23:46:17 +02001064 "[[Saragat-Partei]] zerfรคllt Rom (ADN) die von dem ...");
Nils Diewaldbb33da22015-03-04 16:24:25 +00001065 assertEquals(kr.getMatch(1).getSnippetBrackets(),
Akronf05fde62016-08-03 23:46:17 +02001066 "[[Saragat-Partei]] zerfรคllt Rom (ADN) die von dem ...");
Nils Diewaldbb33da22015-03-04 16:24:25 +00001067 assertEquals(kr.getMatch(2).getSnippetBrackets(),
Akronf05fde62016-08-03 23:46:17 +02001068 "Saragat-Partei zerfรคllt [[Rom]] (ADN) "
Akron43cea662016-02-15 23:43:59 +01001069 + "die von dem Rechtssozialisten Saragat ...");
Nils Diewaldbb33da22015-03-04 16:24:25 +00001070 assertEquals(kr.getMatch(3).getSnippetBrackets(),
Akronf05fde62016-08-03 23:46:17 +02001071 "Saragat-Partei zerfรคllt Rom ([[ADN]]) "
Akron43cea662016-02-15 23:43:59 +01001072 + "die von dem Rechtssozialisten Saragat gefรผhrte ...");
margaretha7f4fd652018-11-22 18:00:02 +01001073 assertEquals("... auseinander, nachdem vor einiger Zeit mehrere "
1074 + "[[prominente Mitglieder]] ihren Austritt erklรคrt "
1075 + "und unter dem ...", kr.getMatch(23).getSnippetBrackets());
Nils Diewaldb84e7272014-11-07 01:27:38 +00001076 };
1077
1078
Nils Diewald56dc2582014-11-04 21:33:46 +00001079 @Test
Akrone4fdce42015-11-13 16:06:10 +01001080 public void searchJSONtokenDistanceSpanBug () throws IOException {
1081 // Construct index
1082 KrillIndex ki = new KrillIndex();
1083 ki.addDoc(1, getClass().getResourceAsStream("/goe/AGX-00002.json"),
Akron42993552016-02-04 13:24:24 +01001084 false);
Akrone4fdce42015-11-13 16:06:10 +01001085 ki.addDoc(2, getClass().getResourceAsStream("/bzk/D59-00089.json.gz"),
Akron42993552016-02-04 13:24:24 +01001086 true);
Akrone4fdce42015-11-13 16:06:10 +01001087 ki.commit();
Akron42993552016-02-04 13:24:24 +01001088
Akrone4fdce42015-11-13 16:06:10 +01001089 // ({1:Sonne []* Erde} | {2: Erde []* Sonne})
Eliza Margaretha6f989202016-10-14 21:48:29 +02001090 String json = getJsonString(getClass()
1091 .getResource("/queries/bugs/tokendistancespan_bug.jsonld")
1092 .getFile());
Akrone4fdce42015-11-13 16:06:10 +01001093
1094 Krill ks = new Krill(json);
1095 Result kr = ks.apply(ki);
1096 ObjectMapper mapper = new ObjectMapper();
1097 JsonNode res = mapper.readTree(kr.toJsonString());
1098 assertTrue(res.at("/errors").isMissingNode());
1099 };
1100
1101
1102 @Test
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001103 public void searchJSONCollection () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +00001104 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +00001105 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +00001106 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +00001107 for (String i : new String[] { "00001", "00002", "00003", "00004",
1108 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +02001109 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +00001110 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +00001111 };
1112 ki.commit();
Eliza Margaretha6f989202016-10-14 21:48:29 +02001113 String json = getJsonString(getClass()
1114 .getResource("/queries/metaquery8-nocollection.jsonld")
1115 .getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001116
Nils Diewaldbbd39a52015-02-23 19:56:57 +00001117 Krill ks = new Krill(json);
Nils Diewald884dbcf2015-02-27 17:02:28 +00001118 Result kr = ks.apply(ki);
Nils Diewaldafab8f32015-01-26 19:11:32 +00001119 assertEquals(kr.getTotalResults(), 276);
1120 assertEquals(0, kr.getStartIndex());
1121 assertEquals(10, kr.getItemsPerPage());
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001122
Eliza Margaretha6f989202016-10-14 21:48:29 +02001123 json = getJsonString(
1124 getClass().getResource("/queries/metaquery8.jsonld").getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001125
Nils Diewaldbbd39a52015-02-23 19:56:57 +00001126 ks = new Krill(json);
Nils Diewald3aa9e692015-02-20 22:20:11 +00001127 kr = ks.apply(ki);
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001128
Nils Diewaldafab8f32015-01-26 19:11:32 +00001129 assertEquals(kr.getTotalResults(), 147);
1130 assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID());
1131 assertEquals(0, kr.getStartIndex());
1132 assertEquals(10, kr.getItemsPerPage());
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001133
Eliza Margaretha6f989202016-10-14 21:48:29 +02001134 json = getJsonString(getClass()
1135 .getResource("/queries/metaquery8-filtered.jsonld").getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001136
Nils Diewaldbbd39a52015-02-23 19:56:57 +00001137 ks = new Krill(json);
Nils Diewald3aa9e692015-02-20 22:20:11 +00001138 kr = ks.apply(ki);
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001139
Nils Diewaldafab8f32015-01-26 19:11:32 +00001140 assertEquals(kr.getTotalResults(), 28);
1141 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
1142 assertEquals(0, kr.getStartIndex());
1143 assertEquals(10, kr.getItemsPerPage());
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001144
Eliza Margaretha6f989202016-10-14 21:48:29 +02001145 json = getJsonString(getClass()
1146 .getResource("/queries/metaquery8-filtered-further.jsonld")
1147 .getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001148
Nils Diewaldbbd39a52015-02-23 19:56:57 +00001149 ks = new Krill(json);
Nils Diewald3aa9e692015-02-20 22:20:11 +00001150 kr = ks.apply(ki);
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001151
Nils Diewaldafab8f32015-01-26 19:11:32 +00001152 assertEquals(kr.getTotalResults(), 0);
1153 assertEquals(0, kr.getStartIndex());
1154 assertEquals(10, kr.getItemsPerPage());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001155
Akron176c9b12015-07-29 19:53:40 +02001156
Eliza Margaretha6f989202016-10-14 21:48:29 +02001157 json = getJsonString(getClass()
1158 .getResource("/queries/metaquery8-filtered-nested.jsonld")
1159 .getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001160
Nils Diewaldbbd39a52015-02-23 19:56:57 +00001161 ks = new Krill(json);
Nils Diewald3aa9e692015-02-20 22:20:11 +00001162 kr = ks.apply(ki);
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001163
Akron176c9b12015-07-29 19:53:40 +02001164 /*
Nils Diewaldbb33da22015-03-04 16:24:25 +00001165 assertEquals("filter with QueryWrapperFilter("
1166 + "+(ID:WPD_AAA.00003 (+tokens:s:die"
1167 + " +tokens:s:Schriftzeichen)))",
1168 ks.getCollection().getFilter(1).toString());
Akron176c9b12015-07-29 19:53:40 +02001169 */
Akron40550172015-08-04 03:06:12 +02001170 assertEquals(
1171 "AndGroup(OrGroup(ID:WPD_AAA.00001 ID:WPD_AAA.00002) OrGroup(ID:WPD_AAA.00003 AndGroup(tokens:s:die tokens:s:Schriftzeichen)))",
1172 ks.getCollection().toString());
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001173
Nils Diewaldafab8f32015-01-26 19:11:32 +00001174 assertEquals(kr.getTotalResults(), 119);
1175 assertEquals(0, kr.getStartIndex());
1176 assertEquals(10, kr.getItemsPerPage());
Nils Diewaldfb4d7b02014-04-09 17:56:17 +00001177 };
1178
Nils Diewald1e5d5942014-05-20 13:29:53 +00001179
1180 @Test
1181 public void searchJSONSentenceContext () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +00001182 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +00001183 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +00001184 // Indexing test files
Akron42993552016-02-04 13:24:24 +01001185 for (String i : new String[] { "00001", "00002", "00003", "00004",
Nils Diewaldbb33da22015-03-04 16:24:25 +00001186 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +02001187 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +00001188 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +00001189 };
1190 ki.commit();
Nils Diewald1e5d5942014-05-20 13:29:53 +00001191
Eliza Margaretha6f989202016-10-14 21:48:29 +02001192 String json = getJsonString(getClass()
1193 .getResource("/queries/bsp-context-2.jsonld").getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001194
Nils Diewaldbbd39a52015-02-23 19:56:57 +00001195 Krill ks = new Krill(json);
Nils Diewaldf5ab4b22015-02-25 20:55:16 +00001196 ks.getMeta().setCutOff(false);
1197 SearchContext sc = ks.getMeta().getContext();
Nils Diewaldafab8f32015-01-26 19:11:32 +00001198 sc.left.setLength((short) 10);
1199 sc.right.setLength((short) 10);
Nils Diewaldbb33da22015-03-04 16:24:25 +00001200
Nils Diewald884dbcf2015-02-27 17:02:28 +00001201 Result kr = ks.apply(ki);
Akronfd05f502015-07-30 18:34:26 +02001202
Nils Diewaldbb33da22015-03-04 16:24:25 +00001203 assertEquals(kr.getMatch(1).getSnippetBrackets(),
Akronf05fde62016-08-03 23:46:17 +02001204 "... dezimalen [[Wert]] 65 sowohl ...");
Nils Diewaldafab8f32015-01-26 19:11:32 +00001205 assertEquals(kr.getTotalResults(), 3);
1206 assertEquals(0, kr.getStartIndex());
1207 assertEquals(25, kr.getItemsPerPage());
Akron499c94c2016-02-04 13:13:43 +01001208
Eliza Margaretha6f989202016-10-14 21:48:29 +02001209 assertFalse(
1210 kr.getContext().toJsonNode().toString().equals("\"base/s:s\""));
Nils Diewald1e5d5942014-05-20 13:29:53 +00001211
Eliza Margaretha6f989202016-10-14 21:48:29 +02001212 json = getJsonString(getClass()
1213 .getResource("/queries/bsp-context-sentence.jsonld").getFile());
Nils Diewald1e5d5942014-05-20 13:29:53 +00001214
Nils Diewaldbbd39a52015-02-23 19:56:57 +00001215 kr = new Krill(json).apply(ki);
Akron43cea662016-02-15 23:43:59 +01001216 assertEquals(kr.getContext().toJsonNode().toString(), "\"base/s:s\"");
Akron499c94c2016-02-04 13:13:43 +01001217
Nils Diewaldbb33da22015-03-04 16:24:25 +00001218 assertEquals(kr.getMatch(0).getSnippetBrackets(),
Akronf05fde62016-08-03 23:46:17 +02001219 "steht a fรผr den dezimalen [[Wert]] 97 sowohl im ASCII-"
Nils Diewaldbb33da22015-03-04 16:24:25 +00001220 + " als auch im Unicode-Zeichensatz");
1221 assertEquals(kr.getMatch(1).getSnippetBrackets(),
Akronf05fde62016-08-03 23:46:17 +02001222 "steht A fรผr den dezimalen [[Wert]] 65 sowohl im ASCII-"
Nils Diewaldbb33da22015-03-04 16:24:25 +00001223 + " als auch im Unicode-Zeichensatz");
1224 assertEquals(kr.getMatch(2).getSnippetBrackets(),
1225 "In einem Zahlensystem mit einer Basis grรถรŸer "
1226 + "als 10 steht A oder a hรคufig fรผr den dezimalen"
Akronf05fde62016-08-03 23:46:17 +02001227 + " [[Wert]] 10, siehe auch Hexadezimalsystem.");
Nils Diewald1e5d5942014-05-20 13:29:53 +00001228 };
1229
1230
Nils Diewald2276e1c2014-04-10 15:01:59 +00001231 @Test
Nils Diewald54187632014-06-11 14:39:29 +00001232 public void searchJSONbug () throws IOException {
Nils Diewaldafab8f32015-01-26 19:11:32 +00001233 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +00001234 KrillIndex ki = new KrillIndex();
Nils Diewaldafab8f32015-01-26 19:11:32 +00001235 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +00001236 for (String i : new String[] { "00001", "00002", "00003", "00004",
1237 "00005", "00006", "02439" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +02001238 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +00001239 true);
Nils Diewaldafab8f32015-01-26 19:11:32 +00001240 };
1241 ki.commit();
Nils Diewald54187632014-06-11 14:39:29 +00001242
Eliza Margaretha6f989202016-10-14 21:48:29 +02001243 String json = getJsonString(
1244 getClass().getResource("/queries/bsp-bug.jsonld").getFile());
Nils Diewald54187632014-06-11 14:39:29 +00001245
Nils Diewald884dbcf2015-02-27 17:02:28 +00001246 Result kr = new Krill(json).apply(ki);
Nils Diewaldc471b182014-11-19 22:51:15 +00001247
Nils Diewaldbb33da22015-03-04 16:24:25 +00001248 assertEquals(kr.getError(0).getMessage(),
1249 "Operation needs operand list");
Nils Diewald54187632014-06-11 14:39:29 +00001250 };
1251
Akronf9def5e2016-10-10 21:26:46 +02001252
1253 @Test
Akronf785dae2016-08-10 17:12:40 +02001254 public void searchJSONdistanceWithRegexesBug () throws IOException {
1255 // Construct index
1256 KrillIndex ki = new KrillIndex();
1257 // Indexing test files
1258 for (String i : new String[] { "00001" }) {
Akronf9def5e2016-10-10 21:26:46 +02001259 // , "00002", "00003", "00004", "00005", "00006", "02439"
Eliza Margaretha6f989202016-10-14 21:48:29 +02001260 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Akronf785dae2016-08-10 17:12:40 +02001261 true);
1262 };
1263 ki.commit();
1264
Akronf9def5e2016-10-10 21:26:46 +02001265 // "der" []{2,3} [opennlp/p="NN"]
Eliza Margaretha6f989202016-10-14 21:48:29 +02001266 String json = getJsonString(getClass()
1267 .getResource("/queries/bugs/distances_with_regex_bug.jsonld")
1268 .getFile());
Akronf785dae2016-08-10 17:12:40 +02001269
1270 Result kr = new Krill(json).apply(ki);
1271
Eliza Margaretha6f989202016-10-14 21:48:29 +02001272 assertEquals(kr.getMatch(0).getSnippetBrackets(),
Akronf9def5e2016-10-10 21:26:46 +02001273 "Mit Ausnahme von Fremdwรถrtern und Namen ist das A der einzige Buchstabe im Deutschen, [[der zweifach am Anfang]] eines Wortes stehen darf, etwa im Wort Aal.");
Akronf785dae2016-08-10 17:12:40 +02001274
1275 };
1276
Nils Diewaldafab8f32015-01-26 19:11:32 +00001277
Nils Diewaldef7124e2014-11-12 20:08:13 +00001278 /**
1279 * This is a breaking test for #179
1280 */
1281 @Test
1282 public void searchJSONexpansionBug () throws IOException {
Nils Diewaldbb33da22015-03-04 16:24:25 +00001283 // Construct index
1284 KrillIndex ki = new KrillIndex();
1285 // Indexing test files
1286 ki.addDoc(getClass().getResourceAsStream("/wiki/00002.json.gz"), true);
1287 ki.commit();
1288
1289 // Expansion bug
1290 // der alte Digraph Aa durch ร…
Eliza Margaretha6f989202016-10-14 21:48:29 +02001291 String json = getJsonString(getClass()
1292 .getResource("/queries/bugs/expansion_bug_2.jsonld").getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001293
1294 Result kr = new Krill(json).apply(ki);
Eliza Margaretha6f989202016-10-14 21:48:29 +02001295 assertEquals(
1296 "... Buchstabe des Alphabetes. In Dรคnemark ist "
1297 + "[[der alte Digraph Aa durch ร…]] ersetzt worden, "
1298 + "in Eigennamen und Ortsnamen ...",
1299 kr.getMatch(0).getSnippetBrackets());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001300 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
1301 assertEquals(kr.getTotalResults(), 1);
1302
Akron9f1a55b2016-04-20 19:11:06 +02001303
1304 // TODO: base/s:t needs to be defined!!!
1305 QueryBuilder qb = new QueryBuilder("tokens");
1306 kr = new Krill(qb.tag("base/s:t")).apply(ki);
Akron9f1a55b2016-04-20 19:11:06 +02001307 assertEquals(kr.getTotalResults(), 1);
1308
1309
Nils Diewaldbb33da22015-03-04 16:24:25 +00001310 // der alte Digraph Aa durch []
1311 // Works with one document
Eliza Margaretha6f989202016-10-14 21:48:29 +02001312 json = getJsonString(getClass()
1313 .getResource("/queries/bugs/expansion_bug.jsonld").getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001314
1315 kr = new Krill(json).apply(ki);
1316
Akron9f1a55b2016-04-20 19:11:06 +02001317 // focus(254: spanContain(<tokens:base/s:t />, {254: spanNext(spanNext(spanNext(spanNext(tokens:s:der, tokens:s:alte), tokens:s:Digraph), tokens:s:Aa), spanExpansion(tokens:s:durch, []{1, 1}, right))}))
1318
Eliza Margaretha6f989202016-10-14 21:48:29 +02001319 assertEquals(
1320 "... Buchstabe des Alphabetes. In Dรคnemark ist "
1321 + "[[der alte Digraph Aa durch ร…]] ersetzt worden, "
1322 + "in Eigennamen und Ortsnamen ...",
1323 kr.getMatch(0).getSnippetBrackets());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001324 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
1325 assertEquals(kr.getTotalResults(), 1);
1326
1327 // Now try with one file ahead
1328 ki = new KrillIndex();
1329 for (String i : new String[] { "00001", "00002" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +02001330 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +00001331 true);
1332 };
1333 ki.commit();
1334
1335 // Expansion bug
1336 // der alte Digraph Aa durch ร…
Eliza Margaretha6f989202016-10-14 21:48:29 +02001337 json = getJsonString(getClass()
1338 .getResource("/queries/bugs/expansion_bug_2.jsonld").getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001339
1340 kr = new Krill(json).apply(ki);
1341
Eliza Margaretha6f989202016-10-14 21:48:29 +02001342 assertEquals(
1343 "... Buchstabe des Alphabetes. In Dรคnemark ist "
1344 + "[[der alte Digraph Aa durch ร…]] ersetzt worden, "
1345 + "in Eigennamen und Ortsnamen ...",
1346 kr.getMatch(0).getSnippetBrackets());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001347 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
1348 assertEquals(kr.getTotalResults(), 1);
1349
1350 // der alte Digraph Aa durch []
Eliza Margaretha6f989202016-10-14 21:48:29 +02001351 json = getJsonString(getClass()
1352 .getResource("/queries/bugs/expansion_bug.jsonld").getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001353
1354 kr = new Krill(json).apply(ki);
Eliza Margaretha6f989202016-10-14 21:48:29 +02001355 assertEquals(
1356 "... Buchstabe des Alphabetes. In Dรคnemark ist "
1357 + "[[der alte Digraph Aa durch ร…]] ersetzt worden, "
1358 + "in Eigennamen und Ortsnamen ...",
1359 kr.getMatch(0).getSnippetBrackets());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001360 assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
1361 assertEquals(kr.getTotalResults(), 1);
Nils Diewaldef7124e2014-11-12 20:08:13 +00001362 };
Akron8abefa12016-02-13 05:35:42 +01001363
Akronf9def5e2016-10-10 21:26:46 +02001364
1365 @Test
Akrondfc93572016-08-10 19:01:34 +02001366 public void queryJSONzeroRepetitionBug () throws IOException {
Akronf9def5e2016-10-10 21:26:46 +02001367 // der{0}
1368 KrillIndex ki = new KrillIndex();
1369 ki.addDoc(getClass().getResourceAsStream("/wiki/00001.json.gz"), true);
1370 ki.commit();
Akrondfc93572016-08-10 19:01:34 +02001371
Eliza Margaretha6f989202016-10-14 21:48:29 +02001372 String json = getJsonString(getClass()
1373 .getResource("/queries/bugs/zero_repetition_bug.jsonld")
1374 .getFile());
Akrondfc93572016-08-10 19:01:34 +02001375
Akronf9def5e2016-10-10 21:26:46 +02001376 Result kr = new Krill(json).apply(ki);
1377
1378 assertEquals(783, kr.getError(0).getCode());
Eliza Margaretha6f989202016-10-14 21:48:29 +02001379 assertEquals("This query can't match anywhere",
1380 kr.getError(0).getMessage());
Akronf9def5e2016-10-10 21:26:46 +02001381 };
1382
Akron13db6152016-02-19 14:08:38 +01001383
Akron163a04b2020-01-20 10:43:04 +01001384 @Test
1385 public void queryJSONcosmasSentenceNegationBug () throws IOException {
1386 KrillIndex ki = new KrillIndex();
1387
1388 // Indexing test files
1389 for (String i : new String[] {
1390 "00001",
1391 "00002",
1392 "00003",
1393 "00004",
1394 "00005",
1395 "00006",
1396 "02439"
1397 }) {
1398 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
1399 true);
1400 };
1401
1402 ki.commit();
1403
1404 String json = getJsonString(getClass()
1405 .getResource("/queries/bugs/cosmas-exclude.jsonld")
1406 .getFile());
1407
1408 Result kr = new Krill(json).apply(ki);
1409
1410 assertEquals(0, kr.getTotalResults());
1411 };
1412
1413
Akron8abefa12016-02-13 05:35:42 +01001414 /**
Akron13db6152016-02-19 14:08:38 +01001415 * This is a Schreibgebrauch ressource that didn't work for
1416 * element queries.
Akron8abefa12016-02-13 05:35:42 +01001417 */
1418 @Test
1419 public void searchSchreibgebrauchData () throws IOException {
1420 // Construct index
1421 KrillIndex ki = new KrillIndex();
1422 // Indexing test files
Eliza Margaretha6f989202016-10-14 21:48:29 +02001423 ki.addDoc(
1424 getClass().getResourceAsStream("/sgbr/BSP-2013-01-32.json.gz"),
1425 true);
Akron8abefa12016-02-13 05:35:42 +01001426 ki.commit();
1427
1428 Krill k = new Krill(new QueryBuilder("tokens").tag("base/s:s"));
1429
Akron13db6152016-02-19 14:08:38 +01001430 assertEquals(k.getSpanQuery().toString(), "<tokens:base/s:s />");
Akron8abefa12016-02-13 05:35:42 +01001431
1432 Result kr = k.apply(ki);
1433 assertEquals(kr.getTotalResults(), 1);
1434 assertEquals(kr.getMatch(0).getSnippetBrackets(),
Akronf05fde62016-08-03 23:46:17 +02001435 "[[Selbst ist der Jeck]]");
Akron8abefa12016-02-13 05:35:42 +01001436
1437 assertEquals(kr.getMatch(0).getTextSigle(), "PRO-DUD_BSP-2013-01.32");
1438 };
1439
Akron2ea48e62017-04-28 20:23:30 +02001440
1441 /**
1442 * This is a Schreibgebrauch ressource that didn't work for
1443 * element queries.
1444 */
1445 @Test
1446 public void searchNewDeReKoData () throws IOException {
1447 // Construct index
1448 KrillIndex ki = new KrillIndex();
1449 // Indexing test files
1450 // Indexing test files
1451 FieldDocument fd = ki.addDoc(1,
1452 getClass().getResourceAsStream("/goe/AGA-03828-new.json.gz"),
1453 true);
1454 ki.commit();
1455
1456 assertEquals(fd.getUID(), 1);
1457 assertEquals(fd.getTextSigle(), "GOE/AGA/03828");
1458 assertEquals(fd.getDocSigle(), "GOE/AGA");
1459 assertEquals(fd.getCorpusSigle(), "GOE");
Akron32b95192019-01-11 13:58:55 +01001460 assertEquals(fd.getFieldValue("title"), "Autobiographische Einzelheiten");
1461 assertNull(fd.getFieldValue("subTitle"));
1462 assertEquals(fd.getFieldValue("textType"), "Autobiographie");
1463 assertNull(fd.getFieldValue("textTypeArt"));
1464 assertNull(fd.getFieldValue("textTypeRef"));
1465 assertNull(fd.getFieldValue("textColumn"));
1466 assertNull(fd.getFieldValue("textDomain"));
Akron2ea48e62017-04-28 20:23:30 +02001467 // assertEquals(fd.getPages(), "529-547");
Akron32b95192019-01-11 13:58:55 +01001468 assertEquals(fd.getFieldValue("availability"), "QAO-NC");
1469 assertEquals(fd.getFieldValue("creationDate"), "1820");
1470 assertEquals(fd.getFieldValue("pubDate"), "1982");
1471 assertEquals(fd.getFieldValue("author"), "Goethe, Johann Wolfgang von");
1472 assertNull(fd.getFieldValue("textClass"));
1473 assertEquals(fd.getFieldValue("language"), "de");
1474 assertEquals(fd.getFieldValue("pubPlace"), "Mรผnchen");
1475 assertEquals(fd.getFieldValue("reference"),
Akron2ea48e62017-04-28 20:23:30 +02001476 "Goethe, Johann Wolfgang von:"
1477 + " Autobiographische Einzelheiten,"
1478 + " (Geschrieben bis 1832), In: Goethe,"
1479 + " Johann Wolfgang von: Goethes Werke,"
1480 + " Bd. 10, Autobiographische Schriften"
1481 + " II, Hrsg.: Trunz, Erich. Mรผnchen: "
1482 + "Verlag C. H. Beck, 1982, S. 529-547");
Akron32b95192019-01-11 13:58:55 +01001483 assertEquals(fd.getFieldValue("publisher"), "Verlag C. H. Beck");
1484 assertNull(fd.getFieldValue("editor"));
1485 assertNull(fd.getFieldValue("fileEditionStatement"));
1486 assertNull(fd.getFieldValue("biblEditionStatement"));
1487 assertNull(fd.getFieldValue("keywords"));
Akron2ea48e62017-04-28 20:23:30 +02001488
Akron32b95192019-01-11 13:58:55 +01001489 assertEquals(fd.getFieldValue("tokenSource"), "base#tokens");
1490 assertEquals(fd.getFieldValue("foundries"),
Akron2ea48e62017-04-28 20:23:30 +02001491 "corenlp corenlp/constituency corenlp/morpho corenlp/sentences dereko dereko/structure dereko/structure/base-sentences-paragraphs-pagebreaks malt malt/dependency marmot marmot/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho");
Akron32b95192019-01-11 13:58:55 +01001492 assertEquals(fd.getFieldValue("layerInfos"),
Akron2ea48e62017-04-28 20:23:30 +02001493 "corenlp/c=spans corenlp/p=tokens corenlp/s=spans dereko/s=spans malt/d=rels marmot/m=tokens marmot/p=tokens opennlp/p=tokens opennlp/s=spans tt/l=tokens tt/p=tokens");
1494
Akron32b95192019-01-11 13:58:55 +01001495 assertEquals(fd.getFieldValue("corpusTitle"), "Goethes Werke");
1496 assertNull(fd.getFieldValue("corpusSubTitle"));
1497 assertEquals(fd.getFieldValue("corpusAuthor"), "Goethe, Johann Wolfgang von");
1498 assertEquals(fd.getFieldValue("corpusEditor"), "Trunz, Erich");
1499 assertEquals(fd.getFieldValue("docTitle"),
Akron2ea48e62017-04-28 20:23:30 +02001500 "Goethe: Autobiographische Schriften II, (1817-1825, 1832)");
Akron32b95192019-01-11 13:58:55 +01001501 assertNull(fd.getFieldValue("docSubTitle"));
1502 assertNull(fd.getFieldValue("docEditor"));
1503 assertNull(fd.getFieldValue("docAuthor"));
Akron2ea48e62017-04-28 20:23:30 +02001504
1505 Krill ks = new Krill(new QueryBuilder("tokens").seg("marmot/m:case:nom")
1506 .with("marmot/m:number:pl"));
1507 Result kr = ks.apply(ki);
1508
1509 assertEquals(kr.getTotalResults(), 141);
1510 assertEquals(0, kr.getStartIndex());
1511 assertEquals(25, kr.getItemsPerPage());
1512 };
1513
Akron70ce0c02018-05-25 23:44:26 +02001514 @Test
1515 public void searchLongMatch () throws IOException {
1516
1517 // Construct index
1518 KrillIndex ki = new KrillIndex();
1519 // Indexing test files
1520 ki.addDoc(
1521 getClass().getResourceAsStream("/goe/AGX-00002.json"),
1522 false);
1523 ki.commit();
1524
1525 Krill k = new Krill(new QueryBuilder("tokens").tag("xy/z:long"));
1526
1527 assertEquals(k.getSpanQuery().toString(), "<tokens:xy/z:long />");
1528
1529 Result kr = k.apply(ki);
1530 assertEquals(kr.getTotalResults(), 1);
1531 assertEquals(2, kr.getMatch(0).getStartPos());
1532 assertEquals(52, kr.getMatch(0).getEndPos());
1533 assertEquals(kr.getMatch(0).getSnippetBrackets(),
Akron26e54172024-05-23 17:03:03 +02001534 "Maximen und [[Reflexionen Religion und Christentum. wir sind naturforschend Pantheisten, dichtend Polytheisten, sittlich Monotheisten. Gott, wenn wir hoch stehen, ist alles; stehen wir niedrig, so ist er ein Supplement unsrer Armseligkeit. die Kreatur ist sehr schwach; denn sucht sie etwas, findet sie's nicht. stark aber ist Gott; denn sucht er die Kreatur]<!>], so hat er sie gleich in ...");
Akron70ce0c02018-05-25 23:44:26 +02001535 assertEquals(kr.getMatch(0).getSnippetHTML(),
1536 "<span class=\"context-left\">Maximen und </span><span class=\"match\"><mark>Reflexionen Religion und Christentum. wir sind naturforschend Pantheisten, dichtend Polytheisten, sittlich Monotheisten. Gott, wenn wir hoch stehen, ist alles; stehen wir niedrig, so ist er ein Supplement unsrer Armseligkeit. die Kreatur ist sehr schwach; denn sucht sie etwas, findet sie's nicht. stark aber ist Gott; denn sucht er die Kreatur</mark><span class=\"cutted\"></span></span><span class=\"context-right\">, so hat er sie gleich in<span class=\"more\"></span></span>");
1537 assertEquals(kr.getMatch(0).getTextSigle(), "GOE_AGX.00002");
1538 };
1539
Akron906470f2023-12-19 11:13:32 +01001540 @Test
1541 public void emojiSearch () throws IOException {
1542
1543 // Construct index
1544 KrillIndex ki = new KrillIndex();
1545 // Indexing test files
1546 ki.addDoc(
1547 getClass().getResourceAsStream("/others/KYC-MAI-001888-censored.json"),
1548 false);
1549 ki.commit();
1550
1551 Krill k = new Krill(new QueryBuilder("tokens").seg("s:๐ŸŽ‰"));
1552
1553 assertEquals(k.getSpanQuery().toString(), "tokens:s:๐ŸŽ‰");
1554
1555 Result kr = k.apply(ki);
1556 assertEquals(kr.getTotalResults(), 1);
1557 assertEquals(kr.getMatch(0).getSnippetBrackets(),
1558 "... Strasse antreffe.๐Ÿ˜Š Versprochen Xxx-Xxx [[๐ŸŽ‰]]");
1559 assertEquals(kr.getMatch(0).getSnippetHTML(),
1560 "<span class=\"context-left\"><span class=\"more\"></span>Strasse antreffe.๐Ÿ˜Š Versprochen Xxx-Xxx </span><span class=\"match\"><mark>๐ŸŽ‰</mark></span><span class=\"context-right\"></span>");
1561 assertEquals(kr.getMatch(0).getTextSigle(), "KYC/MAI/001888");
1562 };
1563
Nils Diewaldc925b492013-12-03 23:56:10 +00001564};