blob: 23f928ec5fdaf252cf29cf1f5df20d49a49e5a02 [file] [log] [blame]
Eliza Margaretha01929182014-02-19 11:48:59 +00001package de.ids_mannheim.korap.index;
2
Nils Diewaldf399a672013-11-18 17:55:22 +00003import java.util.*;
4import java.io.*;
5
6import org.apache.lucene.util.Version;
7import org.apache.lucene.util.BytesRef;
8import org.apache.lucene.util.Bits;
9
10import static org.junit.Assert.*;
11import org.junit.Test;
12import org.junit.Ignore;
13import org.junit.runner.RunWith;
14import org.junit.runners.JUnit4;
15
Nils Diewalda14ecd62015-02-26 21:00:20 +000016import de.ids_mannheim.korap.KrillIndex;
Nils Diewald0339d462015-02-26 14:53:56 +000017import de.ids_mannheim.korap.KrillQuery;
Akron74748c62016-06-29 00:22:43 +020018import de.ids_mannheim.korap.query.QueryBuilder;
Nils Diewaldf399a672013-11-18 17:55:22 +000019import de.ids_mannheim.korap.index.FieldDocument;
Nils Diewalde4986d72015-02-27 17:35:00 +000020import de.ids_mannheim.korap.index.MultiTermTokenStream;
Akron74748c62016-06-29 00:22:43 +020021import de.ids_mannheim.korap.response.Result;
22import de.ids_mannheim.korap.util.QueryException;
Nils Diewaldf399a672013-11-18 17:55:22 +000023
Akronc74dee02018-02-07 18:48:30 +010024import com.fasterxml.jackson.databind.ObjectMapper;
25import com.fasterxml.jackson.databind.JsonNode;
26
Nils Diewaldf399a672013-11-18 17:55:22 +000027@RunWith(JUnit4.class)
Nils Diewalda14ecd62015-02-26 21:00:20 +000028public class TestKrillIndex {
Nils Diewaldf399a672013-11-18 17:55:22 +000029
Akron8798be82016-06-23 23:10:25 +020030
31 /*
32 * Todo: Currently fields can only be set if they are
33 * part of the general field set.
34 * this will change soon!
35 */
36
Nils Diewaldf399a672013-11-18 17:55:22 +000037 @Test
38 public void indexExample () throws IOException {
Nils Diewalda14ecd62015-02-26 21:00:20 +000039 KrillIndex ki = new KrillIndex();
Nils Diewaldf399a672013-11-18 17:55:22 +000040
Nils Diewald5c375702015-02-09 20:58:24 +000041 assertEquals(0, ki.numberOf("base", "documents"));
42 assertEquals(0, ki.numberOf("base", "tokens"));
43 assertEquals(0, ki.numberOf("base", "sentences"));
44 assertEquals(0, ki.numberOf("base", "paragraphs"));
Nils Diewald019e3972014-09-29 19:18:26 +000045
Nils Diewald5c375702015-02-09 20:58:24 +000046 FieldDocument fd = new FieldDocument();
Nils Diewaldf399a672013-11-18 17:55:22 +000047
Nils Diewald5c375702015-02-09 20:58:24 +000048 fd.addString("name", "Peter");
49 fd.addInt("zahl1", 56);
50 fd.addInt("zahl2", "58");
Akron8798be82016-06-23 23:10:25 +020051 fd.addInt("zahl3", "059");
52 fd.addInt("UID", 1);
Nils Diewald5c375702015-02-09 20:58:24 +000053 fd.addText("teaser", "Das ist der Name der Rose");
Eliza Margaretha6f989202016-10-14 21:48:29 +020054 fd.addTV("base", "ich bau", "[(0-3)s:ich|l:ich|p:PPER|-:sentences$<i>2]"
55 + "[(4-7)s:bau|l:bauen|p:VVFIN]");
Nils Diewald5c375702015-02-09 20:58:24 +000056 ki.addDoc(fd);
Nils Diewaldf399a672013-11-18 17:55:22 +000057
Nils Diewald5c375702015-02-09 20:58:24 +000058 fd = new FieldDocument();
Nils Diewaldf399a672013-11-18 17:55:22 +000059
Nils Diewald5c375702015-02-09 20:58:24 +000060 fd.addString("name", "Hans");
61 fd.addInt("zahl1", 14);
62 fd.addText("teaser", "Das Sein");
Akron8798be82016-06-23 23:10:25 +020063 fd.addInt("UID", 2);
Nils Diewaldf399a672013-11-18 17:55:22 +000064
Nils Diewald5c375702015-02-09 20:58:24 +000065 MultiTermTokenStream mtts = fd.newMultiTermTokenStream();
66 mtts.addMultiTermToken("s:wir#0-3", "l:wir", "p:PPER");
67 mtts.addMultiTermToken("s:sind#4-8", "l:sein", "p:VVFIN");
68 mtts.addMeta("sentences", (int) 5);
69 fd.addTV("base", "wir sind", mtts);
Nils Diewaldbb33da22015-03-04 16:24:25 +000070
Nils Diewald5c375702015-02-09 20:58:24 +000071 ki.addDoc(fd);
Nils Diewaldf399a672013-11-18 17:55:22 +000072
Nils Diewald5c375702015-02-09 20:58:24 +000073 /* Save documents */
74 ki.commit();
Nils Diewaldf399a672013-11-18 17:55:22 +000075
Nils Diewald5c375702015-02-09 20:58:24 +000076 assertEquals(2, ki.numberOf("base", "documents"));
77 assertEquals(7, ki.numberOf("base", "sentences"));
Nils Diewaldf399a672013-11-18 17:55:22 +000078
Nils Diewald5c375702015-02-09 20:58:24 +000079 fd = new FieldDocument();
Nils Diewaldf399a672013-11-18 17:55:22 +000080
Nils Diewald5c375702015-02-09 20:58:24 +000081 fd.addString("name", "Frank");
82 fd.addInt("zahl1", 59);
83 fd.addInt("zahl2", 65);
Akron8798be82016-06-23 23:10:25 +020084 fd.addInt("UID", 3);
Nils Diewald5c375702015-02-09 20:58:24 +000085 fd.addText("teaser", "Noch ein Versuch");
Nils Diewaldbb33da22015-03-04 16:24:25 +000086 fd.addTV("base", "ich bau", "[(0-3)s:der|l:der|p:DET|-:sentences$<i>3]"
87 + "[(4-8)s:baum|l:baum|p:NN]");
Nils Diewald5c375702015-02-09 20:58:24 +000088 ki.addDoc(fd);
Nils Diewaldf399a672013-11-18 17:55:22 +000089
Nils Diewald5c375702015-02-09 20:58:24 +000090 /* Save documents */
91 ki.commit();
Nils Diewaldf399a672013-11-18 17:55:22 +000092
Nils Diewald5c375702015-02-09 20:58:24 +000093 assertEquals(3, ki.numberOf("base", "documents"));
94 assertEquals(10, ki.numberOf("base", "sentences"));
Nils Diewaldf399a672013-11-18 17:55:22 +000095
Nils Diewald0339d462015-02-26 14:53:56 +000096 // KrillQuery kq = new KrillQuery("text");
Nils Diewald5c375702015-02-09 20:58:24 +000097 // ki.search();
Akron8798be82016-06-23 23:10:25 +020098
99 ki.getDoc("1");
Nils Diewaldf399a672013-11-18 17:55:22 +0000100 };
Nils Diewaldc82379b2014-10-02 14:58:18 +0000101
Nils Diewaldbb33da22015-03-04 16:24:25 +0000102
Nils Diewaldc82379b2014-10-02 14:58:18 +0000103 @Test
104 public void indexAlteration () throws IOException {
Nils Diewalda14ecd62015-02-26 21:00:20 +0000105 KrillIndex ki = new KrillIndex();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000106
Nils Diewald5c375702015-02-09 20:58:24 +0000107 assertEquals(0, ki.numberOf("base", "documents"));
Nils Diewaldc82379b2014-10-02 14:58:18 +0000108
Nils Diewald5c375702015-02-09 20:58:24 +0000109 FieldDocument fd = new FieldDocument();
110 fd.addString("name", "Peter");
111 ki.addDoc(fd);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000112
Nils Diewald5c375702015-02-09 20:58:24 +0000113 assertEquals(0, ki.numberOf("base", "documents"));
Nils Diewaldc82379b2014-10-02 14:58:18 +0000114
Nils Diewald5c375702015-02-09 20:58:24 +0000115 fd = new FieldDocument();
116 fd.addString("name", "Michael");
117 ki.addDoc(fd);
Nils Diewaldc82379b2014-10-02 14:58:18 +0000118
Nils Diewald5c375702015-02-09 20:58:24 +0000119 assertEquals(0, ki.numberOf("base", "documents"));
Nils Diewaldc82379b2014-10-02 14:58:18 +0000120
Nils Diewald5c375702015-02-09 20:58:24 +0000121 ki.commit();
Nils Diewaldc82379b2014-10-02 14:58:18 +0000122
Nils Diewald5c375702015-02-09 20:58:24 +0000123 assertEquals(2, ki.numberOf("base", "documents"));
Nils Diewaldc82379b2014-10-02 14:58:18 +0000124
Nils Diewald5c375702015-02-09 20:58:24 +0000125 // hasDeletions, hasPendingMerges
Nils Diewaldc82379b2014-10-02 14:58:18 +0000126 };
Akron75ee2b82016-06-20 21:20:34 +0200127
Akron08f4ceb2016-08-03 23:53:32 +0200128
Akron74748c62016-06-29 00:22:43 +0200129 /*
130 * This test demonstrates the behaviour
131 */
132 @Test
133 public void indexUnicode () throws IOException, QueryException {
134 KrillIndex ki = new KrillIndex();
135
136 FieldDocument fd = new FieldDocument();
137 fd.addString("name", "Peter");
138
139 // These values are canonically equivalent
140 // But indexed as byte sequences
Eliza Margaretha6f989202016-10-14 21:48:29 +0200141 fd.addTV("base",
142 new String("ju" + "\u006E" + "\u0303" + "o") + " "
143 + new String("ju" + "\u00F1" + "o"),
144 "[(0-5)s:ju" + "\u006E" + "\u0303" + "o|_0$<i>0<i>5|-:t$<i>2]"
145 + "[(6-10)s:ju" + "\u00F1" + "o|_1$<i>6<i>10]");
Akron74748c62016-06-29 00:22:43 +0200146 ki.addDoc(fd);
147 ki.commit();
148
149 assertEquals(1, ki.numberOf("base", "documents"));
150
151 QueryBuilder kq = new QueryBuilder("base");
152 Result kr = ki.search(kq.seg("s:ju" + "\u00F1" + "o").toQuery());
153 assertEquals(1, kr.getTotalResults());
154
155 kr = ki.search(kq.seg("s:ju" + "\u006E" + "\u0303" + "o").toQuery());
156 assertEquals(1, kr.getTotalResults());
157 };
158
Akron08f4ceb2016-08-03 23:53:32 +0200159
Akron75ee2b82016-06-20 21:20:34 +0200160 @Test
161 public void indexFieldInfo () throws IOException {
162 KrillIndex ki = new KrillIndex();
163
164 FieldDocument fd = new FieldDocument();
165 fd.setTitle("Peter");
166 fd.setUID(22);
167 ki.addDoc(fd);
Akron8798be82016-06-23 23:10:25 +0200168
169 fd = new FieldDocument();
170 fd.setTitle("Akron");
171 fd.setUID("05678");
172 ki.addDoc(fd);
173
Akron75ee2b82016-06-20 21:20:34 +0200174 ki.commit();
175
Akron8798be82016-06-23 23:10:25 +0200176 assertEquals(2, ki.numberOf("base", "documents"));
Akron75ee2b82016-06-20 21:20:34 +0200177
178 assertEquals("Peter", ki.getDoc("22").getTitle());
179 assertEquals(22, ki.getDoc("22").getUID());
Akron8798be82016-06-23 23:10:25 +0200180
181 assertEquals("Akron", ki.getDoc("5678").getTitle());
182 assertEquals(5678, ki.getDoc("5678").getUID());
183
184 assertEquals("Akron", ki.getDoc("05678").getTitle());
185 assertEquals(5678, ki.getDoc("05678").getUID());
Akron75ee2b82016-06-20 21:20:34 +0200186 };
Akronc74dee02018-02-07 18:48:30 +0100187
188
Akron70ce0c02018-05-25 23:44:26 +0200189 @Test
Akronc74dee02018-02-07 18:48:30 +0100190 public void indexRetrieveFieldInfo () throws IOException {
191 KrillIndex ki = new KrillIndex();
192
193 FieldDocument fd = new FieldDocument();
194
195 fd.addString("name", "Peter");
196 fd.addString("textSigle", "a/b/c");
197 fd.addInt("zahl1", 56);
198 fd.addStored("ref", "My reference");
199
200 fd.addKeyword("keyword", "baum");
201 fd.addKeyword("keyword", "wald");
202
203 fd.addText("title", "Der Name der Rose");
204
205 ki.addDoc(fd);
206
207 /* Save documents */
208 ki.commit();
209
210 JsonNode res = ki.getFields("a/b/c").toJsonNode();
211
212 // TODO: Check if the sorting is always identical!
213
Akron7cba5de2018-02-09 18:49:34 +0100214 Iterator fieldIter = res.at("/document/fields").elements();
Akronc74dee02018-02-07 18:48:30 +0100215
Akron7cba5de2018-02-09 18:49:34 +0100216 int checkC = 0;
217 while (fieldIter.hasNext()) {
218 JsonNode field = (JsonNode) fieldIter.next();
Akronc74dee02018-02-07 18:48:30 +0100219
Akron7cba5de2018-02-09 18:49:34 +0100220 String key = field.at("/key").asText();
Akronc74dee02018-02-07 18:48:30 +0100221
Akron7cba5de2018-02-09 18:49:34 +0100222 switch (key) {
223 case "ref":
224 assertEquals("type:store", field.at("/type").asText());
225 assertEquals("koral:field", field.at("/@type").asText());
226 assertEquals("My reference", field.at("/value").asText());
227 checkC++;
228 break;
Akronc74dee02018-02-07 18:48:30 +0100229
Akron7cba5de2018-02-09 18:49:34 +0100230 case "title":
231 assertEquals("type:text", field.at("/type").asText());
232 assertEquals("koral:field", field.at("/@type").asText());
233 assertEquals("Der Name der Rose", field.at("/value").asText());
234 checkC++;
235 break;
Akronc74dee02018-02-07 18:48:30 +0100236
Akron7cba5de2018-02-09 18:49:34 +0100237 case "textSigle":
238 assertEquals("type:string", field.at("/type").asText());
239 assertEquals("koral:field", field.at("/@type").asText());
240 assertEquals("a/b/c", field.at("/value").asText());
241 checkC++;
242 break;
243
244 case "keyword":
Akron04f00952018-03-06 18:56:54 +0100245 assertEquals("type:keywords", field.at("/type").asText());
Akron7cba5de2018-02-09 18:49:34 +0100246 assertEquals("koral:field", field.at("/@type").asText());
247 assertEquals("baum", field.at("/value/0").asText());
248 assertEquals("wald", field.at("/value/1").asText());
249 checkC++;
250 break;
251
252 case "zahl1":
253 assertEquals("type:number", field.at("/type").asText());
254 assertEquals("koral:field", field.at("/@type").asText());
255 assertEquals(56, field.at("/value").asInt());
256 checkC++;
257 break;
258
259 case "name":
260 assertEquals("type:string", field.at("/type").asText());
261 assertEquals("koral:field", field.at("/@type").asText());
262 assertEquals("Peter", field.at("/value").asText());
263 checkC++;
264 break;
265 };
266 };
267
268 assertEquals(6, checkC);
Akronc74dee02018-02-07 18:48:30 +0100269
Akron4497aba2018-02-08 19:03:09 +0100270
271 // Test with real document
272 ki.addDoc(getClass().getResourceAsStream("/wiki/wdd17-982-72848.json.gz"),true);
273
274 /* Save documents */
275 ki.commit();
276
Akron7cba5de2018-02-09 18:49:34 +0100277 res = ki.getFields("wdd17/982/72841").toJsonNode();
Akron4497aba2018-02-08 19:03:09 +0100278 assertEquals("Document not found", res.at("/errors/0/1").asText());
279
Akron7cba5de2018-02-09 18:49:34 +0100280 res = ki.getFields("WDD17/982/72848").toJsonNode();
Akron4497aba2018-02-08 19:03:09 +0100281
Akron7cba5de2018-02-09 18:49:34 +0100282 fieldIter = res.at("/document/fields").elements();
Akron4497aba2018-02-08 19:03:09 +0100283
Akron7cba5de2018-02-09 18:49:34 +0100284 checkC = 0;
285 while (fieldIter.hasNext()) {
286 JsonNode field = (JsonNode) fieldIter.next();
Akron4497aba2018-02-08 19:03:09 +0100287
Akron7cba5de2018-02-09 18:49:34 +0100288 String key = field.at("/key").asText();
289
290 switch (key) {
291 case "pubDate":
292
Akron0c8a5c52018-03-07 20:00:36 +0100293 assertEquals("type:date", field.at("/type").asText());
294 assertEquals("2017-07-01", field.at("/value").asText());
Akron7cba5de2018-02-09 18:49:34 +0100295 break;
296
297 case "textSigle":
298
299 assertEquals("type:string", field.at("/type").asText());
300 assertEquals("WDD17/982/72848", field.at("/value").asText());
301 break;
302
303 case "foundries":
Akron04f00952018-03-06 18:56:54 +0100304 assertEquals("type:keywords", field.at("/type").asText());
305 assertEquals("dereko", field.at("/value/0").asText());
306 assertEquals("dereko/structure", field.at("/value/1").asText());
307 assertEquals("dereko/structure/base-sentences-paragraphs-pagebreaks", field.at("/value/2").asText());
Akron7cba5de2018-02-09 18:49:34 +0100308 break;
309 };
310 };
Akronc74dee02018-02-07 18:48:30 +0100311 };
Nils Diewaldf399a672013-11-18 17:55:22 +0000312};