blob: d3b2a0a4a9084da66a7086ceaed2a551a136bb53 [file] [log] [blame]
Eliza Margaretha6a780692014-01-15 09:45:42 +00001package de.ids_mannheim.korap.index;
2
margaretha4f995582015-12-14 14:14:34 +01003import static org.junit.Assert.assertEquals;
Akron1a8bb762019-01-18 15:48:59 +01004import static org.junit.Assert.assertTrue;
Akronbaeaf0e2019-06-19 15:04:41 +02005import static org.junit.Assert.assertFalse;
margaretha4f995582015-12-14 14:14:34 +01006import static org.junit.Assert.fail;
Nils Diewaldf399a672013-11-18 17:55:22 +00007
Akronfafde022018-12-14 14:17:05 +01008import java.util.*;
margaretha4f995582015-12-14 14:14:34 +01009import java.io.BufferedReader;
10import java.io.FileReader;
11import java.io.IOException;
Eliza Margaretha805e27f2016-10-14 21:39:42 +020012import java.net.URLDecoder;
Nils Diewaldf399a672013-11-18 17:55:22 +000013
margaretha4f995582015-12-14 14:14:34 +010014import org.apache.lucene.search.spans.SpanQuery;
Nils Diewaldf399a672013-11-18 17:55:22 +000015import org.junit.Test;
Akron1a8bb762019-01-18 15:48:59 +010016import org.junit.Ignore;
Nils Diewaldf399a672013-11-18 17:55:22 +000017import org.junit.runner.RunWith;
18import org.junit.runners.JUnit4;
19
Akronfafde022018-12-14 14:17:05 +010020import com.fasterxml.jackson.databind.JsonNode;
21
22import static de.ids_mannheim.korap.TestSimple.*;
Nils Diewaldbbd39a52015-02-23 19:56:57 +000023import de.ids_mannheim.korap.Krill;
margaretha4f995582015-12-14 14:14:34 +010024import de.ids_mannheim.korap.KrillIndex;
Nils Diewaldf5ab4b22015-02-25 20:55:16 +000025import de.ids_mannheim.korap.KrillMeta;
margaretha4f995582015-12-14 14:14:34 +010026import de.ids_mannheim.korap.KrillQuery;
Nils Diewald8904c1d2015-02-26 16:13:18 +000027import de.ids_mannheim.korap.query.QueryBuilder;
Nils Diewald92729ce2014-10-06 16:00:17 +000028import de.ids_mannheim.korap.query.wrap.SpanQueryWrapper;
margaretha4f995582015-12-14 14:14:34 +010029import de.ids_mannheim.korap.response.Match;
Akron685ec962019-02-25 19:04:46 +010030import de.ids_mannheim.korap.response.MetaFields;
margaretha4f995582015-12-14 14:14:34 +010031import de.ids_mannheim.korap.response.Result;
Nils Diewaldda1722b2014-02-17 00:12:05 +000032import de.ids_mannheim.korap.util.QueryException;
33
Akron4376e742019-01-16 15:02:30 +010034import org.apache.lucene.document.Document;
35
36
Nils Diewaldf399a672013-11-18 17:55:22 +000037@RunWith(JUnit4.class)
38public class TestFieldDocument {
39
40 @Test
41 public void indexExample1 () throws IOException {
Nils Diewaldbb33da22015-03-04 16:24:25 +000042 FieldDocument fd = new FieldDocument();
Nils Diewaldf399a672013-11-18 17:55:22 +000043
Nils Diewaldbb33da22015-03-04 16:24:25 +000044 fd.addString("corpusID", "WPD");
45 fd.addString("ID", "WPD-AAA-00001");
46 fd.addText("textClass", "music entertainment");
47 fd.addText("author", "Peter Frankenfeld");
Akronc7a2abc2019-01-17 14:21:34 +010048 fd.addDate("pubDate", 20130617);
49 fd.addInt("justanumber", 12345678);
Nils Diewaldbb33da22015-03-04 16:24:25 +000050 fd.addText("title", "Wikipedia");
51 fd.addText("subTitle", "Die freie Enzyklopädie");
52 fd.addStored("layerInfo", "opennlp/p=pos");
53 fd.addString("pubPlace", "Bochum");
Akronc7a2abc2019-01-17 14:21:34 +010054 fd.addDate("lastModified", 20130717);
margaretha4f995582015-12-14 14:14:34 +010055 fd.addTV("tokens", "abc", "[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>10]"
56 + "[(1-2)s:b|i:b|_1$<i>1<i>2]" + "[(2-3)s:c|i:c|_2$<i>2<i>3]");
Akron8bb3bc32018-12-12 19:34:56 +010057 fd.addAttachement("Wikilink", "data:application/x.korap-link,https://de.wikipedia.org/wiki/Beispiel");
Nils Diewaldf399a672013-11-18 17:55:22 +000058
Akron4376e742019-01-16 15:02:30 +010059 Document doc = fd.compile();
60
61 assertEquals(doc.getField("title").name(), "title");
62 assertEquals(doc.getField("title").stringValue(), "Wikipedia");
Akronc7a2abc2019-01-17 14:21:34 +010063
Akron4376e742019-01-16 15:02:30 +010064 assertEquals(doc.getField("corpusID").name(), "corpusID");
65 assertEquals(doc.getField("corpusID").stringValue(), "WPD");
Nils Diewaldf399a672013-11-18 17:55:22 +000066
Akron4376e742019-01-16 15:02:30 +010067 assertEquals(doc.getField("ID").name(), "ID");
68 assertEquals(doc.getField("ID").stringValue(), "WPD-AAA-00001");
Nils Diewaldf399a672013-11-18 17:55:22 +000069
Akron4376e742019-01-16 15:02:30 +010070 assertEquals(doc.getField("subTitle").name(), "subTitle");
71 assertEquals(doc.getField("subTitle").stringValue(),
Nils Diewaldbb33da22015-03-04 16:24:25 +000072 "Die freie Enzyklopädie");
Nils Diewaldf399a672013-11-18 17:55:22 +000073
Akron4376e742019-01-16 15:02:30 +010074 assertEquals(doc.getField("pubPlace").name(), "pubPlace");
75 assertEquals(doc.getField("pubPlace").stringValue(), "Bochum");
Nils Diewaldf399a672013-11-18 17:55:22 +000076
Akron4376e742019-01-16 15:02:30 +010077 assertEquals(doc.getField("lastModified").name(), "lastModified");
78 assertEquals(doc.getField("lastModified").stringValue(), "20130717");
Nils Diewaldf399a672013-11-18 17:55:22 +000079
Akron4376e742019-01-16 15:02:30 +010080 assertEquals(doc.getField("tokens").name(), "tokens");
81 assertEquals(doc.getField("tokens").stringValue(), "abc");
Nils Diewaldf399a672013-11-18 17:55:22 +000082
Akron4376e742019-01-16 15:02:30 +010083 assertEquals(doc.getField("author").name(), "author");
84 assertEquals(doc.getField("author").stringValue(),
Nils Diewaldbb33da22015-03-04 16:24:25 +000085 "Peter Frankenfeld");
Nils Diewaldf399a672013-11-18 17:55:22 +000086
Akron4376e742019-01-16 15:02:30 +010087 assertEquals(doc.getField("layerInfo").name(), "layerInfo");
88 assertEquals(doc.getField("layerInfo").stringValue(),
Nils Diewaldbb33da22015-03-04 16:24:25 +000089 "opennlp/p=pos");
Nils Diewaldd4401ec2014-06-16 17:04:02 +000090
Akron4376e742019-01-16 15:02:30 +010091 assertEquals(doc.getField("textClass").name(), "textClass");
92 assertEquals(doc.getField("textClass").stringValue(),
Nils Diewaldbb33da22015-03-04 16:24:25 +000093 "music entertainment");
Akron4376e742019-01-16 15:02:30 +010094 assertEquals(doc.getField("Wikilink").name(), "Wikilink");
95 assertEquals(doc.getField("Wikilink").stringValue(),
Akron8bb3bc32018-12-12 19:34:56 +010096 "data:application/x.korap-link,https://de.wikipedia.org/wiki/Beispiel"
97 );
Akronc7a2abc2019-01-17 14:21:34 +010098
99 assertEquals(doc.getField("justanumber").numericValue().intValue(), 12345678);
100
Nils Diewaldf399a672013-11-18 17:55:22 +0000101 };
102
Nils Diewaldbb33da22015-03-04 16:24:25 +0000103
Nils Diewaldf399a672013-11-18 17:55:22 +0000104 @Test
Nils Diewaldbe5943e2014-10-21 19:35:34 +0000105 public void indexExample2 () throws Exception {
Nils Diewaldf399a672013-11-18 17:55:22 +0000106
Eliza Margaretha6f989202016-10-14 21:48:29 +0200107 String json = new String("{" + " \"fields\" : [" + " { "
108 + " \"primaryData\" : \"abc\"" + " }," + " {"
109 + " \"name\" : \"tokens\"," + " \"data\" : ["
110 + " [ \"s:a\", \"i:a\", \"_0$<i>0<i>1\", \"-:t$<i>3\"],"
111 + " [ \"s:b\", \"i:b\", \"_1$<i>1<i>2\" ],"
112 + " [ \"s:c\", \"i:c\", \"_2$<i>2<i>3\" ]" + " ]"
113 + " }" + " ]," + " \"corpusID\" : \"WPD\","
114 + " \"ID\" : \"WPD-AAA-00001\","
115 + " \"textClass\" : \"music entertainment\","
116 + " \"author\" : \"Peter Frankenfeld\","
117 + " \"pubDate\" : 20130617,"
118 + " \"title\" : \"Wikipedia\","
119 + " \"subTitle\" : \"Die freie Enzyklopädie\","
120 + " \"pubPlace\" : \"Bochum\"" + "}");
Nils Diewaldf399a672013-11-18 17:55:22 +0000121
Nils Diewaldbb33da22015-03-04 16:24:25 +0000122 KrillIndex ki = new KrillIndex();
123 FieldDocument fd = ki.addDoc(json);
Nils Diewaldf399a672013-11-18 17:55:22 +0000124
Nils Diewaldbb33da22015-03-04 16:24:25 +0000125 ki.commit();
Nils Diewaldf399a672013-11-18 17:55:22 +0000126
Nils Diewaldbb33da22015-03-04 16:24:25 +0000127 assertEquals(fd.getPrimaryData(), "abc");
128 assertEquals(fd.getCorpusID(), "WPD");
129 assertEquals(fd.getID(), "WPD-AAA-00001");
Akron32b95192019-01-11 13:58:55 +0100130 assertEquals(fd.getFieldValue("textClass"), "music entertainment");
131 assertEquals(fd.getFieldValue("author"), "Peter Frankenfeld");
132 assertEquals(fd.getFieldValue("title"), "Wikipedia");
133 assertEquals(fd.getFieldValue("subTitle"), "Die freie Enzyklopädie");
134 assertEquals(fd.getFieldValue("pubPlace"), "Bochum");
135 assertEquals(fd.getFieldValueAsDate("pubDate").toDisplay(), "2013-06-17");
Nils Diewaldf399a672013-11-18 17:55:22 +0000136
Nils Diewaldbb33da22015-03-04 16:24:25 +0000137 QueryBuilder kq = new QueryBuilder("tokens");
Eliza Margaretha6f989202016-10-14 21:48:29 +0200138 Result kr = ki
Akron4f52a632018-02-09 19:02:40 +0100139 .search((SpanQuery) kq.seq(kq.nr(3, kq.seg("s:b"))).toQuery());
Nils Diewald12f00d42013-12-12 18:47:59 +0000140
Nils Diewaldbb33da22015-03-04 16:24:25 +0000141 Match km = kr.getMatch(0);
Nils Diewaldf399a672013-11-18 17:55:22 +0000142
Nils Diewaldbb33da22015-03-04 16:24:25 +0000143 assertEquals(km.getPrimaryData(), "abc");
144 assertEquals(km.getCorpusID(), "WPD");
145 assertEquals(km.getDocID(), "WPD-AAA-00001");
Akron32b95192019-01-11 13:58:55 +0100146 assertEquals(km.getFieldValue("textClass"), "music entertainment");
147 assertEquals(km.getFieldValue("author"), "Peter Frankenfeld");
148 assertEquals(km.getFieldValue("title"), "Wikipedia");
149 assertEquals(km.getFieldValue("subTitle"), "Die freie Enzyklopädie");
150 assertEquals(km.getFieldValue("pubPlace"), "Bochum");
151 assertEquals(km.getFieldValueAsDate("pubDate").toDisplay(), "2013-06-17");
Nils Diewaldf399a672013-11-18 17:55:22 +0000152
Akronf05fde62016-08-03 23:46:17 +0200153 assertEquals(km.getSnippetBrackets(), "a[[{3:b}]]c");
Nils Diewaldf399a672013-11-18 17:55:22 +0000154 };
155
Nils Diewald6802acd2014-03-18 18:29:30 +0000156
Nils Diewaldf399a672013-11-18 17:55:22 +0000157 @Test
158 public void indexExample3 () throws IOException {
159
Nils Diewaldbb33da22015-03-04 16:24:25 +0000160 // Construct index
161 KrillIndex ki = new KrillIndex();
Nils Diewaldf399a672013-11-18 17:55:22 +0000162
Nils Diewaldbb33da22015-03-04 16:24:25 +0000163 // Indexing test files
164 for (String i : new String[] { "00001", "00002", "00003", "00004",
165 "00005", "00006", "02439" }) {
166 FieldDocument fd = ki.addDoc(
167 getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
168 true);
169 };
170 ki.commit();
Nils Diewaldf399a672013-11-18 17:55:22 +0000171
Nils Diewaldbb33da22015-03-04 16:24:25 +0000172 QueryBuilder kq = new QueryBuilder("tokens");
Nils Diewald6802acd2014-03-18 18:29:30 +0000173
Nils Diewaldbb33da22015-03-04 16:24:25 +0000174 Krill ks;
175 Result kr;
Nils Diewald6802acd2014-03-18 18:29:30 +0000176
Nils Diewaldbb33da22015-03-04 16:24:25 +0000177 // Start creating query
178 // within(<s>, {1: {2: [mate/p=ADJA & mate/m=number:sg]}[opennlp/p=NN & tt/p=NN]})
Nils Diewald6802acd2014-03-18 18:29:30 +0000179
Akron567b6fe2018-12-06 15:06:58 +0100180 ks = new Krill(kq.contains(kq.tag("base/s:s"), kq.nr(1,
Eliza Margaretha6f989202016-10-14 21:48:29 +0200181 kq.seq(kq.seg("mate/p:ADJA")).append(kq.seg("opennlp/p:NN")))));
Nils Diewaldf399a672013-11-18 17:55:22 +0000182
Nils Diewaldbb33da22015-03-04 16:24:25 +0000183 KrillMeta meta = ks.getMeta();
184 meta.setCount(1);
185 meta.setCutOff(true);
Nils Diewald6802acd2014-03-18 18:29:30 +0000186
Nils Diewaldbb33da22015-03-04 16:24:25 +0000187 meta.getContext().left.setCharacter(true).setLength(6);
188 meta.getContext().right.setToken(true).setLength(6);
189
190 assertEquals(
Akronf05fde62016-08-03 23:46:17 +0200191 "... okal. [[Der Buchstabe A hat in {1:deutschen Texten} eine durchschnittliche Häufigkeit von 6,51 %.]] Er ist damit der sechsthäufigste Buchstabe ...",
Nils Diewaldbb33da22015-03-04 16:24:25 +0000192 ks.apply(ki).getMatch(0).getSnippetBrackets());
Akronfbc76162019-06-04 15:51:09 +0200193
194
195 // Do not retrieve snippets
196 meta.setSnippets(false);
197
198 Match km = ks.apply(ki).getMatch(0);
199
200 assertEquals("Ruru,Jens.Ol,Aglarech", km.toJsonNode().get("author").asText());
201 assertTrue(!km.toJsonNode().has("snippet"));
202 assertEquals("", km.getPrimaryData());
Akronbaeaf0e2019-06-19 15:04:41 +0200203 assertFalse(km.toJsonNode().has("startMore"));
204 assertFalse(km.toJsonNode().has("endMore"));
205 assertFalse(km.toJsonNode().has("endCutted"));
206 assertFalse(km.toJsonNode().has("snippet"));
Nils Diewaldda1722b2014-02-17 00:12:05 +0000207 };
Nils Diewaldd0481e62014-02-15 23:55:10 +0000208
Nils Diewaldbb33da22015-03-04 16:24:25 +0000209
Nils Diewaldda1722b2014-02-17 00:12:05 +0000210 @Test
Nils Diewaldbe5943e2014-10-21 19:35:34 +0000211 public void queryJSONBsp18 () throws Exception {
Nils Diewaldd0481e62014-02-15 23:55:10 +0000212
Nils Diewaldbb33da22015-03-04 16:24:25 +0000213 // Construct index
214 KrillIndex ki = new KrillIndex();
Nils Diewaldda1722b2014-02-17 00:12:05 +0000215
Nils Diewaldbb33da22015-03-04 16:24:25 +0000216 // Indexing test files
217 for (String i : new String[] { "00001", "00002", "00003", "00004",
218 "00005", "00006", "02439" }) {
219 FieldDocument fd = ki.addDoc(
220 getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
221 true);
Akron91c60112015-09-24 22:05:40 +0200222
Nils Diewaldbb33da22015-03-04 16:24:25 +0000223 };
224 ki.commit();
Nils Diewaldda1722b2014-02-17 00:12:05 +0000225
Eliza Margaretha6f989202016-10-14 21:48:29 +0200226 String jsonPath = URLDecoder.decode(
227 getClass().getResource("/queries/bsp18.jsonld").getFile(),
228 "UTF-8");
229
Akron8798be82016-06-23 23:10:25 +0200230 // {1:der} \w0:5 nicht
Akronfafde022018-12-14 14:17:05 +0100231 SpanQueryWrapper sqwi = getJsonQuery(jsonPath);
Nils Diewaldda1722b2014-02-17 00:12:05 +0000232
Nils Diewaldbb33da22015-03-04 16:24:25 +0000233 Result kr = ki.search(sqwi.toQuery(), 0, (short) 5, true, (short) 2,
234 false, (short) 5);
Nils Diewaldda1722b2014-02-17 00:12:05 +0000235
Akron8798be82016-06-23 23:10:25 +0200236 assertEquals(1, kr.getTotalResults());
Akron08f4ceb2016-08-03 23:53:32 +0200237 assertEquals(
238 "... bezeichnen, sofern [[{1:der} schwedische Buchstabe „Å“ nicht]] verfügbar ist im SI-Einheitensystem ist ...",
239 kr.getMatch(0).getSnippetBrackets());
Nils Diewaldda1722b2014-02-17 00:12:05 +0000240 };
241
Nils Diewaldbb33da22015-03-04 16:24:25 +0000242
Akrona9d4c422017-04-28 21:18:59 +0200243 @Test
244 public void indexNoValidDate () throws Exception {
245
246 String json = new String("{" + " \"fields\" : [" + " { "
247 + " \"primaryData\" : \"abc\"" + " }," + " {"
248 + " \"name\" : \"tokens\"," + " \"data\" : ["
249 + " [ \"s:a\", \"i:a\", \"_0$<i>0<i>1\", \"-:t$<i>3\"],"
250 + " [ \"s:b\", \"i:b\", \"_1$<i>1<i>2\" ],"
251 + " [ \"s:c\", \"i:c\", \"_2$<i>2<i>3\" ]" + " ]"
252 + " }" + " ]," + " \"corpusID\" : \"WPD\","
253 + " \"ID\" : \"WPD-AAA-00001\","
254 + " \"textClass\" : \"music entertainment\","
255 + " \"author\" : \"Peter Frankenfeld\","
256 + " \"pubDate\" : \"00000000\","
257 + " \"title\" : \"Wikipedia\","
258 + " \"subTitle\" : \"Die freie Enzyklopädie\","
259 + " \"pubPlace\" : \"Bochum\"" + "}");
260
261 KrillIndex ki = new KrillIndex();
262 FieldDocument fd = ki.addDoc(json);
263
264 ki.commit();
265
266 assertEquals(fd.getPrimaryData(), "abc");
267 assertEquals(fd.getCorpusID(), "WPD");
268 assertEquals(fd.getID(), "WPD-AAA-00001");
Akron32b95192019-01-11 13:58:55 +0100269 assertEquals(fd.getFieldValue("textClass"), "music entertainment");
270 assertEquals(fd.getFieldValue("author"), "Peter Frankenfeld");
271 assertEquals(fd.getFieldValue("title"), "Wikipedia");
272 assertEquals(fd.getFieldValue("subTitle"), "Die freie Enzyklopädie");
273 assertEquals(fd.getFieldValue("pubPlace"), "Bochum");
274 assertEquals(fd.getFieldValueAsDate("pubDate").toDisplay(), "");
Akrona9d4c422017-04-28 21:18:59 +0200275 };
Akron798e6a22018-06-18 15:29:35 +0200276
Akronfafde022018-12-14 14:17:05 +0100277 @Test
278 public void indexNewMetaData () throws Exception {
279
280 String json = new String(
281 "{"
Akron510ba0b2019-02-06 19:07:17 +0100282 + " \"data\" : {"
283 + " \"text\" : \"abc\","
284 + " \"name\" : \"tokens\","
285 + " \"stream\" : ["
286 + " [ \"s:a\", \"i:a\", \"_0$<i>0<i>1\", \"-:t$<i>3\"],"
287 + " [ \"s:b\", \"i:b\", \"_1$<i>1<i>2\" ],"
288 + " [ \"s:c\", \"i:c\", \"_2$<i>2<i>3\" ]"
289 + " ]"
290 + " },"
Akronfafde022018-12-14 14:17:05 +0100291 + " \"fields\" : ["
Akronfafde022018-12-14 14:17:05 +0100292 + " {"
293 + " \"@type\" : \"koral:field\","
294 + " \"type\" : \"type:string\","
295 + " \"key\" : \"corpusID\","
296 + " \"value\" : \"WPD\""
297 + " },"
298 + " {"
299 + " \"@type\" : \"koral:field\","
300 + " \"type\" : \"type:string\","
301 + " \"key\" : \"textSigle\","
302 + " \"value\" : \"x/y/z\""
303 + " },"
304 + " {"
305 + " \"@type\" : \"koral:field\","
306 + " \"type\" : \"type:string\","
307 + " \"key\" : \"ID\","
308 + " \"value\" : \"WPD-AAA-00001\""
309 + " },"
310 + " {"
311 + " \"@type\" : \"koral:field\","
312 + " \"type\" : \"type:string\","
313 + " \"key\" : \"textClass\","
314 + " \"value\" : [\"music\",\"entertainment\"]"
315 + " },"
316 + " {"
317 + " \"@type\" : \"koral:field\","
318 + " \"type\" : \"type:text\","
319 + " \"key\" : \"author\","
320 + " \"value\" : \"Peter Frankenfeld\""
321 + " },"
322 + " {"
323 + " \"@type\" : \"koral:field\","
324 + " \"type\" : \"type:date\","
325 + " \"key\" : \"pubDate\","
326 + " \"value\" : \"2015-05-01\""
327 + " },"
328 + " {"
329 + " \"@type\" : \"koral:field\","
330 + " \"type\" : \"type:text\","
331 + " \"key\" : \"title\","
332 + " \"value\" : \"Wikipedia\""
333 + " },"
334 + " {"
335 + " \"@type\" : \"koral:field\","
336 + " \"type\" : \"type:text\","
337 + " \"key\" : \"subTitle\","
338 + " \"value\" : \"Die freie Enzyklopädie\""
339 + " },"
340 + " {"
341 + " \"@type\" : \"koral:field\","
342 + " \"type\" : \"type:string\","
343 + " \"key\" : \"pubPlace\","
344 + " \"value\" : \"Bochum\""
345 + " },"
346 + " {"
347 + " \"@type\" : \"koral:field\","
348 + " \"type\" : \"type:attachement\","
349 + " \"key\" : \"link\","
350 + " \"value\" : \"data:application/x.korap-link,https://de.wikipedia.org/wiki/Beispiel\""
351 + " }"
352 + " ]"
353 + "}");
354
355 KrillIndex ki = new KrillIndex();
356 FieldDocument fd = ki.addDoc(json);
357
358 ki.commit();
359
360 assertEquals(fd.getPrimaryData(), "abc");
Akrona6dabb72019-01-09 13:09:41 +0100361 // assertEquals(fd.doc.getField("corpusID").stringValue(), "WPD");
Akronfafde022018-12-14 14:17:05 +0100362 assertEquals(fd.doc.getField("textSigle").stringValue(), "x/y/z");
363 assertEquals(fd.doc.getField("ID").stringValue(), "WPD-AAA-00001");
364 assertEquals(fd.doc.getField("textClass").stringValue(), "music entertainment");
365 assertEquals(fd.doc.getField("author").stringValue(), "Peter Frankenfeld");
366 assertEquals(fd.doc.getField("title").stringValue(), "Wikipedia");
367 assertEquals(fd.doc.getField("subTitle").stringValue(), "Die freie Enzyklopädie");
368 assertEquals(fd.doc.getField("pubPlace").stringValue(), "Bochum");
369 assertEquals(fd.doc.getField("pubDate").stringValue(), "20150501");
370 assertEquals(fd.doc.getField("link").stringValue(), "data:application/x.korap-link,https://de.wikipedia.org/wiki/Beispiel");
371
372 JsonNode res = ki.getFields("x/y/z").toJsonNode();
373
374 Iterator fieldIter = res.at("/document/fields").elements();
375
376 int checkC = 0;
377 while (fieldIter.hasNext()) {
378 JsonNode field = (JsonNode) fieldIter.next();
379
380 String key = field.at("/key").asText();
381
382 switch (key) {
383 case "corpusID":
384 assertEquals("type:string", field.at("/type").asText());
385 assertEquals("koral:field", field.at("/@type").asText());
386 assertEquals("WPD", field.at("/value").asText());
387 checkC++;
388 break;
389
390 case "textSigle":
391 assertEquals("type:string", field.at("/type").asText());
392 assertEquals("koral:field", field.at("/@type").asText());
393 assertEquals("x/y/z", field.at("/value").asText());
394 checkC++;
395 break;
396
397 case "ID":
398 assertEquals("type:string", field.at("/type").asText());
399 assertEquals("koral:field", field.at("/@type").asText());
400 assertEquals("WPD-AAA-00001", field.at("/value").asText());
401 checkC++;
402 break;
403
404 case "textClass":
405 assertEquals("type:keywords", field.at("/type").asText());
406 assertEquals("koral:field", field.at("/@type").asText());
407 assertEquals("music", field.at("/value/0").asText());
408 assertEquals("entertainment", field.at("/value/1").asText());
409 checkC++;
410 break;
411
412 case "author":
413 assertEquals("type:text", field.at("/type").asText());
414 assertEquals("koral:field", field.at("/@type").asText());
415 assertEquals("Peter Frankenfeld", field.at("/value").asText());
416 checkC++;
417 break;
418
419 case "title":
420 assertEquals("type:text", field.at("/type").asText());
421 assertEquals("koral:field", field.at("/@type").asText());
422 assertEquals("Wikipedia", field.at("/value").asText());
423 checkC++;
424 break;
425
426 case "subTitle":
427 assertEquals("type:text", field.at("/type").asText());
428 assertEquals("koral:field", field.at("/@type").asText());
429 assertEquals("Die freie Enzyklopädie", field.at("/value").asText());
430 checkC++;
431 break;
432
433 case "pubPlace":
434 assertEquals("type:string", field.at("/type").asText());
435 assertEquals("koral:field", field.at("/@type").asText());
436 assertEquals("Bochum", field.at("/value").asText());
437 checkC++;
438 break;
439
440 case "pubDate":
441 assertEquals("type:date", field.at("/type").asText());
442 assertEquals("koral:field", field.at("/@type").asText());
443 assertEquals("2015-05-01", field.at("/value").asText());
444 checkC++;
445 break;
446
447 case "link":
448 assertEquals("type:attachement", field.at("/type").asText());
449 assertEquals("koral:field", field.at("/@type").asText());
450 assertEquals("data:application/x.korap-link,https://de.wikipedia.org/wiki/Beispiel", field.at("/value").asText());
451 checkC++;
452 break;
Akron1a975d12019-02-05 13:13:06 +0100453
454 default:
455 fail("Unknown field: " + key);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000456 };
Nils Diewald8904c1d2015-02-26 16:13:18 +0000457 };
Nils Diewaldf399a672013-11-18 17:55:22 +0000458 };
Akronc7a2abc2019-01-17 14:21:34 +0100459
Akron1a8bb762019-01-18 15:48:59 +0100460
Akronc7a2abc2019-01-17 14:21:34 +0100461 @Test
462 public void indexArbitraryMetaData () throws Exception {
Akron1a8bb762019-01-18 15:48:59 +0100463 String json = createDocString1();
Akronc7a2abc2019-01-17 14:21:34 +0100464
465 KrillIndex ki = new KrillIndex();
466 FieldDocument fd = ki.addDoc(json);
467
468 ki.commit();
469
470 assertEquals(fd.getPrimaryData(), "abc");
471 assertEquals(fd.doc.getField("alter").stringValue(), "40.0");
472 assertEquals(fd.doc.getField("name").stringValue(), "Frank");
473 assertEquals(fd.doc.getField("schluesselwoerter").stringValue(), "musik unterhaltung");
474 assertEquals(fd.doc.getField("tags").stringValue(), "nachrichten feuilleton sport raetsel");
475 assertEquals(fd.doc.getField("titel").stringValue(), "Der alte Baum");
476 assertEquals(fd.doc.getField("anhang").stringValue(), "data:application/x.korap-link,http://spiegel.de/");
477 assertEquals(fd.doc.getField("referenz").stringValue(), "So war das");
478 assertEquals(fd.doc.getField("datum").stringValue(), "20180403");
479
480 JsonNode res = ki.getFields("aa/bb/cc").toJsonNode();
481
482 Iterator fieldIter = res.at("/document/fields").elements();
483
484 int checkC = 0;
485 while (fieldIter.hasNext()) {
486 JsonNode field = (JsonNode) fieldIter.next();
487
488 String key = field.at("/key").asText();
489
490 switch (key) {
491 case "textSigle":
492 assertEquals("type:string", field.at("/type").asText());
493 assertEquals("koral:field", field.at("/@type").asText());
494 assertEquals("aa/bb/cc", field.at("/value").asText());
495 checkC++;
496 break;
497
498 case "alter":
499 assertEquals("type:integer", field.at("/type").asText());
500 assertEquals("koral:field", field.at("/@type").asText());
501 assertEquals(40, field.at("/value").asInt());
502 checkC++;
503 break;
504
505 case "name":
506 assertEquals("type:string", field.at("/type").asText());
507 assertEquals("koral:field", field.at("/@type").asText());
508 assertEquals("Frank", field.at("/value").asText());
509 checkC++;
510 break;
511
512 case "schluesselwoerter":
513 assertEquals("type:keywords", field.at("/type").asText());
514 assertEquals("koral:field", field.at("/@type").asText());
515 assertEquals("musik", field.at("/value/0").asText());
516 assertEquals("unterhaltung", field.at("/value/1").asText());
517 checkC++;
518 break;
519
520 case "tags":
521 assertEquals("type:keywords", field.at("/type").asText());
522 assertEquals("koral:field", field.at("/@type").asText());
523 assertEquals("nachrichten", field.at("/value/0").asText());
524 assertEquals("feuilleton", field.at("/value/1").asText());
525 assertEquals("sport", field.at("/value/2").asText());
526 assertEquals("raetsel", field.at("/value/3").asText());
527 checkC++;
528 break;
529
530 case "titel":
531 assertEquals("type:text", field.at("/type").asText());
532 assertEquals("koral:field", field.at("/@type").asText());
533 assertEquals("Der alte Baum", field.at("/value").asText());
534 checkC++;
535 break;
536
537 case "anhang":
538 assertEquals("type:attachement", field.at("/type").asText());
539 assertEquals("koral:field", field.at("/@type").asText());
540 assertEquals("data:application/x.korap-link,http://spiegel.de/", field.at("/value").asText());
541 checkC++;
542 break;
543
544 case "referenz":
545 assertEquals("type:store", field.at("/type").asText());
546 assertEquals("koral:field", field.at("/@type").asText());
547 assertEquals("So war das", field.at("/value").asText());
548 checkC++;
549 break;
550
551 case "datum":
552 assertEquals("type:date", field.at("/type").asText());
553 assertEquals("koral:field", field.at("/@type").asText());
554 assertEquals("2018-04-03", field.at("/value").asText());
555 checkC++;
556 break;
Akron1a975d12019-02-05 13:13:06 +0100557
558 default:
559 fail("Unknown field: " + key);
Akronc7a2abc2019-01-17 14:21:34 +0100560 };
561 };
562 };
Akron1a8bb762019-01-18 15:48:59 +0100563
564 @Test
565 public void indexArbitraryMetaDataPartial () throws Exception {
566 String json = createDocString1();
567
568 KrillIndex ki = new KrillIndex();
569 FieldDocument fd = ki.addDoc(json);
570
571 ki.commit();
572
573 ArrayList hs = new ArrayList<String>();
574 hs.add("datum");
575 hs.add("titel");
576 JsonNode res = ki.getFields("aa/bb/cc", hs).toJsonNode();
577 assertEquals("type:date", res.at("/document/fields/0/type").asText());
578 assertEquals("datum", res.at("/document/fields/0/key").asText());
579 assertEquals("2018-04-03", res.at("/document/fields/0/value").asText());
580 assertEquals("type:text", res.at("/document/fields/1/type").asText());
581 assertEquals("titel", res.at("/document/fields/1/key").asText());
582 assertEquals("Der alte Baum", res.at("/document/fields/1/value").asText());
583 assertTrue(res.at("/document/fields/2").isMissingNode());
584 };
585
586 @Test
587 public void indexArbitraryMetaDataSorted () throws Exception {
588 String json = createDocString1();
589
590 KrillIndex ki = new KrillIndex();
591 FieldDocument fd = ki.addDoc(json);
592
593 ki.commit();
594
595 ArrayList hs = new ArrayList<String>();
596 hs.add("titel");
597 hs.add("datum");
598 JsonNode res = ki.getFields("aa/bb/cc", hs).toJsonNode();
599 assertEquals("type:text", res.at("/document/fields/0/type").asText());
600 assertEquals("titel", res.at("/document/fields/0/key").asText());
601 assertEquals("Der alte Baum", res.at("/document/fields/0/value").asText());
602 assertEquals("type:date", res.at("/document/fields/1/type").asText());
603 assertEquals("datum", res.at("/document/fields/1/key").asText());
604 assertEquals("2018-04-03", res.at("/document/fields/1/value").asText());
605 assertTrue(res.at("/document/fields/2").isMissingNode());
606 };
607
608 @Test
609 public void indexArbitraryMetaDataEmpty () throws Exception {
610 String json = createDocString1();
611
612 KrillIndex ki = new KrillIndex();
613 FieldDocument fd = ki.addDoc(json);
614
615 ki.commit();
616
617 ArrayList hs = new ArrayList<String>();
618 hs.add("titel");
619 hs.add("frage");
620 hs.add("datum");
621 JsonNode res = ki.getFields("aa/bb/cc", hs).toJsonNode();
622 assertEquals("type:text", res.at("/document/fields/0/type").asText());
623 assertEquals("titel", res.at("/document/fields/0/key").asText());
624 assertEquals("Der alte Baum", res.at("/document/fields/0/value").asText());
625 assertEquals("frage", res.at("/document/fields/1/key").asText());
626 assertTrue(res.at("/document/fields/1/type").isMissingNode());
627 assertEquals("type:date", res.at("/document/fields/2/type").asText());
628 assertEquals("datum", res.at("/document/fields/2/key").asText());
629 assertEquals("2018-04-03", res.at("/document/fields/2/value").asText());
630 assertTrue(res.at("/document/fields/3").isMissingNode());
631 };
632
Akron685ec962019-02-25 19:04:46 +0100633
634 @Test
635 public void indexUpsert () throws Exception {
636 KrillIndex ki = new KrillIndex();
637
638 // Add new document
639 FieldDocument fd = new FieldDocument();
640 fd.addString("textSigle", "AAA/BBB/001");
641 fd.addString("content", "Example1");
642 ki.upsertDoc(fd);
643 ki.commit();
644
645 MetaFields mfs = ki.getFields("AAA/BBB/001");
646 assertEquals(mfs.getFieldValue("indexCreationDate").length(), 10);
647 assertTrue(mfs.getFieldValue("indexCreationDate").matches("\\d{4}-\\d{2}-\\d{2}"));
648 assertEquals(
649 mfs.getFieldValue("indexCreationDate"),
650 mfs.getFieldValue("indexLastModified")
651 );
652 assertEquals(mfs.getFieldValue("content"), "Example1");
653
654
655 // Add new document
656 fd = new FieldDocument();
657 fd.addString("textSigle", "AAA/BBB/002");
658 fd.addString("content", "Example2");
659
660 ki.upsertDoc(fd);
661 ki.commit();
662
663 mfs = ki.getFields("AAA/BBB/002");
664 assertEquals(mfs.getFieldValue("indexCreationDate").length(), 10);
665
666 assertTrue(mfs.getFieldValue("indexCreationDate").matches("\\d{4}-\\d{2}-\\d{2}"));
667 assertEquals(mfs.getFieldValue("content"), "Example2");
668
669 fd = new FieldDocument();
670 fd.addString("textSigle", "AAA/BBB/001");
671 fd.addString("content", "Example3");
672
673 ki.upsertDoc(fd);
674 ki.commit();
675
676 mfs = ki.getFields("AAA/BBB/001");
677 assertEquals(mfs.getFieldValue("indexCreationDate").length(), 10);
678 assertTrue(mfs.getFieldValue("indexCreationDate").matches("\\d{4}-\\d{2}-\\d{2}"));
679 assertEquals(mfs.getFieldValue("content"), "Example3");
680
681 assertEquals(ki.numberOf("documents"), 2);
Akronf0e36532019-03-06 11:43:21 +0100682
683 // Test Inputstream method
684 ki.upsertDoc(getClass().getResourceAsStream("/wiki/WPD17-H81-63495.json.gz"), true);
685 ki.commit();
686 assertEquals(ki.numberOf("documents"), 3);
Akron81829f12019-04-09 23:06:34 +0200687
688 ki.close();
689
690 fd = new FieldDocument();
691 fd.addString("textSigle", "AAA/DDD/005");
692 fd.addString("content", "Example4");
693
694 ki.upsertDoc(fd);
695 ki.commit();
696
697 assertEquals(ki.numberOf("documents"), 4);
698
Akron685ec962019-02-25 19:04:46 +0100699 };
700
701
Akron1a8bb762019-01-18 15:48:59 +0100702 private static String createDocString1 () {
703 return new String(
704 "{"
Akron510ba0b2019-02-06 19:07:17 +0100705 + " \"data\" : {"
706 + " \"text\" : \"abc\","
707 + " \"name\" : \"tokens\","
708 + " \"stream\" : ["
709 + " [ \"s:a\", \"i:a\", \"_0$<i>0<i>1\", \"-:t$<i>3\"],"
710 + " [ \"s:b\", \"i:b\", \"_1$<i>1<i>2\" ],"
711 + " [ \"s:c\", \"i:c\", \"_2$<i>2<i>3\" ]"
712 + " ]"
713 + " },"
Akron1a8bb762019-01-18 15:48:59 +0100714 + " \"fields\" : ["
Akron1a8bb762019-01-18 15:48:59 +0100715 + " {"
716 + " \"@type\" : \"koral:field\","
717 + " \"type\" : \"type:string\","
718 + " \"key\" : \"textSigle\","
719 + " \"value\" : \"aa/bb/cc\""
720 + " },"
721 + " {"
722 + " \"@type\" : \"koral:field\","
723 + " \"type\" : \"type:integer\","
724 + " \"key\" : \"alter\","
725 + " \"value\" : 40"
726 + " },"
727 + " {"
728 + " \"@type\" : \"koral:field\","
729 + " \"type\" : \"type:string\","
730 + " \"key\" : \"name\","
731 + " \"value\" : \"Frank\""
732 + " },"
733 + " {"
734 + " \"@type\" : \"koral:field\","
735 + " \"type\" : \"type:string\","
736 + " \"key\" : \"name\","
737 + " \"value\" : \"Julian\""
738 + " },"
739 + " {"
740 + " \"@type\" : \"koral:field\","
741 + " \"type\" : \"type:string\","
742 + " \"key\" : \"schluesselwoerter\","
743 + " \"value\" : [\"musik\",\"unterhaltung\"]"
744 + " },"
745 + " {"
746 + " \"@type\" : \"koral:field\","
747 + " \"type\" : \"type:keywords\","
748 + " \"key\" : \"tags\","
749 + " \"value\" : \"nachrichten feuilleton\""
750 + " },"
751 + " {"
752 + " \"@type\" : \"koral:field\","
753 + " \"type\" : \"type:keywords\","
754 + " \"key\" : \"tags\","
755 + " \"value\" : [\"sport\",\"raetsel\"]"
756 + " },"
757 + " {"
758 + " \"@type\" : \"koral:field\","
759 + " \"type\" : \"type:text\","
760 + " \"key\" : \"titel\","
761 + " \"value\" : \"Der alte Baum\""
762 + " },"
763 + " {"
764 + " \"@type\" : \"koral:field\","
765 + " \"type\" : \"type:attachement\","
766 + " \"key\" : \"anhang\","
767 + " \"value\" : \"data:application/x.korap-link,http://spiegel.de/\""
768 + " },"
769 + " {"
770 + " \"@type\" : \"koral:field\","
771 + " \"type\" : \"type:store\","
772 + " \"key\" : \"referenz\","
773 + " \"value\" : \"So war das\""
774 + " },"
775 + " {"
776 + " \"@type\" : \"koral:field\","
777 + " \"type\" : \"type:date\","
778 + " \"key\" : \"datum\","
779 + " \"value\" : \"2018-04-03\""
780 + " }"
781 + " ]"
782 + "}");
783 };
Nils Diewaldf399a672013-11-18 17:55:22 +0000784};