blob: 0ff48b5404657d2d4a038fe0f58dc528e0161bc8 [file] [log] [blame]
Nils Diewalde3645702014-11-07 21:15:20 +00001package de.ids_mannheim.korap.search;
2
3import java.util.*;
4import java.io.*;
5
6import static de.ids_mannheim.korap.TestSimple.*;
7
Nils Diewaldbbd39a52015-02-23 19:56:57 +00008import de.ids_mannheim.korap.Krill;
Nils Diewald2d5f8102015-02-26 21:07:54 +00009import de.ids_mannheim.korap.KrillCollection;
Nils Diewald0339d462015-02-26 14:53:56 +000010import de.ids_mannheim.korap.KrillQuery;
Nils Diewalda14ecd62015-02-26 21:00:20 +000011import de.ids_mannheim.korap.KrillIndex;
Nils Diewalde3645702014-11-07 21:15:20 +000012import de.ids_mannheim.korap.index.FieldDocument;
Nils Diewald884dbcf2015-02-27 17:02:28 +000013import de.ids_mannheim.korap.response.Result;
Nils Diewalde3645702014-11-07 21:15:20 +000014import java.nio.file.Files;
15import java.nio.file.FileSystem;
16import java.nio.file.Path;
17import java.nio.charset.StandardCharsets;
18import java.nio.ByteBuffer;
19
Akron484c3c12015-07-07 20:25:44 +020020import org.apache.commons.lang.StringUtils;
21import org.apache.lucene.analysis.Analyzer;
22import org.apache.lucene.analysis.TokenStream;
23import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
24
Nils Diewalde3645702014-11-07 21:15:20 +000025import com.fasterxml.jackson.databind.ObjectMapper;
26import com.fasterxml.jackson.databind.JsonNode;
27
28import static org.junit.Assert.*;
29import org.junit.Test;
30import org.junit.Ignore;
31import org.junit.runner.RunWith;
32import org.junit.runners.JUnit4;
33
34@RunWith(JUnit4.class)
35public class TestMetaFields {
Akron640458c2015-06-25 12:36:15 +020036
Nils Diewalde3645702014-11-07 21:15:20 +000037 @Test
38 public void searchMetaFields () throws IOException {
39
Nils Diewald3aa9e692015-02-20 22:20:11 +000040 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +000041 KrillIndex ki = new KrillIndex();
Nils Diewald3aa9e692015-02-20 22:20:11 +000042 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +000043 for (String i : new String[] { "00001", "00002" }) {
Nils Diewald50333552015-03-02 15:54:46 +000044 ki.addDoc(
Nils Diewaldbb33da22015-03-04 16:24:25 +000045 getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
46 true);
Nils Diewald3aa9e692015-02-20 22:20:11 +000047 };
48 ki.commit();
Nils Diewalde3645702014-11-07 21:15:20 +000049
Nils Diewaldbb33da22015-03-04 16:24:25 +000050 String jsonString = getString(getClass().getResource(
51 "/queries/metas/fields.jsonld").getFile());
52
Nils Diewaldbbd39a52015-02-23 19:56:57 +000053 Krill ks = new Krill(jsonString);
Nils Diewalde3645702014-11-07 21:15:20 +000054
Nils Diewald884dbcf2015-02-27 17:02:28 +000055 Result kr = ks.apply(ki);
Nils Diewald3aa9e692015-02-20 22:20:11 +000056 assertEquals((long) 17, kr.getTotalResults());
57 assertEquals(0, kr.getStartIndex());
58 assertEquals(9, kr.getItemsPerPage());
Nils Diewaldbb33da22015-03-04 16:24:25 +000059
Nils Diewald3aa9e692015-02-20 22:20:11 +000060 ObjectMapper mapper = new ObjectMapper();
61 JsonNode res = mapper.readTree(kr.toJsonString());
Akronb1166442015-06-27 00:34:19 +020062
63 // System.err.println(res.toString());
64 // mirror fields
65 assertEquals(9, res.at("/meta/count").asInt());
66
67 if (res.at("/meta/fields/0").asText().equals("UID")) {
68 assertEquals("corpusID", res.at("/meta/fields/1").asText());
69 }
70 else {
71 assertEquals("corpusID", res.at("/meta/fields/0").asText());
72 assertEquals("UID", res.at("/meta/fields/1").asText());
73 };
74
Nils Diewald3aa9e692015-02-20 22:20:11 +000075 assertEquals(0, res.at("/matches/0/UID").asInt());
76 assertEquals("WPD", res.at("/matches/0/corpusID").asText());
Akron12f1f5b2015-06-24 15:56:52 +020077 assertTrue(res.at("/matches/0/docID").isMissingNode());
78 assertTrue(res.at("/matches/0/textSigle").isMissingNode());
79 assertTrue(res.at("/matches/0/ID").isMissingNode());
80 assertTrue(res.at("/matches/0/author").isMissingNode());
81 assertTrue(res.at("/matches/0/title").isMissingNode());
82 assertTrue(res.at("/matches/0/subTitle").isMissingNode());
83 assertTrue(res.at("/matches/0/textClass").isMissingNode());
84 assertTrue(res.at("/matches/0/pubPlace").isMissingNode());
85 assertTrue(res.at("/matches/0/pubDate").isMissingNode());
86 assertTrue(res.at("/matches/0/foundries").isMissingNode());
87 assertTrue(res.at("/matches/0/layerInfos").isMissingNode());
88 assertTrue(res.at("/matches/0/tokenization").isMissingNode());
Nils Diewalde3645702014-11-07 21:15:20 +000089
Nils Diewaldbb33da22015-03-04 16:24:25 +000090 jsonString = getString(getClass().getResource(
91 "/queries/metas/fields_2.jsonld").getFile());
Nils Diewaldbbd39a52015-02-23 19:56:57 +000092 ks = new Krill(jsonString);
Nils Diewald3aa9e692015-02-20 22:20:11 +000093 kr = ks.apply(ki);
94 assertEquals((long) 17, kr.getTotalResults());
95 assertEquals(0, kr.getStartIndex());
96 assertEquals(2, kr.getItemsPerPage());
Nils Diewaldbb33da22015-03-04 16:24:25 +000097
Nils Diewald3aa9e692015-02-20 22:20:11 +000098 mapper = new ObjectMapper();
99 res = mapper.readTree(kr.toJsonString());
100 assertEquals(0, res.at("/matches/0/UID").asInt());
Akron12f1f5b2015-06-24 15:56:52 +0200101 assertTrue(res.at("/matches/0/corpusID").isMissingNode());
Nils Diewaldbb33da22015-03-04 16:24:25 +0000102 assertEquals("Ruru,Jens.Ol,Aglarech", res.at("/matches/0/author")
103 .asText());
Nils Diewald3aa9e692015-02-20 22:20:11 +0000104 assertEquals("A", res.at("/matches/0/title").asText());
105 assertEquals("WPD_AAA.00001", res.at("/matches/0/docID").asText());
Akron3e0403f2015-06-24 20:59:13 +0200106 assertTrue(res.at("/matches/0/textSigle").isMissingNode());
Akron7d45e6b2015-06-26 17:23:42 +0200107 assertEquals("match-WPD_AAA.00001-p6-7", res.at("/matches/0/matchID")
108 .asText());
Akron48937e92015-06-26 01:49:02 +0200109 // assertEquals("p6-7", res.at("/matches/0/matchID").asText());
Nils Diewald3aa9e692015-02-20 22:20:11 +0000110 assertEquals("", res.at("/matches/0/subTitle").asText());
111 assertEquals("", res.at("/matches/0/textClass").asText());
112 assertEquals("", res.at("/matches/0/pubPlace").asText());
113 assertEquals("", res.at("/matches/0/pubDate").asText());
114 assertEquals("", res.at("/matches/0/foundries").asText());
115 assertEquals("", res.at("/matches/0/layerInfo").asText());
116 assertEquals("", res.at("/matches/0/tokenization").asText());
Nils Diewalde3645702014-11-07 21:15:20 +0000117 };
Akron3e0403f2015-06-24 20:59:13 +0200118
Akron640458c2015-06-25 12:36:15 +0200119
Akron3e0403f2015-06-24 20:59:13 +0200120 @Test
121 public void searchMetaFieldsNew () throws IOException {
122
123 // Construct index
124 KrillIndex ki = new KrillIndex();
Akron640458c2015-06-25 12:36:15 +0200125 ki.addDoc(getClass().getResourceAsStream("/goe/AGX-00002.json"), false);
Akron3e0403f2015-06-24 20:59:13 +0200126 ki.commit();
127
128 String jsonString = getString(getClass().getResource(
129 "/queries/metas/fields_no.jsonld").getFile());
130
131 Krill ks = new Krill(jsonString);
132 Result kr = ks.apply(ki);
133 ObjectMapper mapper = new ObjectMapper();
134 JsonNode res = mapper.readTree(kr.toJsonString());
135 assertEquals(0, res.at("/matches/0/UID").asInt());
136 assertEquals("GOE_AGX.00002", res.at("/matches/0/textSigle").asText());
Akron640458c2015-06-25 12:36:15 +0200137 assertEquals("Maximen und Reflexionen", res.at("/matches/0/title")
138 .asText());
Akron3e0403f2015-06-24 20:59:13 +0200139 assertEquals("1982", res.at("/matches/0/pubDate").asText());
Akron640458c2015-06-25 12:36:15 +0200140 assertEquals("Goethe, Johann Wolfgang von", res.at("/matches/0/author")
141 .asText());
Akron3e0403f2015-06-24 20:59:13 +0200142 assertEquals("GOE_AGX", res.at("/matches/0/docSigle").asText());
143 assertEquals("GOE", res.at("/matches/0/corpusSigle").asText());
Akron640458c2015-06-25 12:36:15 +0200144 assertEquals("Religion und Christentum", res.at("/matches/0/subTitle")
145 .asText());
Akron3e0403f2015-06-24 20:59:13 +0200146 assertEquals("München", res.at("/matches/0/pubPlace").asText());
Akron640458c2015-06-25 12:36:15 +0200147 assertEquals(
148 "base/s=spans cnx/c=spans cnx/l=tokens cnx/m=tokens cnx/p=tokens cnx/s=spans cnx/syn=tokens corenlp/c=spans corenlp/ne=tokens corenlp/p=tokens corenlp/s=spans glemm/l=tokens mate/l=tokens mate/m=tokens mate/p=tokens opennlp/p=tokens opennlp/s=spans tt/l=tokens tt/p=tokens tt/s=spans xip/c=spans xip/l=tokens xip/p=tokens xip/s=spans",
149 res.at("/matches/0/layerInfos").asText());
Akron3e0403f2015-06-24 20:59:13 +0200150 assertTrue(res.at("/matches/0/textType").isMissingNode());
Akron48937e92015-06-26 01:49:02 +0200151 assertEquals("match-GOE_AGX.00002-p7-8", res.at("/matches/0/matchID")
152 .asText());
153
Akron3e0403f2015-06-24 20:59:13 +0200154
155 // All fields
156 jsonString = getString(getClass().getResource(
Akron640458c2015-06-25 12:36:15 +0200157 "/queries/metas/fields_all.jsonld").getFile());
Akron3e0403f2015-06-24 20:59:13 +0200158
159 ks = new Krill(jsonString);
160 kr = ks.apply(ki);
161 mapper = new ObjectMapper();
162 res = mapper.readTree(kr.toJsonString());
Akron640458c2015-06-25 12:36:15 +0200163 assertEquals("Verlag C. H. Beck", res.at("/matches/0/publisher")
164 .asText());
Akron3e0403f2015-06-24 20:59:13 +0200165 assertEquals("Aphorismus", res.at("/matches/0/textType").asText());
166 assertEquals("Aphorismen", res.at("/matches/0/textTypeRef").asText());
Akron640458c2015-06-25 12:36:15 +0200167 assertEquals(
168 "Goethe, Johann Wolfgang von: Maximen und Reflexionen. Religion und Christentum, [Aphorismen], (Erstveröffentlichung: Stuttgart ; Tübingen, 1827-1842), In: Goethe, Johann Wolfgang von: Goethes Werke, Bd. 12, Schriften zur Kunst. Schriften zur Literatur. Maximen und Reflexionen, Hrsg.: Trunz, Erich. München: Verlag C. H. Beck, 1982, S. 372-377",
169 res.at("/matches/0/reference").asText());
Akron3e0403f2015-06-24 20:59:13 +0200170 assertEquals("de", res.at("/matches/0/language").asText());
Akron640458c2015-06-25 12:36:15 +0200171 assertEquals("opennlp#tokens", res.at("/matches/0/tokenSource")
172 .asText());
173 assertEquals(
174 "base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/namedentities corenlp/sentences glemm glemm/morpho mate mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences",
175 res.at("/matches/0/foundries").asText());
Akron3e0403f2015-06-24 20:59:13 +0200176 assertEquals("Goethe-Korpus", res.at("/matches/0/corpusTitle").asText());
177 assertEquals("QAO-NC", res.at("/matches/0/license").asText());
Akron640458c2015-06-25 12:36:15 +0200178 assertEquals("Goethe: Maximen und Reflexionen, (1827-1842)",
179 res.at("/matches/0/docTitle").asText());
Akron3e0403f2015-06-24 20:59:13 +0200180 assertEquals("1827", res.at("/matches/0/creationDate").asText());
181 assertEquals("372-377", res.at("/matches/0/pages").asText());
Akron48937e92015-06-26 01:49:02 +0200182 assertEquals("match-GOE_AGX.00002-p7-8", res.at("/matches/0/matchID")
Akron640458c2015-06-25 12:36:15 +0200183 .asText());
Akron6590c322015-07-02 16:08:13 +0200184
185
186 // @All fields
187 jsonString = getString(getClass().getResource(
188 "/queries/metas/fields_at_all.jsonld").getFile());
189
190 ks = new Krill(jsonString);
191 kr = ks.apply(ki);
192 mapper = new ObjectMapper();
193 res = mapper.readTree(kr.toJsonString());
194
195 assertEquals("Verlag C. H. Beck", res.at("/matches/0/publisher")
196 .asText());
197 assertEquals("Aphorismus", res.at("/matches/0/textType").asText());
198 assertEquals("Aphorismen", res.at("/matches/0/textTypeRef").asText());
199 assertEquals(
200 "Goethe, Johann Wolfgang von: Maximen und Reflexionen. Religion und Christentum, [Aphorismen], (Erstveröffentlichung: Stuttgart ; Tübingen, 1827-1842), In: Goethe, Johann Wolfgang von: Goethes Werke, Bd. 12, Schriften zur Kunst. Schriften zur Literatur. Maximen und Reflexionen, Hrsg.: Trunz, Erich. München: Verlag C. H. Beck, 1982, S. 372-377",
201 res.at("/matches/0/reference").asText());
202 assertEquals("de", res.at("/matches/0/language").asText());
203 assertEquals("opennlp#tokens", res.at("/matches/0/tokenSource")
204 .asText());
205 assertEquals(
206 "base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/namedentities corenlp/sentences glemm glemm/morpho mate mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences",
207 res.at("/matches/0/foundries").asText());
208 assertEquals("Goethe-Korpus", res.at("/matches/0/corpusTitle").asText());
209 assertEquals("QAO-NC", res.at("/matches/0/license").asText());
210 assertEquals("Goethe: Maximen und Reflexionen, (1827-1842)",
211 res.at("/matches/0/docTitle").asText());
212 assertEquals("1827", res.at("/matches/0/creationDate").asText());
213 assertEquals("372-377", res.at("/matches/0/pages").asText());
214 assertEquals("match-GOE_AGX.00002-p7-8", res.at("/matches/0/matchID")
215 .asText());
Akron3e0403f2015-06-24 20:59:13 +0200216 };
Akron48937e92015-06-26 01:49:02 +0200217
Akron40550172015-08-04 03:06:12 +0200218
Akron484c3c12015-07-07 20:25:44 +0200219 @Test
220 public void searchCollectionFields () throws IOException {
221 KrillIndex ki = new KrillIndex();
222 FieldDocument fd = new FieldDocument();
223 fd.addString("corpusSigle", "ABC");
224 fd.addString("docSigle", "ABC-123");
225 fd.addString("textSigle", "ABC-123-0001");
226 fd.addText("title", "Die Wahlverwandschaften");
227 fd.addText("author", "Johann Wolfgang von Goethe");
228 fd.addKeyword("textClass", "reisen wissenschaft");
229 fd.addInt("pubDate", 20130617);
230 fd.addTV("tokens", "abc", "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]"
231 + "[(1-2)s:b|i:b|_1#1-2]" + "[(2-3)s:c|i:c|_2#2-3]");
232 ki.addDoc(fd);
233
234 FieldDocument fd2 = new FieldDocument();
235 fd2.addString("corpusSigle", "ABC");
236 fd2.addString("docSigle", "ABC-125");
237 fd2.addString("textSigle", "ABC-125-0001");
238 fd2.addText("title", "Die Glocke");
239 fd2.addText("author", "Schiller, Friedrich");
240 fd2.addKeyword("textClass", "Reisen geschichte");
241 fd2.addInt("pubDate", 20130203);
242 fd2.addTV("tokens", "abc", "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]"
243 + "[(1-2)s:b|i:b|_1#1-2]" + "[(2-3)s:c|i:c|_2#2-3]");
244 ki.addDoc(fd2);
245 ki.commit();
246
247 // textClass = reisen & wissenschaft
248 String jsonString = getString(getClass().getResource(
Akron40550172015-08-04 03:06:12 +0200249 "/queries/collections/collection_textClass.jsonld").getFile());
Akron484c3c12015-07-07 20:25:44 +0200250 Krill ks = new Krill(jsonString);
251 KrillCollection kc = ks.getCollection();
252 kc.setIndex(ki);
253 assertEquals(1, kc.getCount()); // 1 filter operation
254 assertEquals(1, kc.numberOf("documents"));
255
256 // textClass = reisen
257 jsonString = getString(getClass().getResource(
Akron40550172015-08-04 03:06:12 +0200258 "/queries/collections/collection_textClass_2.jsonld").getFile());
Akron484c3c12015-07-07 20:25:44 +0200259 ks = new Krill(jsonString);
260 kc = ks.getCollection();
261 kc.setIndex(ki);
262 assertEquals(1, kc.getCount()); // 1 filter operation
263 assertEquals(2, kc.numberOf("documents"));
264
265 /*
266 System.err.println(StringUtils.join(fd2.doc.getValues("textClass"), ","));
267 System.err.println(StringUtils.join(fd2.doc.getValues("author"), ", "));
268 */
269 /*
270 TokenStream ts = fd2.doc.getField("author").tokenStream(
271 (Analyzer) ki.writer().getAnalyzer(),
272 (TokenStream) null
273 );
274 // OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
275 CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
276
277 ts.reset();
278 while (ts.incrementToken()) {
279 String term = charTermAttribute.toString();
280 System.err.println(">>" + term + "<<");
281 };
282 */
283
284 // author = wolfgang
285 jsonString = getString(getClass().getResource(
Akron40550172015-08-04 03:06:12 +0200286 "/queries/collections/collection_goethe.jsonld").getFile());
Akron484c3c12015-07-07 20:25:44 +0200287 ks = new Krill(jsonString);
288 kc = ks.getCollection();
289 kc.setIndex(ki);
290 assertEquals(1, kc.getCount()); // 1 filter operation
291 assertEquals(1, kc.numberOf("documents"));
292
293 // author = Wolfgang
294 jsonString = getString(getClass().getResource(
Akron40550172015-08-04 03:06:12 +0200295 "/queries/collections/collection_goethe_2.jsonld").getFile());
Akron484c3c12015-07-07 20:25:44 +0200296 ks = new Krill(jsonString);
297 kc = ks.getCollection();
298 kc.setIndex(ki);
299 assertEquals(1, kc.getCount()); // 1 filter operation
300 assertEquals(1, kc.numberOf("documents"));
301
302 Result kr = ks.apply(ki);
Akron40550172015-08-04 03:06:12 +0200303
Akron484c3c12015-07-07 20:25:44 +0200304 ObjectMapper mapper = new ObjectMapper();
305 JsonNode res = mapper.readTree(kr.toJsonString());
306 assertEquals(1, res.at("/meta/totalResults").asInt());
307 };
308
Akron48937e92015-06-26 01:49:02 +0200309
310 @Test
311 public void searchMetaContext () throws IOException {
312
313 // All fields
314 String jsonString = getString(getClass().getResource(
315 "/queries/metas/context_paragraph.jsonld").getFile());
316
317 Krill ks = new Krill(jsonString);
318 assertTrue(ks.getMeta().getContext().isSpanDefined());
319 assertEquals("base/p", ks.getMeta().getContext().getSpanContext());
320 };
Nils Diewalde3645702014-11-07 21:15:20 +0000321};