blob: 022a6f6f7f031c56d8994fafac351dfde00ca836 [file] [log] [blame]
Nils Diewalde3645702014-11-07 21:15:20 +00001package de.ids_mannheim.korap.search;
2
3import java.util.*;
4import java.io.*;
5
6import static de.ids_mannheim.korap.TestSimple.*;
7
Nils Diewaldbbd39a52015-02-23 19:56:57 +00008import de.ids_mannheim.korap.Krill;
Nils Diewald2d5f8102015-02-26 21:07:54 +00009import de.ids_mannheim.korap.KrillCollection;
Nils Diewald0339d462015-02-26 14:53:56 +000010import de.ids_mannheim.korap.KrillQuery;
Nils Diewalda14ecd62015-02-26 21:00:20 +000011import de.ids_mannheim.korap.KrillIndex;
Nils Diewalde3645702014-11-07 21:15:20 +000012import de.ids_mannheim.korap.index.FieldDocument;
Nils Diewald884dbcf2015-02-27 17:02:28 +000013import de.ids_mannheim.korap.response.Result;
Nils Diewalde3645702014-11-07 21:15:20 +000014import java.nio.file.Files;
15import java.nio.file.FileSystem;
16import java.nio.file.Path;
17import java.nio.charset.StandardCharsets;
18import java.nio.ByteBuffer;
19
Akron484c3c12015-07-07 20:25:44 +020020import org.apache.commons.lang.StringUtils;
21import org.apache.lucene.analysis.Analyzer;
22import org.apache.lucene.analysis.TokenStream;
23import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
24
Nils Diewalde3645702014-11-07 21:15:20 +000025import com.fasterxml.jackson.databind.ObjectMapper;
26import com.fasterxml.jackson.databind.JsonNode;
27
28import static org.junit.Assert.*;
29import org.junit.Test;
30import org.junit.Ignore;
31import org.junit.runner.RunWith;
32import org.junit.runners.JUnit4;
33
34@RunWith(JUnit4.class)
35public class TestMetaFields {
Akron640458c2015-06-25 12:36:15 +020036
Nils Diewalde3645702014-11-07 21:15:20 +000037 @Test
38 public void searchMetaFields () throws IOException {
39
Nils Diewald3aa9e692015-02-20 22:20:11 +000040 // Construct index
Nils Diewalda14ecd62015-02-26 21:00:20 +000041 KrillIndex ki = new KrillIndex();
Nils Diewald3aa9e692015-02-20 22:20:11 +000042 // Indexing test files
Nils Diewaldbb33da22015-03-04 16:24:25 +000043 for (String i : new String[] { "00001", "00002" }) {
Eliza Margaretha6f989202016-10-14 21:48:29 +020044 ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
Nils Diewaldbb33da22015-03-04 16:24:25 +000045 true);
Nils Diewald3aa9e692015-02-20 22:20:11 +000046 };
47 ki.commit();
Nils Diewalde3645702014-11-07 21:15:20 +000048
Eliza Margaretha6f989202016-10-14 21:48:29 +020049 String jsonString = getJsonString(getClass()
50 .getResource("/queries/metas/fields.jsonld").getFile());
Nils Diewaldbb33da22015-03-04 16:24:25 +000051
Nils Diewaldbbd39a52015-02-23 19:56:57 +000052 Krill ks = new Krill(jsonString);
Nils Diewalde3645702014-11-07 21:15:20 +000053
Nils Diewald884dbcf2015-02-27 17:02:28 +000054 Result kr = ks.apply(ki);
Nils Diewald3aa9e692015-02-20 22:20:11 +000055 assertEquals((long) 17, kr.getTotalResults());
56 assertEquals(0, kr.getStartIndex());
57 assertEquals(9, kr.getItemsPerPage());
Nils Diewaldbb33da22015-03-04 16:24:25 +000058
Nils Diewald3aa9e692015-02-20 22:20:11 +000059 ObjectMapper mapper = new ObjectMapper();
60 JsonNode res = mapper.readTree(kr.toJsonString());
Akronb1166442015-06-27 00:34:19 +020061
62 // System.err.println(res.toString());
63 // mirror fields
64 assertEquals(9, res.at("/meta/count").asInt());
65
66 if (res.at("/meta/fields/0").asText().equals("UID")) {
67 assertEquals("corpusID", res.at("/meta/fields/1").asText());
68 }
69 else {
70 assertEquals("corpusID", res.at("/meta/fields/0").asText());
71 assertEquals("UID", res.at("/meta/fields/1").asText());
72 };
73
Nils Diewald3aa9e692015-02-20 22:20:11 +000074 assertEquals(0, res.at("/matches/0/UID").asInt());
75 assertEquals("WPD", res.at("/matches/0/corpusID").asText());
Akron12f1f5b2015-06-24 15:56:52 +020076 assertTrue(res.at("/matches/0/docID").isMissingNode());
77 assertTrue(res.at("/matches/0/textSigle").isMissingNode());
78 assertTrue(res.at("/matches/0/ID").isMissingNode());
79 assertTrue(res.at("/matches/0/author").isMissingNode());
80 assertTrue(res.at("/matches/0/title").isMissingNode());
81 assertTrue(res.at("/matches/0/subTitle").isMissingNode());
82 assertTrue(res.at("/matches/0/textClass").isMissingNode());
83 assertTrue(res.at("/matches/0/pubPlace").isMissingNode());
84 assertTrue(res.at("/matches/0/pubDate").isMissingNode());
85 assertTrue(res.at("/matches/0/foundries").isMissingNode());
86 assertTrue(res.at("/matches/0/layerInfos").isMissingNode());
87 assertTrue(res.at("/matches/0/tokenization").isMissingNode());
Nils Diewalde3645702014-11-07 21:15:20 +000088
Eliza Margaretha6f989202016-10-14 21:48:29 +020089 jsonString = getJsonString(getClass()
90 .getResource("/queries/metas/fields_2.jsonld").getFile());
Nils Diewaldbbd39a52015-02-23 19:56:57 +000091 ks = new Krill(jsonString);
Nils Diewald3aa9e692015-02-20 22:20:11 +000092 kr = ks.apply(ki);
93 assertEquals((long) 17, kr.getTotalResults());
94 assertEquals(0, kr.getStartIndex());
95 assertEquals(2, kr.getItemsPerPage());
Nils Diewaldbb33da22015-03-04 16:24:25 +000096
Nils Diewald3aa9e692015-02-20 22:20:11 +000097 mapper = new ObjectMapper();
98 res = mapper.readTree(kr.toJsonString());
99 assertEquals(0, res.at("/matches/0/UID").asInt());
Akron12f1f5b2015-06-24 15:56:52 +0200100 assertTrue(res.at("/matches/0/corpusID").isMissingNode());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200101 assertEquals("Ruru,Jens.Ol,Aglarech",
102 res.at("/matches/0/author").asText());
Nils Diewald3aa9e692015-02-20 22:20:11 +0000103 assertEquals("A", res.at("/matches/0/title").asText());
104 assertEquals("WPD_AAA.00001", res.at("/matches/0/docID").asText());
Akron3e0403f2015-06-24 20:59:13 +0200105 assertTrue(res.at("/matches/0/textSigle").isMissingNode());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200106 assertEquals("match-WPD_AAA.00001-p6-7",
107 res.at("/matches/0/matchID").asText());
Akron48937e92015-06-26 01:49:02 +0200108 // assertEquals("p6-7", res.at("/matches/0/matchID").asText());
Nils Diewald3aa9e692015-02-20 22:20:11 +0000109 assertEquals("", res.at("/matches/0/subTitle").asText());
110 assertEquals("", res.at("/matches/0/textClass").asText());
111 assertEquals("", res.at("/matches/0/pubPlace").asText());
112 assertEquals("", res.at("/matches/0/pubDate").asText());
113 assertEquals("", res.at("/matches/0/foundries").asText());
114 assertEquals("", res.at("/matches/0/layerInfo").asText());
115 assertEquals("", res.at("/matches/0/tokenization").asText());
Nils Diewalde3645702014-11-07 21:15:20 +0000116 };
Akron3e0403f2015-06-24 20:59:13 +0200117
Akron640458c2015-06-25 12:36:15 +0200118
Akron3e0403f2015-06-24 20:59:13 +0200119 @Test
120 public void searchMetaFieldsNew () throws IOException {
121
122 // Construct index
123 KrillIndex ki = new KrillIndex();
Akron640458c2015-06-25 12:36:15 +0200124 ki.addDoc(getClass().getResourceAsStream("/goe/AGX-00002.json"), false);
Akron3e0403f2015-06-24 20:59:13 +0200125 ki.commit();
126
Eliza Margaretha6f989202016-10-14 21:48:29 +0200127 String jsonString = getJsonString(getClass()
128 .getResource("/queries/metas/fields_no.jsonld").getFile());
Akron3e0403f2015-06-24 20:59:13 +0200129
130 Krill ks = new Krill(jsonString);
131 Result kr = ks.apply(ki);
132 ObjectMapper mapper = new ObjectMapper();
133 JsonNode res = mapper.readTree(kr.toJsonString());
134 assertEquals(0, res.at("/matches/0/UID").asInt());
135 assertEquals("GOE_AGX.00002", res.at("/matches/0/textSigle").asText());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200136 assertEquals("Maximen und Reflexionen",
137 res.at("/matches/0/title").asText());
Akron3e0403f2015-06-24 20:59:13 +0200138 assertEquals("1982", res.at("/matches/0/pubDate").asText());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200139 assertEquals("Goethe, Johann Wolfgang von",
140 res.at("/matches/0/author").asText());
Akron3e0403f2015-06-24 20:59:13 +0200141 assertEquals("GOE_AGX", res.at("/matches/0/docSigle").asText());
142 assertEquals("GOE", res.at("/matches/0/corpusSigle").asText());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200143 assertEquals("Religion und Christentum",
144 res.at("/matches/0/subTitle").asText());
Akron3e0403f2015-06-24 20:59:13 +0200145 assertEquals("München", res.at("/matches/0/pubPlace").asText());
Akron640458c2015-06-25 12:36:15 +0200146 assertEquals(
147 "base/s=spans cnx/c=spans cnx/l=tokens cnx/m=tokens cnx/p=tokens cnx/s=spans cnx/syn=tokens corenlp/c=spans corenlp/ne=tokens corenlp/p=tokens corenlp/s=spans glemm/l=tokens mate/l=tokens mate/m=tokens mate/p=tokens opennlp/p=tokens opennlp/s=spans tt/l=tokens tt/p=tokens tt/s=spans xip/c=spans xip/l=tokens xip/p=tokens xip/s=spans",
148 res.at("/matches/0/layerInfos").asText());
Akron3e0403f2015-06-24 20:59:13 +0200149 assertTrue(res.at("/matches/0/textType").isMissingNode());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200150 assertEquals("match-GOE_AGX.00002-p7-8",
151 res.at("/matches/0/matchID").asText());
Akron48937e92015-06-26 01:49:02 +0200152
Akron3e0403f2015-06-24 20:59:13 +0200153
154 // All fields
Eliza Margaretha6f989202016-10-14 21:48:29 +0200155 jsonString = getJsonString(getClass()
156 .getResource("/queries/metas/fields_all.jsonld").getFile());
Akron3e0403f2015-06-24 20:59:13 +0200157
158 ks = new Krill(jsonString);
159 kr = ks.apply(ki);
160 mapper = new ObjectMapper();
161 res = mapper.readTree(kr.toJsonString());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200162 assertEquals("Verlag C. H. Beck",
163 res.at("/matches/0/publisher").asText());
Akron3e0403f2015-06-24 20:59:13 +0200164 assertEquals("Aphorismus", res.at("/matches/0/textType").asText());
165 assertEquals("Aphorismen", res.at("/matches/0/textTypeRef").asText());
Akron640458c2015-06-25 12:36:15 +0200166 assertEquals(
167 "Goethe, Johann Wolfgang von: Maximen und Reflexionen. Religion und Christentum, [Aphorismen], (Erstveröffentlichung: Stuttgart ; Tübingen, 1827-1842), In: Goethe, Johann Wolfgang von: Goethes Werke, Bd. 12, Schriften zur Kunst. Schriften zur Literatur. Maximen und Reflexionen, Hrsg.: Trunz, Erich. München: Verlag C. H. Beck, 1982, S. 372-377",
168 res.at("/matches/0/reference").asText());
Akron3e0403f2015-06-24 20:59:13 +0200169 assertEquals("de", res.at("/matches/0/language").asText());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200170 assertEquals("opennlp#tokens",
171 res.at("/matches/0/tokenSource").asText());
Akron640458c2015-06-25 12:36:15 +0200172 assertEquals(
173 "base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/namedentities corenlp/sentences glemm glemm/morpho mate mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences",
174 res.at("/matches/0/foundries").asText());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200175 assertEquals("Goethe-Korpus",
176 res.at("/matches/0/corpusTitle").asText());
Akron7e3a10f2017-05-05 15:36:20 +0200177 assertEquals("QAO-NC", res.at("/matches/0/availability").asText());
Akron640458c2015-06-25 12:36:15 +0200178 assertEquals("Goethe: Maximen und Reflexionen, (1827-1842)",
179 res.at("/matches/0/docTitle").asText());
Akron3e0403f2015-06-24 20:59:13 +0200180 assertEquals("1827", res.at("/matches/0/creationDate").asText());
Akron69b958c2017-02-15 22:49:45 +0100181 // assertEquals("372-377", res.at("/matches/0/pages").asText());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200182 assertEquals("match-GOE_AGX.00002-p7-8",
183 res.at("/matches/0/matchID").asText());
Akron6590c322015-07-02 16:08:13 +0200184
185
186 // @All fields
Eliza Margaretha6f989202016-10-14 21:48:29 +0200187 jsonString = getJsonString(getClass()
188 .getResource("/queries/metas/fields_at_all.jsonld").getFile());
Akron6590c322015-07-02 16:08:13 +0200189
190 ks = new Krill(jsonString);
191 kr = ks.apply(ki);
192 mapper = new ObjectMapper();
193 res = mapper.readTree(kr.toJsonString());
194
Eliza Margaretha6f989202016-10-14 21:48:29 +0200195 assertEquals("Verlag C. H. Beck",
196 res.at("/matches/0/publisher").asText());
Akron6590c322015-07-02 16:08:13 +0200197 assertEquals("Aphorismus", res.at("/matches/0/textType").asText());
198 assertEquals("Aphorismen", res.at("/matches/0/textTypeRef").asText());
199 assertEquals(
200 "Goethe, Johann Wolfgang von: Maximen und Reflexionen. Religion und Christentum, [Aphorismen], (Erstveröffentlichung: Stuttgart ; Tübingen, 1827-1842), In: Goethe, Johann Wolfgang von: Goethes Werke, Bd. 12, Schriften zur Kunst. Schriften zur Literatur. Maximen und Reflexionen, Hrsg.: Trunz, Erich. München: Verlag C. H. Beck, 1982, S. 372-377",
201 res.at("/matches/0/reference").asText());
202 assertEquals("de", res.at("/matches/0/language").asText());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200203 assertEquals("opennlp#tokens",
204 res.at("/matches/0/tokenSource").asText());
Akron6590c322015-07-02 16:08:13 +0200205 assertEquals(
206 "base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/namedentities corenlp/sentences glemm glemm/morpho mate mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences",
207 res.at("/matches/0/foundries").asText());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200208 assertEquals("Goethe-Korpus",
209 res.at("/matches/0/corpusTitle").asText());
Akron6590c322015-07-02 16:08:13 +0200210 assertEquals("QAO-NC", res.at("/matches/0/license").asText());
211 assertEquals("Goethe: Maximen und Reflexionen, (1827-1842)",
212 res.at("/matches/0/docTitle").asText());
213 assertEquals("1827", res.at("/matches/0/creationDate").asText());
Akron69b958c2017-02-15 22:49:45 +0100214 // assertEquals("372-377", res.at("/matches/0/pages").asText());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200215 assertEquals("match-GOE_AGX.00002-p7-8",
216 res.at("/matches/0/matchID").asText());
Akron3e0403f2015-06-24 20:59:13 +0200217 };
Akron48937e92015-06-26 01:49:02 +0200218
Akron40550172015-08-04 03:06:12 +0200219
Akron484c3c12015-07-07 20:25:44 +0200220 @Test
221 public void searchCollectionFields () throws IOException {
222 KrillIndex ki = new KrillIndex();
223 FieldDocument fd = new FieldDocument();
224 fd.addString("corpusSigle", "ABC");
225 fd.addString("docSigle", "ABC-123");
226 fd.addString("textSigle", "ABC-123-0001");
227 fd.addText("title", "Die Wahlverwandschaften");
228 fd.addText("author", "Johann Wolfgang von Goethe");
229 fd.addKeyword("textClass", "reisen wissenschaft");
230 fd.addInt("pubDate", 20130617);
231 fd.addTV("tokens", "abc", "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]"
232 + "[(1-2)s:b|i:b|_1#1-2]" + "[(2-3)s:c|i:c|_2#2-3]");
233 ki.addDoc(fd);
234
235 FieldDocument fd2 = new FieldDocument();
236 fd2.addString("corpusSigle", "ABC");
237 fd2.addString("docSigle", "ABC-125");
238 fd2.addString("textSigle", "ABC-125-0001");
239 fd2.addText("title", "Die Glocke");
240 fd2.addText("author", "Schiller, Friedrich");
241 fd2.addKeyword("textClass", "Reisen geschichte");
242 fd2.addInt("pubDate", 20130203);
243 fd2.addTV("tokens", "abc", "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]"
244 + "[(1-2)s:b|i:b|_1#1-2]" + "[(2-3)s:c|i:c|_2#2-3]");
245 ki.addDoc(fd2);
246 ki.commit();
247
248 // textClass = reisen & wissenschaft
Eliza Margaretha6f989202016-10-14 21:48:29 +0200249 String jsonString = getJsonString(getClass()
250 .getResource("/queries/collections/collection_textClass.jsonld")
251 .getFile());
Akron484c3c12015-07-07 20:25:44 +0200252 Krill ks = new Krill(jsonString);
253 KrillCollection kc = ks.getCollection();
254 kc.setIndex(ki);
Akron484c3c12015-07-07 20:25:44 +0200255 assertEquals(1, kc.numberOf("documents"));
256
257 // textClass = reisen
Eliza Margaretha6f989202016-10-14 21:48:29 +0200258 jsonString = getJsonString(getClass()
259 .getResource(
260 "/queries/collections/collection_textClass_2.jsonld")
261 .getFile());
Akron484c3c12015-07-07 20:25:44 +0200262 ks = new Krill(jsonString);
263 kc = ks.getCollection();
264 kc.setIndex(ki);
Akron484c3c12015-07-07 20:25:44 +0200265 assertEquals(2, kc.numberOf("documents"));
266
267 /*
Akron484c3c12015-07-07 20:25:44 +0200268 TokenStream ts = fd2.doc.getField("author").tokenStream(
269 (Analyzer) ki.writer().getAnalyzer(),
270 (TokenStream) null
271 );
272 // OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
273 CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200274
Akron484c3c12015-07-07 20:25:44 +0200275 ts.reset();
276 while (ts.incrementToken()) {
277 String term = charTermAttribute.toString();
278 System.err.println(">>" + term + "<<");
279 };
280 */
281
282 // author = wolfgang
Eliza Margaretha6f989202016-10-14 21:48:29 +0200283 jsonString = getJsonString(getClass()
284 .getResource("/queries/collections/collection_goethe.jsonld")
285 .getFile());
Akron484c3c12015-07-07 20:25:44 +0200286 ks = new Krill(jsonString);
287 kc = ks.getCollection();
288 kc.setIndex(ki);
Akron484c3c12015-07-07 20:25:44 +0200289 assertEquals(1, kc.numberOf("documents"));
290
291 // author = Wolfgang
Eliza Margaretha6f989202016-10-14 21:48:29 +0200292 jsonString = getJsonString(getClass()
293 .getResource("/queries/collections/collection_goethe_2.jsonld")
294 .getFile());
Akron484c3c12015-07-07 20:25:44 +0200295 ks = new Krill(jsonString);
296 kc = ks.getCollection();
297 kc.setIndex(ki);
Akron484c3c12015-07-07 20:25:44 +0200298 assertEquals(1, kc.numberOf("documents"));
299
300 Result kr = ks.apply(ki);
Akron40550172015-08-04 03:06:12 +0200301
Akron484c3c12015-07-07 20:25:44 +0200302 ObjectMapper mapper = new ObjectMapper();
303 JsonNode res = mapper.readTree(kr.toJsonString());
304 assertEquals(1, res.at("/meta/totalResults").asInt());
305 };
306
Akron48937e92015-06-26 01:49:02 +0200307
308 @Test
309 public void searchMetaContext () throws IOException {
310
311 // All fields
Eliza Margaretha6f989202016-10-14 21:48:29 +0200312 String jsonString = getJsonString(getClass()
313 .getResource("/queries/metas/context_paragraph.jsonld")
314 .getFile());
Akron48937e92015-06-26 01:49:02 +0200315
316 Krill ks = new Krill(jsonString);
317 assertTrue(ks.getMeta().getContext().isSpanDefined());
318 assertEquals("base/p", ks.getMeta().getContext().getSpanContext());
319 };
Nils Diewalde3645702014-11-07 21:15:20 +0000320};