// Source blob: 0ff48b5404657d2d4a038fe0f58dc528e0161bc8
package de.ids_mannheim.korap.search;
import java.util.*;
import java.io.*;
import static de.ids_mannheim.korap.TestSimple.*;
import de.ids_mannheim.korap.Krill;
import de.ids_mannheim.korap.KrillCollection;
import de.ids_mannheim.korap.KrillQuery;
import de.ids_mannheim.korap.KrillIndex;
import de.ids_mannheim.korap.index.FieldDocument;
import de.ids_mannheim.korap.response.Result;
import java.nio.file.Files;
import java.nio.file.FileSystem;
import java.nio.file.Path;
import java.nio.charset.StandardCharsets;
import java.nio.ByteBuffer;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.JsonNode;
import static org.junit.Assert.*;
import org.junit.Test;
import org.junit.Ignore;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
/**
 * Tests how metadata is exposed in search responses and used in
 * collection queries.
 *
 * <p>Each test builds a fresh in-memory {@link KrillIndex}, loads a
 * serialized query from the test resources, applies it, and inspects the
 * JSON serialization of the result.
 */
@RunWith(JUnit4.class)
public class TestMetaFields {

    /**
     * Shared JSON parser. Only {@code readTree} is called on it, so a
     * single instance can serve every test instead of re-creating one
     * per query.
     */
    private static final ObjectMapper MAPPER = new ObjectMapper();


    /**
     * Checks that only the metadata fields requested by the query show up
     * in the matches of the response.
     */
    @Test
    public void searchMetaFields () throws IOException {
        // Construct index
        KrillIndex ki = new KrillIndex();

        // Indexing test files
        for (String i : new String[] { "00001", "00002" }) {
            ki.addDoc(
                    getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
                    true);
        }
        ki.commit();

        Result kr = new Krill(loadQuery("/queries/metas/fields.jsonld"))
                .apply(ki);
        assertEquals((long) 17, kr.getTotalResults());
        assertEquals(0, kr.getStartIndex());
        assertEquals(9, kr.getItemsPerPage());

        JsonNode res = toJson(kr);

        // The response mirrors the requested fields; either order of
        // "UID" and "corpusID" is accepted.
        assertEquals(9, res.at("/meta/count").asInt());
        if (res.at("/meta/fields/0").asText().equals("UID")) {
            assertEquals("corpusID", res.at("/meta/fields/1").asText());
        }
        else {
            assertEquals("corpusID", res.at("/meta/fields/0").asText());
            assertEquals("UID", res.at("/meta/fields/1").asText());
        }

        assertEquals(0, res.at("/matches/0/UID").asInt());
        assertEquals("WPD", res.at("/matches/0/corpusID").asText());

        // Everything else must be absent from the match
        assertTrue(res.at("/matches/0/docID").isMissingNode());
        assertTrue(res.at("/matches/0/textSigle").isMissingNode());
        assertTrue(res.at("/matches/0/ID").isMissingNode());
        assertTrue(res.at("/matches/0/author").isMissingNode());
        assertTrue(res.at("/matches/0/title").isMissingNode());
        assertTrue(res.at("/matches/0/subTitle").isMissingNode());
        assertTrue(res.at("/matches/0/textClass").isMissingNode());
        assertTrue(res.at("/matches/0/pubPlace").isMissingNode());
        assertTrue(res.at("/matches/0/pubDate").isMissingNode());
        assertTrue(res.at("/matches/0/foundries").isMissingNode());
        assertTrue(res.at("/matches/0/layerInfos").isMissingNode());
        assertTrue(res.at("/matches/0/tokenization").isMissingNode());

        // Second query with a different field selection
        kr = new Krill(loadQuery("/queries/metas/fields_2.jsonld")).apply(ki);
        assertEquals((long) 17, kr.getTotalResults());
        assertEquals(0, kr.getStartIndex());
        assertEquals(2, kr.getItemsPerPage());

        res = toJson(kr);
        assertEquals(0, res.at("/matches/0/UID").asInt());
        assertTrue(res.at("/matches/0/corpusID").isMissingNode());
        assertEquals("Ruru,Jens.Ol,Aglarech",
                res.at("/matches/0/author").asText());
        assertEquals("A", res.at("/matches/0/title").asText());
        assertEquals("WPD_AAA.00001", res.at("/matches/0/docID").asText());
        assertTrue(res.at("/matches/0/textSigle").isMissingNode());
        assertEquals("match-WPD_AAA.00001-p6-7",
                res.at("/matches/0/matchID").asText());

        // asText() yields "" for a missing node as well as for an empty
        // value, so the following checks accept both cases.
        // NOTE(review): the first query checks "layerInfos"; "layerInfo"
        // below may be a typo - confirm against the query file.
        assertEquals("", res.at("/matches/0/subTitle").asText());
        assertEquals("", res.at("/matches/0/textClass").asText());
        assertEquals("", res.at("/matches/0/pubPlace").asText());
        assertEquals("", res.at("/matches/0/pubDate").asText());
        assertEquals("", res.at("/matches/0/foundries").asText());
        assertEquals("", res.at("/matches/0/layerInfo").asText());
        assertEquals("", res.at("/matches/0/tokenization").asText());
    }


    /**
     * Checks the metadata fields returned for a single Goethe document
     * under three different field selections.
     */
    @Test
    public void searchMetaFieldsNew () throws IOException {
        // Construct index
        KrillIndex ki = new KrillIndex();
        ki.addDoc(getClass().getResourceAsStream("/goe/AGX-00002.json"),
                false);
        ki.commit();

        JsonNode res = toJson(
                new Krill(loadQuery("/queries/metas/fields_no.jsonld"))
                        .apply(ki));

        assertEquals(0, res.at("/matches/0/UID").asInt());
        assertEquals("GOE_AGX.00002",
                res.at("/matches/0/textSigle").asText());
        assertEquals("Maximen und Reflexionen",
                res.at("/matches/0/title").asText());
        assertEquals("1982", res.at("/matches/0/pubDate").asText());
        assertEquals("Goethe, Johann Wolfgang von",
                res.at("/matches/0/author").asText());
        assertEquals("GOE_AGX", res.at("/matches/0/docSigle").asText());
        assertEquals("GOE", res.at("/matches/0/corpusSigle").asText());
        assertEquals("Religion und Christentum",
                res.at("/matches/0/subTitle").asText());
        assertEquals("München", res.at("/matches/0/pubPlace").asText());
        assertEquals(
                "base/s=spans cnx/c=spans cnx/l=tokens cnx/m=tokens cnx/p=tokens cnx/s=spans cnx/syn=tokens corenlp/c=spans corenlp/ne=tokens corenlp/p=tokens corenlp/s=spans glemm/l=tokens mate/l=tokens mate/m=tokens mate/p=tokens opennlp/p=tokens opennlp/s=spans tt/l=tokens tt/p=tokens tt/s=spans xip/c=spans xip/l=tokens xip/p=tokens xip/s=spans",
                res.at("/matches/0/layerInfos").asText());
        assertTrue(res.at("/matches/0/textType").isMissingNode());
        assertEquals("match-GOE_AGX.00002-p7-8",
                res.at("/matches/0/matchID").asText());

        // All fields
        assertAllGoetheFields(toJson(
                new Krill(loadQuery("/queries/metas/fields_all.jsonld"))
                        .apply(ki)));

        // @All fields
        assertAllGoetheFields(toJson(
                new Krill(loadQuery("/queries/metas/fields_at_all.jsonld"))
                        .apply(ki)));
    }


    /**
     * Checks that collection queries on metadata fields select the
     * expected number of documents.
     */
    @Test
    public void searchCollectionFields () throws IOException {
        KrillIndex ki = new KrillIndex();

        FieldDocument fd = new FieldDocument();
        fd.addString("corpusSigle", "ABC");
        fd.addString("docSigle", "ABC-123");
        fd.addString("textSigle", "ABC-123-0001");
        fd.addText("title", "Die Wahlverwandschaften");
        fd.addText("author", "Johann Wolfgang von Goethe");
        fd.addKeyword("textClass", "reisen wissenschaft");
        fd.addInt("pubDate", 20130617);
        fd.addTV("tokens", "abc", "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]"
                + "[(1-2)s:b|i:b|_1#1-2]" + "[(2-3)s:c|i:c|_2#2-3]");
        ki.addDoc(fd);

        FieldDocument fd2 = new FieldDocument();
        fd2.addString("corpusSigle", "ABC");
        fd2.addString("docSigle", "ABC-125");
        fd2.addString("textSigle", "ABC-125-0001");
        fd2.addText("title", "Die Glocke");
        fd2.addText("author", "Schiller, Friedrich");
        fd2.addKeyword("textClass", "Reisen geschichte");
        fd2.addInt("pubDate", 20130203);
        fd2.addTV("tokens", "abc", "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]"
                + "[(1-2)s:b|i:b|_1#1-2]" + "[(2-3)s:c|i:c|_2#2-3]");
        ki.addDoc(fd2);
        ki.commit();

        // textClass = reisen & wissenschaft
        Krill ks = new Krill(loadQuery(
                "/queries/collections/collection_textClass.jsonld"));
        KrillCollection kc = ks.getCollection();
        kc.setIndex(ki);
        assertEquals(1, kc.getCount()); // 1 filter operation
        assertEquals(1, kc.numberOf("documents"));

        // textClass = reisen
        ks = new Krill(loadQuery(
                "/queries/collections/collection_textClass_2.jsonld"));
        kc = ks.getCollection();
        kc.setIndex(ki);
        assertEquals(1, kc.getCount()); // 1 filter operation
        assertEquals(2, kc.numberOf("documents"));

        // author = wolfgang
        ks = new Krill(loadQuery(
                "/queries/collections/collection_goethe.jsonld"));
        kc = ks.getCollection();
        kc.setIndex(ki);
        assertEquals(1, kc.getCount()); // 1 filter operation
        assertEquals(1, kc.numberOf("documents"));

        // author = Wolfgang
        ks = new Krill(loadQuery(
                "/queries/collections/collection_goethe_2.jsonld"));
        kc = ks.getCollection();
        kc.setIndex(ki);
        assertEquals(1, kc.getCount()); // 1 filter operation
        assertEquals(1, kc.numberOf("documents"));

        // The same query applied as a search yields a single result
        JsonNode res = toJson(ks.apply(ki));
        assertEquals(1, res.at("/meta/totalResults").asInt());
    }


    /**
     * Checks that the context definition of a query's meta section is
     * parsed as a span context.
     */
    @Test
    public void searchMetaContext () throws IOException {
        // The query is expected to define its context by the span "base/p"
        Krill ks = new Krill(
                loadQuery("/queries/metas/context_paragraph.jsonld"));
        assertTrue(ks.getMeta().getContext().isSpanDefined());
        assertEquals("base/p", ks.getMeta().getContext().getSpanContext());
    }


    /**
     * Reads a serialized query from the test resources.
     *
     * <p>The resource URL is converted through {@code URL.toURI()} rather
     * than {@code URL.getFile()}: the latter returns the path still
     * percent-encoded, which breaks when the checkout directory contains
     * spaces or other escaped characters.
     *
     * @param resourcePath absolute classpath location of the query file
     * @return the content returned by {@code getString} for that file
     * @throws IOException if reading the file fails
     */
    private String loadQuery (String resourcePath) throws IOException {
        java.net.URL url = getClass().getResource(resourcePath);
        // Fail with a readable message instead of a NullPointerException
        assertNotNull("Missing test resource: " + resourcePath, url);
        try {
            return getString(new File(url.toURI()).getPath());
        }
        catch (java.net.URISyntaxException e) {
            throw new IllegalStateException(
                    "Cannot resolve test resource: " + resourcePath, e);
        }
    }


    /**
     * Parses the JSON serialization of a search result.
     *
     * @param result the result to serialize and parse
     * @return the root node of the parsed response
     * @throws IOException if the serialization cannot be parsed
     */
    private static JsonNode toJson (Result result) throws IOException {
        return MAPPER.readTree(result.toJsonString());
    }


    /**
     * Asserts the metadata expected on the first match when all fields of
     * the Goethe test document are requested.
     *
     * @param res root node of the parsed response
     */
    private static void assertAllGoetheFields (JsonNode res) {
        assertEquals("Verlag C. H. Beck",
                res.at("/matches/0/publisher").asText());
        assertEquals("Aphorismus", res.at("/matches/0/textType").asText());
        assertEquals("Aphorismen",
                res.at("/matches/0/textTypeRef").asText());
        assertEquals(
                "Goethe, Johann Wolfgang von: Maximen und Reflexionen. Religion und Christentum, [Aphorismen], (Erstveröffentlichung: Stuttgart ; Tübingen, 1827-1842), In: Goethe, Johann Wolfgang von: Goethes Werke, Bd. 12, Schriften zur Kunst. Schriften zur Literatur. Maximen und Reflexionen, Hrsg.: Trunz, Erich. München: Verlag C. H. Beck, 1982, S. 372-377",
                res.at("/matches/0/reference").asText());
        assertEquals("de", res.at("/matches/0/language").asText());
        assertEquals("opennlp#tokens",
                res.at("/matches/0/tokenSource").asText());
        assertEquals(
                "base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/constituency corenlp/morpho corenlp/namedentities corenlp/sentences glemm glemm/morpho mate mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences",
                res.at("/matches/0/foundries").asText());
        assertEquals("Goethe-Korpus",
                res.at("/matches/0/corpusTitle").asText());
        assertEquals("QAO-NC", res.at("/matches/0/license").asText());
        assertEquals("Goethe: Maximen und Reflexionen, (1827-1842)",
                res.at("/matches/0/docTitle").asText());
        assertEquals("1827", res.at("/matches/0/creationDate").asText());
        assertEquals("372-377", res.at("/matches/0/pages").asText());
        assertEquals("match-GOE_AGX.00002-p7-8",
                res.at("/matches/0/matchID").asText());
    }
}