blob: fabbff015cbcefae8f4cf165901231e3519bf6d2 [file] [log] [blame]
package de.ids_mannheim.korap.index;
import java.util.*;
import java.io.*;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Bits;
import static org.junit.Assert.*;
import org.junit.Test;
import org.junit.Ignore;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import com.fasterxml.jackson.annotation.*;
import com.fasterxml.jackson.databind.ObjectMapper;
import de.ids_mannheim.korap.KorapIndex;
import de.ids_mannheim.korap.KorapQuery;
import de.ids_mannheim.korap.KorapResult;
import de.ids_mannheim.korap.KorapSearch;
import de.ids_mannheim.korap.KorapMatch;
import de.ids_mannheim.korap.KorapDocument;
import de.ids_mannheim.korap.query.SpanNextQuery;
import de.ids_mannheim.korap.query.SpanMatchModifyClassQuery;
import de.ids_mannheim.korap.query.SpanClassQuery;
import de.ids_mannheim.korap.index.FieldDocument;
import de.ids_mannheim.korap.analysis.MultiTermTokenStream;
import de.ids_mannheim.korap.query.wrap.SpanQueryWrapperInterface;
import de.ids_mannheim.korap.util.QueryException;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.index.Term;
// mvn -Dtest=TestFieldDocument#indexExample1 test
/**
 * Tests for building {@link FieldDocument}s (programmatically, from a JSON
 * string and from gzipped JSON files) and for searching the resulting
 * {@link KorapIndex}.
 *
 * All assertions follow JUnit's {@code assertEquals(expected, actual)}
 * argument order so that failure messages report the values correctly.
 */
@RunWith(JUnit4.class)
public class TestFieldDocument {

    /** Names (without extension) of the sample documents loaded from {@code /wiki/} on the classpath. */
    private static final String[] WIKI_DOC_IDS = {
        "00001", "00002", "00003", "00004", "00005", "00006", "02439"
    };

    /**
     * Builds a document field by field and checks that every field can be
     * read back from the underlying document with the name and string value
     * it was added with.
     */
    @Test
    public void indexExample1 () throws IOException {
        FieldDocument fd = new FieldDocument();
        fd.addString("corpusID", "WPD");
        fd.addString("ID", "WPD-AAA-00001");
        fd.addText("textClass", "music entertainment");
        fd.addText("author", "Peter Frankenfeld");
        fd.addInt("pubDate", 20130617);
        fd.addText("title", "Wikipedia");
        fd.addText("subTitle", "Die freie Enzyklopädie");
        fd.addStored("layerInfo", "opennlp/p=pos");
        fd.addString("pubPlace", "Bochum");
        fd.addInt("lastModified", 20130717);
        fd.addTV("tokens",
                 "abc",
                 "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]" +
                 "[(1-2)s:b|i:b|_1#1-2]" +
                 "[(2-3)s:c|i:c|_2#2-3]");

        assertStringField(fd, "title", "Wikipedia");
        assertStringField(fd, "corpusID", "WPD");
        assertStringField(fd, "ID", "WPD-AAA-00001");
        assertStringField(fd, "subTitle", "Die freie Enzyklopädie");
        assertStringField(fd, "pubPlace", "Bochum");
        // The integer field is read back through its string representation.
        assertStringField(fd, "lastModified", "20130717");
        assertStringField(fd, "tokens", "abc");
        assertStringField(fd, "author", "Peter Frankenfeld");
        assertStringField(fd, "layerInfo", "opennlp/p=pos");
        assertStringField(fd, "textClass", "music entertainment");
    }

    /**
     * Adds a document given as a JSON string to a fresh index, checks the
     * metadata exposed by the returned {@link FieldDocument}, then searches
     * for the token {@code s:b} and checks the same metadata plus the
     * bracket snippet on the first match.
     */
    @Test
    public void indexExample2 () throws IOException {
        String json =
            "{" +
            " \"fields\" : [" +
            " { "+
            " \"primaryData\" : \"abc\"" +
            " }," +
            " {" +
            " \"name\" : \"tokens\"," +
            " \"data\" : [" +
            " [ \"s:a\", \"i:a\", \"_0#0-1\", \"-:t$<i>3\"]," +
            " [ \"s:b\", \"i:b\", \"_1#1-2\" ]," +
            " [ \"s:c\", \"i:c\", \"_2#2-3\" ]" +
            " ]" +
            " }" +
            " ]," +
            " \"corpusID\" : \"WPD\"," +
            " \"ID\" : \"WPD-AAA-00001\"," +
            " \"textClass\" : \"music entertainment\"," +
            " \"author\" : \"Peter Frankenfeld\"," +
            " \"pubDate\" : 20130617," +
            " \"title\" : \"Wikipedia\"," +
            " \"subTitle\" : \"Die freie Enzyklopädie\"," +
            " \"pubPlace\" : \"Bochum\"" +
            "}";

        KorapIndex ki = new KorapIndex();
        FieldDocument fd = ki.addDoc(json);
        ki.commit();

        // Metadata as exposed by the document returned from addDoc().
        assertEquals("abc", fd.getPrimaryData());
        assertEquals("WPD", fd.getCorpusID());
        assertEquals("WPD-AAA-00001", fd.getID());
        assertEquals("music entertainment", fd.getTextClass());
        assertEquals("Peter Frankenfeld", fd.getAuthor());
        assertEquals("Wikipedia", fd.getTitle());
        assertEquals("Die freie Enzyklopädie", fd.getSubTitle());
        assertEquals("Bochum", fd.getPubPlace());
        assertEquals("2013-06-17", fd.getPubDate().toDisplay());

        // The segment is wrapped with class number 3, which shows up as
        // "{3:...}" in the bracket snippet asserted below.
        KorapQuery kq = new KorapQuery("tokens");
        KorapResult kr = ki.search((SpanQuery) kq.seq(kq._(3, kq.seg("s:b"))).toQuery());
        KorapMatch km = kr.getMatch(0);

        // The same metadata must be available on the match.
        assertEquals("abc", km.getPrimaryData());
        assertEquals("WPD", km.getCorpusID());
        assertEquals("WPD-AAA-00001", km.getDocID());
        assertEquals("music entertainment", km.getTextClass());
        assertEquals("Peter Frankenfeld", km.getAuthor());
        assertEquals("Wikipedia", km.getTitle());
        assertEquals("Die freie Enzyklopädie", km.getSubTitle());
        assertEquals("Bochum", km.getPubPlace());
        assertEquals("2013-06-17", km.getPubDate().toDisplay());
        assertEquals("a[{3:b}]c", km.getSnippetBrackets());
        // System.err.println(kr.toJSON());
    }

    /**
     * Indexes the wiki sample files and runs a {@code within} query with a
     * character-based left context and a token-based right context, checking
     * the bracket snippet of the first match.
     *
     * The commented-out sections are disabled assertions kept from the
     * original test for reference.
     */
    @Test
    public void indexExample3 () throws IOException {
        KorapIndex ki = createWikiIndex();
        KorapQuery kq = new KorapQuery("tokens");

        /*
        ks = new KorapSearch(kq.tag("xip/c:NPA"));
        ks.setCount(1);
        ks.setCutOff(true);
        assertEquals("A bzw. [a] ist der erste Buchstabe des lateinischen ...", ks.run(ki).getMatch(0).getSnippetBrackets());
        */

        // within(<xip/const:NPA>, {1: {2: [cnx/p=A & mate/m=number:sg]}[opennlp/p=NN & tt/p=NN]})
        /*
        ks = new KorapSearch(kq.within(
            kq.tag("xip/c:NPA"),
            kq._(1,
                kq.seq(
                    kq._(2, kq.seg("cnx/p:A").with("mate/m:number:sg"))
                ).append(
                    kq.seg("opennlp/p:NN").with("tt/p:NN")
                )
            )
        ));
        */

        // Simplified variant of the query above: no nested class 2 and no
        // additional "with" constraints on the segments.
        KorapSearch ks = new KorapSearch(kq.within(
            kq.tag("xip/c:NPA"),
            kq._(1,
                kq.seq(
                    kq.seg("cnx/p:A")
                ).append(
                    kq.seg("opennlp/p:NN")
                )
            )
        ));
        ks.setCount(1);
        ks.setCutOff(true);
        ks.context.left.setCharacter(true).setLength(6);
        ks.context.right.setToken(true).setLength(6);

        // System.err.println(ks.run(ki).getMatch(0).toJSON());
        assertEquals("... e des [{1:lateinischen Alphabets}] und ein Vokal. Der Buchstabe A ...", ks.run(ki).getMatch(0).getSnippetBrackets());
        // assertEquals("... e des [{1:lateinischen {2:Alphabets}}] und ein Vokal. Der Buchstabe A ...", ks.run(ki).getMatch(0).getSnippetBrackets());

        /*
        kr = ki.search(query, 0, (short) 1, true, (short) 2, false, (short) 5);
        assertEquals("... Buchstabe des [{1:{2:lateinischen} Alphabets}] und ...", kr.match(0).getSnippetBrackets());
        SpanQuery query;
        kr = ki.search(query, 0, (short) 50, true, (short) 2, false, (short) 5);
        // System.err.println(kr.toJSON());
        // System.out.println(query.toString());
        // System.out.println(kr.match(37));
        assertEquals(38, kr.totalResults());
        assertEquals(50, kr.itemsPerPage());
        assertEquals("... Buchstabe des [{1:{2:lateinischen} Alphabets}] und ...", kr.match(0).getSnippetBrackets());
        assertEquals("... Texten eine [{1:{2:durchschnittliche} Häufigkeit}] von ...", kr.match(1).getSnippetBrackets());
        assertEquals("... damit der [{1:{2:sechsthäufigste} Buchstabe}] in d ...", kr.match(2).getSnippetBrackets());
        assertEquals("... A der [{1:{2:einzige} Buchstabe}] im D ...", kr.match(3).getSnippetBrackets());
        assertEquals("... für den [offenen vorderen {1:{2:ungerundeten} Vokal}] a: A ...", kr.match(4).getSnippetBrackets());
        query = kq.seg("tt/l:Norwegen").toQuery();
        kr = ki.search(query, 0, (short) 5, true, (short) 2, false, (short) 5);
        assertEquals(3, kr.totalResults());
        assertEquals("... Lofoten in [Norwegen], unt ...", kr.match(0).getSnippetBrackets());
        assertEquals("WPD_AAA.00002", kr.match(0).getDocID());
        assertEquals("... es in [Norwegen] noch ...", kr.match(1).getSnippetBrackets());
        assertEquals("WPD_AAA.00002", kr.match(1).getDocID());
        assertEquals("... Orte in [Norwegen]: Å i ...", kr.match(2).getSnippetBrackets());
        assertEquals("WPD_AAA.00005", kr.match(2).getDocID());
        */

        /*
        System.err.println(ki.getMatchInfo(kr.match(2).getID(), "tokens", "xip", "l", true, false).getSnippetHTML());
        */

        /*
        query = kq.seg("tt/l:Vokal").without("mate/m:number:sg").toQuery();
        kr = ki.search(query, 0, (short) 5, true, (short) 2, false, (short) 5);
        assertEquals(1, kr.totalResults());
        assertEquals("... reich an [Vokalen] war, ...", kr.match(0).getSnippetBrackets());
        assertNotNull(kr.toJSON());
        */

        /*
        System.err.println(ki.getMatchInfo(
            "match-WPD!WPD_AAA.00004-p200-206",
            "tokens",
            "xip",
            "c",
            true,
            false,
            true
        ).toJSON());
        */

        // ki.getMatch();
    }

    /**
     * Smoke test: indexes the wiki sample files and runs the query stored in
     * {@code /queries/bsp18.jsonld}. Only the search call itself is
     * exercised; the result content is not inspected.
     */
    @Test
    public void queryJSONBsp18 () throws IOException {
        KorapIndex ki = createWikiIndex();

        SpanQueryWrapperInterface sqwi = jsonQuery(getClass().getResource("/queries/bsp18.jsonld").getFile());
        KorapResult kr = ki.search(sqwi.toQuery(), 0, (short) 5, true, (short) 2, false, (short) 5);
        assertNotNull(kr);

        // Bug (flagged by the original author; presumably in serialising this result -- TODO confirm):
        // System.err.println(kr.toJSON());
    }

    /**
     * Reads a text file into a single string.
     *
     * The file is decoded as UTF-8. Lines are concatenated without their
     * line terminators, because {@link BufferedReader#readLine()} strips
     * them. If the file cannot be read, the running test is failed via
     * {@code fail(...)}.
     *
     * @param path file system path of the file to read
     * @return the concatenated lines of the file
     */
    public static String getString (String path) {
        StringBuilder contentBuilder = new StringBuilder();
        BufferedReader in = null;
        try {
            // Decode explicitly as UTF-8 instead of the platform default.
            in = new BufferedReader(
                new InputStreamReader(new FileInputStream(path), "UTF-8")
            );
            String line;
            while ((line = in.readLine()) != null) {
                contentBuilder.append(line);
            }
        }
        catch (IOException e) {
            fail("Unable to read " + path + ": " + e.getMessage());
        }
        finally {
            // Close the reader on every path, including a failed read.
            if (in != null) {
                try {
                    in.close();
                }
                catch (IOException ignored) {
                    // The content has already been read (or the test has
                    // already failed); a close error adds no information.
                }
            }
        }
        return contentBuilder.toString();
    }

    /**
     * Reads a JSON query file and turns it into a query wrapper for the
     * {@code tokens} field.
     *
     * A {@link QueryException} raised while parsing fails the running test.
     *
     * @param jsonFile file system path of the JSON query file
     * @return the query parsed from the file
     */
    public static SpanQueryWrapperInterface jsonQuery (String jsonFile) {
        String json = getString(jsonFile);
        try {
            return new KorapQuery("tokens").fromJSON(json);
        }
        catch (QueryException e) {
            fail(e.getMessage());
        }
        // Not reached at runtime because fail() always throws; the compiler
        // still requires a return value on this path.
        return new KorapQuery("tokens").seg("???");
    }

    /**
     * Asserts that the underlying document of {@code fd} holds a field
     * called {@code name} whose string value equals {@code expected}.
     */
    private static void assertStringField (FieldDocument fd, String name, String expected) {
        assertNotNull("Field '" + name + "' is missing", fd.doc.getField(name));
        assertEquals(name, fd.doc.getField(name).name());
        assertEquals("Value of field '" + name + "'", expected, fd.doc.getField(name).stringValue());
    }

    /**
     * Creates a fresh index, adds every wiki sample file listed in
     * {@link #WIKI_DOC_IDS} and commits.
     *
     * A sample file that is not on the classpath fails the test with a
     * message naming the resource.
     */
    private KorapIndex createWikiIndex () throws IOException {
        KorapIndex ki = new KorapIndex();
        for (String id : WIKI_DOC_IDS) {
            String resource = "/wiki/" + id + ".json.gz";
            java.net.URL url = getClass().getResource(resource);
            assertNotNull("Missing test resource " + resource, url);
            ki.addDocFile(url.getFile(), true);
        }
        ki.commit();
        return ki;
    }
}