// blob: 8341c1c42160ff78ac460978e22152e5af8a5c47 [file] [log] [blame]
package de.ids_mannheim.korap.collection;
import java.io.*;
import de.ids_mannheim.korap.KrillIndex;
import de.ids_mannheim.korap.index.FieldDocument;
import de.ids_mannheim.korap.KrillCollection;
import de.ids_mannheim.korap.response.Result;
import de.ids_mannheim.korap.KrillQuery;
import de.ids_mannheim.korap.query.QueryBuilder;
import de.ids_mannheim.korap.collection.BooleanFilter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.SpanQuery;
import static org.junit.Assert.*;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
/**
 * Tests for building virtual collections on top of a {@link KrillIndex}
 * using the legacy {@code filter}/{@code extend}/{@code filterUIDs} calls
 * of {@link KrillCollection}.
 *
 * Every test indexes the same set of gzipped JSON fixtures from the
 * {@code /wiki/} classpath folder and then checks the document, token,
 * sentence and paragraph counts reported after each collection change.
 */
@RunWith(JUnit4.class)
public class TestKrillCollectionLegacy {

    /** Identifiers of the wiki fixture documents indexed by every test. */
    private static final String[] WIKI_DOCS = { "00001", "00002", "00003",
            "00004", "00005", "00006", "02439" };


    /**
     * Opens the gzipped JSON fixture with the given identifier.
     *
     * @param id
     *            the file name stem below {@code /wiki/}, without the
     *            {@code .json.gz} suffix
     * @return the stream returned by the class loader for
     *         {@code /wiki/<id>.json.gz}
     */
    private InputStream wikiStream (String id) {
        return getClass().getResourceAsStream("/wiki/" + id + ".json.gz");
    }


    /**
     * Indexes all fixtures with a single commit, then applies a chain of
     * filters followed by two extensions to one virtual collection and
     * checks the counts after each step.
     */
    @Test
    public void filterExample () throws Exception {
        // Construct index: add all fixtures, commit once at the end
        KrillIndex ki = new KrillIndex();
        for (String id : WIKI_DOCS) {
            ki.addDoc(wikiStream(id), true);
        }
        ki.commit();

        CollectionBuilder kf = new CollectionBuilder();

        // Create virtual collection on the whole index
        KrillCollection kc = new KrillCollection(ki);
        assertEquals("Documents", 7, kc.numberOf("documents"));

        // Restrict to documents that have the textClass "reisen"
        // and the textClass "freizeit-unterhaltung"
        kc.filter(kf.and("textClass", "reisen").and("textClass",
                "freizeit-unterhaltung"));
        assertEquals("Documents", 5, kc.numberOf("documents"));
        assertEquals("Tokens", 1678, kc.numberOf("tokens"));
        assertEquals("Sentences", 194, kc.numberOf("sentences"));
        assertEquals("Paragraphs", 139, kc.numberOf("paragraphs"));

        // Subset this to all documents that also have the
        // textClass "kultur"
        kc.filter(kf.and("textClass", "kultur"));
        assertEquals("Documents", 1, kc.numberOf("documents"));
        assertEquals("Tokens", 405, kc.numberOf("tokens"));
        assertEquals("Sentences", 75, kc.numberOf("sentences"));
        assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));

        // Filtering on the corpus ID is expected to leave the counts as is
        kc.filter(kf.and("corpusID", "WPD"));
        assertEquals("Documents", 1, kc.numberOf("documents"));
        assertEquals("Tokens", 405, kc.numberOf("tokens"));
        assertEquals("Sentences", 75, kc.numberOf("sentences"));
        assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));

        // Create a query and run it against the filtered collection
        QueryBuilder kq = new QueryBuilder("tokens");
        SpanQuery query = kq.seg("opennlp/p:NN").with("tt/p:NN").toQuery();
        Result kr = kc.search(query);
        // JUnit expects (expected, actual)
        assertEquals(70, kr.getTotalResults());

        // Extending by a textClass with the expected count unchanged
        kc.extend(kf.and("textClass", "uninteresting"));
        assertEquals("Documents", 1, kc.numberOf("documents"));

        kc.extend(kf.and("textClass", "wissenschaft"));
        assertEquals("Documents", 3, kc.numberOf("documents"));
        assertEquals("Tokens", 1669, kc.numberOf("tokens"));
        assertEquals("Sentences", 188, kc.numberOf("sentences"));
        assertEquals("Paragraphs", 130, kc.numberOf("paragraphs"));
    }


    /**
     * Same scenario as {@link #filterExample()}, but the index is
     * committed after every single document (the original author's note:
     * "with multiple atomic indices"), and the corpus ID filter is
     * additionally applied first.
     */
    @Test
    public void filterExampleAtomic () throws Exception {
        // Construct index: commit after each added fixture
        KrillIndex ki = new KrillIndex();
        for (String id : WIKI_DOCS) {
            ki.addDoc(wikiStream(id), true);
            ki.commit();
        }

        CollectionBuilder kf = new CollectionBuilder();

        // Create virtual collection on the whole index
        KrillCollection kc = new KrillCollection(ki);
        assertEquals("Documents", 7, kc.numberOf("documents"));

        // If this is set - everything is fine automatically ...
        kc.filter(kf.and("corpusID", "WPD"));
        assertEquals("Documents", 7, kc.numberOf("documents"));

        // Restrict to documents that have the textClass "reisen"
        // and the textClass "freizeit-unterhaltung"
        kc.filter(kf.and("textClass", "reisen").and("textClass",
                "freizeit-unterhaltung"));
        assertEquals("Documents", 5, kc.numberOf("documents"));
        assertEquals("Tokens", 1678, kc.numberOf("tokens"));
        assertEquals("Sentences", 194, kc.numberOf("sentences"));
        assertEquals("Paragraphs", 139, kc.numberOf("paragraphs"));

        // Subset this to all documents that also have the
        // textClass "kultur"
        kc.filter(kf.and("textClass", "kultur"));
        assertEquals("Documents", 1, kc.numberOf("documents"));
        assertEquals("Tokens", 405, kc.numberOf("tokens"));
        assertEquals("Sentences", 75, kc.numberOf("sentences"));
        assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));

        // This is already filtered though ...
        kc.filter(kf.and("corpusID", "WPD"));
        assertEquals("Documents", 1, kc.numberOf("documents"));
        assertEquals("Tokens", 405, kc.numberOf("tokens"));
        assertEquals("Sentences", 75, kc.numberOf("sentences"));
        assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));

        // Create a query and run it against the filtered collection
        QueryBuilder kq = new QueryBuilder("tokens");
        SpanQuery query = kq.seg("opennlp/p:NN").with("tt/p:NN").toQuery();
        Result kr = kc.search(query);
        // JUnit expects (expected, actual)
        assertEquals(70, kr.getTotalResults());

        kc.extend(kf.and("textClass", "uninteresting"));
        assertEquals("Documents", 1, kc.numberOf("documents"));

        kc.extend(kf.and("textClass", "wissenschaft"));
        assertEquals("Documents", 3, kc.numberOf("documents"));
        assertEquals("Tokens", 1669, kc.numberOf("tokens"));
        assertEquals("Sentences", 188, kc.numberOf("sentences"));
        assertEquals("Paragraphs", 130, kc.numberOf("paragraphs"));
    }


    /**
     * Indexes the fixtures plus the additional "00012-fakemeta" document
     * in a second commit, searches within a textClass-filtered
     * collection, and then checks that filtering on the corpus ID "QQQ"
     * yields zero for all counts.
     */
    @Test
    public void filterExample2 () throws Exception {
        // Construct index: first commit holds the regular fixtures
        KrillIndex ki = new KrillIndex();
        for (String id : WIKI_DOCS) {
            ki.addDoc(wikiStream(id), true);
        }
        ki.commit();

        // Second commit holds one extra document
        ki.addDoc(wikiStream("00012-fakemeta"), true);
        ki.commit();

        CollectionBuilder kf = new CollectionBuilder();

        // Create virtual collection
        KrillCollection kc = new KrillCollection(ki);
        kc.filter(kf.and("textClass", "reisen").and("textClass",
                "freizeit-unterhaltung"));
        assertEquals("Documents", 5, kc.numberOf("documents"));
        assertEquals("Tokens", 1678, kc.numberOf("tokens"));
        assertEquals("Sentences", 194, kc.numberOf("sentences"));
        assertEquals("Paragraphs", 139, kc.numberOf("paragraphs"));

        // Create a query and run it against the filtered collection
        QueryBuilder kq = new QueryBuilder("tokens");
        SpanQuery query = kq.seg("opennlp/p:NN").with("tt/p:NN").toQuery();
        Result kr = kc.search(query);
        // JUnit expects (expected, actual)
        assertEquals(369, kr.getTotalResults());

        // The corpus ID "QQQ" is expected to match nothing
        kc.filter(kf.and("corpusID", "QQQ"));
        assertEquals("Documents", 0, kc.numberOf("documents"));
        assertEquals("Tokens", 0, kc.numberOf("tokens"));
        assertEquals("Sentences", 0, kc.numberOf("sentences"));
        assertEquals("Paragraphs", 0, kc.numberOf("paragraphs"));
    }


    /**
     * Indexes the fixtures with ascending UIDs starting at 1, then builds
     * a collection from the UIDs "2", "3" and "4" and compares its counts
     * and search hits against those of the whole index.
     */
    @Test
    public void uidCollection () throws IOException {
        // Construct index: each fixture gets the next UID
        KrillIndex ki = new KrillIndex();
        int uid = 1;
        for (String id : WIKI_DOCS) {
            ki.addDoc(uid++, wikiStream(id), true);
        }
        ki.commit();

        // Counts on the whole index
        assertEquals("Documents", 7, ki.numberOf("documents"));
        assertEquals("Paragraphs", 174, ki.numberOf("paragraphs"));
        assertEquals("Sentences", 281, ki.numberOf("sentences"));
        assertEquals("Tokens", 2661, ki.numberOf("tokens"));

        SpanQuery sq = new SpanTermQuery(new Term("tokens", "s:der"));
        Result kr = ki.search(sq, (short) 10);
        assertEquals(86, kr.getTotalResults());

        // Create virtual collection; the index is attached only after
        // the UID filter has been set
        KrillCollection kc = new KrillCollection();
        kc.filterUIDs(new String[] { "2", "3", "4" });
        kc.setIndex(ki);
        assertEquals("Documents", 3, kc.numberOf("documents"));
        assertEquals("Paragraphs", 46, kc.numberOf("paragraphs"));
        assertEquals("Sentences", 103, kc.numberOf("sentences"));
        assertEquals("Tokens", 1229, kc.numberOf("tokens"));

        kr = kc.search(sq);
        assertEquals(39L, kr.getTotalResults());
    }
}