Working Virtual Collections | Feature Freeze
diff --git a/src/test/java/de/ids_mannheim/korap/filter/TestKorapCollection.java b/src/test/java/de/ids_mannheim/korap/filter/TestKorapCollection.java
new file mode 100644
index 0000000..0419fdd
--- /dev/null
+++ b/src/test/java/de/ids_mannheim/korap/filter/TestKorapCollection.java
@@ -0,0 +1,63 @@
+import java.io.*;
+
+import de.ids_mannheim.korap.KorapIndex;
+import de.ids_mannheim.korap.index.FieldDocument;
+import de.ids_mannheim.korap.KorapCollection;
+import de.ids_mannheim.korap.KorapFilter;
+import de.ids_mannheim.korap.KorapResult;
+import de.ids_mannheim.korap.KorapQuery;
+import de.ids_mannheim.korap.filter.BooleanFilter;
+import org.apache.lucene.search.spans.SpanQuery;
+
+import static org.junit.Assert.*;
+import org.junit.Test;
+import org.junit.Ignore;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+@RunWith(JUnit4.class)
+public class TestKorapCollection {
+ // Smoke test: indexes the wiki test files, filters a virtual collection by textClass and runs a span query on it.
+ @Test
+ public void filterExample () throws IOException {
+
+ // Construct index
+ KorapIndex ki = new KorapIndex();
+ // Index the gzipped JSON test documents found under the "/wiki/" test resources
+ for (String i : new String[] {"00001", "00002", "00003", "00004", "00005", "00006", "02439"}) {
+ FieldDocument fd = ki.addDocFile(
+ getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
+ );
+ };
+ ki.commit();
+
+ KorapFilter kf = new KorapFilter();
+
+ // Create a virtual collection on top of the index:
+ KorapCollection kc = new KorapCollection(ki);
+
+ // The virtual collection consists of all documents that have the textClass "reisen" and "freizeit"
+ kc.filter( kf.and("textClass", "reisen").and("textClass", "freizeit") );
+
+ // Subset this to all documents that also have the textClass "kultur"
+ kc.filter( kf.and("textClass", "kultur") );
+
+ // Create a query
+ KorapQuery kq = new KorapQuery("tokens");
+ SpanQuery query = kq.seg("opennlp/p:NN").with("tt/p:NN").toQuery();
+
+ // Get some statistics (currently disabled; this can be improved):
+ /*
+ System.err.println("Tokens in this virtual collection: " + kc.numberOf("tokens", "t"));
+ System.err.println("Paragraphs in this virtual collection: " + kc.numberOf("tokens", "p"));
+ System.err.println("Sentences in this virtual collection: " + kc.numberOf("tokens", "s"));
+ */
+ // Run the query against the virtual collection; the result is not asserted on yet
+ KorapResult kr = kc.search(query);
+ // System.err.println(kr.toJSON());
+ };
+};
+
+
+
+// kc.filter( kf.and("textClass", "kultur").or("textClass", "wissenschaft") );
diff --git a/src/test/java/de/ids_mannheim/korap/filter/TestKorapFilter.java b/src/test/java/de/ids_mannheim/korap/filter/TestKorapFilter.java
index 8ccefa8..661e45b 100644
--- a/src/test/java/de/ids_mannheim/korap/filter/TestKorapFilter.java
+++ b/src/test/java/de/ids_mannheim/korap/filter/TestKorapFilter.java
@@ -21,10 +21,10 @@
KorapFilter kf = new KorapFilter();
- assertEquals("textClass:tree", kf.genre("tree").toString());
- assertEquals("+textClass:tree +textClass:sport", kf.genre("tree").and("sport").toString());
- assertEquals("(+textClass:tree +textClass:sport) textClass:news", kf.genre("tree").and("sport").or("news").toString());
- assertEquals("textClass:tree textClass:sport textClass:news", kf.genre("tree", "sport", "news").toString());
+ assertEquals("+textClass:tree", kf.and("textClass","tree").toString());
+ assertEquals("+textClass:tree +textClass:sport", kf.and("textClass","tree").and("textClass","sport").toString());
+ assertEquals("+textClass:tree +textClass:sport textClass:news", kf.and("textClass","tree").and("textClass","sport").or("textClass","news").toString());
+ assertEquals("+textClass:tree +textClass:sport +textClass:news", kf.and("textClass", "tree", "sport", "news").toString());
};
@Test
@@ -32,48 +32,53 @@
KorapFilter kf = new KorapFilter();
- assertEquals("pubDate:[20030604 TO 20030899]", kf.between("2003-06-04", "2003-08-99").toString());
- assertEquals("pubDate:[0 TO 20030604]", kf.till("2003-06-04").toString());
- assertEquals("pubDate:[20030604 TO 99999999]", kf.since("2003-06-04").toString());
- assertEquals("pubDate:20030604", kf.date("2003-06-04").toString());
+ assertEquals("+pubDate:[20030604 TO 20030899]", kf.between("2003-06-04", "2003-08-99").toString());
+ assertEquals("+pubDate:[0 TO 20030604]", kf.till("2003-06-04").toString());
+ assertEquals("+pubDate:[20030604 TO 99999999]", kf.since("2003-06-04").toString());
+ assertEquals("+pubDate:20030604", kf.date("2003-06-04").toString());
};
@Test
public void rangeLimited () throws IOException {
KorapFilter kf = new KorapFilter();
- assertEquals("pubDate:[20050000 TO 20099999]", kf.between("2005", "2009").toString());
- assertEquals("pubDate:[20051000 TO 20090899]", kf.between("200510", "200908").toString());
- assertEquals("pubDate:[20051000 TO 20090899]", kf.between("2005-10", "2009-08").toString());
- assertEquals("pubDate:[20051006 TO 20090803]", kf.between("2005-1006", "2009-0803").toString());
- assertEquals("pubDate:[20051006 TO 20090803]", kf.between("2005-10-06", "2009-08-03").toString());
- assertEquals("pubDate:[0 TO 20059999]", kf.till("2005").toString());
- assertEquals("pubDate:[0 TO 20051099]", kf.till("200510").toString());
- assertEquals("pubDate:[0 TO 20051099]", kf.till("2005-10").toString());
- assertEquals("pubDate:[0 TO 20051006]", kf.till("2005-1006").toString());
- assertEquals("pubDate:[0 TO 20051006]", kf.till("2005-10-06").toString());
+ assertEquals("+pubDate:[20050000 TO 20099999]", kf.between("2005", "2009").toString());
+ assertEquals("+pubDate:[20051000 TO 20090899]", kf.between("200510", "200908").toString());
+ assertEquals("+pubDate:[20051000 TO 20090899]", kf.between("2005-10", "2009-08").toString());
+ assertEquals("+pubDate:[20051006 TO 20090803]", kf.between("2005-1006", "2009-0803").toString());
+ assertEquals("+pubDate:[20051006 TO 20090803]", kf.between("2005-10-06", "2009-08-03").toString());
- assertEquals("pubDate:[20050000 TO 99999999]", kf.since("2005").toString());
- assertEquals("pubDate:[20051000 TO 99999999]", kf.since("200510").toString());
- assertEquals("pubDate:[20051000 TO 99999999]", kf.since("2005-10").toString());
- assertEquals("pubDate:[20051006 TO 99999999]", kf.since("2005-1006").toString());
- assertEquals("pubDate:[20051006 TO 99999999]", kf.since("2005-10-06").toString());
+ assertEquals("+pubDate:[0 TO 20059999]", kf.till("2005").toString());
+ assertEquals("+pubDate:[0 TO 20051099]", kf.till("200510").toString());
+ assertEquals("+pubDate:[0 TO 20051099]", kf.till("2005-10").toString());
+ assertEquals("+pubDate:[0 TO 20051006]", kf.till("2005-1006").toString());
+ assertEquals("+pubDate:[0 TO 20051006]", kf.till("2005-10-06").toString());
- assertEquals("pubDate:[20050000 TO 20059999]", kf.date("2005").toString());
- assertEquals("pubDate:[20051000 TO 20051099]", kf.date("200510").toString());
- assertEquals("pubDate:[20051000 TO 20051099]", kf.date("2005-10").toString());
- assertEquals("pubDate:20051006", kf.date("2005-1006").toString());
- assertEquals("pubDate:20051006", kf.date("2005-10-06").toString());
+ assertEquals("+pubDate:[20050000 TO 99999999]", kf.since("2005").toString());
+ assertEquals("+pubDate:[20051000 TO 99999999]", kf.since("200510").toString());
+ assertEquals("+pubDate:[20051000 TO 99999999]", kf.since("2005-10").toString());
+ assertEquals("+pubDate:[20051006 TO 99999999]", kf.since("2005-1006").toString());
+ assertEquals("+pubDate:[20051006 TO 99999999]", kf.since("2005-10-06").toString());
+
+ assertEquals("+pubDate:[20050000 TO 20059999]", kf.date("2005").toString());
+ assertEquals("+pubDate:[20051000 TO 20051099]", kf.date("200510").toString());
+ assertEquals("+pubDate:[20051000 TO 20051099]", kf.date("2005-10").toString());
+ assertEquals("+pubDate:20051006", kf.date("2005-1006").toString());
+ assertEquals("+pubDate:20051006", kf.date("2005-10-06").toString());
};
@Test
public void rangeFailure () throws IOException {
KorapFilter kf = new KorapFilter();
- assertNull(kf.between("aaaa-bb-cc", "aaaabbcc"));
- assertNull(kf.till("aaaa-bb-cc"));
- assertNull(kf.since("aaaa-bb-cc"));
- assertNull(kf.date("aaaa-bb-cc"));
+ assertEquals("", kf.between("aaaa-bb-cc", "aaaabbcc").toString());
+ assertEquals("", kf.till("aaaa-bb-cc").toString());
+ assertEquals("", kf.since("aaaa-bb-cc").toString());
+ assertEquals("", kf.date("aaaa-bb-cc").toString());
};
+
+
+ // TODO: More extensive testing!
+
};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestClassIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestClassIndex.java
index ff3abaf..4aa9d5f 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestClassIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestClassIndex.java
@@ -189,8 +189,8 @@
assertEquals("StartPos (1)", 4, kr.match(1).startPos);
assertEquals("EndPos (1)", 6, kr.match(1).endPos);
- assertEquals("Document count", 1, ki.numberOf("documents"));
- assertEquals("Token count", 10, ki.numberOf("t"));
+ assertEquals("Document count", 1, ki.numberOf("base", "documents"));
+ assertEquals("Token count", 10, ki.numberOf("base", "t"));
sq = new SpanNextQuery(
@@ -211,8 +211,8 @@
assertEquals("StartPos (1)", 3, kr.match(1).startPos);
assertEquals("EndPos (1)", 6, kr.match(1).endPos);
- assertEquals(1, ki.numberOf("documents"));
- assertEquals(10, ki.numberOf("t"));
+ assertEquals(1, ki.numberOf("base", "documents"));
+ assertEquals(10, ki.numberOf("base", "t"));
};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestKorapIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestKorapIndex.java
index a4ba6a2..2200069 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestKorapIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestKorapIndex.java
@@ -49,8 +49,8 @@
/* Save documents */
ki.commit();
- assertEquals(2, ki.numberOf("documents"));
- assertEquals(7, ki.numberOf("sentences"));
+ assertEquals(2, ki.numberOf("base", "documents"));
+ assertEquals(7, ki.numberOf("base", "sentences"));
fd = new FieldDocument();
@@ -65,8 +65,8 @@
/* Save documents */
ki.commit();
- assertEquals(3, ki.numberOf("documents"));
- assertEquals(10, ki.numberOf("sentences"));
+ assertEquals(3, ki.numberOf("base", "documents"));
+ assertEquals(10, ki.numberOf("base", "sentences"));
// KorapQuery kq = new KorapQuery("text");
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIndex.java
index c081f7c..32e9a72 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIndex.java
@@ -154,8 +154,8 @@
assertEquals("EndPos (1)", 6, kr.match(1).endPos);
assertEquals("SnippetBrackets (1)", "abca[{1:b}{2:c}]abac", kr.match(1).snippetBrackets());
- assertEquals("Document count", 1, ki.numberOf("documents"));
- assertEquals("Token count", 10, ki.numberOf("t"));
+ assertEquals("Document count", 1, ki.numberOf("base", "documents"));
+ assertEquals("Token count", 10, ki.numberOf("base", "t"));
sq = new SpanMatchModifyQuery(
@@ -180,7 +180,7 @@
assertEquals("EndPos (1)", 6, kr.match(1).endPos);
assertEquals("SnippetBrackets (1)", "abca[bc]abac", kr.match(1).snippetBrackets());
- assertEquals(1, ki.numberOf("documents"));
- assertEquals(10, ki.numberOf("t"));
+ assertEquals(1, ki.numberOf("base", "documents"));
+ assertEquals(10, ki.numberOf("base", "t"));
};
};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
index a3860ea..9b9aae7 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestNextIndex.java
@@ -85,8 +85,8 @@
assertEquals("StartPos (1)", 4, kr.match(1).startPos);
assertEquals("EndPos (1)", 6, kr.match(1).endPos);
- assertEquals(1, ki.numberOf("documents"));
- assertEquals(10, ki.numberOf("t"));
+ assertEquals(1, ki.numberOf("base", "documents"));
+ assertEquals(10, ki.numberOf("base", "t"));
sq = new SpanNextQuery(
@@ -105,8 +105,8 @@
assertEquals("StartPos (1)", 3, kr.match(1).startPos);
assertEquals("EndPos (1)", 6, kr.match(1).endPos);
- assertEquals(1, ki.numberOf("documents"));
- assertEquals(10, ki.numberOf("t"));
+ assertEquals(1, ki.numberOf("base", "documents"));
+ assertEquals(10, ki.numberOf("base", "t"));
};
diff --git a/src/test/resources/wiki/readme.txt b/src/test/resources/wiki/readme.txt
new file mode 100644
index 0000000..32fc818
--- /dev/null
+++ b/src/test/resources/wiki/readme.txt
@@ -0,0 +1,7 @@
+00001: freizeit-unterhaltung,reisen,wissenschaft,populaerwissenschaft
+00002: freizeit-unterhaltung,reisen
+00003: kultur,musik
+00004: wissenschaft,populaerwissenschaft
+00005: freizeit-unterhaltung,reisen
+00006: freizeit-unterhaltung,reisen
+02439: kultur,musik,freizeit-unterhaltung,reisen