Virtual Collections can now be extended
diff --git a/CHANGES b/CHANGES
index 6b7e832..53182d2 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,7 @@
+0.21 2013-11-28
+ - Virtual collections can now be defined,
+ searched, nested and extended.
+
0.20 2013-11-18
- Unboxing from sandbox.
diff --git a/pom.xml b/pom.xml
index e127b7b..f7b05f3 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
<groupId>KorAP-modules</groupId>
<artifactId>KorAP-lucene-index</artifactId>
- <version>0.20</version>
+ <version>0.21</version>
<packaging>jar</packaging>
<name>KorAP-lucene-index</name>
diff --git a/src/main/java/de/ids_mannheim/korap/KorapCollection.java b/src/main/java/de/ids_mannheim/korap/KorapCollection.java
index a28800d..97e61d6 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapCollection.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapCollection.java
@@ -10,6 +10,7 @@
import de.ids_mannheim.korap.KorapFilter;
import de.ids_mannheim.korap.util.KorapDate;
import de.ids_mannheim.korap.filter.BooleanFilter;
+import de.ids_mannheim.korap.filter.FilterOperation;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.FilteredQuery;
@@ -26,6 +27,8 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+// TODO: Cache the computed bits, and invalidate the cache whenever a filter or an extension is added.
+
// accepts as first parameter the index
// THIS MAY CHANGE for stuff like combining virtual collections
@@ -35,23 +38,7 @@
private KorapIndex index;
private String id;
private KorapDate created;
-
-
- // TODO:
- // Change this to support join operation
- /*
- private class CollectionOperation () {
- private boolean type;
- private Filter filter;
- private CollectionOperation (type, filter) {
- this.type = type;
- this.filter = filter;
- };
- };
- */
-
- private ArrayList<Filter> filter;
-
+ private ArrayList<FilterOperation> filter;
private int filterCount = 0;
// Logger
@@ -60,7 +47,7 @@
// user?
public KorapCollection (KorapIndex ki) {
this.index = ki;
- this.filter = new ArrayList<Filter>(5);
+ this.filter = new ArrayList<FilterOperation>(5);
};
public int getCount() {
@@ -68,11 +55,26 @@
};
public void filter (BooleanFilter filter) {
- this.filter.add(new QueryWrapperFilter(filter.toQuery()));
+ this.filter.add(
+ new FilterOperation(
+ (Filter) new QueryWrapperFilter(filter.toQuery()),
+ false
+ )
+ );
this.filterCount++;
};
- public ArrayList<Filter> getFilters () {
+ public void extend (BooleanFilter filter) {
+ this.filter.add(
+ new FilterOperation(
+ (Filter) new QueryWrapperFilter(filter.toQuery()),
+ true
+ )
+ );
+ this.filterCount++;
+ };
+
+ public ArrayList<FilterOperation> getFilters () {
return this.filter;
};
@@ -96,10 +98,10 @@
if (this.filterCount > 0) {
bitset = new FixedBitSet(atomic.reader().numDocs());
- ArrayList<Filter> filters = (ArrayList<Filter>) this.filter.clone();
+ ArrayList<FilterOperation> filters = (ArrayList<FilterOperation>) this.filter.clone();
// Init vector
- DocIdSet docids = filters.remove(0).getDocIdSet(atomic, null);
+ DocIdSet docids = filters.remove(0).filter.getDocIdSet(atomic, null);
DocIdSetIterator filterIter = docids.iterator();
if (filterIter != null) {
@@ -108,17 +110,27 @@
};
if (!noDoc) {
- for (Filter kc : filters) {
+ for (FilterOperation kc : filters) {
log.trace("FILTER: {}", kc);
- docids = kc.getDocIdSet(atomic, bitset);
+
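+ // BUG!!! (cause not recorded) NOTE(review): extensions pass null as second argument, filters pass the current bitset - verify.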
+ docids = kc.filter.getDocIdSet(atomic, kc.isExtension() ? null : bitset);
filterIter = docids.iterator();
+
if (filterIter == null) {
// There must be a better way ...
- bitset.clear(0, bitset.length());
- noDoc = true;
- break;
+ if (kc.isFilter()) {
+ bitset.clear(0, bitset.length());
+ noDoc = true;
+ };
+ continue;
};
- bitset.and(filterIter);
+ if (kc.isExtension()) {
+ bitset.or(filterIter);
+ }
+ else {
+ bitset.and(filterIter);
+ };
};
if (!noDoc) {
diff --git a/src/main/java/de/ids_mannheim/korap/filter/FilterOperation.java b/src/main/java/de/ids_mannheim/korap/filter/FilterOperation.java
new file mode 100644
index 0000000..20dde1c
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/filter/FilterOperation.java
@@ -0,0 +1,38 @@
+package de.ids_mannheim.korap.filter;
+import org.apache.lucene.search.Filter;
+
+/** A Lucene filter paired with a flag marking it as
+ *  an extension (true) or a plain filter (false). */
+public class FilterOperation {
+    private boolean extension;
+    /** The wrapped filter; callers read this field directly. */
+    public Filter filter;
+
+    /** Stores the given filter together with its extension flag. */
+    public FilterOperation (Filter filter, boolean extension) {
+        this.filter = filter;
+        this.extension = extension;
+    };
+
+    /** @return true if this operation was created as an extension. */
+    public boolean isExtension () {
+        return this.extension;
+    };
+
+    /** @return true if this operation is not an extension. */
+    public boolean isFilter () {
+        return !this.extension;
+    };
+
+    /** @return a new operation holding the same filter reference and flag. */
+    @Override
+    public Object clone () throws CloneNotSupportedException {
+        return new FilterOperation(this.filter, this.extension);
+    };
+
+    /** @return "extend with " or "filter with ", followed by the filter's string form. */
+    @Override
+    public String toString () {
+        return (this.extension ? "extend with " : "filter with ") + this.filter.toString();
+    };
+};
diff --git a/src/test/java/de/ids_mannheim/korap/filter/TestKorapCollection.java b/src/test/java/de/ids_mannheim/korap/filter/TestKorapCollection.java
index ed1101e..2e557b4 100644
--- a/src/test/java/de/ids_mannheim/korap/filter/TestKorapCollection.java
+++ b/src/test/java/de/ids_mannheim/korap/filter/TestKorapCollection.java
@@ -59,8 +59,15 @@
KorapResult kr = kc.search(query);
assertEquals(70, kr.totalResults());
+ kc.extend( kf.and("textClass", "uninteresting") );
+ assertEquals("Documents", 1, kc.numberOf("documents"));
- // System.err.println(kr.toJSON());
+ kc.extend( kf.and("textClass", "wissenschaft") );
+ assertEquals("Documents", 3, kc.numberOf("documents"));
+ assertEquals("Tokens", 1669, kc.numberOf("tokens"));
+ assertEquals("Sentences", 188, kc.numberOf("sentences"));
+ assertEquals("Paragraphs", 130, kc.numberOf("paragraphs"));
+ System.err.println(kr.toJSON());
};
};