Working Virtual Collections | Feature Freeze
diff --git a/src/main/java/de/ids_mannheim/korap/KorapCollection.java b/src/main/java/de/ids_mannheim/korap/KorapCollection.java
new file mode 100644
index 0000000..a5e1a0d
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/KorapCollection.java
@@ -0,0 +1,134 @@
+package de.ids_mannheim.korap;
+
+import java.util.*;
+import java.io.IOException;
+import org.apache.lucene.search.QueryWrapperFilter;
+import org.apache.lucene.search.NumericRangeFilter;
+import org.apache.lucene.search.Filter;
+import de.ids_mannheim.korap.KorapIndex;
+import de.ids_mannheim.korap.KorapResult;
+import de.ids_mannheim.korap.KorapFilter;
+import de.ids_mannheim.korap.util.KorapDate;
+import de.ids_mannheim.korap.filter.BooleanFilter;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.FilteredQuery;
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.DocIdSet;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+// accepts as first parameter the index
+// THIS MAY CHANGE for stuff like combining virtual collections
+// See http://mail-archives.apache.org/mod_mbox/lucene-java-user/200805.mbox/%3C17080852.post@talk.nabble.com%3E
+
+
+/**
+ * A virtual collection: a list of filters restricting the documents of a
+ * KorapIndex that searches and statistics are applied to.
+ */
+public class KorapCollection {
+    private final KorapIndex index;
+    private String id;
+    private KorapDate created;
+    private final ArrayList<Filter> filter;
+    private int filterCount = 0;
+
+    private final static Logger log = LoggerFactory.getLogger(KorapCollection.class);
+
+    /** Creates a collection without any filters on top of the given index. */
+    public KorapCollection (KorapIndex ki) {
+        this.index = ki;
+        this.filter = new ArrayList<Filter>(5);
+    };
+
+    /** Returns the number of filters added by {@link #filter(BooleanFilter)}. */
+    public int getCount() {
+        return this.filterCount;
+    };
+
+    /** Adds a filter; {@link #bits} only keeps documents passing all filters. */
+    public void filter (BooleanFilter filter) {
+        this.filter.add(new QueryWrapperFilter(filter.toQuery()));
+        this.filterCount++;
+    };
+
+    /** Returns the internal, mutable list of wrapped filters (not a copy). */
+    public ArrayList<Filter> getFilters () {
+        return this.filter;
+    };
+
+    /** Searches this collection: start index 0, count 5, token context of 5. */
+    public KorapResult search (SpanQuery query) {
+        return this.index.search(this, query, 0, (short) 5, true, (short) 5, true, (short) 5);
+    };
+
+    /**
+     * Computes which documents of one index segment belong to this collection.
+     * @param atomic The segment to evaluate.
+     * @return The segment's live docs (may be null) without filters, else the docs passing all filters.
+     * @throws IOException if a filter fails to read the segment.
+     */
+    public Bits bits (AtomicReaderContext atomic) throws IOException {
+        Bits liveDocs = atomic.reader().getLiveDocs();
+        if (this.filter.isEmpty())
+            return liveDocs;
+
+        // Doc ids run up to maxDoc() - 1; numDocs() is too small after deletions
+        FixedBitSet fbitset = new FixedBitSet(atomic.reader().maxDoc());
+        boolean seeded = false;
+        for (Filter kc : this.filter) {
+            log.trace("FILTER: {}", kc);
+            // Passing the live docs as acceptDocs keeps deleted documents out
+            DocIdSet docids = kc.getDocIdSet(atomic, liveDocs);
+            DocIdSetIterator filterIter = (docids == null) ? null : docids.iterator();
+            // Null means "no document matches", so the intersection is empty
+            if (filterIter == null)
+                return new FixedBitSet(atomic.reader().maxDoc()).bits();
+            // The first filter seeds the empty set, all others narrow it down
+            if (seeded)
+                fbitset.and(filterIter);
+            else
+                fbitset.or(filterIter);
+            seeded = true;
+        };
+        return fbitset.bits();
+    };
+
+    /** Delegates to the index's numberOf() with this collection as document set. */
+    public long numberOf (String foundry, String type) {
+        return this.index.numberOf(this, foundry, type);
+    };
+};
+
+/*
+
+Spans spans = yourSpanQuery.getSpans(reader);
+BitSet bits = yourFilter.bits(reader);
+int filterDoc = bits.nextSetBit(0);
+while ((filterDoc >= 0) and spans.skipTo(filterDoc)) {
+ boolean more = true;
+ while (more and (spans.doc() == filterDoc)) {
+ // use spans.start() and spans.end() here
+ // ...
+ more = spans.next();
+ }
+ if (! more) {
+ break;
+ }
+ filterDoc = bits.nextSetBit(spans.doc());
+}
+
+Please check the javadocs of java.util.BitSet, there may
+be a 1 off error in the arguments to nextSetBit().
+
+At this point, no skipping on the spans should be done when filterDoc
+equals spans.doc(), so this code still needs some work.
+But I think you get the idea.
+
+*/
\ No newline at end of file
diff --git a/src/main/java/de/ids_mannheim/korap/KorapFilter.java b/src/main/java/de/ids_mannheim/korap/KorapFilter.java
index 68153a8..843f618 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapFilter.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapFilter.java
@@ -1,17 +1,10 @@
package de.ids_mannheim.korap;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TermQuery;
-
import de.ids_mannheim.korap.filter.BooleanFilter;
import de.ids_mannheim.korap.filter.RegexFilter;
-import de.ids_mannheim.korap.util.KorapDate;
-import org.apache.lucene.index.Term;
-
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.apache.lucene.search.NumericRangeQuery;
/*
@@ -46,127 +39,72 @@
Suche XYZ in allen Documenten in den Foundries "Treetagger" und "MATE", die entweder den Texttyp "sports" oder den Texttyp "news" haben, bis höchsten 2009 publiziert wurden und deren Autor auf den regulären Ausdruck "Peter .+?" matcht.
+textClass
+ID
+title
+subTitle
+author
+corpusID
+pubDate
+pubPlace
+
*/
public class KorapFilter {
- private KorapFilter filter;
- private Query query;
+ private BooleanFilter filter;
// Logger
private final static Logger jlog = LoggerFactory.getLogger(KorapFilter.class);
- /**
- * Search for documents of a specific genre.
- * @param genre The name of the genre as a string
- */
- public BooleanFilter genre (String genre) {
- return new BooleanFilter("textClass", new TermQuery(
- new Term("textClass", genre)
- ));
- };
-
- /**
- * Search for documents of specific genres.
- * @param genre The name of the genres as a regular expression.
- */
- public BooleanFilter genre (RegexFilter genre) {
- return new BooleanFilter("textClass", genre.toQuery("textClass"));
- };
-
- /**
- * Search for a documents of specific genres.
- * @param genre The name of the genre as a string
- * @param genres The names of further genres as strings
- *
- * This method is EXPERIMENTAL and may change without warnings!
- */
- public BooleanFilter genre (String genre, String ... genres) {
- BooleanFilter bf = new BooleanFilter("textClass", new TermQuery(
- new Term("textClass", genre)
- ));
- bf = bf.or(genres);
+ /**
+  * Create a new filter that requires each of the given terms in a field.
+  * @param type The name of the field, e.g. "textClass".
+  * @param terms The terms that all have to occur in the field.
+  * @return A new BooleanFilter with one required clause per term.
+  */
+ public BooleanFilter and (String type, String ... terms) {
+ BooleanFilter bf = new BooleanFilter();
+ bf.and(type, terms);
return bf;
};
- public RegexFilter re (String value) {
- return new RegexFilter(value);
+ /**
+  * Create a new filter with the given terms of a field as alternatives.
+  * @param type The name of the field, e.g. "textClass".
+  * @param terms The terms, each added as an optional (SHOULD) clause.
+  * @return A new BooleanFilter with one optional clause per term.
+  */
+ public BooleanFilter or (String type, String ... terms) {
+ BooleanFilter bf = new BooleanFilter();
+ bf.or(type, terms);
+ return bf;
};
- public Query since (String date) {
- int since = new KorapDate(date).floor();
- if (since == 0 || since == KorapDate.BEGINNING)
- return (Query) null;
-
- return NumericRangeQuery.newIntRange("pubDate", since, KorapDate.END, true, true);
+ /**
+  * Create a new filter that requires a field to match a regular expression.
+  * @param type The name of the field, e.g. "author".
+  * @param re The regular expression, e.g. created by re().
+  * @return A new BooleanFilter with the expression as a required clause.
+  */
+ public BooleanFilter and (String type, RegexFilter re) {
+ BooleanFilter bf = new BooleanFilter();
+ bf.and(type, re);
+ return bf;
};
-
- public Query till (String date) {
- try {
- int till = new KorapDate(date).ceil();
- if (till == 0 || till == KorapDate.END)
- return (Query) null;
-
- return NumericRangeQuery.newIntRange("pubDate", KorapDate.BEGINNING, till, true, true);
- }
- catch (NumberFormatException e) {
- jlog.warn("Parameter of till(date) is invalid");
- };
- return (Query) null;
+ /**
+  * Create a new filter with a regular expression on a field as an alternative.
+  * @param type The name of the field, e.g. "author".
+  * @param re The regular expression, e.g. created by re().
+  * @return A new BooleanFilter with the expression as an optional (SHOULD) clause.
+  */
+ public BooleanFilter or (String type, RegexFilter re) {
+ BooleanFilter bf = new BooleanFilter();
+ bf.or(type, re);
+ return bf;
};
-
- public Query between (String beginStr, String endStr) {
- KorapDate beginDF = new KorapDate(beginStr);
-
- int begin = beginDF.floor();
-
- int end = new KorapDate(endStr).ceil();
-
- if (end == 0)
- return (Query) null;
-
- if (begin == KorapDate.BEGINNING && end == KorapDate.END)
- return (Query) null;
-
- if (begin == end) {
- return new TermQuery(new Term("pubDate", beginDF.toString()));
- };
-
- return NumericRangeQuery.newIntRange("pubDate", begin, end, true, true);
+ /**
+  * Create a new filter for documents with a pubDate of the given date or later.
+  * The work is delegated to BooleanFilter.since(), which adds no constraint
+  * at all if the date yields no usable lower bound.
+  * @param date The date as a string, as understood by KorapDate.
+  * @return A new BooleanFilter.
+  */
+ public BooleanFilter since (String date) {
+ BooleanFilter bf = new BooleanFilter();
+ bf.since(date);
+ return bf;
};
-
- public Query date (String date) {
- KorapDate dateDF = new KorapDate(date);
-
- if (dateDF.year() == 0)
- return (Query) null;
-
- if (dateDF.day() == 0 || dateDF.month() == 0) {
- int begin = dateDF.floor();
- int end = dateDF.ceil();
-
- if (end == 0 || (begin == KorapDate.BEGINNING && end == KorapDate.END))
- return (Query) null;
-
- return NumericRangeQuery.newIntRange("pubDate", begin, end, true, true);
- };
-
- return new TermQuery(new Term("pubDate", dateDF.toString()));
+ /**
+  * Create a new filter for documents with a pubDate of the given date or earlier.
+  * The work is delegated to BooleanFilter.till(), which adds no constraint
+  * at all if the date yields no usable upper bound.
+  * @param date The date as a string, as understood by KorapDate.
+  * @return A new BooleanFilter.
+  */
+ public BooleanFilter till (String date) {
+ BooleanFilter bf = new BooleanFilter();
+ bf.till(date);
+ return bf;
};
+ /**
+  * Create a new filter for documents with a pubDate matching the given date.
+  * The work is delegated to BooleanFilter.date(), which treats a date
+  * without a day or a month as a range.
+  * @param date The date as a string, as understood by KorapDate.
+  * @return A new BooleanFilter.
+  */
+ public BooleanFilter date (String date) {
+ BooleanFilter bf = new BooleanFilter();
+ bf.date(date);
+ return bf;
+ };
- /*
-textClass
-id
-title
-subtitle
-author
-corpus
-pubDate
-pubPlace
- */
+ /**
+  * Create a new filter for documents with a pubDate between two dates.
+  * The work is delegated to BooleanFilter.between().
+  * @param date1 The begin of the date range as a string.
+  * @param date2 The end of the date range as a string.
+  * @return A new BooleanFilter.
+  */
+ public BooleanFilter between (String date1, String date2) {
+ BooleanFilter bf = new BooleanFilter();
+ bf.between(date1, date2);
+ return bf;
+ };
-
+ /**
+  * Create a regular expression constraint that can be passed
+  * to the and() and or() methods accepting a RegexFilter.
+  * @param regex The regular expression as a string.
+  * @return A new RegexFilter wrapping the expression.
+  */
+ public RegexFilter re (String regex) {
+ return new RegexFilter(regex);
+ };
};
diff --git a/src/main/java/de/ids_mannheim/korap/KorapIndex.java b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
index 42349f6..b453e33 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
@@ -31,6 +31,8 @@
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.search.DocIdSetIterator;
import com.fasterxml.jackson.annotation.*;
import com.fasterxml.jackson.databind.ObjectMapper;
@@ -44,6 +46,7 @@
import de.ids_mannheim.korap.index.FieldDocument;
import de.ids_mannheim.korap.KorapResult;
import de.ids_mannheim.korap.KorapMatch;
+import de.ids_mannheim.korap.KorapCollection;
import de.ids_mannheim.korap.index.PositionsToOffset;
import de.ids_mannheim.korap.document.KorapPrimaryData;
@@ -54,6 +57,9 @@
/*
+TODO::: http://lucene.apache.org/core/3_0_3/api/core/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.html
+
+
Todo: Use FieldCache!
@@ -301,7 +307,7 @@
* @param foundry The foundry to search in.
* @param type The type of meta information, e.g. "documents" or "sentences".
*/
- public long numberOf (String foundry, String type) {
+ public long numberOf (KorapCollection collection, String foundry, String type) {
// Short cut for documents
if (type.equals("documents")) {
return this.reader().numDocs();
@@ -317,7 +323,7 @@
// Iterate over all atomic readers and collect occurrences
for (AtomicReaderContext atomic : this.reader().leaves()) {
occurrences += this.numberOfAtomic(
- atomic.reader().getLiveDocs(),
+ collection.bits(atomic),
atomic,
term
);
@@ -332,6 +338,11 @@
return occurrences;
};
+ /**
+  * Search for the number of occurrences of different types in a foundry
+  * without restricting the documents: the call is delegated with a
+  * newly created KorapCollection that has no filters.
+  * @param foundry The foundry to search in.
+  * @param type The type of meta information, e.g. "documents" or "sentences".
+  */
+ public long numberOf (String foundry, String type) throws IOException {
+ return this.numberOf(new KorapCollection(this), foundry, type);
+ };
+
+
/**
* Search for the number of occurrences of different types,
* e.g. "documents", "sentences" etc., in the base foundry.
@@ -341,7 +352,7 @@
* @see #numberOf(String, String)
*/
public long numberOf (String type) throws IOException {
- return this.numberOf("base", type);
+ // NOTE(review): the Javadoc of this method still refers to the "base"
+ // foundry, while the lookup now uses "tokens" - confirm and align.
+ return this.numberOf("tokens", type);
};
@@ -379,14 +390,14 @@
};
- // Deprecated
+ /**
+  * Count the documents by delegating to numberOf("documents").
+  * A deprecation warning is logged on every call.
+  * @deprecated Use {@code numberOf("documents")} instead.
+  */
+ @Deprecated
public long countDocuments () throws IOException {
log.warn("countDocuments() is DEPRECATED in favor of numberOf(\"documents\")!");
return this.numberOf("documents");
};
- // Deprecated
+ @Deprecated
public long countAllTokens () throws IOException {
log.warn("countAllTokens() is DEPRECATED in favor of numberOf(\"tokens\")!");
return this.numberOf("tokens");
@@ -422,6 +433,8 @@
leftTokenContext, leftContext, rightTokenContext, rightContext);
};
+ // This is just a fallback! Delete!
+ @Deprecated
public KorapResult search (Bits bitset,
SpanQuery query,
int startIndex,
@@ -430,6 +443,21 @@
short leftContext,
boolean rightTokenContext,
short rightContext) {
+ // TODO: This might leak as hell!!!
+ return this.search(new KorapCollection(this), query, startIndex, count, leftTokenContext, leftContext, rightTokenContext, rightContext);
+ };
+
+
+ // old: Bits bitset
+ public KorapResult search (KorapCollection collection,
+ SpanQuery query,
+ int startIndex,
+ short count,
+ boolean leftTokenContext,
+ short leftContext,
+ boolean rightTokenContext,
+ short rightContext) {
+
this.termContexts = new HashMap<Term, TermContext>();
String foundry = query.getField();
@@ -449,13 +477,15 @@
try {
for (AtomicReaderContext atomic : this.reader().leaves()) {
- if (bitset == null)
- bitset = atomic.reader().getLiveDocs();
+
+
+ // Use OpenBitSet;
+ Bits bitset = collection.bits(atomic);
PositionsToOffset pto = new PositionsToOffset(atomic, foundry);
// Spans spans = NearSpansOrdered();
- Spans spans = query.getSpans(atomic, bitset, termContexts);
+ Spans spans = query.getSpans(atomic, (Bits) bitset, termContexts);
IndexReader lreader = atomic.reader();
diff --git a/src/main/java/de/ids_mannheim/korap/KorapResult.java b/src/main/java/de/ids_mannheim/korap/KorapResult.java
index 501092d..0e5184f 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapResult.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapResult.java
@@ -45,7 +45,7 @@
// mapper.disable(SerializationFeature.FAIL_ON_EMPTY_BEANS);
mapper.disable(SerializationFeature.WRITE_NULL_MAP_VALUES);
- this.matches = new ArrayList<>();
+ this.matches = new ArrayList<>(itemsPerPage);
this.query = query;
this.startIndex = startIndex;
this.itemsPerPage = (itemsPerPage > 50 || itemsPerPage < 1) ? ITEMS_PER_PAGE : itemsPerPage;
diff --git a/src/main/java/de/ids_mannheim/korap/filter/BooleanFilter.java b/src/main/java/de/ids_mannheim/korap/filter/BooleanFilter.java
index 53ffd93..3e94790 100644
--- a/src/main/java/de/ids_mannheim/korap/filter/BooleanFilter.java
+++ b/src/main/java/de/ids_mannheim/korap/filter/BooleanFilter.java
@@ -8,66 +8,209 @@
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.index.Term;
+import org.apache.lucene.search.NumericRangeQuery;
+
+import de.ids_mannheim.korap.util.KorapDate;
+import de.ids_mannheim.korap.filter.RegexFilter;
+import de.ids_mannheim.korap.KorapFilter;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
/*
Todo: !not
*/
/**
- * @author Nil Diewald
+ * @author Nils Diewald
*
* BooleanFilter implements a simple API for boolean operations
* on constraints for KorapFilter.
*/
public class BooleanFilter {
private String type;
- private Query query;
- public BooleanFilter (String type, Query query) {
- this.type = type;
- this.query = query;
+ // Logger, registered under this class (not KorapFilter), so that
+ // messages and per-class log levels are attributed to BooleanFilter
+ private final static Logger jlog = LoggerFactory.getLogger(BooleanFilter.class);
+
+
+ // Boolean query collecting all constraints added to this filter
+ private BooleanQuery bool;
+
+ /**
+  * Create an empty filter without any constraints.
+  */
+ public BooleanFilter () {
+ bool = new BooleanQuery();
};
- public BooleanFilter or (String ... values) {
- BooleanQuery bool = new BooleanQuery();
- bool.add(this.query, BooleanClause.Occur.SHOULD);
- for (String val : values) {
- bool.add(new TermQuery(new Term(this.type, val)), BooleanClause.Occur.SHOULD);
+ /**
+  * Add terms of a field as alternatives to the filter.
+  * Every term is added as an optional (SHOULD) clause.
+  * @param type The name of the field the terms are looked up in.
+  * @param terms The terms to add.
+  * @return This filter, to allow for chaining.
+  */
+ public BooleanFilter or (String type, String ... terms) {
+ for (String term : terms) {
+ bool.add(
+ new TermQuery(new Term(type, term)),
+ BooleanClause.Occur.SHOULD
+ );
};
- this.query = bool;
return this;
};
- public BooleanFilter or (RegexFilter value) {
- BooleanQuery bool = new BooleanQuery();
- bool.add(this.query, BooleanClause.Occur.SHOULD);
- bool.add(value.toQuery(this.type), BooleanClause.Occur.SHOULD);
- this.query = bool;
+ /**
+  * Add a regular expression on a field as an alternative to the filter.
+  * @param type The name of the field the expression is applied to.
+  * @param value The regular expression, added as an optional (SHOULD) clause.
+  * @return This filter, to allow for chaining.
+  */
+ public BooleanFilter or (String type, RegexFilter value) {
+ bool.add(
+ value.toQuery(type),
+ BooleanClause.Occur.SHOULD
+ );
return this;
};
-
- public BooleanFilter and (String value) {
- BooleanQuery bool = new BooleanQuery();
- bool.add(this.query, BooleanClause.Occur.MUST);
- bool.add(new TermQuery(new Term(this.type, value)), BooleanClause.Occur.MUST);
- this.query = bool;
+ /**
+  * Add another filter as an alternative to the filter.
+  * @param bf The filter whose query is nested as an optional (SHOULD) clause.
+  * @return This filter, to allow for chaining.
+  */
+ public BooleanFilter or (BooleanFilter bf) {
+ bool.add(
+ bf.toQuery(),
+ BooleanClause.Occur.SHOULD
+ );
return this;
};
- public BooleanFilter and (RegexFilter value) {
- BooleanQuery bool = new BooleanQuery();
- bool.add(this.query, BooleanClause.Occur.MUST);
- bool.add(value.toQuery(this.type), BooleanClause.Occur.MUST);
- this.query = bool;
+ /**
+  * Add a numeric range as an alternative to the filter.
+  * @param nrq The range query, added as an optional (SHOULD) clause.
+  * @return This filter, to allow for chaining.
+  */
+ public BooleanFilter or (NumericRangeQuery<Integer> nrq) {
+ bool.add(nrq, BooleanClause.Occur.SHOULD);
return this;
};
+
+ /**
+  * Add terms of a field as required constraints to the filter.
+  * Every term is added as a required (MUST) clause.
+  * @param type The name of the field the terms are looked up in.
+  * @param terms The terms that all have to occur.
+  * @return This filter, to allow for chaining.
+  */
+ public BooleanFilter and (String type, String ... terms) {
+     for (int i = 0; i < terms.length; i++) {
+         TermQuery required = new TermQuery(new Term(type, terms[i]));
+         this.bool.add(required, BooleanClause.Occur.MUST);
+     };
+     return this;
+ };
+
+ /**
+  * Add a regular expression on a field as a required constraint to the filter.
+  * @param type The name of the field the expression is applied to.
+  * @param value The regular expression, added as a required (MUST) clause.
+  * @return This filter, to allow for chaining.
+  */
+ public BooleanFilter and (String type, RegexFilter value) {
+ bool.add(
+ value.toQuery(type),
+ BooleanClause.Occur.MUST
+ );
+ return this;
+ };
+
+ /**
+  * Add another filter as a required constraint to the filter.
+  * @param bf The filter whose query is nested as a required (MUST) clause.
+  * @return This filter, to allow for chaining.
+  */
+ public BooleanFilter and (BooleanFilter bf) {
+ bool.add(
+ bf.toQuery(),
+ BooleanClause.Occur.MUST
+ );
+ return this;
+ };
+
+ /**
+  * Restrict the filter to documents with a pubDate of the given date or later.
+  * The range runs from the floor() of the date to KorapDate.END, both
+  * inclusive. No constraint is added if the lower bound is 0 or
+  * KorapDate.BEGINNING.
+  * NOTE(review): unlike till(), a NumberFormatException raised while
+  * evaluating the date is not caught here - confirm this is intended.
+  * @param date The date as a string, as understood by KorapDate.
+  * @return This filter, to allow for chaining.
+  */
+ public BooleanFilter since (String date) {
+     final int lowerBound = new KorapDate(date).floor();
+
+     // Only a usable lower bound results in a range constraint
+     if (lowerBound != 0 && lowerBound != KorapDate.BEGINNING) {
+         NumericRangeQuery<Integer> range = NumericRangeQuery.newIntRange(
+             "pubDate", lowerBound, KorapDate.END, true, true
+         );
+         this.bool.add(range, BooleanClause.Occur.MUST);
+     };
+     return this;
+ };
+
+
+ /**
+  * Restrict the filter to documents with a pubDate of the given date or earlier.
+  * The range runs from KorapDate.BEGINNING to the ceil() of the date, both
+  * inclusive. No constraint is added if the upper bound is 0 or
+  * KorapDate.END. A NumberFormatException is logged as a warning and
+  * leaves the filter unchanged.
+  * @param date The date as a string, as understood by KorapDate.
+  * @return This filter, to allow for chaining.
+  */
+ public BooleanFilter till (String date) {
+     try {
+         final int upperBound = new KorapDate(date).ceil();
+
+         // Only a usable upper bound results in a range constraint
+         if (upperBound != 0 && upperBound != KorapDate.END) {
+             NumericRangeQuery<Integer> range = NumericRangeQuery.newIntRange(
+                 "pubDate", KorapDate.BEGINNING, upperBound, true, true
+             );
+             this.bool.add(range, BooleanClause.Occur.MUST);
+         };
+     }
+     catch (NumberFormatException e) {
+         jlog.warn("Parameter of till(date) is invalid");
+     };
+     return this;
+ };
+
+
+ /**
+  * Restrict the filter to documents with a pubDate between two dates.
+  * The range runs from the floor() of the begin date to the ceil() of the
+  * end date, both inclusive. No constraint is added if the upper bound is 0
+  * or the range spans from KorapDate.BEGINNING to KorapDate.END. If both
+  * bounds are equal, the begin date is required as an exact term instead.
+  * NOTE(review): unlike till(), a NumberFormatException raised while
+  * evaluating the dates is not caught here - confirm this is intended.
+  * @param beginStr The begin of the range as a string.
+  * @param endStr The end of the range as a string.
+  * @return This filter, to allow for chaining.
+  */
+ public BooleanFilter between (String beginStr, String endStr) {
+     final KorapDate beginDate = new KorapDate(beginStr);
+     final int lower = beginDate.floor();
+     final int upper = new KorapDate(endStr).ceil();
+
+     // Unusable or unbounded ranges do not result in a constraint
+     if (upper == 0 || (lower == KorapDate.BEGINNING && upper == KorapDate.END))
+         return this;
+
+     // A range collapsing to a single value is matched as a term
+     if (lower == upper)
+         return this.and("pubDate", beginDate.toString());
+
+     this.bool.add(
+         NumericRangeQuery.newIntRange("pubDate", lower, upper, true, true),
+         BooleanClause.Occur.MUST
+     );
+     return this;
+ };
+
+
+ /**
+  * Restrict the filter to documents with a pubDate matching the given date.
+  * A date with a year of 0 adds no constraint. A date with day and month
+  * is required as an exact term. A date lacking day or month is required
+  * as an inclusive range from its floor() to its ceil(), unless the upper
+  * bound is 0 or the range spans from KorapDate.BEGINNING to KorapDate.END.
+  * NOTE(review): unlike till(), a NumberFormatException raised while
+  * evaluating the date is not caught here - confirm this is intended.
+  * @param date The date as a string, as understood by KorapDate.
+  * @return This filter, to allow for chaining.
+  */
+ public BooleanFilter date (String date) {
+     final KorapDate parsed = new KorapDate(date);
+
+     if (parsed.year() == 0)
+         return this;
+
+     // A complete date is matched as an exact term
+     if (parsed.day() != 0 && parsed.month() != 0)
+         return this.and("pubDate", parsed.toString());
+
+     // A partial date is matched as a range
+     final int lower = parsed.floor();
+     final int upper = parsed.ceil();
+     final boolean unbounded = (lower == KorapDate.BEGINNING && upper == KorapDate.END);
+
+     if (upper != 0 && !unbounded) {
+         this.bool.add(
+             NumericRangeQuery.newIntRange("pubDate", lower, upper, true, true),
+             BooleanClause.Occur.MUST
+         );
+     };
+     return this;
+ };
+
+ /**
+  * Get the filter as a query.
+  * @return The internal boolean query holding all added constraints
+  *         (the same object, not a copy).
+  */
public Query toQuery () {
- return this.query;
+ return this.bool;
};
+ /**
+  * Get the string representation of the internal boolean query.
+  */
public String toString () {
- return this.query.toString();
+ return this.bool.toString();
};
};
diff --git a/src/main/resources/log4j.properties b/src/main/resources/log4j.properties
index 3a2747e..51e0403 100644
--- a/src/main/resources/log4j.properties
+++ b/src/main/resources/log4j.properties
@@ -3,9 +3,9 @@
log4j.rootLogger = DEBUG, stdout
#log4j.logger.de.ids_mannheim.korap.query.spans.ElementSpans = TRACE, stdout
-# log4j.logger.de.ids_mannheim.korap.query.spans.WithinSpans = TRACE, stdout
+#log4j.logger.de.ids_mannheim.korap.query.spans.WithinSpans = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.query.SpanNextQuery = TRACE, stdout
-# log4j.logger.de.ids_mannheim.korap.query.spans.NextSpans = TRACE, stdout
+#log4j.logger.de.ids_mannheim.korap.query.spans.NextSpans = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.query.spans.KorapTermSpan = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.query.spans.ClassSpans = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.query.spans.MatchSpans = TRACE, stdout