blob: ae09adcbc8f16931ad9ebe8d69b9d055692cb1d7 [file] [log] [blame]
package de.ids_mannheim.korap;
import java.util.*;
import java.io.IOException;
import de.ids_mannheim.korap.collection.CollectionBuilder;
import de.ids_mannheim.korap.response.Notifications;
import de.ids_mannheim.korap.util.QueryException;
import de.ids_mannheim.korap.response.Result;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.*;
import org.apache.lucene.index.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.BitDocIdSet;
import org.apache.lucene.util.Bits;
import org.apache.lucene.search.BitsFilteredDocIdSet;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.JsonNode;
import java.nio.ByteBuffer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Create a Virtual Collection of documents by means of a KoralQuery
* collection object.
*
* <blockquote><pre>
* KrillCollection kc = new KrillCollection(json);
* </pre></blockquote>
*
* @author diewald
*/
/*
* TODO: Make a cache for the bits
* Delete it in case of an extension or a filter
* TODO: Maybe use randomaccessfilterstrategy
* TODO: Maybe a constantScoreQuery can make things faster?
* See http://mail-archives.apache.org/mod_mbox/lucene-java-user/
* 200805.mbox/%3C17080852.post@talk.nabble.com%3E
*/
public final class KrillCollection extends Notifications {
// The index the virtual collection is evaluated against.
// May be null when the collection was not created with an index.
private KrillIndex index;
// The "collection" node last passed to fromJson(JsonNode), if any.
private JsonNode json;
// Factory for the building blocks (terms, dates, groups) of a collection.
private CollectionBuilder cb = new CollectionBuilder();
// The current collection constraint. While null, toFilter() returns null
// and isNegative() returns false.
private CollectionBuilder.Interface cbi;
// Four-byte array, sized to hold a single integer value.
private byte[] pl = new byte[4];
// Statically shared four-byte buffer.
private static ByteBuffer bb = ByteBuffer.allocate(4);
// Logger
private final static Logger log = LoggerFactory
.getLogger(KrillCollection.class);
// Compile-time switch: as long as this constant is false,
// none of the "if (DEBUG)" logging blocks is executed.
public static final boolean DEBUG = false;
/**
 * Create an empty KrillCollection that has neither an index
 * nor a collection constraint attached.
 */
public KrillCollection () {
    // Nothing to initialize
}
/**
 * Create a KrillCollection that is bound to a KrillIndex
 * but has no collection constraint yet.
 *
 * @param index
 *        The {@link KrillIndex} the collection refers to.
 */
public KrillCollection (KrillIndex index) {
    super();
    this.index = index;
}
/**
 * Construct a new KrillCollection by passing a KoralQuery.
 *
 * The "collection" member of the document is imported, if present.
 * The legacy "collections" member is rejected with error 899.
 * Problems are never thrown but recorded as error notifications:
 * unparsable, missing or empty input as error 621, invalid
 * collection objects with the code of the underlying
 * {@link QueryException}.
 *
 * @param jsonString
 *        The KoralQuery document as a JSON string.
 */
public KrillCollection (String jsonString) {

    // Nothing to parse - report it like malformed JSON
    // instead of failing with a NullPointerException
    if (jsonString == null) {
        this.addError(621, "Unable to parse JSON", "KrillCollection");
        return;
    }

    ObjectMapper mapper = new ObjectMapper();
    try {
        JsonNode json = mapper.readTree(jsonString);

        // Some Jackson versions return null for empty content
        if (json == null) {
            this.addError(621, "Unable to parse JSON", "KrillCollection");
        }
        else if (json.has("collection")) {
            this.fromJson(json.get("collection"));
        }
        else if (json.has("collections")) {
            this.addError(899,
                    "Collections are not supported anymore in favour of a single collection");
        }
    }
    // Query Exception
    catch (QueryException qe) {
        this.addError(qe.getErrorCode(), qe.getMessage());
    }
    // JSON exception
    catch (IOException e) {
        this.addError(621, "Unable to parse JSON", "KrillCollection",
                e.getLocalizedMessage());
    }
}
/**
 * Bind the virtual collection to a {@link KrillIndex}.
 * A previously set index is replaced.
 *
 * @param index
 *        The {@link KrillIndex} the virtual collection
 *        refers to.
 */
public void setIndex (KrillIndex index) {
    this.index = index;
}
/**
 * Import the "collection" part of a KoralQuery.
 *
 * Failures are recorded as error notifications instead of being
 * thrown: an invalid collection object keeps the error code of the
 * underlying {@link QueryException}, everything else (e.g.
 * malformed JSON) is reported as error 621.
 *
 * @param jsonString
 *        The "collection" part of a KoralQuery.
 * @return The {@link KrillCollection} object for chaining.
 * @throws QueryException
 *         Declared for compatibility with existing callers.
 */
public KrillCollection fromJson (String jsonString) throws QueryException {
    ObjectMapper mapper = new ObjectMapper();
    try {
        this.fromJson((JsonNode) mapper.readTree(jsonString));
    }
    // Keep the specific error code of an invalid collection object
    // instead of masking it as a JSON parsing problem
    catch (QueryException qe) {
        this.addError(qe.getErrorCode(), qe.getMessage());
    }
    // Anything else means the input could not be read as JSON
    catch (Exception e) {
        this.addError(621, "Unable to parse JSON", "KrillCollection");
    }
    return this;
}
/**
 * Import the "collection" part of a KoralQuery that is
 * already available as a JSON tree.
 *
 * The passed node is remembered (see {@link #toJsonNode})
 * before it is interpreted, so it stays set even if
 * the interpretation fails.
 *
 * @param json
 *        The "collection" part of a KoralQuery
 *        as a {@link JsonNode} object.
 * @return The {@link KrillCollection} object for chaining.
 * @throws QueryException
 *         If the collection object can't be interpreted.
 */
public KrillCollection fromJson (JsonNode json) throws QueryException {
    this.json = json;
    final CollectionBuilder.Interface parsed = this._fromJson(json);
    return this.fromBuilder(parsed);
}
/*
 * Translate a KoralQuery collection node into a builder object.
 *
 * "koral:doc" nodes become a single constraint depending on their
 * "type" (date, string or regex; default is string) and "match"
 * relation (default is "match:eq"). "koral:docGroup" nodes become
 * an and/or group whose operands are translated recursively.
 * Everything that can't be interpreted raises a QueryException.
 */
private CollectionBuilder.Interface _fromJson (JsonNode json)
        throws QueryException {

    if (!json.has("@type")) {
        throw new QueryException(701,
                "JSON-LD group has no @type attribute");
    }

    final String nodeType = json.get("@type").asText();

    // Single document constraint
    if (nodeType.equals("koral:doc")) {

        // Defaults apply whenever the attribute is not given
        final String key = json.has("key")
                ? json.get("key").asText() : "tokens";
        final String valueType = json.has("type")
                ? json.get("type").asText() : "type:string";
        final String relation = json.has("match")
                ? json.get("match").asText() : "match:eq";

        // Filter based on date
        if (valueType.equals("type:date")) {

            if (!json.has("value")) {
                throw new QueryException(820, "Dates require value fields");
            }

            final String date = json.get("value").asText();

            // TODO: This isn't stable yet
            if (relation.equals("match:eq")) {
                return this.cb.date(key, date);
            }
            if (relation.equals("match:ne")) {
                return this.cb.date(key, date).not();
            }
            if (relation.equals("match:geq")) {
                return this.cb.since(key, date);
            }
            if (relation.equals("match:leq")) {
                return this.cb.till(key, date);
            }

            throw new QueryException(841, "Match relation unknown for type");
        }

        // Filter based on string
        if (valueType.equals("type:string")) {

            // The value is only read once the relation is known
            if (relation.equals("match:eq")) {
                return this.cb.term(key, json.get("value").asText());
            }
            if (relation.equals("match:ne")) {
                return this.cb.term(key, json.get("value").asText()).not();
            }

            // This may change - but for now it means
            // the elements are lowercased
            if (relation.equals("match:contains")) {
                return this.cb.term(key,
                        json.get("value").asText().toLowerCase());
            }

            // "match:excludes" is the legacy name of "match:containsnot"
            if (relation.equals("match:containsnot")
                    || relation.equals("match:excludes")) {
                return this.cb.term(key,
                        json.get("value").asText().toLowerCase()).not();
            }

            throw new QueryException(841, "Match relation unknown for type");
        }

        // Filter based on regex
        if (valueType.equals("type:regex")) {
            switch (relation) {
                case "match:eq":
                case "match:contains":
                    return this.cb.re(key, json.get("value").asText());

                case "match:ne":
                case "match:excludes":
                    return this.cb.re(key, json.get("value").asText()).not();

                default:
                    throw new QueryException(841,
                            "Match relation unknown for type");
            }
        }

        throw new QueryException(843, "Document type is not supported");
    }

    // Nested group
    if (nodeType.equals("koral:docGroup")) {

        if (!json.has("operands") || !json.get("operands").isArray()) {
            throw new QueryException(842,
                    "Document group needs operand list");
        }

        final String operation = json.has("operation")
                ? json.get("operation").asText() : "operation:and";

        final CollectionBuilder.Group group;
        switch (operation) {
            case "operation:or":
                group = this.cb.orGroup();
                break;

            case "operation:and":
                group = this.cb.andGroup();
                break;

            default:
                throw new QueryException(810,
                        "Unknown document group operation");
        }

        // Translate all operands recursively
        for (JsonNode operand : json.get("operands")) {
            group.with(this._fromJson(operand));
        }

        return group;
    }

    // Unknown type
    throw new QueryException(813, "Collection type is not supported");
}
/**
 * Return the number of filters, which is always one.
 *
 * @return Always <code>1</code>.
 * @deprecated The returned value is constant.
 */
@Deprecated
public int getCount () {
    final int filterCount = 1;
    return filterCount;
}
/**
 * Use a {@link CollectionBuilder} object as the constraint
 * of the collection. A previously set constraint is replaced.
 *
 * @param cbi
 *        The builder object representing the constraint.
 * @return The {@link KrillCollection} object for chaining.
 */
public KrillCollection fromBuilder (CollectionBuilder.Interface cbi) {
    this.cbi = cbi;
    return this;
}
/**
 * Get the builder object that currently constrains the collection.
 *
 * @return The current constraint, or <code>null</code>
 *         if none was set.
 */
public CollectionBuilder.Interface getBuilder () {
    final CollectionBuilder.Interface current = this.cbi;
    return current;
}
/**
 * Get the {@link CollectionBuilder} this collection uses
 * to create new constraints.
 *
 * @return The {@link CollectionBuilder} of this collection.
 */
public CollectionBuilder build () {
    final CollectionBuilder builder = this.cb;
    return builder;
}
/**
 * Replace the current constraint by an and-group consisting of
 * the current constraint and the given one.
 *
 * @param filter
 *        The additional constraint.
 * @return The {@link KrillCollection} object for chaining.
 */
public KrillCollection filter (CollectionBuilder.Interface filter) {
    final CollectionBuilder.Group conjunction = this.cb.andGroup();
    return this.fromBuilder(conjunction.with(this.cbi).with(filter));
}
/**
 * Replace the current constraint by an or-group consisting of
 * the current constraint and the given one.
 *
 * @param extension
 *        The alternative constraint.
 * @return The {@link KrillCollection} object for chaining.
 */
public KrillCollection extend (CollectionBuilder.Interface extension) {
    final CollectionBuilder.Group disjunction = this.cb.orGroup();
    return this.fromBuilder(disjunction.with(this.cbi).with(extension));
}
/**
 * Restrict the collection to documents with one of the given
 * unique document identifiers. The identifiers are looked up
 * as terms in the field "UID".
 *
 * This filter is not part of the legacy API!
 *
 * @param uids
 *        The list of unique document identifiers.
 * @return The {@link KrillCollection} object for chaining.
 */
public KrillCollection filterUIDs (String ... uids) {
    final CollectionBuilder.Group anyUid = this.cb.orGroup();

    // One alternative per identifier
    for (int i = 0; i < uids.length; i++) {
        anyUid.with(this.cb.term("UID", uids[i]));
    }

    return this.filter(anyUid);
}
/**
 * Serialize the collection to a {@link Filter} object.
 *
 * @return The {@link Filter} created by the current constraint,
 *         or <code>null</code> if no constraint is set.
 */
public Filter toFilter () {
    return (this.cbi != null) ? this.cbi.toFilter() : null;
}
/**
 * Check if the collection should work inverted.
 *
 * @return The negativity reported by the current constraint,
 *         or <code>false</code> if no constraint is set.
 */
public boolean isNegative () {
    return this.cbi != null && this.cbi.isNegative();
}
/**
 * Generate a string representation of the virtual collection.
 * The representation is the serialized filter, prefixed
 * by "-" if the collection is negative, or the empty string
 * if there is no filter.
 *
 * <strong>Warning</strong>: This currently does not generate a
 * valid KoralQuery string, so this may change in a future version.
 *
 * @return A string representation of the virtual collection.
 */
@Override
public String toString () {
    final Filter filter = this.toFilter();

    if (filter == null) {
        return "";
    }

    final String sign = this.isNegative() ? "-" : "";
    return sign + filter.toString();
}
/**
 * Get the KoralQuery collection object the collection was
 * imported from as a {@link JsonNode}.
 * As the node is only set on import, this is limited to mirror
 * a deserialized KoralQuery object and won't reflect
 * collections created using a CollectionBuilder.
 *
 * @return The {@link JsonNode} representing the collection object
 *         of a deserialized KoralQuery object.
 */
public JsonNode toJsonNode () {
    final JsonNode imported = this.json;
    return imported;
}
/**
 * Create a bit vector representing the live documents of the
 * virtual collection to be used in searches.
 * This will respect deleted documents.
 *
 * If {@link #getDocIdSet} returns no set, the vector is empty
 * when a constraint is set, and completely set otherwise.
 *
 * @param atomic
 *        The {@link LeafReaderContext} to search in.
 * @return A bit vector representing the live documents of the
 *         virtual collection. Never <code>null</code>.
 * @throws IOException
 */
public FixedBitSet bits (LeafReaderContext atomic) throws IOException {
    LeafReader r = atomic.reader();

    // A fresh bit set has no bits set
    FixedBitSet bitset = new FixedBitSet(r.maxDoc());
    DocIdSet docids = this.getDocIdSet(atomic, (Bits) r.getLiveDocs());

    if (docids == null) {
        // Without a constraint all documents are part of the collection
        if (this.cbi == null) {
            bitset.set(0, bitset.length());
        }
    }
    else {
        // A DocIdSet may return no iterator if it matches nothing
        DocIdSetIterator iter = docids.iterator();
        if (iter != null) {
            bitset.or(iter);
        }
    }

    return bitset;
}
/**
 * Return the {@link DocIdSet} representing the documents of the
 * virtual collection to be used in searches.
 * The result is restricted to the accepted documents.
 *
 * @param atomic
 *        The {@link LeafReaderContext} to search in.
 * @param acceptDocs
 *        {@link Bits} vector of accepted documents.
 * @return The documents of the collection. May be <code>null</code>,
 *         if there is no filter and no vector of accepted
 *         documents, or if a non-negative filter has no matches.
 * @throws IOException
 */
public DocIdSet getDocIdSet (LeafReaderContext atomic, Bits acceptDocs)
        throws IOException {

    final int docLimit = atomic.reader().maxDoc();
    final FixedBitSet matching = new FixedBitSet(docLimit);

    final Filter constraint = (this.cbi == null) ? null : this.cbi.toFilter();

    // No filter given
    if (constraint == null) {
        if (acceptDocs == null) {
            return null;
        }
        matching.set(0, docLimit);
    }

    // Filter given
    else {
        final DocIdSet hits = constraint.getDocIdSet(atomic, null);

        DocIdSetIterator cursor = null;
        if (hits != null) {
            cursor = hits.iterator();
        }

        // Collect the matches of the filter
        if (cursor != null) {
            matching.or(cursor);
        }

        final boolean inverted = this.cbi.isNegative();

        if (cursor == null) {
            // A positive filter without matches matches nothing
            if (!inverted) {
                return null;
            }
            // A negative filter without matches matches everything
            matching.set(0, docLimit);
        }
        else if (inverted) {
            // Revert for negation
            matching.flip(0, docLimit);
        }
    }

    if (DEBUG) {
        log.debug("Bit set is {}", _bits(matching));
        log.debug("Livedocs is {}", _bits(acceptDocs));
    }

    // Remove deleted docs
    return BitsFilteredDocIdSet.wrap(new BitDocIdSet(matching), acceptDocs);
}
/**
 * Search for the number of occurrences of different types
 * in the virtual collection, based on the field "tokens".
 *
 * @param type
 *        The type of meta information as a string.
 * @return The number of the occurrences.
 * @throws IOException
 * @see #numberOf(String, String)
 */
public long numberOf (String type) throws IOException {
    final String defaultField = "tokens";
    return this.numberOf(defaultField, type);
}
/**
 * Search for the number of occurrences of different types,
 * e.g. <i>documents</i>, <i>sentences</i> etc. in the virtual
 * collection.
 *
 * For the types <i>documents</i> and <i>base/texts</i> the number
 * of documents in the collection is returned. For all other types
 * the integer payloads of the term <code>-:type</code> are summed
 * up over all documents of the collection.
 *
 * An {@link IOException} raised while iterating over the index
 * is logged and the occurrences counted so far are returned.
 *
 * @param field
 *        The field containing the textual data and the
 *        annotations as a string.
 * @param type
 *        The type of meta information,
 *        e.g. <i>documents</i> or <i>sentences</i> as a
 *        string.
 * @return The number of the occurrences, <code>-1</code> if no
 *         index is defined, or <code>0</code> if the index
 *         has no reader.
 * @throws IOException
 * @see KrillIndex#numberOf
 */
public long numberOf (String field, String type) throws IOException {

    // No index defined
    if (this.index == null)
        return (long) -1;

    // No reader (index is empty)
    if (this.index.reader() == null)
        return (long) 0;

    // This is redundant to index stuff
    if (type.equals("documents") || type.equals("base/texts")) {

        // Without a constraint all documents are in the collection
        if (this.cbi == null)
            return (long) this.index.reader().numDocs();

        return this.docCount();
    }

    // Create search term
    // This may be prefixed by foundries
    Term term = new Term(field, "-:" + type);

    if (DEBUG)
        log.debug("Iterate for {}/{}", field, type);

    long occurrences = 0;
    try {
        // Iterate over all atomic readers and collect occurrences
        for (LeafReaderContext atomic : this.index.reader().leaves()) {
            Bits bits = this.bits(atomic);

            if (DEBUG)
                log.debug("Final bits {}", _bits(bits));

            occurrences += this._numberOfAtomic(bits, atomic, term);

            if (DEBUG)
                log.debug("Added up to {} for {}/{}", occurrences, field,
                        type);
        }
    }
    // Something went wrong - keep the cause in the log
    catch (IOException e) {
        log.warn(e.getMessage(), e);
    }

    return occurrences;
}
/*
 * Search for meta information in term vectors.
 *
 * This sums up the numerical payloads of the term in all
 * documents of the atomic reader accepted by the passed vector.
 * Per document the payload at the first position of the term is
 * read and its first four bytes are interpreted as an integer.
 * Documents without a payload of at least four bytes are ignored.
 *
 * Returns 0 if the field or the term does not exist, or if no
 * positions are available for the term.
 */
private long _numberOfAtomic (Bits docvec, LeafReaderContext atomic,
        Term term) throws IOException {

    // This reimplements docsAndPositionsEnum with payloads
    final Terms terms = atomic.reader().fields().terms(term.field());

    // The field has no terms
    if (terms == null)
        return 0;

    // Todo: Maybe reuse a termsEnum!
    final TermsEnum termsEnum = terms.iterator(null);

    // Set the position in the iterator to the term that is seeked
    if (!termsEnum.seekExact(term.bytes()))
        return 0;

    // TODO: Reuse a DocsAndPositionsEnum!!
    // Start an iterator to fetch all payloads of the term
    final DocsAndPositionsEnum docs = termsEnum.docsAndPositions(docvec,
            null, DocsAndPositionsEnum.FLAG_PAYLOADS);

    // No iterator is available if positions were not indexed.
    // An exhausted iterator may even be an error, but we return 0
    if (docs == null || docs.docID() == DocsAndPositionsEnum.NO_MORE_DOCS)
        return 0;

    long occurrences = 0;

    while (docs.nextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS) {

        if (docs.freq() < 1)
            continue;

        // Initialize (go to first term)
        docs.nextPosition();

        final BytesRef payload = docs.getPayload();

        // An integer needs four bytes - never read beyond the payload
        if (payload == null || payload.length < 4)
            continue;

        // Read the payload as integer, respecting the offset of the
        // BytesRef. No shared buffer is involved in decoding.
        final int value = ByteBuffer.wrap(payload.bytes, payload.offset, 4)
                .getInt();

        occurrences += value;

        if (DEBUG)
            log.debug("Value for {} incremented by {} to {} in {}",
                    term, value, occurrences, docs.docID());
    }

    // Return the sum of all occurrences
    return occurrences;
}
/**
 * Return the number of documents in the virtual
 * collection.
 *
 * An {@link IOException} raised while iterating over the index
 * is logged and the documents counted so far are returned.
 *
 * @return The number of documents, or <code>0</code> if no index
 *         is defined or the index has no reader.
 * @see #numberOf
 */
public long docCount () {

    // No index defined or no reader (index is empty)
    if (this.index == null || this.index.reader() == null)
        return (long) 0;

    // TODO: Caching!
    long docCount = 0;
    try {
        for (LeafReaderContext atomic : this.index.reader().leaves()) {
            docCount += this.bits(atomic).cardinality();
        }
    }
    // Something went wrong - keep the cause in the log
    catch (IOException e) {
        log.warn(e.getLocalizedMessage(), e);
    }

    return docCount;
}
/*
 * Serialize a bit vector as a string of "1" and "0" characters
 * for debugging purposes, starting with the bit at index 0.
 * A missing vector is serialized as "null".
 */
private static String _bits (Bits bitset) {

    // There may be no vector at all, e.g. no accepted documents
    if (bitset == null)
        return "null";

    final int length = bitset.length();
    final StringBuilder str = new StringBuilder(length);

    for (int i = 0; i < length; i++) {
        str.append(bitset.get(i) ? '1' : '0');
    }

    return str.toString();
}
/*
@Deprecated
public HashMap getTermRelation (String field) throws Exception {
return this.getTermRelation(new KrillCollection(this), field);
};
*/
/*
* Analyze how terms relate
*/
/*
@Deprecated
public HashMap getTermRelation (KrillCollection kc, String field)
throws Exception {
HashMap<String, Long> map = new HashMap<>(100);
long docNumber = 0, checkNumber = 0;
try {
if (kc.getCount() <= 0) {
checkNumber = (long) this.reader().numDocs();
};
for (LeafReaderContext atomic : this.reader().leaves()) {
HashMap<String, FixedBitSet> termVector = new HashMap<>(20);
FixedBitSet docvec = kc.bits(atomic);
if (docvec != null) {
docNumber += docvec.cardinality();
};
Terms terms = atomic.reader().fields().terms(field);
if (terms == null) {
continue;
};
int docLength = atomic.reader().maxDoc();
FixedBitSet bitset = new FixedBitSet(docLength);
// Iterate over all tokens in this field
TermsEnum termsEnum = terms.iterator(null);
while (termsEnum.next() != null) {
String termString = termsEnum.term().utf8ToString();
bitset.clear(0, docLength);
// Get frequency
bitset.or((DocIdSetIterator) termsEnum.docs((Bits) docvec,
null));
long value = 0;
if (map.containsKey(termString))
value = map.get(termString);
map.put(termString, value + bitset.cardinality());
termVector.put(termString, bitset.clone());
};
int keySize = termVector.size();
String[] keys = termVector.keySet()
.toArray(new String[keySize]);
java.util.Arrays.sort(keys);
if (keySize > maxTermRelations) {
throw new Exception("termRelations are limited to "
+ maxTermRelations + " sets"
+ " (requested were at least " + keySize + " sets)");
};
for (int i = 0; i < keySize; i++) {
for (int j = i + 1; j < keySize; j++) {
FixedBitSet comby = termVector.get(keys[i]).clone();
comby.and(termVector.get(keys[j]));
StringBuilder sb = new StringBuilder();
sb.append("#__").append(keys[i]).append(":###:")
.append(keys[j]);
String combString = sb.toString();
long cap = (long) comby.cardinality();
if (map.containsKey(combString)) {
cap += map.get(combString);
};
map.put(combString, cap);
};
};
};
map.put("-docs", checkNumber != 0 ? checkNumber : docNumber);
}
catch (IOException e) {
log.warn(e.getMessage());
};
return map;
};
*/
};