blob: 07a1598c5d4d89f6a718ceae0dffa3a259f2470a [file] [log] [blame]
package de.ids_mannheim.korap.collection;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.queries.TermsFilter;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.NumericRangeFilter;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.RegexpQuery;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import de.ids_mannheim.korap.KrillCollection;
import de.ids_mannheim.korap.index.TextPrependedTokenStream;
import de.ids_mannheim.korap.util.KrillDate;
import de.ids_mannheim.korap.util.KrillProperties;
import de.ids_mannheim.korap.util.QueryException;
import net.sf.ehcache.Cache;
import net.sf.ehcache.CacheManager;
import net.sf.ehcache.Element;
/*
* TODO: Optimize!
* - Remove identical object in Boolean groups
* - Flatten boolean groups
* - create "between" ranges for multiple date objects
*
* TODO:
* - Filters are deprecated, they should be ported to queries
*/
public class CollectionBuilder {
public final static CacheManager cacheManager = CacheManager.newInstance();
public final static Cache cache = cacheManager.getCache("named_vc");
// Logger
private final static Logger log = LoggerFactory
.getLogger(KrillCollection.class);
// This advices the java compiler to ignore all loggings
public static final boolean DEBUG = false;
public CollectionBuilder.Interface term (String field, String term) {
return new CollectionBuilder.Term(field, term);
};
public CollectionBuilder.Interface re (String field, String term) {
return new CollectionBuilder.Term(field, term, true);
};
public CollectionBuilder.Interface text (String field, String text) {
return new CollectionBuilder.Text(field, text);
};
public CollectionBuilder.Interface since (String field, String date) {
int since = new KrillDate(date).floor();
if (since == 0 || since == KrillDate.BEGINNING)
return null;
return new CollectionBuilder.Range(field, since, KrillDate.END);
};
public CollectionBuilder.Interface nothing () {
// Requires that a field with name "0---" does not exist
return new CollectionBuilder.Term("0---", "0");
};
public CollectionBuilder.Interface till (String field, String date) {
try {
int till = new KrillDate(date).ceil();
if (till == 0 || till == KrillDate.END)
return null;
return new CollectionBuilder.Range(field, KrillDate.BEGINNING,
till);
}
catch (NumberFormatException e) {
log.warn("Parameter of till(date) is invalid");
};
return null;
};
// This will be optimized away in future versions
public CollectionBuilder.Interface between (String field, String start,
String end) {
CollectionBuilder.Interface startObj = this.since(field, start);
if (startObj == null)
return null;
CollectionBuilder.Interface endObj = this.till(field, end);
if (endObj == null)
return null;
return this.andGroup().with(startObj).with(endObj);
};
public CollectionBuilder.Interface date (String field, String date) {
KrillDate dateDF = new KrillDate(date);
if (dateDF.year == 0)
return null;
if (dateDF.day == 0 || dateDF.month == 0) {
int begin = dateDF.floor();
int end = dateDF.ceil();
if (end == 0
|| (begin == KrillDate.BEGINNING && end == KrillDate.END))
return null;
return new CollectionBuilder.Range(field, begin, end);
};
return new CollectionBuilder.Range(field, dateDF.floor(),
dateDF.ceil());
};
public CollectionBuilder.Interface referTo (String reference) {
return new CollectionBuilder.Reference(reference);
};
public CollectionBuilder.Group andGroup () {
return new CollectionBuilder.Group(false);
};
public CollectionBuilder.Group orGroup () {
return new CollectionBuilder.Group(true);
};
public interface Interface {
public String toString ();
public Filter toFilter () throws QueryException;
public boolean isNegative ();
public CollectionBuilder.Interface not ();
};
public class Term implements CollectionBuilder.Interface {
private boolean isNegative = false;
private boolean regex = false;
private String field;
private String term;
public Term (String field, String term) {
this.field = field;
this.term = term;
};
public Term (String field, String term, boolean regex) {
this.field = field;
this.term = term;
this.regex = regex;
};
public Filter toFilter () {
// Regular expression
if (this.regex)
return new QueryWrapperFilter(
new RegexpQuery(new org.apache.lucene.index.Term(
this.field, this.term)));
// Simple term
return new TermsFilter(
new org.apache.lucene.index.Term(this.field, this.term));
};
public String toString () {
Filter filter = this.toFilter();
if (filter == null)
return "";
return filter.toString();
};
public boolean isNegative () {
return this.isNegative;
};
public CollectionBuilder.Interface not () {
this.isNegative = true;
return this;
};
};
public class Text implements CollectionBuilder.Interface {
private boolean isNegative = false;
// private boolean regex = false;
private String field;
private String text;
public Text (String field, String text) {
this.field = field;
this.text = text;
};
// TODO:
// Currently this treatment is language specific and
// does too much, I guess.
public Filter toFilter () {
PhraseQuery pq = new PhraseQuery();
int pos = 0;
try {
TextPrependedTokenStream tpts = new TextPrependedTokenStream(this.text);
tpts.doNotPrepend();
CharTermAttribute term;
tpts.reset();
while (tpts.incrementToken()) {
term = tpts.getAttribute(CharTermAttribute.class);
pq.add(new org.apache.lucene.index.Term(this.field, term.toString()), pos++);
};
tpts.close();
}
catch (IOException ie) {
System.err.println(ie);
return null;
};
return new QueryWrapperFilter(pq);
};
public String toString () {
Filter filter = this.toFilter();
if (filter == null)
return "";
return filter.toString();
};
public boolean isNegative () {
return this.isNegative;
};
public CollectionBuilder.Interface not () {
this.isNegative = true;
return this;
};
};
public class Reference implements CollectionBuilder.Interface {
private boolean isNegative = false;
private String reference;
private Map<Integer, DocBits> docIdMap =
new HashMap<Integer, DocBits>();
public Reference (String reference) {
this.reference = reference;
};
public Filter toFilter () throws QueryException {
Element element = KrillCollection.cache.get(this.reference);
if (element == null) {
if (DEBUG) {
log.debug(reference + " is NOT found in the cache");
}
KrillCollection kc = new KrillCollection();
kc.fromStore(this.reference);
if (kc.hasErrors()) {
throw new QueryException(
kc.getError(0).getCode(),
kc.getError(0).getMessage()
);
};
return new ToCacheVCFilter(
this.reference,
docIdMap,
kc.getBuilder(),
kc.toFilter()
);
}
else {
if (DEBUG) {
log.debug(reference + " is FOUND in the cache.");
}
CachedVCData cc = (CachedVCData) element.getObjectValue();
return new CachedVCFilter(this.reference, cc);
}
};
public String toString () {
return "referTo(" + this.reference + ")";
};
public boolean isNegative () {
return this.isNegative;
};
public CollectionBuilder.Interface not () {
this.isNegative = true;
return this;
};
private String loadVCFile (String ref) {
Properties prop = KrillProperties.loadDefaultProperties();
if (prop == null){
/*
this.addError(StatusCodes.MISSING_KRILL_PROPERTIES,
"krill.properties is not found.");
*/
return null;
}
String namedVCPath = prop.getProperty("krill.namedVC");
if (!namedVCPath.endsWith("/")){
namedVCPath += "/";
}
File file = new File(namedVCPath+ref+".jsonld");
String json = null;
try {
FileInputStream fis = new FileInputStream(file);
json = IOUtils.toString(fis);
}
catch (IOException e) {
/*
this.addError(StatusCodes.MISSING_COLLECTION,
"Collection is not found.");
*/
return null;
}
return json;
}
};
public class Group implements CollectionBuilder.Interface {
private boolean isOptional = false;
private boolean isNegative = false;
public boolean isNegative () {
return this.isNegative;
};
public void setNegative (boolean isNegative) {
this.isNegative = isNegative;
}
public boolean isOptional () {
return this.isOptional;
};
private ArrayList<CollectionBuilder.Interface> operands;
public Group (boolean optional) {
this.isOptional = optional;
this.operands = new ArrayList<CollectionBuilder.Interface>(3);
};
public Group with (CollectionBuilder.Interface cb) {
if (cb == null)
return this;
this.operands.add(cb);
return this;
};
public Group with (String field, String term) {
if (field == null || term == null)
return this;
return this.with(new CollectionBuilder.Term(field, term));
};
public Filter toFilter () throws QueryException {
if (this.operands == null || this.operands.isEmpty())
return null;
if (this.operands.size() == 1)
return this.operands.get(0).toFilter();
// BooleanFilter bool = new BooleanFilter();
BooleanGroupFilter bool = new BooleanGroupFilter(this.isOptional);
Iterator<CollectionBuilder.Interface> i = this.operands.iterator();
while (i.hasNext()) {
CollectionBuilder.Interface cb = i.next();
if (cb.isNegative()) {
bool.without(cb.toFilter());
}
else {
bool.with(cb.toFilter());
};
};
return bool;
};
public String toString () {
try {
Filter filter = this.toFilter();
if (filter == null)
return "";
return filter.toString();
}
catch (QueryException qe) {
log.warn(qe.getLocalizedMessage());
};
return "";
};
public CollectionBuilder.Interface not () {
this.isNegative = true;
return this;
};
};
public class Range implements CollectionBuilder.Interface {
private boolean isNegative = false;
private String field;
private int start, end;
public Range (String field, int start, int end) {
this.field = field;
this.start = start;
this.end = end;
};
public boolean isNegative () {
return this.isNegative;
};
public String toString () {
Filter filter = this.toFilter();
if (filter == null)
return "";
return filter.toString();
};
public Filter toFilter () {
return NumericRangeFilter.newIntRange(this.field, this.start,
this.end, true, true);
};
public CollectionBuilder.Interface not () {
this.isNegative = true;
return this;
};
};
/** Builder for virtual corpus / collection existing in the cache
*
* @author margaretha
*
*/
public class CachedVC implements CollectionBuilder.Interface {
private String cacheKey;
private CachedVCData cachedCollection;
private boolean isNegative = false;
public CachedVC (String vcRef, CachedVCData cc) {
this.cacheKey = vcRef;
this.cachedCollection = cc;
}
@Override
public Filter toFilter () {
return new CachedVCFilter(this.cacheKey, cachedCollection);
}
@Override
public boolean isNegative () {
return this.isNegative;
}
@Override
public CollectionBuilder.Interface not () {
this.isNegative = true;
return this;
}
}
/** Wraps a sub CollectionBuilder.Interface to allows VC caching
*
* @author margaretha
*
*/
public class ToCacheVC implements CollectionBuilder.Interface {
private CollectionBuilder.Interface child;
private String cacheKey;
private Map<Integer, DocBits> docIdMap;
public ToCacheVC (String vcRef, Interface cbi) {
this.child = cbi;
this.cacheKey = vcRef;
this.docIdMap = new HashMap<Integer, DocBits>();
}
@Override
public Filter toFilter () throws QueryException {
return new ToCacheVCFilter(cacheKey,docIdMap, child, child.toFilter());
}
@Override
public boolean isNegative () {
return child.isNegative();
}
@Override
public CollectionBuilder.Interface not () {
// not supported
return this;
}
}
// Maybe irrelevant
public Interface namedVC (String vcRef, CachedVCData cc) {
return new CollectionBuilder.CachedVC(vcRef, cc);
}
public Interface toCacheVC (String vcRef, Interface cbi) {
return new CollectionBuilder.ToCacheVC(vcRef, cbi);
}
};