blob: 34f4962485d219bee1dccab6e5abdba274300b42 [file] [log] [blame]
package de.ids_mannheim.korap;
import java.util.*;
import java.io.File;
import java.io.IOException;
// import java.net.URL;
import java.nio.ByteBuffer;
import java.util.zip.GZIPInputStream;
import java.io.FileInputStream;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.FixedBitSet;
// Automata
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import com.fasterxml.jackson.annotation.*;
import com.fasterxml.jackson.databind.ObjectMapper;
import de.ids_mannheim.korap.KorapResult;
import de.ids_mannheim.korap.KorapMatch;
import de.ids_mannheim.korap.KorapCollection;
import de.ids_mannheim.korap.KorapSearch;
import de.ids_mannheim.korap.index.FieldDocument;
import de.ids_mannheim.korap.index.PositionsToOffset;
import de.ids_mannheim.korap.index.TermInfo;
import de.ids_mannheim.korap.index.SpanInfo;
import de.ids_mannheim.korap.index.MatchIdentifier;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/*
Todo: Use FieldCache!
TODO: Reuse the indexreader everywhere - it should be threadsafe!
http://invertedindex.blogspot.co.il/2009/04/lucene-dociduid-mapping-and-payload.html
see korap/search.java -> retrieveTokens
Support a callback for interrupts (to stop the searching)!
Support multiple indices.
Support frequency search with regular expressions, so multiple bookkeeping:
c<:VVFIN:ging:gehen:past::
c>:VVFIN:gnig:neheg:past::
-> search for frequencies of VVFIN/gehen
-> c:VVFIN:[^:]*?:gehen:past:...
*/
/**
* KorapIndex implements a simple API for searching in and writing to a
* Lucene index and requesting information about the index's nature.
*
* @author ndiewald
*/
public class KorapIndex {
// Lucene directory this index reads from and writes to
private Directory directory;

// Temp: still public. Prefer the reader() accessor, which opens
// the reader on demand.
public IndexReader reader;

// Created lazily by the first addDoc() call
private IndexWriter writer;
private IndexWriterConfig config;

// Created lazily by searcher(), replaced by openReader()
private IndexSearcher searcher;

// True while this.reader refers to a reader opened by openReader()
private boolean readerOpen = false;

// Number of documents added since the last commit
private int commitCounter = 0;

// Number of added documents after which addDoc() commits automatically
private int autoCommit = 500; // Todo: Use configuration

// Term contexts handed to the span query of the most recent search
private HashMap<Term, TermContext> termContexts;

private ObjectMapper mapper = new ObjectMapper();

// Scratch buffers. They are static and therefore shared by all instances.
private static ByteBuffer bb = ByteBuffer.allocate(4);
private static ByteBuffer bbOffset = ByteBuffer.allocate(8);
private static ByteBuffer bbTerm = ByteBuffer.allocate(16);

// Scratch array for a 4 byte payload
private byte[] pl = new byte[4];

// Names of the stored fields that are loaded for matching documents
private Set<String> fieldsToLoad;

// Logger
private final static Logger log = LoggerFactory.getLogger(KorapIndex.class);
/**
 * Construct a new KorapIndex that is held in memory
 * (backed by a fresh RAMDirectory).
 *
 * @throws IOException if the index can't be set up.
 */
public KorapIndex () throws IOException {
    this(new RAMDirectory());
}
/**
 * Construct a new KorapIndex on a directory of the file system.
 *
 * @param index Path of the index directory.
 * @throws IOException if the directory can't be opened.
 */
public KorapIndex (String index) throws IOException {
    this(
        FSDirectory.open(
            new File(index)
        )
    );
}
/**
 * Construct a new KorapIndex on top of a Lucene directory.
 * This sets the list of stored fields to load for matches and
 * prepares the writer configuration. Neither a reader nor a
 * writer is opened here.
 *
 * @param directory The Lucene directory holding the index.
 * @throws IOException declared for callers; see the delegating constructors.
 */
public KorapIndex (Directory directory) throws IOException {
    this.directory = directory;

    // Stored fields that are loaded for matching documents
    final String[] storedFields = {
        "author", "ID", "title", "subTitle", "textClass", "pubPlace",
        "pubDate", "corpusID", "foundries", "layerInfo", "tokenization"
    };
    this.fieldsToLoad = new HashSet<String>(16);
    for (String name : storedFields) {
        this.fieldsToLoad.add(name);
    }

    // These fields are split on whitespace only,
    // every other field is analyzed by the standard analyzer
    final Map<String,Analyzer> whitespaceFields = new HashMap<String,Analyzer>();
    for (String name : new String[] { "textClass", "foundries" }) {
        whitespaceFields.put(name, new WhitespaceAnalyzer(Version.LUCENE_CURRENT));
    }

    // Base analyzer for searching and indexing
    final PerFieldAnalyzerWrapper baseAnalyzer = new PerFieldAnalyzerWrapper(
        new StandardAnalyzer(Version.LUCENE_CURRENT),
        whitespaceFields
    );

    // Create configuration with base analyzer
    this.config = new IndexWriterConfig(Version.LUCENE_CURRENT, baseAnalyzer);
}
/**
 * Close the index by closing the reader and the writer.
 * The writer is closed even if closing the reader fails,
 * so a failing reader can't leak an open writer.
 *
 * @throws IOException if closing the reader or the writer fails.
 */
public void close () throws IOException {
    try {
        this.closeReader();
    }
    finally {
        this.closeWriter();
    }
}
/**
 * Get the index reader. The reader is opened first,
 * in case it is not marked as open.
 *
 * @return The reader held by this index. As openReader() only logs
 *         failures, this is whatever the reader field contains afterwards.
 */
public IndexReader reader () {
    if (!this.readerOpen) {
        this.openReader();
    }
    return this.reader;
}
/**
 * Get the index searcher, creating it on first use.
 *
 * The reader is always requested first: commit() closes the reader,
 * and a cached searcher would otherwise keep pointing at that closed
 * reader. Requesting the reader reopens it when necessary, and
 * openReader() replaces an already existing searcher.
 *
 * @return A searcher on the current reader.
 */
public IndexSearcher searcher () {
    // May reopen the reader and thereby refresh this.searcher
    final IndexReader current = this.reader();

    if (this.searcher == null) {
        this.searcher = new IndexSearcher(current);
    }
    return this.searcher;
}
/**
 * Close the index writer, if one was created.
 *
 * @throws IOException if closing the writer fails.
 */
public void closeWriter () throws IOException {
    // No document was added yet - there is nothing to close
    if (this.writer == null) {
        return;
    }
    this.writer.close();
}
/**
 * Close the index reader, if it is marked as open,
 * and mark it as closed afterwards.
 *
 * @throws IOException if closing the reader fails.
 */
public void closeReader () throws IOException {
    if (!this.readerOpen) {
        return;
    }
    this.reader.close();
    this.readerOpen = false;
}
/**
 * Open a reader on the directory and mark it as open.
 * A searcher that was already created is replaced by a searcher
 * on the new reader. An IOException is logged as a warning and
 * not passed on to the caller.
 */
public void openReader () {
    try {
        this.reader = DirectoryReader.open(this.directory);
        this.readerOpen = true;

        // Only replace an existing searcher - don't create one eagerly
        final boolean hasSearcher = this.searcher != null;
        if (hasSearcher) {
            this.searcher = new IndexSearcher(this.reader);
        }
    }
    catch (IOException e) {
        log.warn(e.getLocalizedMessage());
    }
}
/**
 * Add a document to the index. The writer is created on first use.
 * As soon as more than autoCommit documents were added since the
 * last commit, the index is committed.
 *
 * @param fd The document to add.
 * @return The document that was passed in.
 * @throws IOException if writing or committing fails.
 */
public FieldDocument addDoc (FieldDocument fd) throws IOException {
    if (this.writer == null) {
        this.writer = new IndexWriter(this.directory, this.config);
    }

    // Add document to writer
    this.writer.addDocument(fd.doc);

    this.commitCounter += 1;
    if (this.commitCounter > this.autoCommit) {
        this.commit();
        this.commitCounter = 0;
    }

    return fd;
}
/**
 * Add a document to the index, given as a JSON string.
 *
 * @param json The JSON representation of the document.
 * @return The document created from the JSON string.
 * @throws IOException if the JSON can't be mapped or the document can't be added.
 */
public FieldDocument addDoc (String json) throws IOException {
    return this.addDoc(
        this.mapper.readValue(json, FieldDocument.class)
    );
}
/**
 * Add a document to the index, given as a file containing JSON.
 *
 * @param json The file holding the JSON representation of the document.
 * @return The document created from the file.
 * @throws IOException if the file can't be mapped or the document can't be added.
 */
public FieldDocument addDoc (File json) throws IOException {
    return this.addDoc(
        this.mapper.readValue(json, FieldDocument.class)
    );
}
/**
 * Add a document to the index by passing the given string on
 * to addDocFile(String, boolean) with gzip handling switched off.
 *
 * @param json The string handed to addDocFile(String, boolean).
 * @return The result of addDocFile(String, boolean).
 * @throws IOException declared for callers.
 */
public FieldDocument addDocFile(String json) throws IOException {
    final boolean gzip = false;
    return this.addDocFile(json, gzip);
}
/**
 * Add a document to the index, read from a JSON file.
 *
 * Both branches now treat the parameter as a file path. Previously
 * the non-gzip branch parsed the path itself as JSON text. The
 * streams are opened in a try-with-resources block so that they are
 * closed on every path, including a failing gzip header.
 *
 * @param json Path of the file holding the JSON document.
 * @param gzip True, if the file is gzip compressed.
 * @return The added document, or null if the file could not be
 *         read, parsed or indexed (the failure is logged).
 */
public FieldDocument addDocFile(String json, boolean gzip) {
    try {
        if (gzip) {
            try (FileInputStream raw = new FileInputStream(json);
                 GZIPInputStream unzipped = new GZIPInputStream(raw)) {
                FieldDocument fd = this.mapper.readValue(unzipped, FieldDocument.class);
                return this.addDoc(fd);
            }
        }

        // Read the file - do not parse the path as if it were JSON
        return this.addDoc(new File(json));
    }
    catch (IOException e) {
        log.error(
            "File {} could not be read or indexed: {}",
            json,
            e.getLocalizedMessage()
        );
    }
    return null;
}
/**
 * Commit all documents added since the last commit.
 * Nothing happens, if no writer exists or no document is pending.
 * After committing, the reader is closed, so the next request
 * for a reader opens a new one.
 *
 * @throws IOException if committing or closing the reader fails.
 */
public void commit () throws IOException {
    final boolean nothingPending = this.writer == null || this.commitCounter <= 0;
    if (nothingPending) {
        return;
    }

    this.writer.commit();
    this.commitCounter = 0;
    this.closeReader();
}
/**
 * Get the autoCommit value, i.e. the number of added documents
 * after which addDoc() commits automatically.
 *
 * @return The current autoCommit value.
 */
public int autoCommit () {
    final int current = this.autoCommit;
    return current;
}
/**
 * Set the autoCommit value, i.e. the number of added documents
 * after which addDoc() commits automatically.
 *
 * @param number The new autoCommit value.
 */
public void autoCommit (int number) {
    autoCommit = number;
}
/**
 * Sum up the meta information stored for a term in one atomic reader.
 *
 * For every document accepted by docvec, the first position of the
 * term is visited and the first four bytes of its payload are read
 * as an integer and added to the result.
 *
 * Compared to the former version, postings without positions and
 * positions without a payload of at least four bytes are skipped
 * instead of failing, and the payload is decoded in place instead
 * of being copied through a shared scratch array.
 *
 * @param docvec The documents to take into account.
 * @param atomic The atomic reader context to search in.
 * @param term   The term carrying the numbers as payloads.
 * @return The sum of all payload values, or 0 if nothing was found.
 * @throws IOException if reading the index fails.
 */
private long numberOfAtomic (Bits docvec,
                             AtomicReaderContext atomic,
                             Term term) throws IOException {

    // This reimplements docsAndPositionsEnum with payloads
    final Terms terms = atomic.reader().fields().terms(term.field());

    // The field has no terms in this atomic reader
    if (terms == null) {
        return 0;
    }

    // Todo: Maybe reuse a termsEnum!
    final TermsEnum termsEnum = terms.iterator(null);

    // Set the position in the iterator to the term that is sought
    if (!termsEnum.seekExact(term.bytes(), true)) {
        return 0;
    }

    // Start an iterator to fetch all payloads of the term
    final DocsAndPositionsEnum docs = termsEnum.docsAndPositions(
        docvec,
        null,
        DocsAndPositionsEnum.FLAG_PAYLOADS
    );

    // No positions available for this term
    if (docs == null) {
        return 0;
    }

    // Iterator is empty
    // TODO: Maybe this is an error ...
    if (docs.docID() == DocsAndPositionsEnum.NO_MORE_DOCS) {
        return 0;
    }

    long occurrences = 0;

    while (docs.nextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS) {

        // Initialize (go to first term)
        docs.nextPosition();

        final BytesRef payload = docs.getPayload();

        // Nothing to add without a complete 4 byte number
        if (payload == null || payload.length < 4) {
            continue;
        }

        // Read the payload as integer, respecting the offset of the BytesRef
        occurrences += ByteBuffer.wrap(payload.bytes, payload.offset, 4).getInt();
    }

    // Return the sum of all occurrences
    return occurrences;
}
/**
 * Search for the number of occurrences of different types,
 * e.g. "documents", "sentences" etc., in a collection.
 *
 * For "documents" the documents of the collection are counted
 * directly (or all documents of the index, if the collection
 * reports a count of 0 or less). For every other type the payload
 * numbers of the term "-:" + type are summed up over all atomic
 * readers. An IOException during that summation is logged and the
 * sum collected so far is returned.
 *
 * @param collection The collection restricting the documents.
 * @param field The field containing the textual data and the annotations.
 * @param type The type of meta information, e.g. "documents" or "sentences".
 * @return The number of occurrences.
 * @throws IOException declared for callers.
 */
public long numberOf (KorapCollection collection, String field, String type) throws IOException {

    // Short cut for documents
    if (type.equals("documents")) {

        // The collection reports no count - count all documents in the index
        if (collection.getCount() <= 0) {
            return (long) this.reader().numDocs();
        }

        long documents = 0;
        for (AtomicReaderContext leaf : this.reader().leaves()) {
            documents += collection.bits(leaf).cardinality();
        }
        return documents;
    }

    // Create search term
    final Term metaTerm = new Term(field, "-:" + type);

    long total = 0;
    try {
        // Iterate over all atomic readers and collect occurrences
        for (AtomicReaderContext leaf : this.reader().leaves()) {
            total += this.numberOfAtomic(collection.bits(leaf), leaf, metaTerm);
        }
    }

    // Something went wrong
    catch (IOException e) {
        log.warn(e.getLocalizedMessage());
    }

    return total;
}
/**
 * Search for the number of occurrences of different types,
 * e.g. "documents", "sentences" etc., using a newly created
 * collection on this index.
 *
 * @param field The field containing the textual data and the annotations.
 * @param type The type of meta information, e.g. "documents" or "sentences".
 * @return The number of occurrences.
 * @throws IOException declared for callers.
 */
public long numberOf (String field, String type) throws IOException {
    final KorapCollection collection = new KorapCollection(this);
    return this.numberOf(collection, field, type);
}
/**
 * Search for the number of occurrences of different types,
 * e.g. "documents", "sentences" etc., in the "tokens" field.
 *
 * @param type The type of meta information, e.g. "documents" or "sentences".
 * @return The number of occurrences.
 * @throws IOException declared for callers.
 *
 * @see #numberOf(String, String)
 */
public long numberOf (String type) throws IOException {
    final String field = "tokens";
    return this.numberOf(field, type);
}
/**
 * Search for the number of occurrences of different types,
 * e.g. "documents", "sentences" etc., in a specific set of documents.
 *
 * For "documents" the set bits of the vector are counted. This works
 * for any Bits implementation; formerly everything but an OpenBitSet
 * raised a ClassCastException. For every other type the payload
 * numbers of the term "-:" + type are summed up over all atomic
 * readers in a long, so large sums are no longer narrowed to an int.
 * An IOException during that summation is logged and the sum
 * collected so far is returned.
 *
 * NOTE(review): the same docvec is handed to every atomic reader -
 * confirm that callers pass a vector that fits all of them.
 *
 * @param docvec The document vector for filtering the search space.
 * @param field The field containing the textual data and the annotations.
 * @param type The type of meta information, e.g. "documents" or "sentences".
 * @return The number of occurrences.
 * @throws IOException declared for callers.
 *
 * @see #numberOf(String, String)
 */
public long numberOf (Bits docvec, String field, String type) throws IOException {

    // Shortcut for documents
    if (type.equals("documents")) {
        if (docvec instanceof OpenBitSet) {
            return ((OpenBitSet) docvec).cardinality();
        }
        if (docvec instanceof FixedBitSet) {
            return ((FixedBitSet) docvec).cardinality();
        }

        // Generic fallback for all other bit vectors
        long set = 0;
        for (int i = 0, length = docvec.length(); i < length; i++) {
            if (docvec.get(i)) {
                set++;
            }
        }
        return set;
    }

    Term term = new Term(field, "-:" + type);

    long occurrences = 0;
    try {
        for (AtomicReaderContext atomic : this.reader().leaves()) {
            occurrences += this.numberOfAtomic(docvec, atomic, term);
        }
    }
    catch (IOException e) {
        log.warn(e.getLocalizedMessage());
    }

    return occurrences;
}
/**
 * Count the documents in the index. Logs a deprecation warning
 * on every call.
 *
 * @return The number of documents.
 * @throws IOException declared for callers.
 * @deprecated Use numberOf("documents") instead.
 */
@Deprecated
public long countDocuments () throws IOException {
    log.warn("countDocuments() is DEPRECATED in favor of numberOf(\"documents\")!");
    final long documents = this.numberOf("documents");
    return documents;
}
/**
 * Count the tokens in the index. Logs a deprecation warning
 * on every call.
 *
 * @return The number of tokens.
 * @throws IOException declared for callers.
 * @deprecated Use numberOf("tokens") instead.
 */
@Deprecated
public long countAllTokens () throws IOException {
    log.warn("countAllTokens() is DEPRECATED in favor of numberOf(\"tokens\")!");
    final long tokens = this.numberOf("tokens");
    return tokens;
}
/**
 * Get a match by its identifier, searched in the "tokens" field,
 * without term information, but with highlights.
 *
 * @param id The identifier string of the match.
 * @return The match.
 */
public KorapMatch getMatch (String id) {
    final String noFoundry = null;
    final String noLayer = null;
    return this.getMatchInfo(
        id,
        "tokens",
        false,      // no term information
        noFoundry,
        noLayer,
        false,      // no spans
        true        // include highlights
    );
}
/**
 * Get a match by its identifier, including term information.
 *
 * @param id The identifier string of the match.
 * @param field The field containing the textual data and the annotations.
 * @param foundry The foundry to limit the information to (may be null).
 * @param layer The layer to limit the information to (may be null).
 * @param includeSpans True, if span annotations are of interest.
 * @param includeHighlights Passed on to the created match.
 * @return The match.
 */
public KorapMatch getMatchInfo (String id,
                                String field,
                                String foundry,
                                String layer,
                                boolean includeSpans,
                                boolean includeHighlights) {
    // Always request term information
    final boolean info = true;
    return this.getMatchInfo(
        id, field, info, foundry, layer, includeSpans, includeHighlights
    );
}
/**
 * Get a match.
 * BE AWARE - THIS IS STILL A PLAYGROUND!
 *
 * The document of the match is looked up by its ID and corpusID.
 * The match is populated with the stored fields of the document.
 * If info is requested, the term vector of the document is scanned
 * for terms inside the match positions, which are added to the
 * match as annotations or relations.
 *
 * KorapInfo is associated with a KorapMatch and has an array with
 * all informations per position in the match.
 *
 * An IOException is logged and set as error on the returned match.
 *
 * @param idString The identifier string of the match.
 * @param field The field containing the textual data and the annotations.
 * @param info True, if term information should be collected.
 * @param foundry The foundry to limit the information to (may be null).
 * @param layer The layer to limit the information to
 *        (may be null, only respected if a foundry is given).
 * @param includeSpans True, if span annotations are of interest.
 * @param includeHighlights Passed on to the created match.
 * @return The match.
 */
public KorapMatch getMatchInfo (String idString,
                                String field,
                                boolean info,
                                String foundry,
                                String layer,
                                boolean includeSpans,
                                boolean includeHighlights) {

    KorapMatch match = new KorapMatch(idString, includeHighlights);

    // Create a filter based on the corpusID and the docID
    BooleanQuery bool = new BooleanQuery();
    bool.add(new TermQuery(new Term("ID", match.getDocID())), BooleanClause.Occur.MUST);
    bool.add(new TermQuery(new Term("corpusID", match.getCorpusID())), BooleanClause.Occur.MUST);
    Filter filter = new QueryWrapperFilter(bool);

    CompiledAutomaton fst = null;

    if (info) {
        /* Create an automaton for prefixed terms of interest.
         * You can define the necessary foundry, the necessary layer,
         * in case the foundry is given, and if span annotations
         * are of interest.
         */
        StringBuilder regex = new StringBuilder();

        // Todo: Only support one direction!
        if (includeSpans)
            regex.append("((\">\"|\"<\"\">\"?)\":\")?");

        if (foundry != null) {
            regex.append(foundry).append('/');
            if (layer != null)
                regex.append(layer).append(":");
        }
        else if (includeSpans) {
            regex.append("([^-is]|[-is][^:])");
        }
        else {
            regex.append("([^-is<>]|([-is>][^:])|<[^:>])");
        }
        regex.append("(.){1,}|_[0-9]+");

        final String regexString = regex.toString();
        log.trace("The final regexString is {}", regexString);
        RegExp regexObj = new RegExp(regexString, RegExp.COMPLEMENT);
        fst = new CompiledAutomaton(regexObj.toAutomaton());
        log.trace("The final regexObj is {}", regexObj.toString());
    }

    try {

        // Iterate over all atomic indices and find the matching document
        for (AtomicReaderContext atomic : this.reader().leaves()) {

            // Retrieve the single document of interest
            DocIdSet filterSet = filter.getDocIdSet(
                atomic,
                atomic.reader().getLiveDocs()
            );

            // No document found
            if (filterSet == null)
                continue;

            DocIdSetIterator filterIterator = filterSet.iterator();

            // No document found
            if (filterIterator == null)
                continue;

            // Go to the matching doc - and remember its ID
            int localDocID = filterIterator.nextDoc();

            if (localDocID == DocIdSetIterator.NO_MORE_DOCS)
                continue;

            // We've found the correct document! Hurray!
            log.trace("We've found a matching document");

            HashSet<String> fieldsToLoadLocal = new HashSet<>(fieldsToLoad);
            fieldsToLoadLocal.add(field);

            // Get terms from the document
            Terms docTerms = atomic.reader().getTermVector(localDocID, field);

            // Load the necessary fields of the document
            Document doc = atomic.reader().document(localDocID, fieldsToLoadLocal);

            // Put some more information to the match
            PositionsToOffset pto = new PositionsToOffset(atomic, field);
            match.setPositionsToOffset(pto);
            match.setLocalDocID(localDocID);
            match.populateDocument(doc, field, fieldsToLoadLocal);

            log.trace("The document has the id '{}'", match.getDocID());

            if (!info) break;

            // Without a term vector there is no term information to add
            if (docTerms == null) {
                log.warn("No term vector found for field {}", field);
                break;
            }

            // Limit the terms to all the terms of interest
            TermsEnum termsEnum = docTerms.intersect(fst, null);

            DocsAndPositionsEnum docs = null;

            // List of terms to populate
            SpanInfo termList = new SpanInfo(pto, localDocID);

            // Iterate over all terms in the document
            while (termsEnum.next() != null) {

                // Get the positions and payloads of the term in the document
                // The bitvector may look different (don't know why)
                // and so the local ID may differ.
                // That's why the requesting bitset is null.
                docs = termsEnum.docsAndPositions(
                    null,
                    docs,
                    DocsAndPositionsEnum.FLAG_PAYLOADS
                );

                // The term has no positions
                if (docs == null)
                    continue;

                // Init document iterator
                docs.nextDoc();

                // Should never happen ... but hell.
                if (docs.docID() == DocIdSetIterator.NO_MORE_DOCS)
                    continue;

                // How often does this term occur in the document?
                int termOccurrences = docs.freq();

                // String representation of the term
                String termString = termsEnum.term().utf8ToString();

                // Iterate over all occurrences
                for (int i = 0; i < termOccurrences; i++) {

                    // Init positions and get the current
                    int pos = docs.nextPosition();

                    // Only positions inside the match are of interest
                    if (pos < match.getStartPos() || pos >= match.getEndPos())
                        continue;

                    // Fetch the payload only once per position
                    BytesRef payload = docs.getPayload();

                    log.trace(
                        ">> {}: {}-{}-{}",
                        termString,
                        docs.freq(),
                        pos,
                        payload
                    );

                    // Copy the payload
                    bbTerm.clear();
                    if (payload != null) {
                        bbTerm.put(
                            payload.bytes,
                            payload.offset,
                            payload.length
                        );
                    }

                    TermInfo ti = new TermInfo(termString, pos, bbTerm).analyze();
                    if (ti.getEndPos() < match.getEndPos()) {
                        log.trace("Add {}", ti.toString());
                        termList.add(ti);
                    }
                }
            }

            // Add annotations based on the retrieved infos
            for (TermInfo t : termList.getTerms()) {
                log.trace("Add term {}/{}:{} to {}({})-{}({})",
                          t.getFoundry(),
                          t.getLayer(),
                          t.getValue(),
                          t.getStartChar(),
                          t.getStartPos(),
                          t.getEndChar(),
                          t.getEndPos());

                // Compare the content of the type strings, not their identity
                final String type = t.getType();
                if ("term".equals(type) || "span".equals(type))
                    match.addAnnotation(t.getStartPos(), t.getEndPos(), t.getAnnotation());
                else if ("relSrc".equals(type))
                    match.addRelation(t.getStartPos(), t.getEndPos(), t.getAnnotation());
            }

            break;
        }
    }
    catch (IOException e) {
        log.warn(e.getLocalizedMessage());
        match.setError(e.getLocalizedMessage());
    }

    return match;
}
/**
 * Search in the index, using a newly created collection on this
 * index and the default settings of a new KorapSearch.
 *
 * @param query The span query to search for.
 * @return The result of the search.
 */
public KorapResult search (SpanQuery query) {
    final KorapCollection collection = new KorapCollection(this);
    final KorapSearch ks = new KorapSearch(query);
    return this.search(collection, ks);
}
/**
 * Search in the index, using a newly created collection on this
 * index and a given count.
 *
 * @param query The span query to search for.
 * @param count The count set on the KorapSearch.
 * @return The result of the search.
 */
public KorapResult search (SpanQuery query, short count) {
    final KorapCollection collection = new KorapCollection(this);
    return this.search(collection, new KorapSearch(query).setCount(count));
}
/**
 * Search in the index, using a newly created collection on this
 * index, with paging and context settings.
 *
 * @param query The span query to search for.
 * @param startIndex The start index set on the search.
 * @param count The count set on the search.
 * @param leftTokenContext Token flag of the left context.
 * @param leftContext The length of the left context.
 * @param rightTokenContext Token flag of the right context.
 * @param rightContext The length of the right context.
 * @return The result of the search.
 */
public KorapResult search (SpanQuery query,
                           int startIndex,
                           short count,
                           boolean leftTokenContext,
                           short leftContext,
                           boolean rightTokenContext,
                           short rightContext) {
    final KorapCollection collection = new KorapCollection(this);
    return this.search(
        collection, query, startIndex, count,
        leftTokenContext, leftContext,
        rightTokenContext, rightContext
    );
}
/**
 * Search in the index, using a newly created collection on this index.
 *
 * @param ks The search object holding the query and its settings.
 * @return The result of the search.
 */
public KorapResult search (KorapSearch ks) {
    // TODO: This might leak
    final KorapCollection collection = new KorapCollection(this);
    return this.search(collection, ks);
}
/**
 * Search in a collection, with paging and context settings.
 * The settings are wrapped in a KorapSearch object that is
 * passed on to search(KorapCollection, KorapSearch).
 *
 * @param collection The collection to search in.
 * @param query The span query to search for.
 * @param startIndex The start index set on the search.
 * @param count The count set on the search.
 * @param leftTokenContext Token flag of the left context.
 * @param leftContext The length of the left context.
 * @param rightTokenContext Token flag of the right context.
 * @param rightContext The length of the right context.
 * @return The result of the search.
 */
public KorapResult search (KorapCollection collection,
                           SpanQuery query,
                           int startIndex,
                           short count,
                           boolean leftTokenContext,
                           short leftContext,
                           boolean rightTokenContext,
                           short rightContext) {

    final KorapSearch settings = new KorapSearch(query);

    // Paging
    settings.setStartIndex(startIndex).setCount(count);

    // Context to the left and to the right of the match
    settings.leftContext.setToken(leftTokenContext).setLength(leftContext);
    settings.rightContext.setToken(rightTokenContext).setLength(rightContext);

    return this.search(collection, settings);
}
/**
 * Search in a collection.
 *
 * The query is rewritten and its spans are retrieved per atomic
 * reader, restricted to the documents of the collection. Matches
 * from the start index on are added to the result, together with
 * highlights and element offsets found in the span payloads.
 * Unless cut off, the remaining spans are counted afterwards to
 * get the total number of results - up to the limit, if one is set.
 * The comparison against the limit was inverted before, which
 * stopped the counting immediately whenever a limit was given.
 *
 * An IOException is logged and set as error on the result.
 *
 * @param collection The collection to search in.
 * @param ks The search object holding the query and its settings.
 * @return The result of the search.
 */
public KorapResult search (KorapCollection collection, KorapSearch ks) {
    log.trace("Start search");

    // Fresh term contexts for this search (also kept in the field)
    HashMap<Term, TermContext> contexts = new HashMap<Term, TermContext>();
    this.termContexts = contexts;

    SpanQuery query = ks.getQuery();

    // Get the field of textual data and annotations
    String field = query.getField();

    // Todo: Make kr subclassing ks - so ks has a method for a new KorapResult!
    KorapResult kr = new KorapResult(
        query.toString(),
        ks.getStartIndex(),
        ks.getCount(),
        ks.leftContext.isToken(),
        ks.leftContext.getLength(),
        ks.rightContext.isToken(),
        ks.rightContext.getLength()
    );

    HashSet<String> fieldsToLoadLocal = new HashSet<>(fieldsToLoad);
    fieldsToLoadLocal.add(field);

    // Running number of spans seen so far, across all atomic readers
    int i = 0;
    long t1 = 0, t2 = 0;
    int startIndex = kr.getStartIndex();
    int count = kr.getItemsPerPage();
    int hits = kr.itemsPerPage() + startIndex;
    int limit = ks.getLimit();
    boolean cutoff = ks.doCutOff();

    if (limit > 0) {
        if (hits > limit)
            hits = limit;

        // The requested page starts behind the limit
        if (limit < startIndex)
            return kr;
    }

    try {

        // Rewrite query (for regex and wildcard queries)
        for (Query rewrittenQuery = query.rewrite(this.reader());
             rewrittenQuery != (Query) query;
             rewrittenQuery = query.rewrite(this.reader())) {
            query = (SpanQuery) rewrittenQuery;
        }

        for (AtomicReaderContext atomic : this.reader().leaves()) {

            // Use OpenBitSet;
            Bits bitset = collection.bits(atomic);

            PositionsToOffset pto = new PositionsToOffset(atomic, field);

            Spans spans = query.getSpans(atomic, bitset, contexts);

            IndexReader lreader = atomic.reader();

            // TODO: Get document information from Cache!
            // See: http://www.ibm.com/developerworks/java/library/j-benchmark1/index.html
            t1 = System.nanoTime();

            for (; i < hits; i++) {

                log.trace("Match Nr {}/{}", i, count);

                // There are no more spans to find
                if (!spans.next())
                    break;

                // The next matches are not yet part of the result
                if (startIndex > i)
                    continue;

                int localDocID = spans.doc();
                int docID = atomic.docBase + localDocID;

                // Do not load all of this, in case the doc is the same!
                Document doc = lreader.document(localDocID, fieldsToLoadLocal);

                KorapMatch match = kr.addMatch(
                    pto,
                    localDocID,
                    spans.start(),
                    spans.end()
                );

                if (spans.isPayloadAvailable()) {

                    // TODO: Here are offsets and highlight offsets!
                    // <> payloads have 12 bytes (iii) or 8!?
                    // highlightoffsets have 11 bytes (iis)!
                    try {
                        ByteBuffer payloadBuffer = ByteBuffer.allocate(10);
                        for (byte[] b : spans.getPayload()) {

                            log.trace("Found a payload!!! with length {}", b.length);

                            // Todo element searches!

                            // Highlights!
                            if (b.length == 9) {
                                payloadBuffer.put(b);
                                payloadBuffer.rewind();

                                int start = payloadBuffer.getInt();
                                int end = payloadBuffer.getInt() - 1;
                                byte number = payloadBuffer.get();

                                log.trace("Have a payload: {}-{}", start, end);
                                match.addHighlight(start, end, number);
                            }

                            // Element payload for match!
                            // This MAY BE the correct match
                            else if (b.length == 8) {
                                payloadBuffer.put(b);
                                payloadBuffer.rewind();

                                // Keep the smallest start ...
                                if (match.potentialStartPosChar == -1) {
                                    match.potentialStartPosChar = payloadBuffer.getInt(0);
                                }
                                else {
                                    if (payloadBuffer.getInt(0) < match.potentialStartPosChar)
                                        match.potentialStartPosChar = payloadBuffer.getInt(0);
                                }

                                // ... and the largest end
                                if (payloadBuffer.getInt(4) > match.potentialEndPosChar)
                                    match.potentialEndPosChar = payloadBuffer.getInt(4);

                                log.trace("Element payload from {} to {}",
                                          match.potentialStartPosChar,
                                          match.potentialEndPosChar);
                            }

                            else if (b.length == 4) {
                                payloadBuffer.put(b);
                                payloadBuffer.rewind();
                                log.debug("Unknown[4]: {}", payloadBuffer.getInt());
                            }

                            // Reset the buffer for the next payload
                            payloadBuffer.clear();
                        }
                    }
                    catch (Exception e) {
                        log.error(e.getMessage());
                    }
                }

                match.internalDocID = docID;
                match.populateDocument(doc, field, fieldsToLoadLocal);

                log.trace("I've got a match in {} of {}", match.getDocID(), count);
            }

            // Benchmark till now
            if (i >= kr.itemsPerPage() &&
                kr.getBenchmarkSearchResults().length() == 0) {
                t2 = System.nanoTime();
                kr.setBenchmarkSearchResults(t1, t2);
            }

            // Count the remaining spans for the total number of results.
            // Can be disabled TEMPORARILY
            while (!cutoff && spans.next()) {

                // Stop counting as soon as the limit is reached
                if (limit > 0 && i >= limit)
                    break;
                i++;
            }
        }

        t1 = System.nanoTime();
        kr.setBenchmarkHitCounter(t2, t1);
        if (kr.getBenchmarkSearchResults().length() == 0) {
            kr.setBenchmarkSearchResults(t2, t1);
        }

        // The total is unknown, if counting was cut off
        kr.setTotalResults(cutoff ? -1 : i);
    }
    catch (IOException e) {
        kr.setError("There was an IO error");
        log.warn(e.getLocalizedMessage());
    }

    return kr;
}
};