src/main/java/de/ids_mannheim/korap/KrillIndex.java - KorAP/Krill - Gitiles

 package de.ids_mannheim.korap;

 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.ByteBuffer;
 // Java core classes
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Properties;
 import java.util.regex.Pattern;
 import java.util.zip.GZIPInputStream;

 import org.apache.lucene.analysis.Analyzer;
 /*
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
 */
 import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermContext;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 // Lucene classes
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.Filter;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.QueryWrapperFilter;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.search.spans.Spans;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.automaton.CompiledAutomaton;
 import org.apache.lucene.util.automaton.RegExp;
 // Log4j Logger classes
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import com.fasterxml.jackson.databind.ObjectMapper;

 // Krill classes
 import de.ids_mannheim.korap.index.FieldDocument;
 import de.ids_mannheim.korap.index.KeywordAnalyzer;
 import de.ids_mannheim.korap.index.PositionsToOffset;
 import de.ids_mannheim.korap.index.SpanInfo;
 import de.ids_mannheim.korap.index.TermInfo;
 import de.ids_mannheim.korap.index.TextAnalyzer;
 import de.ids_mannheim.korap.index.TimeOutThread;
 import de.ids_mannheim.korap.response.Match;
 import de.ids_mannheim.korap.response.MatchCollector;
 import de.ids_mannheim.korap.response.MetaFields;
 import de.ids_mannheim.korap.response.Result;
 import de.ids_mannheim.korap.response.SearchContext;
 import de.ids_mannheim.korap.response.Text;
 import de.ids_mannheim.korap.util.KrillProperties;
 import de.ids_mannheim.korap.util.QueryException;

 /**
  * <p>KrillIndex implements a simple API for searching in and writing
  * to a
  * Lucene index and requesting several information about the index'
  * nature.
  * Please consult {@link Krill} for the preferred use of this
  * class.</p>
  *
  * <blockquote><pre>
  * // Create new file backed index
  * KrillIndex ki = new KrillIndex(
  * new MMapDirectory(new File("/myindex"))
  * );
  *
  * // Add documents to the index
  * ki.addDoc(1, "{\"ID\":\"WPD-001\", ... }");
  * ki.addDoc(2, "{\"ID\":\"WPD-002\", ... }");
  *
  * // Apply Krill searches on the index
  * String koral = "{\"@type\":"koral:group", ... }";
  * Result result = new Krill(koral).apply(ki);
  * </pre></blockquote>
  *
  * <p>Properties can be stored in a properies file called
  * <tt>krill.properties</tt>.
  *
  * @author diewald
  */
 /*
  * Concerning parallel processing:
  * ===============================
  * Search /could/ be run in parallel on atomic readers
  * (although Lucene developers strongly discourage that).
  * Benefits are not clear and would need some benchmarks,
  * the huge drawback would be more complicated testing.
  * Aside from (probably) co-occurrence analysis, shared memory
  * is not an important thing, so I guess the preferred
  * way of using Krill on multicore machines for now is by using
  * the same mechanism as for distribution:
  * Running multiple nodes (and separated indices) per machine,
  * registered independently at the Zookeeper.
  *
  * On the other hand: Threaded indexing should be implemented!
  */
 /*
   TODO: Use FieldCache!!!
   TODO: Add word count as a meta data field!
   TODO: Improve validation of document import!
   TODO: Don't store the text in the token field!
         (It has only to be lifted for match views!
         Benchmark how worse that is!)
   TODO: Support layer for specific foundries in terminfo (IMPORTANT)
   TODO: Reuse the indexreader everywhere - it should be threadsafe!
   TODO: Support document removal!
   TODO: Support document update!
   TODO: Support callback for interrupts (to stop the searching)!

   http://invertedindex.blogspot.co.il/2009/04/lucene-dociduid-mapping-and-payload.html
   see korap/search.java -> retrieveTokens

   Support frequency search with regular expressions, so multiple bookkeeping:
   c<:VVFIN:ging:gehen:past::
   c>:VVFIN:gnig:neheg:past::
   -> search for frequencies of VVFIN/gehen
   -> c:VVFIN:[^:]*?:gehen:past:...
 */
 public final class KrillIndex {

     // Logger
     private final static Logger log = LoggerFactory.getLogger(KrillIndex.class);

     // This advices the java compiler to ignore all loggings
     public static final boolean DEBUG = false;

     // TODO: Use configuration instead.
     // Last line of defense against DOS
     private int autoCommit = 500;
     private String version = "Unknown";
     private String name = "Unknown";

     // Temp:
     private IndexReader reader;

     private IndexWriter writer;
     private IndexWriterConfig config;
     private IndexSearcher searcher;
     private boolean readerOpen = false;
     private Directory directory;

     // The commit counter is only there for
     // counting unstaged changes per thread (for bulk insertions)
     // It does not represent real unstaged documents.
     private int commitCounter = 0;
     private HashMap termContexts;
     private ObjectMapper mapper = new ObjectMapper();

     private byte[] pl = new byte[4];
     private static ByteBuffer bb = ByteBuffer.allocate(4),
             bbOffset = ByteBuffer.allocate(8), bbTerm = ByteBuffer.allocate(32);

     // Some initializations ...
     {
         Properties prop = KrillProperties.loadDefaultProperties();
         Properties info = KrillProperties.loadInfo();
         if (info != null) {
             this.version = info.getProperty("krill.version");
             this.name = info.getProperty("krill.name");
         };

         // Check for auto commit value
         String autoCommitStr = null;
         if (prop != null)
             autoCommitStr = prop.getProperty("krill.index.commit.auto");

         if (autoCommitStr != null) {
             try {
                 this.autoCommit = Integer.parseInt(autoCommitStr);
             }
             catch (NumberFormatException e) {
                 log.error(
                         "krill.index.commit.auto expected to be a numerical value");
             };
         };
     };


     /**
      * Constructs a new KrillIndex.
      * This will be in-memory.
      *
      * @throws IOException
      */
     public KrillIndex () throws IOException {
         this((Directory) new RAMDirectory());
     };


     /**
      * Constructs a new KrillIndex bound to a persistant index.
      *
      * @param directory
      *            A {@link Directory} pointing to an index
      * @throws IOException
      */
     public KrillIndex (Directory directory) throws IOException {
         this.directory = directory;

         // Add analyzers
         // TODO: Should probably not be here - make configurable
         Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
         analyzerPerField.put("textClass", new KeywordAnalyzer());
         analyzerPerField.put("keywords", new KeywordAnalyzer());
         analyzerPerField.put("foundries", new KeywordAnalyzer());
         PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(
                 new TextAnalyzer(), analyzerPerField);

         // Create configuration with base analyzer
         this.config = new IndexWriterConfig(analyzer);
     };


     /**
      * Get the version number of the index.
      *
      * @return A string containing the version number.
      */
     public String getVersion () {
         return this.version;
     };


     /**
      * Get the name of the index.
      *
      * @return A string containing the name of the index.
      */
     public String getName () {
         return this.name;
     };


     /**
      * The Lucene {@link IndexReader} object.
      *
      * Will be opened, in case it's closed.
      *
      * @return The {@link IndexReader} object.
      */
     public IndexReader reader () {
         // Todo: Maybe use DirectoryReader.openIfChanged(DirectoryReader)
         if (!readerOpen)
             this.openReader();
         return this.reader;
     };


     /**
      * The Lucene {@link IndexWriter} object.
      *
      * Will be created, in case it doesn't exist yet.
      *
      * @return The {@link IndexWriter} object.
      * @throws IOException
      */
     public IndexWriter writer () throws IOException {
         // Open writer if not already opened
         if (this.writer == null)
             this.writer = new IndexWriter(this.directory, this.config);
         return this.writer;
     };


     /**
      * The Lucene {@link IndexSearcher} object.
      *
      * Will be created, in case it doesn't exist yet.
      *
      * @return The {@link IndexSearcher} object.
      */
     public IndexSearcher searcher () {
         if (this.searcher == null)
             this.searcher = new IndexSearcher(this.reader());
         return this.searcher;
     };


     // Open index reader
     private void openReader () {
         try {
             // open reader
             this.reader = DirectoryReader.open(this.directory);
             readerOpen = true;
             if (this.searcher != null)
                 this.searcher = new IndexSearcher(reader);
         }

         // Failed to open reader
         catch (IOException e) {
             // e.printStackTrace();
             log.warn(e.getLocalizedMessage());
         };
     };


     // Close index reader
     private void closeReader () throws IOException {
         if (readerOpen) {
             this.reader.close();
             readerOpen = false;
         };
     };


     // Close index writer
     private void closeWriter () throws IOException {
         if (this.writer != null)
             this.writer.close();
     };


     /**
      * Close the associated {@link IndexReader} and the associated
      * {@link IndexWriter},
      * in case they are opened.
      *
      * @throws IOException
      */
     public void close () throws IOException {
         this.closeReader();
         this.closeWriter();
     };


     /**
      * Commit staged data to the index,
      * if the commit counter indicates there is staged data.
      *
      * @param force
      *            Force the commit,
      *            even if there is no staged data.
      * @throws IOException
      */
     public void commit (boolean force) throws IOException {
         // There is something to commit
         if (commitCounter > 0 || !force)
             this.commit();
     };


     /**
      * Commit staged data to the index.
      *
      * @throws IOException
      */
     public void commit () throws IOException {
         this.writer().commit();
         commitCounter = 0;
         this.closeReader();
         KrillCollection.cache.removeAll();
     };


     /**
      * Get autocommit value.
      *
      * @return The autocommit value.
      */
     public int getAutoCommit () {
         return this.autoCommit;
     };


     /**
      * Set the autocommit value.
      *
      * @param value
      *            The autocommit value.
      */
     public void setAutoCommit (int value) {
         this.autoCommit = value;
     };


     /**
      * Add a document to the index as a {@link FieldDocument}.
      *
      * @param doc
      *            The {@link FieldDocument} to add to the index.
      * @return The {@link FieldDocument}, which means, the same
      *         object, that was passed to the method.
      */
     public FieldDocument addDoc (FieldDocument doc) {
         if (doc == null)
             return doc;

         try {

             // Add document to writer
             this.writer().addDocument(doc.doc);
             if (++commitCounter > autoCommit) {
                 this.commit();
                 commitCounter = 0;
             };
         }

         // Failed to add document
         catch (IOException e) {
             log.error("Unable to add document");
         };
         return doc;
     };


     /**
      * Delete documents of the index by passing field information.
      *
      * @param field
      *            The meta field name.
      * @param term
      *            The meta field term.
      */
     public boolean delDocs (String field, String term) {
         if (field == null || term == null)
             return false;
         try {
             this.writer().deleteDocuments(new Term(field, term));
             if (++commitCounter > autoCommit) {
                 this.commit();
                 commitCounter = 0;
             };

             return true;
         }

         // Failed to add document
         catch (IOException e) {
             log.error("Unable to delete documents");
         };
         return false;
     };


     /**
      * Delete a document of the index by passing a UID.
      *
      * @param uid
      *            The unique identifier of the document.
      */
     public boolean delDoc (Integer uid) {
         if (uid < 0)
             return false;
         return this.delDocs("UID", uid.toString());
     };


     /**
      * Add a document to the index as a JSON string.
      *
      * @param json
      *            The document to add to the index as a string.
      * @return The created {@link FieldDocument}.
      * @throws IOException
      */
     public FieldDocument addDoc (String json) {
         return this.addDoc(_fromJson(json));
     };


     /**
      * Add a document to the index as a JSON string
      * with a unique integer ID (unique throughout the index
      * or even throughout the cluster of indices).
      *
      * @param uid
      *            The unique document identifier.
      * @param json
      *            The document to add to the index as a string.
      * @return The created {@link FieldDocument}.
      * @throws IOException
      */
     public FieldDocument addDoc (Integer uid, String json) {
         FieldDocument fd = _fromJson(json);
         if (fd != null) {
             fd.setUID(uid);
             fd = this.addDoc(fd);
         };
         return fd;
     };


     /**
      * Add a document to the index as a JSON string.
      *
      * @param json
      *            The document to add to the index as
      *            an {@link InputStream}.
      * @return The created {@link FieldDocument}.
      * @throws IOException
      */
     public FieldDocument addDoc (InputStream json) {
         return this.addDoc(_fromFile(json, false));
     };


     /**
      * Add a document to the index as a JSON string.
      *
      * @param json
      *            The document to add to the index as
      *            an {@link InputStream}.
      * @param gzip
      *            Boolean value indicating if the file is gzipped.
      * @return The created {@link FieldDocument}.
      * @throws IOException
      */
     public FieldDocument addDoc (InputStream json, boolean gzip) {
         return this.addDoc(_fromFile(json, gzip));
     };


     /**
      * Add a document to the index as a JSON string
      * with a unique integer ID (unique throughout the index
      * or even throughout the cluster of indices).
      *
      * @param uid
      *            The unique document identifier.
      * @param json
      *            The document to add to the index as
      *            an {@link InputStream}.
      * @param gzip
      *            Boolean value indicating if the file is gzipped.
      * @return The created {@link FieldDocument}.
      * @throws IOException
      */
     public FieldDocument addDoc (Integer uid, InputStream json, boolean gzip) {
         FieldDocument fd = _fromFile(json, gzip);
         if (fd != null) {
             fd.setUID(uid);
             return this.addDoc(fd);
         };

         return fd;
     };


     // Parse JSON document from Input stream
     private FieldDocument _fromJson (String json) {
         try {
             FieldDocument fd = this.mapper.readValue(json, FieldDocument.class);
             return fd;
         }
         catch (IOException e) {
             log.error("File json not found or unmappable: {}",
                     e.getLocalizedMessage());
         };
         return (FieldDocument) null;
     };


     // Load json document from file
     private FieldDocument _fromFile (InputStream json, boolean gzip) {
         try {
             if (gzip) {

                 // Create json field document
                 FieldDocument fd = this.mapper.readValue(
                         new GZIPInputStream(json), FieldDocument.class);
                 return fd;
             };
             return this.mapper.readValue(json, FieldDocument.class);
         }

         // Fail to add json object
         catch (IOException e) {
             log.error("File {} not found", json, e);
         };
         return (FieldDocument) null;
     };


     /**
      * Search for the number of occurrences of different types,
      * e.g. <i>documents</i>, <i>sentences</i> etc.
      *
      * @param collection
      *            The scope of the numbering by means of a
      *            {@link KrillCollection}
      * @param field
      *            The field containing the textual data and the
      *            annotations as a string.
      * @param type
      *            The type of meta information,
      *            e.g. <i>documents</i> or <i>sentences</i> as a
      *            string.
      * @return The number of the occurrences.
      * @see KrillCollection#numberOf
      */
     public long numberOf (KrillCollection collection, String field,
             String type) {

         collection.setIndex(this);
         try {
             return collection.numberOf(field, type);
         }
         catch (IOException e) {
             log.warn(e.getLocalizedMessage());
         };
         return (long) -1;

         // Short cut for documents
         // This will be only "texts" in the future
         /*
         if (type.equals("documents") || type.equals("base/texts")) {
             if (collection.getCount() <= 0) {
                 try {
                     return (long) this.reader().numDocs();
                 }
                 catch (Exception e) {
                     log.warn(e.getLocalizedMessage());
                 };
                 return (long) 0;
             };
             long docCount = 0;
             // int i = 1;
             try {
                 for (LeafReaderContext atomic : this.reader().leaves()) {
                     docCount += collection.bits(atomic).cardinality();
                     // i++;
                 };
             }
             catch (IOException e) {
                 log.warn(e.getLocalizedMessage());
             };
             return docCount;
         };

         // Create search term
         // This may be prefixed by foundries
         Term term = new Term(field, "-:" + type);

         long occurrences = 0;
         try {
             // Iterate over all atomic readers and collect occurrences
             for (LeafReaderContext atomic : this.reader().leaves()) {
                 occurrences += this._numberOfAtomic(collection.bits(atomic),
                         atomic, term);
             };
         }

         // Something went wrong
         catch (Exception e) {
             log.warn(e.getLocalizedMessage());
         };

         return occurrences;
         */
     };


     /**
      * Search for the number of occurrences of different types,
      * e.g. <i>documents</i>, <i>sentences</i> etc.
      *
      * @param field
      *            The field containing the textual data and the
      *            annotations as a string.
      * @param type
      *            The type of meta information,
      *            e.g. <i>documents</i> or <i>sentences</i> as a
      *            string.
      * @return The number of the occurrences.
      * @see KrillCollection#numberOf
      */
     public long numberOf (String field, String type) {
         return this.numberOf(new KrillCollection(this), field, type);
     };


     /**
      * Search for the number of occurrences of different types,
      * e.g. <i>documents<i>, <i>sentences</i> etc., in the
      * <i>base</i> foundry.
      *
      * @param type
      *            The type of meta information,
      *            e.g. <i>documents</i> or <i>sentences</i> as a
      *            string.
      * @return The number of the occurrences.
      * @see KrillCollection#numberOf
      */
     public long numberOf (String type) {
         return this.numberOf("tokens", type);
     };


     /**
      * Search for the number of occurrences of different types,
      * e.g. <i>documents</i>, <i>sentences</i> etc.
      *
      * @param docvec
      *            The scope of the numbering by means of a
      *            {@link Bits} vector
      * @param field
      *            The field containing the textual data and the
      *            annotations as a string.
      * @param type
      *            The type of meta information,
      *            e.g. <i>documents</i> or <i>sentences</i> as a
      *            string.
      * @return The number of the occurrences.
      * @throws IOException
      */
     /*
     @Deprecated
     public long numberOf (Bits docvec, String field, String type)
         throws IOException {
     // Shortcut for documents
     if (type.equals("documents")) {
         FixedBitSet os = (FixedBitSet) docvec;
         return os.cardinality();
     };

     Term term = new Term(field, "-:" + type);

     int occurrences = 0;
     try {
         for (LeafReaderContext atomic : this.reader().leaves()) {
             occurrences += this._numberOfAtomic(docvec, atomic, term);
         };
     }
     catch (IOException e) {
         log.warn(e.getLocalizedMessage());
     };

     return occurrences;
     };
     */


     // Search for meta information in term vectors
     // This will create the sum of all numerical payloads
     // of the term in the document vector
     @Deprecated
     private long _numberOfAtomic (Bits docvec, LeafReaderContext atomic,
             Term term) throws IOException {

         // This reimplements docsAndPositionsEnum with payloads
         final Terms terms = atomic.reader().fields().terms(term.field());

         // No terms were found
         if (terms != null) {
             // Todo: Maybe reuse a termsEnum!
             final TermsEnum termsEnum = terms.iterator(null);

             // Set the position in the iterator to the term that is seeked
             if (termsEnum.seekExact(term.bytes())) {

                 // Start an iterator to fetch all payloads of the term
                 DocsAndPositionsEnum docs = termsEnum.docsAndPositions(docvec,
                         null, DocsAndPositionsEnum.FLAG_PAYLOADS);

                 // The iterator is empty
                 // This may even be an error, but we return 0
                 if (docs.docID() == DocsAndPositionsEnum.NO_MORE_DOCS)
                     return 0;

                 // Init some variables for data copying
                 long occurrences = 0;
                 BytesRef payload;

                 // Init nextDoc()
                 while (docs.nextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS) {

                     // Initialize (go to first term)
                     docs.nextPosition();

                     // Copy payload with the offset of the BytesRef
                     payload = docs.getPayload();
                     System.arraycopy(payload.bytes, payload.offset, pl, 0, 4);

                     // Add payload as integer
                     occurrences += bb.wrap(pl).getInt();
                 };

                 // Return the sum of all occurrences
                 return occurrences;
             };
         };

         // Nothing found
         return 0;
     };


     public Text getDoc (String uid) {
         // This is very similar to getMatchInfo

         Text text = new Text();

         // Rewrite parse ID
         uid = new Integer(Integer.parseInt(uid)).toString();

         Filter filter = (Filter) new QueryWrapperFilter(
                 new TermQuery(new Term("UID", uid)));

         try {

             // Iterate over all atomic indices and find the matching document
             for (LeafReaderContext atomic : this.reader().leaves()) {

                 // Retrieve the single document of interest
                 DocIdSet filterSet = filter.getDocIdSet(atomic,
                         atomic.reader().getLiveDocs());

                 DocIdSetIterator filterIterator = filterSet.iterator();

                 if (DEBUG) {
 					// Create a bitset for the correct document
 					Bits bitset = filterSet.bits();

                     log.trace("Checking document in {} with {}", filterSet,
 							  bitset);
 				};

                 // No document found
                 if (filterIterator == null)
                     continue;

                 // Go to the matching doc - and remember its ID
                 int localDocID = filterIterator.nextDoc();

                 if (localDocID == DocIdSetIterator.NO_MORE_DOCS)
                     continue;

                 // We've found the correct document! Hurray!
                 if (DEBUG)
                     log.trace("We've found a matching document");

                 // HashSet<String> fields = (HashSet<String>) new Krill()
                 //    .getMeta().getFields().clone();
                 // fields.add(field);

                 // Load the necessary fields of the document

                 // TODO: Probably use
                 // document(int docID, StoredFieldVisitor visitor)
                 Document doc = atomic.reader().document(localDocID);
                 text.populateFields(doc);

                 return text;
             };
         }
         catch (IOException e) {
             text.addError(600, "Unable to read index", e.getLocalizedMessage());
             log.warn(e.getLocalizedMessage());
         };

         text.addError(630, "Document not found");

         return text;
     };


     public String getMatchIDWithContext (String id) {
         /* No includeHighlights */
         return "";
     };


     public Match getMatch (String id) throws QueryException {
         return this.getMatchInfo(id,       // MatchID
                 "tokens", // field
                 false,    // info
                 (ArrayList) null,     // foundry
                 (ArrayList) null,     // layer
                 false,    // includeSpans
                 true,     // includeHighlights
                 false     // extendToSentence
         );
     };


     // There is a good chance that some of these methods will die ...
     public Match getMatchInfo (String id, String field, String foundry,
             String layer, boolean includeSpans, boolean includeHighlights)
             throws QueryException {
         return this.getMatchInfo(id, field, true, foundry, layer, includeSpans,
                 includeHighlights, false);
     };


     public Match getMatchInfo (String id, String field, String foundry,
             String layer, boolean includeSpans, boolean includeHighlights,
             boolean extendToSentence) throws QueryException {
         return this.getMatchInfo(id, field, true, foundry, layer, includeSpans,
                 includeHighlights, extendToSentence);
     };


     public Match getMatchInfo (String id, String field, boolean info,
             String foundry, String layer, boolean includeSpans,
             boolean includeHighlights, boolean extendToSentence)
             throws QueryException {
         ArrayList<String> foundryList = new ArrayList<>(1);
         if (foundry != null)
             foundryList.add(foundry);
         ArrayList<String> layerList = new ArrayList<>(1);
         if (layer != null)
             layerList.add(layer);
         return this.getMatchInfo(id, field, info, foundryList, layerList,
                 includeSpans, includeHighlights, extendToSentence);
     };


     /**
      * Get a match.
      */
     /*
       KorapInfo is associated with a Match and has an array with all informations
       per position in the match.
     */
     public Match getMatchInfo (String idString, String field, boolean info,
             List<String> foundry, List<String> layer, boolean includeSpans,
             boolean includeHighlights, boolean extendToSentence)
             throws QueryException {

         if (DEBUG)
             log.trace("Get info on {}", idString);

         Match match = new Match(idString, includeHighlights);

         if (this.getVersion() != null)
             match.setVersion(this.getVersion());

         if (this.getName() != null)
             match.setName(this.getName());

         if (match.getStartPos() == -1)
             return match;

         // Create a filter based on the corpusID and the docID
         BooleanQuery bool = new BooleanQuery();
         if (match.getTextSigle() != null) {
             bool.add(new TermQuery(new Term("textSigle", match.getTextSigle())),
                     BooleanClause.Occur.MUST);
         }

         // <legacy>
         else if (match.getDocID() != null) {
             bool.add(new TermQuery(new Term("ID", match.getDocID())),
                     BooleanClause.Occur.MUST);
             bool.add(new TermQuery(new Term("corpusID", match.getCorpusID())),
                     BooleanClause.Occur.MUST);
         }
         // </legacy>

         // Invalid
         else {
             match.addError(730, "Invalid match identifier", idString);
             return match;
         };

         if (DEBUG)
             log.trace("The bool query is {}", bool.toString());

         Filter filter = (Filter) new QueryWrapperFilter(bool);

         CompiledAutomaton fst = null;

         if (info) {
             /* Create an automaton for prefixed terms of interest.
              * You can define the necessary foundry, the necessary layer,
              * in case the foundry is given, and if span annotations
              * are of interest.
              */
             StringBuilder regex = new StringBuilder();
             // TODO: Make these static
             Pattern harmlessFoundry = Pattern.compile("^[-a-zA-Z0-9_]+$");
             Pattern harmlessLayer = Pattern.compile("^[-a-zA-Z0-9_:]+$");
             Iterator<String> iter;
             int i = 0;

             if (includeSpans)
                 regex.append("((\">\"|\"<\"\">\")\":\")?");

             // There is a foundry given
             if (foundry != null && foundry.size() > 0) {

                 // Filter out bad foundries
                 for (i = foundry.size() - 1; i >= 0; i--) {
                     if (!harmlessFoundry.matcher(foundry.get(i)).matches()) {
                         match.addError(970, "Invalid foundry requested",
                                 foundry.get(i));
                         return match;
                     };
                 };

                 // Build regex for multiple foundries
                 if (foundry.size() > 0) {
                     regex.append("(");
                     iter = foundry.iterator();
                     while (iter.hasNext()) {
                         regex.append(iter.next()).append("|");
                     };
                     regex.replace(regex.length() - 1, regex.length(), ")");
                     regex.append("\"/\"");

                     // There is a filter given
                     if (layer != null && layer.size() > 0) {

                         // Filter out bad layers
                         for (i = layer.size() - 1; i >= 0; i--) {
                             if (!harmlessLayer.matcher(layer.get(i))
                                     .matches()) {
                                 throw new QueryException(
                                         "Invalid layer requested: "
                                                 + layer.get(i));
                                 // layer.remove(i);
                             };
                         };

                         // Build regex for multiple layers
                         if (layer.size() > 0) {
                             regex.append("(");
                             iter = layer.iterator();
                             while (iter.hasNext()) {
                                 regex.append(iter.next()).append("|");
                             };
                             regex.replace(regex.length() - 1, regex.length(),
                                     ")");
                             regex.append("\":\"");
                         };
                     };
                 };
             }
             else if (includeSpans) {
                 // No foundries - but spans
                 regex.append("([^-is]|[-is][^:])");
             }
             else {
                 // No foundries - no spans
                 regex.append("([^-is<>]|[-is>][^:]|<[^:>])");
             };
             regex.append("(.){1,}|_[0-9]+");

             if (DEBUG)
                 log.trace("The final regexString is {}", regex.toString());

             RegExp regexObj = new RegExp(regex.toString(), RegExp.COMPLEMENT);
             fst = new CompiledAutomaton(regexObj.toAutomaton());
             if (DEBUG)
                 log.trace("The final regexObj is {}", regexObj.toString());
         };

         try {

             // Iterate over all atomic indices and find the matching document
             for (LeafReaderContext atomic : this.reader().leaves()) {

                 // Retrieve the single document of interest
                 DocIdSet filterSet = filter.getDocIdSet(atomic,
                         atomic.reader().getLiveDocs());

                 DocIdSetIterator filterIterator = filterSet.iterator();

                 if (DEBUG) {
 					// Create a bitset for the correct document
 					Bits bitset = filterSet.bits();

                     log.trace("Checking document in {} with {}", filterSet,
 							  bitset);
 				};

                 // No document found
                 if (filterIterator == null)
                     continue;

                 // Go to the matching doc - and remember its ID
                 int localDocID = filterIterator.nextDoc();

                 if (DEBUG)
                     log.trace("localDocID is {}", localDocID);

                 if (localDocID == DocIdSetIterator.NO_MORE_DOCS)
                     continue;

                 // We've found the correct document! Hurray!
                 if (DEBUG)
                     log.trace("We've found a matching document");

                 // Get terms from the document
                 Terms docTerms = atomic.reader().getTermVector(localDocID,
                         field);

 				// The following fields should be lifted for the match
                 HashSet<String> fields = (HashSet<String>) new Krill().getMeta()
                         .getFields().clone();

 				// Lift primary field
                 fields.add(field);

 				// Lift all fields
 				if (fields.contains("@all"))
 					fields = null;

                 // Load the necessary fields of the document
                 Document doc = atomic.reader().document(localDocID, fields);

                 // Put some more information to the match
                 PositionsToOffset pto = new PositionsToOffset(atomic, field);
                 match.setPositionsToOffset(pto);
                 match.setLocalDocID(localDocID);
                 match.populateDocument(doc, field, fields);
                 if (DEBUG)
                     log.trace("The document has the id '{}' or the sigle '{}'",
                             match.getDocID(), match.getTextSigle());

                 // Todo:
                 SearchContext context = match.getContext();

                 // Override the normal match marking
                 // to have an inner match
                 match.overrideMatchPosition(match.getStartPos(),
                         match.getEndPos() - 1);


                 // Search for minimal surrounding sentences
                 if (extendToSentence) {
                     String element = "base/s:s";
                     int[] spanContext = match.expandContextToSpan(element);

                     if (DEBUG)
                         log.trace("Extend to sentence element '{}'", element);

                     if (spanContext[0] >= 0
                             && spanContext[0] < spanContext[1]) {
                         match.setStartPos(spanContext[0]);
                         match.setEndPos(spanContext[1]);
 						match.potentialStartPosChar = spanContext[2];
 						match.potentialEndPosChar = spanContext[3];
                         match.startMore = false;
                         match.endMore = false;
                     }
                     else {
                         match.addWarning(651, "Unable to extend context");
                     };
                 }
                 else {
                     if (DEBUG)
                         log.trace("Don't expand context");
                 };

                 context.left.setToken(true).setLength(0);
                 context.right.setToken(true).setLength(0);

                 if (!info)
                     break;

                 // Limit the terms to all the terms of interest
                 TermsEnum termsEnum = docTerms.intersect(fst, null);

                 DocsAndPositionsEnum docs = null;

                 // List of terms to populate
                 SpanInfo termList = new SpanInfo(pto, localDocID);

                 // Iterate over all terms in the document
                 while (termsEnum.next() != null) {

                     // Get the positions and payloads of the term in the document
                     // The bitvector may look different (don't know why)
                     // and so the local ID may differ.
                     // That's why the requesting bitset is null.
                     docs = termsEnum.docsAndPositions(null, docs,
                             DocsAndPositionsEnum.FLAG_PAYLOADS);

                     // Init document iterator
                     docs.nextDoc();

                     // Should never happen ... but hell!
                     if (docs.docID() == DocIdSetIterator.NO_MORE_DOCS)
                         continue;

                     // String representation of the term
                     String termString = termsEnum.term().utf8ToString();

                     // Iterate over all occurrences
                     for (int i = 0; i < docs.freq(); i++) {

                         // Init positions and get the current
                         int pos = docs.nextPosition();

                         // Check, if the position of the term is in the area of interest
                         if (pos >= match.getStartPos()
                                 && pos < match.getEndPos()) {

                             if (DEBUG)
                                 log.trace(">> {}: freq:{}, pos:{}, payload:{}",
 										  termString,
 										  docs.freq(),
 										  pos,
 										  docs.getPayload());

                             BytesRef payload = docs.getPayload();

                             // Copy the payload
                             bbTerm.clear();

                             if (payload != null && payload.length <= bbTerm.capacity()) {
                                 bbTerm.put(payload.bytes, payload.offset,
                                         payload.length);
                             };
                             TermInfo ti = new TermInfo(termString, pos, bbTerm)
                                     .analyze();
                             if (ti.getEndPos() < match.getEndPos()) {
                                 if (DEBUG)
                                     log.trace("Add {}", ti.toString());
                                 termList.add(ti);
                             };
                         };
                     };
                 };

                 // Add annotations based on the retrieved infos
                 for (TermInfo t : termList.getTerms()) {
                     if (DEBUG)
                         log.trace(
 							"Add term {}/{}:{} with char:{}(pos:{})-char:{}(pos:{})",
 							t.getFoundry(), t.getLayer(), t.getValue(),
 							t.getStartChar(), t.getStartPos(),
 							t.getEndChar(), t.getEndPos());

 					// Ignore empty types for the moment
                     if (t.getType() == "term" || t.getType() == "span") {
                         match.addAnnotation(t.getStartPos(), t.getEndPos(),
                                 t.getAnnotation());
 					}

 					// TODO:
 					// else if (t.getType() == "empty") {
 					// }

 					// Use relSrc for annotation views
 					else if (t.getType() == "relSrc") {
 						// This only respects relSrc!
 						// May require more information for bidirectional relations
                         match.addRelation(
 							t.getStartPos(),
 							t.getEndPos(),
 							t.getTargetStartPos(),
 							t.getTargetEndPos(),
 							t.getAnnotation()
 							);
 					};
                 };

                 break;
             };
         }
         catch (IOException e) {
             match.addError(600, "Unable to read index",
                     e.getLocalizedMessage());
             log.warn(e.getLocalizedMessage());
         };

         return match;
     };


     /**
      * Search in the index.
      */
     public Result search (SpanQuery query) {
         return this.search(new Krill(query));
     };


     public Result search (SpanQuery query, short count) {
         final Krill krill = new Krill(query);
         krill.getMeta().setCount(count);
         return this.search(krill);
     };


     @Deprecated
     public Result search (SpanQuery query, int startIndex, short count,
             boolean leftTokenContext, short leftContext,
             boolean rightTokenContext, short rightContext) {

         Krill ks = new Krill(query);
         KrillMeta meta = ks.getMeta();
         meta.setStartIndex(startIndex).setCount(count);
         meta.setContext(new SearchContext(leftTokenContext, leftContext,
                 rightTokenContext, rightContext));
         return this.search(ks);
     };


     /**
      * Search the endpoint.
      */
     public Result search (Krill ks) {
         if (DEBUG)
             log.trace("Start search");

         this.termContexts = new HashMap<Term, TermContext>();

         final KrillCollection collection = ks.getCollection();
         collection.setIndex(this);

         // Get the spanquery from the Krill object
         SpanQuery query = ks.getSpanQuery();

         // Get the field of textual data and annotations ("tokens")
         final String field = query.getField();

         final KrillMeta meta = ks.getMeta();

         // Todo: Make kr subclassing ks - so ks has a method for a new Result!
         // Or allow passing of Krill object
         final Result kr = new Result(query.toString(), meta.getStartIndex(),
                 meta.getCount(), meta.getContext());

         // Copy notification info
         kr.moveNotificationsFrom(ks);

         // Set version info to result
         if (this.getVersion() != null)
             kr.setVersion(this.getVersion());

         // The following fields should be lifted for matches
         HashSet<String> fields = (HashSet<String>) meta.getFields().clone();

         // Lift primary field
         fields.add(field);

         // Lift all fields
         if (fields.contains("@all"))
             fields = null;

         // Some initializations ...
         int i = 0;
         int startIndex = kr.getStartIndex();
         int count = kr.getItemsPerPage();
         int hits = kr.getItemsPerPage() + startIndex;
         int limit = meta.getLimit();
         int itemsPerResourceCounter = 0;
         boolean cutoff = meta.doCutOff();
         short itemsPerResource = meta.getItemsPerResource();

         // Check if there is work to do at all
         // TODO: Deprecated
         if (limit > 0) {
             if (hits > limit)
                 hits = limit;

             // Nah - nothing to do! Let's go shopping!
             if (limit < startIndex)
                 return kr;
         };

         // Collect matches from atomic readers
         final ArrayList<Match> atomicMatches = new ArrayList<Match>(
                 kr.getItemsPerPage());

         // Start time out thread
         final TimeOutThread tthread = new TimeOutThread();
         tthread.start();
         final long timeout = meta.getTimeOut();

         // See: http://www.ibm.com/developerworks/java/library/j-benchmark1/index.html
         long t1 = System.nanoTime();

         try {
             // Rewrite query (for regex and wildcard queries)
             // Revise!
             // Based on core/src/java/org/apache/lucene/search/IndexSearcher.java
             // and highlighter/src/java/org/apache/lucene/search/
 			//   postingshighlight/PostingsHighlighter.java
             for (Query rewrittenQuery = query.rewrite(this.reader());
 				 !rewrittenQuery.equals(query);
 				 rewrittenQuery = query.rewrite(this.reader())) {
                 query = (SpanQuery) rewrittenQuery;
             };

 			if (DEBUG)
 				log.trace("Rewritten query is {}", query.toString());


             // Todo: run this in a separated thread
             for (LeafReaderContext atomic : this.reader().leaves()) {

                 int oldLocalDocID = -1;

                 /*
                  * Todo: There may be a way to know early if the bitset is emty
                  * by using LongBitSet - but this may not be as fast as I think.
                  */
                 final FixedBitSet bitset = collection.bits(atomic);

 				if (bitset.nextSetBit(0) == DocIdSetIterator.NO_MORE_DOCS) {
 					continue;
 				};

                 final PositionsToOffset pto = new PositionsToOffset(atomic,
                         field);

                 // Spans spans = NearSpansOrdered();
                 final Spans spans = query.getSpans(atomic, (Bits) bitset,
                         termContexts);

                 final IndexReader lreader = atomic.reader();
                 int localDocID, docID;

                 // TODO: Get document information from Cache! Fieldcache?
                 for (; i < hits; i++) {

                     if (DEBUG)
                         log.trace("Match Nr {}/{}", i, count);

                     // There are no more spans to find
                     if (!spans.next())
                         break;

                     // Timeout!
                     if (tthread.getTime() > timeout) {
                         kr.setTimeExceeded(true);
                         break;
                     };

                     localDocID = spans.doc();

                     // Count hits per resource
                     if (itemsPerResource > 0) {

                         // IDS are identical
                         if (localDocID == oldLocalDocID
                                 || oldLocalDocID == -1) {
                             if (itemsPerResourceCounter++ >= itemsPerResource) {
                                 if (spans.skipTo(localDocID + 1) != true) {
                                     break;
                                 }
                                 else {
                                     itemsPerResourceCounter = 1;
                                     localDocID = spans.doc();
                                 };
                             };
                         }

                         // Reset counter
                         else
                             itemsPerResourceCounter = 0;

                         oldLocalDocID = localDocID;
                     };

                     // The next matches are not yet part of the result
                     if (startIndex > i)
                         continue;

                     docID = atomic.docBase + localDocID;

                     // Do not load all of this, in case the doc is the same!
                     final Document doc = (fields != null)
                             ? lreader.document(localDocID, fields)
                             : lreader.document(localDocID);

                     // Create new Match
                     final Match match = new Match(pto, localDocID,
                             spans.start(), spans.end());
                     match.setContext(kr.getContext());

 					match.retrievePagebreaks("~:base/s:pb");

 					if (DEBUG)
 						log.trace("Retrieve pagebreaks from index");

                     // Add match to Result
                     kr.add(match);

                     if (spans.isPayloadAvailable())
                         match.addPayload((List<byte[]>) spans.getPayload());

                     match.internalDocID = docID;

                     // Lift certain fields
                     if (fields != null) {
                         match.populateDocument(doc, field, fields);
                     }
                     // Lift all fields
                     else {
                         match.populateDocument(doc, field);
                     };

                     if (DEBUG) {
                         if (match.getDocID() != null)
                             log.trace(
                                     "With DocID: I've got 1 match of {} in {}",
                                     count, match.getDocID());
                         else
                             log.trace("With UID: I've got 1 match of {} in {}",
                                     count, match.getUID());
                     };

                     atomicMatches.add(match);
                 };

                 // Can be disabled TEMPORARILY
                 while (!cutoff && spans.next()) {

                     // TODO: Deprecated
                     if (limit > 0 && i >= limit)
                         break;

                     // Timeout!
                     if (tthread.getTime() > timeout) {
                         kr.setTimeExceeded(true);
                         break;
                     };

                     // Count hits per resource
                     if (itemsPerResource > 0) {
                         localDocID = spans.doc();

                         if (localDocID == DocIdSetIterator.NO_MORE_DOCS)
                             break;

                         // IDS are identical
                         if (localDocID == oldLocalDocID
                                 || oldLocalDocID == -1) {
                             if (localDocID == -1)
                                 break;

                             if (itemsPerResourceCounter++ >= itemsPerResource) {
                                 if (spans.skipTo(localDocID + 1) != true) {
                                     break;
                                 };
                                 itemsPerResourceCounter = 1;
                                 localDocID = spans.doc();
                             };
                         }

                         // Reset counter
                         else
                             itemsPerResourceCounter = 0;

                         oldLocalDocID = localDocID;
                     };
                     i++;
                 };
                 atomicMatches.clear();
             };

             if (itemsPerResource > 0)
                 kr.setItemsPerResource(itemsPerResource);

             kr.setTotalResults(cutoff ? (long) -1 : (long) i);
         }

         catch (IOException e) {
             kr.addError(600, "Unable to read index", e.getLocalizedMessage());
             log.warn(e.getLocalizedMessage());
 		}

 		catch (QueryException e) {
             kr.addError(e.getErrorCode(),e.getLocalizedMessage());
             log.warn(e.getLocalizedMessage());
 		};

         // Stop timer thread
         tthread.stopTimer();

         // Calculate time
         kr.setBenchmark(t1, System.nanoTime());

         return kr;
     };


 	// Return field values
     public MetaFields getFields (String textSigle) {
 		// , HashSet<String> fields) {

 		// Create TermQuery for document
 		TermQuery textSigleQuery = new TermQuery(new Term("textSigle", textSigle));

 		Filter filter = (Filter) new QueryWrapperFilter(textSigleQuery);

 		/*
 		if (fields.contain("@all"))
 			fields = null;
 		*/

 		MetaFields metaFields = new MetaFields(textSigle);

         try {

             // Iterate over all atomic indices and find the matching document
             for (LeafReaderContext atomic : this.reader().leaves()) {

 				// Retrieve the single document of interest
                 DocIdSet filterSet = filter.getDocIdSet(atomic, atomic.reader().getLiveDocs());

                 DocIdSetIterator filterIterator = filterSet.iterator();

 				// No document found
                 if (filterIterator == null)
                     continue;


                 // Go to the matching doc - and remember its ID
                 int localDocID = filterIterator.nextDoc();

                 if (localDocID == DocIdSetIterator.NO_MORE_DOCS)
                     continue;

                 Document doc = atomic.reader().document(localDocID);
                 metaFields.populateFields(doc);

 				return metaFields;
 			};
 		}
 		catch  (IOException e) {
             metaFields.addError(600, "Unable to read index", e.getLocalizedMessage());
             log.warn(e.getLocalizedMessage());
         };

         metaFields.addError(630, "Document not found");

 		return metaFields;
     };


     public void getValues (String field) {

     };


     // Collect matches
     public MatchCollector collect (Krill ks, MatchCollector mc) {
         if (DEBUG)
             log.trace("Start collecting");

         KrillCollection collection = ks.getCollection();
         collection.setIndex(this);

         // Init term context
         this.termContexts = new HashMap<Term, TermContext>();

         // Get span query
         SpanQuery query = ks.getSpanQuery();

         // Get the field of textual data and annotations
         String field = query.getField();

         // TODO: Get document information from Cache!
         // See: http://www.ibm.com/developerworks/java/library/j-benchmark1/index.html
         long t1 = System.nanoTime();

         // Only load UIDs
         HashSet<String> fields = new HashSet<>(1);
         fields.add("UID");

         // List<Match> atomicMatches = new ArrayList<Match>(10);
         try {

             // Rewrite query (for regex and wildcard queries)
             for (Query rewrittenQuery = query.rewrite(
                     this.reader()); rewrittenQuery != (Query) query; rewrittenQuery = query
                             .rewrite(this.reader())) {
                 query = (SpanQuery) rewrittenQuery;
             };

             int matchcount = 0;
             String uniqueDocIDString;;
             int uniqueDocID = -1;

             // start thread:
             for (LeafReaderContext atomic : this.reader().leaves()) {

                 int previousDocID = -1;
                 int oldLocalDocID = -1;

                 // Use LongBitSet;
                 Bits bitset = collection.bits(atomic);

                 // PositionsToOffset pto = new PositionsToOffset(atomic, field);

                 Spans spans = query.getSpans(atomic, (Bits) bitset,
                         termContexts);

                 IndexReader lreader = atomic.reader();

                 while (spans.next()) {
                     int localDocID = spans.doc();

                     // New match
                     // MatchIdentifier possibly needs more
                     /*
                       Match match = new Match();
                       match.setStartPos(spans.start());
                       match.setEndPos(spans.end());

                       // Add payload information to match
                       if (spans.isPayloadAvailable())
                       match.addPayload(spans.getPayload());
                     */

                     if (previousDocID != localDocID) {
                         if (matchcount > 0) {
                             mc.add(uniqueDocID, matchcount);
                             matchcount = 0;
                         };

                         // Read document id from index
                         uniqueDocIDString = lreader.document(localDocID, fields)
                                 .get("UID");

                         if (uniqueDocIDString != null)
                             uniqueDocID = Integer.parseInt(uniqueDocIDString);

                         previousDocID = localDocID;
                     }
                     else {
                         matchcount++;
                     };
                 };

                 // Add count to collector
                 if (matchcount > 0) {
                     mc.add(uniqueDocID, matchcount);
                     matchcount = 0;
                 };
             };
             // end thread

             // Benchmark the collector
             mc.setBenchmark(t1, System.nanoTime());
         }
         catch (IOException e) {
             mc.addError(600, "Unable to read index", e.getLocalizedMessage());
             log.warn(e.getLocalizedMessage());
 		}
 		catch (QueryException e) {
             mc.addError(e.getErrorCode(),e.getLocalizedMessage());
             log.warn(e.getLocalizedMessage());
 		};

         mc.close();
         return mc;
     };
 };