Implemented a new cache with on-disk storage and auto-update.
Change-Id: I41a35e9b02af4b905ff724cd13e4795f5c409d81
diff --git a/src/main/java/de/ids_mannheim/korap/IndexInfo.java b/src/main/java/de/ids_mannheim/korap/IndexInfo.java
new file mode 100644
index 0000000..3b64b55
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/IndexInfo.java
@@ -0,0 +1,9 @@
+package de.ids_mannheim.korap;
+
+import java.util.Set;
+
+/** Gives access to the fingerprints of the leaves of an index. */
+public interface IndexInfo {
+    /** Returns the fingerprints of all leaves of the index. */
+    Set<String> getAllLeafFingerprints();
+}
diff --git a/src/main/java/de/ids_mannheim/korap/Krill.java b/src/main/java/de/ids_mannheim/korap/Krill.java
index 6bee230..a99624b 100644
--- a/src/main/java/de/ids_mannheim/korap/Krill.java
+++ b/src/main/java/de/ids_mannheim/korap/Krill.java
@@ -9,6 +9,7 @@
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
+import de.ids_mannheim.korap.cache.VirtualCorpusCache;
import de.ids_mannheim.korap.query.wrap.SpanQueryWrapper;
import de.ids_mannheim.korap.response.Response;
import de.ids_mannheim.korap.response.Result;
@@ -256,6 +257,7 @@
*/
public Krill setIndex (KrillIndex index) {
this.index = index;
+ VirtualCorpusCache.setIndexInfo(index);
return this;
};
@@ -272,6 +274,7 @@
* @return The result as a {@link Result} object.
*/
public Result apply (KrillIndex index) {
+ VirtualCorpusCache.setIndexInfo(index);
return this.setIndex(index).apply();
};
diff --git a/src/main/java/de/ids_mannheim/korap/KrillCollection.java b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
index 628173b..4ec7752 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillCollection.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
@@ -8,6 +8,7 @@
import java.util.List;
import java.util.Map;
import java.util.Properties;
+import java.util.Set;
import java.util.zip.GZIPInputStream;
import org.apache.commons.io.IOUtils;
@@ -32,16 +33,13 @@
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
-import de.ids_mannheim.korap.collection.CachedVCData;
import de.ids_mannheim.korap.collection.CollectionBuilder;
import de.ids_mannheim.korap.collection.DocBits;
import de.ids_mannheim.korap.response.Notifications;
+import de.ids_mannheim.korap.util.Fingerprinter;
import de.ids_mannheim.korap.util.KrillProperties;
import de.ids_mannheim.korap.util.QueryException;
import de.ids_mannheim.korap.util.StatusCodes;
-import net.sf.ehcache.Cache;
-import net.sf.ehcache.CacheManager;
-import net.sf.ehcache.Element;
/**
* Create a Virtual Collection of documents by means of a KoralQuery
@@ -61,10 +59,10 @@
* See http://mail-archives.apache.org/mod_mbox/lucene-java-user/
* 200805.mbox/%3C17080852.post@talk.nabble.com%3E
*/
-public final class KrillCollection extends Notifications {
+public final class KrillCollection extends Notifications implements IndexInfo {
private KrillIndex index;
private JsonNode json;
- private CollectionBuilder cb = new CollectionBuilder();
+ private final CollectionBuilder cb = new CollectionBuilder(this);
private CollectionBuilder.Interface cbi;
private byte[] pl = new byte[4];
@@ -74,22 +72,17 @@
// private static ByteBuffer bb = ByteBuffer.allocate(4);
// Logger
- private final static Logger log =
+ private final static Logger log =
LoggerFactory.getLogger(KrillCollection.class);
-
// This advices the java compiler to ignore all loggings
public static final boolean DEBUG = false;
-
- public static CacheManager cacheManager;
- public static Cache cache;
+ private double start, end; // for debugging
/**
* Construct a new KrillCollection.
*
*/
- public KrillCollection () {
- initializeCache();
- };
+ public KrillCollection () {};
/**
@@ -99,7 +92,6 @@
* The {@link KrillIndex} object.
*/
public KrillCollection (KrillIndex index) {
- initializeCache();
this.index = index;
};
@@ -110,7 +102,6 @@
* The KoralQuery document as a JSON string.
*/
public KrillCollection (String jsonString) {
- initializeCache();
try {
JsonNode json = mapper.readTree(jsonString);
@@ -145,16 +136,7 @@
};
};
- public static void initializeCache () {
- if (cacheManager == null) {
- cacheManager = CacheManager.newInstance();
- }
- if (cache == null) {
- cache = cacheManager.getCache("named_vc");
- }
- }
-
-
+
/**
* Set the {@link KrillIndex} the virtual collection refers to.
*
@@ -175,7 +157,7 @@
* @throws QueryException
*/
public KrillCollection fromKoral (String jsonString) throws QueryException {
- this.prefiltered = null;
+ this.prefiltered = null;
try {
this.fromKoral((JsonNode) mapper.readTree(jsonString));
}
@@ -581,9 +563,35 @@
*/
public FixedBitSet bits (LeafReaderContext atomic) throws IOException, QueryException {
+ // EM: really need a fixedBitset?
+ // maybe better use org.apache.lucene.util.BitDocIdSet.Builder
+ // for automatic sparse bitset support
+ // appears possible by implementing a SparseDocBits class extending
+ // SparseFixedBitSet and implementing Serializable (only as marker interface)
LeafReader r = atomic.reader();
FixedBitSet bitset = new FixedBitSet(r.maxDoc());
- DocIdSet docids = this.getDocIdSet(atomic, (Bits) r.getLiveDocs());
+
+ if (DEBUG) {
+ start = System.currentTimeMillis();
+ }
+ DocIdSet docids = null;
+        try {
+            docids = this.getDocIdSet(atomic, (Bits) r.getLiveDocs());
+        }
+        catch (RuntimeException e) {
+            Throwable t = e.getCause(); // checked failures arrive wrapped
+            if (t instanceof IOException) {
+                throw new IOException(t);
+            }
+            if (t instanceof QueryException) {
+                throw new QueryException(((QueryException) t).getErrorCode(), t.getLocalizedMessage());
+            }
+            throw e; // anything else must not be swallowed (docids would silently stay null)
+        }
+ if (DEBUG) {
+ end = System.currentTimeMillis();
+ log.info("getDocIdSet in bits: " + (end - start));
+ }
if (docids == null) {
@@ -619,11 +627,13 @@
int maxDoc = atomic.reader().maxDoc();
FixedBitSet bitset = new FixedBitSet(maxDoc);
- Filter filter;
- if (this.cbi == null || (filter = this.toFilter()) == null) {
- if (acceptDocs == null) return null;
- bitset.set(0, maxDoc);
- }
+ final Filter filter = this.toFilter();
+
+ if (filter == null) {
+ if (acceptDocs == null)
+ return null;
+ bitset.set(0, maxDoc);
+ }
else {
// Init vector
@@ -838,30 +848,13 @@
};
return str;
};
-
-
- public void storeInCache (String cacheKey) throws IOException, QueryException {
- if (cacheKey ==null || cacheKey.isEmpty()) {
- this.addError(StatusCodes.MISSING_ID,
- "Collection name is required for caching.");
- }
+ public void storeInCache (String vcId) throws IOException, QueryException {
- List<LeafReaderContext> leaves = this.index.reader().leaves();
- Map<Integer, DocBits> docIdMap =
- new HashMap<Integer, DocBits>(leaves.size());
+ }
- for (LeafReaderContext context : leaves) {
- if (docIdMap.get(context.hashCode()) == null) {
- FixedBitSet bitset = bits(context);
- DocBits docBits = new DocBits(bitset.getBits(), bitset.length());
- docIdMap.put(context.hashCode(),
- docBits);
- }
- }
-
- CachedVCData cc = new CachedVCData(docIdMap);
- cache.put(new Element(cacheKey, cc));
- this.cbi = cb.namedVC(cacheKey, cc);
+ @Override
+ public Set<String> getAllLeafFingerprints () {
+ return index.getAllLeafFingerprints();
}
/*
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 60805bd..4e9ea06 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -3,16 +3,22 @@
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.time.LocalDate;
// Java core classes
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Base64;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
-import java.time.LocalDate;
-
-import java.security.NoSuchAlgorithmException;
-import java.security.MessageDigest;
-
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.document.Document;
@@ -51,6 +57,7 @@
import com.fasterxml.jackson.databind.ObjectMapper;
+import de.ids_mannheim.korap.cache.VirtualCorpusCache;
// Krill classes
import de.ids_mannheim.korap.index.FieldDocument;
import de.ids_mannheim.korap.index.KeywordAnalyzer;
@@ -65,8 +72,9 @@
import de.ids_mannheim.korap.response.Result;
import de.ids_mannheim.korap.response.SearchContext;
import de.ids_mannheim.korap.response.Text;
-import de.ids_mannheim.korap.util.KrillProperties;
+import de.ids_mannheim.korap.util.Fingerprinter;
import de.ids_mannheim.korap.util.KrillDate;
+import de.ids_mannheim.korap.util.KrillProperties;
import de.ids_mannheim.korap.util.QueryException;
/**
@@ -135,7 +143,7 @@
-> search for frequencies of VVFIN/gehen
-> c:VVFIN:[^:]*?:gehen:past:...
*/
-public final class KrillIndex {
+public final class KrillIndex implements IndexInfo {
// Logger
private final static Logger log = LoggerFactory.getLogger(KrillIndex.class);
@@ -387,8 +395,6 @@
this.writer().commit();
commitCounter = 0;
this.closeReader();
- if (KrillCollection.cache != null)
- KrillCollection.cache.removeAll();
};
@@ -1715,25 +1721,9 @@
if (this.reader() == null) {
return "null";
}
-
- MessageDigest md;
- try {
- // MD5 used for fingerprinting (no security implications here)
- md = MessageDigest.getInstance("MD5");
- }
- catch (NoSuchAlgorithmException e) {
- log.error(e.getMessage());
- return e.getMessage();
- };
String hash = this.reader().getCombinedCoreAndDeletesKey().toString();
-
- md.update(hash.getBytes());
-
- // Turn bytes into Base64 string
- this.indexRevision = new String(
- Base64.getEncoder().encode(md.digest())
- );
+ this.indexRevision = Fingerprinter.create(hash);
return this.indexRevision;
};
@@ -1856,4 +1846,18 @@
public boolean isReaderOpen () {
return readerOpen;
}
+
+    /** Returns one fingerprint per leaf of the current reader (none without a reader). */
+    @Override
+    public Set<String> getAllLeafFingerprints () {
+        final Set<String> fingerprints = new HashSet<>();
+        if (this.reader() == null) {
+            return fingerprints;
+        }
+        for (LeafReaderContext context : this.reader().leaves()) {
+            fingerprints.add(Fingerprinter.create(
+                    context.reader().getCombinedCoreAndDeletesKey().toString()));
+        }
+        return fingerprints;
+    }
};
diff --git a/src/main/java/de/ids_mannheim/korap/cache/VirtualCorpusCache.java b/src/main/java/de/ids_mannheim/korap/cache/VirtualCorpusCache.java
new file mode 100644
index 0000000..84cc29d
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/cache/VirtualCorpusCache.java
@@ -0,0 +1,371 @@
+package de.ids_mannheim.korap.cache;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.function.Supplier;
+
+import org.apache.lucene.index.LeafReaderContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import de.ids_mannheim.korap.IndexInfo;
+import de.ids_mannheim.korap.KrillIndex;
+import de.ids_mannheim.korap.collection.DocBits;
+import de.ids_mannheim.korap.collection.VirtualCorpusReferenceFilter;
+import de.ids_mannheim.korap.collection.VirtualCorpusReferenceFilter.DocBitsSupplier;
+import de.ids_mannheim.korap.util.Fingerprinter;
+import de.ids_mannheim.korap.util.QueryException;
+
+/**
+ * Cache for the document bits of virtual corpora (VC).
+ *
+ * For every VC the cache keeps a map from leaf fingerprints to
+ * {@link DocBits}. At most {@link #CAPACITY} VCs are kept in memory
+ * (the least recently used VC is dropped first). All doc bits are
+ * also serialized below {@link #CACHE_LOCATION}, using one directory
+ * per VC and one file per leaf fingerprint.
+ *
+ * NOTE(review): vcId is used as a directory name without validation;
+ * confirm that callers never pass ids containing path separators.
+ *
+ * @author margaretha
+ *
+ */
+public class VirtualCorpusCache {
+
+    private static final Logger log = LoggerFactory
+            .getLogger(VirtualCorpusCache.class);
+
+    public static final String CACHE_LOCATION = "vc-cache";
+    public static final int CAPACITY = 5;
+    // Access-ordered map that drops its eldest entry beyond CAPACITY
+    public static final Map<String, Map<String, DocBits>> map = Collections
+            .synchronizedMap(new LinkedHashMap<String, Map<String, DocBits>>(
+                    CAPACITY, (float) 0.75, true) {
+
+                private static final long serialVersionUID = 1815514581428132435L;
+
+                @SuppressWarnings("rawtypes")
+                @Override
+                protected boolean removeEldestEntry (Map.Entry eldest) {
+                    return size() > CAPACITY;
+                }
+            });
+
+    private static IndexInfo indexInfo;
+
+    // VCs whose cached data may contain outdated leaf fingerprints
+    private static final Set<String> vcToCleanUp = Collections
+            .synchronizedSet(new HashSet<>());
+
+
+    /**
+     * Creates the cache directory if it does not exist yet.
+     */
+    public VirtualCorpusCache () {
+        File dir = new File(CACHE_LOCATION);
+        dir.mkdirs();
+    }
+
+
+    // Fingerprints created by Fingerprinter are Base64 strings and may
+    // contain '/', which cannot be part of a file name. '_' never
+    // occurs in standard Base64, so the replacement is reversible.
+    private static String toFileName (String leafFingerprint) {
+        return leafFingerprint.replace('/', '_');
+    }
+
+
+    // Inverse of toFileName
+    private static String toLeafFingerprint (String fileName) {
+        return fileName.replace('_', '/');
+    }
+
+
+    /**
+     * Serializes the doc bits of a single leaf into the directory of
+     * the VC, replacing a previously stored file. Failures are logged
+     * but not propagated.
+     *
+     * @param vcId
+     *            the VC id, used as directory name
+     * @param leafFingerprint
+     *            the fingerprint of the leaf, used as file name
+     * @param docBits
+     *            the doc bits to serialize
+     */
+    public static void storeOnDisk (String vcId, String leafFingerprint,
+            DocBits docBits) {
+        File dir = new File(CACHE_LOCATION + "/" + vcId);
+        if (!dir.exists()) {
+            dir.mkdirs();
+        }
+
+        File f = new File(dir, toFileName(leafFingerprint));
+        // FileOutputStream overwrites an existing file, and
+        // try-with-resources closes the stream even if writing fails
+        try (ObjectOutputStream os = new ObjectOutputStream(
+                new FileOutputStream(f))) {
+            os.writeObject(docBits);
+        }
+        catch (IOException e) {
+            log.error("Cannot write " + f.getPath(), e);
+            // a partially written file could not be read back
+            f.delete();
+        }
+    }
+
+
+    /**
+     * Puts the data of a VC into the in-memory cache and writes all
+     * of its leaves to disk.
+     *
+     * @param vcId
+     *            the VC id
+     * @param vcData
+     *            doc bits by leaf fingerprint
+     */
+    public static void store (String vcId, Map<String, DocBits> vcData) {
+        map.put(vcId, vcData);
+        for (Map.Entry<String, DocBits> leaf : vcData.entrySet()) {
+            storeOnDisk(vcId, leaf.getKey(), leaf.getValue());
+        }
+    }
+
+
+    /**
+     * Caches the doc bits of a VC for all leaves of the index,
+     * calculating those that are not cached yet.
+     *
+     * @param vcId
+     *            the VC id
+     * @param index
+     *            the index whose leaves are processed
+     * @throws QueryException
+     *             if calculating the doc bits fails with a query error
+     * @throws IOException
+     *             if calculating the doc bits fails with an I/O error
+     */
+    public static void store (String vcId, KrillIndex index)
+            throws QueryException, IOException {
+
+        DocBitsSupplier docBitsSupplier = new VirtualCorpusReferenceFilter(
+                vcId).new DocBitsSupplier();
+        for (LeafReaderContext context : index.reader().leaves()) {
+            String leafFingerprint = Fingerprinter.create(
+                    context.reader().getCombinedCoreAndDeletesKey().toString());
+
+            try {
+                getDocBits(vcId, leafFingerprint, () -> {
+                    try {
+                        return docBitsSupplier.supplyDocBits(context,
+                                context.reader().getLiveDocs());
+                    }
+                    catch (IOException | QueryException e) {
+                        throw new RuntimeException(e);
+                    }
+                });
+            }
+            catch (RuntimeException e) {
+                // hand the wrapped checked exceptions back to the caller
+                Throwable cause = e.getCause();
+                if (cause instanceof IOException) {
+                    throw (IOException) cause;
+                }
+                if (cause instanceof QueryException) {
+                    throw (QueryException) cause;
+                }
+                throw e;
+            }
+        }
+    }
+
+
+    /**
+     * Gets the data of a VC from memory or, if it is not there, from
+     * disk. Data read from disk is put into the in-memory cache.
+     *
+     * @param vcId
+     *            the VC id
+     * @return doc bits by leaf fingerprint, or <code>null</code> if
+     *         the VC is not cached or a cache file cannot be read
+     */
+    public static Map<String, DocBits> retrieve (String vcId) {
+        // a single get() cannot miss an entry that is evicted
+        // between a containsKey() and a get()
+        Map<String, DocBits> vcData = map.get(vcId);
+        if (vcData != null) {
+            return vcData;
+        }
+
+        File dir = new File(CACHE_LOCATION + "/" + vcId);
+        // listFiles() returns null if dir is missing or not a directory
+        File[] leafFiles = dir.listFiles();
+        if (leafFiles == null) {
+            return null;
+        }
+
+        // synchronized like the maps created in getDocBits()
+        vcData = Collections.synchronizedMap(new HashMap<String, DocBits>());
+        for (File f : leafFiles) {
+            try (ObjectInputStream ois = new ObjectInputStream(
+                    new FileInputStream(f))) {
+                vcData.put(toLeafFingerprint(f.getName()),
+                        (DocBits) ois.readObject());
+            }
+            catch (IOException | ClassNotFoundException e) {
+                log.warn("Cannot read " + f.getPath(), e);
+                return null;
+            }
+        }
+        map.put(vcId, vcData);
+        return vcData;
+    }
+
+
+    /**
+     * Checks if a VC is cached in memory or on disk.
+     */
+    public static boolean contains (String vcId) {
+        return map.containsKey(vcId)
+                || new File(CACHE_LOCATION + "/" + vcId).exists();
+    }
+
+
+    /**
+     * Empties the cache in memory and deletes the cache directory.
+     */
+    public static void reset () {
+        vcToCleanUp.clear();
+        map.clear();
+
+        File vcCache = new File(VirtualCorpusCache.CACHE_LOCATION + "/");
+        // null if the cache directory does not exist
+        File[] vcDirs = vcCache.listFiles();
+        if (vcDirs == null) {
+            return;
+        }
+        for (File vc : vcDirs) {
+            File[] leafFiles = vc.listFiles();
+            if (leafFiles != null) {
+                for (File f : leafFiles) {
+                    f.delete();
+                }
+            }
+            vc.delete();
+        }
+        vcCache.delete();
+    }
+
+
+    /**
+     * Sets the index information used to find outdated leaf
+     * fingerprints and cleans up all VCs marked for clean up.
+     * A VC is marked when a requested leaf fingerprint is missing
+     * in its cached data, see
+     * {@link #getDocBits(String, String, Supplier)}.
+     *
+     * @param indexInfo
+     *            information about the current index; without it
+     *            (<code>null</code>) no clean up takes place
+     */
+    public static void setIndexInfo (IndexInfo indexInfo) {
+        VirtualCorpusCache.indexInfo = indexInfo;
+        synchronized (vcToCleanUp) {
+            if (indexInfo != null && !vcToCleanUp.isEmpty()) {
+                cleanup();
+            }
+        }
+    }
+
+
+    // Removes all leaves that are not part of the current index from
+    // the VCs marked for clean up, in memory and on disk.
+    private static void cleanup () {
+        final Set<String> currentLeafFingerprints = indexInfo
+                .getAllLeafFingerprints();
+        for (String vcId : vcToCleanUp) {
+            Map<String, DocBits> vcData = map.get(vcId);
+            if (vcData == null) {
+                // the VC has been evicted from memory in the meantime
+                continue;
+            }
+            File dir = new File(CACHE_LOCATION + "/" + vcId);
+            vcData.keySet().removeIf(storedFingerprint -> {
+                if (currentLeafFingerprints.contains(storedFingerprint)) {
+                    return false;
+                }
+                // also delete the outdated file, otherwise the leaf
+                // would come back the next time the VC is read from disk
+                new File(dir, toFileName(storedFingerprint)).delete();
+                return true;
+            });
+            store(vcId, vcData);
+        }
+        vcToCleanUp.clear();
+    }
+
+
+    /**
+     * Gets DocBits for a single leaf from the VC cache or calculates
+     * and stores it, if it doesn't exist in the cache. This can
+     * happen when:
+     * <ul>
+     * <li>The VC has not been cached before.
+     * <p>
+     * The VC will be cached with a single leaf fingerprint in a
+     * leafToDocBitMap. The map will be updated for the other leaf
+     * fingerprints and thus be cleaned up once.
+     * </p>
+     * </li>
+     * <li>The index has been updated.
+     * <p>
+     * In this case, the VC may contain old leaf fingerprints. It will
+     * be cleaned up when the index is used next time.
+     * </p>
+     * </li>
+     * </ul>
+     *
+     * @see #setIndexInfo(IndexInfo)
+     * @param vcId
+     *            the VC id
+     * @param leafFingerprint
+     *            the fingerprint of the requested leaf
+     * @param calculateDocBits
+     *            a supplier calculating the DocBits
+     * @return DocBits
+     */
+    public static DocBits getDocBits (String vcId, String leafFingerprint,
+            Supplier<DocBits> calculateDocBits) {
+        DocBits docBits = null;
+        Map<String, DocBits> leafToDocBitMap = retrieve(vcId);
+        if (leafToDocBitMap == null) {
+            leafToDocBitMap = Collections
+                    .synchronizedMap(new HashMap<String, DocBits>());
+            map.put(vcId, leafToDocBitMap);
+        }
+        else {
+            docBits = leafToDocBitMap.get(leafFingerprint);
+            if (docBits == null) {
+                // either a new leaf or outdated leaves in the map
+                vcToCleanUp.add(vcId);
+            }
+        }
+        if (docBits == null) {
+            docBits = calculateDocBits.get();
+            leafToDocBitMap.put(leafFingerprint, docBits);
+            storeOnDisk(vcId, leafFingerprint, docBits);
+        }
+        return docBits;
+    }
+}
diff --git a/src/main/java/de/ids_mannheim/korap/collection/CachedVCData.java b/src/main/java/de/ids_mannheim/korap/collection/CachedVCData.java
deleted file mode 100644
index ca97d52..0000000
--- a/src/main/java/de/ids_mannheim/korap/collection/CachedVCData.java
+++ /dev/null
@@ -1,77 +0,0 @@
-package de.ids_mannheim.korap.collection;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.ObjectInputStream;
-import java.io.ObjectOutputStream;
-import java.io.Serializable;
-import java.util.Map;
-
-/**
- * Virtual corpus data to cache
- *
- * @author margaretha
- *
- */
-public class CachedVCData implements Serializable {
-
- /**
- * Auto generated
- *
- */
- private static final long serialVersionUID = 5635087441839303653L;
-
- private Map<Integer, DocBits> docIdMap;
-
- public CachedVCData (Map<Integer, DocBits> docIdMap) {
- this.docIdMap = docIdMap;
- }
-
- public Map<Integer, DocBits> getDocIdMap () {
- return docIdMap;
- }
-
- public void setDocIdMap (Map<Integer, DocBits> docIdMap) {
- this.docIdMap = docIdMap;
- }
-
- // EM: for optimization. has not been checked.
- // ehcache retrieves a byte[] much faster than a map, however,
- // there is an additional cost for converting a map to a byte[]
- // and vice versa.
-
- private byte[] toByteArray () throws IOException {
- ByteArrayOutputStream bos = new ByteArrayOutputStream();
- ObjectOutputStream oos = new ObjectOutputStream(bos);
- oos.writeObject(docIdMap);
- oos.flush();
- return bos.toByteArray();
- }
-
- private Map<Integer, DocBits> toMap (byte[] bytes)
- throws ClassNotFoundException, IOException {
- ByteArrayInputStream bis = null;
- ObjectInputStream ois = null;
- Map<Integer, DocBits> map = null;
- try {
- bis = new ByteArrayInputStream(bytes);
- ois = new ObjectInputStream(bis);
- map = (Map<Integer, DocBits>) ois.readObject();
-
- }
- finally {
- if (bis != null) {
- bis.close();
- }
- if (ois != null) {
- ois.close();
- }
- }
- return map;
- }
-
- public String toString () {
- return this.docIdMap.toString();
- }
-}
diff --git a/src/main/java/de/ids_mannheim/korap/collection/CachedVCFilter.java b/src/main/java/de/ids_mannheim/korap/collection/CachedVCFilter.java
deleted file mode 100644
index de851c1..0000000
--- a/src/main/java/de/ids_mannheim/korap/collection/CachedVCFilter.java
+++ /dev/null
@@ -1,50 +0,0 @@
-package de.ids_mannheim.korap.collection;
-
-import java.io.IOException;
-
-import org.apache.logging.log4j.LogManager;
-import org.apache.logging.log4j.Logger;
-import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.search.DocIdSet;
-import org.apache.lucene.search.Filter;
-import org.apache.lucene.util.Bits;
-
-/**
- * Filter for virtual corpus/collection existing in the cache.
- *
- * @author margaretha
- *
- */
-public class CachedVCFilter extends Filter {
-
- public static final boolean DEBUG = false;
-
- public static Logger jlog = LogManager.getLogger(CachedVCFilter.class);
-
- private CachedVCData cachedCollection;
- private String cacheKey;
-
- public CachedVCFilter (String cacheKey, CachedVCData cachedCollection) {
- this.cacheKey = cacheKey;
- this.cachedCollection = cachedCollection;
- }
-
- @Override
- public DocIdSet getDocIdSet (LeafReaderContext context, Bits acceptDocs)
- throws IOException {
- DocBits docBits =
- cachedCollection.getDocIdMap().get(context.hashCode());
-
- if (docBits == null) {
- if (DEBUG)
- jlog.debug("LeafReaderContext is not found in the cache.");
- return null;
- }
- return docBits.createBitDocIdSet();
- }
-
- @Override
- public String toString () {
- return "referTo(cached:" + this.cacheKey + ")";
- };
-}
diff --git a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
index 9f6dbe7..9f7e30f 100644
--- a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
+++ b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
@@ -1,15 +1,9 @@
package de.ids_mannheim.korap.collection;
-import java.io.File;
-import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.Iterator;
-import java.util.Map;
-import java.util.Properties;
-import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.queries.TermsFilter;
import org.apache.lucene.search.Filter;
@@ -20,14 +14,11 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import de.ids_mannheim.korap.IndexInfo;
import de.ids_mannheim.korap.KrillCollection;
import de.ids_mannheim.korap.index.TextPrependedTokenStream;
import de.ids_mannheim.korap.util.KrillDate;
-import de.ids_mannheim.korap.util.KrillProperties;
import de.ids_mannheim.korap.util.QueryException;
-import net.sf.ehcache.Cache;
-import net.sf.ehcache.CacheManager;
-import net.sf.ehcache.Element;
/*
@@ -41,17 +32,20 @@
*/
public class CollectionBuilder {
-
- public final static CacheManager cacheManager = CacheManager.newInstance();
- public final static Cache cache = cacheManager.getCache("named_vc");
-
-
// Logger
private final static Logger log = LoggerFactory
.getLogger(KrillCollection.class);
// This advices the java compiler to ignore all loggings
public static final boolean DEBUG = false;
+
+ protected IndexInfo indexInfo;
+
+ public CollectionBuilder () {}
+
+ public CollectionBuilder (IndexInfo indexInfo) {
+ this.indexInfo = indexInfo;
+ }
public CollectionBuilder.Interface term (String field, String term) {
@@ -138,7 +132,7 @@
};
public CollectionBuilder.Interface referTo (String reference) {
- return new CollectionBuilder.Reference(reference);
+ return new CollectionBuilder.VirtualCorpusReference(reference);
};
@@ -275,55 +269,25 @@
};
- public class Reference implements CollectionBuilder.Interface {
+ public class VirtualCorpusReference implements CollectionBuilder.Interface {
private boolean isNegative = false;
- private String reference;
- private Map<Integer, DocBits> docIdMap =
- new HashMap<Integer, DocBits>();
-
- public Reference (String reference) {
- this.reference = reference;
+ private String vcId;
+ private VirtualCorpusReferenceFilter vcRefFilter;
+
+
+ public VirtualCorpusReference (String vcId) {
+ this.vcId = vcId;
+ vcRefFilter = new VirtualCorpusReferenceFilter(vcId);
+// VirtualCorpusCache.setIndexInfo(indexInfo);
};
public Filter toFilter () throws QueryException {
- Element element = null;
- if (KrillCollection.cache != null){
- element = KrillCollection.cache.get(this.reference);
- }
- if (element == null) {
- if (DEBUG) {
- log.debug(reference + " is NOT found in the cache");
- }
- KrillCollection kc = new KrillCollection();
-
- kc.fromStore(this.reference);
-
- if (kc.hasErrors()) {
- throw new QueryException(
- kc.getError(0).getCode(),
- kc.getError(0).getMessage()
- );
- };
-
- return new ToCacheVCFilter(
- this.reference,
- docIdMap,
- kc.getBuilder(),
- kc.toFilter()
- );
- }
- else {
- if (DEBUG) {
- log.debug(reference + " is FOUND in the cache.");
- }
- CachedVCData cc = (CachedVCData) element.getObjectValue();
- return new CachedVCFilter(this.reference, cc);
- }
+ return vcRefFilter;
};
public String toString () {
- return "referTo(" + this.reference + ")";
+ return "referTo(" + this.vcId + ")";
};
@@ -336,37 +300,6 @@
this.isNegative = true;
return this;
};
-
- private String loadVCFile (String ref) {
- Properties prop = KrillProperties.loadDefaultProperties();
- if (prop == null){
- /*
- this.addError(StatusCodes.MISSING_KRILL_PROPERTIES,
- "krill.properties is not found.");
- */
- return null;
- }
-
- String namedVCPath = prop.getProperty("krill.namedVC");
- if (!namedVCPath.endsWith("/")){
- namedVCPath += "/";
- }
- File file = new File(namedVCPath+ref+".jsonld");
-
- String json = null;
- try {
- FileInputStream fis = new FileInputStream(file);
- json = IOUtils.toString(fis);
- }
- catch (IOException e) {
- /*
- this.addError(StatusCodes.MISSING_COLLECTION,
- "Collection is not found.");
- */
- return null;
- }
- return json;
- }
};
@@ -494,82 +427,4 @@
return this;
};
};
-
- /** Builder for virtual corpus / collection existing in the cache
- *
- * @author margaretha
- *
- */
- public class CachedVC implements CollectionBuilder.Interface {
-
- private String cacheKey;
- private CachedVCData cachedCollection;
- private boolean isNegative = false;
-
- public CachedVC (String vcRef, CachedVCData cc) {
- this.cacheKey = vcRef;
- this.cachedCollection = cc;
- }
-
- @Override
- public Filter toFilter () {
- return new CachedVCFilter(this.cacheKey, cachedCollection);
- }
-
- @Override
- public boolean isNegative () {
- return this.isNegative;
- }
-
- @Override
- public CollectionBuilder.Interface not () {
- this.isNegative = true;
- return this;
- }
-
- }
-
- /** Wraps a sub CollectionBuilder.Interface to allows VC caching
- *
- * @author margaretha
- *
- */
- public class ToCacheVC implements CollectionBuilder.Interface {
-
- private CollectionBuilder.Interface child;
- private String cacheKey;
-
- private Map<Integer, DocBits> docIdMap;
-
- public ToCacheVC (String vcRef, Interface cbi) {
- this.child = cbi;
- this.cacheKey = vcRef;
- this.docIdMap = new HashMap<Integer, DocBits>();
- }
-
- @Override
- public Filter toFilter () throws QueryException {
- return new ToCacheVCFilter(cacheKey,docIdMap, child, child.toFilter());
- }
-
- @Override
- public boolean isNegative () {
- return child.isNegative();
- }
-
- @Override
- public CollectionBuilder.Interface not () {
- // not supported
- return this;
- }
- }
-
- // Maybe irrelevant
- public Interface namedVC (String vcRef, CachedVCData cc) {
- return new CollectionBuilder.CachedVC(vcRef, cc);
- }
-
- public Interface toCacheVC (String vcRef, Interface cbi) {
- return new CollectionBuilder.ToCacheVC(vcRef, cbi);
- }
};
diff --git a/src/main/java/de/ids_mannheim/korap/collection/ToCacheVCFilter.java b/src/main/java/de/ids_mannheim/korap/collection/ToCacheVCFilter.java
deleted file mode 100644
index 072e1dc..0000000
--- a/src/main/java/de/ids_mannheim/korap/collection/ToCacheVCFilter.java
+++ /dev/null
@@ -1,93 +0,0 @@
-package de.ids_mannheim.korap.collection;
-
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.search.DocIdSet;
-import org.apache.lucene.search.Filter;
-import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.FixedBitSet;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import de.ids_mannheim.korap.KrillCollection;
-import de.ids_mannheim.korap.collection.CollectionBuilder.Interface;
-import net.sf.ehcache.Element;
-
-/**
- * Filter for virtual corpus / collection that should be cached.
- *
- * @author margaretha
- *
- */
-public class ToCacheVCFilter extends Filter {
- private Filter filter;
- private CollectionBuilder.Interface cbi;
- private String cacheKey;
- private Map<Integer, DocBits> docIdMap;
- // EM: auto caching is disabled regarding issue #44
- private boolean isAutoCachingEnabled = false;
-
- public final static Logger log = LoggerFactory.getLogger(ToCacheVCFilter.class);
- public static final boolean DEBUG = false;
-
- public ToCacheVCFilter (String cacheKey, Map<Integer, DocBits> docIdMap,
- Interface cbi, Filter filter) {
- this.cacheKey = cacheKey;
- this.docIdMap = docIdMap;
- this.cbi = cbi;
- this.filter = filter;
- }
-
- @Override
- public DocIdSet getDocIdSet (LeafReaderContext context, Bits acceptDocs)
- throws IOException {
-
- DocIdSet docIdSet = filter.getDocIdSet(context, acceptDocs);
-
- final LeafReader reader = context.reader();
- int maxDoc = reader.maxDoc();
- FixedBitSet bitset = new FixedBitSet(maxDoc);
-
- if (docIdSet == null) {
- if (cbi.isNegative()) {
- bitset.set(0, maxDoc);
- }
- else {
- bitset.clear(0, maxDoc);
- }
- }
- else {
- bitset.or(docIdSet.iterator());
- if (cbi.isNegative()) {
- bitset.flip(0, maxDoc);
- }
- }
-
- if (isAutoCachingEnabled) {
- docIdMap.put(context.hashCode(),
- new DocBits(bitset.getBits(), bitset.length()));
- CachedVCData cachedVCData =
- new CachedVCData(new HashMap<>(docIdMap));
-
- if (KrillCollection.cache == null){
- KrillCollection.initializeCache();
- }
- KrillCollection.cache.remove(cacheKey);
- KrillCollection.cache.put(new Element(cacheKey, cachedVCData));
- }
-
- if (DEBUG){
- log.debug("To cache doc bits length: "+ docIdSet.bits().length());
- }
- return docIdSet;
- }
-
- @Override
- public String toString () {
- return "referTo(" + this.cacheKey + ")";
- };
-}
diff --git a/src/main/java/de/ids_mannheim/korap/collection/VirtualCorpusReferenceFilter.java b/src/main/java/de/ids_mannheim/korap/collection/VirtualCorpusReferenceFilter.java
new file mode 100644
index 0000000..2ee60bd
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/collection/VirtualCorpusReferenceFilter.java
@@ -0,0 +1,131 @@
+package de.ids_mannheim.korap.collection;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.FixedBitSet;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import de.ids_mannheim.korap.KrillCollection;
+import de.ids_mannheim.korap.cache.VirtualCorpusCache;
+import de.ids_mannheim.korap.util.Fingerprinter;
+import de.ids_mannheim.korap.util.QueryException;
+
+/**
+ * Filter for a referenced virtual corpus (VC). The doc bits of each
+ * leaf are taken from the {@link VirtualCorpusCache} and are only
+ * calculated from the stored VC when the cache has no entry for the
+ * leaf.
+ */
+public class VirtualCorpusReferenceFilter extends Filter {
+
+    public final static Logger log = LoggerFactory
+            .getLogger(VirtualCorpusReferenceFilter.class);
+    public static boolean DEBUG = false;
+
+    private String vcId;
+    private DocBitsSupplier docBitsSupplier;
+
+    /**
+     * @param vcId
+     *            the id of the referenced VC
+     */
+    public VirtualCorpusReferenceFilter (String vcId) {
+        this.vcId = vcId;
+        docBitsSupplier = new DocBitsSupplier();
+    }
+
+
+    /**
+     * Returns the documents of the VC within a single leaf. An
+     * IOException or QueryException raised while calculating doc
+     * bits reaches the caller as the cause of a RuntimeException.
+     */
+    @Override
+    public DocIdSet getDocIdSet (LeafReaderContext context, Bits acceptDocs)
+            throws IOException {
+        String leafFingerprint = Fingerprinter.create(
+                context.reader().getCombinedCoreAndDeletesKey().toString());
+
+        DocBits docBits = VirtualCorpusCache.getDocBits(vcId, leafFingerprint,
+                () -> {
+                    try {
+                        return docBitsSupplier.supplyDocBits(context, acceptDocs);
+                    }
+                    catch (IOException | QueryException e) {
+                        // a Supplier cannot throw checked exceptions
+                        throw new RuntimeException(e);
+                    }
+                });
+        return docBits.createBitDocIdSet();
+    }
+
+    /**
+     * Calculates the doc bits of the VC for single leaves. The VC is
+     * loaded on first use.
+     */
+    public class DocBitsSupplier {
+
+        private Filter filter;
+        private CollectionBuilder.Interface cbi;
+
+        public DocBitsSupplier () {}
+
+        /**
+         * Applies the filter of the VC to a leaf.
+         *
+         * @throws QueryException
+         *             if loading the VC reported an error
+         */
+        public DocBits supplyDocBits (LeafReaderContext context,
+                Bits acceptDocs) throws IOException, QueryException {
+            if (cbi == null || filter == null) {
+                KrillCollection kc = new KrillCollection();
+                // load from file
+                kc.fromStore(vcId);
+                if (kc.hasErrors()) {
+                    throw new QueryException(kc.getError(0).getCode(),
+                            kc.getError(0).getMessage());
+                }
+
+                this.cbi = kc.getBuilder();
+                this.filter = kc.toFilter();
+            }
+
+            DocIdSet docIdSet = filter.getDocIdSet(context, acceptDocs);
+            return calculateDocBits(docIdSet, context.reader().maxDoc());
+        }
+
+
+        // Turns the matches of the filter into doc bits, inverted
+        // for a negative collection.
+        private DocBits calculateDocBits (DocIdSet docIdSet, int maxDoc)
+                throws IOException {
+            // all bits of a new FixedBitSet are unset
+            FixedBitSet bitset = new FixedBitSet(maxDoc);
+
+            // DocIdSet.iterator() may return null if nothing matches;
+            // FixedBitSet.or() does not accept that
+            DocIdSetIterator matches = (docIdSet == null) ? null
+                    : docIdSet.iterator();
+            if (matches != null) {
+                bitset.or(matches);
+            }
+            if (cbi.isNegative()) {
+                bitset.flip(0, maxDoc);
+            }
+
+            return new DocBits(bitset.getBits(), bitset.length());
+        }
+    }
+
+    @Override
+    public String toString () {
+        return "VirtualCorpusReferenceFilter("+vcId+")";
+    }
+}
diff --git a/src/main/java/de/ids_mannheim/korap/util/Fingerprinter.java b/src/main/java/de/ids_mannheim/korap/util/Fingerprinter.java
new file mode 100644
index 0000000..59113c3
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/util/Fingerprinter.java
@@ -0,0 +1,47 @@
+package de.ids_mannheim.korap.util;
+
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.Base64;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Creates short string fingerprints from arbitrary string keys.
+ */
+public class Fingerprinter {
+
+    private final static Logger log = LoggerFactory
+            .getLogger(Fingerprinter.class);
+
+    /**
+     * Returns the Base64-encoded MD5 digest of the given key.
+     * The key is encoded as UTF-8 so that the result does not depend
+     * on the platform default charset.
+     *
+     * @param key
+     *            the string to fingerprint; must not be null
+     * @return the fingerprint as a Base64 string
+     * @throws IllegalStateException
+     *             if the MD5 algorithm is not available
+     */
+    public static String create (String key) {
+        // MessageDigest is stateful and not thread-safe, so every call
+        // uses its own instance instead of a shared static one.
+        MessageDigest md;
+        try {
+            md = MessageDigest.getInstance("MD5");
+        }
+        catch (NoSuchAlgorithmException e) {
+            // Returning the error message as a fingerprint would make all
+            // keys collide, so fail instead.
+            log.error(e.getMessage());
+            throw new IllegalStateException(e);
+        }
+
+        byte[] digest = md.digest(key.getBytes(StandardCharsets.UTF_8));
+        return Base64.getEncoder().encodeToString(digest);
+    }
+}
diff --git a/src/test/java/de/ids_mannheim/korap/cache/TestCache.java b/src/test/java/de/ids_mannheim/korap/cache/TestCache.java
index 06a882d..857765a 100644
--- a/src/test/java/de/ids_mannheim/korap/cache/TestCache.java
+++ b/src/test/java/de/ids_mannheim/korap/cache/TestCache.java
@@ -1,18 +1,18 @@
package de.ids_mannheim.korap.cache;
+import static org.junit.Assert.assertEquals;
+
import java.util.Collections;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
import net.sf.jsr107cache.Cache;
import net.sf.jsr107cache.CacheException;
import net.sf.jsr107cache.CacheFactory;
import net.sf.jsr107cache.CacheManager;
-import static org.junit.Assert.*;
-import org.junit.Test;
-import org.junit.Ignore;
-import org.junit.runner.RunWith;
-import org.junit.runners.JUnit4;
-
@RunWith(JUnit4.class)
public class TestCache {
diff --git a/src/test/java/de/ids_mannheim/korap/cache/TestVirtualCorpusCache.java b/src/test/java/de/ids_mannheim/korap/cache/TestVirtualCorpusCache.java
new file mode 100644
index 0000000..bd65ab2
--- /dev/null
+++ b/src/test/java/de/ids_mannheim/korap/cache/TestVirtualCorpusCache.java
@@ -0,0 +1,146 @@
+package de.ids_mannheim.korap.cache;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Map;
+
+import org.apache.commons.io.IOUtils;
+import org.junit.After;
+import org.junit.Test;
+
+import de.ids_mannheim.korap.Krill;
+import de.ids_mannheim.korap.KrillIndex;
+import de.ids_mannheim.korap.collection.DocBits;
+import de.ids_mannheim.korap.response.Result;
+import de.ids_mannheim.korap.util.QueryException;
+
+/**
+ * Tests storing, referencing and updating virtual corpora (VCs) in the
+ * {@link VirtualCorpusCache}.
+ */
+public class TestVirtualCorpusCache {
+
+    private KrillIndex ki;
+    private String queryRefJson;
+
+    public TestVirtualCorpusCache () throws IOException {
+        ki = createIndex();
+
+        String file = "/queries/collections/vc-ref/query-with-vc-ref.jsonld";
+        // Close the resource stream once the query has been read
+        try (InputStream is = getClass().getResourceAsStream(file)) {
+            assertNotNull("Missing test resource " + file, is);
+            queryRefJson = IOUtils.toString(is, "utf-8");
+        }
+    }
+
+
+    /**
+     * Resets the cache after every test, also after a failed one, so
+     * that a failing assertion does not leave cached VCs behind for
+     * the following tests.
+     */
+    @After
+    public void resetCache () throws IOException {
+        VirtualCorpusCache.reset();
+    }
+
+
+    // Builds an index from four wiki documents in two commits
+    private KrillIndex createIndex () throws IOException {
+        KrillIndex index = new KrillIndex();
+        String[] docIds = new String[] { "00001", "00002", "00003" };
+        int uid = 1;
+        for (String i : docIds) {
+            index.addDoc(uid++,
+                    getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
+                    true);
+        }
+        index.commit();
+
+        index.addDoc(uid++,
+                getClass().getResourceAsStream("/wiki/00004.json.gz"), true);
+        index.commit();
+        return index;
+    }
+
+
+    @Test
+    public void testStoreUncachedVC () throws IOException, QueryException {
+        String vcId = "named-vc4";
+
+        File f = new File(VirtualCorpusCache.CACHE_LOCATION + "/" + vcId);
+        assertFalse(f.exists());
+
+        VirtualCorpusCache.store(vcId, ki);
+        assertTrue(VirtualCorpusCache.contains(vcId));
+
+        Map<String, DocBits> docIdMap = VirtualCorpusCache.retrieve(vcId);
+        assertEquals(2, docIdMap.size());
+    }
+
+
+    @Test
+    public void testReferToUncachedVC () throws IOException, QueryException {
+        String vcId = "named-vc1";
+        assertFalse(VirtualCorpusCache.contains(vcId));
+
+        Krill krill = new Krill(queryRefJson);
+        Result result = krill.apply(ki);
+        assertEquals(27, result.getTotalResults());
+
+        assertTrue(VirtualCorpusCache.contains(vcId));
+        Map<String, DocBits> vc1 = VirtualCorpusCache.retrieve(vcId);
+        assertNotNull(vc1);
+    }
+
+
+    @Test
+    public void testUpdateCachedVC () throws IOException {
+        // VC cache will be marked for cleaning up
+        // because of storing a new VC
+        KrillIndex index = createIndex();
+        try {
+            Krill krill = new Krill(queryRefJson);
+            Result result = krill.apply(index);
+            assertEquals(27, result.getTotalResults());
+
+            assertEquals(2,
+                    VirtualCorpusCache.map.get("named-vc1").keySet().size());
+
+            index.delDoc(2);
+            index.commit();
+
+            // VC cache will be marked for cleaning up again
+            // because of index change.
+            krill = new Krill(queryRefJson);
+            result = krill.apply(index);
+            assertEquals(17, result.getTotalResults());
+
+            // The old leaf fingerprint should be cleaned up, thus the map
+            // should have the same size. But the fingerprints should be
+            // different from before the 1st cleaning up
+            assertEquals(2,
+                    VirtualCorpusCache.map.get("named-vc1").keySet().size());
+
+            // VC cache will be cleaned up for the 2nd time
+            // resulting the same leaf-fingerprints
+            krill = new Krill(queryRefJson);
+            result = krill.apply(index);
+            assertEquals(17, result.getTotalResults());
+
+            assertEquals(2,
+                    VirtualCorpusCache.map.get("named-vc1").keySet().size());
+        }
+        finally {
+            // Close the index even when an assertion fails
+            index.close();
+        }
+    }
+}
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestCollectionBuilder.java b/src/test/java/de/ids_mannheim/korap/collection/TestCollectionBuilder.java
index 2ee8cb1..9c500f1 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestCollectionBuilder.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestCollectionBuilder.java
@@ -97,10 +97,8 @@
public void builderReferenceNested () throws IOException {
CollectionBuilder kc = new CollectionBuilder();
- // The group can't stringify, because the filtering
- // phase won't work. This is acceptable.
assertEquals(
- "",
+ "OrGroup(VirtualCorpusReferenceFilter(example) opennlp:check)",
kc.orGroup().with(
kc.referTo("example")
).with(
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestCollectionCache.java b/src/test/java/de/ids_mannheim/korap/collection/TestCollectionCache.java
deleted file mode 100644
index ee3fef4..0000000
--- a/src/test/java/de/ids_mannheim/korap/collection/TestCollectionCache.java
+++ /dev/null
@@ -1,29 +0,0 @@
-package de.ids_mannheim.korap.collection;
-
-import static org.junit.Assert.assertTrue;
-
-import java.io.IOException;
-
-import org.junit.Test;
-
-import de.ids_mannheim.korap.KrillCollection;
-import de.ids_mannheim.korap.KrillIndex;
-import de.ids_mannheim.korap.index.FieldDocument;
-import net.sf.ehcache.Cache;
-
-public class TestCollectionCache {
-
- @Test
- public void testNullCache() throws IOException{
- KrillCollection kc = new KrillCollection();
- Cache temp = KrillCollection.cache;
- assertTrue(KrillCollection.cache != null);
-
- KrillCollection.cache = null;
- KrillIndex ki = new KrillIndex();
- ki.addDoc(new FieldDocument());
- ki.commit();
-
- KrillCollection.cache = temp;
- }
-}
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
index 69e8d9f..7e80ba2 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
@@ -2,17 +2,13 @@
import static de.ids_mannheim.korap.TestSimple.getJsonString;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
-import java.util.Properties;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
-import org.junit.Ignore;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
@@ -24,10 +20,7 @@
import de.ids_mannheim.korap.query.QueryBuilder;
import de.ids_mannheim.korap.response.Result;
import de.ids_mannheim.korap.response.SearchContext;
-import de.ids_mannheim.korap.util.KrillProperties;
-import de.ids_mannheim.korap.util.QueryException;
import de.ids_mannheim.korap.util.StatusCodes;
-import net.sf.ehcache.Element;
@RunWith(JUnit4.class)
@@ -621,13 +614,12 @@
// This test was adopted from TestVCCaching,
// But does not fail anymore for deserialization
- String json = _getJSONString("unknown-vc-ref.jsonld");
+ String json = _getJSONString("vc-ref/unknown-vc-ref.jsonld");
KrillCollection kc = new KrillCollection(json);
assertEquals("referTo(https://korap.ids-mannheim.de/@ndiewald/MyCorpus)", kc.getBuilder().toString());
- // Fails on filtering
- assertEquals("", kc.toString());
+ assertEquals("VirtualCorpusReferenceFilter(https://korap.ids-mannheim.de/@ndiewald/MyCorpus)",kc.toString());
QueryBuilder kq = new QueryBuilder("field");
@@ -637,433 +629,10 @@
Result result = krill.apply(ki);
assertEquals(StatusCodes.MISSING_COLLECTION, result.getError(0).getCode());
+ assertTrue(result.getError(0).getMessage().startsWith("Collection is not found"));
};
@Test
- @Ignore
- public void testCache () throws IOException {
-
- Properties prop = KrillProperties.loadDefaultProperties();
-
- String vcPath = getClass().getResource(path + "named-vcs").getFile();
- String tempVC = prop.getProperty("krill.namedVC");
- prop.setProperty("krill.namedVC", vcPath);
-
- ki = new KrillIndex();
- ki.addDoc(createDoc1());
- ki.addDoc(createDoc2());
- ki.commit();
-
- testManualAddToCache(ki, "named-vcs/named-vc1.jsonld", "named-vc1");
- testManualAddToCache(ki, "named-vcs/named-vc2.jsonld", "named-vc2");
-
- Element element = KrillCollection.cache.get("named-vc1");
- CachedVCData cc = (CachedVCData) element.getObjectValue();
- assertTrue(cc.getDocIdMap().size() > 0);
-
- element = KrillCollection.cache.get("named-vc2");
- cc = (CachedVCData) element.getObjectValue();
- assertTrue(cc.getDocIdMap().size() > 0);
-
- // Check for cache location
- assertFalse(KrillCollection.cache.isElementInMemory("named-vc1"));
- assertTrue(KrillCollection.cache.isElementOnDisk("named-vc1"));
- assertTrue(KrillCollection.cache.isElementInMemory("named-vc2"));
- assertTrue(KrillCollection.cache.isElementOnDisk("named-vc2"));
-
- // testSearchCachedVC();
- String json = _getJSONString("query-with-vc-ref.jsonld");
- // references named-vc1: ID eq ["doc-2","doc-3"]
-
- Krill krill = new Krill(json);
- // TODO: Better keep the reference
- testManualAddToCache(ki, "named-vcs/named-vc1.jsonld", "named-vc1");
- assertEquals("referTo(cached:named-vc1)", krill.getCollection().toString());
-
- Result result = krill.apply(ki);
- assertEquals("[[a]] c d", result.getMatch(0).getSnippetBrackets());
- assertEquals(result.getMatch(0).getUID(), 2);
- assertEquals(result.getMatches().size(), 1);
-
- // testAddDocToIndex();
- ki.addDoc(createDoc3());
- ki.commit();
-
- // Cache is removed after index change
- element = KrillCollection.cache.get("named-vc1");
- assertNull(element);
-
- // Restart search - this time it's not precached
- krill = new Krill(json);
- assertEquals("referTo(named-vc1)", krill.getCollection().toString());
- result = krill.apply(ki);
-
- assertEquals("[[a]] c d", result.getMatch(0).getSnippetBrackets());
- assertEquals("[[a]] d e", result.getMatch(1).getSnippetBrackets());
- assertEquals(result.getMatches().size(), 2);
-
- // testAutoCachingMatch
- // Check autocache
- element = KrillCollection.cache.get("named-vc1");
- cc = (CachedVCData) element.getObjectValue();
- assertTrue(cc.getDocIdMap().size() > 0);
-
- // Because of autocaching, this should work now
- krill = new Krill(json);
- assertEquals("referTo(cached:named-vc1)", krill.getCollection().toString());
- result = krill.apply(ki);
- assertEquals("[[a]] c d", result.getMatch(0).getSnippetBrackets());
- assertEquals("[[a]] d e", result.getMatch(1).getSnippetBrackets());
- assertEquals(result.getMatches().size(), 2);
-
- // Cache is removed on deletion
- ki.addDoc(createDoc1());
- ki.commit();
-
- // Check cache
- element = KrillCollection.cache.get("named-vc1");
- assertNull(element);
-
- // Rerun query
- krill = new Krill(json);
- assertEquals("referTo(named-vc1)", krill.getCollection().toString());
- result = krill.apply(ki);
- assertEquals("[[a]] c d", result.getMatch(0).getSnippetBrackets());
- assertEquals("[[a]] d e", result.getMatch(1).getSnippetBrackets());
- assertEquals(result.getMatches().size(), 2);
-
- // testClearCache
- KrillCollection.cache.removeAll();
-
- element = KrillCollection.cache.get("named-vc1");
- assertNull(element);
-
- prop.setProperty("krill.namedVC", tempVC);
- };
-
- @Test
- @Ignore
- public void testNestedNamedVCs () throws IOException {
- KrillCollection.initializeCache();
-
- Properties prop = KrillProperties.loadDefaultProperties();
-
- String vcPath = getClass().getResource(path + "named-vcs").getFile();
- String tempVC = prop.getProperty("krill.namedVC");
- prop.setProperty("krill.namedVC", vcPath);
-
- ki = new KrillIndex();
- ki.addDoc(createDoc1());
- ki.addDoc(createDoc2());
- ki.addDoc(createDoc3());
- ki.commit();
-
- // Check cache
- Element element = KrillCollection.cache.get("named-vc1");
- assertNull(element);
-
- element = KrillCollection.cache.get("named-vc2");
- assertNull(element);
-
- QueryBuilder kq = new QueryBuilder("tokens");
- KrillCollection kc = new KrillCollection(ki);
- CollectionBuilder cb = kc.build();
- Krill krill = new Krill(kq.seg("i:a"));
-
- kc.fromBuilder(
- cb.orGroup().with(
- cb.referTo("named-vc1")
- ).with(
- cb.referTo("named-vc2")
- )
- );
- krill.setCollection(kc);
- // named-vc1: UID:[2,3]
- // named-vc2: author:Frank (doc-1)
-
- assertEquals("OrGroup(referTo(named-vc1) referTo(named-vc2))",
- krill.getCollection().toString());
-
- assertEquals("tokens:i:a", krill.getSpanQuery().toString());
-
- Result result = krill.apply(ki);
- assertEquals("[[a]] b c", result.getMatch(0).getSnippetBrackets());
- assertEquals("[[a]] c d", result.getMatch(1).getSnippetBrackets());
- assertEquals("[[a]] d e", result.getMatch(2).getSnippetBrackets());
- assertEquals(3, result.getMatches().size());
-
- element = KrillCollection.cache.get("named-vc2");
- CachedVCData cc = (CachedVCData) element.getObjectValue();
- assertTrue(cc.getDocIdMap().size() > 0);
-
- kc.fromBuilder(
- cb.orGroup().with(
- cb.referTo("named-vc1")
- ).with(
- cb.referTo("named-vc2")
- )
- );
-
- assertEquals("OrGroup(referTo(cached:named-vc1) referTo(cached:named-vc2))",
- krill.getCollection().toString());
-
- result = krill.apply(ki);
- assertEquals("[[a]] b c", result.getMatch(0).getSnippetBrackets());
- assertEquals("[[a]] c d", result.getMatch(1).getSnippetBrackets());
- assertEquals("[[a]] d e", result.getMatch(2).getSnippetBrackets());
- assertEquals(3, result.getMatches().size());
-
- kc.fromBuilder(
- cb.orGroup().with(
- cb.referTo("named-vc1")
- ).with(
- cb.referTo("named-vc2")
- )
- );
-
- assertEquals("OrGroup(referTo(cached:named-vc1) referTo(cached:named-vc2))",
- krill.getCollection().toString());
-
- result = krill.apply(ki);
- assertEquals("[[a]] b c", result.getMatch(0).getSnippetBrackets());
- assertEquals("[[a]] c d", result.getMatch(1).getSnippetBrackets());
- assertEquals("[[a]] d e", result.getMatch(2).getSnippetBrackets());
- assertEquals(3, result.getMatches().size());
-
- kc.fromBuilder(cb.referTo("named-vc1"));
-
- assertEquals("referTo(cached:named-vc1)",
- krill.getCollection().toString());
-
- result = krill.apply(ki);
- assertEquals("[[a]] c d", result.getMatch(0).getSnippetBrackets());
- assertEquals("[[a]] d e", result.getMatch(1).getSnippetBrackets());
- assertEquals(2, result.getMatches().size());
-
-
- kc.fromBuilder(cb.referTo("named-vc2"));
-
- assertEquals("referTo(cached:named-vc2)",
- krill.getCollection().toString());
-
- result = krill.apply(ki);
- assertEquals("[[a]] b c", result.getMatch(0).getSnippetBrackets());
- assertEquals(1, result.getMatches().size());
-
- prop.setProperty("krill.namedVC", tempVC);
- };
-
-
- @Test
- @Ignore
- public void testNamedVCsAfterQueryWithMissingDocs () throws IOException {
- KrillCollection.initializeCache();
- Properties prop = KrillProperties.loadDefaultProperties();
-
- String vcPath = getClass().getResource(path + "named-vcs").getFile();
- String tempVC = prop.getProperty("krill.namedVC");
- prop.setProperty("krill.namedVC", vcPath);
-
- ki = new KrillIndex();
- ki.addDoc(createDoc1());
- ki.commit();
- ki.addDoc(createDoc2());
- ki.commit();
- ki.addDoc(createDoc3());
- ki.commit();
-
- // Check cache
- Element element = KrillCollection.cache.get("named-vc1");
- assertNull(element);
-
- element = KrillCollection.cache.get("named-vc2");
- assertNull(element);
-
- QueryBuilder kq = new QueryBuilder("tokens");
- KrillCollection kc = new KrillCollection(ki);
- CollectionBuilder cb = kc.build();
-
- // Check only for c and cache
- Krill krill = new Krill(kq.seg("i:c"));
-
- kc.fromBuilder(
- cb.orGroup().with(
- cb.referTo("named-vc1")
- ).with(
- cb.referTo("named-vc2")
- )
- );
- krill.setCollection(kc);
- // named-vc1: UID:[2,3]
- // named-vc2: author:Frank (doc-1)
-
- assertEquals("OrGroup(referTo(named-vc1) referTo(named-vc2))",
- krill.getCollection().toString());
-
- assertEquals("tokens:i:c", krill.getSpanQuery().toString());
-
- Result result = krill.apply(ki);
- assertEquals("a b [[c]]", result.getMatch(0).getSnippetBrackets());
- assertEquals("a [[c]] d", result.getMatch(1).getSnippetBrackets());
- assertEquals(2, result.getMatches().size());
-
- element = KrillCollection.cache.get("named-vc2");
- CachedVCData cc = (CachedVCData) element.getObjectValue();
- assertTrue(cc.getDocIdMap().size() > 0);
-
- kc.fromBuilder(
- cb.orGroup().with(
- cb.referTo("named-vc1")
- ).with(
- cb.referTo("named-vc2")
- )
- );
-
- assertEquals("OrGroup(referTo(cached:named-vc1) referTo(cached:named-vc2))",
- krill.getCollection().toString());
-
- // Check again for c with cache
- result = krill.apply(ki);
- assertEquals("a b [[c]]", result.getMatch(0).getSnippetBrackets());
- assertEquals("a [[c]] d", result.getMatch(1).getSnippetBrackets());
- assertEquals(2, result.getMatches().size());
-
- // Check for a with cache
- krill = new Krill(kq.seg("i:a"));
- krill.setCollection(kc);
-
- assertEquals("OrGroup(referTo(cached:named-vc1) referTo(cached:named-vc2))",
- krill.getCollection().toString());
-
- // Check again for c with cache
- result = krill.apply(ki);
- assertEquals("[[a]] b c", result.getMatch(0).getSnippetBrackets());
- assertEquals("[[a]] c d", result.getMatch(1).getSnippetBrackets());
- assertEquals("[[a]] d e", result.getMatch(2).getSnippetBrackets());
- assertEquals(3, result.getMatches().size());
-
- prop.setProperty("krill.namedVC", tempVC);
- };
-
-
- @Ignore
- public void testNamedVCsAfterCorpusWithMissingDocs () throws IOException {
- Properties prop = KrillProperties.loadDefaultProperties();
-
- String vcPath = getClass().getResource(path + "named-vcs").getFile();
- String tempVC = prop.getProperty("krill.namedVC");
- prop.setProperty("krill.namedVC", vcPath);
-
- ki = new KrillIndex();
- ki.addDoc(createDoc1());
- ki.commit();
- ki.addDoc(createDoc2());
- ki.commit();
- ki.addDoc(createDoc3());
- ki.commit();
-
- // Check cache
- Element element = KrillCollection.cache.get("named-vc1");
- assertNull(element);
-
- element = KrillCollection.cache.get("named-vc2");
- assertNull(element);
-
- QueryBuilder kq = new QueryBuilder("tokens");
- KrillCollection kc = new KrillCollection(ki);
- CollectionBuilder cb = kc.build();
-
- // Check only for c and cache
- Krill krill = new Krill(kq.seg("i:a"));
-
- kc.fromBuilder(
- cb.andGroup().with(
- cb.term("textClass","kultur")
- ).with(
- cb.orGroup().with(
- cb.referTo("named-vc1")
- ).with(
- cb.referTo("named-vc2")
- )
- )
- );
- krill.setCollection(kc);
- // named-vc1: UID:[2,3]
- // named-vc2: author:Frank (doc-1)
- // textClass:kultur (doc-1,doc-2)
-
- assertEquals(
- "AndGroup(textClass:kultur OrGroup(referTo(named-vc1) referTo(named-vc2)))",
- krill.getCollection().toString());
-
- assertEquals("tokens:i:a", krill.getSpanQuery().toString());
-
- Result result = krill.apply(ki);
- assertEquals("[[a]] b c", result.getMatch(0).getSnippetBrackets());
- assertEquals("[[a]] c d", result.getMatch(1).getSnippetBrackets());
- assertEquals(2, result.getMatches().size());
-
- element = KrillCollection.cache.get("named-vc1");
- CachedVCData cc = (CachedVCData) element.getObjectValue();
- assertTrue(cc.getDocIdMap().size() > 0);
-
- element = KrillCollection.cache.get("named-vc2");
- cc = (CachedVCData) element.getObjectValue();
- assertTrue(cc.getDocIdMap().size() > 0);
-
- kc.fromBuilder(
- cb.orGroup().with(
- cb.referTo("named-vc1")
- ).with(
- cb.referTo("named-vc2")
- )
- );
-
- assertEquals("OrGroup(referTo(cached:named-vc1) referTo(cached:named-vc2))",
- krill.getCollection().toString());
-
- // Check again for c with cache
- result = krill.apply(ki);
- assertEquals("[[a]] b c", result.getMatch(0).getSnippetBrackets());
- assertEquals("[[a]] c d", result.getMatch(1).getSnippetBrackets());
- assertEquals("[[a]] d e", result.getMatch(2).getSnippetBrackets());
- assertEquals(3, result.getMatches().size());
-
- prop.setProperty("krill.namedVC", tempVC);
- };
-
- @Test
- public void testCollectionWithVCRefAndPubDate () throws IOException {
-
- KrillCollection.initializeCache();
-
- ki = new KrillIndex();
- ki.addDoc(createDoc2());
- ki.addDoc(createDoc3());
- ki.addDoc(createDoc5000());
- ki.commit();
-
- testManualAddToCache(ki, "named-vcs/named-vc3.jsonld", "named-vc3");
-
- Element element = KrillCollection.cache.get("named-vc3");
- CachedVCData cc = (CachedVCData) element.getObjectValue();
- assertTrue(cc.getDocIdMap().size() > 0);
-
- String json = _getJSONString("collection-with-vc-ref-and-pubDate.jsonld");
-
- KrillCollection kc = new KrillCollection(json);
- kc.setIndex(ki);
- assertEquals(2, kc.numberOf("documents"));
-
- // testAddDocToIndex();
- ki.addDoc(createDoc1());
- ki.commit();
- // Cache is removed after index change
-
- }
-
-
- @Test
public void filterExampleFromLegacy () throws Exception {
// Construct index
@@ -1654,20 +1223,6 @@
return fd;
};
-
- private void testManualAddToCache (KrillIndex index, String filename, String vcName) throws IOException {
- String json = _getJSONString(filename);
-
- KrillCollection kc = new KrillCollection(json);
- kc.setIndex(index);
- try {
- kc.storeInCache(vcName);
- }
- catch (QueryException qe) {
- System.err.println(qe.getLocalizedMessage());
- };
- };
-
private String _getJSONString (String file) {
return getJsonString(getClass().getResource(path + file).getFile());
};
diff --git a/src/test/resources/krill.properties b/src/test/resources/krill.properties
index ea60670..eb7f374 100644
--- a/src/test/resources/krill.properties
+++ b/src/test/resources/krill.properties
@@ -2,4 +2,5 @@
krill.name = ${project.name}
krill.indexDir = test-output
-krill.namedVC = vc
-krill.index.commit.count = 15
\ No newline at end of file
+# Named VC definitions used by the tests are kept in this directory
+krill.namedVC = src/test/resources/queries/collections/named-vcs/
+krill.index.commit.count = 15
\ No newline at end of file
diff --git a/src/test/resources/queries/collections/named-vcs/named-vc4.jsonld b/src/test/resources/queries/collections/named-vcs/named-vc4.jsonld
new file mode 100644
index 0000000..7f19bb0
--- /dev/null
+++ b/src/test/resources/queries/collections/named-vcs/named-vc4.jsonld
@@ -0,0 +1,11 @@
+{"collection": {
+ "@type": "koral:doc",
+ "key": "textSigle",
+ "match": "match:eq",
+ "type" : "type:string",
+ "value": [
+ "WPD/AAA/00001",
+ "WPD/AAA/00002",
+ "WPD/AAA/00003"
+ ]
+}}
diff --git a/src/test/resources/queries/collections/query-with-vc-ref.jsonld b/src/test/resources/queries/collections/vc-ref/query-with-vc-ref.jsonld
similarity index 91%
rename from src/test/resources/queries/collections/query-with-vc-ref.jsonld
rename to src/test/resources/queries/collections/vc-ref/query-with-vc-ref.jsonld
index 86547ac..40d2c2c 100644
--- a/src/test/resources/queries/collections/query-with-vc-ref.jsonld
+++ b/src/test/resources/queries/collections/vc-ref/query-with-vc-ref.jsonld
@@ -3,7 +3,7 @@
"wrap":{
"@type":"koral:term",
"layer":"orth",
- "key":"a",
+ "key":"der",
"match":"match:eq"
}
},
diff --git a/src/test/resources/queries/collections/unknown-vc-ref.jsonld b/src/test/resources/queries/collections/vc-ref/unknown-vc-ref.jsonld
similarity index 100%
rename from src/test/resources/queries/collections/unknown-vc-ref.jsonld
rename to src/test/resources/queries/collections/vc-ref/unknown-vc-ref.jsonld