Implemented serializable doc bits vector for caching on disk.
Change-Id: I5ffdbe429b68b71d165c3ecf5b7504ac01a9a1ec
diff --git a/Changes b/Changes
index 3cba834..0952d84 100644
--- a/Changes
+++ b/Changes
@@ -4,6 +4,7 @@
- Implemented caching collection (margaretha)
- Implemented KrillCollection cache clearing (margaretha)
- Implemented auto-caching (margaretha)
+ - Implemented serializable doc bits vector for caching on disk (margaretha)
0.57 2018-04-05
- [feature] Support text queries in metadata
diff --git a/src/main/java/de/ids_mannheim/korap/KrillCollection.java b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
index e408dfc..3c1b96e 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillCollection.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
@@ -31,7 +31,7 @@
import de.ids_mannheim.korap.collection.CachedVCData;
import de.ids_mannheim.korap.collection.CollectionBuilder;
-import de.ids_mannheim.korap.collection.SerializableDocIdSet;
+import de.ids_mannheim.korap.collection.DocBits;
import de.ids_mannheim.korap.response.Notifications;
import de.ids_mannheim.korap.util.QueryException;
import de.ids_mannheim.korap.util.StatusCodes;
@@ -72,10 +72,8 @@
// This advices the java compiler to ignore all loggings
public static final boolean DEBUG = false;
- public static CacheManager cacheManager = CacheManager.newInstance();
- public static Cache cache = cacheManager.getCache("named_vc");
- private String name;
- private boolean toCache = false;
+ public final static CacheManager cacheManager = CacheManager.newInstance();
+ public final static Cache cache = cacheManager.getCache("named_vc");
/**
* Construct a new KrillCollection.
@@ -101,10 +99,6 @@
* The KoralQuery document as a JSON string.
*/
public KrillCollection (String jsonString) {
- createCollection(jsonString);
- }
-
- public void createCollection (String jsonString) {
ObjectMapper mapper = new ObjectMapper();
try {
JsonNode json = mapper.readTree(jsonString);
@@ -199,10 +193,6 @@
String type = json.get("@type").asText();
- if (json.has("cache")) {
- setToCache(json.get("cache").asBoolean());
- }
-
if (type.equals("koral:doc")) {
// default key
@@ -210,16 +200,6 @@
String valtype = "type:string";
String match = "match:eq";
- if (isToCache()) {
- if (!json.has("name")) {
- throw new QueryException(StatusCodes.MISSING_ID,
- "Collection id or name is required for caching.");
- }
- else {
- setName(json.get("name").asText());
- }
- }
-
if (json.has("key")) key = json.get("key").asText();
if (json.has("type")) valtype = json.get("type").asText();
@@ -252,6 +232,28 @@
// Filter based on string
else if (valtype.equals("type:string")) {
+
+ if (json.get("value").asText().startsWith("[")){
+ if (json.has("match")) {
+ match = json.get("match").asText();
+ }
+
+ CollectionBuilder.Group group = null;
+ if (match.equals("match:eq")) {
+ group = this.cb.orGroup();
+ for (JsonNode value : json.get("value")) {
+ group.with(cb.term(key, value.asText()));
+ }
+ }
+ else if (match.equals("match:ne")) {
+ group = this.cb.andGroup();
+ for (JsonNode value : json.get("value")) {
+ group.with(cb.term(key, value.asText()).not());
+ }
+ }
+ return group;
+ }
+
if (json.has("match")) match = json.get("match").asText();
switch (match) {
@@ -316,28 +318,6 @@
"Match relation unknown for type");
}
- else if (valtype.equals("type:string[]")) {
-
- if (json.has("match")) {
- match = json.get("match").asText();
- }
-
- CollectionBuilder.Group group = null;
- if (match.equals("match:eq")) {
- group = this.cb.orGroup();
- for (JsonNode value : json.get("value")) {
- group.with(cb.term(key, value.asText()));
- }
- }
- else if (match.equals("match:ne")) {
- group = this.cb.andGroup();
- for (JsonNode value : json.get("value")) {
- group.with(cb.term(key, value.asText()).not());
- }
- }
- return group;
- }
-
throw new QueryException(843, "Document type is not supported");
}
@@ -382,7 +362,7 @@
"ref is empty");
}
- Element element = cache.get(ref);
+ Element element = KrillCollection.cache.get(ref);
if (element == null) {
String corpusQuery = loadVCFile(ref);
if (corpusQuery == null){
@@ -418,7 +398,7 @@
private String loadVCFile (String ref) {
- File file = new File(ref);
+ File file = new File("vc/"+ref+".jsonld");
String json = null;
try {
FileInputStream fis = new FileInputStream(file);
@@ -797,45 +777,29 @@
};
- public CachedVCData storeInCache () throws IOException {
+ public void storeInCache (String cacheKey) throws IOException {
+ // Caching needs a usable key: record the error and stop here.
+ if (cacheKey == null || cacheKey.isEmpty()) {
+ this.addError(StatusCodes.MISSING_ID, "Collection name is required for caching.");
+ return;
+ }
List<LeafReaderContext> leaves = this.index.reader().leaves();
- Map<Integer, DocIdSet> docIdMap =
- new HashMap<Integer, DocIdSet>(leaves.size());
+ Map<Integer, DocBits> docIdMap =
+ new HashMap<Integer, DocBits>(leaves.size());
for (LeafReaderContext context : leaves) {
if (docIdMap.get(context.hashCode()) == null) {
FixedBitSet bitset = bits(context);
docIdMap.put(context.hashCode(),
- new SerializableDocIdSet(bitset));
+ new DocBits(bitset.getBits()));
}
}
CachedVCData cc = new CachedVCData(docIdMap);
- cache.put(new Element(getName(), cc));
-
+ cache.put(new Element(cacheKey, cc));
this.cbi = cb.namedVC(cc);
- return cc;
}
- public String getName () {
- return name;
- }
-
-
- public void setName (String name) {
- this.name = name;
- }
-
-
- public boolean isToCache () {
- return toCache;
- }
-
-
- public void setToCache (boolean toCache) {
- this.toCache = toCache;
- }
-
/*
* Analyze how terms relate
*/
diff --git a/src/main/java/de/ids_mannheim/korap/collection/CachedVCData.java b/src/main/java/de/ids_mannheim/korap/collection/CachedVCData.java
index d4bae4f..8b73a45 100644
--- a/src/main/java/de/ids_mannheim/korap/collection/CachedVCData.java
+++ b/src/main/java/de/ids_mannheim/korap/collection/CachedVCData.java
@@ -1,29 +1,73 @@
package de.ids_mannheim.korap.collection;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.Map;
-import org.apache.lucene.search.DocIdSet;
+/**
+ * Virtual corpus data to cache
+ *
+ * @author margaretha
+ *
+ */
+public class CachedVCData implements Serializable {
-public class CachedVCData implements Serializable{
-
- /** Auto generated
+ /**
+ * Auto generated
*
*/
private static final long serialVersionUID = 5635087441839303653L;
-
- private Map<Integer, DocIdSet> docIdMap;
-
- public CachedVCData (Map<Integer, DocIdSet> docIdMap) {
+
+ private Map<Integer, DocBits> docIdMap;
+
+ public CachedVCData (Map<Integer, DocBits> docIdMap) {
this.docIdMap = docIdMap;
}
-
- public Map<Integer, DocIdSet> getDocIdMap () {
+
+ public Map<Integer, DocBits> getDocIdMap () {
return docIdMap;
}
- public void setDocIdMap (Map<Integer, DocIdSet> docIdMap) {
+ public void setDocIdMap (Map<Integer, DocBits> docIdMap) {
this.docIdMap = docIdMap;
}
-
+
+    // EM: optimization idea, not verified yet. ehcache retrieves a byte[]
+    // much faster than a map; however, converting a map to a byte[] and
+    // vice versa has an additional cost.
+
+    /** Serializes docIdMap into a byte[] via Java object serialization. */
+    private byte[] toByteArray () throws IOException {
+        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+        try (ObjectOutputStream out = new ObjectOutputStream(buffer)) {
+            out.writeObject(docIdMap);
+        } // close() flushes the pending object data into the buffer
+        return buffer.toByteArray();
+    }
+
+    /**
+     * Restores a doc id map from bytes produced by Java object
+     * serialization (the inverse of toByteArray).
+     *
+     * @param bytes the serialized map
+     * @return the deserialized map
+     * @throws ClassNotFoundException if a serialized class cannot be found
+     * @throws IOException if the bytes cannot be read as an object stream
+     */
+    @SuppressWarnings("unchecked") // readObject() only returns Object
+    private Map<Integer, DocBits> toMap (byte[] bytes)
+            throws ClassNotFoundException, IOException {
+        // try-with-resources closes the object stream (and the wrapped
+        // byte stream) even if readObject() throws.
+        try (ObjectInputStream in =
+                new ObjectInputStream(new ByteArrayInputStream(bytes))) {
+            // NOTE(review): Java deserialization should only be fed
+            // bytes written by toByteArray, never untrusted input.
+            return (Map<Integer, DocBits>) in.readObject();
+        }
+    }
}
diff --git a/src/main/java/de/ids_mannheim/korap/collection/CachedVCFilter.java b/src/main/java/de/ids_mannheim/korap/collection/CachedVCFilter.java
index 39a5d93..db0d675 100644
--- a/src/main/java/de/ids_mannheim/korap/collection/CachedVCFilter.java
+++ b/src/main/java/de/ids_mannheim/korap/collection/CachedVCFilter.java
@@ -7,6 +7,11 @@
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.Bits;
+/** Filter for virtual corpus/collection existing in the cache.
+ *
+ * @author margaretha
+ *
+ */
public class CachedVCFilter extends Filter {
private CachedVCData cachedCollection;
@@ -18,8 +23,8 @@
@Override
public DocIdSet getDocIdSet (LeafReaderContext context, Bits acceptDocs)
throws IOException {
-
- return cachedCollection.getDocIdMap().get(context.hashCode());
+ DocBits docBits = cachedCollection.getDocIdMap().get(context.hashCode());
+ return (docBits == null) ? null : docBits.createBitDocIdSet(); // null: nothing cached for this leaf
}
}
diff --git a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
index 5705f14..f235fe0 100644
--- a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
+++ b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
@@ -376,6 +376,11 @@
};
};
+ /** Builder for virtual corpus / collection existing in the cache
+ *
+ * @author margaretha
+ *
+ */
public class CachedVC implements CollectionBuilder.Interface {
private CachedVCData cachedCollection;
@@ -413,12 +418,12 @@
private CollectionBuilder.Interface child;
private String cacheKey;
- private Map<Integer, DocIdSet> docIdMap;
+ private Map<Integer, DocBits> docIdMap;
public ToCacheVC (String vcRef, Interface cbi) {
this.child = cbi;
this.cacheKey = vcRef;
- this.docIdMap = new HashMap<Integer, DocIdSet>();
+ this.docIdMap = new HashMap<Integer, DocBits>();
}
@Override
diff --git a/src/main/java/de/ids_mannheim/korap/collection/DocBits.java b/src/main/java/de/ids_mannheim/korap/collection/DocBits.java
new file mode 100644
index 0000000..59044ab
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/collection/DocBits.java
@@ -0,0 +1,47 @@
+package de.ids_mannheim.korap.collection;
+
+import java.io.Serializable;
+
+import org.apache.lucene.util.BitDocIdSet;
+import org.apache.lucene.util.FixedBitSet;
+
+/**
+ * Serializable object for caching a Lucene doc bit vector. It keeps the
+ * backing words of a bit set and rebuilds a doc id set from them.
+ *
+ * @author margaretha
+ */
+public class DocBits implements Serializable {
+
+    private static final long serialVersionUID = -3505650918983180852L;
+    /** Backing 64-bit words of the bit vector (stored by reference). */
+    final long[] bits;
+    /** Capacity in bits (not words), as FixedBitSet(long[], int) expects. */
+    final int numBits;
+
+    /** @param bits backing words, e.g. the result of FixedBitSet.getBits() */
+    public DocBits (long[] bits) {
+        this.bits = bits;
+        // Every word holds Long.SIZE doc ids; clamp to the int range.
+        this.numBits = (int) Math.min((long) bits.length * Long.SIZE, Integer.MAX_VALUE);
+    }
+
+    /** @return a doc id set backed by the stored words */
+    public BitDocIdSet createBitDocIdSet () {
+        return new BitDocIdSet(new FixedBitSet(bits, numBits));
+    }
+
+    /** @return the stored words formatted as "[w0,w1,...]" */
+    @Override
+    public String toString () {
+        StringBuilder sb = new StringBuilder("[");
+        for (int i = 0; i < bits.length; i++) {
+            if (i > 0) {
+                sb.append(",");
+            }
+            sb.append(bits[i]);
+        }
+        sb.append("]");
+        return sb.toString();
+    }
+}
diff --git a/src/main/java/de/ids_mannheim/korap/collection/SerializableDocIdSet.java b/src/main/java/de/ids_mannheim/korap/collection/SerializableDocIdSet.java
deleted file mode 100644
index 22907fe..0000000
--- a/src/main/java/de/ids_mannheim/korap/collection/SerializableDocIdSet.java
+++ /dev/null
@@ -1,19 +0,0 @@
-package de.ids_mannheim.korap.collection;
-
-import java.io.Serializable;
-
-import org.apache.lucene.util.BitDocIdSet;
-import org.apache.lucene.util.BitSet;
-
-public class SerializableDocIdSet extends BitDocIdSet implements Serializable {
-
- /**
- * Auto generated
- *
- */
- private static final long serialVersionUID = 171797306573832807L;
-
- public SerializableDocIdSet (BitSet set) {
- super(set);
- }
-}
diff --git a/src/main/java/de/ids_mannheim/korap/collection/ToCacheVCFilter.java b/src/main/java/de/ids_mannheim/korap/collection/ToCacheVCFilter.java
index 2928de3..7c8a815 100644
--- a/src/main/java/de/ids_mannheim/korap/collection/ToCacheVCFilter.java
+++ b/src/main/java/de/ids_mannheim/korap/collection/ToCacheVCFilter.java
@@ -1,6 +1,7 @@
package de.ids_mannheim.korap.collection;
import java.io.IOException;
+import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.index.LeafReader;
@@ -14,15 +15,20 @@
import de.ids_mannheim.korap.collection.CollectionBuilder.Interface;
import net.sf.ehcache.Element;
+/** Filter for virtual corpus / collection that should be cached.
+ *
+ * @author margaretha
+ *
+ */
public class ToCacheVCFilter extends Filter {
private Filter filter;
private CollectionBuilder.Interface cbi;
private String cacheKey;
- private Map<Integer, DocIdSet> docIdMap;
+ private Map<Integer, DocBits> docIdMap;
- public ToCacheVCFilter (String cacheKey, Map<Integer, DocIdSet> docIdMap,
+ public ToCacheVCFilter (String cacheKey, Map<Integer, DocBits> docIdMap,
Interface cbi, Filter filter) {
this.cacheKey = cacheKey;
this.docIdMap = docIdMap;
@@ -52,11 +58,11 @@
bitset.or(docIdSet.iterator());
}
- docIdMap.put(context.hashCode(), new SerializableDocIdSet(bitset));
- CachedVCData cachedVCData = new CachedVCData(docIdMap);
+ docIdMap.put(context.hashCode(), new DocBits(bitset.getBits()));
+ CachedVCData cachedVCData = new CachedVCData(new HashMap<>(docIdMap));
+ KrillCollection.cache.remove(cacheKey);
KrillCollection.cache.put(new Element(cacheKey, cachedVCData));
-
return docIdSet;
}
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestVCCaching.java b/src/test/java/de/ids_mannheim/korap/collection/TestVCCaching.java
index 427451d..86c815c 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestVCCaching.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestVCCaching.java
@@ -1,5 +1,6 @@
package de.ids_mannheim.korap.collection;
+import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
@@ -23,7 +24,6 @@
private KrillIndex getSampleIndex () throws IOException {
return new KrillIndex(new MMapDirectory(
Paths.get(getClass().getResource("/sample-index").getFile())));
-
}
private KrillIndex index;
@@ -34,27 +34,36 @@
@Test
public void testCache () throws IOException {
- testAddToCache();
+ testManualAddToCache("named-vc/named-vc1.jsonld", "named-vc1");
+ testManualAddToCache("named-vc/named-vc2.jsonld", "named-vc2");
+
+ Element element = KrillCollection.cache.get("named-vc1");
+ CachedVCData cc = (CachedVCData) element.getObjectValue();
+ assertTrue(cc.getDocIdMap().size() > 0);
+
+ element = KrillCollection.cache.get("named-vc2");
+ cc = (CachedVCData) element.getObjectValue();
+ assertTrue(cc.getDocIdMap().size() > 0);
+
+ assertFalse(KrillCollection.cache.isElementInMemory("named-vc1"));
+ assertTrue(KrillCollection.cache.isElementOnDisk("named-vc1"));
+ assertTrue(KrillCollection.cache.isElementInMemory("named-vc2"));
+ assertTrue(KrillCollection.cache.isElementOnDisk("named-vc2"));
+
testSearchCachedVC();
- testClearCache();
testAddDocToIndex();
testDelDocFromIndex();
}
- private void testAddToCache () throws IOException {
+ private void testManualAddToCache (String filename, String vcName) throws IOException {
InputStream is = getClass().getClassLoader()
- .getResourceAsStream("named-vc/named-vc-free.jsonld");
+ .getResourceAsStream(filename);
String json = IOUtils.toString(is);
is.close();
KrillCollection kc = new KrillCollection(json);
kc.setIndex(index);
- kc.storeInCache();
-
- Element element = KrillCollection.cache.get("cache-goe");
- CachedVCData cc = (CachedVCData) element.getObjectValue();
-
- assertTrue(cc.getDocIdMap().size() > 0);
+ kc.storeInCache(vcName);
}
private void testSearchCachedVC () throws IOException {
@@ -76,12 +85,12 @@
private void testClearCache () {
KrillCollection.cache.removeAll();
- Element element = KrillCollection.cache.get("cache-goe");
+ Element element = KrillCollection.cache.get("named-vc1");
assertNull(element);
}
public void testAddDocToIndex () throws IOException {
- testAddToCache();
+ testManualAddToCache("named-vc/named-vc1.jsonld", "named-vc1");
FieldDocument fd = new FieldDocument();
fd.addTV("base", "x y", "[(0-3)s:x]" + // 1
@@ -90,28 +99,23 @@
index.addDoc(fd);
index.commit();
- Element element = KrillCollection.cache.get("cache-goe");
+ Element element = KrillCollection.cache.get("named-vc1");
assertNull(element);
}
public void testDelDocFromIndex () throws IOException {
- testAddToCache();
+ testManualAddToCache("named-vc/named-vc1.jsonld", "named-vc1");
index.delDocs("textSigle", "GOE/AGF/00000");
index.commit();
- Element element = KrillCollection.cache.get("cache-goe");
+ Element element = KrillCollection.cache.get("named-vc1");
assertNull(element);
}
@Test
public void testAutoCaching () throws IOException {
- InputStream is = getClass().getClassLoader()
- .getResourceAsStream("collection/query-with-vc-ref.jsonld");
- String json = IOUtils.toString(is);
-
- String result = new Krill(json).apply(this.index).toJsonString();
- assertNotNull(result);
- assertTrue(!result.isEmpty());
+ testSearchCachedVC();
+ testClearCache();
}
}
diff --git a/src/main/resources/ehcache.xml b/src/test/resources/ehcache.xml
similarity index 68%
rename from src/main/resources/ehcache.xml
rename to src/test/resources/ehcache.xml
index 00465bf..b293534 100644
--- a/src/main/resources/ehcache.xml
+++ b/src/test/resources/ehcache.xml
@@ -12,10 +12,11 @@
<cache name="named_vc"
eternal="true"
- memoryStoreEvictionPolicy="LRU"
- maxBytesLocalHeap="256M" >
- <!-- maxBytesLocalDisk="1G">
- <persistence strategy="localTempSwap"/> -->
+ memoryStoreEvictionPolicy="LRU"
+ maxElementsInMemory="1"
+ maxBytesLocalDisk="1G"
+ diskExpiryThreadIntervalSeconds = "120" >
+ <persistence strategy="localTempSwap"/>
</cache>
</ehcache>
\ No newline at end of file
diff --git a/src/test/resources/named-vc/named-vc-free.jsonld b/src/test/resources/named-vc/named-vc1.jsonld
similarity index 76%
rename from src/test/resources/named-vc/named-vc-free.jsonld
rename to src/test/resources/named-vc/named-vc1.jsonld
index 65a6c52..0fddd89 100644
--- a/src/test/resources/named-vc/named-vc-free.jsonld
+++ b/src/test/resources/named-vc/named-vc1.jsonld
@@ -1,9 +1,9 @@
{"collection": {
- "name" : "cache-goe",
+ "name" : "named-vc1",
"@type": "koral:doc",
"key": "textSigle",
"match": "match:ne",
- "type" : "type:string[]",
+ "type" : "type:string",
"value": [
"GOE/AGF/00000",
"GOE/AGA/01784"
diff --git a/src/test/resources/named-vc/named-vc2.jsonld b/src/test/resources/named-vc/named-vc2.jsonld
new file mode 100644
index 0000000..be882a0
--- /dev/null
+++ b/src/test/resources/named-vc/named-vc2.jsonld
@@ -0,0 +1,13 @@
+{"collection": {
+ "name" : "named-vc2",
+ "@type": "koral:doc",
+ "key": "textSigle",
+ "match": "match:eq",
+ "type" : "type:string",
+ "value": [
+ "GOE/AGA/02232",
+ "GOE/AGA/02616",
+ "GOE/AGA/03828"
+ ],
+ "cache" : "true"
+}}
diff --git a/src/test/resources/named-vc/named-vc-free.jsonld b/vc/named-vc1.jsonld
similarity index 76%
copy from src/test/resources/named-vc/named-vc-free.jsonld
copy to vc/named-vc1.jsonld
index 65a6c52..0fddd89 100644
--- a/src/test/resources/named-vc/named-vc-free.jsonld
+++ b/vc/named-vc1.jsonld
@@ -1,9 +1,9 @@
{"collection": {
- "name" : "cache-goe",
+ "name" : "named-vc1",
"@type": "koral:doc",
"key": "textSigle",
"match": "match:ne",
- "type" : "type:string[]",
+ "type" : "type:string",
"value": [
"GOE/AGF/00000",
"GOE/AGA/01784"