Implemented serializable doc bits vector for caching on disk.

Change-Id: I5ffdbe429b68b71d165c3ecf5b7504ac01a9a1ec
diff --git a/Changes b/Changes
index 3cba834..0952d84 100644
--- a/Changes
+++ b/Changes
@@ -4,6 +4,7 @@
         - Implemented caching collection (margaretha)
         - Implemented KrillCollection cache clearing (margaretha)
         - Implemented auto-caching (margaretha)
+        - Implemented serializable doc bits vector for caching on disk (margaretha)
 
 0.57 2018-04-05
         - [feature] Support text queries in metadata
diff --git a/src/main/java/de/ids_mannheim/korap/KrillCollection.java b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
index e408dfc..3c1b96e 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillCollection.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
@@ -31,7 +31,7 @@
 
 import de.ids_mannheim.korap.collection.CachedVCData;
 import de.ids_mannheim.korap.collection.CollectionBuilder;
-import de.ids_mannheim.korap.collection.SerializableDocIdSet;
+import de.ids_mannheim.korap.collection.DocBits;
 import de.ids_mannheim.korap.response.Notifications;
 import de.ids_mannheim.korap.util.QueryException;
 import de.ids_mannheim.korap.util.StatusCodes;
@@ -72,10 +72,8 @@
     // This advices the java compiler to ignore all loggings
     public static final boolean DEBUG = false;
 
-    public static CacheManager cacheManager = CacheManager.newInstance();
-    public static Cache cache = cacheManager.getCache("named_vc");
-    private String name;
-    private boolean toCache = false;
+    public final static CacheManager cacheManager = CacheManager.newInstance();
+    public final static Cache cache = cacheManager.getCache("named_vc");
 
     /**
      * Construct a new KrillCollection.
@@ -101,10 +99,6 @@
      *            The KoralQuery document as a JSON string.
      */
     public KrillCollection (String jsonString) {
-        createCollection(jsonString);
-    }
-
-    public void createCollection (String jsonString) {
         ObjectMapper mapper = new ObjectMapper();
         try {
             JsonNode json = mapper.readTree(jsonString);
@@ -199,10 +193,6 @@
 
         String type = json.get("@type").asText();
 
-        if (json.has("cache")) {
-            setToCache(json.get("cache").asBoolean());
-        }
-
         if (type.equals("koral:doc")) {
 
             // default key
@@ -210,16 +200,6 @@
             String valtype = "type:string";
             String match = "match:eq";
 
-            if (isToCache()) {
-                if (!json.has("name")) {
-                    throw new QueryException(StatusCodes.MISSING_ID,
-                            "Collection id or name is required for caching.");
-                }
-                else {
-                    setName(json.get("name").asText());
-                }
-            }
-
             if (json.has("key")) key = json.get("key").asText();
 
             if (json.has("type")) valtype = json.get("type").asText();
@@ -252,6 +232,28 @@
 
             // Filter based on string
             else if (valtype.equals("type:string")) {
+                
+                // Several values arrive as a JSON array. asText() is "" for
+                // container nodes, so the node type has to be tested instead.
+                if (json.get("value").isArray()) {
+                    if (json.has("match")) {
+                        match = json.get("match").asText();
+                    }
+                    CollectionBuilder.Group group = null;
+                    if (match.equals("match:eq")) {
+                        group = this.cb.orGroup();
+                        for (JsonNode value : json.get("value")) {
+                            group.with(cb.term(key, value.asText()));
+                        }
+                    }
+                    else if (match.equals("match:ne")) {
+                        group = this.cb.andGroup();
+                        for (JsonNode value : json.get("value")) {
+                            group.with(cb.term(key, value.asText()).not());
+                        }
+                    }
+                    return group; // stays null for any other match operator
+                }
                 if (json.has("match")) match = json.get("match").asText();
 
                 switch (match) {
@@ -316,28 +318,6 @@
                         "Match relation unknown for type");
             }
 
-            else if (valtype.equals("type:string[]")) {
-
-                if (json.has("match")) {
-                    match = json.get("match").asText();
-                }
-
-                CollectionBuilder.Group group = null;
-                if (match.equals("match:eq")) {
-                    group = this.cb.orGroup();
-                    for (JsonNode value : json.get("value")) {
-                        group.with(cb.term(key, value.asText()));
-                    }
-                }
-                else if (match.equals("match:ne")) {
-                    group = this.cb.andGroup();
-                    for (JsonNode value : json.get("value")) {
-                        group.with(cb.term(key, value.asText()).not());
-                    }
-                }
-                return group;
-            }
-
             throw new QueryException(843, "Document type is not supported");
         }
 
@@ -382,7 +362,7 @@
                         "ref is empty");
             }
 
-            Element element = cache.get(ref);
+            Element element = KrillCollection.cache.get(ref);
             if (element == null) {
                 String corpusQuery = loadVCFile(ref);
                 if (corpusQuery == null){
@@ -418,7 +398,7 @@
 
     
     private String loadVCFile (String ref) {
-        File file = new File(ref);
+        File file = new File("vc/"+ref+".jsonld");
         String json = null;
         try {
             FileInputStream fis = new FileInputStream(file);
@@ -797,45 +777,29 @@
     };
 
 
-    public CachedVCData storeInCache () throws IOException {
+    public void storeInCache (String cacheKey) throws IOException {
+        if (cacheKey == null || cacheKey.isEmpty()) {
+            this.addError(StatusCodes.MISSING_ID,
+                    "Collection name is required for caching.");
+            return; // no usable key: nothing may be computed or stored
+        }
         List<LeafReaderContext> leaves = this.index.reader().leaves();
-        Map<Integer, DocIdSet> docIdMap =
-                new HashMap<Integer, DocIdSet>(leaves.size());
+        Map<Integer, DocBits> docIdMap =
+                new HashMap<Integer, DocBits>(leaves.size());
 
         for (LeafReaderContext context : leaves) {
             if (docIdMap.get(context.hashCode()) == null) {
                 FixedBitSet bitset = bits(context);
                 docIdMap.put(context.hashCode(),
-                        new SerializableDocIdSet(bitset));
+                        new DocBits(bitset.getBits()));
             }
         }
 
         CachedVCData cc = new CachedVCData(docIdMap);
-        cache.put(new Element(getName(), cc));
-
+        cache.put(new Element(cacheKey, cc));
         this.cbi = cb.namedVC(cc);
-        return cc;
     }
     
-    public String getName () {
-        return name;
-    }
-
-
-    public void setName (String name) {
-        this.name = name;
-    }
-
-
-    public boolean isToCache () {
-        return toCache;
-    }
-
-
-    public void setToCache (boolean toCache) {
-        this.toCache = toCache;
-    }
-
     /*
      * Analyze how terms relate
      */
diff --git a/src/main/java/de/ids_mannheim/korap/collection/CachedVCData.java b/src/main/java/de/ids_mannheim/korap/collection/CachedVCData.java
index d4bae4f..8b73a45 100644
--- a/src/main/java/de/ids_mannheim/korap/collection/CachedVCData.java
+++ b/src/main/java/de/ids_mannheim/korap/collection/CachedVCData.java
@@ -1,29 +1,73 @@
 package de.ids_mannheim.korap.collection;
 
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
 import java.io.Serializable;
 import java.util.Map;
 
-import org.apache.lucene.search.DocIdSet;
+/**
+ * Virtual corpus data to cache
+ * 
+ * @author margaretha
+ *
+ */
+public class CachedVCData implements Serializable {
 
-public class CachedVCData implements Serializable{
-
-    /** Auto generated
+    /**
+     * Auto generated
      * 
      */
     private static final long serialVersionUID = 5635087441839303653L;
-    
-    private Map<Integer, DocIdSet> docIdMap;
-    
-    public CachedVCData (Map<Integer, DocIdSet> docIdMap) {
+
+    private Map<Integer, DocBits> docIdMap;
+
+    public CachedVCData (Map<Integer, DocBits> docIdMap) {
         this.docIdMap = docIdMap;
     }
-    
-    public Map<Integer, DocIdSet> getDocIdMap () {
+
+    public Map<Integer, DocBits> getDocIdMap () {
         return docIdMap;
     }
 
-    public void setDocIdMap (Map<Integer, DocIdSet> docIdMap) {
+    public void setDocIdMap (Map<Integer, DocBits> docIdMap) {
         this.docIdMap = docIdMap;
     }
-    
+
+    // EM: for optimization. has not been checked.
+    // ehcache retrieves a byte[] much faster than a map, however,
+    // there is an additional cost for converting a map to a byte[]
+    // and vice versa.
+
+    private byte[] toByteArray () throws IOException {
+        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+        // closing the object stream flushes everything it buffered
+        try (ObjectOutputStream out = new ObjectOutputStream(buffer)) {
+            out.writeObject(docIdMap);
+        }
+        return buffer.toByteArray();
+    }
+
+    /**
+     * Deserializes a doc id map from a Java object stream held in memory.
+     *
+     * @param bytes
+     *            Java-serialized form of a doc id map
+     * @return the deserialized map
+     * @throws ClassNotFoundException
+     *             if a serialized class cannot be resolved
+     * @throws IOException
+     *             if the bytes are not a readable object stream
+     */
+    @SuppressWarnings("unchecked")
+    private Map<Integer, DocBits> toMap (byte[] bytes)
+            throws ClassNotFoundException, IOException {
+        // closing the object stream closes the wrapped byte stream too
+        try (ObjectInputStream in =
+                new ObjectInputStream(new ByteArrayInputStream(bytes))) {
+            Object restored = in.readObject();
+            return (Map<Integer, DocBits>) restored;
+        }
+    }
 }
diff --git a/src/main/java/de/ids_mannheim/korap/collection/CachedVCFilter.java b/src/main/java/de/ids_mannheim/korap/collection/CachedVCFilter.java
index 39a5d93..db0d675 100644
--- a/src/main/java/de/ids_mannheim/korap/collection/CachedVCFilter.java
+++ b/src/main/java/de/ids_mannheim/korap/collection/CachedVCFilter.java
@@ -7,6 +7,11 @@
 import org.apache.lucene.search.Filter;
 import org.apache.lucene.util.Bits;
 
+/** Filter for virtual corpus/collection existing in the cache.
+ * 
+ * @author margaretha
+ *
+ */
 public class CachedVCFilter extends Filter {
 
     private CachedVCData cachedCollection;
@@ -18,8 +23,8 @@
     @Override
     public DocIdSet getDocIdSet (LeafReaderContext context, Bits acceptDocs)
             throws IOException {
-        
-        return cachedCollection.getDocIdMap().get(context.hashCode());
+        DocBits docBits = cachedCollection.getDocIdMap().get(context.hashCode());
+        return (docBits == null) ? null : docBits.createBitDocIdSet(); // null: leaf not in cache, no docs
     }
 
 }
diff --git a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
index 5705f14..f235fe0 100644
--- a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
+++ b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
@@ -376,6 +376,11 @@
         };
     };
     
+    /** Builder for virtual corpus / collection existing in the cache
+     * 
+     * @author margaretha
+     *
+     */
     public class CachedVC implements CollectionBuilder.Interface {
 
         private CachedVCData cachedCollection;
@@ -413,12 +418,12 @@
         private CollectionBuilder.Interface child;
         private String cacheKey;
         
-        private Map<Integer, DocIdSet> docIdMap;
+        private Map<Integer, DocBits> docIdMap;
 
         public ToCacheVC (String vcRef, Interface cbi) {
             this.child = cbi;
             this.cacheKey = vcRef;
-            this.docIdMap  = new HashMap<Integer, DocIdSet>();
+            this.docIdMap  = new HashMap<Integer, DocBits>();
         }
 
         @Override
diff --git a/src/main/java/de/ids_mannheim/korap/collection/DocBits.java b/src/main/java/de/ids_mannheim/korap/collection/DocBits.java
new file mode 100644
index 0000000..59044ab
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/collection/DocBits.java
@@ -0,0 +1,47 @@
+package de.ids_mannheim.korap.collection;
+
+import java.io.Serializable;
+
+import org.apache.lucene.util.BitDocIdSet;
+import org.apache.lucene.util.FixedBitSet;
+
+/**
+ * Serializable snapshot of a Lucene doc bit vector, stored as the raw
+ * 64-bit words so that it can be cached on disk.
+ *
+ * @author margaretha
+ */
+public class DocBits implements Serializable {
+
+    private static final long serialVersionUID = -3505650918983180852L;
+    final long[] bits;
+    // Number of addressable bits (not words) handed to FixedBitSet.
+    final int numBits;
+
+    /** @param bits backing words of a bit set, 64 doc ids per word */
+    public DocBits (long[] bits) {
+        this.bits = bits;
+        // FixedBitSet expects a bit count; cap it to stay within int.
+        this.numBits = (int) Math.min((long) bits.length * Long.SIZE,
+                Integer.MAX_VALUE);
+    }
+
+    /** Rebuilds a Lucene doc id set on top of the stored words. */
+    public BitDocIdSet createBitDocIdSet () {
+        return new BitDocIdSet(new FixedBitSet(bits, numBits));
+    }
+
+    /** Lists the stored words as {@code [w0,w1,...]}. */
+    @Override
+    public String toString () {
+        StringBuilder sb = new StringBuilder("[");
+        for (int i = 0; i < bits.length; i++) {
+            if (i > 0) {
+                sb.append(",");
+            }
+            sb.append(bits[i]);
+        }
+        sb.append("]");
+        return sb.toString();
+    }
+}
diff --git a/src/main/java/de/ids_mannheim/korap/collection/SerializableDocIdSet.java b/src/main/java/de/ids_mannheim/korap/collection/SerializableDocIdSet.java
deleted file mode 100644
index 22907fe..0000000
--- a/src/main/java/de/ids_mannheim/korap/collection/SerializableDocIdSet.java
+++ /dev/null
@@ -1,19 +0,0 @@
-package de.ids_mannheim.korap.collection;
-
-import java.io.Serializable;
-
-import org.apache.lucene.util.BitDocIdSet;
-import org.apache.lucene.util.BitSet;
-
-public class SerializableDocIdSet extends BitDocIdSet implements Serializable {
-
-    /**
-     * Auto generated
-     * 
-     */
-    private static final long serialVersionUID = 171797306573832807L;
-
-    public SerializableDocIdSet (BitSet set) {
-        super(set);
-    }
-}
diff --git a/src/main/java/de/ids_mannheim/korap/collection/ToCacheVCFilter.java b/src/main/java/de/ids_mannheim/korap/collection/ToCacheVCFilter.java
index 2928de3..7c8a815 100644
--- a/src/main/java/de/ids_mannheim/korap/collection/ToCacheVCFilter.java
+++ b/src/main/java/de/ids_mannheim/korap/collection/ToCacheVCFilter.java
@@ -1,6 +1,7 @@
 package de.ids_mannheim.korap.collection;
 
 import java.io.IOException;
+import java.util.HashMap;
 import java.util.Map;
 
 import org.apache.lucene.index.LeafReader;
@@ -14,15 +15,20 @@
 import de.ids_mannheim.korap.collection.CollectionBuilder.Interface;
 import net.sf.ehcache.Element;
 
+/** Filter for virtual corpus / collection that should be cached.  
+ * 
+ * @author margaretha
+ *
+ */
 public class ToCacheVCFilter extends Filter {
 
 
     private Filter filter;
     private CollectionBuilder.Interface cbi;
     private String cacheKey;
-    private Map<Integer, DocIdSet> docIdMap;
+    private Map<Integer, DocBits> docIdMap;
 
-    public ToCacheVCFilter (String cacheKey, Map<Integer, DocIdSet> docIdMap,
+    public ToCacheVCFilter (String cacheKey, Map<Integer, DocBits> docIdMap,
                             Interface cbi, Filter filter) {
         this.cacheKey = cacheKey;
         this.docIdMap = docIdMap;
@@ -52,11 +58,11 @@
             bitset.or(docIdSet.iterator());
         }
 
-        docIdMap.put(context.hashCode(), new SerializableDocIdSet(bitset));
-        CachedVCData cachedVCData = new CachedVCData(docIdMap);
+        docIdMap.put(context.hashCode(), new DocBits(bitset.getBits()));
+        CachedVCData cachedVCData = new CachedVCData(new HashMap<>(docIdMap));
 
+        KrillCollection.cache.remove(cacheKey);
         KrillCollection.cache.put(new Element(cacheKey, cachedVCData));
-
         return docIdSet;
     }
 
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestVCCaching.java b/src/test/java/de/ids_mannheim/korap/collection/TestVCCaching.java
index 427451d..86c815c 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestVCCaching.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestVCCaching.java
@@ -1,5 +1,6 @@
 package de.ids_mannheim.korap.collection;
 
+import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
@@ -23,7 +24,6 @@
     private KrillIndex getSampleIndex () throws IOException {
         return new KrillIndex(new MMapDirectory(
                 Paths.get(getClass().getResource("/sample-index").getFile())));
-
     }
 
     private KrillIndex index;
@@ -34,27 +34,36 @@
 
     @Test
     public void testCache () throws IOException {
-        testAddToCache();
+        testManualAddToCache("named-vc/named-vc1.jsonld", "named-vc1");
+        testManualAddToCache("named-vc/named-vc2.jsonld", "named-vc2");
+        
+        Element element = KrillCollection.cache.get("named-vc1");
+        CachedVCData cc = (CachedVCData) element.getObjectValue();
+        assertTrue(cc.getDocIdMap().size() > 0);
+        
+        element = KrillCollection.cache.get("named-vc2");
+        cc = (CachedVCData) element.getObjectValue();
+        assertTrue(cc.getDocIdMap().size() > 0);
+        
+        assertFalse(KrillCollection.cache.isElementInMemory("named-vc1"));
+        assertTrue(KrillCollection.cache.isElementOnDisk("named-vc1"));
+        assertTrue(KrillCollection.cache.isElementInMemory("named-vc2"));
+        assertTrue(KrillCollection.cache.isElementOnDisk("named-vc2"));
+
         testSearchCachedVC();
-        testClearCache();
         testAddDocToIndex();
         testDelDocFromIndex();
     }
 
-    private void testAddToCache () throws IOException {
+    private void testManualAddToCache (String filename, String vcName) throws IOException {
         InputStream is = getClass().getClassLoader()
-                .getResourceAsStream("named-vc/named-vc-free.jsonld");
+                .getResourceAsStream(filename);
         String json = IOUtils.toString(is);
         is.close();
 
         KrillCollection kc = new KrillCollection(json);
         kc.setIndex(index);
-        kc.storeInCache();
-
-        Element element = KrillCollection.cache.get("cache-goe");
-        CachedVCData cc = (CachedVCData) element.getObjectValue();
-
-        assertTrue(cc.getDocIdMap().size() > 0);
+        kc.storeInCache(vcName);
     }
 
     private void testSearchCachedVC () throws IOException {
@@ -76,12 +85,12 @@
     private void testClearCache () {
         KrillCollection.cache.removeAll();
 
-        Element element = KrillCollection.cache.get("cache-goe");
+        Element element = KrillCollection.cache.get("named-vc1");
         assertNull(element);
     }
 
     public void testAddDocToIndex () throws IOException {
-        testAddToCache();
+        testManualAddToCache("named-vc/named-vc1.jsonld", "named-vc1");
 
         FieldDocument fd = new FieldDocument();
         fd.addTV("base", "x  y", "[(0-3)s:x]" + // 1
@@ -90,28 +99,23 @@
         index.addDoc(fd);
         index.commit();
         
-        Element element = KrillCollection.cache.get("cache-goe");
+        Element element = KrillCollection.cache.get("named-vc1");
         assertNull(element);
     }
     
     public void testDelDocFromIndex () throws IOException {
-        testAddToCache();
+        testManualAddToCache("named-vc/named-vc1.jsonld", "named-vc1");
 
         index.delDocs("textSigle", "GOE/AGF/00000");
         index.commit();
         
-        Element element = KrillCollection.cache.get("cache-goe");
+        Element element = KrillCollection.cache.get("named-vc1");
         assertNull(element);
     }
     
     @Test
     public void testAutoCaching () throws IOException {
-        InputStream is = getClass().getClassLoader()
-                .getResourceAsStream("collection/query-with-vc-ref.jsonld");
-        String json = IOUtils.toString(is);
-
-        String result = new Krill(json).apply(this.index).toJsonString();
-        assertNotNull(result);
-        assertTrue(!result.isEmpty());
+        testSearchCachedVC();
+        testClearCache();
     }
 }
diff --git a/src/main/resources/ehcache.xml b/src/test/resources/ehcache.xml
similarity index 68%
rename from src/main/resources/ehcache.xml
rename to src/test/resources/ehcache.xml
index 00465bf..b293534 100644
--- a/src/main/resources/ehcache.xml
+++ b/src/test/resources/ehcache.xml
@@ -12,10 +12,11 @@
 
 	<cache name="named_vc" 
 		eternal="true" 
-		memoryStoreEvictionPolicy="LRU" 
-		maxBytesLocalHeap="256M" >
-		<!-- maxBytesLocalDisk="1G"> 
-		<persistence strategy="localTempSwap"/> -->
+		memoryStoreEvictionPolicy="LRU"
+		maxElementsInMemory="1" 
+		maxBytesLocalDisk="1G"
+		diskExpiryThreadIntervalSeconds = "120" > 
+		<persistence strategy="localTempSwap"/>
 	</cache>
 
 </ehcache>
\ No newline at end of file
diff --git a/src/test/resources/named-vc/named-vc-free.jsonld b/src/test/resources/named-vc/named-vc1.jsonld
similarity index 76%
rename from src/test/resources/named-vc/named-vc-free.jsonld
rename to src/test/resources/named-vc/named-vc1.jsonld
index 65a6c52..0fddd89 100644
--- a/src/test/resources/named-vc/named-vc-free.jsonld
+++ b/src/test/resources/named-vc/named-vc1.jsonld
@@ -1,9 +1,9 @@
 {"collection": {
-    "name" : "cache-goe",
+    "name" : "named-vc1",
     "@type": "koral:doc",
     "key": "textSigle",
     "match": "match:ne",
-    "type" : "type:string[]",
+    "type" : "type:string",
     "value": [
         "GOE/AGF/00000",
         "GOE/AGA/01784"
diff --git a/src/test/resources/named-vc/named-vc2.jsonld b/src/test/resources/named-vc/named-vc2.jsonld
new file mode 100644
index 0000000..be882a0
--- /dev/null
+++ b/src/test/resources/named-vc/named-vc2.jsonld
@@ -0,0 +1,13 @@
+{"collection": {
+    "name" : "named-vc2",
+    "@type": "koral:doc",
+    "key": "textSigle",
+    "match": "match:eq",
+    "type" : "type:string",
+    "value": [
+        "GOE/AGA/02232",
+        "GOE/AGA/02616",
+        "GOE/AGA/03828"
+    ],
+    "cache" : "true"
+}}
diff --git a/src/test/resources/named-vc/named-vc-free.jsonld b/vc/named-vc1.jsonld
similarity index 76%
copy from src/test/resources/named-vc/named-vc-free.jsonld
copy to vc/named-vc1.jsonld
index 65a6c52..0fddd89 100644
--- a/src/test/resources/named-vc/named-vc-free.jsonld
+++ b/vc/named-vc1.jsonld
@@ -1,9 +1,9 @@
 {"collection": {
-    "name" : "cache-goe",
+    "name" : "named-vc1",
     "@type": "koral:doc",
     "key": "textSigle",
     "match": "match:ne",
-    "type" : "type:string[]",
+    "type" : "type:string",
     "value": [
         "GOE/AGF/00000",
         "GOE/AGA/01784"