Implemented referencing cached collection.
Change-Id: I02f2ee84b3f8e50c5632efc9363eb87d97a754f8
diff --git a/.gitignore b/.gitignore
index 04618c9..a89d9ae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,4 +24,5 @@
/src/main/resources/korap.conf
/dependency-reduced-pom.xml
/bin
-/src/test/resources/sample-index
\ No newline at end of file
+/src/test/resources/sample-index
+/krill_cache/
diff --git a/Changes b/Changes
index 50d7f41..0e620b5 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+0.58 2018-07-24
+ - [feature] Implemented referencing of cached collections (margaretha)
+
0.57 2018-04-05
- [feature] Support text queries in metadata
(requires reindexing to work properly; diewald)
diff --git a/pom.xml b/pom.xml
index 5b72473..45ed86f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -35,7 +35,7 @@
<groupId>de.ids_mannheim.korap</groupId>
<artifactId>Krill</artifactId>
- <version>0.57.0</version>
+ <version>0.58.0</version>
<packaging>jar</packaging>
<name>Krill</name>
@@ -106,6 +106,12 @@
<artifactId>jul-to-slf4j</artifactId>
<version>1.7.25</version>
</dependency>
+
+ <dependency>
+ <groupId>net.sf.ehcache</groupId>
+ <artifactId>ehcache</artifactId>
+ <version>2.10.5</version>
+ </dependency>
<!-- SQLite for database connection tests -->
<dependency>
diff --git a/src/main/java/de/ids_mannheim/korap/KrillCollection.java b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
index afd544e..8145f86 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillCollection.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
@@ -1,27 +1,39 @@
package de.ids_mannheim.korap;
import java.io.IOException;
+import java.io.Serializable;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.BitsFilteredDocIdSet;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.util.BitDocIdSet;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import de.ids_mannheim.korap.collection.CachedCollection;
import de.ids_mannheim.korap.collection.CollectionBuilder;
import de.ids_mannheim.korap.response.Notifications;
import de.ids_mannheim.korap.util.QueryException;
import de.ids_mannheim.korap.util.StatusCodes;
-
-import org.apache.lucene.search.*;
-import org.apache.lucene.index.*;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.FixedBitSet;
-import org.apache.lucene.util.BitDocIdSet;
-import org.apache.lucene.util.Bits;
-import org.apache.lucene.search.BitsFilteredDocIdSet;
-
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.fasterxml.jackson.databind.JsonNode;
-
-import java.nio.ByteBuffer;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import net.sf.ehcache.Cache;
+import net.sf.ehcache.CacheManager;
+import net.sf.ehcache.Element;
/**
* Create a Virtual Collection of documents by means of a KoralQuery
@@ -50,13 +62,16 @@
private static ByteBuffer bb = ByteBuffer.allocate(4);
// Logger
- private final static Logger log = LoggerFactory
- .getLogger(KrillCollection.class);
+ private final static Logger log =
+ LoggerFactory.getLogger(KrillCollection.class);
// This advices the java compiler to ignore all loggings
public static final boolean DEBUG = false;
-
+ public static Cache cache = CacheManager.newInstance().getCache("named_vc");
+ private boolean isNamedVC = false;
+ private Serializable name;
+
/**
* Construct a new KrillCollection.
*
@@ -85,34 +100,35 @@
ObjectMapper mapper = new ObjectMapper();
try {
JsonNode json = mapper.readTree(jsonString);
-
- if (json.has("errors") && json.get("errors").size()>0){
- this.addError(StatusCodes.INVALID_QUERY,"Json has errors.");
+
+ if (json.has("errors") && json.get("errors").size() > 0) {
+ this.addError(StatusCodes.INVALID_QUERY, "Json has errors.");
}
- else if (json.has("collection")){
+ else if (json.has("collection")) {
this.fromKoral(json.get("collection"));
}
- else if (json.has("collections")){
+ else if (json.has("collections")) {
this.addError(899,
"Collections are not supported anymore in favour of a single collection");
}
- else{
- this.addError(StatusCodes.MISSING_COLLECTION, "Collection is not found.");
- this.fromBuilder(this.build().nothing());
+ else {
+ this.addError(StatusCodes.MISSING_COLLECTION,
+ "Collection is not found.");
+ this.fromBuilder(this.build().nothing());
}
}
// Query Exception
catch (QueryException qe) {
this.addError(qe.getErrorCode(), qe.getMessage());
- this.fromBuilder(this.build().nothing());
+ this.fromBuilder(this.build().nothing());
}
// JSON exception
catch (IOException e) {
this.addError(621, "Unable to parse JSON", "KrillCollection",
e.getLocalizedMessage());
- this.fromBuilder(this.build().nothing());
+ this.fromBuilder(this.build().nothing());
};
};
@@ -143,7 +159,7 @@
}
catch (Exception e) {
this.addError(621, "Unable to parse JSON", "KrillCollection");
- this.fromBuilder(this.build().nothing());
+ this.fromBuilder(this.build().nothing());
};
return this;
@@ -181,11 +197,9 @@
String valtype = "type:string";
String match = "match:eq";
- if (json.has("key"))
- key = json.get("key").asText();
+ if (json.has("key")) key = json.get("key").asText();
- if (json.has("type"))
- valtype = json.get("type").asText();
+ if (json.has("type")) valtype = json.get("type").asText();
// Filter based on date
if (valtype.equals("type:date")) {
@@ -195,8 +209,7 @@
String dateStr = json.get("value").asText();
- if (json.has("match"))
- match = json.get("match").asText();
+ if (json.has("match")) match = json.get("match").asText();
// TODO: This isn't stable yet
switch (match) {
@@ -216,8 +229,7 @@
// Filter based on string
else if (valtype.equals("type:string")) {
- if (json.has("match"))
- match = json.get("match").asText();
+ if (json.has("match")) match = json.get("match").asText();
switch (match) {
@@ -227,21 +239,20 @@
return this.cb.term(key, json.get("value").asText())
.not();
- // Contains and containsnot (or excludes) is only
- // effective on text fields and ineffective on
- // string fields
+ // Contains and containsnot (or excludes) is only
+ // effective on text fields and ineffective on
+ // string fields
case "match:contains":
- return this.cb.text(key,
- json.get("value").asText());
+ return this.cb.text(key, json.get("value").asText());
case "match:containsnot":
- return this.cb.text(key,
- json.get("value").asText()).not();
+ return this.cb.text(key, json.get("value").asText())
+ .not();
// <LEGACY>
case "match:excludes":
- return this.cb.text(key,
- json.get("value").asText()).not();
+ return this.cb.text(key, json.get("value").asText())
+ .not();
// </LEGACY>
};
@@ -252,8 +263,7 @@
// Filter based on regex
else if (valtype.equals("type:regex")) {
- if (json.has("match"))
- match = json.get("match").asText();
+ if (json.has("match")) match = json.get("match").asText();
if (match.equals("match:eq")) {
return this.cb.re(key, json.get("value").asText());
@@ -262,21 +272,22 @@
return this.cb.re(key, json.get("value").asText()).not();
}
- // Contains and containsnot (or excludes) is
- // identical to eq and ne in case of regexes for the moment,
- // though it may be beneficial to circumfix these
- // with .*
+ // Contains and containsnot (or excludes) are
+ // identical to eq and ne in case of regexes
+ // for the moment, though it may be
+ // beneficial to circumfix these
+ // with .*
else if (match.equals("match:contains")) {
return this.cb.re(key, json.get("value").asText());
}
else if (match.equals("match:containsnot")) {
return this.cb.re(key, json.get("value").asText());
}
- // <LEGACY>
+ // <LEGACY>
else if (match.equals("match:excludes")) {
return this.cb.re(key, json.get("value").asText()).not();
};
- // </LEGACY>
+ // </LEGACY>
throw new QueryException(841,
"Match relation unknown for type");
@@ -311,6 +322,31 @@
};
return group;
}
+ // vc reference
+ else if (type.equals("koral:docGroupRef")) {
+ if (!json.has("ref")) {
+ throw new QueryException(StatusCodes.MISSING_VC_REFERENCE,
+ "ref is not found");
+ }
+
+ String ref = json.get("ref").asText();
+ if (ref.isEmpty()){
+ throw new QueryException(StatusCodes.MISSING_VC_REFERENCE,
+ "ref is empty");
+ }
+
+ Element element = cache.get(ref);
+ if (element == null){
+ this.addError(StatusCodes.MISSING_COLLECTION,
+ "Collection is not found.");
+ return this.build().nothing();
+ }
+ else {
+ CachedCollection cc = (CachedCollection) element.getObjectValue();
+ return cb.namedVC(cc);
+ }
+ }
+
// Unknown type
throw new QueryException(813, "Collection type is not supported");
@@ -374,8 +410,7 @@
* Serialize collection to a {@link Filter} object.
*/
public Filter toFilter () {
- if (this.cbi == null)
- return null;
+ if (this.cbi == null) return null;
return this.cbi.toFilter();
};
@@ -386,8 +421,7 @@
* not.
*/
public boolean isNegative () {
- if (this.cbi == null)
- return false;
+ if (this.cbi == null) return false;
return this.cbi.isNegative();
};
@@ -404,8 +438,7 @@
*/
public String toString () {
Filter filter = this.toFilter();
- if (filter == null)
- return "";
+ if (filter == null) return "";
return (this.isNegative() ? "-" : "") + filter.toString();
};
@@ -438,6 +471,7 @@
* @throws IOException
*/
public FixedBitSet bits (LeafReaderContext atomic) throws IOException {
+
LeafReader r = atomic.reader();
FixedBitSet bitset = new FixedBitSet(r.maxDoc());
DocIdSet docids = this.getDocIdSet(atomic, (Bits) r.getLiveDocs());
@@ -476,8 +510,7 @@
Filter filter;
if (this.cbi == null || (filter = this.cbi.toFilter()) == null) {
- if (acceptDocs == null)
- return null;
+ if (acceptDocs == null) return null;
bitset.set(0, maxDoc);
}
@@ -485,12 +518,11 @@
// Init vector
DocIdSet docids = filter.getDocIdSet(atomic, null);
- DocIdSetIterator filterIter = (docids == null) ? null
- : docids.iterator();
+ DocIdSetIterator filterIter =
+ (docids == null) ? null : docids.iterator();
if (filterIter == null) {
- if (!this.cbi.isNegative())
- return null;
+ if (!this.cbi.isNegative()) return null;
bitset.set(0, maxDoc);
}
@@ -499,8 +531,7 @@
bitset.or(filterIter);
// Revert for negation
- if (this.cbi.isNegative())
- bitset.flip(0, maxDoc);
+ if (this.cbi.isNegative()) bitset.flip(0, maxDoc);
};
};
@@ -539,18 +570,15 @@
public long numberOf (String field, String type) throws IOException {
// No index defined
- if (this.index == null)
- return (long) -1;
+ if (this.index == null) return (long) -1;
// No reader (inex is empty)
- if (this.index.reader() == null)
- return (long) 0;
+ if (this.index.reader() == null) return (long) 0;
// This is redundant to index stuff
if (type.equals("documents") || type.equals("base/texts")) {
if (this.cbi == null) {
- if (this.index.reader() == null)
- return (long) 0;
+ if (this.index.reader() == null) return (long) 0;
return (long) this.index.reader().numDocs();
}
else
@@ -561,8 +589,7 @@
// This may be prefixed by foundries
Term term = new Term(field, "-:" + type);
- if (DEBUG)
- log.debug("Iterate for {}/{}", field, type);
+ if (DEBUG) log.debug("Iterate for {}/{}", field, type);
long occurrences = 0;
try {
@@ -570,13 +597,11 @@
for (LeafReaderContext atomic : this.index.reader().leaves()) {
Bits bits = this.bits(atomic);
- if (DEBUG)
- log.debug("Final bits {}", _bits(bits));
+ if (DEBUG) log.debug("Final bits {}", _bits(bits));
occurrences += this._numberOfAtomic(bits, atomic, term);
- if (DEBUG)
- log.debug("Added up to {} for {}/{}", occurrences, field,
- type);
+ if (DEBUG) log.debug("Added up to {} for {}/{}", occurrences,
+ field, type);
};
}
@@ -603,7 +628,8 @@
// Todo: Maybe reuse a termsEnum!
final TermsEnum termsEnum = terms.iterator(null);
- // Set the position in the iterator to the term that is seeked
+ // Set the position in the iterator to the term that is
+ // sought
if (termsEnum.seekExact(term.bytes())) {
// TODO: Reuse a DocsAndPositionsEnum!!
@@ -615,8 +641,7 @@
// The iterator is empty
// This may even be an error, but we return 0
- if (docs.docID() == DocsAndPositionsEnum.NO_MORE_DOCS)
- return 0;
+ if (docs.docID() == DocsAndPositionsEnum.NO_MORE_DOCS) return 0;
// Init some variables for data copying
long occurrences = 0;
@@ -625,8 +650,7 @@
// Init nextDoc()
while (docs.nextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS) {
- if (docs.freq() < 1)
- continue;
+ if (docs.freq() < 1) continue;
// Initialize (go to first term)
docs.nextPosition();
@@ -640,11 +664,10 @@
// Add payload as integer
occurrences += bb.wrap(pl).getInt();
- if (DEBUG)
- log.debug(
- "Value for {} incremented by {} to {} in {}",
- term, bb.wrap(pl).getInt(), occurrences,
- docs.docID());
+ if (DEBUG) log.debug(
+ "Value for {} incremented by {} to {} in {}",
+ term, bb.wrap(pl).getInt(), occurrences,
+ docs.docID());
};
};
@@ -668,8 +691,7 @@
public long docCount () {
// No index defined
- if (this.index == null)
- return (long) 0;
+ if (this.index == null) return (long) 0;
// TODO: Caching!
@@ -697,6 +719,22 @@
};
+    /** Collect the bit vector of each index segment (result is not used yet). */
+    private void createDocVector () {
+        // leaves() below needs an index with a reader; otherwise do nothing
+        if (this.index == null || this.index.reader() == null) return;
+        List<FixedBitSet> bitSetList = new ArrayList<>();
+        try {
+            for (LeafReaderContext atomic : this.index.reader().leaves()) {
+                FixedBitSet bitset = this.bits(atomic);
+                if (bitset != null) bitSetList.add(bitset);
+            }
+        }
+        catch (IOException e) {
+            log.warn("Unable to create document vector", e);
+        }
+    }
+
/*
* Analyze how terms relate
*/
diff --git a/src/main/java/de/ids_mannheim/korap/collection/CachedCollection.java b/src/main/java/de/ids_mannheim/korap/collection/CachedCollection.java
new file mode 100644
index 0000000..9d393f0
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/collection/CachedCollection.java
@@ -0,0 +1,44 @@
+package de.ids_mannheim.korap.collection;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.DocIdSet;
+
+/**
+ * Holds the document id sets of a cached (named) virtual collection,
+ * one {@link DocIdSet} per {@link LeafReaderContext}.
+ *
+ * NOTE(review): the map is keyed by reader context objects; presumably
+ * entries become stale when the index reader is reopened - confirm how
+ * the cache is invalidated.
+ */
+public class CachedCollection {
+
+    // Never null, so that lookups do not need a null check
+    private Map<LeafReaderContext, DocIdSet> docIdMap = new HashMap<>();
+
+    /**
+     * Get the cached document id sets.
+     *
+     * @return the map from reader context to document id set; empty if
+     *         none has been set
+     */
+    public Map<LeafReaderContext, DocIdSet> getDocIdMap () {
+        return docIdMap;
+    }
+
+    /**
+     * Replace the cached document id sets.
+     *
+     * @param docIdMap
+     *            the map from reader context to document id set;
+     *            <code>null</code> is treated as an empty map
+     */
+    public void setDocIdMap (Map<LeafReaderContext, DocIdSet> docIdMap) {
+        this.docIdMap = (docIdMap != null) ? docIdMap
+                : new HashMap<LeafReaderContext, DocIdSet>();
+    }
+
+}
diff --git a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
index 6aeca37..3fceebb 100644
--- a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
+++ b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
@@ -1,24 +1,22 @@
package de.ids_mannheim.korap.collection;
-import java.util.*;
import java.io.IOException;
-import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Iterator;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.queries.TermsFilter;
-import org.apache.lucene.search.*;
-import org.apache.lucene.search.NumericRangeFilter;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-
-import de.ids_mannheim.korap.util.KrillDate;
-import de.ids_mannheim.korap.index.TextPrependedTokenStream;
-
+import org.apache.lucene.queries.TermsFilter;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.search.NumericRangeFilter;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.QueryWrapperFilter;
+import org.apache.lucene.search.RegexpQuery;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import de.ids_mannheim.korap.KrillCollection;
-import de.ids_mannheim.korap.collection.BooleanGroupFilter;
+import de.ids_mannheim.korap.index.TextPrependedTokenStream;
+import de.ids_mannheim.korap.util.KrillDate;
/*
* TODO: Optimize!
@@ -374,4 +372,35 @@
return this;
};
};
+
+    /** Builder element referring to a cached (named) virtual collection. */
+    public class NamedVC implements CollectionBuilder.Interface {
+        private boolean isNegative = false;
+        private CachedCollection cachedCollection;
+
+        public NamedVC (CachedCollection cc) {
+            cachedCollection = cc;
+        }
+
+        @Override
+        public CollectionBuilder.Interface not () {
+            isNegative = true; // only ever switched on, never back off
+            return this;
+        }
+
+        @Override
+        public boolean isNegative () {
+            return isNegative;
+        }
+
+        @Override
+        public Filter toFilter () {
+            return new NamedVCFilter(this.cachedCollection); // new filter per call
+        }
+    }
+
+    /** Create a builder element for the given cached collection. */
+    public Interface namedVC (CachedCollection cc) {
+        return new NamedVC(cc);
+    }
};
diff --git a/src/main/java/de/ids_mannheim/korap/collection/NamedVCFilter.java b/src/main/java/de/ids_mannheim/korap/collection/NamedVCFilter.java
new file mode 100644
index 0000000..63e89e0
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/collection/NamedVCFilter.java
@@ -0,0 +1,55 @@
+package de.ids_mannheim.korap.collection;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.BitsFilteredDocIdSet;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.util.Bits;
+
+/**
+ * Filter that answers from the document id sets stored in a
+ * {@link CachedCollection} instead of evaluating a query.
+ */
+public class NamedVCFilter extends Filter {
+
+    private final CachedCollection cachedCollection;
+
+    /**
+     * Create a filter on top of a cached collection.
+     *
+     * @param cachedCollection
+     *            the cached collection to read document id sets from
+     */
+    public NamedVCFilter (CachedCollection cachedCollection) {
+        this.cachedCollection = cachedCollection;
+    }
+
+    /**
+     * Look up the cached document id set of a leaf reader context.
+     *
+     * @param context
+     *            the leaf reader context to look up
+     * @param acceptDocs
+     *            the documents that are allowed to match, or
+     *            <code>null</code> to allow all
+     * @return the cached set restricted to <code>acceptDocs</code>,
+     *         or <code>null</code> if nothing is cached for the context
+     */
+    @Override
+    public DocIdSet getDocIdSet (LeafReaderContext context, Bits acceptDocs)
+            throws IOException {
+
+        // Nothing cached at all: report "no documents" instead of an NPE
+        if (cachedCollection == null) return null;
+        Map<LeafReaderContext, DocIdSet> docIdMap =
+                cachedCollection.getDocIdMap();
+        if (docIdMap == null) return null;
+
+        // wrap() passes a null set or null acceptDocs through unchanged
+        return BitsFilteredDocIdSet.wrap(docIdMap.get(context), acceptDocs);
+    }
+
+}
diff --git a/src/main/java/de/ids_mannheim/korap/util/StatusCodes.java b/src/main/java/de/ids_mannheim/korap/util/StatusCodes.java
index bb97874..9aad2f0 100644
--- a/src/main/java/de/ids_mannheim/korap/util/StatusCodes.java
+++ b/src/main/java/de/ids_mannheim/korap/util/StatusCodes.java
@@ -39,6 +39,8 @@
public static final int MISSING_RELATION_NODE = 717;
public static final int MISSING_RELATION_TERM = 718;
public static final int INVALID_QUERY = 719;
+ public static final int MISSING_VC_REFERENCE = 720;
+
public static final int INVALID_MATCH_ID = 730;
public static final int MISSING_KEY = 740;
public static final int UNKNOWN_MATCH_RELATION = 741;
diff --git a/src/main/resources/ehcache.xml b/src/main/resources/ehcache.xml
new file mode 100644
index 0000000..0cb69b7
--- /dev/null
+++ b/src/main/resources/ehcache.xml
@@ -0,0 +1,29 @@
+<!--
+  Ehcache configuration for Krill.
+  updateCheck is disabled so that starting the cache manager does not
+  look for new Ehcache versions over the Internet.
+-->
+<ehcache xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xsi:noNamespaceSchemaLocation="ehcache.xsd"
+    updateCheck="false" monitoring="autodetect" dynamicConfig="true">
+
+    <!-- Directory used when a cache overflows to disk -->
+    <diskStore path="./krill_cache" />
+
+    <!-- Defaults for caches that are added without settings of their own -->
+    <defaultCache
+        maxEntriesLocalHeap="0"
+        eternal="false"
+        timeToIdleSeconds="1200"
+        timeToLiveSeconds="1200">
+    </defaultCache>
+
+    <!-- Named virtual collections, looked up via getCache("named_vc") -->
+    <cache name='named_vc'
+        eternal='true'
+        memoryStoreEvictionPolicy="LRU"
+        maxBytesLocalHeap="256M"
+        maxBytesLocalDisk="1G"
+        overflowToDisk='true' />
+
+</ehcache>
\ No newline at end of file
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestVCReference.java b/src/test/java/de/ids_mannheim/korap/collection/TestVCReference.java
new file mode 100644
index 0000000..0a67fa2
--- /dev/null
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestVCReference.java
@@ -0,0 +1,47 @@
+package de.ids_mannheim.korap.collection;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.junit.Test;
+
+import de.ids_mannheim.korap.KrillCollection;
+import de.ids_mannheim.korap.response.Message;
+import de.ids_mannheim.korap.util.StatusCodes;
+
+/**
+ * Tests for virtual collection references (koral:docGroupRef).
+ */
+public class TestVCReference {
+
+    private static final String UNKNOWN_VC_REF =
+            "collection/unknown-vc-ref.jsonld";
+
+    /**
+     * A reference to a virtual collection that is not in the cache
+     * is reported as exactly one MISSING_COLLECTION error.
+     */
+    @Test
+    public void testUnknownVC () throws IOException {
+        String json;
+        // Close the resource stream even if reading fails
+        try (InputStream is = getClass().getClassLoader()
+                .getResourceAsStream(UNKNOWN_VC_REF)) {
+            // Fail with a clear message when the resource is missing
+            assertNotNull("Missing test resource " + UNKNOWN_VC_REF, is);
+            // Read as UTF-8 rather than the platform default charset
+            json = IOUtils.toString(is, "UTF-8");
+        }
+
+        KrillCollection kc = new KrillCollection(json);
+        List<Message> messages = kc.getErrors().getMessages();
+        assertEquals(1, messages.size());
+
+        assertEquals(StatusCodes.MISSING_COLLECTION, messages.get(0).getCode());
+    }
+}
diff --git a/src/test/resources/collection/unknown-vc-ref.jsonld b/src/test/resources/collection/unknown-vc-ref.jsonld
new file mode 100644
index 0000000..8c7e80e
--- /dev/null
+++ b/src/test/resources/collection/unknown-vc-ref.jsonld
@@ -0,0 +1,4 @@
+{"collection": {
+ "@type": "koral:docGroupRef",
+ "ref": "https://korap.ids-mannheim.de/@ndiewald/MyCorpus"
+}}
diff --git a/src/test/resources/named-vc/named-vc-free.jsonld b/src/test/resources/named-vc/named-vc-free.jsonld
new file mode 100644
index 0000000..4e8cd83
--- /dev/null
+++ b/src/test/resources/named-vc/named-vc-free.jsonld
@@ -0,0 +1,11 @@
+{"collection": {
+ "name" : "cache-goe",
+ "@type": "koral:doc",
+ "key": "textSigle",
+ "match": "match:eq",
+ "value": [
+ "GOE/AGF/00000",
+ "GOE/AGA/01784"
+ ]
+}}
+