Add leaf cache
Change-Id: Ib630e0327cf64c246c3648361e5a6844ea30cf3a
diff --git a/Changes b/Changes
index 07d41a6..fbe8c4c 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,5 @@
+ - [performance] Add leaf cache. (diewald)
+
0.64.5 2025-12-03
- [maintenance] Update to Java 21 (diewald)
- [enhancement] Alter vcNamePattern to allow system VC names with less
diff --git a/pom.xml b/pom.xml
index 88e1120..79974de 100644
--- a/pom.xml
+++ b/pom.xml
@@ -35,7 +35,7 @@
<groupId>de.ids-mannheim.korap.krill</groupId>
<artifactId>Krill</artifactId>
- <version>0.64.5</version>
+ <version>0.64.6</version>
<packaging>jar</packaging>
<name>Krill</name>
@@ -163,6 +163,13 @@
<version>1.1</version>
</dependency>
+ <!-- Caffeine -->
+ <dependency>
+ <groupId>com.github.ben-manes.caffeine</groupId>
+ <artifactId>caffeine</artifactId>
+ <version>3.1.8</version>
+ </dependency>
+
<!-- Jersey -->
<dependency>
<groupId>org.glassfish.jersey.containers</groupId>
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 431ce7d..0bcd1cd 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -57,6 +57,8 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.github.benmanes.caffeine.cache.*;
+
import com.fasterxml.jackson.databind.ObjectMapper;
// Krill classes
@@ -179,6 +181,17 @@
private HashMap termContexts;
private ObjectMapper mapper = new ObjectMapper();
+ // Prelim CacheKey
+ record PrelimCacheKey(int queryHash, int collHash, short itemsPerResource) {}
+
+ // CacheKey
+ record SearchCacheKey(PrelimCacheKey prelimck, String atomicHash) {}
+
+ // CacheValue
+ record SearchCacheValue(long matchCount, long matchDocCount) {}
+
+ Cache<SearchCacheKey, SearchCacheValue> searchCache;
+
// private ByteBuffer bbTerm;
// Some initializations ...
@@ -193,8 +206,11 @@
// Check for auto commit value
String autoCommitStr = null;
+ String cacheSizeStr = null;
+ int cacheSize = (64 * 1024 * 1024); // 64 MB
if (prop != null) {
autoCommitStr = prop.getProperty("krill.index.commit.auto");
+ cacheSizeStr = prop.getProperty("krill.cache.size");
}
if (autoCommitStr != null) {
@@ -206,6 +222,20 @@
"krill.index.commit.auto expected to be a numerical value");
};
};
+
+ if (cacheSizeStr != null) {
+ try {
+ int retVal = Integer.parseInt(cacheSizeStr);
+ cacheSize = retVal;
+ } catch (NumberFormatException e) {
+ log.warn("krill.cache.size expected to be a numerical value");
+ }
+ };
+
+ searchCache = Caffeine.newBuilder()
+ .maximumWeight(cacheSize)
+ .weigher((SearchCacheKey key, SearchCacheValue value) -> 80) // estimate per-entry size
+ .build();
};
@@ -1459,10 +1489,12 @@
tthread.start();
final long timeout = meta.getTimeOut();
boolean isTimeout = false;
-
+
// See: http://www.ibm.com/developerworks/java/library/j-benchmark1/index.html
long t1 = System.nanoTime();
+ int fromCache = 0;
+
try {
// Rewrite query (for regex and wildcard queries)
// Revise!
@@ -1478,6 +1510,15 @@
if (DEBUG)
log.trace("Rewritten query is {}", query.toString());
+ int qHash = query.toString().hashCode();
+ int collHash = 0;
+ Filter collf = collection.toFilter();
+ if (collf != null) {
+ collHash = collf.toString().hashCode();
+ };
+
+ PrelimCacheKey prelim = new PrelimCacheKey(qHash, collHash, itemsPerResource);
+
// Todo: run this in a separated thread
for (LeafReaderContext atomic : this.reader().leaves()) {
@@ -1485,15 +1526,55 @@
if (isTimeout)
break;
+
+ SearchCacheKey finalCacheKey = new SearchCacheKey(prelim, atomic.reader().getCombinedCoreAndDeletesKey().toString());
+ SearchCacheValue foundCache = searchCache.getIfPresent(finalCacheKey);
+ if (foundCache != null) {
+ if (DEBUG) {
+ log.trace(
+                         "Found cache for Query: {}, Collection: {}, itemsPerResource: {}, Reader: {}",
+ qHash, collHash, itemsPerResource, atomic.reader().getCombinedCoreAndDeletesKey().toString()
+ );
+ };
+
+ if (startIndex > (i + foundCache.matchCount)) {
+ fromCache += foundCache.matchCount;
+ i += foundCache.matchCount;
+ j += foundCache.matchDocCount;
+ continue;
+ };
+ } else if (DEBUG) {
+ log.trace(
+                     "Found no cache for Query: {}, Collection: {}, itemsPerResource: {}, Reader: {}",
+ qHash, collHash, itemsPerResource, atomic.reader().getCombinedCoreAndDeletesKey().toString()
+ );
+ }
+
/*
* Todo: There may be a way to know early if the bitset is emty
* by using LongBitSet - but this may not be as fast as I think.
*/
final FixedBitSet bitset = collection.bits(atomic);
- if (bitset.nextSetBit(0) == DocIdSetIterator.NO_MORE_DOCS)
+ if (bitset.nextSetBit(0) == DocIdSetIterator.NO_MORE_DOCS) {
+ if (foundCache == null)
+ searchCache.put(
+ finalCacheKey,
+ new SearchCacheValue(0, 0)
+ );
+
+ if (DEBUG) {
+ log.trace(
+                             "Store cache (1) for Query: {}, Collection: {}, itemsPerResource: {}, Reader: {}, store:0/0",
+ qHash,
+ collHash,
+ itemsPerResource,
+ atomic.reader().getCombinedCoreAndDeletesKey().toString()
+ );
+ };
continue;
+ };
final PositionsToOffset pto = snippets ? new PositionsToOffset(atomic, field) : null;
@@ -1504,6 +1585,9 @@
final IndexReader lreader = atomic.reader();
int localDocID, docID;
+ long li = i;
+ long lj = j;
+
// TODO: Get document information from Cache! Fieldcache?
for (; i < hits; i++) {
@@ -1511,9 +1595,28 @@
log.trace("Match Nr {}/{}", i, count);
// There are no more spans to find
- if (!spans.next())
- break;
+ if (!spans.next()) {
+ if (foundCache == null)
+ foundCache = new SearchCacheValue(i - li, j - lj);
+ searchCache.put(
+ finalCacheKey,
+ foundCache
+ );
+ if (DEBUG) {
+ log.trace(
+                             "Store cache (2) for Query: {}, Collection: {}, itemsPerResource: {}, Reader: {}, store:{}/{}",
+ qHash,
+ collHash,
+ itemsPerResource,
+ atomic.reader().getCombinedCoreAndDeletesKey().toString(),
+ i - li,
+ j - lj
+ );
+ };
+ break;
+ };
+
// Increment resource counter
itemsPerResourceCounter++;
@@ -1680,6 +1783,28 @@
oldLocalDocID = localDocID;
i++;
};
+
+ if (!isTimeout && !cutoff) {
+ if (foundCache == null) {
+ searchCache.put(
+ finalCacheKey,
+                         new SearchCacheValue(i - li, j - lj)
+ );
+
+ if (DEBUG) {
+ log.trace(
+                             "Store cache (3) for Query: {}, Collection: {}, itemsPerResource: {}, Reader: {}, store:{}/{}",
+ qHash,
+ collHash,
+ itemsPerResource,
+ atomic.reader().getCombinedCoreAndDeletesKey().toString(),
+ i - li,
+ j - lj
+ );
+ };
+ };
+ };
+
atomicMatches.clear();
};
@@ -1712,6 +1837,9 @@
e.printStackTrace();
}
+ if (fromCache > 0)
+ kr.addMessage(0, "Some results were cached", String.valueOf(fromCache));
+
// Stop timer thread
tthread.stopTimer();
diff --git a/src/main/java/de/ids_mannheim/korap/query/SimpleSpanQuery.java b/src/main/java/de/ids_mannheim/korap/query/SimpleSpanQuery.java
index 956e4d2..6b35757 100644
--- a/src/main/java/de/ids_mannheim/korap/query/SimpleSpanQuery.java
+++ b/src/main/java/de/ids_mannheim/korap/query/SimpleSpanQuery.java
@@ -428,7 +428,10 @@
/** {@inheritDoc} */
@Override
public int hashCode () {
- int hc = firstClause.hashCode();
+ int hc = 0;
+ if (firstClause != null)
+ hc += firstClause.hashCode();
+
if (secondClause != null) {
hc += secondClause.hashCode();
}
diff --git a/src/test/java/de/ids_mannheim/korap/search/TestKrill.java b/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
index e03e7ac..14bb1dc 100644
--- a/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
+++ b/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
@@ -1442,7 +1442,99 @@
assertEquals(0, kr.getTotalResults());
};
-
+
+ @Test
+ public void queryJSONcachedResults () throws IOException {
+ KrillIndex ki = new KrillIndex();
+
+ // Indexing test files
+ for (String i : new String[] {
+ "00001",
+ "00002",
+ }) {
+ ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
+ true);
+ };
+
+ ki.commit();
+
+ // Indexing test files
+ for (String i : new String[] {
+ "00003",
+ }) {
+ ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
+ true);
+ };
+
+ ki.commit();
+
+ // Indexing test files
+ for (String i : new String[] {
+ "00004",
+ "00005",
+ }) {
+ ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
+ true);
+ };
+
+ ki.commit();
+
+
+ // Indexing test files
+ for (String i : new String[] {
+ "00006",
+ "02439"
+ }) {
+ ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
+ true);
+ };
+
+ ki.commit();
+
+
+ Krill k = new Krill(new QueryBuilder("tokens").seg("s:der"));
+ Result kr = k.apply(ki);
+
+ KrillMeta meta = k.getMeta();
+ assertEquals(86, kr.getTotalResults());
+
+ meta.setStartIndex(25);
+ assertNull(kr.getMessage(0));
+
+ kr = k.apply(ki);
+ assertEquals(86, kr.getTotalResults());
+ assertNull(kr.getMessage(0));
+
+ k = new Krill(new QueryBuilder("tokens").seg("s:der"));
+ meta = k.getMeta();
+ meta.setStartIndex(50);
+
+ kr = k.apply(ki);
+ assertEquals(86, kr.getTotalResults());
+         assertEquals("Some results were cached", kr.getMessage(0).getMessage());
+
+ k = new Krill(new QueryBuilder("tokens").seg("s:die"));
+ meta = k.getMeta();
+ meta.setStartIndex(50);
+
+ kr = k.apply(ki);
+ assertEquals(59, kr.getTotalResults());
+ assertNull(kr.getMessage(0));
+
+ k = new Krill(new QueryBuilder("tokens").seg("s:Buchstabe"));
+ meta = k.getMeta();
+ meta.setStartIndex(50);
+
+ kr = k.apply(ki);
+ assertEquals(12, kr.getTotalResults());
+ assertNull(kr.getMessage(0));
+
+ kr = k.apply(ki);
+ assertEquals(12, kr.getTotalResults());
+         assertEquals("Some results were cached", kr.getMessage(0).getMessage());
+ };
+
+
/**
* This is a Schreibgebrauch ressource that didn't work for