Add leaf cache
Change-Id: Ib630e0327cf64c246c3648361e5a6844ea30cf3a
diff --git a/Changes b/Changes
index 07d41a6..fbe8c4c 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,5 @@
+ - [performance] Add leaf cache. (diewald)
+
0.64.5 2025-12-03
- [maintenance] Update to Java 21 (diewald)
- [enhancement] Alter vcNamePattern to allow system VC names with less
diff --git a/pom.xml b/pom.xml
index 88e1120..79974de 100644
--- a/pom.xml
+++ b/pom.xml
@@ -35,7 +35,7 @@
<groupId>de.ids-mannheim.korap.krill</groupId>
<artifactId>Krill</artifactId>
- <version>0.64.5</version>
+ <version>0.64.6</version>
<packaging>jar</packaging>
<name>Krill</name>
@@ -163,6 +163,13 @@
<version>1.1</version>
</dependency>
+ <!-- Caffeine -->
+ <dependency>
+ <groupId>com.github.ben-manes.caffeine</groupId>
+ <artifactId>caffeine</artifactId>
+ <version>3.1.8</version>
+ </dependency>
+
<!-- Jersey -->
<dependency>
<groupId>org.glassfish.jersey.containers</groupId>
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 431ce7d..0bcd1cd 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -57,6 +57,8 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.github.benmanes.caffeine.cache.*;
+
import com.fasterxml.jackson.databind.ObjectMapper;
// Krill classes
@@ -179,6 +181,17 @@
private HashMap termContexts;
private ObjectMapper mapper = new ObjectMapper();
+ // Prelim CacheKey
+ record PrelimCacheKey(int queryHash, int collHash, short itemsPerResource) {}
+
+ // CacheKey
+ record SearchCacheKey(PrelimCacheKey prelimck, String atomicHash) {}
+
+ // CacheValue
+ record SearchCacheValue(long matchCount, long matchDocCount) {}
+
+ Cache<SearchCacheKey, SearchCacheValue> searchCache;
+
// private ByteBuffer bbTerm;
// Some initializations ...
@@ -193,8 +206,11 @@
// Check for auto commit value
String autoCommitStr = null;
+ String cacheSizeStr = null;
+ int cacheSize = (64 * 1024 * 1024); // 64 MB
if (prop != null) {
autoCommitStr = prop.getProperty("krill.index.commit.auto");
+ cacheSizeStr = prop.getProperty("krill.cache.size");
}
if (autoCommitStr != null) {
@@ -206,6 +222,20 @@
"krill.index.commit.auto expected to be a numerical value");
};
};
+
+ if (cacheSizeStr != null) {
+ try {
+ int retVal = Integer.parseInt(cacheSizeStr);
+ cacheSize = retVal;
+ } catch (NumberFormatException e) {
+ log.warn("krill.cache.size expected to be a numerical value");
+ }
+ };
+
+ searchCache = Caffeine.newBuilder()
+ .maximumWeight(cacheSize)
+ .weigher((SearchCacheKey key, SearchCacheValue value) -> 80) // estimate per-entry size
+ .build();
};
@@ -1459,10 +1489,12 @@
tthread.start();
final long timeout = meta.getTimeOut();
boolean isTimeout = false;
-
+
// See: http://www.ibm.com/developerworks/java/library/j-benchmark1/index.html
long t1 = System.nanoTime();
+ int fromCache = 0;
+
try {
// Rewrite query (for regex and wildcard queries)
// Revise!
@@ -1478,6 +1510,15 @@
if (DEBUG)
log.trace("Rewritten query is {}", query.toString());
+ int qHash = query.toString().hashCode();
+ int collHash = 0;
+ Filter collf = collection.toFilter();
+ if (collf != null) {
+ collHash = collf.toString().hashCode();
+ };
+
+ PrelimCacheKey prelim = new PrelimCacheKey(qHash, collHash, itemsPerResource);
+
// Todo: run this in a separated thread
for (LeafReaderContext atomic : this.reader().leaves()) {
@@ -1485,15 +1526,55 @@
if (isTimeout)
break;
+
+ SearchCacheKey finalCacheKey = new SearchCacheKey(prelim, atomic.reader().getCombinedCoreAndDeletesKey().toString());
+ SearchCacheValue foundCache = searchCache.getIfPresent(finalCacheKey);
+ if (foundCache != null) {
+ if (DEBUG) {
+ log.trace(
+                         "Found cache for Query: {}, Collection: {}, itemsPerResource: {}, Reader: {}",
+ qHash, collHash, itemsPerResource, atomic.reader().getCombinedCoreAndDeletesKey().toString()
+ );
+ };
+
+ if (startIndex > (i + foundCache.matchCount)) {
+ fromCache += foundCache.matchCount;
+ i += foundCache.matchCount;
+ j += foundCache.matchDocCount;
+ continue;
+ };
+ } else if (DEBUG) {
+ log.trace(
+                     "Found no cache for Query: {}, Collection: {}, itemsPerResource: {}, Reader: {}",
+ qHash, collHash, itemsPerResource, atomic.reader().getCombinedCoreAndDeletesKey().toString()
+ );
+ }
+
/*
* Todo: There may be a way to know early if the bitset is emty
* by using LongBitSet - but this may not be as fast as I think.
*/
final FixedBitSet bitset = collection.bits(atomic);
- if (bitset.nextSetBit(0) == DocIdSetIterator.NO_MORE_DOCS)
+ if (bitset.nextSetBit(0) == DocIdSetIterator.NO_MORE_DOCS) {
+ if (foundCache == null)
+ searchCache.put(
+ finalCacheKey,
+ new SearchCacheValue(0, 0)
+ );
+
+ if (DEBUG) {
+ log.trace(
+                             "Store cache (1) for Query: {}, Collection: {}, itemsPerResource: {}, Reader: {}, store:0/0",
+ qHash,
+ collHash,
+ itemsPerResource,
+ atomic.reader().getCombinedCoreAndDeletesKey().toString()
+ );
+ };
continue;
+ };
final PositionsToOffset pto = snippets ? new PositionsToOffset(atomic, field) : null;
@@ -1504,6 +1585,9 @@
final IndexReader lreader = atomic.reader();
int localDocID, docID;
+ long li = i;
+ long lj = j;
+
// TODO: Get document information from Cache! Fieldcache?
for (; i < hits; i++) {
@@ -1511,9 +1595,28 @@
log.trace("Match Nr {}/{}", i, count);
// There are no more spans to find
- if (!spans.next())
- break;
+ if (!spans.next()) {
+ if (foundCache == null)
+ foundCache = new SearchCacheValue(i - li, j - lj);
+ searchCache.put(
+ finalCacheKey,
+ foundCache
+ );
+ if (DEBUG) {
+ log.trace(
+                             "Store cache (2) for Query: {}, Collection: {}, itemsPerResource: {}, Reader: {}, store:{}/{}",
+ qHash,
+ collHash,
+ itemsPerResource,
+ atomic.reader().getCombinedCoreAndDeletesKey().toString(),
+ i - li,
+ j - lj
+ );
+ };
+ break;
+ };
+
// Increment resource counter
itemsPerResourceCounter++;
@@ -1680,6 +1783,28 @@
oldLocalDocID = localDocID;
i++;
};
+
+ if (!isTimeout && !cutoff) {
+ if (foundCache == null) {
+ searchCache.put(
+ finalCacheKey,
+                         new SearchCacheValue(i - li, j - lj)
+ );
+
+ if (DEBUG) {
+ log.trace(
+                             "Store cache (3) for Query: {}, Collection: {}, itemsPerResource: {}, Reader: {}, store:{}/{}",
+ qHash,
+ collHash,
+ itemsPerResource,
+ atomic.reader().getCombinedCoreAndDeletesKey().toString(),
+ i - li,
+ j - lj
+ );
+ };
+ };
+ };
+
atomicMatches.clear();
};
@@ -1712,6 +1837,9 @@
e.printStackTrace();
}
+ if (fromCache > 0)
+ kr.addMessage(0, "Some results were cached", String.valueOf(fromCache));
+
// Stop timer thread
tthread.stopTimer();
diff --git a/src/main/java/de/ids_mannheim/korap/query/SimpleSpanQuery.java b/src/main/java/de/ids_mannheim/korap/query/SimpleSpanQuery.java
index 956e4d2..6b35757 100644
--- a/src/main/java/de/ids_mannheim/korap/query/SimpleSpanQuery.java
+++ b/src/main/java/de/ids_mannheim/korap/query/SimpleSpanQuery.java
@@ -428,7 +428,10 @@
/** {@inheritDoc} */
@Override
public int hashCode () {
- int hc = firstClause.hashCode();
+ int hc = 0;
+ if (firstClause != null)
+ hc += firstClause.hashCode();
+
if (secondClause != null) {
hc += secondClause.hashCode();
}
diff --git a/src/test/java/de/ids_mannheim/korap/search/TestKrill.java b/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
index e03e7ac..14bb1dc 100644
--- a/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
+++ b/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
@@ -1442,7 +1442,99 @@
assertEquals(0, kr.getTotalResults());
};
-
+
+ @Test
+ public void queryJSONcachedResults () throws IOException {
+ KrillIndex ki = new KrillIndex();
+
+ // Indexing test files
+ for (String i : new String[] {
+ "00001",
+ "00002",
+ }) {
+ ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
+ true);
+ };
+
+ ki.commit();
+
+ // Indexing test files
+ for (String i : new String[] {
+ "00003",
+ }) {
+ ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
+ true);
+ };
+
+ ki.commit();
+
+ // Indexing test files
+ for (String i : new String[] {
+ "00004",
+ "00005",
+ }) {
+ ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
+ true);
+ };
+
+ ki.commit();
+
+
+ // Indexing test files
+ for (String i : new String[] {
+ "00006",
+ "02439"
+ }) {
+ ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
+ true);
+ };
+
+ ki.commit();
+
+
+ Krill k = new Krill(new QueryBuilder("tokens").seg("s:der"));
+ Result kr = k.apply(ki);
+
+ KrillMeta meta = k.getMeta();
+ assertEquals(86, kr.getTotalResults());
+
+ meta.setStartIndex(25);
+ assertNull(kr.getMessage(0));
+
+ kr = k.apply(ki);
+ assertEquals(86, kr.getTotalResults());
+ assertNull(kr.getMessage(0));
+
+ k = new Krill(new QueryBuilder("tokens").seg("s:der"));
+ meta = k.getMeta();
+ meta.setStartIndex(50);
+
+ kr = k.apply(ki);
+ assertEquals(86, kr.getTotalResults());
+         assertEquals("Some results were cached", kr.getMessage(0).getMessage());
+
+ k = new Krill(new QueryBuilder("tokens").seg("s:die"));
+ meta = k.getMeta();
+ meta.setStartIndex(50);
+
+ kr = k.apply(ki);
+ assertEquals(59, kr.getTotalResults());
+ assertNull(kr.getMessage(0));
+
+ k = new Krill(new QueryBuilder("tokens").seg("s:Buchstabe"));
+ meta = k.getMeta();
+ meta.setStartIndex(50);
+
+ kr = k.apply(ki);
+ assertEquals(12, kr.getTotalResults());
+ assertNull(kr.getMessage(0));
+
+ kr = k.apply(ki);
+ assertEquals(12, kr.getTotalResults());
+         assertEquals("Some results were cached", kr.getMessage(0).getMessage());
+ };
+
+
/**
* This is a Schreibgebrauch ressource that didn't work for