New feature: itemsPerResource
diff --git a/src/main/java/de/ids_mannheim/korap/KorapIndex.java b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
index d2df845..ed3e88d 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
@@ -967,6 +967,8 @@
int hits = kr.itemsPerPage() + startIndex;
int limit = ks.getLimit();
boolean cutoff = ks.doCutOff();
+ short itemsPerResource = ks.getItemsPerResource();
+
if (limit > 0) {
if (hits > limit)
@@ -978,6 +980,8 @@
ArrayList<KorapMatch> atomicMatches = new ArrayList<KorapMatch>(kr.itemsPerPage());
+ int itemsPerResourceCounter = 0;
+
try {
// Rewrite query (for regex and wildcard queries)
@@ -989,6 +993,8 @@
for (AtomicReaderContext atomic : this.reader().leaves()) {
+ int oldLocalDocID = -1;
+
// Use OpenBitSet;
Bits bitset = collection.bits(atomic);
@@ -1012,12 +1018,36 @@
// There are no more spans to find
if (spans.next() != true)
break;
-
+
+ int localDocID = spans.doc();
+
+ // Count hits per resource (itemsPerResourceCounter = matches seen so far in the current document)
+ if (itemsPerResource > 0) {
+ // A fresh segment starts with oldLocalDocID == -1, which never equals a real doc id
+ // Same document as the previous match
+ if (localDocID == oldLocalDocID) {
+ if (itemsPerResourceCounter++ >= itemsPerResource) {
+ if (spans.skipTo(localDocID + 1) != true) {
+ break;
+ }
+ else {
+ itemsPerResourceCounter = 1;
+ localDocID = spans.doc();
+ };
+ };
+ }
+
+ // First match of a document: restart at 1, as this match already counts
+ else
+ itemsPerResourceCounter = 1;
+
+ oldLocalDocID = localDocID;
+ };
+
// The next matches are not yet part of the result
if (startIndex > i)
continue;
- int localDocID = spans.doc();
int docID = atomic.docBase + localDocID;
// Document doc = lreader.document(docID, fieldsToLoadLocal);
@@ -1112,7 +1142,7 @@
match.internalDocID = docID;
match.populateDocument(doc, field, fieldsToLoadLocal);
-
+
if (DEBUG)
log.trace("I've got a match in {} of {}",
match.getDocID(), count);
@@ -1130,6 +1160,30 @@
while (!cutoff && spans.next()) {
if (limit > 0 && i >= limit)
break;
+
+ // Count hits per resource
+ if (itemsPerResource > 0) {
+ int localDocID = spans.doc();
+
+ // Same document as the previous match (oldLocalDocID == -1 never equals a real doc id)
+ if (localDocID == oldLocalDocID) {
+ if (itemsPerResourceCounter++ >= itemsPerResource) {
+ if (spans.skipTo(localDocID + 1) != true) {
+ break;
+ }
+ else {
+ itemsPerResourceCounter = 1;
+ localDocID = spans.doc();
+ };
+ };
+ }
+ // First match of a document: restart at 1, as this match already counts
+ else
+ itemsPerResourceCounter = 1;
+
+ oldLocalDocID = localDocID;
+ };
+
i++;
};
atomicMatches.clear();
@@ -1141,6 +1195,9 @@
kr.setBenchmarkSearchResults(t2, t1);
};
+ if (itemsPerResource > 0)
+ kr.setItemsPerResource(itemsPerResource);
+
kr.setTotalResults(cutoff ? -1 : i);
}
catch (IOException e) {
diff --git a/src/main/java/de/ids_mannheim/korap/KorapResult.java b/src/main/java/de/ids_mannheim/korap/KorapResult.java
index 883cec7..062e9a8 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapResult.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapResult.java
@@ -31,6 +31,7 @@
private SearchContext context;
private short itemsPerPage = ITEMS_PER_PAGE;
+ private short itemsPerResource = 0;
private String benchmarkSearchResults,
benchmarkHitCounter;
@@ -182,6 +183,15 @@
return this.benchmarkHitCounter;
}
+ public void setItemsPerResource (short value) {
+ this.itemsPerResource = value;
+ };
+
+ @JsonIgnore
+ public short getItemsPerResource () {
+ return this.itemsPerResource;
+ };
+
public String getQuery() {
return this.query;
@@ -227,6 +237,9 @@
json.put("context", this.getContext().toJSON());
+ if (this.itemsPerResource > 0)
+ json.put("itemsPerResource", this.itemsPerResource);
+
if (this.version != null)
json.put("version", this.version);
diff --git a/src/main/java/de/ids_mannheim/korap/KorapSearch.java b/src/main/java/de/ids_mannheim/korap/KorapSearch.java
index 2a26c7f..56b06a6 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapSearch.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapSearch.java
@@ -1,13 +1,17 @@
package de.ids_mannheim.korap;
-import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import de.ids_mannheim.korap.index.SearchContext;
-import de.ids_mannheim.korap.query.wrap.SpanQueryWrapperInterface;
-import de.ids_mannheim.korap.util.QueryException;
-import org.apache.lucene.search.spans.SpanQuery;
+import java.io.*;
-import java.io.IOException;
+import org.apache.lucene.search.spans.SpanQuery;
+import de.ids_mannheim.korap.query.wrap.SpanQueryWrapperInterface;
+import de.ids_mannheim.korap.KorapCollection;
+import de.ids_mannheim.korap.KorapIndex;
+import de.ids_mannheim.korap.KorapResult;
+import de.ids_mannheim.korap.util.QueryException;
+import de.ids_mannheim.korap.index.SearchContext;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.JsonNode;
// Todo: Use configuration file
@@ -18,14 +22,15 @@
/**
* @author Nils Diewald
- * <p/>
- * KoraSearch implements an object for all search relevant parameters.
+ *
+ * KorapSearch implements an object for all search-relevant parameters.
*/
public class KorapSearch {
private int startIndex = 0, limit = 0;
private short count = 25,
- countMax = 50;
- private boolean cutoff = false;
+ countMax = 50;
+ private boolean cutOff = false;
+ private short itemsPerResource = 0;
private SpanQuery query;
private KorapCollection collection;
private KorapIndex index;
@@ -37,241 +42,231 @@
private String spanContext;
{
- context = new SearchContext();
- }
+ context = new SearchContext();
+ };
+ public KorapSearch (String jsonString) {
+ ObjectMapper mapper = new ObjectMapper();
+ try {
+ this.request = mapper.readValue(jsonString, JsonNode.class);
+
+ // "query" value
+ if (this.request.has("query")) {
+ try {
+ this.query = new KorapQuery("tokens").fromJSON(this.request.get("query")).toQuery();
+ }
+ catch (QueryException q) {
+ this.error = q.getMessage();
+ };
+ }
+ else {
+ this.error = "No query defined";
+ };
- public KorapSearch(String jsonString) {
- ObjectMapper mapper = new ObjectMapper();
- try {
- this.request = mapper.readValue(jsonString, JsonNode.class);
+ // "collections" value (virtual collections)
+ if (this.request.has("collections"))
+ this.setCollection(new KorapCollection(jsonString));
- // "query" value
- if (this.request.has("query")) {
- try {
- this.query = new KorapQuery("tokens").fromJSON(this.request.get("query")).toQuery();
- } catch (QueryException q) {
- this.error = q.getMessage();
- }
+ if (this.error == null) {
+ if (this.request.has("meta")) {
+ JsonNode meta = this.request.get("meta");
- } else {
- this.error = "No query defined";
- }
+ // Defined count
+ if (meta.has("count"))
+ this.setCount(meta.get("count").asInt());
+ // Defined startIndex
+ if (meta.has("startIndex"))
+ this.setStartIndex(meta.get("startIndex").asInt());
- // "meta" virtual collections
- if (this.request.has("collections"))
- this.setCollection(new KorapCollection(jsonString));
+ // Defined startPage
+ if (meta.has("startPage"))
+ this.setStartPage(meta.get("startPage").asInt());
- if (this.error == null) {
- if (this.request.has("meta")) {
- JsonNode meta = this.request.get("meta");
+ // Defined cutOff
+ if (meta.has("cutOff"))
+ this.setCutOff(meta.get("cutOff").asBoolean());
- // Defined count
- if (meta.has("count"))
- this.setCount(meta.get("count").asInt());
+ // Defined contexts
+ if (meta.has("context"))
+ this.context.fromJSON(meta.get("context"));
- // Defined startIndex
- if (meta.has("startIndex"))
- this.setStartIndex(meta.get("startIndex").asInt());
+ // Defined resource count
+ if (meta.has("itemsPerResource"))
+ this.setItemsPerResource(meta.get("itemsPerResource").asInt());
+ };
+ };
+ }
- // Defined startPage
- if (meta.has("startPage"))
- this.setStartPage(meta.get("startPage").asInt());
-
- // Defined cutOff
- if (meta.has("cutOff"))
- this.setCutOff(meta.get("cutOff").asBoolean());
-
- // Defined contexts
- if (meta.has("context"))
- this.context.fromJSON(meta.get("context"));
- }
-
- }
-
- }
-
- // Unable to parse JSON
- catch (IOException e) {
- this.error = e.getMessage();
- }
-
- }
+ // Unable to parse JSON
+ catch (IOException e) {
+ this.error = e.getMessage();
+ };
+ };
// Maybe accept queryWrapperStuff
- public KorapSearch(SpanQueryWrapperInterface sqwi) {
- this.query = sqwi.toQuery();
- }
+ public KorapSearch (SpanQueryWrapperInterface sqwi) {
+ this.query = sqwi.toQuery();
+ };
-
- public KorapSearch(SpanQuery sq) {
- this.query = sq;
- }
-
+ public KorapSearch (SpanQuery sq) {
+ this.query = sq;
+ };
// Empty constructor
- public KorapSearch() {
- }
+ public KorapSearch () { };
+
+ public String getError () {
+ return this.error;
+ };
+
+ public SpanQuery getQuery () {
+ return this.query;
+ };
+
+ public JsonNode getRequest () {
+ return this.request;
+ };
+
+ public KorapSearch setQuery (SpanQueryWrapperInterface sqwi) {
+ this.query = sqwi.toQuery();
+ return this;
+ };
+
+ public KorapSearch setQuery (SpanQuery sq) {
+ this.query = sq;
+ return this;
+ };
+
+ public SearchContext getContext () {
+ return this.context;
+ };
+
+ public KorapSearch setContext (SearchContext context) {
+ this.context = context;
+ return this;
+ };
+
+ public int getStartIndex () {
+ return this.startIndex;
+ };
+
+ public KorapSearch setStartIndex (int value) {
+ if (value >= 0) {
+ this.startIndex = value;
+ }
+ else {
+ this.startIndex = 0;
+ };
+
+ return this;
+ };
+
+ public KorapSearch setStartPage (int value) {
+ if (value >= 0) {
+ this.setStartIndex((value * this.getCount()) - this.getCount());
+ }
+ else {
+ this.startIndex = 0;
+ };
+
+ return this;
+ };
+
+ public short getCount () {
+ return this.count;
+ };
+
+ public short getCountMax () {
+ return this.countMax;
+ };
+
+ public int getLimit () {
+ return this.limit;
+ };
+
+ public KorapSearch setLimit (int limit) {
+ if (limit > 0)
+ this.limit = limit;
+ return this;
+ };
+
+ public boolean doCutOff () {
+ return this.cutOff;
+ };
+
+ public KorapSearch setCutOff (boolean cutOff) {
+ this.cutOff = cutOff;
+ return this;
+ };
+
+ public KorapSearch setCount (int value) {
+ // Todo: Maybe update startIndex with known startPage!
+ this.setCount((short) Math.max(0, Math.min(value, this.countMax))); // clamp first: a bare (short) cast wraps around
+ return this;
+ };
+
+ public KorapSearch setCount (short value) {
+ if (value > 0) {
+ if (value <= this.countMax)
+ this.count = value;
+ else
+ this.count = this.countMax;
+ };
+ return this;
+ };
+ // Maximum number of matches per document; negative values are ignored, 0 switches the limit off.
+ public KorapSearch setItemsPerResource (short value) {
+ if (value >= 0)
+ this.itemsPerResource = value;
+ return this;
+ };
+ // Clamp into the short range first: a bare (short) cast wraps around (65537 -> 1, 40000 -> negative).
+ public KorapSearch setItemsPerResource (int value) {
+ return this.setItemsPerResource((short) Math.max(-1, Math.min(value, Short.MAX_VALUE)));
+ };
+ // Returns the configured per-document match limit (0 = no limit).
+ public short getItemsPerResource () {
+ return this.itemsPerResource;
+ };
- public String getError() {
- return this.error;
- }
+ public KorapSearch setCollection (KorapCollection kc) {
+ this.collection = kc;
+ if (kc.getError() != null)
+ this.error = kc.getError();
+ return this;
+ };
+ public KorapCollection getCollection () {
+ if (this.collection == null)
+ this.collection = new KorapCollection();
- public SpanQuery getQuery() {
- return this.query;
- }
+ return this.collection;
+ };
+ public KorapResult run (KorapIndex ki) {
+ if (this.query == null) {
+ KorapResult kr = new KorapResult();
+ kr.setRequest(this.request);
+ if (this.error != null)
+ kr.setError(this.error);
+ else
+ kr.setError(this.getClass() + " expects a query");
+ return kr;
+ };
- public JsonNode getRequest() {
- return this.request;
- }
+ if (this.error != null) {
+ KorapResult kr = new KorapResult();
+ kr.setRequest(this.request);
+ kr.setError(this.error);
+ return kr;
+ };
-
- public KorapSearch setQuery(SpanQueryWrapperInterface sqwi) {
- this.query = sqwi.toQuery();
- return this;
- }
-
-
- public KorapSearch setQuery(SpanQuery sq) {
- this.query = sq;
- return this;
- }
-
-
- public SearchContext getContext() {
- return this.context;
- }
-
-
- public KorapSearch setContext(SearchContext context) {
- this.context = context;
- return this;
- }
-
-
- public int getStartIndex() {
- return this.startIndex;
- }
-
-
- public KorapSearch setStartIndex(int value) {
- if (value >= 0) {
- this.startIndex = value;
- } else {
- this.startIndex = 0;
- }
- return this;
- }
-
-
- public KorapSearch setStartPage(int value) {
- if (value >= 0) {
- this.setStartIndex((value * this.getCount()) - this.getCount());
- } else {
- this.startIndex = 0;
- }
- return this;
- }
-
-
- public short getCount() {
- return this.count;
- }
-
-
- public short getCountMax() {
- return this.countMax;
- }
-
-
- public int getLimit() {
- return this.limit;
- }
-
-
- public KorapSearch setLimit(int limit) {
- if (limit > 0)
- this.limit = limit;
- return this;
- }
-
-
- public boolean doCutOff() {
- return this.cutoff;
- }
-
-
- public KorapSearch setCutOff(boolean cutoff) {
- this.cutoff = cutoff;
- return this;
- }
-
-
- public KorapSearch setCount(int value) {
- // Todo: Maybe update startIndex with known startPage!
- this.setCount((short) value);
- return this;
- }
-
-
- public KorapSearch setCount(short value) {
- if (value > 0) {
- if (value <= this.countMax)
- this.count = value;
- else
- this.count = this.countMax;
- }
-
- return this;
- }
-
-
- public KorapSearch setCollection(KorapCollection kc) {
- this.collection = kc;
- if (kc.getError() != null)
- this.error = kc.getError();
- return this;
- }
-
-
- public KorapCollection getCollection() {
- if (this.collection == null)
- this.collection = new KorapCollection();
-
- return this.collection;
- }
-
-
- public KorapResult run(KorapIndex ki) {
- if (this.query == null) {
- KorapResult kr = new KorapResult();
- kr.setRequest(this.request);
- if (this.error != null)
- kr.setError(this.error);
- else
- kr.setError(this.getClass() + " expects a query");
- return kr;
- }
-
-
- if (this.error != null) {
- KorapResult kr = new KorapResult();
- kr.setRequest(this.request);
- kr.setError(this.error);
- return kr;
- }
-
-
- this.getCollection().setIndex(ki);
- KorapResult kr = ki.search(this.getCollection(), this);
- kr.setRequest(this.request);
- return kr;
- }
-
-
+ this.getCollection().setIndex(ki);
+ KorapResult kr = ki.search(this.getCollection(), this);
+ kr.setRequest(this.request);
+ return kr;
+ };
};
diff --git a/src/main/java/de/ids_mannheim/korap/index/SearchContext.java b/src/main/java/de/ids_mannheim/korap/index/SearchContext.java
index d32624c..9074512 100644
--- a/src/main/java/de/ids_mannheim/korap/index/SearchContext.java
+++ b/src/main/java/de/ids_mannheim/korap/index/SearchContext.java
@@ -133,9 +133,9 @@
};
public JsonNode toJSON () {
- if (this.isSpanDefined()) {
+
+ if (this.isSpanDefined())
return new TextNode(this.spanContext);
- };
ArrayNode leftContext = mapper.createArrayNode();
leftContext.add(this.left.isToken() ? "token" : "char");
diff --git a/src/test/java/de/ids_mannheim/korap/search/TestKorapSearch.java b/src/test/java/de/ids_mannheim/korap/search/TestKorapSearch.java
index 215c82a..023ad9a 100644
--- a/src/test/java/de/ids_mannheim/korap/search/TestKorapSearch.java
+++ b/src/test/java/de/ids_mannheim/korap/search/TestKorapSearch.java
@@ -285,10 +285,87 @@
KorapCollection kc = new KorapCollection(json);
kc.setIndex(ki);
assertEquals(7, kc.numberOf("documents"));
-
};
@Test
+ public void searchJSONitemsPerResource () throws IOException {
+ // Checks that itemsPerResource caps the number of matches returned per document.
+ // Construct index
+ KorapIndex ki = new KorapIndex();
+ // Indexing test files
+ for (String i : new String[] {"00001", "00002", "00003", "00004", "00005", "00006", "02439"}) {
+ ki.addDocFile(
+ getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
+ );
+ };
+ ki.commit();
+
+ String json = getString(getClass().getResource("/queries/bsp-itemsPerResource.jsonld").getFile());
+ // The query file sets no itemsPerResource: all matches are returned
+ KorapSearch ks = new KorapSearch(json);
+ KorapResult kr = ks.run(ki);
+ assertEquals(10, kr.getTotalResults());
+ assertEquals(0, kr.getStartIndex());
+ assertEquals(20, kr.getItemsPerPage());
+
+ assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID());
+ assertEquals("WPD_AAA.00001", kr.getMatch(1).getDocID());
+ assertEquals("WPD_AAA.00001", kr.getMatch(6).getDocID());
+ assertEquals("WPD_AAA.00002", kr.getMatch(7).getDocID());
+ assertEquals("WPD_AAA.00002", kr.getMatch(8).getDocID());
+ assertEquals("WPD_AAA.00004", kr.getMatch(9).getDocID());
+ // At most one match per document
+ ks = new KorapSearch(json);
+ ks.setItemsPerResource(1);
+
+ kr = ks.run(ki);
+
+ assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID());
+ assertEquals("WPD_AAA.00002", kr.getMatch(1).getDocID());
+ assertEquals("WPD_AAA.00004", kr.getMatch(2).getDocID());
+
+ assertEquals(3, kr.getTotalResults());
+ assertEquals(0, kr.getStartIndex());
+ assertEquals(20, kr.getItemsPerPage());
+
+ // At most two matches per document
+ ks = new KorapSearch(json);
+ ks.setItemsPerResource(2);
+
+ kr = ks.run(ki);
+
+ // System.err.println(kr.toJSON());
+
+ assertEquals("WPD_AAA.00001", kr.getMatch(0).getDocID());
+ assertEquals("WPD_AAA.00001", kr.getMatch(1).getDocID());
+ assertEquals("WPD_AAA.00002", kr.getMatch(2).getDocID());
+ assertEquals("WPD_AAA.00002", kr.getMatch(3).getDocID());
+ assertEquals("WPD_AAA.00004", kr.getMatch(4).getDocID());
+
+ assertEquals(5, kr.getTotalResults());
+ assertEquals(0, kr.getStartIndex());
+ assertEquals(20, kr.getItemsPerPage());
+
+ // One match per document, combined with paging (startIndex 1, count 1)
+ ks = new KorapSearch(json);
+ ks.setItemsPerResource(1);
+ ks.setStartIndex(1);
+ ks.setCount(1);
+
+ kr = ks.run(ki);
+
+ assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
+
+ assertEquals(3, kr.getTotalResults());
+ assertEquals(1, kr.getStartIndex());
+ assertEquals(1, kr.getItemsPerPage());
+ // The limit in effect is reported back on the result
+ assertEquals((short) 1, kr.getItemsPerResource());
+ };
+
+
+
+ @Test
public void searchJSONCollection () throws IOException {
// Construct index
diff --git a/src/test/resources/queries/bsp-itemsPerResource.jsonld b/src/test/resources/queries/bsp-itemsPerResource.jsonld
new file mode 100644
index 0000000..d4be7fe
--- /dev/null
+++ b/src/test/resources/queries/bsp-itemsPerResource.jsonld
@@ -0,0 +1,21 @@
+{
+ "@context": "http://ids-mannheim.de/ns/KorAP/json-ld/v0.1/context.jsonld",
+ "query": {
+ "@type": "korap:token",
+ "wrap": {
+ "@type": "korap:term",
+ "foundry": "mate",
+ "layer" : "lemma",
+ "key":"alphabet",
+ "match": "match:eq"
+ }
+ },
+ "meta":{
+ "startPage":1,
+ "count": 20,
+ "context":{
+ "left":["char",90],
+ "right":["char",90]
+ }
+ }
+}