Query Deserializer in KorapSearch
diff --git a/CHANGES b/CHANGES
index 53182d2..1145a25 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,7 @@
+0.22 2013-12-04
+ - Introduced KorapSearch.
+ - JSON deserializer (started).
+
0.21 2013-11-28
- Virtual collections can now be defined,
searched, nested and extended.
diff --git a/pom.xml b/pom.xml
index 651e2b5..d5e783f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
<groupId>KorAP-modules</groupId>
<artifactId>KorAP-lucene-index</artifactId>
- <version>0.21</version>
+ <version>0.22</version>
<packaging>jar</packaging>
<name>KorAP-lucene-index</name>
diff --git a/src/main/java/de/ids_mannheim/korap/KorapCollection.java b/src/main/java/de/ids_mannheim/korap/KorapCollection.java
index 446c085..2db10b8 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapCollection.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapCollection.java
@@ -5,9 +5,11 @@
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.NumericRangeFilter;
import org.apache.lucene.search.Filter;
+
import de.ids_mannheim.korap.KorapIndex;
import de.ids_mannheim.korap.KorapResult;
import de.ids_mannheim.korap.KorapFilter;
+
import de.ids_mannheim.korap.util.KorapDate;
import de.ids_mannheim.korap.filter.BooleanFilter;
import de.ids_mannheim.korap.filter.FilterOperation;
@@ -17,10 +19,6 @@
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.Bits;
-/*
-import org.apache.lucene.util.Bits.MatchAllBits;
-import org.apache.lucene.util.Bits.MatchNoBits;
-*/
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.DocIdSet;
@@ -36,8 +34,8 @@
public class KorapCollection {
private KorapIndex index;
- private String id;
private KorapDate created;
+ private String id;
private ArrayList<FilterOperation> filter;
private int filterCount = 0;
@@ -50,11 +48,19 @@
this.filter = new ArrayList<FilterOperation>(5);
};
+ public KorapCollection () { // no index attached yet; see setIndex()
+ this.filter = new ArrayList<FilterOperation>(5);
+ };
+
public int getCount() {
return this.filterCount;
};
- public void filter (BooleanFilter filter) {
+ public void setIndex (KorapIndex ki) { // attach the index that search() and numberOf() delegate to
+ this.index = ki;
+ };
+
+ public KorapCollection filter (BooleanFilter filter) {
this.filter.add(
new FilterOperation(
(Filter) new QueryWrapperFilter(filter.toQuery()),
@@ -62,9 +68,10 @@
)
);
this.filterCount++;
+ return this;
};
- public void extend (BooleanFilter filter) {
+ public KorapCollection extend (BooleanFilter filter) {
this.filter.add(
new FilterOperation(
(Filter) new QueryWrapperFilter(filter.toQuery()),
@@ -72,14 +79,14 @@
)
);
this.filterCount++;
+ return this;
};
public ArrayList<FilterOperation> getFilters () {
return this.filter;
};
- // Todo: Create new KorapSearch Object!
-
+ // Deprecated, but still used in test cases: searches with fixed paging and context values
public KorapResult search (SpanQuery query) {
return this.index.search(this, query, 0, (short) 20, true, (short) 5, true, (short) 5);
};
@@ -158,10 +165,16 @@
};
public long numberOf (String foundry, String type) throws IOException {
+ if (this.index == null) // no index attached: report zero instead of failing
+ return (long) 0;
+
return this.index.numberOf(this, foundry, type);
};
public long numberOf (String type) throws IOException {
+ if (this.index == null) // no index attached: report zero instead of failing
+ return (long) 0;
+
return this.index.numberOf(this, "tokens", type);
};
diff --git a/src/main/java/de/ids_mannheim/korap/KorapIndex.java b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
index 7492c7d..06d3edf 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
@@ -50,6 +50,7 @@
import de.ids_mannheim.korap.KorapResult;
import de.ids_mannheim.korap.KorapMatch;
import de.ids_mannheim.korap.KorapCollection;
+import de.ids_mannheim.korap.KorapSearch;
import de.ids_mannheim.korap.index.PositionsToOffset;
import de.ids_mannheim.korap.document.KorapPrimaryData;
@@ -471,6 +472,23 @@
return this.search(new KorapCollection(this), query, startIndex, count, leftTokenContext, leftContext, rightTokenContext, rightContext);
};
+    public KorapResult search (KorapCollection kc, KorapSearch ks) {
+        // Unpack the settings of the search object and delegate
+        // to the search method taking the individual parameters
+        final KorapSearch.KorapSearchContext left = ks.leftContext;
+        final KorapSearch.KorapSearchContext right = ks.rightContext;
+        return this.search(
+            kc, ks.getQuery(), ks.getStartIndex(), ks.getCount(),
+            left.isToken(), left.getLength(),
+            right.isToken(), right.getLength()
+        );
+    };
+
+ public KorapResult search (KorapSearch ks) {
+ return this.search(new KorapCollection(this), ks); // fresh collection bound to this index, no filters added here
+ };
+
+
// old: Bits bitset
public KorapResult search (KorapCollection collection,
@@ -506,6 +524,8 @@
long t1 = 0;
long t2 = 0;
+ int hits = kr.itemsPerPage() + startIndex;
+
ArrayList<KorapMatch> atomicMatches = new ArrayList<KorapMatch>(kr.itemsPerPage());
for (AtomicReaderContext atomic : this.reader().leaves()) {
@@ -527,7 +547,7 @@
// See: http://www.ibm.com/developerworks/java/library/j-benchmark1/index.html
t1 = System.nanoTime();
- for (; i < kr.itemsPerPage(); i++) {
+ for (; i < hits; i++) {
log.trace("Match Nr {}/{}", i, count);
diff --git a/src/main/java/de/ids_mannheim/korap/KorapQuery.java b/src/main/java/de/ids_mannheim/korap/KorapQuery.java
index 82b67bd..905199b 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapQuery.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapQuery.java
@@ -5,6 +5,8 @@
import de.ids_mannheim.korap.query.wrap.*;
import org.apache.lucene.util.automaton.RegExp;
+import com.fasterxml.jackson.databind.JsonNode;
+
import java.util.*;
import org.slf4j.Logger;
@@ -30,6 +32,48 @@
this.field = field;
};
+ public SpanQueryWrapperInterface fromJSON (String json) {
+ // Todo: parse the JSON string; for now the argument is ignored and a fixed placeholder segment is returned
+ return this.seg("s:test");
+ };
+
+ // http://fasterxml.github.io/jackson-databind/javadoc/2.2.0/com/fasterxml/jackson/databind/JsonNode.html
+    public SpanQueryWrapperInterface fromJSON (JsonNode json) {
+        final String nodeType = json.get("@type").asText();
+
+        // Groups: only the "or" relation (alternation) is deserialized
+        if ("korap:group".equals(nodeType)) {
+            if (!"or".equals(json.get("relation").asText())) {
+                System.err.println("Unknown element");
+                return this.seg("s:test");
+            };
+
+            SpanAlterQueryWrapper alternation = new SpanAlterQueryWrapper(this.field);
+            for (JsonNode operand : json.get("operands")) {
+                alternation.or(this.fromJSON(operand));
+            };
+            return alternation;
+        };
+
+        // Everything else but tokens ends up as the placeholder segment
+        if (!"korap:token".equals(nodeType)) {
+            return this.seg("s:test");
+        };
+
+        // Tokens: a term with "=" relation is added to a segment query
+        SpanSegmentQueryWrapper segment = new SpanSegmentQueryWrapper(this.field);
+        JsonNode term = json.get("@value");
+        String termType = term.get("@type").asText();
+        if (!"korap:term".equals(termType)) {
+            System.err.println("Unknown type");
+        }
+        else if ("=".equals(term.get("relation").asText())) {
+            segment.with(term.get("@value").asText());
+        };
+
+        return segment;
+    };
+
// SpanSegmentRegexQuery
/**
diff --git a/src/main/java/de/ids_mannheim/korap/KorapSearch.java b/src/main/java/de/ids_mannheim/korap/KorapSearch.java
new file mode 100644
index 0000000..d5e0c51
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/KorapSearch.java
@@ -0,0 +1,214 @@
+package de.ids_mannheim.korap;
+
+import java.io.*;
+
+import org.apache.lucene.search.spans.SpanQuery;
+import de.ids_mannheim.korap.query.wrap.SpanQueryWrapperInterface;
+import de.ids_mannheim.korap.KorapCollection;
+import de.ids_mannheim.korap.KorapIndex;
+import de.ids_mannheim.korap.KorapResult;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.JsonNode;
+
+// Todo: Use configuration file
+
+/*
+KorapResult = new KorapSearch(String json).run(KorapIndex ki);
+startPage!!!
+*/
+
+/**
+ * Bundles the settings of a single search request: the span query,
+ * paging (start index and number of hits), the left and right match
+ * context and the collection the search is restricted to.
+ */
+public class KorapSearch {
+    private int startIndex;
+    private short count = 25;
+    private short countMax = 50;
+    // private int limit = -1;
+    private SpanQuery query;
+    public KorapSearchContext leftContext, rightContext;
+    private KorapCollection collection;
+    private KorapIndex index;
+
+    {
+        leftContext = new KorapSearchContext();
+        rightContext = new KorapSearchContext();
+    };
+
+    /**
+     * Context on one side of a match: its unit (token or character)
+     * and its length, which is capped at a maximum.
+     */
+    public class KorapSearchContext {
+        // true: length counts tokens, false: length counts characters
+        private boolean type = true;
+        private short length = 6;
+        private short maxLength = 12;
+
+        public boolean isToken () {
+            return this.type;
+        };
+
+        public boolean isCharacter () {
+            return !(this.type);
+        };
+
+        public KorapSearchContext setToken (boolean value) {
+            this.type = value;
+            return this;
+        };
+
+        public KorapSearchContext setCharacter (boolean value) {
+            this.type = !(value);
+            return this;
+        };
+
+        public short getLength() {
+            return this.length;
+        };
+
+        /**
+         * Set the context length. Negative values are ignored,
+         * values above the maximum are reduced to the maximum.
+         */
+        public KorapSearchContext setLength (short value) {
+            if (value >= 0) {
+                if (value <= maxLength) {
+                    this.length = value;
+                }
+                else {
+                    this.length = this.maxLength;
+                }
+            };
+            return this;
+        };
+
+        /**
+         * Same as {@link #setLength(short)}. The value is saturated to
+         * the short range first, so large ints cannot wrap around.
+         */
+        public KorapSearchContext setLength (int value) {
+            return this.setLength(toShort(value));
+        };
+    };
+
+    /**
+     * Narrow an int to short by saturating at the bounds of the
+     * short range. A plain cast would wrap around, e.g. turn 65537
+     * into 1 or 40000 into a negative number.
+     */
+    private static short toShort (int value) {
+        if (value > Short.MAX_VALUE)
+            return Short.MAX_VALUE;
+        if (value < Short.MIN_VALUE)
+            return Short.MIN_VALUE;
+        return (short) value;
+    };
+
+    /**
+     * Create a search from a JSON request. Only the "query" member
+     * of the root object is read; it is deserialized for the
+     * "tokens" field.
+     *
+     * @param json The serialized request.
+     * @throws IllegalArgumentException if the string can't be read as JSON.
+     */
+    public KorapSearch (String json) {
+        ObjectMapper mapper = new ObjectMapper();
+        try {
+            JsonNode rootNode = mapper.readValue(json, JsonNode.class);
+
+            this.query = new KorapQuery("tokens").fromJSON(rootNode.get("query")).toQuery();
+        }
+        catch (IOException e) {
+            // Fail early instead of silently leaving the query unset
+            throw new IllegalArgumentException("Unable to parse JSON search request", e);
+        };
+    };
+
+
+    // Maybe accept queryWrapperStuff
+    public KorapSearch (SpanQueryWrapperInterface sqwi) {
+        this.query = sqwi.toQuery();
+    };
+
+    public KorapSearch (SpanQuery sq) {
+        this.query = sq;
+    };
+
+    public SpanQuery getQuery () {
+        return this.query;
+    };
+
+    public int getStartIndex () {
+        return this.startIndex;
+    };
+
+    /**
+     * Set the index of the first hit. Negative values are reset to 0.
+     */
+    public KorapSearch setStartIndex (int value) {
+        this.startIndex = (value >= 0) ? value : 0;
+        return this;
+    };
+
+    public short getCount () {
+        return this.count;
+    };
+
+    public short getCountMax () {
+        return this.countMax;
+    };
+
+    /**
+     * Same as {@link #setCount(short)}. The value is saturated to
+     * the short range first, so large ints cannot wrap around.
+     */
+    public KorapSearch setCount (int value) {
+        return this.setCount(toShort(value));
+    };
+
+    /**
+     * Set the number of hits to return. Values below 1 are ignored,
+     * values above the maximum are reduced to the maximum.
+     */
+    public KorapSearch setCount (short value) {
+        if (value > 0) {
+            if (value <= this.countMax) {
+                this.count = value;
+            }
+            else {
+                this.count = this.countMax;
+            };
+        };
+        return this;
+    };
+
+    public KorapSearch setCollection (KorapCollection kc) {
+        this.collection = kc;
+        return this;
+    };
+
+    /**
+     * Get the collection to search in. A new collection without
+     * an index is created on first access if none was set.
+     */
+    public KorapCollection getCollection () {
+        if (this.collection == null)
+            this.collection = new KorapCollection();
+
+        return this.collection;
+    };
+
+    /**
+     * Attach the index to the collection and run the search on it.
+     */
+    public KorapResult run (KorapIndex ki) {
+        KorapCollection kc = this.getCollection();
+        kc.setIndex(ki);
+        return ki.search(kc, this);
+    };
+};
\ No newline at end of file
diff --git a/src/main/resources/korap.conf b/src/main/resources/korap.conf
index debbd62..c28b0c3 100644
--- a/src/main/resources/korap.conf
+++ b/src/main/resources/korap.conf
@@ -1,5 +1,15 @@
# Lucene Backend properties
lucene.properties = true
lucene.index = /home/ndiewald/Repositories/korap/KorAP-modules/KorAP-lucene-index/sandbox/index
-lucene.index.commit.count = 10000
-lucene.index.commit.log = log/korap.commit.log
\ No newline at end of file
+lucene.index.commit.count = 134217000
+lucene.index.commit.log = log/korap.commit.log
+
+# Not active at the moment:
+lucene.index.search.count.default = 25
+lucene.index.search.count.max = 100
+lucene.index.search.context.left.type = token
+lucene.index.search.context.left.default = 6
+lucene.index.search.context.left.max = 12
+lucene.index.search.context.right.type = token
+lucene.index.search.context.right.default = 6
+lucene.index.search.context.right.max = 12
diff --git a/src/test/java/de/ids_mannheim/korap/search/TestKorapSearch.java b/src/test/java/de/ids_mannheim/korap/search/TestKorapSearch.java
new file mode 100644
index 0000000..705bf33
--- /dev/null
+++ b/src/test/java/de/ids_mannheim/korap/search/TestKorapSearch.java
@@ -0,0 +1,141 @@
+import java.util.*;
+import java.io.*;
+
+import de.ids_mannheim.korap.KorapSearch;
+import de.ids_mannheim.korap.KorapQuery;
+import de.ids_mannheim.korap.KorapIndex;
+import de.ids_mannheim.korap.KorapFilter;
+import de.ids_mannheim.korap.KorapResult;
+import java.nio.file.Files;
+import java.nio.file.FileSystem;
+import java.nio.file.Path;
+import java.nio.charset.StandardCharsets;
+import java.nio.ByteBuffer;
+
+import static org.junit.Assert.*;
+import org.junit.Test;
+import org.junit.Ignore;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/**
+ * Tests for the paging, query and collection settings of KorapSearch.
+ */
+@RunWith(JUnit4.class)
+public class TestKorapSearch {
+    @Test
+    public void searchCount () {
+        KorapSearch ks = new KorapSearch(
+            new KorapQuery("field1").seg("a").with("b")
+        );
+        // Count:
+        ks.setCount(30);
+        assertEquals(30, ks.getCount());
+        ks.setCount(20);
+        assertEquals(20, ks.getCount());
+        // Values below 1 leave the count untouched
+        ks.setCount(-50);
+        assertEquals(20, ks.getCount());
+        // Values above the maximum are capped
+        ks.setCount(500);
+        assertEquals(ks.getCountMax(), ks.getCount());
+    };
+
+    @Test
+    public void searchStartIndex () {
+        KorapSearch ks = new KorapSearch(
+            new KorapQuery("field1").seg("a").with("b")
+        );
+        // startIndex
+        ks.setStartIndex(5);
+        assertEquals(5, ks.getStartIndex());
+        ks.setStartIndex(1);
+        assertEquals(1, ks.getStartIndex());
+        ks.setStartIndex(0);
+        assertEquals(0, ks.getStartIndex());
+        ks.setStartIndex(70);
+        assertEquals(70, ks.getStartIndex());
+        // Negative values are reset to 0
+        ks.setStartIndex(-5);
+        assertEquals(0, ks.getStartIndex());
+    };
+
+    @Test
+    public void searchQuery () {
+        KorapSearch ks = new KorapSearch(
+            new KorapQuery("field1").seg("a").with("b")
+        );
+        // query
+        assertEquals("spanNear([field1:a, field1:b], -1, false)", ks.getQuery().toString());
+    };
+
+    @Test
+    public void searchIndex () throws IOException {
+
+        // Construct index
+        KorapIndex ki = new KorapIndex();
+        // Indexing test files
+        for (String i : new String[] {"00001", "00002", "00003", "00004", "00005", "00006", "02439"}) {
+            ki.addDocFile(
+                getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
+            );
+        };
+        ki.commit();
+
+        KorapSearch ks = new KorapSearch(
+            new KorapQuery("tokens").seg("s:Buchstaben")
+        );
+        ks.getCollection().filter(
+            new KorapFilter().and("textClass", "reisen")
+        );
+        ks.setCount(3);
+        ks.setStartIndex(5);
+        ks.leftContext.setLength(1);
+        ks.rightContext.setLength(1);
+        KorapResult kr = ks.run(ki);
+        assertEquals(6, kr.totalResults());
+        assertEquals("... dem [Buchstaben] A ...", kr.getMatch(0).getSnippetBrackets());
+    };
+
+    @Test
+    public void searchJSON () throws IOException {
+
+        String jsonFile = getClass().getResource("/queries/bsp1.json").getFile();
+
+        KorapSearch ks = new KorapSearch(getString(jsonFile));
+
+        // assertEquals(ks.getQuery().toString(), "");
+    };
+
+    /**
+     * Read a text file (UTF-8) into a string. Line breaks are dropped.
+     * The calling test fails if the file can't be read.
+     */
+    public static String getString (String path) {
+        StringBuilder contentBuilder = new StringBuilder();
+        BufferedReader in = null;
+        try {
+            in = new BufferedReader(
+                new InputStreamReader(new FileInputStream(path), StandardCharsets.UTF_8)
+            );
+            String str;
+            while ((str = in.readLine()) != null) {
+                contentBuilder.append(str);
+            };
+        } catch (IOException e) {
+            // An unreadable fixture must not pass as an empty string
+            fail("Unable to read " + path + ": " + e.getMessage());
+        } finally {
+            // Close the reader on every path, also after a failed read
+            if (in != null) {
+                try {
+                    in.close();
+                } catch (IOException ignored) {
+                    // Nothing sensible left to do for a test fixture
+                }
+            }
+        }
+        return contentBuilder.toString();
+    };
+
+};