Added token list retrieval
diff --git a/CHANGES b/CHANGES
index 8b95c61..8dac796 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,4 +1,7 @@
-0.47 2014-11-01
+0.48 2014-11-06
+ - [feature] Retrieval of token lists (diewald)
+
+0.47 2014-11-05
- [feature] Support new index format with more metadata (diewald)
- [bugfix] #142 Fixed class spans for skipTo() skips (margaretha)
- [feature] Added attribute support to relations (margaretha)
diff --git a/pom.xml b/pom.xml
index 721e4ea..14293f9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -24,7 +24,7 @@
<groupId>KorAP-modules</groupId>
<artifactId>KorAP-lucene-index</artifactId>
- <version>0.47</version>
+ <version>0.48</version>
<packaging>jar</packaging>
<name>KorAP-lucene-index</name>
diff --git a/src/main/java/de/ids_mannheim/korap/KorapIndex.java b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
index 6d23202..a649c35 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
@@ -71,7 +71,7 @@
* KorapIndex implements a simple API for searching in and writing to a
* Lucene index and equesting several information but the index's nature.
*
- * @author ndiewald
+ * @author Nils Diewald
*/
public class KorapIndex {
@@ -146,6 +146,7 @@
fieldsToLoad = new HashSet<String>(16);
fieldsToLoad.add("author");
fieldsToLoad.add("ID");
+ fieldsToLoad.add("textSigle");
fieldsToLoad.add("UID");
fieldsToLoad.add("title");
fieldsToLoad.add("subTitle");
@@ -1126,7 +1127,7 @@
int oldLocalDocID = -1;
/*
- * Todo: There may be a way to now early if the bitset is emty
+ * Todo: There may be a way to know early if the bitset is empty
* by using OpenBitSet - but this may not be as fast as I think.
*/
Bits bitset = collection.bits(atomic);
diff --git a/src/main/java/de/ids_mannheim/korap/KorapMatch.java b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
index c09ff16..5291821 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapMatch.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
@@ -116,11 +116,14 @@
* @see #snippetBrackets()
* @see PositionsToOffset
*/
- public KorapMatch (PositionsToOffset pto, int localDocID, int startPos, int endPos) {
+ public KorapMatch (PositionsToOffset pto,
+ int localDocID,
+ int startPos,
+ int endPos) {
this.positionsToOffset = pto;
- this.localDocID = localDocID;
- this.startPos = startPos;
- this.endPos = endPos;
+ this.localDocID = localDocID;
+ this.startPos = startPos;
+ this.endPos = endPos;
};
@@ -386,6 +389,11 @@
this.setTokenization(doc.get("tokenization"));
if (fields.contains("layerInfo"))
this.setLayerInfo(doc.get("layerInfo"));
+
+ // New fields
+ if (fields.contains("textSigle"))
+ this.setTextSigle(doc.get("textSigle"));
+
};
@@ -1357,7 +1365,7 @@
// Identical to KorapResult!
public String toJSON () {
- ObjectNode json = (ObjectNode) mapper.valueToTree(this);
+ ObjectNode json = (ObjectNode) mapper.valueToTree(this);
// Match was no match
if (json.size() == 0)
@@ -1379,6 +1387,36 @@
return "{}";
};
+ /**
+ * Serialize this match as a token list.
+ *
+ * The returned object carries an optional "textSigle" string and a
+ * "tokens" array. "textSigle" is filled from getDocID() when that is
+ * non-null, otherwise from getTextSigle() when that is non-null, and
+ * is omitted when both are null. "tokens" holds one inner array per
+ * position in the range getStartPos() (inclusive) to getEndPos()
+ * (exclusive); each inner array contains the ints returned by
+ * PositionsToOffset.span() for that position.
+ *
+ * NOTE(review): presumably the ints are the start and end character
+ * offsets of the token - confirm against PositionsToOffset.
+ * NOTE(review): assumes this.positionsToOffset is non-null and that
+ * span() never returns null for a registered position - confirm.
+ *
+ * @return the match as a JSON object node
+ */
+ public ObjectNode toTokenList () {
+ ObjectNode json = mapper.createObjectNode();
+
+ // The document ID takes precedence over the text sigle
+ if (this.getDocID() != null)
+ json.put("textSigle", this.getDocID());
+ else if (this.getTextSigle() != null)
+ json.put("textSigle", this.getTextSigle());
+
+ ArrayNode tokens = json.putArray("tokens");
+
+ // Get pto object
+ PositionsToOffset pto = this.positionsToOffset;
+
+ // First pass: register every position of the match with the
+ // pto object before any span is requested
+ for (int i = this.getStartPos(); i < this.getEndPos(); i++) {
+ pto.add(this.localDocID, i);
+ };
+
+ // Second pass: retrieve the span of each registered position
+ // and append it as its own array
+ for (int i = this.getStartPos(); i < this.getEndPos(); i++) {
+ ArrayNode token = tokens.addArray();
+ for (int offset : pto.span(this.localDocID, i)) {
+ token.add(offset);
+ };
+ };
+
+ return json;
+ };
+
// Remove duplicate identifiers
// Yeah ... I mean ... why not?
diff --git a/src/main/java/de/ids_mannheim/korap/KorapResult.java b/src/main/java/de/ids_mannheim/korap/KorapResult.java
index 6ec7991..5352028 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapResult.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapResult.java
@@ -6,6 +6,7 @@
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.databind.node.ObjectNode;
+import com.fasterxml.jackson.databind.node.ArrayNode;
import de.ids_mannheim.korap.index.PositionsToOffset;
import de.ids_mannheim.korap.index.SearchContext;
@@ -77,7 +78,10 @@
}
- public KorapMatch addMatch(PositionsToOffset pto, int localDocID, int startPos, int endPos) {
+ public KorapMatch addMatch (PositionsToOffset pto,
+ int localDocID,
+ int startPos,
+ int endPos) {
KorapMatch km = new KorapMatch(pto, localDocID, startPos, endPos);
// Temporary - should use the same interface like results
@@ -102,7 +106,7 @@
this.add(km);
return km;
- }
+ };
@Deprecated
public int totalResults() {
@@ -167,7 +171,7 @@
public String getBenchmarkHitCounter() {
return this.benchmarkHitCounter;
- }
+ };
@JsonIgnore
public void setItemsPerResource (short value) {
@@ -184,30 +188,28 @@
return this.itemsPerResource;
};
- public String getQuery() {
+ public String getQuery () {
return this.query;
- }
+ };
@JsonIgnore
- public KorapMatch getMatch(int index) {
+ public KorapMatch getMatch (int index) {
return this.matches.get(index);
- }
+ };
- // @JsonIgnore
+ @JsonIgnore
public List<KorapMatch> getMatches() {
return this.matches;
- }
-
+ };
@Deprecated
- public KorapMatch match(int index) {
+ public KorapMatch match (int index) {
return this.matches.get(index);
- }
+ };
-
- public int getStartIndex() {
+ public int getStartIndex () {
return startIndex;
- }
+ };
@JsonIgnore
@@ -224,7 +226,7 @@
// Identical to KorapMatch!
- public String toJSON() {
+ public String toJSON () {
ObjectNode json = (ObjectNode) mapper.valueToTree(this);
if (this.context != null)
@@ -236,13 +238,42 @@
if (this.getVersion() != null)
json.put("version", this.getVersion());
+ // Add matches
+ json.putPOJO("matches", this.getMatches());
+
try {
return mapper.writeValueAsString(json);
- } catch (Exception e) {
- log.warn(e.getLocalizedMessage());
}
-
+ catch (Exception e) {
+ log.warn(e.getLocalizedMessage());
+ };
return "{}";
};
+
+
+ /**
+ * Serialize this result for the Collocation Analysis API.
+ *
+ * The result is converted to a JSON tree, the version is added when
+ * it is set, and "matches" becomes an array holding the token list
+ * of every match. If writing the tree to a string fails, the error
+ * is logged as a warning and "{}" is returned instead.
+ *
+ * @return the JSON string, or "{}" on serialization failure
+ */
+ public String toTokenListJSON () {
+ ObjectNode root = (ObjectNode) mapper.valueToTree(this);
+
+ if (this.getVersion() != null)
+ root.put("version", this.getVersion());
+
+ // Every match is represented by its token list
+ ArrayNode matchList = root.putArray("matches");
+ for (KorapMatch match : this.getMatches())
+ matchList.add(match.toTokenList());
+
+ try {
+ return mapper.writeValueAsString(root);
+ }
+ catch (Exception e) {
+ log.warn(e.getLocalizedMessage());
+ return "{}";
+ }
+ };
+
};
diff --git a/src/main/java/de/ids_mannheim/korap/KorapSearch.java b/src/main/java/de/ids_mannheim/korap/KorapSearch.java
index 7481d3e..059bb97 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapSearch.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapSearch.java
@@ -24,7 +24,7 @@
/**
* @author Nils Diewald
*
- * KoraSearch implements an object for all search relevant parameters.
+ * KorapSearch implements an object for all search relevant parameters.
*/
public class KorapSearch {
private int startIndex = 0, limit = 0;
diff --git a/src/test/java/de/ids_mannheim/korap/search/TestKorapResult.java b/src/test/java/de/ids_mannheim/korap/search/TestKorapResult.java
index a64a3e4..983d3eb 100644
--- a/src/test/java/de/ids_mannheim/korap/search/TestKorapResult.java
+++ b/src/test/java/de/ids_mannheim/korap/search/TestKorapResult.java
@@ -163,6 +163,63 @@
};
+ /**
+ * Checks the token list serialization of a search result against a
+ * small in-memory index of two documents ("abab" and "aba").
+ *
+ * The query is s:a directly followed by s:b, which is expected to
+ * yield three matches: two in doc-1 and one in doc-2. For every
+ * match the test asserts the "textSigle" and the two ints of each
+ * of its two tokens.
+ */
+ @Test
+ public void checkJSONTokenResult () throws Exception {
+ KorapIndex ki = new KorapIndex();
+
+ // First document: four tokens a, b, a, b.
+ // NOTE(review): the "_N#s-e" entries appear to carry the offsets
+ // asserted below - confirm against the term vector format.
+ FieldDocument fd = new FieldDocument();
+ fd.addString("ID", "doc-1");
+ fd.addString("UID", "1");
+ fd.addTV("base",
+ "abab",
+ "[(0-1)s:a|i:a|_0#0-1|-:t$<i>4]" +
+ "[(1-2)s:b|i:b|_1#1-2]" +
+ "[(2-3)s:a|i:c|_2#2-3]" +
+ "[(3-4)s:b|i:a|_3#3-4]");
+ ki.addDoc(fd);
+
+ // Second document: three tokens a, b, a
+ fd = new FieldDocument();
+ fd.addString("ID", "doc-2");
+ fd.addString("UID", "2");
+ fd.addTV("base",
+ "aba",
+ "[(0-1)s:a|i:a|_0#0-1|-:t$<i>3]" +
+ "[(1-2)s:b|i:b|_1#1-2]" +
+ "[(2-3)s:a|i:c|_2#2-3]");
+ ki.addDoc(fd);
+
+ // Commit!
+ ki.commit();
+
+ // Search for s:a followed by s:b
+ KorapQuery kq = new KorapQuery("base");
+ SpanQuery q = (SpanQuery) kq.seq(kq.seg("s:a")).append(kq.seg("s:b")).toQuery();
+ KorapResult kr = ki.search(q);
+
+ // Parse the token list JSON and check the result metadata
+ assertEquals(3, kr.getTotalResults());
+ ObjectMapper mapper = new ObjectMapper();
+ JsonNode res = mapper.readTree(kr.toTokenListJSON());
+ assertEquals(3, res.at("/totalResults").asInt());
+ assertEquals("spanNext(base:s:a, base:s:b)", res.at("/query").asText());
+ assertEquals(0, res.at("/startIndex").asInt());
+ assertEquals(25, res.at("/itemsPerPage").asInt());
+
+ // First match: tokens 0 and 1 of doc-1
+ assertEquals("doc-1", res.at("/matches/0/textSigle").asText());
+ assertEquals(0, res.at("/matches/0/tokens/0/0").asInt());
+ assertEquals(1, res.at("/matches/0/tokens/0/1").asInt());
+ assertEquals(1, res.at("/matches/0/tokens/1/0").asInt());
+ assertEquals(2, res.at("/matches/0/tokens/1/1").asInt());
+
+ // Second match: tokens 2 and 3 of doc-1
+ assertEquals("doc-1", res.at("/matches/1/textSigle").asText());
+ assertEquals(2, res.at("/matches/1/tokens/0/0").asInt());
+ assertEquals(3, res.at("/matches/1/tokens/0/1").asInt());
+ assertEquals(3, res.at("/matches/1/tokens/1/0").asInt());
+ assertEquals(4, res.at("/matches/1/tokens/1/1").asInt());
+
+ // Third match: tokens 0 and 1 of doc-2
+ assertEquals("doc-2", res.at("/matches/2/textSigle").asText());
+ assertEquals(0, res.at("/matches/2/tokens/0/0").asInt());
+ assertEquals(1, res.at("/matches/2/tokens/0/1").asInt());
+ assertEquals(1, res.at("/matches/2/tokens/1/0").asInt());
+ assertEquals(2, res.at("/matches/2/tokens/1/1").asInt());
+ };
+
public static String getString (String path) {
StringBuilder contentBuilder = new StringBuilder();
try {
diff --git a/src/test/java/de/ids_mannheim/korap/search/TestKorapSearch.java b/src/test/java/de/ids_mannheim/korap/search/TestKorapSearch.java
index 59ac327..c28a74f 100644
--- a/src/test/java/de/ids_mannheim/korap/search/TestKorapSearch.java
+++ b/src/test/java/de/ids_mannheim/korap/search/TestKorapSearch.java
@@ -19,6 +19,9 @@
import java.nio.charset.StandardCharsets;
import java.nio.ByteBuffer;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.JsonNode;
+
import static org.junit.Assert.*;
import org.junit.Test;
import org.junit.Ignore;
@@ -670,8 +673,6 @@
assertEquals(0, kr.getStartIndex());
};
-
-
@Test
public void searchJSONmultipleClassesBug () throws IOException {
// Construct index
@@ -707,7 +708,44 @@
assertEquals(0, kr.getStartIndex());
};
+ /**
+ * Runs the "multiple classes" bug query against two indexed test
+ * documents and checks the token list serialization of the result.
+ *
+ * Exactly one match is expected, in text "BZK_D59.00089", made up
+ * of three tokens whose asserted int pairs are 328-331, 332-337 and
+ * 338-345; the pair widths (3, 5, 7) equal the lengths of the query
+ * words "ins", "Leben" and "gerufen".
+ */
+ @Test
+ public void searchJSONmultipleClassesBugTokenList () throws IOException {
+ // Construct index
+ KorapIndex ki = new KorapIndex();
+ // Indexing test files
+ // NOTE(review): the first argument is presumably the UID assigned
+ // to the document - confirm against KorapIndex.addDocFile
+ ki.addDocFile(
+ 1,getClass().getResource("/goe/AGA-03828.json.gz").getFile(), true
+ );
+ ki.addDocFile(
+ 2,getClass().getResource("/bzk/D59-00089.json.gz").getFile(), true
+ );
+ ki.commit();
+
+ // Load the serialized query from the test resources
+ String json = getString(
+ getClass().getResource("/queries/bugs/multiple_classes.jsonld").getFile()
+ );
+
+ KorapSearch ks = new KorapSearch(json);
+ KorapResult kr = ks.run(ki);
+
+ // Parse the token list JSON produced for the result
+ ObjectMapper mapper = new ObjectMapper();
+ JsonNode res = mapper.readTree(kr.toTokenListJSON());
+
+ // Result metadata
+ assertEquals(1, res.at("/totalResults").asInt());
+ assertEquals("{4: spanNext({1: spanNext({2: tokens:s:ins}, " +
+ "{3: tokens:s:Leben})}, tokens:s:gerufen)}", res.at("/query").asText());
+ assertEquals(0, res.at("/startIndex").asInt());
+ assertEquals(25, res.at("/itemsPerPage").asInt());
+
+ // The single match and the int pair of each of its three tokens
+ assertEquals("BZK_D59.00089", res.at("/matches/0/textSigle").asText());
+ assertEquals(328, res.at("/matches/0/tokens/0/0").asInt());
+ assertEquals(331, res.at("/matches/0/tokens/0/1").asInt());
+ assertEquals(332, res.at("/matches/0/tokens/1/0").asInt());
+ assertEquals(337, res.at("/matches/0/tokens/1/1").asInt());
+ assertEquals(338, res.at("/matches/0/tokens/2/0").asInt());
+ assertEquals(345, res.at("/matches/0/tokens/2/1").asInt());
+ };
@Test