Add token serialization as a snippet option (fixes #72)
Change-Id: Ibc175000d162ceba5de79b11ef0d70d031e20e9d
diff --git a/.gitignore b/.gitignore
index 5cce705..613b6da 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@
/wiki.org
/test-output
/misc/web-api.md
+*.code-workspace
*~
.*
*\#
diff --git a/Changes b/Changes
index 607d58d..5724b9c 100644
--- a/Changes
+++ b/Changes
@@ -1,7 +1,10 @@
-0.59.7 2021-11-08
- - Implemented a new cache with on disk storage and auto-update (margaretha).
+0.59.7 2021-12-01
+ - [feature] Implemented a new cache with on disk
+ storage and auto-update (margaretha).
+ - [feature] Support for tokenized snippet output
+ (fixed #72; diewald)
-0.59.6 2021-11-10
+0.59.6
- [bugfix] Fixed skipping of focus spans (fixed #78; margaretha,
diewald)
- [bugfix] Clear matchlist if skip fails in NextSpans
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 4e9ea06..fcec1e1 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -963,6 +963,10 @@
if (match.getStartPos() == -1)
return match;
+ // For the moment, direct match retrievals will always include
+ // snippets. But this may change in the future.
+ match.hasSnippet = true;
+
// Create a filter based on the corpusID and the docID
BooleanQuery bool = new BooleanQuery();
if (match.getTextSigle() != null) {
@@ -1319,13 +1323,16 @@
* Search in the index.
*/
public Result search (SpanQuery query) {
- return this.search(new Krill(query));
+ final Krill krill = new Krill(query);
+ krill.getMeta().setSnippets(true);
+ return this.search(krill);
};
public Result search (SpanQuery query, short count) {
final Krill krill = new Krill(query);
krill.getMeta().setCount(count);
+ krill.getMeta().setSnippets(true);
return this.search(krill);
};
@@ -1340,6 +1347,7 @@
meta.setStartIndex(startIndex).setCount(count);
meta.setContext(new SearchContext(leftTokenContext, leftContext,
rightTokenContext, rightContext));
+ meta.setSnippets(true);
return this.search(ks);
};
@@ -1379,7 +1387,7 @@
// The following fields should be lifted for matches
List<String> fields = (ArrayList<String>) meta.getFields().clone();
HashSet<String> fieldsSet = new HashSet<String>(fields);
- boolean snippets = meta.hasSnippets();
+ boolean snippets = meta.hasSnippets() || meta.hasTokens();
// Lift all fields
if (fields.contains("@all")) {
@@ -1447,7 +1455,6 @@
if (isTimeout)
break;
-
/*
* Todo: There may be a way to know early if the bitset is emty
@@ -1525,7 +1532,7 @@
// Create new Match
final Match match = new Match(pto, localDocID,
spans.start(), spans.end());
-
+
// Add snippet if existing
if (snippets) {
match.setContext(kr.getContext());
@@ -1536,7 +1543,15 @@
if (spans.isPayloadAvailable())
match.addPayload((List<byte[]>) spans.getPayload());
- }
+
+ if (meta.hasSnippets()) {
+ match.hasSnippet = true;
+ };
+
+ if (meta.hasTokens()) {
+ match.hasTokens = true;
+ };
+ };
// Add match to Result
kr.add(match);
diff --git a/src/main/java/de/ids_mannheim/korap/KrillMeta.java b/src/main/java/de/ids_mannheim/korap/KrillMeta.java
index eddafd8..dbd814f 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillMeta.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillMeta.java
@@ -29,6 +29,8 @@
// Per default snippets are requested
private boolean snippets = true;
+ private boolean tokens = false;
+
private ArrayList<String> fields;
HashSet<Integer> highlights;
@@ -144,10 +146,15 @@
if (json.has("itemsPerResource"))
this.setItemsPerResource(json.get("itemsPerResource").asInt());
- // Defined context
+ // Defined snippets
if (json.has("snippets")) {
this.snippets = json.get("snippets").asBoolean();
};
+
+ // Defined tokens
+ if (json.has("tokens")) {
+ this.tokens = json.get("tokens").asBoolean();
+ };
// Defined context
if (json.has("context"))
@@ -284,6 +291,23 @@
this.snippets = snippets;
return this;
};
+
+
+ /**
+ * Get if tokens should be retrieved.
+ */
+ public boolean hasTokens () {
+ return this.tokens;
+ };
+
+
+ /**
+ * Set if tokens should be retrieved.
+ */
+ public KrillMeta setTokens (boolean tokens) {
+ this.tokens = tokens;
+ return this;
+ };
// Get set of fields
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index 6f46fca..8c1498e 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -139,8 +139,13 @@
identifier,
mirrorIdentifier;
+ private ObjectNode snippetTokens;
+
private HighlightCombinator snippetArray;
+ public boolean hasSnippet = false;
+ public boolean hasTokens = false;
+
@JsonIgnore
public boolean startMore = true, endMore = true;
@@ -1122,6 +1127,7 @@
this.processed = false;
this.snippetHTML = null;
this.snippetBrackets = null;
+ this.snippetTokens = null;
this.identifier = null;
// Delete all spans
@@ -1351,8 +1357,177 @@
};
};
+ /**
+ * Return the snippet as token lists: "match", plus "left", "right" and "classes" where present.
+ */
+ @JsonIgnore
+ public ObjectNode getSnippetTokens () {
+ ObjectNode json = mapper.createObjectNode();
- @JsonProperty("snippet")
+ if (this.processed && this.snippetTokens != null)
+ return this.snippetTokens;
+
+ if (DEBUG)
+ log.trace("--- Process tokens");
+
+ if (this.positionsToOffset == null || this.localDocID == -1)
+ return null;
+
+ PositionsToOffset pto = this.positionsToOffset;
+ int ldid = this.localDocID;
+
+ int startContext = -1;
+ int endContext = -1;
+ int startContextChar = -1;
+ int endContextChar = -1;
+
+ int pdl = this.getPrimaryDataLength();
+
+ // Get context based on a span definition
+ if (this.getContext().isSpanDefined()) {
+
+ if (DEBUG)
+ log.debug("Context defined by span");
+
+ int[] spanContext = this.expandContextToSpan(
+ this.positionsToOffset.getLeafReader(), (Bits) null,
+ "tokens", this.context.getSpanContext());
+ startContext = spanContext[0];
+ endContext = spanContext[1];
+ startContextChar = spanContext[2];
+ endContextChar = spanContext[3];
+ }
+
+ // The offset is not yet defined - and defined by tokens
+ if (endContext == -1) {
+
+ if (DEBUG)
+ log.debug("No context defined by span");
+
+ if (this.context.left.isToken() && this.context.left.getLength() > 0) {
+ startContext = this.startPos - this.context.left.getLength();
+ if (startContext < 0)
+ startContext = 0;
+ };
+
+ if (this.context.right.isToken() && this.context.right.getLength() > 0) {
+ endContext = this.endPos + this.context.right.getLength() - 1;
+ };
+ };
+
+ if (startContext == -1) { // No left context was resolved: start at the match itself
+ startContext = this.startPos;
+ if (DEBUG)
+ log.debug("Set startContext {}", startContext);
+ };
+
+ if (endContext == -1) {
+ endContext = this.endPos - 1;
+ if (DEBUG)
+ log.debug("Set endContext {}", endContext);
+ };
+
+ // Retrieve the character offsets for all tokens
+ for (int i = startContext; i < endContext; i++) {
+ pto.add(ldid, i);
+ };
+
+ if (startContextChar == -1)
+ startContextChar = pto.start(ldid, startContext);
+
+ if (endContextChar == -1)
+ endContextChar = pto.end(ldid, endContext);
+
+ if (DEBUG)
+ log.debug("Match is {}/{} - {}/{}",startContext,startContextChar,endContext,endContextChar);
+
+ if (endContextChar == -1 || endContextChar == 0 || endContextChar > pdl) {
+ this.tempSnippet = this.getPrimaryData(startContextChar);
+ this.endMore = false;
+ } else {
+ this.tempSnippet = this.getPrimaryData(startContextChar,endContextChar);
+ }
+
+ if (startContext == 0) {
+ this.startMore = false;
+ }
+
+ Integer[] offsets;
+ ArrayNode tokens;
+ int i;
+
+ // Create left context token list
+ if (startContext < this.startPos) {
+ tokens = json.putArray("left");
+ for (i = startContext; i < this.startPos; i++) {
+ offsets = pto.span(ldid,i);
+ tokens.add(
+ this.tempSnippet.substring(
+ offsets[0]- startContextChar, offsets[1] - startContextChar)
+ );
+ };
+ };
+
+ tokens = json.putArray("match");
+ for (i = this.startPos; i < this.endPos; i++) {
+ offsets = pto.span(ldid,i);
+ if (offsets == null) {
+ continue;
+ }
+ tokens.add(
+ this.tempSnippet.substring(
+ offsets[0]- startContextChar, offsets[1] - startContextChar)
+ );
+ };
+
+ // Create right context token list
+ if (endContext > this.endPos) {
+ tokens = null;
+ for (i = this.endPos; i < endContext; i++) {
+ offsets = pto.span(ldid,i);
+ if (offsets == null) {
+ break;
+ };
+
+ if (tokens == null)
+ tokens = json.putArray("right");
+
+ tokens.add(
+ this.tempSnippet.substring(
+ offsets[0]- startContextChar, offsets[1] - startContextChar)
+ );
+ };
+ };
+
+ // Add class arrays to JSON
+ if (this.highlight != null) {
+
+ ArrayNode classes = null;
+ for (Highlight highlight : this.highlight) {
+
+ if (highlight.number < 0 || highlight.number > 255)
+ continue;
+
+ // Highlight is a pagebreak
+ if (highlight.end == PB_MARKER)
+ continue;
+
+ if (classes == null)
+ classes = json.putArray("classes");
+
+ ArrayNode cls = mapper.createArrayNode();
+ cls.add(highlight.number);
+ cls.add(highlight.start - this.startPos);
+ cls.add(highlight.end - this.startPos);
+ classes.add(cls);
+ };
+ };
+
+ return (this.snippetTokens = json);
+ };
+
+
+ @JsonIgnore
public String getSnippetHTML () {
if (!this._processHighlight())
@@ -1642,7 +1817,7 @@
* Sometimes the match start and end positions are inside the
* matching region, e.g. when the match was expanded.
* This will override the original matching positions
- * And matrk the real matching.
+ * And mark the real matching.
*/
public void overrideMatchPosition (int start, int end) {
if (DEBUG)
@@ -1893,7 +2068,7 @@
};
if (DEBUG)
- log.trace("Snippet: '" + this.tempSnippet + "'");
+ log.trace("Snippet: '{}'", this.tempSnippet);
if (DEBUG)
log.trace(
@@ -1926,6 +2101,12 @@
json.put("pages", pages);
};
+
+ if (this.hasSnippet)
+ json.put("snippet", this.getSnippetHTML());
+
+ if (this.hasTokens)
+ json.put("tokens", this.getSnippetTokens());
return json;
};
diff --git a/src/main/java/de/ids_mannheim/korap/response/Result.java b/src/main/java/de/ids_mannheim/korap/response/Result.java
index 32b3cfe..873f137 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Result.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Result.java
@@ -274,8 +274,16 @@
this._addMeta(json);
// Add matches
- if (this.matches != null)
- json.putPOJO("matches", this.getMatches());
+ if (this.matches != null) {
+
+ // Initiate matches
+ ArrayNode matches = json.putArray("matches");
+
+ // Add matches
+ for (Match km : this.getMatches()) {
+ matches.add(km.toJsonNode());
+ };
+ };
return json;
};
diff --git a/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java b/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
index 666fd73..27f03c9 100644
--- a/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
+++ b/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
@@ -533,4 +533,45 @@
km.getSnippetHTML());
};
+
+ @Test
+ public void checkTokenArray () throws IOException, QueryException {
+
+ KrillIndex ki = new KrillIndex();
+ String json = new String("{" + " \"fields\" : [" + " { "
+ + " \"primaryData\" : \"abc\"" + " }," + " {"
+ + " \"name\" : \"tokens\"," + " \"data\" : ["
+ + " [ \"s:a\", \"i:a\", \"_0#0-1\", \"-:t$<i>3\"],"
+ + " [ \"s:b\", \"i:b\", \"_1#1-2\" ],"
+ + " [ \"s:c\", \"i:c\", \"_2#2-3\" ]" + " ]"
+ + " }" + " ]" + "}");
+
+ ki.addDoc(json);
+ ki.commit();
+
+ QueryBuilder kq = new QueryBuilder("tokens");
+ Result kr = ki
+ .search((SpanQuery) kq.seq(kq.nr(1, kq.seg("s:b")), kq.seg("s:c")).toQuery());
+ Match km = kr.getMatch(0);
+ assertEquals(km.getStartPos(), 1);
+ assertEquals(km.getEndPos(), 3);
+ assertEquals(km.getStartPos(1), 1);
+ assertEquals(km.getEndPos(1), 2);
+
+ assertEquals(
+ "{\"left\":[\"a\"],\"match\":[\"b\",\"c\"],\"classes\":[[1,0,0]]}",
+ km.getSnippetTokens().toString());
+
+ kq = new QueryBuilder("tokens");
+ kr = ki
+ .search((SpanQuery) kq.seq(kq.seg("s:a"), kq.seg("s:b"), kq.seg("s:c")).toQuery());
+ km = kr.getMatch(0);
+ assertEquals(km.getStartPos(), 0);
+ assertEquals(km.getEndPos(), 3);
+
+ assertEquals(
+ "{\"match\":[\"a\",\"b\",\"c\"]}",
+ km.getSnippetTokens().toString());
+
+ };
};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
index 1d8f3cd..ade2efd 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
@@ -153,6 +153,8 @@
assertEquals("SnippetBrackets (0)", "... bcabca[[{2:b{1:a}}]]c",
km.getSnippetBrackets());
+ assertEquals("SnippetTokens (0)", "{\"left\":[\"b\",\"c\",\"a\",\"b\",\"c\",\"a\"],\"match\":[\"b\",\"a\"],\"right\":[\"c\"],\"classes\":[[2,0,1],[1,1,1]]}",
+ km.getSnippetTokens().toString());
assertEquals("ID (0)", "match-c1!d1-p7-9(2)7-8(1)8-8", km.getID());
};
@@ -170,6 +172,8 @@
assertEquals("SnippetBrackets (0)", "... [[{2:b{a}}]] ...",
km.getSnippetBrackets());
+ assertEquals("SnippetTokens (0)", "{\"match\":[\"b\",\"a\"],\"classes\":[[0,1,1],[2,0,1]]}",
+ km.getSnippetTokens().toString());
assertEquals("ID (0)", "match-c1!d1-p7-9(0)8-8(2)7-8", km.getID());
@@ -179,6 +183,8 @@
assertEquals("SnippetBrackets (1)",
"... [[{f/m:acht:b}{f/m:neun:a}]] ...",
km.getSnippetBrackets());
+ assertEquals("SnippetTokens (1)", "{\"match\":[\"b\",\"a\"]}",
+ km.getSnippetTokens().toString());
// Mirror identifier when passed
km = ki.getMatchInfo("match-c1!d1-p7-9(0)8-8(2)7-8", "tokens", "f",
@@ -186,6 +192,8 @@
assertEquals("SnippetBrackets (1b)",
"... [[{f/m:acht:{f/y:eight:b}}{f/m:neun:{f/y:nine:a}}]] ...",
km.getSnippetBrackets());
+ assertEquals("SnippetTokens (1b)", "{\"match\":[\"b\",\"a\"]}",
+ km.getSnippetTokens().toString());
JsonNode res = mapper.readTree(km.toJsonString());
assertEquals("match-c1!d1-p7-9(0)8-8(2)7-8",
@@ -197,6 +205,8 @@
assertEquals("SnippetBrackets (2)",
"... [[{2:{f/m:acht:b}{{f/m:neun:a}}}]] ...",
km.getSnippetBrackets());
+ assertEquals("SnippetTokens (2)", "{\"match\":[\"b\",\"a\"],\"classes\":[[0,1,1],[2,0,1]]}",
+ km.getSnippetTokens().toString());
km = ki.getMatchInfo("match-c1!d1-p7-9(4)8-8(2)7-8", "tokens", "f", "m",
false, true);
@@ -204,6 +214,8 @@
assertEquals("SnippetBrackets (3)",
"... [[{2:{f/m:acht:b}{4:{f/m:neun:a}}}]] ...",
km.getSnippetBrackets());
+ assertEquals("SnippetTokens (3)", "{\"match\":[\"b\",\"a\"],\"classes\":[[4,1,1],[2,0,1]]}",
+ km.getSnippetTokens().toString());
km = ki.getMatchInfo("match-c1!d1-p7-9(4)8-8(2)7-8", "tokens", "f",
null, false, true);
@@ -211,6 +223,8 @@
assertEquals("SnippetBrackets (4)",
"... [[{2:{f/m:acht:{f/y:eight:b}}{4:{f/m:neun:{f/y:nine:a}}}}]] ...",
km.getSnippetBrackets());
+ assertEquals("SnippetTokens (4)", "{\"match\":[\"b\",\"a\"],\"classes\":[[4,1,1],[2,0,1]]}",
+ km.getSnippetTokens().toString());
assertEquals("SnippetHTML (4)",
"<span class=\"context-left\">" + "<span class=\"more\">"
@@ -1012,7 +1026,11 @@
assertEquals("SnippetBrackets (with Spans)",
"[[{x/tag:a:{x/tag:b:{x/tag:c:{x/tag:v:x}}y}}z]]",
km.getSnippetBrackets());
-
+
+ assertEquals("SnippetTokens (without Spans)",
+ "{\"match\":[\"x\",\"y\",\"z\"]}",
+ km.getSnippetTokens().toString());
+
assertEquals(fd.getTextSigle(), "GOE/AGA/03828");
assertEquals(fd.getFieldValue("title"), "Autobiographische Einzelheiten");
diff --git a/src/test/java/de/ids_mannheim/korap/search/TestKrill.java b/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
index 32b665c..84d8636 100644
--- a/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
+++ b/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
@@ -102,6 +102,7 @@
meta.setStartIndex(5);
meta.getContext().left.setLength(1);
meta.getContext().right.setLength(1);
+ assertTrue(meta.hasSnippets());
Result kr = ks.apply(ki);
assertEquals(kr.getTotalResults(), 6);
@@ -110,20 +111,49 @@
"... dem [[Buchstaben]] A ...");
JsonNode res = ks.toJsonNode();
+
assertEquals(3, res.at("/meta/count").asInt());
assertEquals(5, res.at("/meta/startIndex").asInt());
assertEquals("token", res.at("/meta/context/left/0").asText());
assertEquals(1, res.at("/meta/context/left/1").asInt());
assertEquals("token", res.at("/meta/context/right/0").asText());
assertEquals(1, res.at("/meta/context/right/1").asInt());
+ assertTrue(res.at("/matches/0/snippet").isMissingNode());
+ assertTrue(res.at("/matches/0/tokens").isMissingNode());
+
+ res = kr.toJsonNode();
+
+ assertFalse(res.at("/matches/0/snippet").isMissingNode());
+ assertTrue(res.at("/matches/0/tokens").isMissingNode());
+
// Handle count=0 correctly
meta = ks.getMeta();
meta.setCount(0);
+
kr = ks.apply(ki);
assertEquals(kr.getTotalResults(), 6);
assertEquals(kr.getItemsPerPage(), 0);
assertEquals(kr.getMatches().size(), 0);
+
+ // Handle tokens=true and
+ // snippet=false correctly
+ meta = ks.getMeta();
+ meta.setCount(1);
+ meta.setTokens(true);
+ meta.setSnippets(false);
+
+ kr = ks.apply(ki);
+ assertEquals(kr.getTotalResults(), 6);
+ assertEquals(kr.getMatches().size(), 1);
+
+ res = kr.toJsonNode();
+
+ assertFalse(res.at("/matches/0/hasSnippet").asBoolean());
+ assertTrue(res.at("/matches/0/hasTokens").asBoolean());
+ assertTrue(res.at("/matches/0/snippet").isMissingNode());
+ assertEquals("dem", res.at("/matches/0/tokens/left/0").asText());
+ assertEquals("Buchstaben", res.at("/matches/0/tokens/match/0").asText());
};
diff --git a/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java b/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java
index 5e6bce8..2042175 100644
--- a/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java
+++ b/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java
@@ -239,7 +239,7 @@
assertTrue(resultJson.indexOf("\"textSigle\":\"GOE_AGX.00002\"") > 0);
assertTrue(resultJson.indexOf("\"docSigle\":\"GOE_AGX\"") > 0);
assertTrue(resultJson.indexOf("\"corpusSigle\":\"GOE\"") > 0);
- assertTrue(resultJson.indexOf("\"UID\":") > 0);
+ // assertTrue(resultJson.indexOf("\"UID\":") > 0);
assertTrue(resultJson.indexOf("\"availability\":") > 0);
assertEquals(