Support vector representation in KoralQuery terms
Change-Id: Ia6b3ca682b52e797c589be921a3d5c5b93083502
diff --git a/Changes b/Changes
index dd69c3c..fbede7d 100644
--- a/Changes
+++ b/Changes
@@ -1,12 +1,14 @@
-0.59.0 2019-11-07
+0.59.0 2019-11-27
- [bugfix] Fix offset retrieval in concurrent searches
(diewald)
- [cleanup] Removed deprecated numberOf() method from index
(diewald)
- [bugfix] Fix offset retrieval in concurrent getMatchInfo requests
(diewald)
- - Updated readme and the version of java and some plugins & libraries.
- (margaretha)
+ - [cleanup] Updated readme and the version of java and some plugins
+ & libraries (margaretha)
+ - [feature] Support for vector representation of terms in KoralQuery
+ (diewald)
0.58.7 2019-09-16
- [bugfix] Fix the behaviour of negative operands in virtual
diff --git a/src/main/java/de/ids_mannheim/korap/KrillQuery.java b/src/main/java/de/ids_mannheim/korap/KrillQuery.java
index a5f6e31..791a15b 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillQuery.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillQuery.java
@@ -2,6 +2,7 @@
import java.io.IOException;
import java.util.ArrayList;
+import java.util.LinkedList;
import java.util.List;
import java.util.Iterator;
@@ -316,7 +317,7 @@
// Get wrapped token
return this._segFromJson(json.get("wrap"));
- case "koral:span":
+ case "koral:span":
// EM: what to do with empty koral:span?
// it is allowed only in relation queries
if (isOperationRelation && !json.has("key") && !json.has("wrap") && !json.has("attr")) {
@@ -1123,15 +1124,31 @@
Boolean isTerm = termType.equals("koral:term") ? true
: false;
Boolean isCaseInsensitive = false;
-
- if (!json.has("key") || json.get("key").asText().length() < 1) {
- // why must it have an attr?
+
+ if (!json.has("key") ||
+ (json.get("key").size() == 1 && json.get("key").asText().length() < 1)) {
+
+ // It may have no key but an attribute
if (!json.has("attr")) {
// return new SpanRepetitionQueryWrapper();
throw new QueryException(740,
"Key definition is missing in term or span");
}
};
+
+
+ // Term is represented as a list of keys
+ LinkedList<String> keys = new LinkedList<String>();
+
+ if (json.has("key")) {
+ if (json.get("key").size() > 1) {
+ for (JsonNode value : json.get("key")) {
+ keys.push(value.asText());
+ }
+ } else {
+ keys.push(json.get("key").asText());
+ }
+ }
// Empty koral:span hack
if (isSpan) {
@@ -1159,44 +1176,50 @@
};
};
- StringBuilder value = new StringBuilder();
-
- if (direction != null)
- value.append(direction.value());
-
// expect orth? expect lemma?
// s:den | i:den | cnx/l:die | mate/m:mood:ind | cnx/syn:@PREMOD |
// mate/m:number:sg | opennlp/p:ART
- if (json.has("foundry") && json.get("foundry").asText().length() > 0) {
+ StringBuilder value = new StringBuilder();
+ LinkedList<String> values = new LinkedList<String>();
+
+ if (direction != null)
+ value.append(direction.value());
+
+ if (json.has("foundry") &&
+ json.get("foundry").asText().length() > 0) {
value.append(json.get("foundry").asText()).append('/');
};
+
// No default foundry defined
- if (json.has("layer") && json.get("layer").asText().length() > 0) {
+ if (json.has("layer") &&
+ json.get("layer").asText().length() > 0) {
String layer = json.get("layer").asText();
switch (layer) {
- case "lemma":
- layer = "l";
- break;
+ case "lemma":
+ layer = "l";
+ break;
- case "pos":
- layer = "p";
- break;
+ case "pos":
+ layer = "p";
+ break;
- case "orth":
- // TODO: THIS IS AN UGLY HACK! AND SHOULD BE NAMED "SURFACE" or . OR *
- layer = ".";
- break;
+ case "orth":
+ // TODO:
+ // THIS IS AN UGLY HACK!
+ // AND SHOULD BE NAMED "SURFACE" or . OR *
+ layer = ".";
+ break;
- case "struct":
- layer = "s";
- break;
+ case "struct":
+ layer = "s";
+ break;
- case "const":
- layer = "c";
- break;
+ case "const":
+ layer = "c";
+ break;
};
if (isCaseInsensitive && isTerm) {
@@ -1204,7 +1227,7 @@
layer = "i";
else {
this.addWarning(767,
- "Case insensitivity is currently not supported for this layer");
+ "Case insensitivity is currently not supported for this layer");
};
};
@@ -1220,17 +1243,33 @@
value.append(layer).append(':');
};
- if (json.has("key") && json.get("key").asText().length() > 0) {
- String key = json.get("key").asText();
- value.append(isCaseInsensitive ? key.toLowerCase() : key);
- };
+ // Remember the common prefix for all values
+ int offset = value.length();
- if (json.has("value") && json.get("value").asText().length() > 0)
- value.append(':').append(json.get("value").asText());
+ // Iterate over all keys
+ for (String key : keys) {
+
+ // Reset to common prefix
+ value.setLength(offset);
+
+ // Add key to value
+ value.append(isCaseInsensitive ? key.toLowerCase() : key);
+
+ // TODO:
+ // This should iterate over all values as well
+ if (json.has("value") && json.get("value").asText().length() > 0)
+ value.append(':').append(json.get("value").asText());
+
+ // Add to value list
+ values.push(value.toString());
+ };
// Regular expression or wildcard
if (isTerm) {
+ // Create alter query
+ SpanAlterQueryWrapper saqw = new SpanAlterQueryWrapper(this.field);
+
String match = "match:eq";
if (json.has("match")) {
match = json.get("match").asText();
@@ -1243,37 +1282,43 @@
switch (json.get("type").asText()) {
case "type:regex": {
- // The regex can be rewritten to an any token
- if (value.toString().matches("^[si]:\\.[\\+\\*]\\??$")) {
- return new SpanRepetitionQueryWrapper();
+ for (String v : values) {
+
+ // The regex can be rewritten to an any token
+ if (v.matches("^[si]:\\.[\\+\\*]\\??$")) {
+ return new SpanRepetitionQueryWrapper();
+ };
+
+ saqw.or(qb.re(v, isCaseInsensitive));
};
- SpanRegexQueryWrapper srqw = qb.re(value.toString(), isCaseInsensitive);
-
if (match.equals("match:ne")) {
if (DEBUG)
log.trace("Term is negated");
- // ssqw.makeNegative();
- return this.builder().seg().without(srqw);
+ saqw.setNegative(true);
+ return saqw;
}
else if (match.equals("match:eq")) {
- return srqw;
+ return saqw;
}
throw new QueryException(741, "Match relation unknown");
}
case "type:wildcard": {
- SpanWildcardQueryWrapper swcqw =
- qb.wc(value.toString(), isCaseInsensitive);
+ // Wildcard queries are deprecated in KoralQuery since 9/2017
+
+ for (String v : values) {
+ saqw.or(qb.wc(v, isCaseInsensitive));
+ };
if (match.equals("match:ne")) {
if (DEBUG)
log.trace("Term is negated");
- // ssqw.makeNegative();
- return this.builder().seg().without(swcqw);
+ saqw.setNegative(true);
+ return saqw;
}
else if (match.equals("match:eq")) {
- return swcqw;
+ return saqw;
};
throw new QueryException(741, "Match relation unknown");
}
@@ -1286,21 +1331,39 @@
};
};
- SpanSegmentQueryWrapper ssqw = this.builder().seg(value.toString());
+ // TODO:
+ // This could alternatively use
+ // https://github.com/tokuhirom/regexp-trie
+ for (String v : values) {
+ saqw.or(v);
+ };
+
if (match.equals("match:ne")) {
if (DEBUG)
log.trace("Term is negated");
- ssqw.makeNegative();
- return this.builder().seg().without(ssqw);
+
+ // Segment "without" doesn't work in
+ // attribute contexts
+ saqw.setNegative(true);
+ return saqw;
}
else if (match.equals("match:eq")) {
- return ssqw;
+ return saqw;
}
else {
throw new QueryException(741, "Match relation unknown");
}
};
+ if (values.size() > 1) {
+ throw new QueryException(
+ 0,
+ "List representation for spans not yet supported"
+ );
+
+ };
+
+ // Term has attribute
if (json.has("attr")) {
JsonNode attrNode = json.get("attr");
if (!attrNode.has("@type")) {
@@ -1331,11 +1394,12 @@
private SpanQueryWrapper _createElementAttrFromJson (
SpanQueryWrapper elementWithIdWrapper, JsonNode json,
JsonNode attrNode) throws QueryException {
-
+
if (attrNode.get("@type").asText().equals("koral:term")) {
SpanQueryWrapper attrWrapper = _attrFromJson(json.get("attr"));
+
if (attrWrapper != null) {
- if (elementWithIdWrapper != null) {
+ if (elementWithIdWrapper != null) {
return new SpanWithAttributeQueryWrapper(
elementWithIdWrapper, attrWrapper);
}
@@ -1430,10 +1494,8 @@
String rootValue = attrNode.get("root").asText();
if (rootValue.equals("true") || rootValue.equals("false")) {
- // TODO: Here do not refer to 'tokens'!!!
- // EM: what should it be? property?
return new SpanAttributeQueryWrapper(new SpanSimpleQueryWrapper(
- "tokens", "@root", Boolean.valueOf(rootValue)));
+ this.field, "@root", Boolean.valueOf(rootValue)));
}
}
return null;
diff --git a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanAlterQueryWrapper.java b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanAlterQueryWrapper.java
index 4ed669f..00c2efd 100644
--- a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanAlterQueryWrapper.java
+++ b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanAlterQueryWrapper.java
@@ -39,8 +39,14 @@
};
};
+ public SpanAlterQueryWrapper setNegative (Boolean neg) {
+ this.isNegative = neg;
+ return this;
+ };
public SpanAlterQueryWrapper or (String term) {
+ // TODO:
+ // Potentially optimizable by directly add()ing
SpanQueryWrapper sqw = new SpanSimpleQueryWrapper(this.field, term);
return this.or(sqw);
};
@@ -50,6 +56,7 @@
if (term.isNull())
return this;
+ // Check! This seems to render the whole group negative!
if (term.isNegative())
this.isNegative = true;
@@ -103,7 +110,7 @@
public SpanQuery toFragmentQuery () throws QueryException {
if (this.isNull || this.alternatives.size() == 0)
return (SpanQuery) null;
-
+
if (this.alternatives.size() == 1) {
return (SpanQuery) this.alternatives.get(0)
.retrieveNode(this.retrieveNode).toFragmentQuery();
diff --git a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSegmentQueryWrapper.java b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSegmentQueryWrapper.java
index 18752ce..4d1c712 100644
--- a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSegmentQueryWrapper.java
+++ b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSegmentQueryWrapper.java
@@ -205,6 +205,7 @@
else if (this.inclusive.size() == 0 && this.exclusive.size() >= 1) {
return (SpanQuery) this._listToQuery(this.exclusive);
}
+
else if (this.inclusive.size() >= 1 && this.exclusive.size() == 0) {
return (SpanQuery) this._listToQuery(this.inclusive);
};
@@ -215,6 +216,11 @@
private SpanQuery _listToQuery (ArrayList<SpanQueryWrapper> list)
throws QueryException {
+
+ if (list.size() == 1) {
+ return (SpanQuery) list.get(0).toFragmentQuery();
+ };
+
SpanQuery query = list.get(0).toFragmentQuery();
for (int i = 1; i < list.size(); i++) {
diff --git a/src/test/java/de/ids_mannheim/korap/query/TestKrillQueryJSON.java b/src/test/java/de/ids_mannheim/korap/query/TestKrillQueryJSON.java
index 9627308..b4bcf5c 100644
--- a/src/test/java/de/ids_mannheim/korap/query/TestKrillQueryJSON.java
+++ b/src/test/java/de/ids_mannheim/korap/query/TestKrillQueryJSON.java
@@ -597,7 +597,91 @@
};
};
+ @Test
+ public void queryJSONtermVector () throws QueryException {
+ // base=foo|base=bar|base=xyz|base=abc
+ try {
+ String json = getJsonString(getClass()
+ .getResource(
+ "/queries/segment/vector.jsonld")
+ .getFile());
+ KrillQuery kq = new KrillQuery("tokens");
+ assertEquals("spanOr([tokens:s:foo, tokens:s:bar, tokens:s:xyz, tokens:s:abc])",
+ kq.fromKoral(json).toQuery().toString());
+ }
+ catch (QueryException e) {
+ fail(e.getMessage());
+ };
+ };
+
+ @Test
+ public void queryJSONtermVectorCaseInsensitive () throws QueryException {
+ // base=fOo|base=bAr|base=xYz|base=aBc
+ try {
+ String json = getJsonString(getClass()
+ .getResource(
+ "/queries/segment/vector-caseinsensitive.jsonld")
+ .getFile());
+ KrillQuery kq = new KrillQuery("tokens");
+
+ assertEquals("spanOr([tokens:i:foo, tokens:i:bar, tokens:i:xyz, tokens:i:abc])",
+ kq.fromKoral(json).toQuery().toString());
+ }
+ catch (QueryException e) {
+ fail(e.getMessage());
+ };
+ };
+
+
+ @Test
+ public void queryJSONwildcardVector () throws QueryException {
+ // base=f?o|base=bar|base=x*z|base=abc
+ try {
+ String json = getJsonString(getClass()
+ .getResource(
+ "/queries/segment/vector-wildcards.jsonld")
+ .getFile());
+ KrillQuery kq = new KrillQuery("tokens");
+
+ assertEquals("spanOr([" +
+ "SpanMultiTermQueryWrapper(tokens:s:f?o), " +
+ "SpanMultiTermQueryWrapper(tokens:s:bar), " +
+ "SpanMultiTermQueryWrapper(tokens:s:x*z), " +
+ "SpanMultiTermQueryWrapper(tokens:s:abc)" +
+ "])",
+ kq.fromKoral(json).toQuery().toString());
+ }
+ catch (QueryException e) {
+ fail(e.getMessage());
+ };
+ };
+
+
+ @Test
+ public void queryJSONregexVector () throws QueryException {
+ // base=f.?o|base=b[au]r|base=x(yz)*|base=ab+c
+ try {
+ String json = getJsonString(getClass()
+ .getResource(
+ "/queries/segment/vector-regex.jsonld")
+ .getFile());
+ KrillQuery kq = new KrillQuery("tokens");
+
+ assertEquals("spanOr([" +
+ "SpanMultiTermQueryWrapper(tokens:/s:f.?o/), " +
+ "SpanMultiTermQueryWrapper(tokens:/s:b[au]r/), " +
+ "SpanMultiTermQueryWrapper(tokens:/s:x(yz)*/), " +
+ "SpanMultiTermQueryWrapper(tokens:/s:ab+c/)" +
+ "])",
+ kq.fromKoral(json).toQuery().toString());
+ }
+ catch (QueryException e) {
+ fail(e.getMessage());
+ };
+ };
+
+
@Test
public void queryJSONregexRewrite1 () throws QueryException {
// "der" [.+?]
@@ -612,6 +696,25 @@
@Test
+ public void queryJSONregexVectorRewrite () throws QueryException {
+ // der [base=f.?o|base=b[au]r|base=.*|base=ab+c]
+ try {
+ String json = getJsonString(
+ getClass()
+ .getResource("/queries/sequence/regex-rewrite-vector.jsonld")
+ .getFile());
+ KrillQuery kq = new KrillQuery("tokens");
+
+ assertEquals("focus(254: spanContain(<tokens:base/s:t />, {254: spanExpansion(tokens:s:der, []{1, 1}, right)}))",
+ kq.fromKoral(json).toQuery().toString());
+ }
+ catch (QueryException e) {
+ fail(e.getMessage());
+ };
+ };
+
+
+ @Test
public void queryJSONmerge () throws QueryException {
// treat merging gracefully
String json = getJsonString(getClass()
diff --git a/src/test/resources/queries/segment/vector-caseinsensitive.jsonld b/src/test/resources/queries/segment/vector-caseinsensitive.jsonld
new file mode 100644
index 0000000..505ff2e
--- /dev/null
+++ b/src/test/resources/queries/segment/vector-caseinsensitive.jsonld
@@ -0,0 +1,12 @@
+{
+ "@context" : "http://ids-mannheim.de/ns/KorAP/json-ld/v0.3/context.jsonld",
+ "query": {
+ "@type": "koral:token",
+ "wrap": {
+ "@type": "koral:term",
+ "layer": "orth",
+ "key": ["fOo","bAr","xYz","aBc"],
+ "flags":["flags:caseInsensitive"]
+ }
+ }
+}
diff --git a/src/test/resources/queries/segment/vector-regex.jsonld b/src/test/resources/queries/segment/vector-regex.jsonld
new file mode 100644
index 0000000..d682722
--- /dev/null
+++ b/src/test/resources/queries/segment/vector-regex.jsonld
@@ -0,0 +1,12 @@
+{
+ "@context" : "http://ids-mannheim.de/ns/KorAP/json-ld/v0.3/context.jsonld",
+ "query": {
+ "@type": "koral:token",
+ "wrap": {
+ "@type": "koral:term",
+ "layer": "orth",
+ "key": ["f.?o","b[au]r","x(yz)*","ab+c"],
+ "type":"type:regex"
+ }
+ }
+}
diff --git a/src/test/resources/queries/segment/vector-wildcards.jsonld b/src/test/resources/queries/segment/vector-wildcards.jsonld
new file mode 100644
index 0000000..1c718c1
--- /dev/null
+++ b/src/test/resources/queries/segment/vector-wildcards.jsonld
@@ -0,0 +1,12 @@
+{
+ "@context" : "http://ids-mannheim.de/ns/KorAP/json-ld/v0.3/context.jsonld",
+ "query": {
+ "@type": "koral:token",
+ "wrap": {
+ "@type": "koral:term",
+ "layer": "orth",
+ "key": ["f?o","bar","x*z","abc"],
+ "type":"type:wildcard"
+ }
+ }
+}
diff --git a/src/test/resources/queries/segment/vector.jsonld b/src/test/resources/queries/segment/vector.jsonld
new file mode 100644
index 0000000..a813ef0
--- /dev/null
+++ b/src/test/resources/queries/segment/vector.jsonld
@@ -0,0 +1,11 @@
+{
+ "@context" : "http://ids-mannheim.de/ns/KorAP/json-ld/v0.3/context.jsonld",
+ "query": {
+ "@type": "koral:token",
+ "wrap": {
+ "@type": "koral:term",
+ "layer": "orth",
+ "key": ["foo","bar","xyz","abc"]
+ }
+ }
+}
diff --git a/src/test/resources/queries/sequence/regex-rewrite-vector.jsonld b/src/test/resources/queries/sequence/regex-rewrite-vector.jsonld
new file mode 100644
index 0000000..174ab19
--- /dev/null
+++ b/src/test/resources/queries/sequence/regex-rewrite-vector.jsonld
@@ -0,0 +1,29 @@
+{
+ "@context":"http://ids-mannheim.de/ns/KorAP/json-ld/v0.3/context.jsonld",
+ "collection":null,
+ "query":{
+ "@type": "koral:group",
+ "operation": "operation:sequence",
+ "operands": [
+ {
+ "@type": "koral:token",
+ "wrap": {
+ "@type": "koral:term",
+ "key": "der",
+ "layer": "orth",
+ "match": "match:eq"
+ }
+ },
+ {
+ "@type": "koral:token",
+ "wrap": {
+ "@type": "koral:term",
+ "type" : "type:regex",
+ "key": ["f.?o","b[au]r",".*","ab+c"],
+ "layer": "orth",
+ "match": "match:eq"
+ }
+ }
+ ]
+ }
+}