Optimize regex queries that are equivalent to any tokens
Change-Id: I9934d851920eb20e4c097f268c6aa439a824797d
diff --git a/Changes b/Changes
index 2cbdcf7..6c9d48f 100644
--- a/Changes
+++ b/Changes
@@ -1,8 +1,10 @@
-0.55.7 2016-10-10
+0.55.7 2016-10-11
- [bugfix] Throw error on optional operands in distance
queries (diewald)
- [performance] Remember solved problematic queries in the
query planner (diewald)
+ - [performance] Treat term queries like ".+?", ".+", ".*?", and ".*"
+ as any-terms (diewald)
0.55.6 2016-08-10
- [bugfix] distance with key "t" uses default foundry (diewald)
diff --git a/src/main/java/de/ids_mannheim/korap/KrillQuery.java b/src/main/java/de/ids_mannheim/korap/KrillQuery.java
index 3d70505..19fd398 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillQuery.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillQuery.java
@@ -1193,18 +1193,23 @@
// Branch on type
switch (json.get("type").asText()) {
- case "type:regex":
- return qb.seg(qb.re(value.toString(), isCaseInsensitive));
+ case "type:regex": {
- case "type:wildcard":
- return qb.seq(qb.wc(value.toString(), isCaseInsensitive));
+ // The regex can be rewritten to an any token
+ if (value.toString().matches("^[si]:\\.[\\+\\*]\\??$")) {
+ return new SpanRepetitionQueryWrapper();
+ };
+ return qb.seg(qb.re(value.toString(), isCaseInsensitive));
+ }
+ case "type:wildcard":
+ return qb.seq(qb.wc(value.toString(), isCaseInsensitive));
- case "type:string":
- break;
+ case "type:string":
+ break;
- default:
- this.addWarning(746,
- "Term type is not supported - treated as a string");
+ default:
+ this.addWarning(746,
+ "Term type is not supported - treated as a string");
};
};
diff --git a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
index f1f412d..19f1ec5 100644
--- a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
@@ -23,8 +23,14 @@
import java.util.*;
/*
- Todo: Store primary data at base/cons field.
+ TODO: Store primary data at base/cons field.
All other Termvectors should have no stored field!
+
+ TODO: Currently character offsets are stored together with
+ positional information in the token stream. This is bad!
+ Lucene may need a special encoding that stores the
+ character offsets directly (not in the payloads), which
+ would make this less messy and speed things up.
*/
/**
diff --git a/src/test/java/de/ids_mannheim/korap/query/TestKrillQueryJSON.java b/src/test/java/de/ids_mannheim/korap/query/TestKrillQueryJSON.java
index f11a5da..09d690b 100644
--- a/src/test/java/de/ids_mannheim/korap/query/TestKrillQueryJSON.java
+++ b/src/test/java/de/ids_mannheim/korap/query/TestKrillQueryJSON.java
@@ -589,6 +589,30 @@
};
};
+    @Test
+    public void queryJSONregexRewrite1 () throws QueryException {
+        // Query: "der" [.+?] - serialized form shows an any-token expansion of exactly one token
+        final String koral = getString(getClass()
+            .getResource("/queries/sequence/regex-rewrite-1.jsonld").getFile());
+        final String serialized =
+            new KrillQuery("tokens").fromKoral(koral).toQuery().toString();
+        assertEquals(
+            serialized,
+            "focus(254: spanContain(<tokens:base/s:t />, {254: spanExpansion(tokens:s:der, []{1, 1}, right)}))");
+    };
+
+    // Still disabled, but annotated with @Test so that JUnit reports it as ignored.
+    @Ignore @Test
+    public void queryJSONregexRewrite2 () throws QueryException {
+        // "der" [.*] [.*?] [.+] [.+?] - "der" followed by four regex tokens
+        String json = getString(getClass().getResource(
+            "/queries/sequence/regex-rewrite-2.jsonld").getFile());
+        KrillQuery kq = new KrillQuery("tokens");
+        assertEquals(
+            kq.fromKoral(json).toQuery().toString(),
+            "focus(254: spanContain(<tokens:base/s:t />, {254: spanExpansion(tokens:s:der, []{1, 4}, right)}))");
+    };
+
public static String getString (String path) {
StringBuilder contentBuilder = new StringBuilder();
diff --git a/src/test/resources/queries/sequence/regex-rewrite-1.jsonld b/src/test/resources/queries/sequence/regex-rewrite-1.jsonld
new file mode 100644
index 0000000..e3cf3bf
--- /dev/null
+++ b/src/test/resources/queries/sequence/regex-rewrite-1.jsonld
@@ -0,0 +1,29 @@
+{
+ "@context":"http://ids-mannheim.de/ns/KorAP/json-ld/v0.3/context.jsonld",
+ "collection":null,
+ "query":{
+ "@type": "koral:group",
+ "operation": "operation:sequence",
+ "operands": [
+ {
+ "@type": "koral:token",
+ "wrap": {
+ "@type": "koral:term",
+ "key": "der",
+ "layer": "orth",
+ "match": "match:eq"
+ }
+ },
+ {
+ "@type": "koral:token",
+ "wrap": {
+ "@type": "koral:term",
+ "type" : "type:regex",
+ "key": ".+?",
+ "layer": "orth",
+ "match": "match:eq"
+ }
+ }
+ ]
+ }
+}
diff --git a/src/test/resources/queries/sequence/regex-rewrite-2.jsonld b/src/test/resources/queries/sequence/regex-rewrite-2.jsonld
new file mode 100644
index 0000000..a073d3f
--- /dev/null
+++ b/src/test/resources/queries/sequence/regex-rewrite-2.jsonld
@@ -0,0 +1,59 @@
+{
+ "@context":"http://ids-mannheim.de/ns/KorAP/json-ld/v0.3/context.jsonld",
+ "collection":null,
+ "query":{
+ "@type": "koral:group",
+ "operation": "operation:sequence",
+ "operands": [
+ {
+ "@type": "koral:token",
+ "wrap": {
+ "@type": "koral:term",
+ "key": "der",
+ "layer": "orth",
+ "match": "match:eq"
+ }
+ },
+ {
+ "@type": "koral:token",
+ "wrap": {
+ "@type": "koral:term",
+ "type" : "type:regex",
+ "key": ".*",
+ "layer": "orth",
+ "match": "match:eq"
+ }
+ },
+ {
+ "@type": "koral:token",
+ "wrap": {
+ "@type": "koral:term",
+ "type" : "type:regex",
+ "key": ".*?",
+ "layer": "orth",
+ "match": "match:eq"
+ }
+ },
+ {
+ "@type": "koral:token",
+ "wrap": {
+ "@type": "koral:term",
+ "type" : "type:regex",
+ "key": ".+",
+ "layer": "orth",
+ "match": "match:eq"
+ }
+ },
+ {
+ "@type": "koral:token",
+ "wrap": {
+ "@type": "koral:term",
+ "type" : "type:regex",
+ "key": ".+?",
+ "layer": "orth",
+ "match": "match:eq"
+ }
+ }
+ ]
+ }
+}