First attempt to fix negative regex behaviour in sequences
Change-Id: I95a1f54653b15777b768023a617e59a9cfda23bb
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 07bb3ae..3eb34cb 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -1356,6 +1356,10 @@
query = (SpanQuery) rewrittenQuery;
};
+ if (DEBUG)
+ log.trace("Rewritten query is {}", query.toString());
+
+
// Todo: run this in a separated thread
for (LeafReaderContext atomic : this.reader().leaves()) {
diff --git a/src/main/java/de/ids_mannheim/korap/KrillQuery.java b/src/main/java/de/ids_mannheim/korap/KrillQuery.java
index 8e30b7a..a5f6e31 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillQuery.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillQuery.java
@@ -24,6 +24,7 @@
import de.ids_mannheim.korap.query.wrap.SpanQueryWrapper;
import de.ids_mannheim.korap.query.wrap.SpanReferenceQueryWrapper;
import de.ids_mannheim.korap.query.wrap.SpanRegexQueryWrapper;
+import de.ids_mannheim.korap.query.wrap.SpanWildcardQueryWrapper;
import de.ids_mannheim.korap.query.wrap.SpanRelationWrapper;
import de.ids_mannheim.korap.query.wrap.SpanRepetitionQueryWrapper;
import de.ids_mannheim.korap.query.wrap.SpanSegmentQueryWrapper;
@@ -1228,53 +1229,77 @@
value.append(':').append(json.get("value").asText());
// Regular expression or wildcard
- if (isTerm && json.has("type")) {
+ if (isTerm) {
- QueryBuilder qb = this.builder();
+ String match = "match:eq";
+ if (json.has("match")) {
+ match = json.get("match").asText();
+ };
- // Branch on type
- switch (json.get("type").asText()) {
+ if (json.has("type")) {
+ QueryBuilder qb = this.builder();
+
+ // Branch on type
+ switch (json.get("type").asText()) {
case "type:regex": {
// The regex can be rewritten to an any token
if (value.toString().matches("^[si]:\\.[\\+\\*]\\??$")) {
return new SpanRepetitionQueryWrapper();
};
- return qb.seg(qb.re(value.toString(), isCaseInsensitive));
- }
- case "type:wildcard":
- return qb.seq(qb.wc(value.toString(), isCaseInsensitive));
+ SpanRegexQueryWrapper srqw = qb.re(value.toString(), isCaseInsensitive);
+
+ if (match.equals("match:ne")) {
+ if (DEBUG)
+ log.trace("Term is negated");
+ // NOTE(review): srqw.makeNegative() is not called here, unlike ssqw in the plain-string branch - confirm this is intended
+ return this.builder().seg().without(srqw);
+ }
+ else if (match.equals("match:eq")) {
+ return srqw;
+ }
+ throw new QueryException(741, "Match relation unknown");
+ }
+ case "type:wildcard": {
+
+ SpanWildcardQueryWrapper swcqw =
+ qb.wc(value.toString(), isCaseInsensitive);
+
+ if (match.equals("match:ne")) {
+ if (DEBUG)
+ log.trace("Term is negated");
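+ // NOTE(review): swcqw.makeNegative() is not called here, unlike ssqw in the plain-string branch - confirm this is intended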
+ return this.builder().seg().without(swcqw);
+ }
+ else if (match.equals("match:eq")) {
+ return swcqw;
+ };
+ throw new QueryException(741, "Match relation unknown");
+ }
case "type:string":
break;
-
+
default:
this.addWarning(746,
- "Term type is not supported - treated as a string");
- };
- };
+ "Term type is not supported - treated as a string");
+ };
+ };
- if (isTerm) {
-
- String match = "match:eq";
- if (json.has("match")) {
- match = json.get("match").asText();
- }
-
- SpanSegmentQueryWrapper ssqw = this.builder().seg(value.toString());
- if (match.equals("match:ne")) {
- if (DEBUG)
- log.trace("Term is negated");
- ssqw.makeNegative();
- return this.builder().seg().without(ssqw);
- }
- else if (match.equals("match:eq")) {
- return ssqw;
- }
- else {
- throw new QueryException(741, "Match relation unknown");
- }
- }
+ SpanSegmentQueryWrapper ssqw = this.builder().seg(value.toString());
+ if (match.equals("match:ne")) {
+ if (DEBUG)
+ log.trace("Term is negated");
+ ssqw.makeNegative();
+ return this.builder().seg().without(ssqw);
+ }
+ else if (match.equals("match:eq")) {
+ return ssqw;
+ }
+ else {
+ throw new QueryException(741, "Match relation unknown");
+ }
+ };
if (json.has("attr")) {
JsonNode attrNode = json.get("attr");
diff --git a/src/main/resources/log4j.properties b/src/main/resources/log4j.properties
index 1678d9e..be6663e 100644
--- a/src/main/resources/log4j.properties
+++ b/src/main/resources/log4j.properties
@@ -33,7 +33,7 @@
# Index:
# log4j.logger.de.ids_mannheim.korap.index.Indexer = INFO, stdout
-# log4j.logger.de.ids_mannheim.korap.KrillIndex = TRACE, stdout
+ log4j.logger.de.ids_mannheim.korap.KrillIndex = TRACE, stdout
# log4j.logger.de.ids_mannheim.korap.index.PositionsToOffset = TRACE, stdout
# log4j.logger.de.ids_mannheim.korap.index.MultiTermTokenStream = TRACE, stdout
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestRegexWildcardIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestRegexWildcardIndex.java
index 472583d..9da10bb 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestRegexWildcardIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestRegexWildcardIndex.java
@@ -57,7 +57,7 @@
assertEquals("affe [[afffe]] baum ...",
kr.getMatch(1).getSnippetBrackets());
- ks = _newKrill(new QueryBuilder("base").re("s:baum.*"));
+ ks = _newKrill(kq.re("s:baum.*"));
kr = ki.search(ks);
assertEquals((long) 2, kr.getTotalResults());
@@ -66,7 +66,7 @@
assertEquals("... baum [[baumgarten]] steingarten ...",
kr.getMatch(1).getSnippetBrackets());
- ks = _newKrill(new QueryBuilder("base").re("s:.....?garten"));
+ ks = _newKrill(kq.re("s:.....?garten"));
kr = ki.search(ks);
assertEquals((long) 2, kr.getTotalResults());
assertEquals("... baum [[baumgarten]] steingarten ...",
@@ -74,7 +74,7 @@
assertEquals("... baumgarten [[steingarten]] franz ...",
kr.getMatch(1).getSnippetBrackets());
- ks = _newKrill(new QueryBuilder("base").re("s:ha.s"));
+ ks = _newKrill(kq.re("s:ha.s"));
kr = ki.search(ks);
assertEquals((long) 2, kr.getTotalResults());
assertEquals("... franz [[hans]] haus ...",
@@ -82,14 +82,27 @@
assertEquals("... hans [[haus]] efeu ...",
kr.getMatch(1).getSnippetBrackets());
- ks = _newKrill(new QueryBuilder("base").re("s:.*ff.*"));
+ ks = _newKrill(kq.re("s:.*ff.*"));
kr = ki.search(ks);
assertEquals((long) 3, kr.getTotalResults());
assertEquals("[[affe]] afffe ...", kr.getMatch(0).getSnippetBrackets());
assertEquals("affe [[afffe]] baum ...",
kr.getMatch(1).getSnippetBrackets());
assertEquals("... efeu [[effe]]", kr.getMatch(2).getSnippetBrackets());
- };
+
+ SpanQueryWrapper sq = kq.seq(
+ kq.re("s:.*garten")
+ ).append(
+ kq.seg().without(
+ kq.re("s:.*an.*")
+ )
+ );
+ System.err.println(sq.toQuery().toString());
+ ks = _newKrill(sq);
+ kr = ki.search(ks);
+
+ assertEquals((long) 1, kr.getTotalResults());
+ };
@Test
diff --git a/src/test/java/de/ids_mannheim/korap/query/TestSpanSequenceQueryJSON.java b/src/test/java/de/ids_mannheim/korap/query/TestSpanSequenceQueryJSON.java
index d9dc5bc..592b2d9 100644
--- a/src/test/java/de/ids_mannheim/korap/query/TestSpanSequenceQueryJSON.java
+++ b/src/test/java/de/ids_mannheim/korap/query/TestSpanSequenceQueryJSON.java
@@ -209,6 +209,17 @@
};
+ @Test
+ public void queryJSONseqNegativeRegexEnd () throws QueryException {
+ SpanQueryWrapper sqwi = jsonQueryFile("negative-regex.jsonld");
+ // [tt/p=NN][opennlp/p!="NN"] (second token is a negated regex term)
+ assertEquals(
+ "focus(254: spanContain(<tokens:base/s:t />, {254: spanExpansion(tokens:tt/p:NN, !SpanMultiTermQueryWrapper(tokens:/opennlp/p:NN/){1, 1}, right)}))",
+ sqwi.toQuery().toString()
+ );
+ };
+
+
@Test
public void queryJSONseqNegativeStartRepetition () throws QueryException {
SpanQueryWrapper sqwi = jsonQueryFile(
diff --git a/src/test/resources/queries/sequence/negative-regex.jsonld b/src/test/resources/queries/sequence/negative-regex.jsonld
new file mode 100644
index 0000000..3053e96
--- /dev/null
+++ b/src/test/resources/queries/sequence/negative-regex.jsonld
@@ -0,0 +1,30 @@
+{
+ "@context" : "http://ids-mannheim.de/ns/KorAP/json-ld/v0.1/context.jsonld",
+ "query" : {
+ "@type" : "koral:group",
+ "operands" : [
+ {
+ "@type" : "koral:token",
+ "wrap" : {
+ "@type" : "koral:term",
+ "foundry" : "tt",
+ "key" : "NN",
+ "layer" : "p",
+ "match" : "match:eq"
+ }
+ },
+ {
+ "@type" : "koral:token",
+ "wrap" : {
+ "@type" : "koral:term",
+ "foundry" : "opennlp",
+ "key" : "NN",
+ "layer" : "p",
+ "match" : "match:ne",
+ "type" : "type:regex"
+ }
+ }
+ ],
+ "operation" : "operation:sequence"
+ }
+}