Fix negation in segment queries following De Morgan's law (fixes #93)
Change-Id: I062bb44f572b23012578486082df21989105669f
diff --git a/Changes b/Changes
index d356bd8..734db02 100644
--- a/Changes
+++ b/Changes
@@ -24,6 +24,8 @@
(diewald; tests AI-assisted Claude Opus 4.6)
- [bugfix] Keep classes in repetition queries
(diewald; fixes #59; diewald; AI-assisted Claude Opus 4.6)
+ - [bugfix] Fix negation in segment queries following De Morgan's law
+ (diewald; fixes #93; diewald; AI-assisted Claude Opus 4.6)
0.64.6 2026-03-09
- [performance] Add leaf cache. (diewald)
diff --git a/src/main/java/de/ids_mannheim/korap/KrillQuery.java b/src/main/java/de/ids_mannheim/korap/KrillQuery.java
index ae21a82..c55d9df 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillQuery.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillQuery.java
@@ -1172,10 +1172,43 @@
case "relation:or":
+ // Collect all OR operands, e.g. [pos=NN | pos!=VV], and check whether
+ // the De Morgan rewrite below applies: every operand has to be negated
+ // AND be a SpanSegmentQueryWrapper, because the rewrite downcasts each
+ // operand to that type.
+ ArrayList<SpanQueryWrapper> orParts = new ArrayList<>();
+ boolean allNeg = true;
+ for (JsonNode operand : operands) {
+ SpanQueryWrapper part = this._segFromJson(operand);
+ orParts.add(part);
+ if (!part.isNegative()
+ || !(part instanceof SpanSegmentQueryWrapper))
+ allNeg = false;
+ };
+
+ // De Morgan: e.g. [!pos=NN | !pos=VV] -> NOT([pos=NN & pos=VV])
+ if (allNeg && !orParts.isEmpty()) {
+ SpanSegmentQueryWrapper ssegOr = this.builder().seg();
+ for (SpanQueryWrapper part : orParts) {
+ // The cast is safe thanks to the instanceof guard above.
+ // NOTE(review): assumes without(SpanSegmentQueryWrapper)
+ // turns the negated operand into a positive constraint - confirm.
+ ssegOr.without((SpanSegmentQueryWrapper) part);
+ };
+ // Wrap the positive conjunction and flag the wrapper as negative
+ SpanAlterQueryWrapper negWrapper =
+ new SpanAlterQueryWrapper(this.field);
+ negWrapper.or(ssegOr);
+ negWrapper.setNegative(true);
+ return negWrapper;
+ }
+
+ // Normal case, e.g. [pos=NN | pos=VV] - build a standard OR.
+ // Mixed or non-segment operands also end up here.
SpanAlterQueryWrapper ssaq = new SpanAlterQueryWrapper(
this.field);
- for (JsonNode operand : operands) {
- ssaq.or(this._segFromJson(operand));
+ for (SpanQueryWrapper part : orParts) {
+ ssaq.or(part);
};
return ssaq;
};
diff --git a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSegmentQueryWrapper.java b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSegmentQueryWrapper.java
index cbc715b..b69b236 100644
--- a/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSegmentQueryWrapper.java
+++ b/src/main/java/de/ids_mannheim/korap/query/wrap/SpanSegmentQueryWrapper.java
@@ -194,17 +194,19 @@
|| (this.inclusive.size() + this.exclusive.size() == 0)) {
return (SpanQuery) null;
}
+ // Both inclusive and exclusive, e.g. [orth=Baum & pos!=NN]
else if (this.inclusive.size() >= 1 && this.exclusive.size() >= 1) {
return (SpanQuery) new SpanNotQuery(
this._listToQuery(this.inclusive),
this._listToOrQuery(this.exclusive));
}
- // These are now identical but may be negative
+ // Exclusives only, e.g. [pos!=NN & pos!=VV] -- OR-combine for exclusion
else if (this.inclusive.size() == 0 && this.exclusive.size() >= 1) {
- return (SpanQuery) this._listToQuery(this.exclusive);
+ return (SpanQuery) this._listToOrQuery(this.exclusive);
}
+ // Inclusives only, e.g. [orth=Baum & pos=NN] -- AND-combine into segment
else if (this.inclusive.size() >= 1 && this.exclusive.size() == 0) {
return (SpanQuery) this._listToQuery(this.inclusive);
};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestSegmentNegationIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestSegmentNegationIndex.java
index 1bd1dcd..338c62c 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestSegmentNegationIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestSegmentNegationIndex.java
@@ -116,6 +116,191 @@
};
+ @Test
+ public void testAllNegationsInTermGroup () throws Exception {
+ // [orth!="des" & orth!="ihres"] [orth="Hauses"]
+ ki = new KrillIndex();
+
+ FieldDocument fd1 = new FieldDocument();
+ fd1.addString("ID", "doc-neg-0");
+ fd1.addTV("tokens", "des Hauses",
+ "[(0-3)s:des|i:des|_1$<i>0<i>1]"
+ + "[(4-10)s:Hauses|i:hauses|_2$<i>1<i>2]");
+ ki.addDoc(fd1);
+
+ FieldDocument fd2 = new FieldDocument();
+ fd2.addString("ID", "doc-neg-1");
+ fd2.addTV("tokens", "ihres Hauses",
+ "[(0-5)s:ihres|i:ihres|_1$<i>0<i>1]"
+ + "[(6-12)s:Hauses|i:hauses|_2$<i>1<i>2]");
+ ki.addDoc(fd2);
+
+ FieldDocument fd3 = new FieldDocument();
+ fd3.addString("ID", "doc-neg-2");
+ fd3.addTV("tokens", "eines Hauses",
+ "[(0-5)s:eines|i:eines|_1$<i>0<i>1]"
+ + "[(6-12)s:Hauses|i:hauses|_2$<i>1<i>2]");
+ ki.addDoc(fd3);
+
+ FieldDocument fd4 = new FieldDocument();
+ fd4.addString("ID", "doc-neg-3");
+ fd4.addTV("tokens", "meines Hauses",
+ "[(0-6)s:meines|i:meines|_1$<i>0<i>1]"
+ + "[(7-13)s:Hauses|i:hauses|_2$<i>1<i>2]");
+ ki.addDoc(fd4);
+
+ ki.commit();
+
+ // Search using KoralQuery JSON:
+ // [orth!="des" & orth!="ihres"] [orth="Hauses"]
+ String json = "{\"query\": {\"@type\": \"koral:group\", \"operands\": ["
+ + "{\"@type\": \"koral:token\", \"wrap\": {"
+ + "\"@type\": \"koral:termGroup\", \"operands\": ["
+ + "{\"@type\": \"koral:term\", \"key\": \"des\", \"layer\": \"orth\", \"match\": \"match:ne\", \"type\": \"type:regex\"},"
+ + "{\"@type\": \"koral:term\", \"key\": \"ihres\", \"layer\": \"orth\", \"match\": \"match:ne\", \"type\": \"type:regex\"}"
+ + "], \"relation\": \"relation:and\"}},"
+ + "{\"@type\": \"koral:token\", \"wrap\": {"
+ + "\"@type\": \"koral:term\", \"key\": \"Hauses\", \"layer\": \"orth\", \"match\": \"match:eq\", \"type\": \"type:regex\"}}"
+ + "], \"operation\": \"operation:sequence\"}}";
+
+ Krill krill = new Krill(json);
+ kr = ki.search(krill);
+
+ assertEquals("totalResults", 2, kr.getTotalResults());
+ assertEquals("StartPos (0)", 0, kr.getMatch(0).startPos);
+ assertEquals("EndPos (0)", 2, kr.getMatch(0).endPos);
+ assertEquals("StartPos (1)", 0, kr.getMatch(1).startPos);
+ assertEquals("EndPos (1)", 2, kr.getMatch(1).endPos);
+ }
+
+
+ @Test
+ public void testAllNegationsOrInTermGroup () throws Exception {
+ // [orth!="des" | orth!="ihres"] [orth="Hauses"]
+ // By De Morgan: NOT(des) OR NOT(ihres) = NOT(des AND ihres)
+ // Since a token can only have one orth value,
+ // (des AND ihres) is always false, so NOT(false) = true.
+ // Every token matches, so all "[...] Hauses" docs match.
+ ki = new KrillIndex();
+
+ FieldDocument fd1 = new FieldDocument();
+ fd1.addString("ID", "doc-neg-0");
+ fd1.addTV("tokens", "des Hauses",
+ "[(0-3)s:des|i:des|_1$<i>0<i>1]"
+ + "[(4-10)s:Hauses|i:hauses|_2$<i>1<i>2]");
+ ki.addDoc(fd1);
+
+ FieldDocument fd2 = new FieldDocument();
+ fd2.addString("ID", "doc-neg-1");
+ fd2.addTV("tokens", "ihres Hauses",
+ "[(0-5)s:ihres|i:ihres|_1$<i>0<i>1]"
+ + "[(6-12)s:Hauses|i:hauses|_2$<i>1<i>2]");
+ ki.addDoc(fd2);
+
+ FieldDocument fd3 = new FieldDocument();
+ fd3.addString("ID", "doc-neg-2");
+ fd3.addTV("tokens", "eines Hauses",
+ "[(0-5)s:eines|i:eines|_1$<i>0<i>1]"
+ + "[(6-12)s:Hauses|i:hauses|_2$<i>1<i>2]");
+ ki.addDoc(fd3);
+
+ FieldDocument fd4 = new FieldDocument();
+ fd4.addString("ID", "doc-neg-3");
+ fd4.addTV("tokens", "meines Hauses",
+ "[(0-6)s:meines|i:meines|_1$<i>0<i>1]"
+ + "[(7-13)s:Hauses|i:hauses|_2$<i>1<i>2]");
+ ki.addDoc(fd4);
+
+ ki.commit();
+
+ // [orth!="des" | orth!="ihres"] [orth="Hauses"]
+ String json = "{\"query\": {\"@type\": \"koral:group\", \"operands\": ["
+ + "{\"@type\": \"koral:token\", \"wrap\": {"
+ + "\"@type\": \"koral:termGroup\", \"operands\": ["
+ + "{\"@type\": \"koral:term\", \"key\": \"des\", \"layer\": \"orth\", \"match\": \"match:ne\", \"type\": \"type:regex\"},"
+ + "{\"@type\": \"koral:term\", \"key\": \"ihres\", \"layer\": \"orth\", \"match\": \"match:ne\", \"type\": \"type:regex\"}"
+ + "], \"relation\": \"relation:or\"}},"
+ + "{\"@type\": \"koral:token\", \"wrap\": {"
+ + "\"@type\": \"koral:term\", \"key\": \"Hauses\", \"layer\": \"orth\", \"match\": \"match:eq\", \"type\": \"type:regex\"}}"
+ + "], \"operation\": \"operation:sequence\"}}";
+
+ Krill krill = new Krill(json);
+ kr = ki.search(krill);
+
+ assertEquals("totalResults", 4, kr.getTotalResults());
+ }
+
+
+ @Test
+ public void testAllNegationsOrMultiValuedLayer () throws Exception {
+ // [marmot/p!=ADJ | marmot/p!=NN] [orth="Baum"]
+ // By De Morgan: NOT(ADJ) OR NOT(NN) = NOT(ADJ AND NN)
+ // A position CAN have multiple POS tags (e.g. ADJ and NN).
+ // Only tokens with BOTH ADJ and NN are excluded.
+ // NOTE: Under an existential reading ("some POS tag is not ADJ") even "alte" would match, because its NN tag is != ADJ (and vice versa).
+ ki = new KrillIndex();
+
+ // Token "alte" has BOTH marmot/p:ADJ and marmot/p:NN
+ FieldDocument fd1 = new FieldDocument();
+ fd1.addString("ID", "doc-multi-0");
+ fd1.addTV("tokens", "alte Baum",
+ "[(0-4)s:alte|i:alte|marmot/p:ADJ|marmot/p:NN|_1$<i>0<i>1]"
+ + "[(5-9)s:Baum|i:baum|_2$<i>1<i>2]");
+ ki.addDoc(fd1);
+
+ // Token "grosse" has only marmot/p:ADJ (not NN)
+ FieldDocument fd2 = new FieldDocument();
+ fd2.addString("ID", "doc-multi-1");
+ fd2.addTV("tokens", "grosse Baum",
+ "[(0-6)s:grosse|i:grosse|marmot/p:ADJ|_1$<i>0<i>1]"
+ + "[(7-11)s:Baum|i:baum|_2$<i>1<i>2]");
+ ki.addDoc(fd2);
+
+ // Token "kleiner" has only marmot/p:NN (not ADJ)
+ FieldDocument fd3 = new FieldDocument();
+ fd3.addString("ID", "doc-multi-2");
+ fd3.addTV("tokens", "kleiner Baum",
+ "[(0-7)s:kleiner|i:kleiner|marmot/p:NN|_1$<i>0<i>1]"
+ + "[(8-12)s:Baum|i:baum|_2$<i>1<i>2]");
+ ki.addDoc(fd3);
+
+ // Token "der" has marmot/p:DET (neither ADJ nor NN)
+ FieldDocument fd4 = new FieldDocument();
+ fd4.addString("ID", "doc-multi-3");
+ fd4.addTV("tokens", "der Baum",
+ "[(0-3)s:der|i:der|marmot/p:DET|_1$<i>0<i>1]"
+ + "[(4-8)s:Baum|i:baum|_2$<i>1<i>2]");
+ ki.addDoc(fd4);
+
+ ki.commit();
+
+ // [marmot/p!=ADJ | marmot/p!=NN] [orth="Baum"]
+ // De Morgan: NOT(ADJ AND NN) - only exclude tokens with BOTH
+ String json = "{\"query\": {\"@type\": \"koral:group\", \"operands\": ["
+ + "{\"@type\": \"koral:token\", \"wrap\": {"
+ + "\"@type\": \"koral:termGroup\", \"operands\": ["
+ + "{\"@type\": \"koral:term\", \"foundry\": \"marmot\", \"key\": \"ADJ\", \"layer\": \"pos\", \"match\": \"match:ne\", \"type\": \"type:regex\"},"
+ + "{\"@type\": \"koral:term\", \"foundry\": \"marmot\", \"key\": \"NN\", \"layer\": \"pos\", \"match\": \"match:ne\", \"type\": \"type:regex\"}"
+ + "], \"relation\": \"relation:or\"}},"
+ + "{\"@type\": \"koral:token\", \"wrap\": {"
+ + "\"@type\": \"koral:term\", \"key\": \"Baum\", \"layer\": \"orth\", \"match\": \"match:eq\", \"type\": \"type:regex\"}}"
+ + "], \"operation\": \"operation:sequence\"}}";
+
+ Krill krill = new Krill(json);
+ kr = ki.search(krill);
+
+ // doc-multi-0: "alte" has BOTH ADJ and NN -> ADJ AND NN = true
+ // -> NOT(true) = false -> excluded
+ // doc-multi-1: "grosse" has only ADJ -> ADJ AND NN = false
+ // -> NOT(false) = true -> matches
+ // doc-multi-2: "kleiner" has only NN -> ADJ AND NN = false
+ // -> NOT(false) = true -> matches
+ // doc-multi-3: "der" has DET -> ADJ AND NN = false
+ // -> NOT(false) = true -> matches
+ assertEquals("totalResults", 3, kr.getTotalResults());
+ }
+
+
private FieldDocument createFieldDoc0 () {
fd = new FieldDocument();
fd.addString("ID", "doc-0");
diff --git a/src/test/java/de/ids_mannheim/korap/query/TestKrillQueryJSON.java b/src/test/java/de/ids_mannheim/korap/query/TestKrillQueryJSON.java
index 5947790..fbd945e 100644
--- a/src/test/java/de/ids_mannheim/korap/query/TestKrillQueryJSON.java
+++ b/src/test/java/de/ids_mannheim/korap/query/TestKrillQueryJSON.java
@@ -169,13 +169,9 @@
getClass().getResource("/queries/bsp11.jsonld").getFile());
// [base!=Katze | orth!=Katzen]
- /*
- Imagine a([^b]|[^c])d
- Matches abd and acd
- Interpretation would be not(spanAnd(...))
- */
+ // De Morgan: NOT(Katze) OR NOT(Katzen) = NOT(Katze AND Katzen)
assertEquals(
- "spanOr([tokens:mate/l:Katze, tokens:s:Katzen])",
+ "spanSegment(tokens:mate/l:Katze, tokens:s:Katzen)",
sqwi.toQuery().toString());
assertTrue(sqwi.isNegative());
};
@@ -747,6 +743,54 @@
kq.fromKoral(json).toQuery().toString());
};
+ @Test
+ public void queryJSONallNegationInGroup () throws QueryException {
+ // [orth!="des" & orth!="ihres"]
+ // By De Morgan's law: NOT(A) AND NOT(B) = NOT(A OR B)
+ String json = getJsonString(getClass()
+ .getResource("/queries/segment/all-negation-in-group.jsonld")
+ .getFile());
+
+ KrillQuery kq = new KrillQuery("tokens");
+ SpanQueryWrapper sqwi = kq.fromKoral(json);
+ assertEquals(
+ "spanOr([SpanMultiTermQueryWrapper(tokens:/s:des/), SpanMultiTermQueryWrapper(tokens:/s:ihres/)])",
+ sqwi.toQuery().toString());
+ assertTrue(sqwi.isNegative());
+ };
+
+ @Test
+ public void queryJSONallNegationInGroupThree () throws QueryException {
+ // [orth!="des" & orth!="ihres" & orth!="eines"]
+ // By De Morgan's law: NOT(A) AND NOT(B) AND NOT(C) = NOT(A OR B OR C)
+ String json = getJsonString(getClass()
+ .getResource("/queries/segment/all-negation-in-group-three.jsonld")
+ .getFile());
+
+ KrillQuery kq = new KrillQuery("tokens");
+ SpanQueryWrapper sqwi = kq.fromKoral(json);
+ assertEquals(
+ "spanOr([SpanMultiTermQueryWrapper(tokens:/s:des/), SpanMultiTermQueryWrapper(tokens:/s:ihres/), SpanMultiTermQueryWrapper(tokens:/s:eines/)])",
+ sqwi.toQuery().toString());
+ assertTrue(sqwi.isNegative());
+ };
+
+ @Test
+ public void queryJSONallNegationInGroupOrThree () throws QueryException {
+ // [orth!="des" | orth!="ihres" | orth!="eines"]
+ // By De Morgan's law: NOT(A) OR NOT(B) OR NOT(C) = NOT(A AND B AND C)
+ String json = getJsonString(getClass()
+ .getResource("/queries/segment/all-negation-in-group-or-three.jsonld")
+ .getFile());
+
+ KrillQuery kq = new KrillQuery("tokens");
+ SpanQueryWrapper sqwi = kq.fromKoral(json);
+ assertEquals(
+ "spanSegment(spanSegment(SpanMultiTermQueryWrapper(tokens:/s:des/), SpanMultiTermQueryWrapper(tokens:/s:ihres/)), SpanMultiTermQueryWrapper(tokens:/s:eines/))",
+ sqwi.toQuery().toString());
+ assertTrue(sqwi.isNegative());
+ };
+
@Test
public void queryJSONregexFail () {
// "Leserin.{,3}"
diff --git a/src/test/resources/queries/segment/all-negation-in-group-or-three.jsonld b/src/test/resources/queries/segment/all-negation-in-group-or-three.jsonld
new file mode 100644
index 0000000..a350e5e
--- /dev/null
+++ b/src/test/resources/queries/segment/all-negation-in-group-or-three.jsonld
@@ -0,0 +1,36 @@
+{
+ "@context": "http://ids-mannheim.de/ns/KorAP/json-ld/v0.1/context.jsonld",
+ "query": {
+ "@type": "koral:token",
+ "wrap": {
+ "@type": "koral:termGroup",
+ "operands": [
+ {
+ "@type": "koral:term",
+ "foundry": "opennlp",
+ "key": "des",
+ "layer": "orth",
+ "match": "match:ne",
+ "type": "type:regex"
+ },
+ {
+ "@type": "koral:term",
+ "foundry": "opennlp",
+ "key": "ihres",
+ "layer": "orth",
+ "match": "match:ne",
+ "type": "type:regex"
+ },
+ {
+ "@type": "koral:term",
+ "foundry": "opennlp",
+ "key": "eines",
+ "layer": "orth",
+ "match": "match:ne",
+ "type": "type:regex"
+ }
+ ],
+ "relation": "relation:or"
+ }
+ }
+}
diff --git a/src/test/resources/queries/segment/all-negation-in-group-three.jsonld b/src/test/resources/queries/segment/all-negation-in-group-three.jsonld
new file mode 100644
index 0000000..70b436d
--- /dev/null
+++ b/src/test/resources/queries/segment/all-negation-in-group-three.jsonld
@@ -0,0 +1,36 @@
+{
+ "@context": "http://ids-mannheim.de/ns/KorAP/json-ld/v0.1/context.jsonld",
+ "query": {
+ "@type": "koral:token",
+ "wrap": {
+ "@type": "koral:termGroup",
+ "operands": [
+ {
+ "@type": "koral:term",
+ "foundry": "opennlp",
+ "key": "des",
+ "layer": "orth",
+ "match": "match:ne",
+ "type": "type:regex"
+ },
+ {
+ "@type": "koral:term",
+ "foundry": "opennlp",
+ "key": "ihres",
+ "layer": "orth",
+ "match": "match:ne",
+ "type": "type:regex"
+ },
+ {
+ "@type": "koral:term",
+ "foundry": "opennlp",
+ "key": "eines",
+ "layer": "orth",
+ "match": "match:ne",
+ "type": "type:regex"
+ }
+ ],
+ "relation": "relation:and"
+ }
+ }
+}
diff --git a/src/test/resources/queries/segment/all-negation-in-group.jsonld b/src/test/resources/queries/segment/all-negation-in-group.jsonld
new file mode 100644
index 0000000..6e2ceb6
--- /dev/null
+++ b/src/test/resources/queries/segment/all-negation-in-group.jsonld
@@ -0,0 +1,28 @@
+{
+ "@context": "http://ids-mannheim.de/ns/KorAP/json-ld/v0.1/context.jsonld",
+ "query": {
+ "@type": "koral:token",
+ "wrap": {
+ "@type": "koral:termGroup",
+ "operands": [
+ {
+ "@type": "koral:term",
+ "foundry": "opennlp",
+ "key": "des",
+ "layer": "orth",
+ "match": "match:ne",
+ "type": "type:regex"
+ },
+ {
+ "@type": "koral:term",
+ "foundry": "opennlp",
+ "key": "ihres",
+ "layer": "orth",
+ "match": "match:ne",
+ "type": "type:regex"
+ }
+ ],
+ "relation": "relation:and"
+ }
+ }
+}