Fix behaviour of negative fields in virtual corpora
Change-Id: I126bd787d6938af5841ed032c3f18b982263b1b6
diff --git a/Changes b/Changes
index 150340f..c0274ca 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,7 @@
+0.58.7 2019-09-16
+ - [bugfix] Fix the behaviour of negative operands in virtual
+ corpora (diewald)
+
0.58.6 2019-08-28
- [bugfix] Updated cache loading (fixed #55) (diewald, margaretha)
- [bugfix] Introduce left match cutting so that
diff --git a/pom.xml b/pom.xml
index 76a0896..9cc960b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -35,7 +35,7 @@
<groupId>de.ids_mannheim.korap</groupId>
<artifactId>Krill</artifactId>
- <version>0.58.6</version>
+ <version>0.58.7</version>
<packaging>jar</packaging>
<name>Krill</name>
diff --git a/src/main/java/de/ids_mannheim/korap/KrillCollection.java b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
index 11469e6..628173b 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillCollection.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
@@ -419,6 +419,10 @@
"Unknown document group operation");
for (JsonNode operand : json.get("operands")) {
+
+ // TODO:
+ // Potentially embed (flatten) here, when the operand is a group
+ // inside a group with the same operator (and not negative)
group.with(this._fromKoral(operand));
};
return group;
@@ -581,6 +585,7 @@
FixedBitSet bitset = new FixedBitSet(r.maxDoc());
DocIdSet docids = this.getDocIdSet(atomic, (Bits) r.getLiveDocs());
+
if (docids == null) {
if (this.cbi != null) {
bitset.clear(0, bitset.length());
@@ -589,8 +594,9 @@
bitset.set(0, bitset.length());
};
}
- else
+ else {
bitset.or(docids.iterator());
+ }
return bitset;
};
@@ -622,16 +628,19 @@
// Init vector
DocIdSet docids = filter.getDocIdSet(atomic, null);
+
DocIdSetIterator filterIter =
(docids == null) ? null : docids.iterator();
if (filterIter == null) {
+
if (!this.cbi.isNegative()) return null;
bitset.set(0, maxDoc);
}
else {
- // Or bit set
+
+ // Or bit set
bitset.or(filterIter);
// Revert for negation
diff --git a/src/main/java/de/ids_mannheim/korap/collection/BooleanGroupFilter.java b/src/main/java/de/ids_mannheim/korap/collection/BooleanGroupFilter.java
index 5791703..a9160d4 100644
--- a/src/main/java/de/ids_mannheim/korap/collection/BooleanGroupFilter.java
+++ b/src/main/java/de/ids_mannheim/korap/collection/BooleanGroupFilter.java
@@ -123,25 +123,27 @@
@Override
- public DocIdSet getDocIdSet (LeafReaderContext context, Bits acceptDocs)
+ public DocIdSet getDocIdSet (LeafReaderContext atomic, Bits acceptDocs)
throws IOException {
- final LeafReader reader = context.reader();
+ final LeafReader reader = atomic.reader();
int maxDoc = reader.maxDoc();
FixedBitSet bitset = new FixedBitSet(maxDoc);
FixedBitSet combinator = new FixedBitSet(maxDoc);
boolean init = true;
- if (DEBUG)
- log.debug("Start trying to filter on bitset of length {}", maxDoc);
+ if (DEBUG) {
+ log.debug("Filter on group {} in a corpus of {} docs", this.toString(), maxDoc);
+ }
- for (final GroupFilterOperand operand : this.operands) {
- final DocIdSet docids = operand.filter.getDocIdSet(context, null);
- final DocIdSetIterator filterIter = (docids == null) ? null
- : docids.iterator();
+ for (GroupFilterOperand operand : this.operands) {
+ DocIdSet docids = operand.filter.getDocIdSet(atomic, null);
+ DocIdSetIterator filterIter = (docids == null) ? null
+ : docids.iterator();
if (DEBUG)
log.debug("> Filter to bitset of {} ({} negative)",
- operand.filter.toString(), operand.isNegative);
+ operand.filter.toString(), operand.isNegative);
+
// Filter resulted in no docs
if (filterIter == null) {
@@ -149,60 +151,77 @@
if (DEBUG)
log.debug("- Filter is null");
- // Filter matches
+ // Filter matches everywhere
if (operand.isNegative) {
- if (DEBUG) {
- // OR - This means, everything is allowed
- if (this.isOptional) {
+ // OR - This means, everything is allowed
+ if (this.isOptional) {
+ if (DEBUG) {
log.debug("- Filter to allow all documents (OR NEG NULL)");
+ }
- }
- // AND - The negation is irrelevant
- else {
- log.debug("- Filter to allow all documents (AND NEG NULL)");
- };
- };
+ // bitset.set(1, maxDoc);
+ bitset.set(0, maxDoc);
+
+ // Match all accepted documents!
+ return BitsFilteredDocIdSet
+ .wrap(new BitDocIdSet(bitset), acceptDocs);
+ }
- bitset.set(0, maxDoc);
- return BitsFilteredDocIdSet
- .wrap(new BitDocIdSet(bitset), acceptDocs);
+ // AND - The negation is irrelevant
+ if (init) {
+ if (DEBUG)
+ log.debug("- Initialize by setting to all");
+
+ bitset.set(0, maxDoc);
+ }
+
+ else if (DEBUG) {
+ log.debug("- Filter by ignoring this operand (AND NEG NULL)");
+ };
}
- // The result is unimportant
- else if (this.isOptional) {
- if (DEBUG)
- log.debug("- Filter is ignorable");
- continue;
- };
-
+ // AND with NULL
// There is no possible match
+ else if (!this.isOptional) {
+
+ if (DEBUG)
+ log.debug("- Filter to allow no documents (2)");
+ return null;
+ }
+
+ // OR WITH 0 - ignore!
+ else if (DEBUG) {
+ log.debug("- Filter is ignorable");
+ }
+
if (DEBUG)
- log.debug("- Filter to allow no documents (2)");
- return null;
+ log.debug("- Check next operand");
+
}
// Initialize bitset
else if (init) {
+ // Set initially empty bitset to filterIter
bitset.or(filterIter);
if (DEBUG)
- log.debug("- Filter is inial with card {}",
- bitset.cardinality());
+ log.debug("- Filter is inital with card {}",
+ bitset.cardinality());
// Flip the matching documents
if (operand.isNegative) {
bitset.flip(0, maxDoc);
if (DEBUG)
log.debug(
- "- Filter is negative - so flipped to card {} (1)",
- bitset.cardinality());
+ "- Filter is negative - so flipped to card {} (1)",
+ bitset.cardinality());
};
-
- init = false;
}
+
+ // Operate on bitsets
else {
if (DEBUG)
@@ -251,7 +270,16 @@
if (DEBUG)
log.debug("- Subresult has card {} ", bitset.cardinality());
};
+
+ if (DEBUG)
+ log.debug("- Init is over");
+
+ init = false;
};
+
+ if (DEBUG)
+ log.debug("- Operand list ends");
+
return BitsFilteredDocIdSet.wrap(new BitDocIdSet(bitset), acceptDocs);
};
};
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
index 7cdfb85..2457c1e 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
@@ -219,7 +219,7 @@
@Test
- public void testIndexWithNegation () throws IOException {
+ public void testIndexWithNegation1 () throws IOException {
ki = new KrillIndex();
ki.addDoc(createDoc1());
ki.addDoc(createDoc2());
@@ -246,10 +246,198 @@
kcn.fromBuilder(cb.orGroup().with(cb.term("textClass", "kultur").not())
.with(cb.term("author", "Sebastian")));
assertEquals(1, kcn.docCount());
+
+ kcn.fromBuilder(
+ cb.andGroup().with(
+ cb.term("author", "Frank").not()
+ )
+ .with(
+ cb.term("author", "Sebastian").not()
+ )
+ );
+ assertEquals("AndGroup(-author:Frank -author:Sebastian)", kcn.toString());
+ assertEquals(1, kcn.docCount());
+
+
+ kcn.fromBuilder(
+ cb.andGroup().with(
+ cb.term("author", "Peter")
+ )
+ .with(
+ cb.andGroup().with(
+ cb.term("author", "Frank").not()
+ )
+ .with(
+ cb.term("author", "Sebastian").not()
+ )
+ )
+ );
+ assertEquals("AndGroup(author:Peter AndGroup(-author:Frank -author:Sebastian))", kcn.toString());
+ assertEquals(1, kcn.docCount());
+
+ kcn.fromBuilder(
+ cb.andGroup().with(
+ cb.re("textClass", "reis.*")
+ )
+ .with(
+ cb.andGroup().with(
+ cb.term("author", "Frank").not()
+ )
+ .with(
+ cb.term("author", "Sebastian").not()
+ )
+ )
+ );
+ assertEquals("AndGroup(QueryWrapperFilter(textClass:/reis.*/) AndGroup(-author:Frank -author:Sebastian))", kcn.toString());
+ assertEquals(1, kcn.docCount());
};
@Test
+ public void testIndexWithNegation2 () throws IOException {
+ ki = new KrillIndex();
+ ki.addDoc(createDoc1());
+ ki.commit();
+ ki.addDoc(createDoc2());
+ ki.commit();
+ ki.addDoc(createDoc3());
+ ki.commit();
+ CollectionBuilder cb = new CollectionBuilder();
+ KrillCollection kcn = new KrillCollection(ki);
+
+ // Simple negation tests
+ kcn.fromBuilder(cb.term("author", "Frank").not());
+ assertEquals(2, kcn.docCount());
+
+ kcn.fromBuilder(cb.term("textClass", "reisen").not());
+ assertEquals(0, kcn.docCount());
+
+ kcn.fromBuilder(cb.term("textClass", "kultur").not());
+ assertEquals(1, kcn.docCount());
+
+ // orGroup with simple Negation
+ kcn.fromBuilder(cb.orGroup().with(cb.term("textClass", "kultur").not())
+ .with(cb.term("author", "Peter")));
+ assertEquals(2, kcn.docCount());
+
+ kcn.fromBuilder(cb.orGroup().with(cb.term("textClass", "kultur").not())
+ .with(cb.term("author", "Sebastian")));
+ assertEquals(1, kcn.docCount());
+
+ kcn.fromBuilder(
+ cb.andGroup().with(
+ cb.term("author", "Frank").not()
+ )
+ .with(
+ cb.term("author", "Sebastian").not()
+ )
+ );
+ assertEquals("AndGroup(-author:Frank -author:Sebastian)", kcn.toString());
+ assertEquals(1, kcn.docCount());
+
+ kcn.fromBuilder(
+ cb.andGroup().with(
+ cb.term("author", "Peter")
+ )
+ .with(
+ cb.andGroup().with(
+ cb.term("author", "Frank").not()
+ )
+ .with(
+ cb.term("author", "Sebastian").not()
+ )
+ )
+ );
+ assertEquals("AndGroup(author:Peter AndGroup(-author:Frank -author:Sebastian))", kcn.toString());
+ assertEquals(1, kcn.docCount());
+
+ kcn.fromBuilder(
+ cb.andGroup().with(
+ cb.re("textClass", "reis..")
+ )
+ .with(
+ cb.andGroup().with(
+ cb.term("author", "Frank").not()
+ )
+ .with(
+ cb.term("author", "Sebastian").not()
+ )
+ )
+ );
+ assertEquals("AndGroup(QueryWrapperFilter(textClass:/reis../) AndGroup(-author:Frank -author:Sebastian))", kcn.toString());
+ assertEquals(1, kcn.docCount());
+ };
+
+ @Test
+ public void testIndexWithNegation3 () throws IOException {
+
+ // Same index and group queries as testIndexWithNegation2, but with the operands switched
+ ki = new KrillIndex();
+ ki.addDoc(createDoc1());
+ ki.commit();
+ ki.addDoc(createDoc2());
+ ki.commit();
+ ki.addDoc(createDoc3());
+ ki.commit();
+ CollectionBuilder cb = new CollectionBuilder();
+ KrillCollection kcn = new KrillCollection(ki);
+
+ // orGroup with simple Negation
+ kcn.fromBuilder(
+ cb.orGroup().with(cb.term("author", "Peter"))
+ .with(cb.term("textClass", "kultur").not()));
+ assertEquals(2, kcn.docCount());
+
+ kcn.fromBuilder(cb.orGroup().with(cb.term("author", "Sebastian"))
+ .with(cb.term("textClass", "kultur").not()));
+ assertEquals(1, kcn.docCount());
+
+ kcn.fromBuilder(
+ cb.andGroup().with(
+ cb.term("author", "Sebastian").not()
+ )
+ .with(
+ cb.term("author", "Frank").not()
+ )
+ );
+ assertEquals("AndGroup(-author:Sebastian -author:Frank)", kcn.toString());
+ assertEquals(1, kcn.docCount());
+
+ kcn.fromBuilder(
+ cb.andGroup().with(
+ cb.andGroup().with(
+ cb.term("author", "Sebastian").not()
+ )
+ .with(
+ cb.term("author", "Frank").not()
+ )
+ )
+ .with(
+ cb.term("author", "Peter")
+ )
+ );
+ assertEquals("AndGroup(AndGroup(-author:Sebastian -author:Frank) author:Peter)", kcn.toString());
+ assertEquals(1, kcn.docCount());
+
+ kcn.fromBuilder(
+ cb.andGroup().with(
+ cb.andGroup().with(
+ cb.term("author", "Sebastian").not()
+ )
+ .with(
+ cb.term("author", "Frank").not()
+ )
+ )
+ .with(
+ cb.re("textClass", "reis..")
+ )
+ );
+ assertEquals("AndGroup(AndGroup(-author:Sebastian -author:Frank) QueryWrapperFilter(textClass:/reis../))", kcn.toString());
+ assertEquals(1, kcn.docCount());
+ };
+
+
+ @Test
public void testIndexWithMultipleCommitsAndDeletes () throws IOException {
ki = new KrillIndex();
ki.addDoc(createDoc1());
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
index f081f33..39c98e8 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
@@ -93,6 +93,16 @@
};
@Test
+ public void collectionWithMultipleNe () {
+ String metaQuery = _getJSONString("collection_multine.jsonld");
+ KrillCollection kc = new KrillCollection(metaQuery);
+ assertEquals(kc.toString(),
+ "AndGroup(QueryWrapperFilter(availability:/CC-BY.*/) AndGroup(-corpusSigle:WUD17 -corpusSigle:WDD17))");
+ };
+
+
+
+ @Test
public void collectionWithLargeVector () {
String query = _getJSONString("collection_large_vector.jsonld");
Krill ks = new Krill(query);
diff --git a/src/test/resources/queries/collections/collection_multine.jsonld b/src/test/resources/queries/collections/collection_multine.jsonld
new file mode 100644
index 0000000..c6b3426
--- /dev/null
+++ b/src/test/resources/queries/collections/collection_multine.jsonld
@@ -0,0 +1,36 @@
+{
+ "@context": "http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld",
+ "collection": {
+ "@type": "koral:docGroup",
+ "operands": [
+ {
+ "@type": "koral:doc",
+ "key": "availability",
+ "match": "match:eq",
+ "type": "type:regex",
+ "value": "CC-BY.*"
+ },
+ {
+ "@type": "koral:docGroup",
+ "operands": [
+ {
+ "@type": "koral:doc",
+ "key": "corpusSigle",
+ "match": "match:ne",
+ "value": "WUD17",
+ "type": "type:string"
+ },
+ {
+ "@type": "koral:doc",
+ "key": "corpusSigle",
+ "match": "match:ne",
+ "value": "WDD17",
+ "type": "type:string"
+ }
+ ],
+ "operation": "operation:and"
+ }
+ ],
+ "operation": "operation:and"
+ }
+}