Fixed deletion bug in virtual collections; two known bugs remain unfixed
Change-Id: Ib975976009ddfa74e9a9f3f07049bdad87a0486f
diff --git a/src/main/java/de/ids_mannheim/korap/KrillCollectionNew.java b/src/main/java/de/ids_mannheim/korap/KrillCollectionNew.java
index 894a747..f94b407 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillCollectionNew.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillCollectionNew.java
@@ -7,11 +7,13 @@
import de.ids_mannheim.korap.response.Notifications;
import org.apache.lucene.search.*;
+import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.DocIdBitSet;
+import org.apache.lucene.search.BitsFilteredDocIdSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -58,6 +60,7 @@
return filter.toString();
};
+ /*
public FixedBitSet bits (AtomicReaderContext atomic) throws IOException {
int maxDoc = atomic.reader().maxDoc();
@@ -68,7 +71,7 @@
return null;
// Init vector
- DocIdSet docids = filter.getDocIdSet(atomic, atomic.reader().getLiveDocs());
+ DocIdSet docids = filter.getDocIdSet(atomic, null);
DocIdSetIterator filterIter = (docids == null) ? null : docids.iterator();
if (filterIter == null) {
@@ -87,17 +90,60 @@
};
// Remove deleted docs
- /*
- System.err.println(atomic.reader().getClass());
- FixedBitSet livedocs = (FixedBitSet) atomic.reader().getLiveDocs();
- if (livedocs != null) {
- bitset.and(livedocs);
- };
- */
+ return (FixedBitSet) BitsFilteredDocIdSet.wrap(
+ (DocIdSet) bitset,
+ (Bits) atomic.reader().getLiveDocs()
+ ).iterator();
+ };
+ */
+
+ public FixedBitSet bits (AtomicReaderContext atomic) throws IOException {
+ AtomicReader r = atomic.reader();
+ FixedBitSet bitset = new FixedBitSet(r.maxDoc());
+ DocIdSet docids = this.getDocIdSet(atomic, (Bits) r.getLiveDocs());
+ if (docids == null)
+ return null;
+ bitset.or(docids.iterator());
return bitset;
};
+
+ public DocIdSet getDocIdSet (AtomicReaderContext atomic, Bits acceptDocs) throws IOException {
+
+ int maxDoc = atomic.reader().maxDoc();
+ FixedBitSet bitset = new FixedBitSet(maxDoc);
+
+ Filter filter;
+ if (this.cb == null || (filter = this.cb.toFilter()) == null)
+ return null;
+
+ // Init vector
+ DocIdSet docids = filter.getDocIdSet(atomic, null);
+ DocIdSetIterator filterIter = (docids == null) ? null : docids.iterator();
+
+ if (filterIter == null) {
+ if (!this.cb.isNegative())
+ return null;
+
+ bitset.set(0, maxDoc);
+ }
+ else {
+ // Or bit set
+ bitset.or(filterIter);
+
+ // Invert for negation
+ if (this.cb.isNegative())
+ bitset.flip(0, maxDoc);
+ };
+
+ // Remove deleted docs
+ return (DocIdSet) BitsFilteredDocIdSet.wrap(
+ (DocIdSet) bitset,
+ acceptDocs
+ );
+ };
+
/**
* Search for the number of occurrences of different types,
* e.g. <i>documents</i>, <i>sentences</i> etc. in the virtual
diff --git a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilderNew.java b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilderNew.java
index 8b2ff4d..5bb9968 100644
--- a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilderNew.java
+++ b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilderNew.java
@@ -73,7 +73,8 @@
return new CollectionBuilderRange(field, begin, end);
};
- return new CollectionBuilderTerm(field, dateDF.toString());
+
+ return new CollectionBuilderRange(field, dateDF.floor(), dateDF.ceil());
};
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
index 5aade60..e764123 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
@@ -91,6 +91,12 @@
kcn.fromBuilder(cb.andGroup(cb.term("textClass", "finanzen")).with(cb.term("textClass", "kultur")));
assertEquals(0, kcn.docCount());
+
+ kcn.fromBuilder(cb.term("text", "Mann"));
+ assertEquals(3, kcn.docCount());
+
+ kcn.fromBuilder(cb.term("text", "Frau"));
+ assertEquals(1, kcn.docCount());
};
@Test
@@ -127,7 +133,7 @@
};
@Test
- public void testIndexWithMultipleCommits () throws IOException {
+ public void testIndexWithMultipleCommitsAndDeletes () throws IOException {
ki = new KrillIndex();
ki.addDoc(createDoc1());
ki.addDoc(createDoc2());
@@ -169,11 +175,112 @@
assertEquals(1, kcn.docCount());
kcn.fromBuilder(cb.term("author", "Michael").not());
assertEquals(2, kcn.docCount());
+
+ // Re-add Peter's doc
+ ki.addDoc(createDoc2());
+ ki.commit();
+
+ kcn.fromBuilder(cb.term("author", "Frank"));
+ assertEquals(1, kcn.docCount());
+ kcn.fromBuilder(cb.term("author", "Peter"));
+ assertEquals(1, kcn.docCount());
+ kcn.fromBuilder(cb.term("author", "Sebastian"));
+ assertEquals(1, kcn.docCount());
+ kcn.fromBuilder(cb.term("author", "Michael").not());
+ assertEquals(3, kcn.docCount());
};
- // Todo: Test index with removes
- // Todo: Test with dates
- // Todo: Test with regex
+ @Test
+ public void testIndexWithDateRanges () throws IOException {
+ ki = new KrillIndex();
+ ki.addDoc(createDoc1());
+ ki.addDoc(createDoc2());
+ ki.addDoc(createDoc3());
+ ki.commit();
+ CollectionBuilderNew cb = new CollectionBuilderNew();
+ KrillCollectionNew kcn = new KrillCollectionNew(ki);
+
+ kcn.fromBuilder(cb.date("pubDate", "2005"));
+ assertEquals(3, kcn.docCount());
+ kcn.fromBuilder(cb.date("pubDate", "2005-12"));
+ assertEquals(3, kcn.docCount());
+
+ kcn.fromBuilder(cb.date("pubDate", "2005-12-10"));
+ assertEquals(1, kcn.docCount());
+ kcn.fromBuilder(cb.date("pubDate", "2005-12-16"));
+ assertEquals(1, kcn.docCount());
+ kcn.fromBuilder(cb.date("pubDate", "2005-12-07"));
+ assertEquals(1, kcn.docCount());
+
+ kcn.fromBuilder(cb.since("pubDate", "2005-12-07"));
+ assertEquals(3, kcn.docCount());
+ kcn.fromBuilder(cb.since("pubDate", "2005-12-10"));
+ assertEquals(2, kcn.docCount());
+ kcn.fromBuilder(cb.since("pubDate", "2005-12-16"));
+ assertEquals(1, kcn.docCount());
+
+ kcn.fromBuilder(cb.till("pubDate", "2005-12-16"));
+ assertEquals(3, kcn.docCount());
+ kcn.fromBuilder(cb.till("pubDate", "2005-12-10"));
+ assertEquals(2, kcn.docCount());
+ kcn.fromBuilder(cb.till("pubDate", "2005-12-07"));
+ assertEquals(1, kcn.docCount());
+
+ kcn.fromBuilder(cb.date("pubDate", "2005-12-10").not());
+ assertEquals(2, kcn.docCount());
+ kcn.fromBuilder(cb.date("pubDate", "2005-12-16").not());
+ assertEquals(2, kcn.docCount());
+ kcn.fromBuilder(cb.date("pubDate", "2005-12-07").not());
+ assertEquals(2, kcn.docCount());
+ kcn.fromBuilder(cb.date("pubDate", "2005-12-09").not());
+ assertEquals(3, kcn.docCount());
+
+
+ kcn.fromBuilder(cb.till("pubDate", "2005-12-16").not());
+ assertEquals(0, kcn.docCount());
+ kcn.fromBuilder(cb.till("pubDate", "2005-12-15").not());
+ assertEquals(1, kcn.docCount());
+ kcn.fromBuilder(cb.till("pubDate", "2005-12-10").not());
+ assertEquals(1, kcn.docCount());
+ kcn.fromBuilder(cb.till("pubDate", "2005-12-09").not());
+ assertEquals(2, kcn.docCount());
+ kcn.fromBuilder(cb.till("pubDate", "2005-12-07").not());
+ assertEquals(2, kcn.docCount());
+ kcn.fromBuilder(cb.till("pubDate", "2005-12-06").not());
+ assertEquals(3, kcn.docCount());
+ };
+
+
+ @Test
+ public void testIndexWithRegexes () throws IOException {
+ ki = new KrillIndex();
+
+ ki.addDoc(createDoc1());
+ ki.addDoc(createDoc2());
+ ki.addDoc(createDoc3());
+ ki.commit();
+
+ CollectionBuilderNew cb = new CollectionBuilderNew();
+ KrillCollectionNew kcn = new KrillCollectionNew(ki);
+
+ kcn.fromBuilder(cb.re("author", "Fran.*"));
+ assertEquals(1, kcn.docCount());
+ kcn.fromBuilder(cb.re("author", "Blin.*"));
+ assertEquals(0, kcn.docCount());
+ kcn.fromBuilder(cb.re("author", "Frank|Peter"));
+ assertEquals(2, kcn.docCount());
+
+ kcn.fromBuilder(cb.term("text", "Frau"));
+ assertEquals(1, kcn.docCount());
+
+ kcn.fromBuilder(cb.re("text", "Frau"));
+ assertEquals(1, kcn.docCount());
+
+ kcn.fromBuilder(cb.re("text", "Frau|Mann"));
+ System.err.println(kcn.toString());
+ assertEquals(3, kcn.docCount());
+ };
+
private FieldDocument createDoc1 () {
FieldDocument fd = new FieldDocument();
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionNew.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionNew.java
index 0440483..8408176 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionNew.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionNew.java
@@ -46,7 +46,7 @@
@Test
public void builderDateDay () throws IOException {
CollectionBuilderNew kc = new CollectionBuilderNew();
- assertEquals("pubDate:20051011",
+ assertEquals("pubDate:[20051011 TO 20051011]",
kc.date("pubDate", "2005-10-11").toString());
};
@@ -56,7 +56,7 @@
// CollectionBuilderNew.CollectionBuilderInterface kbi = ;
assertNull(kc.date("pubDate", ""));
- assertEquals("pubDate:20051580",
+ assertEquals("pubDate:[20051580 TO 20051580]",
kc.date("pubDate", "2005-15-80").toString());
assertNull(kc.date("pubDate", "2005-15-8"));