Fixed collection statistics bug
Change-Id: If8633847ea454c8f0510d853d76691edd67e66b9
diff --git a/Changes b/Changes
index 7435d58..fe5d4f3 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.53 2015-07-30
+0.53 2015-07-31
- [feature] Implemented new KrillCollection (diewald)
This should fix a lot of issues with deleted
documents and negation in virtual collections.
@@ -6,6 +6,7 @@
- [cleanup] REMOVED deprecated termRelation API (diewald)
- [feature] Added removal methods for documents (diewald)
- [cleanup] REMOVED search method in KrillCollection (diewald)
+ - [bugfix] Fixed collection statistics (diewald)
0.52 2015-07-08
- [bugfix] Fixed payload filtering in FocusSpans (margaretha)
diff --git a/src/main/java/de/ids_mannheim/korap/KrillCollection.java b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
index 943819c..f9fcc14 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillCollection.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
@@ -337,12 +337,14 @@
for (String uid : uids) {
cbg.with(this.cb.term("UID", uid));
};
+ this.filter(cbg);
+ /*
if (this.getBuilder() != null)
filter.with(this.getBuilder());
filter.with(cbg);
this.fromBuilder(filter);
-
+ */
/*
BooleanFilter filter = new BooleanFilter();
filter.or("UID", uids);
@@ -426,10 +428,17 @@
FixedBitSet bitset = new FixedBitSet(r.maxDoc());
DocIdSet docids = this.getDocIdSet(atomic, (Bits) r.getLiveDocs());
- if (docids == null)
- return null;
+ if (docids == null) {
+ if (this.cbi != null) {
+ bitset.clear(0, bitset.length());
+ }
+ else {
+ bitset.set(0, bitset.length());
+ };
+ }
+ else
+ bitset.or(docids.iterator());
- bitset.or(docids.iterator());
return bitset;
};
@@ -451,26 +460,37 @@
FixedBitSet bitset = new FixedBitSet(maxDoc);
Filter filter;
- if (this.cbi == null || (filter = this.cbi.toFilter()) == null)
- return null;
-
- // Init vector
- DocIdSet docids = filter.getDocIdSet(atomic, null);
- DocIdSetIterator filterIter = (docids == null) ? null : docids.iterator();
-
- if (filterIter == null) {
- if (!this.cbi.isNegative())
+ if (this.cbi == null || (filter = this.cbi.toFilter()) == null) {
+ if (acceptDocs == null)
return null;
bitset.set(0, maxDoc);
}
else {
- // Or bit set
- bitset.or(filterIter);
- // Revert for negation
- if (this.cbi.isNegative())
- bitset.flip(0, maxDoc);
+ // Init vector
+ DocIdSet docids = filter.getDocIdSet(atomic, null);
+ DocIdSetIterator filterIter = (docids == null) ? null : docids.iterator();
+
+ if (filterIter == null) {
+ if (!this.cbi.isNegative())
+ return null;
+
+ bitset.set(0, maxDoc);
+ }
+ else {
+ // Or bit set
+ bitset.or(filterIter);
+
+ // Revert for negation
+ if (this.cbi.isNegative())
+ bitset.flip(0, maxDoc);
+ };
+ };
+
+ if (DEBUG) {
+ log.debug("Bit set is {}", _bits(bitset));
+ log.debug("Livedocs is {}", _bits(acceptDocs));
};
// Remove deleted docs
@@ -506,6 +526,10 @@
if (this.index == null)
return (long) -1;
+ // No reader (index is empty)
+ if (this.index.reader() == null)
+ return (long) 0;
+
// This is redundant to index stuff
if (type.equals("documents") || type.equals("base/texts")) {
if (this.cbi == null) {
@@ -529,16 +553,19 @@
// Iterate over all atomic readers and collect occurrences
for (AtomicReaderContext atomic : this.index.reader().leaves()) {
Bits bits = this.bits(atomic);
- if (bits != null)
- occurrences += this._numberOfAtomic(bits, atomic, term);
+
if (DEBUG)
- log.debug("Added up to {} for {}/{} with {}", occurrences, field, type, bits);
+ log.debug("Final bits {}", _bits(bits));
+
+ occurrences += this._numberOfAtomic(bits, atomic, term);
+ if (DEBUG)
+ log.debug("Added up to {} for {}/{}", occurrences, field, type);
};
}
// Something went wrong
- catch (Exception e) {
- log.warn(e.getLocalizedMessage());
+ catch (IOException e) {
+ log.warn(e.getMessage());
};
return occurrences;
@@ -562,9 +589,15 @@
// Set the position in the iterator to the term that is seeked
if (termsEnum.seekExact(term.bytes())) {
+ // TODO: Reuse a DocsAndPositionsEnum!!
+
// Start an iterator to fetch all payloads of the term
- DocsAndPositionsEnum docs = termsEnum.docsAndPositions(docvec,
- null, DocsAndPositionsEnum.FLAG_PAYLOADS);
+ DocsAndPositionsEnum docs = termsEnum.docsAndPositions(
+ docvec,
+ null,
+ DocsAndPositionsEnum.FLAG_PAYLOADS
+ );
+
// The iterator is empty
// This may even be an error, but we return 0
@@ -578,15 +611,27 @@
// Init nextDoc()
while (docs.nextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS) {
+ if (docs.freq() < 1)
+ continue;
+
// Initialize (go to first term)
docs.nextPosition();
// Copy payload with the offset of the BytesRef
payload = docs.getPayload();
- System.arraycopy(payload.bytes, payload.offset, pl, 0, 4);
+ if (payload != null) {
+ System.arraycopy(payload.bytes, payload.offset, pl, 0, 4);
- // Add payload as integer
- occurrences += bb.wrap(pl).getInt();
+ // Add payload as integer
+ occurrences += bb.wrap(pl).getInt();
+
+ if (DEBUG)
+ log.debug("Value for {} incremented by {} to {} in {}",
+ term,
+ bb.wrap(pl).getInt(),
+ occurrences,
+ docs.docID());
+ };
};
// Return the sum of all occurrences
@@ -629,6 +674,14 @@
};
+ private static String _bits (Bits bitset) {
+ String str = "";
+ for (int i = 0; i < bitset.length(); i++) {
+ str += bitset.get(i) ? "1" : "0";
+ };
+ return str;
+ };
+
/*
@Deprecated
diff --git a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
index 1cb804c..fb26168 100644
--- a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
+++ b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
@@ -2,9 +2,6 @@
import java.util.*;
import java.io.IOException;
-// TEMPORARY:
-import org.apache.lucene.queries.BooleanFilter;
-import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.TermsFilter;
@@ -18,6 +15,12 @@
import de.ids_mannheim.korap.KrillCollection;
import de.ids_mannheim.korap.collection.BooleanGroupFilter;
+/*
+ * TODO: Optimize!
+ * - Remove objects that occur multiple times in Boolean groups.
+ * - Flatten boolean groups
+ */
+
public class CollectionBuilder {
// Logger
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
index 398c7b0..5fe41bf 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
@@ -480,18 +480,178 @@
// kc.extend(kf.and("textClass", "uninteresting"));
kc.extend(kc.build().term("textClass", "uninteresting"));
- /*
-
-
assertEquals("Documents", 1, kc.numberOf("documents"));
- kc.extend(kf.and("textClass", "wissenschaft"));
+ kc.extend(kc.build().term("textClass", "wissenschaft"));
assertEquals("Documents", 3, kc.numberOf("documents"));
assertEquals("Tokens", 1669, kc.numberOf("tokens"));
assertEquals("Sentences", 188, kc.numberOf("sentences"));
assertEquals("Paragraphs", 130, kc.numberOf("paragraphs"));
+
+ // System.err.println(kc.toString());
+ // Test collectionbuilder simplifier!
+ /*
+ OrGroup(
+ AndGroup(
+ corpusID:WPD
+ textClass:reisen
+ textClass:freizeit-unterhaltung
+ textClass:kultur
+ corpusID:WPD
+ )
+ textClass:uninteresting
+ textClass:wissenschaft
+ )
*/
+
+ assertTrue(ki.delDocs("textClass", "wissenschaft"));
+ ki.commit();
+
+ assertEquals("Documents", 1, kc.numberOf("documents"));
+ assertEquals("Tokens", 405, kc.numberOf("tokens"));
+ assertEquals("Sentences", 75, kc.numberOf("sentences"));
+ assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));
+ };
+
+ @Test
+ public void filterExample2Legacy () throws Exception {
+
+ // Construct index
+ KrillIndex ki = new KrillIndex();
+ // Indexing test files
+ for (String i : new String[] { "00001", "00002", "00003", "00004",
+ "00005", "00006", "02439" }) {
+ ki.addDoc(
+ getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
+ true);
+ };
+ ki.commit();
+
+ ki.addDoc(getClass()
+ .getResourceAsStream("/wiki/00012-fakemeta.json.gz"), true);
+
+ ki.commit();
+
+ /*
+ CollectionBuilderLegacy kf = new CollectionBuilderLegacy();
+
+ // Create Virtual collections:
+ KrillCollectionLegacy kc = new KrillCollectionLegacy(ki);
+ kc.filter(kf.and("textClass", "reisen").and("textClass",
+ "freizeit-unterhaltung"));
+ */
+
+ KrillCollection kc = new KrillCollection(ki);
+ CollectionBuilder cb = kc.build();
+ kc.filter(cb.andGroup().with(cb.term("textClass", "reisen")).with(cb.term("textClass","freizeit-unterhaltung")));
+
+ assertEquals("Documents", 5, kc.numberOf("documents"));
+ assertEquals("Tokens", 1678, kc.numberOf("tokens"));
+ assertEquals("Sentences", 194, kc.numberOf("sentences"));
+ assertEquals("Paragraphs", 139, kc.numberOf("paragraphs"));
+
+
+ // Create a query
+ QueryBuilder kq = new QueryBuilder("tokens");
+ SpanQuery query = kq.seg("opennlp/p:NN").with("tt/p:NN").toQuery();
+
+
+ Result kr = ki.search(kc, query, 0, (short) 20, true, (short) 5, true, (short) 5);
+ assertEquals(kr.getTotalResults(), 369);
+
+ // kc.filter(kf.and("corpusID", "QQQ"));
+ kc.filter(cb.term("corpusID", "QQQ"));
+
+ assertEquals("Documents", 0, kc.numberOf("documents"));
+ assertEquals("Tokens", 0, kc.numberOf("tokens"));
+ assertEquals("Sentences", 0, kc.numberOf("sentences"));
+ assertEquals("Paragraphs", 0, kc.numberOf("paragraphs"));
+
+ kr = ki.search(kc, query, 0, (short) 20, true, (short) 5, true, (short) 5);
+ assertEquals(kr.getTotalResults(), 0);
+ };
+
+
+ @Test
+ public void uidCollectionLegacy () throws IOException {
+
+ // Construct index
+ KrillIndex ki = new KrillIndex();
+ // Indexing test files
+ int uid = 1;
+ for (String i : new String[] { "00001", "00002", "00003", "00004",
+ "00005", "00006", "02439" }) {
+ FieldDocument fd = ki.addDoc(uid++,
+ getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
+ true);
+ };
+ ki.commit();
+
+ assertEquals("Documents", 7, ki.numberOf("documents"));
+ assertEquals("Paragraphs", 174, ki.numberOf("paragraphs"));
+ assertEquals("Sentences", 281, ki.numberOf("sentences"));
+ assertEquals("Tokens", 2661, ki.numberOf("tokens"));
+
+ SpanQuery sq = new SpanTermQuery(new Term("tokens", "s:der"));
+ Result kr = ki.search(sq, (short) 10);
+ assertEquals(86, kr.getTotalResults());
+
+ // Create Virtual collections:
+ KrillCollection kc = new KrillCollection();
+ kc.filterUIDs(new String[] { "2", "3", "4" });
+ kc.setIndex(ki);
+ assertEquals("Documents", 3, kc.numberOf("documents"));
+
+ assertEquals("Paragraphs", 46, kc.numberOf("paragraphs"));
+ assertEquals("Sentences", 103, kc.numberOf("sentences"));
+ assertEquals("Tokens", 1229, kc.numberOf("tokens"));
+
+ kr = ki.search(kc, sq, 0, (short) 20, true, (short) 5, true, (short) 5);
+
+ assertEquals((long) 39, kr.getTotalResults());
+ };
+
+ @Test
+ public void uidCollectionWithDeletions () throws IOException {
+
+ // Construct index
+ KrillIndex ki = new KrillIndex();
+ // Indexing test files
+ int uid = 1;
+ for (String i : new String[] { "00001", "00002", "00003", "00004",
+ "00005", "00006", "02439" }) {
+ FieldDocument fd = ki.addDoc(uid++,
+ getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
+ true);
+ };
+ ki.commit();
+
+
+ assertEquals("Documents", 7, ki.numberOf("documents"));
+ assertEquals("Paragraphs", 174, ki.numberOf("paragraphs"));
+ assertEquals("Sentences", 281, ki.numberOf("sentences"));
+ assertEquals("Tokens", 2661, ki.numberOf("tokens"));
+
+ assertTrue(ki.delDoc(3));
+ ki.commit();
+
+ assertEquals("Documents", 6, ki.numberOf("documents"));
+
+ assertEquals("Paragraphs", 146, ki.numberOf("paragraphs"));
+ assertEquals("Sentences", 212, ki.numberOf("sentences"));
+ assertEquals("Tokens", 2019, ki.numberOf("tokens"));
+
+ assertTrue(ki.delDoc(2));
+ assertTrue(ki.delDoc(3));
+ assertTrue(ki.delDoc(4));
+ assertTrue(ki.delDoc(5));
+ assertTrue(ki.delDoc(6));
+ assertTrue(ki.delDoc(7));
+ ki.commit();
+
+ assertEquals("Documents", 1, ki.numberOf("documents"));
+ assertEquals("Paragraphs", 75, ki.numberOf("paragraphs"));
};
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
index 46e59cd..da311c0 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
@@ -66,7 +66,7 @@
assertFalse(ks.hasErrors());
assertFalse(ks.hasWarnings());
assertFalse(ks.hasMessages());
- assertEquals("author:/Goethe/", ks
+ assertEquals("QueryWrapperFilter(author:/Goethe/)", ks
.getCollection().toString());
};
@@ -78,7 +78,7 @@
assertFalse(ks.hasErrors());
assertFalse(ks.hasWarnings());
assertFalse(ks.hasMessages());
- assertEquals("-author:/Goethe/", ks
+ assertEquals("-QueryWrapperFilter(author:/Goethe/)", ks
.getCollection().toString());
};
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSONLegacy.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSONLegacy.java
index 2c668c3..bfa1098 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSONLegacy.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSONLegacy.java
@@ -53,7 +53,8 @@
KrillCollectionLegacy kc = new KrillCollectionLegacy(metaQuery);
assertEquals(1, kc.getCount());
assertEquals(
- "filter with QueryWrapperFilter(+pubDate:[20000101 TO 20131231])",
+ // "filter with QueryWrapperFilter(+pubDate:[20000101 TO 20131231])"
+ "filter with QueryWrapperFilter(+(+pubDate:[20000101 TO 99999999] +pubDate:[0 TO 20131231]))",
kc.getFilter(0).toString());
};
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionLegacy.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionLegacy.java
index 8b067b5..0a3f51f 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionLegacy.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionLegacy.java
@@ -168,6 +168,7 @@
@Test
+ @Ignore
public void filterExample2 () throws Exception {
// Construct index
@@ -215,6 +216,7 @@
@Test
+ @Ignore
public void uidCollection () throws IOException {
// Construct index