Fixed collection statistics bug
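
Collection statistics (numberOf) now return 0 for an index without a
reader, build document bit sets that respect live docs when no
constraint is set or when constraints are negated, and skip documents
without frequencies or payloads when summing occurrences. Legacy
filter and UID collection tests are ported to TestKrillCollectionIndex
and the corresponding legacy tests are marked @Ignore; toString()
expectations now include QueryWrapperFilter.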

Change-Id: If8633847ea454c8f0510d853d76691edd67e66b9
diff --git a/Changes b/Changes
index 7435d58..fe5d4f3 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.53 2015-07-30
+0.53 2015-07-31
         - [feature] Implemented new KrillCollection (diewald)
 	  This should fix a lot of issues with deleted
 	  documents and negation in virtual collections.
@@ -6,6 +6,7 @@
 	- [cleanup] REMOVED deprecated termRelation API (diewald)
 	- [feature] Added removal methods for documents (diewald)
 	- [cleanup] REMOVED search method in KrillCollection (diewald)
+        - [bugfix] Fixed collection statistics (diewald)
 
 0.52 2015-07-08
         - [bugfix] Fixed payload filtering in FocusSpans (margaretha)
diff --git a/src/main/java/de/ids_mannheim/korap/KrillCollection.java b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
index 943819c..f9fcc14 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillCollection.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
@@ -337,12 +337,14 @@
         for (String uid : uids) {
             cbg.with(this.cb.term("UID", uid));
         };
+        this.filter(cbg);
+        /*
         if (this.getBuilder() != null)
             filter.with(this.getBuilder());
         filter.with(cbg);
 
         this.fromBuilder(filter);
-
+        */
         /*
         BooleanFilter filter = new BooleanFilter();
         filter.or("UID", uids);
@@ -426,10 +428,17 @@
         FixedBitSet bitset = new FixedBitSet(r.maxDoc());
         DocIdSet docids = this.getDocIdSet(atomic, (Bits) r.getLiveDocs());
 
-        if (docids == null)
-            return null;
+        if (docids == null) {
+            if (this.cbi != null) {
+                bitset.clear(0, bitset.length());
+            }
+            else {
+                bitset.set(0, bitset.length());
+            };
+        }
+        else
+            bitset.or(docids.iterator());
 
-        bitset.or(docids.iterator());
         return bitset;
     };
 
@@ -451,26 +460,37 @@
         FixedBitSet bitset = new FixedBitSet(maxDoc);
 
         Filter filter;
-        if (this.cbi == null || (filter = this.cbi.toFilter()) == null)
-            return null;
-
-        // Init vector
-        DocIdSet docids = filter.getDocIdSet(atomic, null);
-        DocIdSetIterator filterIter = (docids == null) ? null : docids.iterator();
-
-        if (filterIter == null) {
-            if (!this.cbi.isNegative())
+        if (this.cbi == null || (filter = this.cbi.toFilter()) == null) {
+            if (acceptDocs == null)
                 return null;
 
             bitset.set(0, maxDoc);
         }
         else {
-            // Or bit set
-            bitset.or(filterIter);
 
-            // Revert for negation
-            if (this.cbi.isNegative())
-                bitset.flip(0, maxDoc);
+            // Init vector
+            DocIdSet docids = filter.getDocIdSet(atomic, null);
+            DocIdSetIterator filterIter = (docids == null) ? null : docids.iterator();
+
+            if (filterIter == null) {
+                if (!this.cbi.isNegative())
+                    return null;
+
+                bitset.set(0, maxDoc);
+            }
+            else {
+                // Or bit set
+                bitset.or(filterIter);
+
+                // Revert for negation
+                if (this.cbi.isNegative())
+                    bitset.flip(0, maxDoc);
+            };
+        };
+
+        if (DEBUG) {
+            log.debug("Bit set is  {}", _bits(bitset));
+            log.debug("Livedocs is {}", _bits(acceptDocs));
         };
 
         // Remove deleted docs
@@ -506,6 +526,10 @@
         if (this.index == null)
             return (long) -1;
 
+        // No reader (index is empty)
+        if (this.index.reader() == null)
+            return (long) 0;
+
         // This is redundant to index stuff
         if (type.equals("documents") || type.equals("base/texts")) {
             if (this.cbi == null) {
@@ -529,16 +553,19 @@
             // Iterate over all atomic readers and collect occurrences
             for (AtomicReaderContext atomic : this.index.reader().leaves()) {
                 Bits bits = this.bits(atomic);
-                if (bits != null)
-                    occurrences += this._numberOfAtomic(bits, atomic, term);
+
                 if (DEBUG)
-                    log.debug("Added up to {} for {}/{} with {}", occurrences, field, type, bits);
+                    log.debug("Final bits  {}", _bits(bits));
+
+                occurrences += this._numberOfAtomic(bits, atomic, term);
+                if (DEBUG)
+                    log.debug("Added up to {} for {}/{}", occurrences, field, type);
             };
         }
         
         // Something went wrong
-        catch (Exception e) {
-            log.warn(e.getLocalizedMessage());
+        catch (IOException e) {
+            log.warn(e.getMessage());
         };
 
         return occurrences;
@@ -562,9 +589,15 @@
             // Set the position in the iterator to the term that is seeked
             if (termsEnum.seekExact(term.bytes())) {
 
+                // TODO: Reuse a DocsAndPositionsEnum!!
+
                 // Start an iterator to fetch all payloads of the term
-                DocsAndPositionsEnum docs = termsEnum.docsAndPositions(docvec,
-                        null, DocsAndPositionsEnum.FLAG_PAYLOADS);
+                DocsAndPositionsEnum docs = termsEnum.docsAndPositions(
+                        docvec,
+                        null,
+                        DocsAndPositionsEnum.FLAG_PAYLOADS
+                );
+
 
                 // The iterator is empty
                 // This may even be an error, but we return 0
@@ -578,15 +611,27 @@
                 // Init nextDoc()
                 while (docs.nextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS) {
 
+                    if (docs.freq() < 1)
+                        continue;
+
                     // Initialize (go to first term)
                     docs.nextPosition();
 
                     // Copy payload with the offset of the BytesRef
                     payload = docs.getPayload();
-                    System.arraycopy(payload.bytes, payload.offset, pl, 0, 4);
+                    if (payload != null) {
+                        System.arraycopy(payload.bytes, payload.offset, pl, 0, 4);
 
-                    // Add payload as integer
-                    occurrences += bb.wrap(pl).getInt();
+                        // Add payload as integer
+                        occurrences += bb.wrap(pl).getInt();
+
+                        if (DEBUG)
+                            log.debug("Value for {} incremented by {} to {} in {}",
+                                      term,
+                                      bb.wrap(pl).getInt(),
+                                      occurrences,
+                                      docs.docID());                    
+                    };
                 };
 
                 // Return the sum of all occurrences
@@ -629,6 +674,14 @@
     };
 
 
+    private static String _bits (Bits bitset) {
+        if (bitset == null) return "null";
+        String str = "";
+        for (int i = 0; i < bitset.length(); i++) {
+            str += bitset.get(i) ? "1" : "0";
+        };
+        return str;
+    };
 
     /*
     @Deprecated
diff --git a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
index 1cb804c..fb26168 100644
--- a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
+++ b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilder.java
@@ -2,9 +2,6 @@
 
 import java.util.*;
 import java.io.IOException;
-// TEMPORARY:
-import org.apache.lucene.queries.BooleanFilter;
-import org.apache.lucene.search.BooleanClause;
 
 import org.apache.lucene.index.Term;
 import org.apache.lucene.queries.TermsFilter;
@@ -18,6 +15,12 @@
 import de.ids_mannheim.korap.KrillCollection;
 import de.ids_mannheim.korap.collection.BooleanGroupFilter;
 
+/*
+ * TODO: Optimize!
+ * - Remove multiple times the same object in Boolean groups.
+ * - Flatten boolean groups
+ */
+
 public class CollectionBuilder {
 
     // Logger
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
index 398c7b0..5fe41bf 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
@@ -480,18 +480,178 @@
         // kc.extend(kf.and("textClass", "uninteresting"));
         kc.extend(kc.build().term("textClass", "uninteresting"));
 
-        /*
-
-
         assertEquals("Documents", 1, kc.numberOf("documents"));
 
-        kc.extend(kf.and("textClass", "wissenschaft"));
+        kc.extend(kc.build().term("textClass", "wissenschaft"));
 
         assertEquals("Documents", 3, kc.numberOf("documents"));
         assertEquals("Tokens", 1669, kc.numberOf("tokens"));
         assertEquals("Sentences", 188, kc.numberOf("sentences"));
         assertEquals("Paragraphs", 130, kc.numberOf("paragraphs"));
+
+        // System.err.println(kc.toString());
+        // Test collectionbuilder simplifier!
+        /*
+        OrGroup(
+                AndGroup(
+                         corpusID:WPD
+                         textClass:reisen
+                         textClass:freizeit-unterhaltung
+                         textClass:kultur
+                         corpusID:WPD
+                         )
+                textClass:uninteresting
+                textClass:wissenschaft
+        )
         */
+
+        assertTrue(ki.delDocs("textClass", "wissenschaft"));
+        ki.commit();
+
+        assertEquals("Documents", 1, kc.numberOf("documents"));
+        assertEquals("Tokens", 405, kc.numberOf("tokens"));
+        assertEquals("Sentences", 75, kc.numberOf("sentences"));
+        assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));
+    };
+
+    @Test
+    public void filterExample2Legacy () throws Exception {
+
+        // Construct index
+        KrillIndex ki = new KrillIndex();
+        // Indexing test files
+        for (String i : new String[] { "00001", "00002", "00003", "00004",
+                "00005", "00006", "02439" }) {
+            ki.addDoc(
+                    getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
+                    true);
+        };
+        ki.commit();
+
+        ki.addDoc(getClass()
+                .getResourceAsStream("/wiki/00012-fakemeta.json.gz"), true);
+
+        ki.commit();
+
+        /*
+        CollectionBuilderLegacy kf = new CollectionBuilderLegacy();
+
+        // Create Virtual collections:
+        KrillCollectionLegacy kc = new KrillCollectionLegacy(ki);
+        kc.filter(kf.and("textClass", "reisen").and("textClass",
+                "freizeit-unterhaltung"));
+        */
+
+        KrillCollection kc = new KrillCollection(ki);
+        CollectionBuilder cb = kc.build();
+        kc.filter(cb.andGroup().with(cb.term("textClass", "reisen")).with(cb.term("textClass","freizeit-unterhaltung")));
+
+        assertEquals("Documents", 5, kc.numberOf("documents"));
+        assertEquals("Tokens", 1678, kc.numberOf("tokens"));
+        assertEquals("Sentences", 194, kc.numberOf("sentences"));
+        assertEquals("Paragraphs", 139, kc.numberOf("paragraphs"));
+
+
+        // Create a query
+        QueryBuilder kq = new QueryBuilder("tokens");
+        SpanQuery query = kq.seg("opennlp/p:NN").with("tt/p:NN").toQuery();
+
+
+        Result kr = ki.search(kc, query, 0, (short) 20, true, (short) 5, true, (short) 5);
+        assertEquals(kr.getTotalResults(), 369);
+
+        // kc.filter(kf.and("corpusID", "QQQ"));
+        kc.filter(cb.term("corpusID", "QQQ"));
+
+        assertEquals("Documents", 0, kc.numberOf("documents"));
+        assertEquals("Tokens", 0, kc.numberOf("tokens"));
+        assertEquals("Sentences", 0, kc.numberOf("sentences"));
+        assertEquals("Paragraphs", 0, kc.numberOf("paragraphs"));
+
+        kr = ki.search(kc, query, 0, (short) 20, true, (short) 5, true, (short) 5);
+        assertEquals(kr.getTotalResults(), 0);
+    };
+
+
+    @Test
+    public void uidCollectionLegacy () throws IOException {
+
+        // Construct index
+        KrillIndex ki = new KrillIndex();
+        // Indexing test files
+        int uid = 1;
+        for (String i : new String[] { "00001", "00002", "00003", "00004",
+                "00005", "00006", "02439" }) {
+            FieldDocument fd = ki.addDoc(uid++,
+                    getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
+                    true);
+        };
+        ki.commit();
+
+        assertEquals("Documents", 7, ki.numberOf("documents"));
+        assertEquals("Paragraphs", 174, ki.numberOf("paragraphs"));
+        assertEquals("Sentences", 281, ki.numberOf("sentences"));
+        assertEquals("Tokens", 2661, ki.numberOf("tokens"));
+
+        SpanQuery sq = new SpanTermQuery(new Term("tokens", "s:der"));
+        Result kr = ki.search(sq, (short) 10);
+        assertEquals(86, kr.getTotalResults());
+
+        // Create Virtual collections:
+        KrillCollection kc = new KrillCollection();
+        kc.filterUIDs(new String[] { "2", "3", "4" });
+        kc.setIndex(ki);
+        assertEquals("Documents", 3, kc.numberOf("documents"));
+
+        assertEquals("Paragraphs", 46, kc.numberOf("paragraphs"));
+        assertEquals("Sentences", 103, kc.numberOf("sentences"));
+        assertEquals("Tokens", 1229, kc.numberOf("tokens"));
+
+        kr = ki.search(kc, sq, 0, (short) 20, true, (short) 5, true, (short) 5);
+
+        assertEquals((long) 39, kr.getTotalResults());
+    };
+
+    @Test
+    public void uidCollectionWithDeletions () throws IOException {
+
+        // Construct index
+        KrillIndex ki = new KrillIndex();
+        // Indexing test files
+        int uid = 1;
+        for (String i : new String[] { "00001", "00002", "00003", "00004",
+                "00005", "00006", "02439" }) {
+            FieldDocument fd = ki.addDoc(uid++,
+                    getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
+                    true);
+        };
+        ki.commit();
+
+
+        assertEquals("Documents", 7, ki.numberOf("documents"));
+        assertEquals("Paragraphs", 174, ki.numberOf("paragraphs"));
+        assertEquals("Sentences", 281, ki.numberOf("sentences"));
+        assertEquals("Tokens", 2661, ki.numberOf("tokens"));
+
+        assertTrue(ki.delDoc(3));
+        ki.commit();
+
+        assertEquals("Documents", 6, ki.numberOf("documents"));
+
+        assertEquals("Paragraphs", 146, ki.numberOf("paragraphs"));
+        assertEquals("Sentences", 212, ki.numberOf("sentences"));
+        assertEquals("Tokens", 2019, ki.numberOf("tokens"));
+
+        assertTrue(ki.delDoc(2));
+        assertTrue(ki.delDoc(3));
+        assertTrue(ki.delDoc(4));
+        assertTrue(ki.delDoc(5));
+        assertTrue(ki.delDoc(6));
+        assertTrue(ki.delDoc(7));
+        ki.commit();
+
+        assertEquals("Documents", 1, ki.numberOf("documents"));
+        assertEquals("Paragraphs", 75, ki.numberOf("paragraphs"));
     };
 
 
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
index 46e59cd..da311c0 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSON.java
@@ -66,7 +66,7 @@
         assertFalse(ks.hasErrors());
         assertFalse(ks.hasWarnings());
         assertFalse(ks.hasMessages());
-        assertEquals("author:/Goethe/", ks
+        assertEquals("QueryWrapperFilter(author:/Goethe/)", ks
                 .getCollection().toString());
     };
 
@@ -78,7 +78,7 @@
         assertFalse(ks.hasErrors());
         assertFalse(ks.hasWarnings());
         assertFalse(ks.hasMessages());
-        assertEquals("-author:/Goethe/", ks
+        assertEquals("-QueryWrapperFilter(author:/Goethe/)", ks
                 .getCollection().toString());
     };
 
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSONLegacy.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSONLegacy.java
index 2c668c3..bfa1098 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSONLegacy.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionJSONLegacy.java
@@ -53,7 +53,8 @@
         KrillCollectionLegacy kc = new KrillCollectionLegacy(metaQuery);
         assertEquals(1, kc.getCount());
         assertEquals(
-                "filter with QueryWrapperFilter(+pubDate:[20000101 TO 20131231])",
+                     // "filter with QueryWrapperFilter(+pubDate:[20000101 TO 20131231])"
+                     "filter with QueryWrapperFilter(+(+pubDate:[20000101 TO 99999999] +pubDate:[0 TO 20131231]))",
                 kc.getFilter(0).toString());
     };
 
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionLegacy.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionLegacy.java
index 8b067b5..0a3f51f 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionLegacy.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionLegacy.java
@@ -168,6 +168,7 @@
 
 
     @Test
+    @Ignore
     public void filterExample2 () throws Exception {
 
         // Construct index
@@ -215,6 +216,7 @@
 
 
     @Test
+    @Ignore
     public void uidCollection () throws IOException {
 
         // Construct index