Introduced new KrillCollection interface - but with a failing test for deleted documents
Change-Id: Ie5cd0cea3b651eb93c5b46e669cc9cd37503c8b3
diff --git a/src/main/java/de/ids_mannheim/korap/KrillCollection.java b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
index a89fdfc..1aef947 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillCollection.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillCollection.java
@@ -67,7 +67,7 @@
.getLogger(KrillCollection.class);
// This advices the java compiler to ignore all loggings
- public static final boolean DEBUG = true;
+ public static final boolean DEBUG = false;
/**
diff --git a/src/main/java/de/ids_mannheim/korap/KrillCollectionNew.java b/src/main/java/de/ids_mannheim/korap/KrillCollectionNew.java
new file mode 100644
index 0000000..894a747
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/KrillCollectionNew.java
@@ -0,0 +1,152 @@
+package de.ids_mannheim.korap;
+
+import java.util.*;
+import java.io.IOException;
+
+import de.ids_mannheim.korap.collection.CollectionBuilderNew;
+import de.ids_mannheim.korap.response.Notifications;
+
+import org.apache.lucene.search.*;
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.OpenBitSet;
+import org.apache.lucene.util.DocIdBitSet;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A virtual collection on top of a {@link KrillIndex}, defined by a
+ * collection builder that is turned into a Lucene {@link Filter}.
+ */
+public class KrillCollectionNew extends Notifications {
+    private KrillIndex index;
+    private CollectionBuilderNew.CollectionBuilderInterface cb;
+
+    // Logger (registered under the KrillCollection category)
+    private final static Logger log = LoggerFactory
+            .getLogger(KrillCollection.class);
+
+    // Debug switch - a constant false value allows the java
+    // compiler to drop all logging statements guarded by it
+    public static final boolean DEBUG = true;
+
+
+    /**
+     * Construct a new KrillCollection by passing a KrillIndex.
+     *
+     * @param index
+     *            The {@link KrillIndex} object.
+     */
+    public KrillCollectionNew (KrillIndex index) {
+        this.index = index;
+    };
+
+
+    /**
+     * Define the collection by means of a collection builder.
+     *
+     * @param cb
+     *            The builder describing the collection.
+     * @return This collection object for chaining.
+     */
+    public KrillCollectionNew fromBuilder (CollectionBuilderNew.CollectionBuilderInterface cb) {
+        this.cb = cb;
+        return this;
+    };
+
+
+    /**
+     * Get the filter representing the collection.
+     *
+     * @return The filter created by the builder, or <tt>null</tt>
+     *         if no builder was set.
+     */
+    public Filter toFilter () {
+        if (this.cb == null)
+            return null;
+
+        return this.cb.toFilter();
+    };
+
+
+    /**
+     * Serialize the collection filter.
+     *
+     * @return The string representation of the filter, or the
+     *         empty string if there is no filter.
+     */
+    @Override
+    public String toString () {
+        Filter filter = this.toFilter();
+        if (filter == null)
+            return "";
+
+        return filter.toString();
+    };
+
+
+    /**
+     * Create a bit vector of all documents of an index segment
+     * that are part of the collection.
+     * Deleted documents are never part of the result, even if
+     * the collection is negated.
+     *
+     * @param atomic
+     *            The context of the index segment.
+     * @return The bit vector of matching documents, or
+     *         <tt>null</tt> if no filter is defined or if a
+     *         non-negated filter matches no document of the segment.
+     * @throws IOException
+     */
+    public FixedBitSet bits (AtomicReaderContext atomic) throws IOException {
+
+        // No collection defined
+        Filter filter;
+        if (this.cb == null || (filter = this.cb.toFilter()) == null)
+            return null;
+
+        int maxDoc = atomic.reader().maxDoc();
+        Bits liveDocs = atomic.reader().getLiveDocs();
+        boolean negative = this.cb.isNegative();
+
+        // Init vector
+        DocIdSet docids = filter.getDocIdSet(atomic, liveDocs);
+        DocIdSetIterator filterIter = (docids == null) ? null : docids.iterator();
+
+        // Nothing matches and nothing to negate
+        if (filterIter == null && !negative)
+            return null;
+
+        FixedBitSet bitset = new FixedBitSet(maxDoc);
+
+        if (filterIter == null) {
+            // The negation of nothing is everything
+            bitset.set(0, maxDoc);
+        }
+        else {
+            // Or bit set
+            bitset.or(filterIter);
+
+            // Revert for negation
+            if (negative)
+                bitset.flip(0, maxDoc);
+        };
+
+        // Remove deleted docs - setting or flipping the whole range
+        // switches them on again, although the filter respected them
+        removeDeleted(bitset, liveDocs, maxDoc);
+
+        return bitset;
+    };
+
+
+    // Clear all bits of documents that are not alive.
+    // The live docs are only accessed via the Bits interface, as the
+    // concrete class returned by the reader is not guaranteed.
+    private static void removeDeleted (FixedBitSet bitset, Bits liveDocs, int maxDoc) {
+
+        // No deletions in this segment
+        if (liveDocs == null)
+            return;
+
+        for (int doc = 0; doc < maxDoc; doc++) {
+            if (!liveDocs.get(doc))
+                bitset.clear(doc);
+        };
+    };
+
+
+    /**
+     * Search for the number of occurrences of different types,
+     * e.g. <i>documents</i>, <i>sentences</i> etc. in the virtual
+     * collection.
+     * Currently only the type <i>documents</i> is supported,
+     * all other types result in <tt>0</tt>.
+     *
+     * @param field
+     *            The field containing the textual data and the
+     *            annotations as a string.
+     * @param type
+     *            The type of meta information,
+     *            e.g. <i>documents</i> or <i>sentences</i> as a
+     *            string.
+     * @return The number of the occurrences, or <tt>-1</tt>
+     *         if no index is defined.
+     * @throws IOException
+     * @see KrillIndex#numberOf
+     */
+    public long numberOf (String field, String type) throws IOException {
+
+        // No index defined
+        if (this.index == null)
+            return -1L;
+
+        // This is redundant to index stuff
+        // (constant first, so a null type is no failure)
+        if ("documents".equals(type))
+            return this.docCount();
+
+        return 0L;
+        // return this.index.numberOf(this, field, type);
+    };
+
+
+    /**
+     * Count the documents of the index that are part of the
+     * collection by summing up the matches of all index segments.
+     *
+     * @return The number of documents, or <tt>0</tt> if no index
+     *         is defined. If reading the index fails, a warning is
+     *         logged and the count gathered so far is returned.
+     */
+    public long docCount () {
+
+        // No index defined
+        if (this.index == null)
+            return 0L;
+
+        long docCount = 0;
+        try {
+            FixedBitSet bitset;
+            for (AtomicReaderContext atomic : this.index.reader().leaves()) {
+
+                // Segments without matches result in null
+                if ((bitset = this.bits(atomic)) != null)
+                    docCount += bitset.cardinality();
+            };
+        }
+        catch (IOException e) {
+            log.warn(e.getLocalizedMessage(), e);
+        };
+        return docCount;
+    };
+};
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 73a604a..bfdae31 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -419,6 +419,51 @@
/**
+     * Delete documents of the index by passing field information.
+     * The deletion is passed to the index writer as a term-based
+     * delete and counts towards the auto commit threshold.
+     *
+     * @param field
+     *            The meta field name.
+     * @param term
+     *            The meta field term.
+     * @return <tt>true</tt> if the deletion was passed to the
+     *         writer, <tt>false</tt> if one of the parameters is
+     *         <tt>null</tt> or the writer failed.
+     */
+    public boolean delDocs (String field, String term) {
+        if (field == null || term == null)
+            return false;
+
+        try {
+            this.writer().deleteDocuments(new Term(field, term));
+
+            // Commit as soon as the number of uncommitted
+            // changes exceeds the auto commit threshold
+            if (++commitCounter > autoCommit) {
+                this.commit();
+                commitCounter = 0;
+            };
+
+            return true;
+        }
+
+        // Failed to delete documents
+        catch (IOException e) {
+            log.error("Unable to delete documents", e);
+        };
+
+        return false;
+    };
+
+
+    /**
+     * Delete a document of the index by passing a UID.
+     *
+     * @param uid
+     *            The unique identifier of the document.
+     * @return <tt>true</tt> if the deletion was passed on
+     *         successfully, <tt>false</tt> if the identifier is
+     *         <tt>null</tt> or negative, or if the deletion failed.
+     */
+    public boolean delDoc (Integer uid) {
+
+        // Check for null first - unboxing null would fail
+        if (uid == null || uid < 0)
+            return false;
+
+        return this.delDocs("UID", uid.toString());
+    };
+
+
+ /**
* Add a document to the index as a JSON string.
*
* @param json
@@ -574,11 +619,11 @@
};
long docCount = 0;
- int i = 1;
+ // int i = 1;
try {
for (AtomicReaderContext atomic : this.reader().leaves()) {
docCount += collection.bits(atomic).cardinality();
- i++;
+ // i++;
};
}
catch (IOException e) {
diff --git a/src/main/java/de/ids_mannheim/korap/collection/BooleanGroupFilter.java b/src/main/java/de/ids_mannheim/korap/collection/BooleanGroupFilter.java
new file mode 100644
index 0000000..7efcc60
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/collection/BooleanGroupFilter.java
@@ -0,0 +1,231 @@
+package de.ids_mannheim.korap.collection;
+
+import java.io.IOException;
+import java.util.*;
+
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.BitsFilteredDocIdSet;
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.util.Bits;
+
+import de.ids_mannheim.korap.KrillCollection;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A container Filter that allows Boolean composition of Filters
+ * in groups (either or-groups or and-groups).
+ *
+ * @author Nils Diewald
+ *
+ * This filter is roughly based on org.apache.lucene.queries.BooleanFilter.
+ */
+public class BooleanGroupFilter extends Filter {
+    // Group is either an or- or an and-Group
+    private final boolean isOptional;
+
+    // Logger (registered under the KrillCollection category)
+    private final static Logger log = LoggerFactory.getLogger(KrillCollection.class);
+
+    // Debug switch - a constant false value allows the java
+    // compiler to drop all logging statements guarded by it
+    public static final boolean DEBUG = true;
+
+    // Init operands list
+    private final List<GroupFilterOperand> operands = new ArrayList<>(3);
+
+    // Operand in the filter group.
+    // Static, as no access to the enclosing filter is needed.
+    private static final class GroupFilterOperand {
+        public final Filter filter;
+        public final boolean isNegative;
+
+        // Operand has filter and negativity information
+        public GroupFilterOperand (Filter filter, boolean negative) {
+            this.filter = filter;
+            this.isNegative = negative;
+        };
+
+        // Operands are equal, if filter and negativity are equal -
+        // this is the base for the equality of the whole group
+        @Override
+        public boolean equals (Object obj) {
+            if (this == obj)
+                return true;
+
+            if (!(obj instanceof GroupFilterOperand))
+                return false;
+
+            final GroupFilterOperand other = (GroupFilterOperand) obj;
+            return this.isNegative == other.isNegative
+                && Objects.equals(this.filter, other.filter);
+        };
+
+        @Override
+        public int hashCode () {
+            return 31 * Objects.hashCode(this.filter)
+                + (this.isNegative ? 1231 : 1237);
+        };
+    };
+
+    /**
+     * Create a new BooleanGroupFilter.
+     * Accepts a boolean parameter to make it an or-Group
+     * (<pre>true</pre>) or an and-Group (<pre>false</pre>).
+     */
+    public BooleanGroupFilter (boolean optional) {
+        this.isOptional = optional;
+    };
+
+
+    /**
+     * Add an operand to the list of filter operands.
+     * The operand is a positive filter that won't be flipped.
+     */
+    public final void with (Filter filter) {
+        this.operands.add(new GroupFilterOperand(filter, false));
+    };
+
+
+    /**
+     * Add an operand to the list of filter operands.
+     * The operand is a negative filter that will be flipped.
+     */
+    public final void without (Filter filter) {
+        this.operands.add(new GroupFilterOperand(filter, true));
+    };
+
+
+    /**
+     * Two group filters are equal, if they are of the same group
+     * type (or-group or and-group) and have equal operands in
+     * the same order.
+     */
+    @Override
+    public boolean equals (Object obj) {
+        if (this == obj)
+            return true;
+
+        if ((obj == null) || (obj.getClass() != this.getClass()))
+            return false;
+
+        final BooleanGroupFilter other = (BooleanGroupFilter) obj;
+        return this.isOptional == other.isOptional
+            && this.operands.equals(other.operands);
+    };
+
+
+    @Override
+    public int hashCode () {
+        // Consistent with equals: group type and operands
+        return 31 * (657153719 ^ this.operands.hashCode())
+            + (this.isOptional ? 1231 : 1237);
+    };
+
+
+    /**
+     * Serialize the group as <tt>OrGroup(...)</tt> or
+     * <tt>AndGroup(...)</tt> with space separated operands.
+     * Negative operands are prefixed with a minus sign.
+     */
+    @Override
+    public String toString () {
+        StringBuilder buffer = new StringBuilder(
+            this.isOptional ? "OrGroup(" : "AndGroup("
+        );
+        boolean first = true;
+        for (final GroupFilterOperand operand : this.operands) {
+            if (first)
+                first = false;
+            else
+                buffer.append(" ");
+
+            if (operand.isNegative)
+                buffer.append('-');
+
+            buffer.append(operand.filter.toString());
+        };
+        return buffer.append(')').toString();
+    };
+
+
+    /**
+     * Combine the document sets of all operands for one index segment.
+     * Operands are evaluated without accepted docs; the accepted
+     * docs are applied once to the combined result.
+     *
+     * @param context
+     *            The context of the index segment.
+     * @param acceptDocs
+     *            The documents allowed to match (may be <tt>null</tt>).
+     * @return The set of matching documents, or <tt>null</tt> if a
+     *         positive operand of an and-group matches no document.
+     * @throws IOException
+     */
+    @Override
+    public DocIdSet getDocIdSet (AtomicReaderContext context, Bits acceptDocs) throws IOException {
+        final AtomicReader reader = context.reader();
+        int maxDoc = reader.maxDoc();
+        FixedBitSet bitset = new FixedBitSet(maxDoc);
+        FixedBitSet combinator = new FixedBitSet(maxDoc);
+
+        // As long as no operand was applied, the bitset is uninitialized
+        boolean init = true;
+
+        if (DEBUG)
+            log.debug("Start trying to filter on bitset of length {}", maxDoc);
+
+        for (final GroupFilterOperand operand : this.operands) {
+            final DocIdSet docids = operand.filter.getDocIdSet(context, null);
+            final DocIdSetIterator filterIter = (docids == null) ? null : docids.iterator();
+
+            if (DEBUG)
+                log.debug("> Filter to bitset of {} ({} negative)",
+                          operand.filter.toString(),
+                          operand.isNegative);
+
+            // Filter resulted in no docs
+            if (filterIter == null) {
+
+                if (DEBUG) log.debug("- Filter is null");
+
+                // The negation of nothing matches everything
+                if (operand.isNegative) {
+
+                    // This means, everything is allowed
+                    if (this.isOptional) {
+
+                        // Everything is allowed
+                        if (DEBUG) log.debug("- Filter to allow all documents");
+
+                        bitset.set(0, maxDoc);
+                        return BitsFilteredDocIdSet.wrap(bitset, acceptDocs);
+                    };
+
+                    // In an and-group the intersection with everything
+                    // changes nothing - but if this is the first operand,
+                    // the bitset has to start with everything
+                    if (init) {
+                        bitset.set(0, maxDoc);
+                        init = false;
+                    };
+
+                    if (DEBUG) log.debug("- Filter is ignorable");
+                    continue;
+                }
+
+                // The result is unimportant
+                else if (this.isOptional) {
+                    if (DEBUG) log.debug("- Filter is ignorable");
+                    continue;
+                };
+
+                // There is no possible match
+                if (DEBUG) log.debug("- Filter to allow no documents (2)");
+                return null;
+            }
+
+            // Initialize bitset
+            else if (init) {
+
+                bitset.or(filterIter);
+
+                if (DEBUG) log.debug("- Filter is inial with card {}", bitset.cardinality());
+
+                // Flip the matching documents
+                if (operand.isNegative) {
+                    bitset.flip(0, maxDoc);
+                    if (DEBUG) log.debug("- Filter is negative - so flipped to card {} (1)", bitset.cardinality());
+                };
+
+                init = false;
+            }
+            else {
+
+                if (DEBUG) log.debug("- Filter is fine and operating");
+
+                // Operator is negative and needs to be flipped
+                if (operand.isNegative) {
+                    if (this.isOptional) {
+                        if (DEBUG) log.debug("- Filter is negative optional");
+
+                        // Negative or ... may be slow
+                        // (the complement is built in a scratch bitset)
+                        combinator.or(filterIter);
+                        combinator.flip(0, maxDoc);
+
+                        if (DEBUG) log.debug("- Filter is negative - so flipped to card {} (2)", combinator.cardinality());
+
+                        bitset.or(combinator);
+
+                        // Reset the scratch bitset for the next operand
+                        combinator.clear(0, maxDoc);
+                    }
+
+                    // Negative and
+                    else {
+                        if (DEBUG) log.debug("- Filter is negative not optional");
+                        bitset.andNot(filterIter);
+                        if (DEBUG) log.debug("- Filter is negative - so andNotted");
+                    }
+                }
+                else if (this.isOptional) {
+                    if (DEBUG) log.debug("- Filter is simply optional");
+                    bitset.or(filterIter);
+                }
+                else {
+                    if (DEBUG) log.debug("- Filter is simply not optional");
+                    bitset.and(filterIter);
+                    // TODO: Check with nextSetBit() if the filter is not applicable
+                };
+
+                if (DEBUG) log.debug("- Subresult has card {} ", bitset.cardinality());
+            };
+        };
+
+        // Apply the accepted docs (e.g. to remove deleted docs)
+        return BitsFilteredDocIdSet.wrap(bitset, acceptDocs);
+    };
+};
diff --git a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilderNew.java b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilderNew.java
index 06bafd4..8b2ff4d 100644
--- a/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilderNew.java
+++ b/src/main/java/de/ids_mannheim/korap/collection/CollectionBuilderNew.java
@@ -2,8 +2,10 @@
import java.util.*;
import java.io.IOException;
+// TEMPORARY:
import org.apache.lucene.queries.BooleanFilter;
import org.apache.lucene.search.BooleanClause;
+
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.TermsFilter;
import org.apache.lucene.search.*;
@@ -14,6 +16,7 @@
import org.slf4j.LoggerFactory;
import de.ids_mannheim.korap.KrillCollection;
+import de.ids_mannheim.korap.collection.BooleanGroupFilter;
public class CollectionBuilderNew {
@@ -144,7 +147,6 @@
return this.isOptional;
};
-
private ArrayList<CollectionBuilderInterface> operands;
public CollectionBuilderGroup (boolean optional) {
@@ -159,28 +161,26 @@
return this;
};
- public Filter toFilter () {
+ public Filter toFilter () {
if (this.operands == null || this.operands.isEmpty())
return null;
if (this.operands.size() == 1)
return this.operands.get(0).toFilter();
- BooleanFilter bool = new BooleanFilter();
+ // BooleanFilter bool = new BooleanFilter();
+ BooleanGroupFilter bool = new BooleanGroupFilter(this.isOptional);
Iterator<CollectionBuilderInterface> i = this.operands.iterator();
while (i.hasNext()) {
CollectionBuilderInterface cb = i.next();
if (cb.isNegative()) {
- bool.add(cb.toFilter(), BooleanClause.Occur.MUST_NOT);
- }
- else if (this.isOptional()) {
- bool.add(cb.toFilter(), BooleanClause.Occur.SHOULD);
+ bool.without(cb.toFilter());
}
else {
- bool.add(cb.toFilter(), BooleanClause.Occur.MUST);
- }
+ bool.with(cb.toFilter());
+ };
};
return bool;
diff --git a/src/main/resources/log4j.properties b/src/main/resources/log4j.properties
index 5fed33b..eacf929 100644
--- a/src/main/resources/log4j.properties
+++ b/src/main/resources/log4j.properties
@@ -1,4 +1,4 @@
-log4j.rootLogger = ERROR, stdout
+# log4j.rootLogger = ERROR, stdout
# Queries:
# log4j.logger.de.ids_mannheim.korap.query.SpanNextQuery = TRACE, stdout
@@ -21,7 +21,7 @@
# Collections:
# log4j.logger.de.ids_mannheim.korap.collection.Filter = TRACE, stdout
-# log4j.logger.de.ids_mannheim.korap.KrillCollection = TRACE, stdout
+log4j.logger.de.ids_mannheim.korap.KrillCollection = TRACE, stdout
# Responses:
# log4j.logger.de.ids_mannheim.korap.server.Node = TRACE, stdout