| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 1 | package de.ids_mannheim.korap.collection; |
| 2 | |
| 3 | import java.io.IOException; |
| 4 | import java.util.*; |
| 5 | |
| 6 | import org.apache.lucene.search.Filter; |
| 7 | import org.apache.lucene.util.FixedBitSet; |
| 8 | import org.apache.lucene.search.DocIdSet; |
| 9 | import org.apache.lucene.search.DocIdSetIterator; |
| 10 | import org.apache.lucene.search.BitsFilteredDocIdSet; |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 11 | import org.apache.lucene.index.LeafReaderContext; |
| 12 | import org.apache.lucene.index.LeafReader; |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 13 | import org.apache.lucene.util.Bits; |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 14 | import org.apache.lucene.util.BitDocIdSet; |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 15 | |
| 16 | import de.ids_mannheim.korap.KrillCollection; |
| 17 | |
| 18 | import org.slf4j.Logger; |
| 19 | import org.slf4j.LoggerFactory; |
| 20 | |
| 21 | /** |
| 22 | * A container Filter that allows Boolean composition of Filters |
| 23 | * in groups (either or-groups or and-groups). |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 24 | * |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 25 | * @author Nils Diewald |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 26 | * |
| 27 | * This filter is roughly based on |
| 28 | * org.apache.lucene.queries.BooleanFilter. |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 29 | */ |
| 30 | public class BooleanGroupFilter extends Filter { |
| 31 | // Group is either an or- or an and-Group |
| 32 | private boolean isOptional; |
| 33 | |
| 34 | // Logger |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 35 | private final static Logger log = LoggerFactory |
| 36 | .getLogger(KrillCollection.class); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 37 | |
| 38 | // This advices the java compiler to ignore all loggings |
| Akron | 1d63f27 | 2015-07-28 12:19:49 +0200 | [diff] [blame] | 39 | public static final boolean DEBUG = false; |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 40 | |
| 41 | // Init operands list |
| 42 | private final List<GroupFilterOperand> operands = new ArrayList<>(3); |
| 43 | |
| 44 | // Operand in the filter group |
| 45 | private class GroupFilterOperand { |
| 46 | public Filter filter; |
| 47 | public boolean isNegative; |
| 48 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 49 | |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 50 | // Operand has filter and negativity information |
| 51 | public GroupFilterOperand (Filter filter, boolean negative) { |
| 52 | this.filter = filter; |
| 53 | this.isNegative = negative; |
| 54 | }; |
| 55 | }; |
| 56 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 57 | |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 58 | /** |
| 59 | * Create a new BooleanGroupFilter. |
| 60 | * Accepts a boolean parameter to make it an or-Group |
| 61 | * (<pre>true</pre>) or an and-Group (<pre>true</pre>). |
| 62 | */ |
| 63 | public BooleanGroupFilter (boolean optional) { |
| 64 | this.isOptional = optional; |
| 65 | }; |
| 66 | |
| 67 | |
| 68 | /** |
| 69 | * Add an operand to the list of filter operands. |
| 70 | * The operand is a positive filter that won't be flipped. |
| 71 | */ |
| 72 | public final void with (Filter filter) { |
| 73 | this.operands.add(new GroupFilterOperand(filter, false)); |
| 74 | }; |
| 75 | |
| 76 | |
| 77 | /** |
| 78 | * Add an operand to the list of filter operands. |
| 79 | * The operand is a negative filter that will be flipped. |
| 80 | */ |
| 81 | public final void without (Filter filter) { |
| 82 | this.operands.add(new GroupFilterOperand(filter, true)); |
| 83 | }; |
| 84 | |
| 85 | |
| 86 | @Override |
| 87 | public boolean equals (Object obj) { |
| 88 | if (this == obj) |
| 89 | return true; |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 90 | |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 91 | if ((obj == null) || (obj.getClass() != this.getClass())) |
| 92 | return false; |
| 93 | |
| 94 | final BooleanGroupFilter other = (BooleanGroupFilter) obj; |
| 95 | return operands.equals(other.operands); |
| 96 | }; |
| 97 | |
| 98 | |
| 99 | @Override |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 100 | public int hashCode () { |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 101 | return 657153719 ^ operands.hashCode(); |
| 102 | }; |
| 103 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 104 | |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 105 | @Override |
| 106 | public String toString () { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 107 | StringBuilder buffer = new StringBuilder( |
| 108 | this.isOptional ? "OrGroup(" : "AndGroup("); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 109 | boolean first = true; |
| 110 | for (final GroupFilterOperand operand : this.operands) { |
| 111 | if (first) |
| 112 | first = false; |
| 113 | else |
| 114 | buffer.append(" "); |
| 115 | |
| 116 | if (operand.isNegative) |
| 117 | buffer.append('-'); |
| 118 | |
| 119 | buffer.append(operand.filter.toString()); |
| 120 | }; |
| 121 | return buffer.append(')').toString(); |
| 122 | }; |
| 123 | |
| Akron | 4299355 | 2016-02-04 13:24:24 +0100 | [diff] [blame] | 124 | |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 125 | /* |
| 126 | @Override |
| 127 | public String toString (String str) { |
| 128 | return this.toString(); |
| 129 | }; |
| 130 | */ |
| 131 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 132 | |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 133 | @Override |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 134 | public DocIdSet getDocIdSet (LeafReaderContext context, Bits acceptDocs) |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 135 | throws IOException { |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 136 | final LeafReader reader = context.reader(); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 137 | int maxDoc = reader.maxDoc(); |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 138 | FixedBitSet bitset = new FixedBitSet(maxDoc); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 139 | FixedBitSet combinator = new FixedBitSet(maxDoc); |
| 140 | boolean init = true; |
| 141 | |
| 142 | if (DEBUG) |
| 143 | log.debug("Start trying to filter on bitset of length {}", maxDoc); |
| 144 | |
| 145 | for (final GroupFilterOperand operand : this.operands) { |
| 146 | final DocIdSet docids = operand.filter.getDocIdSet(context, null); |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 147 | final DocIdSetIterator filterIter = (docids == null) ? null |
| 148 | : docids.iterator(); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 149 | |
| 150 | if (DEBUG) |
| 151 | log.debug("> Filter to bitset of {} ({} negative)", |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 152 | operand.filter.toString(), operand.isNegative); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 153 | |
| 154 | // Filter resulted in no docs |
| 155 | if (filterIter == null) { |
| 156 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 157 | if (DEBUG) |
| 158 | log.debug("- Filter is null"); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 159 | |
| 160 | // Filter matches |
| 161 | if (operand.isNegative) { |
| 162 | |
| 163 | // This means, everything is allowed |
| 164 | if (this.isOptional) { |
| 165 | |
| 166 | // Everything is allowed |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 167 | if (DEBUG) |
| 168 | log.debug("- Filter to allow all documents"); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 169 | |
| 170 | bitset.set(0, maxDoc); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 171 | return BitsFilteredDocIdSet |
| 172 | .wrap(new BitDocIdSet(bitset), acceptDocs); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 173 | }; |
| 174 | |
| 175 | // There is no possible match |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 176 | if (DEBUG) |
| 177 | log.debug("- Filter to allow no documents (1)"); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 178 | return null; |
| 179 | } |
| 180 | |
| 181 | // The result is unimportant |
| 182 | else if (this.isOptional) { |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 183 | if (DEBUG) |
| 184 | log.debug("- Filter is ignorable"); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 185 | continue; |
| 186 | }; |
| 187 | |
| 188 | // There is no possible match |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 189 | if (DEBUG) |
| 190 | log.debug("- Filter to allow no documents (2)"); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 191 | return null; |
| 192 | } |
| 193 | |
| 194 | // Initialize bitset |
| 195 | else if (init) { |
| 196 | |
| 197 | bitset.or(filterIter); |
| 198 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 199 | if (DEBUG) |
| 200 | log.debug("- Filter is inial with card {}", |
| 201 | bitset.cardinality()); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 202 | |
| 203 | // Flip the matching documents |
| 204 | if (operand.isNegative) { |
| 205 | bitset.flip(0, maxDoc); |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 206 | if (DEBUG) |
| 207 | log.debug( |
| 208 | "- Filter is negative - so flipped to card {} (1)", |
| 209 | bitset.cardinality()); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 210 | }; |
| 211 | |
| 212 | init = false; |
| 213 | } |
| 214 | else { |
| 215 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 216 | if (DEBUG) |
| 217 | log.debug("- Filter is fine and operating"); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 218 | |
| 219 | // Operator is negative and needs to be flipped |
| 220 | if (operand.isNegative) { |
| 221 | if (this.isOptional) { |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 222 | if (DEBUG) |
| 223 | log.debug("- Filter is negative optional"); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 224 | |
| 225 | // Negative or ... may be slow |
| 226 | combinator.or(filterIter); |
| 227 | combinator.flip(0, maxDoc); |
| 228 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 229 | if (DEBUG) |
| 230 | log.debug( |
| 231 | "- Filter is negative - so flipped to card {} (2)", |
| 232 | combinator.cardinality()); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 233 | |
| 234 | bitset.or(combinator); |
| 235 | combinator.clear(0, maxDoc); |
| 236 | } |
| 237 | |
| 238 | // Negative and |
| 239 | else { |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 240 | if (DEBUG) |
| 241 | log.debug("- Filter is negative not optional"); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 242 | bitset.andNot(filterIter); |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 243 | if (DEBUG) |
| 244 | log.debug("- Filter is negative - so andNotted"); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 245 | } |
| 246 | } |
| 247 | else if (this.isOptional) { |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 248 | if (DEBUG) |
| 249 | log.debug("- Filter is simply optional"); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 250 | bitset.or(filterIter); |
| 251 | } |
| 252 | else { |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 253 | if (DEBUG) |
| 254 | log.debug("- Filter is simply not optional"); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 255 | bitset.and(filterIter); |
| 256 | // TODO: Check with nextSetBit() if the filter is not applicable |
| 257 | }; |
| 258 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 259 | if (DEBUG) |
| 260 | log.debug("- Subresult has card {} ", bitset.cardinality()); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 261 | }; |
| 262 | }; |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 263 | return BitsFilteredDocIdSet.wrap(new BitDocIdSet(bitset), acceptDocs); |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 264 | }; |
| 265 | }; |