| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap; |
| 2 | |
| 3 | import java.util.*; |
| 4 | import java.io.IOException; |
| 5 | import org.apache.lucene.search.QueryWrapperFilter; |
| 6 | import org.apache.lucene.search.NumericRangeFilter; |
| 7 | import org.apache.lucene.search.Filter; |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 8 | |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 9 | import de.ids_mannheim.korap.KorapIndex; |
| 10 | import de.ids_mannheim.korap.KorapResult; |
| 11 | import de.ids_mannheim.korap.KorapFilter; |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 12 | |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 13 | import de.ids_mannheim.korap.util.KorapDate; |
| Nils Diewald | b1c3b65 | 2013-12-28 22:47:00 +0000 | [diff] [blame] | 14 | import de.ids_mannheim.korap.util.QueryException; |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 15 | import de.ids_mannheim.korap.filter.BooleanFilter; |
| Nils Diewald | 5def8bc | 2013-11-28 19:26:54 +0000 | [diff] [blame] | 16 | import de.ids_mannheim.korap.filter.FilterOperation; |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 17 | import org.apache.lucene.search.spans.SpanQuery; |
| 18 | import org.apache.lucene.search.Query; |
| 19 | import org.apache.lucene.search.FilteredQuery; |
| 20 | import org.apache.lucene.index.AtomicReaderContext; |
| 21 | import org.apache.lucene.util.FixedBitSet; |
| 22 | import org.apache.lucene.util.Bits; |
| 23 | import org.apache.lucene.search.DocIdSetIterator; |
| 24 | import org.apache.lucene.search.DocIdSet; |
| 25 | |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 26 | import com.fasterxml.jackson.databind.ObjectMapper; |
| 27 | import com.fasterxml.jackson.databind.JsonNode; |
| 28 | |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 29 | import org.slf4j.Logger; |
| 30 | import org.slf4j.LoggerFactory; |
| 31 | |
| Nils Diewald | 5def8bc | 2013-11-28 19:26:54 +0000 | [diff] [blame] | 32 | // TODO: Make a cache for the bits!!! DELETE IT IN CASE OF AN EXTENSION OR A FILTER! |
| 33 | |
| Nils Diewald | 2cd1c3d | 2014-01-08 22:53:08 +0000 | [diff] [blame] | 34 | // TODO: Maybe a constantScoreQuery can make things faster? |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 35 | |
| 36 | // accepts as first parameter the index |
| 37 | // THIS MAY CHANGE for stuff like combining virtual collections |
| 38 | // See http://mail-archives.apache.org/mod_mbox/lucene-java-user/200805.mbox/%3C17080852.post@talk.nabble.com%3E |
| 39 | |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 40 | public class KorapCollection { |
| 41 | private KorapIndex index; |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 42 | private KorapDate created; |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 43 | private String id; |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 44 | private String error; |
| Nils Diewald | 5def8bc | 2013-11-28 19:26:54 +0000 | [diff] [blame] | 45 | private ArrayList<FilterOperation> filter; |
| Nils Diewald | dfb21ea | 2013-11-21 14:26:47 +0000 | [diff] [blame] | 46 | private int filterCount = 0; |
| 47 | |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 48 | // Logger |
| 49 | private final static Logger log = LoggerFactory.getLogger(KorapCollection.class); |
| 50 | |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 51 | // user? |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 52 | public KorapCollection (KorapIndex ki) { |
| 53 | this.index = ki; |
| Nils Diewald | 5def8bc | 2013-11-28 19:26:54 +0000 | [diff] [blame] | 54 | this.filter = new ArrayList<FilterOperation>(5); |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 55 | }; |
| 56 | |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 57 | public KorapCollection (String jsonString) { |
| 58 | this.filter = new ArrayList<FilterOperation>(5); |
| 59 | ObjectMapper mapper = new ObjectMapper(); |
| 60 | try { |
| 61 | JsonNode json = mapper.readValue(jsonString, JsonNode.class); |
| Nils Diewald | 23417e8 | 2014-02-12 18:33:24 +0000 | [diff] [blame] | 62 | if (json.has("collections")) { |
| Nils Diewald | b1c3b65 | 2013-12-28 22:47:00 +0000 | [diff] [blame] | 63 | log.trace("Add meta collection"); |
| Nils Diewald | 23417e8 | 2014-02-12 18:33:24 +0000 | [diff] [blame] | 64 | for (JsonNode collection : json.get("collections")) { |
| 65 | this.fromJSON(collection); |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 66 | }; |
| 67 | }; |
| 68 | } |
| 69 | catch (Exception e) { |
| 70 | this.error = e.getMessage(); |
| 71 | }; |
| 72 | }; |
| 73 | |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 74 | public KorapCollection () { |
| 75 | this.filter = new ArrayList<FilterOperation>(5); |
| 76 | }; |
| 77 | |
| Nils Diewald | b1c3b65 | 2013-12-28 22:47:00 +0000 | [diff] [blame] | 78 | public void fromJSON(JsonNode json) throws QueryException { |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 79 | String type = json.get("@type").asText(); |
| 80 | |
| 81 | if (type.equals("korap:meta-filter")) { |
| Nils Diewald | b1c3b65 | 2013-12-28 22:47:00 +0000 | [diff] [blame] | 82 | log.trace("Add Filter"); |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 83 | this.filter(new BooleanFilter(json.get("@value"))); |
| 84 | } |
| 85 | else if (type.equals("korap:meta-extend")) { |
| Nils Diewald | b1c3b65 | 2013-12-28 22:47:00 +0000 | [diff] [blame] | 86 | log.trace("Add Extend"); |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 87 | this.extend(new BooleanFilter(json.get("@value"))); |
| 88 | }; |
| 89 | }; |
| 90 | |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 91 | public int getCount() { |
| 92 | return this.filterCount; |
| 93 | }; |
| 94 | |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 95 | public void setIndex (KorapIndex ki) { |
| 96 | this.index = ki; |
| 97 | }; |
| 98 | |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 99 | // The checks asre not necessary |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 100 | public KorapCollection filter (BooleanFilter filter) { |
| Nils Diewald | 9f31083 | 2013-12-06 22:38:55 +0000 | [diff] [blame] | 101 | log.trace("Added filter: {}", filter.toString()); |
| 102 | if (filter == null) { |
| 103 | log.warn("No filter is given"); |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 104 | return this; |
| Nils Diewald | 9f31083 | 2013-12-06 22:38:55 +0000 | [diff] [blame] | 105 | }; |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 106 | Filter f = (Filter) new QueryWrapperFilter(filter.toQuery()); |
| Nils Diewald | 9f31083 | 2013-12-06 22:38:55 +0000 | [diff] [blame] | 107 | if (f == null) { |
| 108 | log.warn("Filter can't be wrapped"); |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 109 | return this; |
| Nils Diewald | 9f31083 | 2013-12-06 22:38:55 +0000 | [diff] [blame] | 110 | }; |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 111 | FilterOperation fo = new FilterOperation(f,false); |
| Nils Diewald | 9f31083 | 2013-12-06 22:38:55 +0000 | [diff] [blame] | 112 | if (fo == null) { |
| 113 | log.warn("Filter operation invalid"); |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 114 | return this; |
| Nils Diewald | 9f31083 | 2013-12-06 22:38:55 +0000 | [diff] [blame] | 115 | }; |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 116 | this.filter.add(fo); |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 117 | this.filterCount++; |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 118 | return this; |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 119 | }; |
| 120 | |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 121 | public KorapCollection extend (BooleanFilter filter) { |
| Nils Diewald | 9f31083 | 2013-12-06 22:38:55 +0000 | [diff] [blame] | 122 | log.trace("Added extension: {}", filter.toString()); |
| Nils Diewald | 5def8bc | 2013-11-28 19:26:54 +0000 | [diff] [blame] | 123 | this.filter.add( |
| 124 | new FilterOperation( |
| Nils Diewald | 9cc86fe | 2013-12-07 17:45:59 +0000 | [diff] [blame] | 125 | (Filter) new QueryWrapperFilter(filter.toQuery()), |
| Nils Diewald | 5def8bc | 2013-11-28 19:26:54 +0000 | [diff] [blame] | 126 | true |
| 127 | ) |
| 128 | ); |
| 129 | this.filterCount++; |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 130 | return this; |
| Nils Diewald | 5def8bc | 2013-11-28 19:26:54 +0000 | [diff] [blame] | 131 | }; |
| 132 | |
| 133 | public ArrayList<FilterOperation> getFilters () { |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 134 | return this.filter; |
| 135 | }; |
| 136 | |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 137 | public FilterOperation getFilter (int i) { |
| 138 | return this.filter.get(i); |
| 139 | }; |
| 140 | |
| 141 | |
| Nils Diewald | b1c3b65 | 2013-12-28 22:47:00 +0000 | [diff] [blame] | 142 | public String toString () { |
| 143 | StringBuffer sb = new StringBuffer(); |
| 144 | for (FilterOperation fo : this.filter) { |
| 145 | sb.append(fo.toString()).append("; "); |
| 146 | }; |
| 147 | return sb.toString(); |
| 148 | }; |
| 149 | |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 150 | // DEPRECATED BUT USED IN TEST CASES |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 151 | public KorapResult search (SpanQuery query) { |
| Nils Diewald | 3ef9a47 | 2013-12-02 16:06:09 +0000 | [diff] [blame] | 152 | return this.index.search(this, query, 0, (short) 20, true, (short) 5, true, (short) 5); |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 153 | }; |
| 154 | |
| Nils Diewald | dfb21ea | 2013-11-21 14:26:47 +0000 | [diff] [blame] | 155 | public FixedBitSet bits (AtomicReaderContext atomic) throws IOException { |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 156 | |
| 157 | /* |
| Nils Diewald | 41e58f8 | 2013-11-20 20:30:15 +0000 | [diff] [blame] | 158 | Use Bits.MatchAllBits(int len) |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 159 | */ |
| 160 | |
| Nils Diewald | dfb21ea | 2013-11-21 14:26:47 +0000 | [diff] [blame] | 161 | boolean noDoc = true; |
| 162 | FixedBitSet bitset; |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 163 | |
| 164 | if (this.filterCount > 0) { |
| Nils Diewald | dfb21ea | 2013-11-21 14:26:47 +0000 | [diff] [blame] | 165 | bitset = new FixedBitSet(atomic.reader().numDocs()); |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 166 | |
| Nils Diewald | 5def8bc | 2013-11-28 19:26:54 +0000 | [diff] [blame] | 167 | ArrayList<FilterOperation> filters = (ArrayList<FilterOperation>) this.filter.clone(); |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 168 | |
| Nils Diewald | 22efd2d | 2013-11-29 22:54:24 +0000 | [diff] [blame] | 169 | FilterOperation kcInit = filters.remove(0); |
| 170 | log.trace("FILTER: {}", kcInit); |
| 171 | |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 172 | // Init vector |
| Nils Diewald | 22efd2d | 2013-11-29 22:54:24 +0000 | [diff] [blame] | 173 | DocIdSet docids = kcInit.filter.getDocIdSet(atomic, null); |
| Nils Diewald | 9cc86fe | 2013-12-07 17:45:59 +0000 | [diff] [blame] | 174 | |
| Nils Diewald | dfb21ea | 2013-11-21 14:26:47 +0000 | [diff] [blame] | 175 | DocIdSetIterator filterIter = docids.iterator(); |
| 176 | |
| 177 | if (filterIter != null) { |
| Nils Diewald | 22efd2d | 2013-11-29 22:54:24 +0000 | [diff] [blame] | 178 | log.trace("InitFilter has effect"); |
| Nils Diewald | 9cc86fe | 2013-12-07 17:45:59 +0000 | [diff] [blame] | 179 | // System.err.println("Init has an effect"); |
| Nils Diewald | dfb21ea | 2013-11-21 14:26:47 +0000 | [diff] [blame] | 180 | bitset.or(filterIter); |
| 181 | noDoc = false; |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 182 | }; |
| 183 | |
| Nils Diewald | 41e58f8 | 2013-11-20 20:30:15 +0000 | [diff] [blame] | 184 | if (!noDoc) { |
| Nils Diewald | 5def8bc | 2013-11-28 19:26:54 +0000 | [diff] [blame] | 185 | for (FilterOperation kc : filters) { |
| Nils Diewald | 41e58f8 | 2013-11-20 20:30:15 +0000 | [diff] [blame] | 186 | log.trace("FILTER: {}", kc); |
| Nils Diewald | 5def8bc | 2013-11-28 19:26:54 +0000 | [diff] [blame] | 187 | |
| 188 | // BUG!!! |
| 189 | docids = kc.filter.getDocIdSet(atomic, kc.isExtension() ? null : bitset); |
| Nils Diewald | dfb21ea | 2013-11-21 14:26:47 +0000 | [diff] [blame] | 190 | filterIter = docids.iterator(); |
| Nils Diewald | 5def8bc | 2013-11-28 19:26:54 +0000 | [diff] [blame] | 191 | |
| Nils Diewald | 41e58f8 | 2013-11-20 20:30:15 +0000 | [diff] [blame] | 192 | if (filterIter == null) { |
| 193 | // There must be a better way ... |
| Nils Diewald | 5def8bc | 2013-11-28 19:26:54 +0000 | [diff] [blame] | 194 | if (kc.isFilter()) { |
| 195 | bitset.clear(0, bitset.length()); |
| 196 | noDoc = true; |
| Nils Diewald | 9cc86fe | 2013-12-07 17:45:59 +0000 | [diff] [blame] | 197 | } |
| 198 | else { |
| 199 | // System.err.println("No term found"); |
| Nils Diewald | 5def8bc | 2013-11-28 19:26:54 +0000 | [diff] [blame] | 200 | }; |
| 201 | continue; |
| Nils Diewald | 41e58f8 | 2013-11-20 20:30:15 +0000 | [diff] [blame] | 202 | }; |
| Nils Diewald | 5def8bc | 2013-11-28 19:26:54 +0000 | [diff] [blame] | 203 | if (kc.isExtension()) { |
| Nils Diewald | 9cc86fe | 2013-12-07 17:45:59 +0000 | [diff] [blame] | 204 | // System.err.println("Term found!"); |
| 205 | // log.trace("Extend filter"); |
| 206 | // System.err.println("Old Card:" + bitset.cardinality()); |
| Nils Diewald | 5def8bc | 2013-11-28 19:26:54 +0000 | [diff] [blame] | 207 | bitset.or(filterIter); |
| Nils Diewald | 9cc86fe | 2013-12-07 17:45:59 +0000 | [diff] [blame] | 208 | // System.err.println("New Card:" + bitset.cardinality()); |
| Nils Diewald | 5def8bc | 2013-11-28 19:26:54 +0000 | [diff] [blame] | 209 | } |
| 210 | else { |
| 211 | bitset.and(filterIter); |
| 212 | }; |
| Nils Diewald | 41e58f8 | 2013-11-20 20:30:15 +0000 | [diff] [blame] | 213 | }; |
| Nils Diewald | dfb21ea | 2013-11-21 14:26:47 +0000 | [diff] [blame] | 214 | |
| 215 | if (!noDoc) { |
| 216 | FixedBitSet livedocs = (FixedBitSet) atomic.reader().getLiveDocs(); |
| 217 | if (livedocs != null) { |
| 218 | bitset.and(livedocs); |
| 219 | }; |
| 220 | }; |
| 221 | } |
| 222 | else { |
| 223 | return bitset; |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 224 | }; |
| Nils Diewald | dfb21ea | 2013-11-21 14:26:47 +0000 | [diff] [blame] | 225 | } |
| 226 | else { |
| 227 | bitset = (FixedBitSet) atomic.reader().getLiveDocs(); |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 228 | }; |
| 229 | |
| 230 | return bitset; |
| 231 | }; |
| 232 | |
| Nils Diewald | dfb21ea | 2013-11-21 14:26:47 +0000 | [diff] [blame] | 233 | public long numberOf (String foundry, String type) throws IOException { |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 234 | if (this.index == null) |
| 235 | return (long) 0; |
| 236 | |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 237 | return this.index.numberOf(this, foundry, type); |
| 238 | }; |
| 239 | |
| Nils Diewald | dfb21ea | 2013-11-21 14:26:47 +0000 | [diff] [blame] | 240 | public long numberOf (String type) throws IOException { |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 241 | if (this.index == null) |
| 242 | return (long) 0; |
| 243 | |
| Nils Diewald | dfb21ea | 2013-11-21 14:26:47 +0000 | [diff] [blame] | 244 | return this.index.numberOf(this, "tokens", type); |
| 245 | }; |
| 246 | |
| Nils Diewald | b1c3b65 | 2013-12-28 22:47:00 +0000 | [diff] [blame] | 247 | public String getError () { |
| 248 | return this.error; |
| 249 | }; |
| 250 | |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 251 | // implement "till" with rangefilter |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 252 | }; |