blob: fdb0ff37a8181e272f45431bbf6ccd4f43b20a2d [file] [log] [blame]
Nils Diewaldbaf68c52013-11-20 13:22:19 +00001package de.ids_mannheim.korap;
2
3import java.util.*;
4import java.io.IOException;
5import org.apache.lucene.search.QueryWrapperFilter;
6import org.apache.lucene.search.NumericRangeFilter;
7import org.apache.lucene.search.Filter;
Nils Diewaldc925b492013-12-03 23:56:10 +00008
Nils Diewaldbaf68c52013-11-20 13:22:19 +00009import de.ids_mannheim.korap.KorapIndex;
10import de.ids_mannheim.korap.KorapResult;
11import de.ids_mannheim.korap.KorapFilter;
Nils Diewaldc925b492013-12-03 23:56:10 +000012
Nils Diewaldbaf68c52013-11-20 13:22:19 +000013import de.ids_mannheim.korap.util.KorapDate;
Nils Diewaldb1c3b652013-12-28 22:47:00 +000014import de.ids_mannheim.korap.util.QueryException;
Nils Diewaldbaf68c52013-11-20 13:22:19 +000015import de.ids_mannheim.korap.filter.BooleanFilter;
Nils Diewald5def8bc2013-11-28 19:26:54 +000016import de.ids_mannheim.korap.filter.FilterOperation;
Nils Diewaldbaf68c52013-11-20 13:22:19 +000017import org.apache.lucene.search.spans.SpanQuery;
18import org.apache.lucene.search.Query;
19import org.apache.lucene.search.FilteredQuery;
20import org.apache.lucene.index.AtomicReaderContext;
21import org.apache.lucene.util.FixedBitSet;
22import org.apache.lucene.util.Bits;
23import org.apache.lucene.search.DocIdSetIterator;
24import org.apache.lucene.search.DocIdSet;
25
Nils Diewald01b4ce32013-12-05 22:39:25 +000026import com.fasterxml.jackson.databind.ObjectMapper;
27import com.fasterxml.jackson.databind.JsonNode;
28
Nils Diewald2276e1c2014-04-10 15:01:59 +000029import java.io.StringWriter;
30
Nils Diewaldbaf68c52013-11-20 13:22:19 +000031import org.slf4j.Logger;
32import org.slf4j.LoggerFactory;
33
// TODO: Make a cache for the bits!!! DELETE IT IN CASE OF AN EXTENSION OR A FILTER!
// TODO: Maybe use RandomAccessFilterStrategy
// TODO: Maybe a ConstantScoreQuery can make things faster?

// Accepts the index as first parameter.
// THIS MAY CHANGE for stuff like combining virtual collections
// See http://mail-archives.apache.org/mod_mbox/lucene-java-user/200805.mbox/%3C17080852.post@talk.nabble.com%3E
41
Nils Diewaldbaf68c52013-11-20 13:22:19 +000042public class KorapCollection {
43 private KorapIndex index;
Nils Diewaldbaf68c52013-11-20 13:22:19 +000044 private KorapDate created;
Nils Diewaldc925b492013-12-03 23:56:10 +000045 private String id;
Nils Diewald01b4ce32013-12-05 22:39:25 +000046 private String error;
Nils Diewald5def8bc2013-11-28 19:26:54 +000047 private ArrayList<FilterOperation> filter;
Nils Diewalddfb21ea2013-11-21 14:26:47 +000048 private int filterCount = 0;
49
Nils Diewaldbaf68c52013-11-20 13:22:19 +000050 // Logger
51 private final static Logger log = LoggerFactory.getLogger(KorapCollection.class);
52
Nils Diewaldbaf68c52013-11-20 13:22:19 +000053 // user?
/**
 * Create an empty collection that is attached to an index.
 *
 * @param ki the index used by search and numberOf
 */
public KorapCollection (KorapIndex ki) {
    this.filter = new ArrayList<>(5);
    this.index = ki;
}
58
Nils Diewald01b4ce32013-12-05 22:39:25 +000059 public KorapCollection (String jsonString) {
60 this.filter = new ArrayList<FilterOperation>(5);
61 ObjectMapper mapper = new ObjectMapper();
Nils Diewald2276e1c2014-04-10 15:01:59 +000062
Nils Diewald01b4ce32013-12-05 22:39:25 +000063 try {
64 JsonNode json = mapper.readValue(jsonString, JsonNode.class);
Nils Diewald23417e82014-02-12 18:33:24 +000065 if (json.has("collections")) {
Nils Diewaldb1c3b652013-12-28 22:47:00 +000066 log.trace("Add meta collection");
Nils Diewald23417e82014-02-12 18:33:24 +000067 for (JsonNode collection : json.get("collections")) {
68 this.fromJSON(collection);
Nils Diewald01b4ce32013-12-05 22:39:25 +000069 };
70 };
71 }
72 catch (Exception e) {
73 this.error = e.getMessage();
74 };
75 };
76
/**
 * Create an empty collection without an index; one can be attached
 * later through setIndex.
 */
public KorapCollection () {
    this.filter = new ArrayList<>(5);
}
80
Nils Diewaldb1c3b652013-12-28 22:47:00 +000081 public void fromJSON(JsonNode json) throws QueryException {
Nils Diewald01b4ce32013-12-05 22:39:25 +000082 String type = json.get("@type").asText();
83
84 if (type.equals("korap:meta-filter")) {
Nils Diewaldb1c3b652013-12-28 22:47:00 +000085 log.trace("Add Filter");
Nils Diewaldfb4d7b02014-04-09 17:56:17 +000086 this.filter(new KorapFilter(json.get("@value")));
Nils Diewald01b4ce32013-12-05 22:39:25 +000087 }
88 else if (type.equals("korap:meta-extend")) {
Nils Diewaldb1c3b652013-12-28 22:47:00 +000089 log.trace("Add Extend");
Nils Diewaldfb4d7b02014-04-09 17:56:17 +000090 this.extend(new KorapFilter(json.get("@value")));
Nils Diewald01b4ce32013-12-05 22:39:25 +000091 };
92 };
93
Nils Diewaldbaf68c52013-11-20 13:22:19 +000094 public int getCount() {
95 return this.filterCount;
96 };
97
Nils Diewaldc925b492013-12-03 23:56:10 +000098 public void setIndex (KorapIndex ki) {
99 this.index = ki;
100 };
101
Nils Diewald01b4ce32013-12-05 22:39:25 +0000102 // The checks asre not necessary
Nils Diewaldc925b492013-12-03 23:56:10 +0000103 public KorapCollection filter (BooleanFilter filter) {
Nils Diewald9f310832013-12-06 22:38:55 +0000104 log.trace("Added filter: {}", filter.toString());
105 if (filter == null) {
106 log.warn("No filter is given");
Nils Diewald01b4ce32013-12-05 22:39:25 +0000107 return this;
Nils Diewald9f310832013-12-06 22:38:55 +0000108 };
Nils Diewald01b4ce32013-12-05 22:39:25 +0000109 Filter f = (Filter) new QueryWrapperFilter(filter.toQuery());
Nils Diewald9f310832013-12-06 22:38:55 +0000110 if (f == null) {
111 log.warn("Filter can't be wrapped");
Nils Diewald01b4ce32013-12-05 22:39:25 +0000112 return this;
Nils Diewald9f310832013-12-06 22:38:55 +0000113 };
Nils Diewald01b4ce32013-12-05 22:39:25 +0000114 FilterOperation fo = new FilterOperation(f,false);
Nils Diewald9f310832013-12-06 22:38:55 +0000115 if (fo == null) {
116 log.warn("Filter operation invalid");
Nils Diewald01b4ce32013-12-05 22:39:25 +0000117 return this;
Nils Diewald9f310832013-12-06 22:38:55 +0000118 };
Nils Diewald01b4ce32013-12-05 22:39:25 +0000119 this.filter.add(fo);
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000120 this.filterCount++;
Nils Diewaldc925b492013-12-03 23:56:10 +0000121 return this;
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000122 };
123
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000124 public KorapCollection filter (KorapFilter filter) {
125 return this.filter(filter.toBooleanFilter());
126 };
127
128
Nils Diewaldc925b492013-12-03 23:56:10 +0000129 public KorapCollection extend (BooleanFilter filter) {
Nils Diewald9f310832013-12-06 22:38:55 +0000130 log.trace("Added extension: {}", filter.toString());
Nils Diewald5def8bc2013-11-28 19:26:54 +0000131 this.filter.add(
132 new FilterOperation(
Nils Diewald9cc86fe2013-12-07 17:45:59 +0000133 (Filter) new QueryWrapperFilter(filter.toQuery()),
Nils Diewald5def8bc2013-11-28 19:26:54 +0000134 true
135 )
136 );
137 this.filterCount++;
Nils Diewaldc925b492013-12-03 23:56:10 +0000138 return this;
Nils Diewald5def8bc2013-11-28 19:26:54 +0000139 };
140
Nils Diewaldfb4d7b02014-04-09 17:56:17 +0000141 public KorapCollection extend (KorapFilter filter) {
142 return this.extend(filter.toBooleanFilter());
143 };
144
145
Nils Diewald5def8bc2013-11-28 19:26:54 +0000146 public ArrayList<FilterOperation> getFilters () {
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000147 return this.filter;
148 };
149
Nils Diewald01b4ce32013-12-05 22:39:25 +0000150 public FilterOperation getFilter (int i) {
151 return this.filter.get(i);
152 };
153
154
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000155 public String toString () {
Nils Diewald2276e1c2014-04-10 15:01:59 +0000156 StringBuilder sb = new StringBuilder();
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000157 for (FilterOperation fo : this.filter) {
158 sb.append(fo.toString()).append("; ");
159 };
160 return sb.toString();
161 };
162
Nils Diewaldc925b492013-12-03 23:56:10 +0000163 // DEPRECATED BUT USED IN TEST CASES
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000164 public KorapResult search (SpanQuery query) {
Nils Diewald3ef9a472013-12-02 16:06:09 +0000165 return this.index.search(this, query, 0, (short) 20, true, (short) 5, true, (short) 5);
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000166 };
167
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000168 public FixedBitSet bits (AtomicReaderContext atomic) throws IOException {
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000169
170 /*
Nils Diewald41e58f82013-11-20 20:30:15 +0000171 Use Bits.MatchAllBits(int len)
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000172 */
173
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000174 boolean noDoc = true;
175 FixedBitSet bitset;
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000176
177 if (this.filterCount > 0) {
Nils Diewald2276e1c2014-04-10 15:01:59 +0000178 bitset = new FixedBitSet(atomic.reader().maxDoc());
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000179
Nils Diewald5def8bc2013-11-28 19:26:54 +0000180 ArrayList<FilterOperation> filters = (ArrayList<FilterOperation>) this.filter.clone();
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000181
Nils Diewald22efd2d2013-11-29 22:54:24 +0000182 FilterOperation kcInit = filters.remove(0);
183 log.trace("FILTER: {}", kcInit);
184
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000185 // Init vector
Nils Diewald22efd2d2013-11-29 22:54:24 +0000186 DocIdSet docids = kcInit.filter.getDocIdSet(atomic, null);
Nils Diewald9cc86fe2013-12-07 17:45:59 +0000187
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000188 DocIdSetIterator filterIter = docids.iterator();
189
190 if (filterIter != null) {
Nils Diewald22efd2d2013-11-29 22:54:24 +0000191 log.trace("InitFilter has effect");
Nils Diewald9cc86fe2013-12-07 17:45:59 +0000192 // System.err.println("Init has an effect");
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000193 bitset.or(filterIter);
194 noDoc = false;
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000195 };
196
Nils Diewald41e58f82013-11-20 20:30:15 +0000197 if (!noDoc) {
Nils Diewald5def8bc2013-11-28 19:26:54 +0000198 for (FilterOperation kc : filters) {
Nils Diewald41e58f82013-11-20 20:30:15 +0000199 log.trace("FILTER: {}", kc);
Nils Diewald5def8bc2013-11-28 19:26:54 +0000200
201 // BUG!!!
202 docids = kc.filter.getDocIdSet(atomic, kc.isExtension() ? null : bitset);
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000203 filterIter = docids.iterator();
Nils Diewald5def8bc2013-11-28 19:26:54 +0000204
Nils Diewald41e58f82013-11-20 20:30:15 +0000205 if (filterIter == null) {
206 // There must be a better way ...
Nils Diewald5def8bc2013-11-28 19:26:54 +0000207 if (kc.isFilter()) {
Nils Diewald2276e1c2014-04-10 15:01:59 +0000208 // TODO: Check if this is really correct!
209 // Maybe here is the bug
Nils Diewald5def8bc2013-11-28 19:26:54 +0000210 bitset.clear(0, bitset.length());
211 noDoc = true;
Nils Diewald9cc86fe2013-12-07 17:45:59 +0000212 }
213 else {
214 // System.err.println("No term found");
Nils Diewald5def8bc2013-11-28 19:26:54 +0000215 };
216 continue;
Nils Diewald41e58f82013-11-20 20:30:15 +0000217 };
Nils Diewald5def8bc2013-11-28 19:26:54 +0000218 if (kc.isExtension()) {
Nils Diewald9cc86fe2013-12-07 17:45:59 +0000219 // System.err.println("Term found!");
220 // log.trace("Extend filter");
221 // System.err.println("Old Card:" + bitset.cardinality());
Nils Diewald5def8bc2013-11-28 19:26:54 +0000222 bitset.or(filterIter);
Nils Diewald9cc86fe2013-12-07 17:45:59 +0000223 // System.err.println("New Card:" + bitset.cardinality());
Nils Diewald5def8bc2013-11-28 19:26:54 +0000224 }
225 else {
226 bitset.and(filterIter);
227 };
Nils Diewald41e58f82013-11-20 20:30:15 +0000228 };
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000229
230 if (!noDoc) {
231 FixedBitSet livedocs = (FixedBitSet) atomic.reader().getLiveDocs();
232 if (livedocs != null) {
233 bitset.and(livedocs);
234 };
235 };
236 }
237 else {
238 return bitset;
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000239 };
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000240 }
241 else {
242 bitset = (FixedBitSet) atomic.reader().getLiveDocs();
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000243 };
244
245 return bitset;
246 };
247
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000248 public long numberOf (String foundry, String type) throws IOException {
Nils Diewaldc925b492013-12-03 23:56:10 +0000249 if (this.index == null)
250 return (long) 0;
251
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000252 return this.index.numberOf(this, foundry, type);
253 };
254
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000255 public long numberOf (String type) throws IOException {
Nils Diewaldc925b492013-12-03 23:56:10 +0000256 if (this.index == null)
257 return (long) 0;
258
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000259 return this.index.numberOf(this, "tokens", type);
260 };
261
Nils Diewald2276e1c2014-04-10 15:01:59 +0000262 // This is only for testing purposes!
263 public HashMap getTermRelation(String field) throws Exception {
264 if (this.index == null) {
265 HashMap<String,Long> map = new HashMap<>(1);
266 map.put("-docs", (long) 0);
267 return map;
268 };
269
270 return this.index.getTermRelation(this, field);
271 };
272
273 public String getTermRelationJSON(String field) throws IOException {
274 ObjectMapper mapper = new ObjectMapper();
275 StringWriter sw = new StringWriter();
276 sw.append("{\"field\":");
277 mapper.writeValue(sw,field);
278 sw.append(",");
279
280 try {
281 HashMap<String, Long> map = this.getTermRelation(field);
282
283 sw.append("\"documents\":");
284 mapper.writeValue(sw,map.remove("-docs"));
285 sw.append(",");
286
287 String[] keys = map.keySet().toArray(new String[map.size()]);
288
289 HashMap<String,Integer> setHash = new HashMap<>(20);
290 ArrayList<HashMap<String,Long>> set = new ArrayList<>(20);
291 ArrayList<Long[]> overlap = new ArrayList<>(100);
292
293 int count = 0;
294 for (String key : keys) {
295 if (!key.startsWith("#__")) {
296 HashMap<String,Long> simpleMap = new HashMap<>();
297 simpleMap.put(key, map.remove(key));
298 set.add(simpleMap);
299 setHash.put(key, count++);
300 };
301 };
302
303 keys = map.keySet().toArray(new String[map.size()]);
304 for (String key : keys) {
305 String[] comb = key.substring(3).split(":###:");
306 Long[] l = new Long[3];
307 l[0] = (long) setHash.get(comb[0]);
308 l[1] = (long) setHash.get(comb[1]);
309 l[2] = map.remove(key);
310 overlap.add(l);
311 };
312
313
314 sw.append("\"sets\":");
315 mapper.writeValue(sw, (Object) set);
316 sw.append(",\"overlaps\":");
317 mapper.writeValue(sw, (Object) overlap);
318 sw.append(",\"error\":null");
319
320 }
321 catch (Exception e) {
322 sw.append("\"error\":");
323 mapper.writeValue(sw,e.getMessage());
324 };
325
326 sw.append("}");
327 return sw.getBuffer().toString();
328 };
329
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000330 public String getError () {
331 return this.error;
332 };
Nils Diewald01b4ce32013-12-05 22:39:25 +0000333};