blob: ef01c9c9efaf1d9f86e8963b4a28bc6f205e1405 [file] [log] [blame]
Nils Diewaldbaf68c52013-11-20 13:22:19 +00001package de.ids_mannheim.korap;
2
3import java.util.*;
4import java.io.IOException;
5import org.apache.lucene.search.QueryWrapperFilter;
6import org.apache.lucene.search.NumericRangeFilter;
7import org.apache.lucene.search.Filter;
Nils Diewaldc925b492013-12-03 23:56:10 +00008
Nils Diewaldbaf68c52013-11-20 13:22:19 +00009import de.ids_mannheim.korap.KorapIndex;
10import de.ids_mannheim.korap.KorapResult;
11import de.ids_mannheim.korap.KorapFilter;
Nils Diewaldc925b492013-12-03 23:56:10 +000012
Nils Diewaldbaf68c52013-11-20 13:22:19 +000013import de.ids_mannheim.korap.util.KorapDate;
Nils Diewaldb1c3b652013-12-28 22:47:00 +000014import de.ids_mannheim.korap.util.QueryException;
Nils Diewaldbaf68c52013-11-20 13:22:19 +000015import de.ids_mannheim.korap.filter.BooleanFilter;
Nils Diewald5def8bc2013-11-28 19:26:54 +000016import de.ids_mannheim.korap.filter.FilterOperation;
Nils Diewaldbaf68c52013-11-20 13:22:19 +000017import org.apache.lucene.search.spans.SpanQuery;
18import org.apache.lucene.search.Query;
19import org.apache.lucene.search.FilteredQuery;
20import org.apache.lucene.index.AtomicReaderContext;
21import org.apache.lucene.util.FixedBitSet;
22import org.apache.lucene.util.Bits;
23import org.apache.lucene.search.DocIdSetIterator;
24import org.apache.lucene.search.DocIdSet;
25
Nils Diewald01b4ce32013-12-05 22:39:25 +000026import com.fasterxml.jackson.databind.ObjectMapper;
27import com.fasterxml.jackson.databind.JsonNode;
28
Nils Diewaldbaf68c52013-11-20 13:22:19 +000029import org.slf4j.Logger;
30import org.slf4j.LoggerFactory;
31
// TODO: Cache the computed bits and invalidate the cache whenever
//       a filter or an extension is added to the collection.

// TODO: Maybe a ConstantScoreQuery can make things faster?

// The collection accepts the index as its first constructor parameter.
// THIS MAY CHANGE for stuff like combining virtual collections.
// See http://mail-archives.apache.org/mod_mbox/lucene-java-user/200805.mbox/%3C17080852.post@talk.nabble.com%3E
39
Nils Diewaldbaf68c52013-11-20 13:22:19 +000040public class KorapCollection {
41 private KorapIndex index;
Nils Diewaldbaf68c52013-11-20 13:22:19 +000042 private KorapDate created;
Nils Diewaldc925b492013-12-03 23:56:10 +000043 private String id;
Nils Diewald01b4ce32013-12-05 22:39:25 +000044 private String error;
Nils Diewald5def8bc2013-11-28 19:26:54 +000045 private ArrayList<FilterOperation> filter;
Nils Diewalddfb21ea2013-11-21 14:26:47 +000046 private int filterCount = 0;
47
Nils Diewaldbaf68c52013-11-20 13:22:19 +000048 // Logger
49 private final static Logger log = LoggerFactory.getLogger(KorapCollection.class);
50
Nils Diewaldbaf68c52013-11-20 13:22:19 +000051 // user?
Nils Diewaldbaf68c52013-11-20 13:22:19 +000052 public KorapCollection (KorapIndex ki) {
53 this.index = ki;
Nils Diewald5def8bc2013-11-28 19:26:54 +000054 this.filter = new ArrayList<FilterOperation>(5);
Nils Diewaldbaf68c52013-11-20 13:22:19 +000055 };
56
Nils Diewald01b4ce32013-12-05 22:39:25 +000057 public KorapCollection (String jsonString) {
58 this.filter = new ArrayList<FilterOperation>(5);
59 ObjectMapper mapper = new ObjectMapper();
60 try {
61 JsonNode json = mapper.readValue(jsonString, JsonNode.class);
Nils Diewald23417e82014-02-12 18:33:24 +000062 if (json.has("collections")) {
Nils Diewaldb1c3b652013-12-28 22:47:00 +000063 log.trace("Add meta collection");
Nils Diewald23417e82014-02-12 18:33:24 +000064 for (JsonNode collection : json.get("collections")) {
65 this.fromJSON(collection);
Nils Diewald01b4ce32013-12-05 22:39:25 +000066 };
67 };
68 }
69 catch (Exception e) {
70 this.error = e.getMessage();
71 };
72 };
73
Nils Diewaldc925b492013-12-03 23:56:10 +000074 public KorapCollection () {
75 this.filter = new ArrayList<FilterOperation>(5);
76 };
77
Nils Diewaldb1c3b652013-12-28 22:47:00 +000078 public void fromJSON(JsonNode json) throws QueryException {
Nils Diewald01b4ce32013-12-05 22:39:25 +000079 String type = json.get("@type").asText();
80
81 if (type.equals("korap:meta-filter")) {
Nils Diewaldb1c3b652013-12-28 22:47:00 +000082 log.trace("Add Filter");
Nils Diewald01b4ce32013-12-05 22:39:25 +000083 this.filter(new BooleanFilter(json.get("@value")));
84 }
85 else if (type.equals("korap:meta-extend")) {
Nils Diewaldb1c3b652013-12-28 22:47:00 +000086 log.trace("Add Extend");
Nils Diewald01b4ce32013-12-05 22:39:25 +000087 this.extend(new BooleanFilter(json.get("@value")));
88 };
89 };
90
Nils Diewaldbaf68c52013-11-20 13:22:19 +000091 public int getCount() {
92 return this.filterCount;
93 };
94
Nils Diewaldc925b492013-12-03 23:56:10 +000095 public void setIndex (KorapIndex ki) {
96 this.index = ki;
97 };
98
Nils Diewald01b4ce32013-12-05 22:39:25 +000099 // The checks asre not necessary
Nils Diewaldc925b492013-12-03 23:56:10 +0000100 public KorapCollection filter (BooleanFilter filter) {
Nils Diewald9f310832013-12-06 22:38:55 +0000101 log.trace("Added filter: {}", filter.toString());
102 if (filter == null) {
103 log.warn("No filter is given");
Nils Diewald01b4ce32013-12-05 22:39:25 +0000104 return this;
Nils Diewald9f310832013-12-06 22:38:55 +0000105 };
Nils Diewald01b4ce32013-12-05 22:39:25 +0000106 Filter f = (Filter) new QueryWrapperFilter(filter.toQuery());
Nils Diewald9f310832013-12-06 22:38:55 +0000107 if (f == null) {
108 log.warn("Filter can't be wrapped");
Nils Diewald01b4ce32013-12-05 22:39:25 +0000109 return this;
Nils Diewald9f310832013-12-06 22:38:55 +0000110 };
Nils Diewald01b4ce32013-12-05 22:39:25 +0000111 FilterOperation fo = new FilterOperation(f,false);
Nils Diewald9f310832013-12-06 22:38:55 +0000112 if (fo == null) {
113 log.warn("Filter operation invalid");
Nils Diewald01b4ce32013-12-05 22:39:25 +0000114 return this;
Nils Diewald9f310832013-12-06 22:38:55 +0000115 };
Nils Diewald01b4ce32013-12-05 22:39:25 +0000116 this.filter.add(fo);
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000117 this.filterCount++;
Nils Diewaldc925b492013-12-03 23:56:10 +0000118 return this;
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000119 };
120
Nils Diewaldc925b492013-12-03 23:56:10 +0000121 public KorapCollection extend (BooleanFilter filter) {
Nils Diewald9f310832013-12-06 22:38:55 +0000122 log.trace("Added extension: {}", filter.toString());
Nils Diewald5def8bc2013-11-28 19:26:54 +0000123 this.filter.add(
124 new FilterOperation(
Nils Diewald9cc86fe2013-12-07 17:45:59 +0000125 (Filter) new QueryWrapperFilter(filter.toQuery()),
Nils Diewald5def8bc2013-11-28 19:26:54 +0000126 true
127 )
128 );
129 this.filterCount++;
Nils Diewaldc925b492013-12-03 23:56:10 +0000130 return this;
Nils Diewald5def8bc2013-11-28 19:26:54 +0000131 };
132
133 public ArrayList<FilterOperation> getFilters () {
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000134 return this.filter;
135 };
136
Nils Diewald01b4ce32013-12-05 22:39:25 +0000137 public FilterOperation getFilter (int i) {
138 return this.filter.get(i);
139 };
140
141
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000142 public String toString () {
143 StringBuffer sb = new StringBuffer();
144 for (FilterOperation fo : this.filter) {
145 sb.append(fo.toString()).append("; ");
146 };
147 return sb.toString();
148 };
149
Nils Diewaldc925b492013-12-03 23:56:10 +0000150 // DEPRECATED BUT USED IN TEST CASES
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000151 public KorapResult search (SpanQuery query) {
Nils Diewald3ef9a472013-12-02 16:06:09 +0000152 return this.index.search(this, query, 0, (short) 20, true, (short) 5, true, (short) 5);
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000153 };
154
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000155 public FixedBitSet bits (AtomicReaderContext atomic) throws IOException {
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000156
157 /*
Nils Diewald41e58f82013-11-20 20:30:15 +0000158 Use Bits.MatchAllBits(int len)
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000159 */
160
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000161 boolean noDoc = true;
162 FixedBitSet bitset;
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000163
164 if (this.filterCount > 0) {
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000165 bitset = new FixedBitSet(atomic.reader().numDocs());
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000166
Nils Diewald5def8bc2013-11-28 19:26:54 +0000167 ArrayList<FilterOperation> filters = (ArrayList<FilterOperation>) this.filter.clone();
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000168
Nils Diewald22efd2d2013-11-29 22:54:24 +0000169 FilterOperation kcInit = filters.remove(0);
170 log.trace("FILTER: {}", kcInit);
171
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000172 // Init vector
Nils Diewald22efd2d2013-11-29 22:54:24 +0000173 DocIdSet docids = kcInit.filter.getDocIdSet(atomic, null);
Nils Diewald9cc86fe2013-12-07 17:45:59 +0000174
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000175 DocIdSetIterator filterIter = docids.iterator();
176
177 if (filterIter != null) {
Nils Diewald22efd2d2013-11-29 22:54:24 +0000178 log.trace("InitFilter has effect");
Nils Diewald9cc86fe2013-12-07 17:45:59 +0000179 // System.err.println("Init has an effect");
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000180 bitset.or(filterIter);
181 noDoc = false;
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000182 };
183
Nils Diewald41e58f82013-11-20 20:30:15 +0000184 if (!noDoc) {
Nils Diewald5def8bc2013-11-28 19:26:54 +0000185 for (FilterOperation kc : filters) {
Nils Diewald41e58f82013-11-20 20:30:15 +0000186 log.trace("FILTER: {}", kc);
Nils Diewald5def8bc2013-11-28 19:26:54 +0000187
188 // BUG!!!
189 docids = kc.filter.getDocIdSet(atomic, kc.isExtension() ? null : bitset);
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000190 filterIter = docids.iterator();
Nils Diewald5def8bc2013-11-28 19:26:54 +0000191
Nils Diewald41e58f82013-11-20 20:30:15 +0000192 if (filterIter == null) {
193 // There must be a better way ...
Nils Diewald5def8bc2013-11-28 19:26:54 +0000194 if (kc.isFilter()) {
195 bitset.clear(0, bitset.length());
196 noDoc = true;
Nils Diewald9cc86fe2013-12-07 17:45:59 +0000197 }
198 else {
199 // System.err.println("No term found");
Nils Diewald5def8bc2013-11-28 19:26:54 +0000200 };
201 continue;
Nils Diewald41e58f82013-11-20 20:30:15 +0000202 };
Nils Diewald5def8bc2013-11-28 19:26:54 +0000203 if (kc.isExtension()) {
Nils Diewald9cc86fe2013-12-07 17:45:59 +0000204 // System.err.println("Term found!");
205 // log.trace("Extend filter");
206 // System.err.println("Old Card:" + bitset.cardinality());
Nils Diewald5def8bc2013-11-28 19:26:54 +0000207 bitset.or(filterIter);
Nils Diewald9cc86fe2013-12-07 17:45:59 +0000208 // System.err.println("New Card:" + bitset.cardinality());
Nils Diewald5def8bc2013-11-28 19:26:54 +0000209 }
210 else {
211 bitset.and(filterIter);
212 };
Nils Diewald41e58f82013-11-20 20:30:15 +0000213 };
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000214
215 if (!noDoc) {
216 FixedBitSet livedocs = (FixedBitSet) atomic.reader().getLiveDocs();
217 if (livedocs != null) {
218 bitset.and(livedocs);
219 };
220 };
221 }
222 else {
223 return bitset;
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000224 };
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000225 }
226 else {
227 bitset = (FixedBitSet) atomic.reader().getLiveDocs();
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000228 };
229
230 return bitset;
231 };
232
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000233 public long numberOf (String foundry, String type) throws IOException {
Nils Diewaldc925b492013-12-03 23:56:10 +0000234 if (this.index == null)
235 return (long) 0;
236
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000237 return this.index.numberOf(this, foundry, type);
238 };
239
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000240 public long numberOf (String type) throws IOException {
Nils Diewaldc925b492013-12-03 23:56:10 +0000241 if (this.index == null)
242 return (long) 0;
243
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000244 return this.index.numberOf(this, "tokens", type);
245 };
246
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000247 public String getError () {
248 return this.error;
249 };
250
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000251 // implement "till" with rangefilter
Nils Diewald01b4ce32013-12-05 22:39:25 +0000252};