blob: fbd2f4fa372ec9dc06eecef69d2a86e5f3feb148 [file] [log] [blame]
Nils Diewaldbaf68c52013-11-20 13:22:19 +00001package de.ids_mannheim.korap;
2
margarethafe252802018-07-30 14:59:50 +02003import java.io.File;
4import java.io.FileInputStream;
Nils Diewaldbaf68c52013-11-20 13:22:19 +00005import java.io.IOException;
margaretha7e31ca92021-12-13 10:48:44 +01006import java.io.InputStream;
margaretha8efa3752018-07-24 17:46:43 +02007import java.nio.ByteBuffer;
margaretha4dfe3c52018-08-13 17:07:50 +02008import java.util.Properties;
margaretha5a8abea2021-11-08 16:57:51 +01009import java.util.Set;
margarethac20a9212018-08-21 14:32:09 +020010import java.util.zip.GZIPInputStream;
Nils Diewaldc925b492013-12-03 23:56:10 +000011
margarethafe252802018-07-30 14:59:50 +020012import org.apache.commons.io.IOUtils;
margarethac20a9212018-08-21 14:32:09 +020013import org.apache.commons.io.output.ByteArrayOutputStream;
margaretha8efa3752018-07-24 17:46:43 +020014import org.apache.lucene.index.DocsAndPositionsEnum;
15import org.apache.lucene.index.LeafReader;
16import org.apache.lucene.index.LeafReaderContext;
17import org.apache.lucene.index.Term;
18import org.apache.lucene.index.Terms;
19import org.apache.lucene.index.TermsEnum;
20import org.apache.lucene.search.BitsFilteredDocIdSet;
21import org.apache.lucene.search.DocIdSet;
22import org.apache.lucene.search.DocIdSetIterator;
23import org.apache.lucene.search.Filter;
24import org.apache.lucene.util.BitDocIdSet;
25import org.apache.lucene.util.Bits;
26import org.apache.lucene.util.BytesRef;
27import org.apache.lucene.util.FixedBitSet;
28import org.slf4j.Logger;
29import org.slf4j.LoggerFactory;
30
31import com.fasterxml.jackson.databind.JsonNode;
32import com.fasterxml.jackson.databind.ObjectMapper;
33
Nils Diewaldea969502015-02-16 21:10:54 +000034import de.ids_mannheim.korap.collection.CollectionBuilder;
Nils Diewaldc471b182014-11-19 22:51:15 +000035import de.ids_mannheim.korap.response.Notifications;
margaretha4dfe3c52018-08-13 17:07:50 +020036import de.ids_mannheim.korap.util.KrillProperties;
Akron176c9b12015-07-29 19:53:40 +020037import de.ids_mannheim.korap.util.QueryException;
margaretha78f397a2017-06-29 13:44:46 +020038import de.ids_mannheim.korap.util.StatusCodes;
Nils Diewaldbaf68c52013-11-20 13:22:19 +000039
Nils Diewaldc471b182014-11-19 22:51:15 +000040/**
Nils Diewaldea969502015-02-16 21:10:54 +000041 * Create a Virtual Collection of documents by means of a KoralQuery
Nils Diewald2d5f8102015-02-26 21:07:54 +000042 * collection object.
Nils Diewaldbb33da22015-03-04 16:24:25 +000043 *
Nils Diewaldea969502015-02-16 21:10:54 +000044 * <blockquote><pre>
Nils Diewaldbb33da22015-03-04 16:24:25 +000045 * KrillCollection kc = new KrillCollection(json);
Nils Diewaldea969502015-02-16 21:10:54 +000046 * </pre></blockquote>
Nils Diewaldbb33da22015-03-04 16:24:25 +000047 *
Nils Diewaldafab8f32015-01-26 19:11:32 +000048 * @author diewald
Nils Diewaldc471b182014-11-19 22:51:15 +000049 */
Nils Diewaldea969502015-02-16 21:10:54 +000050/*
51 * TODO: Make a cache for the bits
52 * Delete it in case of an extension or a filter
53 * TODO: Maybe use randomaccessfilterstrategy
54 * TODO: Maybe a constantScoreQuery can make things faster?
55 * See http://mail-archives.apache.org/mod_mbox/lucene-java-user/
56 * 200805.mbox/%3C17080852.post@talk.nabble.com%3E
57 */
margaretha5a8abea2021-11-08 16:57:51 +010058public final class KrillCollection extends Notifications implements IndexInfo {
Nils Diewalda14ecd62015-02-26 21:00:20 +000059 private KrillIndex index;
Akronbb5d1732015-06-22 01:22:40 +020060 private JsonNode json;
margaretha5a8abea2021-11-08 16:57:51 +010061 private final CollectionBuilder cb = new CollectionBuilder(this);
Akron60dfa7e2015-08-03 22:15:17 +020062 private CollectionBuilder.Interface cbi;
Akron176c9b12015-07-29 19:53:40 +020063 private byte[] pl = new byte[4];
Akronb59f40e2018-08-23 17:15:43 +020064
Akron65d57e92018-08-24 19:25:56 +020065 private ObjectMapper mapper = new ObjectMapper();
66
Akronb59f40e2018-08-23 17:15:43 +020067 private Filter prefiltered = null;
margaretha85ee2ac2018-07-25 17:58:09 +020068 // private static ByteBuffer bb = ByteBuffer.allocate(4);
Nils Diewaldbb33da22015-03-04 16:24:25 +000069
Nils Diewaldbaf68c52013-11-20 13:22:19 +000070 // Logger
margaretha5a8abea2021-11-08 16:57:51 +010071 private final static Logger log =
margaretha8efa3752018-07-24 17:46:43 +020072 LoggerFactory.getLogger(KrillCollection.class);
Nils Diewald7cbcfe92014-09-22 22:01:51 +000073 // This advices the java compiler to ignore all loggings
Akron3ba74f22015-07-24 18:46:17 +020074 public static final boolean DEBUG = false;
margaretha5a8abea2021-11-08 16:57:51 +010075 private double start, end; // for debugging
margaretha85ee2ac2018-07-25 17:58:09 +020076
Nils Diewald65894bd2015-02-16 21:36:53 +000077 /**
Akron176c9b12015-07-29 19:53:40 +020078 * Construct a new KrillCollection.
79 *
80 */
margaretha5a8abea2021-11-08 16:57:51 +010081 public KrillCollection () {};
Akron176c9b12015-07-29 19:53:40 +020082
83
/**
 * Create a new KrillCollection that refers to the given index.
 *
 * @param index
 *            The {@link KrillIndex} the virtual collection
 *            refers to.
 */
public KrillCollection (KrillIndex index) {
    this.index = index;
}
93
margaretha85ee2ac2018-07-25 17:58:09 +020094 /**
Nils Diewald2d5f8102015-02-26 21:07:54 +000095 * Construct a new KrillCollection by passing a KoralQuery.
Nils Diewaldbb33da22015-03-04 16:24:25 +000096 *
Akron176c9b12015-07-29 19:53:40 +020097 * @param json
98 * The KoralQuery document as a JSON string.
Nils Diewald33fcb5d2014-11-07 23:27:03 +000099 */
Nils Diewald2d5f8102015-02-26 21:07:54 +0000100 public KrillCollection (String jsonString) {
Nils Diewald44d5fa12015-01-15 21:31:52 +0000101 try {
102 JsonNode json = mapper.readTree(jsonString);
margaretha8efa3752018-07-24 17:46:43 +0200103
104 if (json.has("errors") && json.get("errors").size() > 0) {
105 this.addError(StatusCodes.INVALID_QUERY, "Json has errors.");
margarethaf2c31502017-06-26 17:57:16 +0200106 }
margaretha8efa3752018-07-24 17:46:43 +0200107 else if (json.has("collection")) {
Akron850b46e2016-06-08 10:08:55 +0200108 this.fromKoral(json.get("collection"));
margarethaf2c31502017-06-26 17:57:16 +0200109 }
margaretha8efa3752018-07-24 17:46:43 +0200110 else if (json.has("collections")) {
Akron40550172015-08-04 03:06:12 +0200111 this.addError(899,
112 "Collections are not supported anymore in favour of a single collection");
margarethaf2c31502017-06-26 17:57:16 +0200113 }
margaretha8efa3752018-07-24 17:46:43 +0200114 else {
115 this.addError(StatusCodes.MISSING_COLLECTION,
Akronb59f40e2018-08-23 17:15:43 +0200116 "Collection is not found");
margaretha8efa3752018-07-24 17:46:43 +0200117 this.fromBuilder(this.build().nothing());
margarethaf2c31502017-06-26 17:57:16 +0200118 }
Nils Diewald44d5fa12015-01-15 21:31:52 +0000119 }
Akron176c9b12015-07-29 19:53:40 +0200120
121 // Query Exception
Nils Diewald44d5fa12015-01-15 21:31:52 +0000122 catch (QueryException qe) {
Nils Diewaldea969502015-02-16 21:10:54 +0000123 this.addError(qe.getErrorCode(), qe.getMessage());
margaretha8efa3752018-07-24 17:46:43 +0200124 this.fromBuilder(this.build().nothing());
Nils Diewald44d5fa12015-01-15 21:31:52 +0000125 }
Akron176c9b12015-07-29 19:53:40 +0200126
127 // JSON exception
Nils Diewald44d5fa12015-01-15 21:31:52 +0000128 catch (IOException e) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000129 this.addError(621, "Unable to parse JSON", "KrillCollection",
130 e.getLocalizedMessage());
margaretha8efa3752018-07-24 17:46:43 +0200131 this.fromBuilder(this.build().nothing());
Nils Diewald44d5fa12015-01-15 21:31:52 +0000132 };
Nils Diewald01b4ce32013-12-05 22:39:25 +0000133 };
134
margaretha5a8abea2021-11-08 16:57:51 +0100135
Nils Diewaldea969502015-02-16 21:10:54 +0000136 /**
Akron176c9b12015-07-29 19:53:40 +0200137 * Set the {@link KrillIndex} the virtual collection refers to.
138 *
139 * @param index
140 * The {@link KrillIndex} the virtual collection refers
141 * to.
Nils Diewaldea969502015-02-16 21:10:54 +0000142 */
Akron176c9b12015-07-29 19:53:40 +0200143 public void setIndex (KrillIndex index) {
144 this.index = index;
Nils Diewaldc925b492013-12-03 23:56:10 +0000145 };
146
Nils Diewald33fcb5d2014-11-07 23:27:03 +0000147
Nils Diewaldea969502015-02-16 21:10:54 +0000148 /**
149 * Import the "collection" part of a KoralQuery.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000150 *
151 * @param jsonString
152 * The "collection" part of a KoralQuery.
Nils Diewaldea969502015-02-16 21:10:54 +0000153 * @throws QueryException
154 */
Akron850b46e2016-06-08 10:08:55 +0200155 public KrillCollection fromKoral (String jsonString) throws QueryException {
margaretha5a8abea2021-11-08 16:57:51 +0100156 this.prefiltered = null;
Nils Diewald44d5fa12015-01-15 21:31:52 +0000157 try {
Akron850b46e2016-06-08 10:08:55 +0200158 this.fromKoral((JsonNode) mapper.readTree(jsonString));
Nils Diewald44d5fa12015-01-15 21:31:52 +0000159 }
160 catch (Exception e) {
Nils Diewald2d5f8102015-02-26 21:07:54 +0000161 this.addError(621, "Unable to parse JSON", "KrillCollection");
margaretha8efa3752018-07-24 17:46:43 +0200162 this.fromBuilder(this.build().nothing());
Nils Diewald44d5fa12015-01-15 21:31:52 +0000163 };
Nils Diewald3aa9e692015-02-20 22:20:11 +0000164
165 return this;
Nils Diewalde3645702014-11-07 21:15:20 +0000166 };
Nils Diewald78993522014-10-27 17:51:22 +0000167
Nils Diewald33fcb5d2014-11-07 23:27:03 +0000168
Akron65d57e92018-08-24 19:25:56 +0200169 public KrillCollection fromStore (String ref) throws QueryException {
Akronb59f40e2018-08-23 17:15:43 +0200170 Properties prop = KrillProperties.loadDefaultProperties();
171 this.prefiltered = null;
172
173 if (prop == null) {
174 this.addError(StatusCodes.MISSING_KRILL_PROPERTIES,
175 "krill.properties is not found.");
176 return null;
177 }
178
margaretha7e31ca92021-12-13 10:48:44 +0100179 String p = prop.getProperty("krill.test", "false");
180 boolean isTest = Boolean.parseBoolean(p);
181
Akronb59f40e2018-08-23 17:15:43 +0200182 String namedVCPath = prop.getProperty("krill.namedVC");
183
184 if (!namedVCPath.endsWith("/")) {
185 namedVCPath += "/";
186 };
187
188 String fileName = namedVCPath + ref + ".jsonld";
189 File file;
190 String json = null;
margaretha7e31ca92021-12-13 10:48:44 +0100191 InputStream is = null;
Akronb59f40e2018-08-23 17:15:43 +0200192 if ((file= new File(fileName)).exists()) {
193 try (FileInputStream fis = new FileInputStream(file)) {
194 json = IOUtils.toString(fis,"utf-8");
195 }
196 catch (IOException e) {
197 this.addError(StatusCodes.READING_COLLECTION_FAILED,
198 e.getMessage());
199 return this;
200 }
201 }
202 // slower than plain text, but save space
203 else if ((file = new File(fileName + ".gz")).exists()){
204 try (GZIPInputStream gzipInputStream =
205 new GZIPInputStream(new FileInputStream(file));
206 ByteArrayOutputStream bos =
207 new ByteArrayOutputStream(512);) {
208 bos.write(gzipInputStream);
209 json = bos.toString("utf-8");
210 }
211 catch (IOException e) {
212 this.addError(StatusCodes.READING_COLLECTION_FAILED,
213 e.getMessage());
214 return this;
215 }
216 }
margaretha7e31ca92021-12-13 10:48:44 +0100217 // for testing
218 else if (isTest
219 && (is = retrieveInputStreamFromClasspath(fileName)) != null) {
220 try {
221 json = IOUtils.toString(is, "utf-8");
222 }
223 catch (IOException e) {
224 this.addError(StatusCodes.READING_COLLECTION_FAILED,
225 e.getMessage());
226 return this;
227 }
228 }
Akronb59f40e2018-08-23 17:15:43 +0200229 else{
230 this.addError(StatusCodes.MISSING_COLLECTION,
231 "Collection is not found " + fileName);
232 return this;
233 };
234
235 return this.fromKoral(json);
236 };
237
238
margaretha7e31ca92021-12-13 10:48:44 +0100239 private InputStream retrieveInputStreamFromClasspath (String fileName) {
240 if (!fileName.startsWith("/")) {
241 fileName = "/"+fileName;
242 }
243 return KrillCollection.class.getResourceAsStream(fileName);
244 }
245
246
Nils Diewaldea969502015-02-16 21:10:54 +0000247 /**
248 * Import the "collection" part of a KoralQuery.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000249 *
250 * @param json
251 * The "collection" part of a KoralQuery
252 * as a {@link JsonNode} object.
Nils Diewaldea969502015-02-16 21:10:54 +0000253 * @throws QueryException
254 */
Akron850b46e2016-06-08 10:08:55 +0200255 public KrillCollection fromKoral (JsonNode json) throws QueryException {
Akronbb5d1732015-06-22 01:22:40 +0200256 this.json = json;
Akronb59f40e2018-08-23 17:15:43 +0200257 this.prefiltered = null;
Akron850b46e2016-06-08 10:08:55 +0200258 return this.fromBuilder(this._fromKoral(json));
Nils Diewald65894bd2015-02-16 21:36:53 +0000259 };
260
261
Akrond5ca00a2016-06-08 14:29:00 +0200262 // Create collection from KoralQuery
Akron850b46e2016-06-08 10:08:55 +0200263 private CollectionBuilder.Interface _fromKoral (JsonNode json)
Akronb59f40e2018-08-23 17:15:43 +0200264 throws QueryException {
265
266 if (json.has("collection")) {
267 return this._fromKoral(json.at("/collection"));
268 };
Nils Diewald65894bd2015-02-16 21:36:53 +0000269
Akronc63697c2015-06-17 22:32:02 +0200270 if (!json.has("@type")) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000271 throw new QueryException(701,
272 "JSON-LD group has no @type attribute");
Akronc63697c2015-06-17 22:32:02 +0200273 };
Nils Diewald65894bd2015-02-16 21:36:53 +0000274
275 String type = json.get("@type").asText();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000276
Nils Diewaldcec40f92015-02-19 22:20:02 +0000277 if (type.equals("koral:doc")) {
Nils Diewald65894bd2015-02-16 21:36:53 +0000278
margaretha85ee2ac2018-07-25 17:58:09 +0200279 // default key
Nils Diewaldbb33da22015-03-04 16:24:25 +0000280 String key = "tokens";
Nils Diewald65894bd2015-02-16 21:36:53 +0000281 String valtype = "type:string";
Nils Diewaldbb33da22015-03-04 16:24:25 +0000282 String match = "match:eq";
Nils Diewald65894bd2015-02-16 21:36:53 +0000283
margaretha8efa3752018-07-24 17:46:43 +0200284 if (json.has("key")) key = json.get("key").asText();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000285
margaretha8efa3752018-07-24 17:46:43 +0200286 if (json.has("type")) valtype = json.get("type").asText();
Nils Diewald65894bd2015-02-16 21:36:53 +0000287
288 // Filter based on date
289 if (valtype.equals("type:date")) {
290
291 if (!json.has("value"))
Akron176c9b12015-07-29 19:53:40 +0200292 throw new QueryException(820, "Dates require value fields");
Nils Diewald65894bd2015-02-16 21:36:53 +0000293
294 String dateStr = json.get("value").asText();
Akron176c9b12015-07-29 19:53:40 +0200295
margaretha8efa3752018-07-24 17:46:43 +0200296 if (json.has("match")) match = json.get("match").asText();
Nils Diewald65894bd2015-02-16 21:36:53 +0000297
298 // TODO: This isn't stable yet
299 switch (match) {
Akron40550172015-08-04 03:06:12 +0200300 case "match:eq":
301 return this.cb.date(key, dateStr);
302 case "match:ne":
303 return this.cb.date(key, dateStr).not();
304 case "match:geq":
305 return this.cb.since(key, dateStr);
306 case "match:leq":
307 return this.cb.till(key, dateStr);
Nils Diewald65894bd2015-02-16 21:36:53 +0000308 };
Akron48937e92015-06-26 01:49:02 +0200309
Eliza Margaretha6f989202016-10-14 21:48:29 +0200310 throw new QueryException(841,
311 "Match relation unknown for type");
Nils Diewald65894bd2015-02-16 21:36:53 +0000312 }
313
Akron48937e92015-06-26 01:49:02 +0200314 // Filter based on string
Nils Diewald65894bd2015-02-16 21:36:53 +0000315 else if (valtype.equals("type:string")) {
margarethaecddb0b2018-07-31 15:23:38 +0200316 if (json.get("value").size() > 1){
Akrone64cc162019-01-08 18:40:37 +0100317 if (DEBUG) {
318 log.debug("koral:doc size " + json.get("value").size());
319 };
margarethadf0e9d12018-07-30 16:22:59 +0200320 if (json.has("match")) {
321 match = json.get("match").asText();
322 }
323
margaretha8a8c4272018-08-21 17:39:27 +0200324 CollectionBuilder.Group group = this.cb.orGroup();
325 for (JsonNode value : json.get("value")) {
326 group.with(cb.term(key, value.asText()));
margarethadf0e9d12018-07-30 16:22:59 +0200327 }
margaretha8a8c4272018-08-21 17:39:27 +0200328
329 if (match.equals("match:ne")) {
330 return group.not();
margarethadf0e9d12018-07-30 16:22:59 +0200331 }
332 return group;
333 }
334
margaretha8efa3752018-07-24 17:46:43 +0200335 if (json.has("match")) match = json.get("match").asText();
Nils Diewald65894bd2015-02-16 21:36:53 +0000336
Akron176c9b12015-07-29 19:53:40 +0200337 switch (match) {
338
Akron40550172015-08-04 03:06:12 +0200339 case "match:eq":
340 return this.cb.term(key, json.get("value").asText());
341 case "match:ne":
342 return this.cb.term(key, json.get("value").asText())
343 .not();
Akron176c9b12015-07-29 19:53:40 +0200344
margaretha8efa3752018-07-24 17:46:43 +0200345 // Contains and containsnot (or excludes) is only
346 // effective on text fields and ineffective on
347 // string fields
Akron40550172015-08-04 03:06:12 +0200348 case "match:contains":
margaretha8efa3752018-07-24 17:46:43 +0200349 return this.cb.text(key, json.get("value").asText());
Akron176c9b12015-07-29 19:53:40 +0200350
Akron40550172015-08-04 03:06:12 +0200351 case "match:containsnot":
margaretha8efa3752018-07-24 17:46:43 +0200352 return this.cb.text(key, json.get("value").asText())
353 .not();
Akron176c9b12015-07-29 19:53:40 +0200354
Eliza Margaretha6f989202016-10-14 21:48:29 +0200355 // <LEGACY>
Akron40550172015-08-04 03:06:12 +0200356 case "match:excludes":
margaretha8efa3752018-07-24 17:46:43 +0200357 return this.cb.text(key, json.get("value").asText())
358 .not();
Eliza Margaretha6f989202016-10-14 21:48:29 +0200359 // </LEGACY>
Akron48937e92015-06-26 01:49:02 +0200360 };
Nils Diewald65894bd2015-02-16 21:36:53 +0000361
Eliza Margaretha6f989202016-10-14 21:48:29 +0200362 throw new QueryException(841,
363 "Match relation unknown for type");
Akron48937e92015-06-26 01:49:02 +0200364 }
365
366 // Filter based on regex
367 else if (valtype.equals("type:regex")) {
Akron176c9b12015-07-29 19:53:40 +0200368
margaretha8efa3752018-07-24 17:46:43 +0200369 if (json.has("match")) match = json.get("match").asText();
Akron48937e92015-06-26 01:49:02 +0200370
371 if (match.equals("match:eq")) {
Akronfd05f502015-07-30 18:34:26 +0200372 return this.cb.re(key, json.get("value").asText());
Akron48937e92015-06-26 01:49:02 +0200373 }
374 else if (match.equals("match:ne")) {
Akronfd05f502015-07-30 18:34:26 +0200375 return this.cb.re(key, json.get("value").asText()).not();
Akron176c9b12015-07-29 19:53:40 +0200376 }
Akron27469702018-04-05 12:46:18 +0200377
margaretha8efa3752018-07-24 17:46:43 +0200378 // Contains and containsnot (or excludes) is
379 // identical to eq and ne in case of regexes for the
380 // moment,
381 // though it may be beneficial to circumfix these
382 // with .*
Akron176c9b12015-07-29 19:53:40 +0200383 else if (match.equals("match:contains")) {
Akronfd05f502015-07-30 18:34:26 +0200384 return this.cb.re(key, json.get("value").asText());
Akron176c9b12015-07-29 19:53:40 +0200385 }
Akron27469702018-04-05 12:46:18 +0200386 else if (match.equals("match:containsnot")) {
387 return this.cb.re(key, json.get("value").asText());
388 }
margaretha8efa3752018-07-24 17:46:43 +0200389 // <LEGACY>
Akron176c9b12015-07-29 19:53:40 +0200390 else if (match.equals("match:excludes")) {
Akronfd05f502015-07-30 18:34:26 +0200391 return this.cb.re(key, json.get("value").asText()).not();
Akron48937e92015-06-26 01:49:02 +0200392 };
margaretha8efa3752018-07-24 17:46:43 +0200393 // </LEGACY>
Akron48937e92015-06-26 01:49:02 +0200394
Eliza Margaretha6f989202016-10-14 21:48:29 +0200395 throw new QueryException(841,
396 "Match relation unknown for type");
margaretha85ee2ac2018-07-25 17:58:09 +0200397 }
398
Akron176c9b12015-07-29 19:53:40 +0200399 throw new QueryException(843, "Document type is not supported");
Nils Diewald65894bd2015-02-16 21:36:53 +0000400 }
401
402 // nested group
Akronb59f40e2018-08-23 17:15:43 +0200403 else if (type.equals("koral:docGroup")) {
Akron176c9b12015-07-29 19:53:40 +0200404
Nils Diewald65894bd2015-02-16 21:36:53 +0000405 if (!json.has("operands") || !json.get("operands").isArray())
Akron40550172015-08-04 03:06:12 +0200406 throw new QueryException(842,
407 "Document group needs operand list");
Akron176c9b12015-07-29 19:53:40 +0200408
Akron60dfa7e2015-08-03 22:15:17 +0200409 CollectionBuilder.Group group;
Nils Diewald65894bd2015-02-16 21:36:53 +0000410
411 String operation = "operation:and";
412 if (json.has("operation"))
Akron40550172015-08-04 03:06:12 +0200413 operation = json.get("operation").asText();
Nils Diewald65894bd2015-02-16 21:36:53 +0000414
Akron176c9b12015-07-29 19:53:40 +0200415 if (operation.equals("operation:or"))
Akronfd05f502015-07-30 18:34:26 +0200416 group = this.cb.orGroup();
Akron176c9b12015-07-29 19:53:40 +0200417 else if (operation.equals("operation:and"))
Akronfd05f502015-07-30 18:34:26 +0200418 group = this.cb.andGroup();
Akron176c9b12015-07-29 19:53:40 +0200419 else
Akron40550172015-08-04 03:06:12 +0200420 throw new QueryException(810,
421 "Unknown document group operation");
422
Nils Diewald65894bd2015-02-16 21:36:53 +0000423 for (JsonNode operand : json.get("operands")) {
Akron6b0be132019-09-16 19:01:59 +0200424
425 // TODO:
426 // Potentially bed here, when operand is a group inside a group
427 // with the same operator (and not negative)
Akron850b46e2016-06-08 10:08:55 +0200428 group.with(this._fromKoral(operand));
Nils Diewald65894bd2015-02-16 21:36:53 +0000429 };
Akron176c9b12015-07-29 19:53:40 +0200430 return group;
Nils Diewald65894bd2015-02-16 21:36:53 +0000431 }
Akronb59f40e2018-08-23 17:15:43 +0200432
margaretha8efa3752018-07-24 17:46:43 +0200433 // vc reference
434 else if (type.equals("koral:docGroupRef")) {
Akronb59f40e2018-08-23 17:15:43 +0200435
margaretha8efa3752018-07-24 17:46:43 +0200436 if (!json.has("ref")) {
437 throw new QueryException(StatusCodes.MISSING_VC_REFERENCE,
438 "ref is not found");
439 }
margaretha85ee2ac2018-07-25 17:58:09 +0200440
margaretha8efa3752018-07-24 17:46:43 +0200441 String ref = json.get("ref").asText();
margaretha85ee2ac2018-07-25 17:58:09 +0200442 if (ref.isEmpty()) {
margaretha8efa3752018-07-24 17:46:43 +0200443 throw new QueryException(StatusCodes.MISSING_VC_REFERENCE,
444 "ref is empty");
Akronb59f40e2018-08-23 17:15:43 +0200445 };
margaretha85ee2ac2018-07-25 17:58:09 +0200446
Akronb59f40e2018-08-23 17:15:43 +0200447 return this.cb.referTo(ref);
margaretha8efa3752018-07-24 17:46:43 +0200448 }
449
Nils Diewald65894bd2015-02-16 21:36:53 +0000450
451 // Unknown type
Akron176c9b12015-07-29 19:53:40 +0200452 throw new QueryException(813, "Collection type is not supported");
Akron40550172015-08-04 03:06:12 +0200453 };
454
Akron176c9b12015-07-29 19:53:40 +0200455 /**
456 * Set the collection from a {@link CollectionBuilder} object.
457 *
Akron40550172015-08-04 03:06:12 +0200458 * @param cb
459 * The CollectionBuilder object.
Akron176c9b12015-07-29 19:53:40 +0200460 */
Akron60dfa7e2015-08-03 22:15:17 +0200461 public KrillCollection fromBuilder (CollectionBuilder.Interface cbi) {
Akronb59f40e2018-08-23 17:15:43 +0200462 this.prefiltered = null;
Akronfd05f502015-07-30 18:34:26 +0200463 this.cbi = cbi;
Nils Diewald3aa9e692015-02-20 22:20:11 +0000464 return this;
Nils Diewalde3645702014-11-07 21:15:20 +0000465 };
466
Akron40550172015-08-04 03:06:12 +0200467
Akron60dfa7e2015-08-03 22:15:17 +0200468 public CollectionBuilder.Interface getBuilder () {
Akronfd05f502015-07-30 18:34:26 +0200469 return this.cbi;
470 };
471
472
473 public CollectionBuilder build () {
Akron176c9b12015-07-29 19:53:40 +0200474 return this.cb;
Nils Diewald01b4ce32013-12-05 22:39:25 +0000475 };
476
Akron40550172015-08-04 03:06:12 +0200477
Akron60dfa7e2015-08-03 22:15:17 +0200478 public KrillCollection filter (CollectionBuilder.Interface filter) {
Akronfd05f502015-07-30 18:34:26 +0200479 return this.fromBuilder(this.cb.andGroup().with(this.cbi).with(filter));
480 };
481
Akron40550172015-08-04 03:06:12 +0200482
Akron60dfa7e2015-08-03 22:15:17 +0200483 public KrillCollection extend (CollectionBuilder.Interface extension) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200484 return this
485 .fromBuilder(this.cb.orGroup().with(this.cbi).with(extension));
Akronfd05f502015-07-30 18:34:26 +0200486 };
487
488
489
Nils Diewaldea969502015-02-16 21:10:54 +0000490 /**
491 * Add a filter based on a list of unique document identifiers.
492 * UIDs may be indexed in the field "UID".
Nils Diewaldbb33da22015-03-04 16:24:25 +0000493 *
Nils Diewaldea969502015-02-16 21:10:54 +0000494 * This filter is not part of the legacy API!
Nils Diewaldbb33da22015-03-04 16:24:25 +0000495 *
496 * @param uids
497 * The list of unique document identifier.
Nils Diewald2d5f8102015-02-26 21:07:54 +0000498 * @return The {@link KrillCollection} object for chaining.
Nils Diewaldea969502015-02-16 21:10:54 +0000499 */
Nils Diewald2d5f8102015-02-26 21:07:54 +0000500 public KrillCollection filterUIDs (String ... uids) {
Akronb59f40e2018-08-23 17:15:43 +0200501 this.prefiltered = null;
Akron60dfa7e2015-08-03 22:15:17 +0200502 CollectionBuilder.Group cbg = this.cb.orGroup();
Akronfd05f502015-07-30 18:34:26 +0200503 for (String uid : uids) {
504 cbg.with(this.cb.term("UID", uid));
505 };
Akron60dfa7e2015-08-03 22:15:17 +0200506 return this.filter(cbg);
Nils Diewaldd723d812014-09-23 18:50:52 +0000507 };
508
509
Nils Diewaldea969502015-02-16 21:10:54 +0000510 /**
Akron176c9b12015-07-29 19:53:40 +0200511 * Serialize collection to a {@link Filter} object.
Nils Diewaldea969502015-02-16 21:10:54 +0000512 */
Akronb59f40e2018-08-23 17:15:43 +0200513 public Filter toFilter () throws QueryException {
514 if (this.cbi == null)
515 return null;
516
517 if (this.prefiltered != null)
518 return this.prefiltered;
519
520 this.prefiltered = this.cbi.toFilter();
521 return this.prefiltered;
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000522 };
523
Nils Diewaldea969502015-02-16 21:10:54 +0000524
525 /**
Akron176c9b12015-07-29 19:53:40 +0200526 * Boolean value if the collection should work inverted or
527 * not.
Nils Diewaldea969502015-02-16 21:10:54 +0000528 */
Akron176c9b12015-07-29 19:53:40 +0200529 public boolean isNegative () {
margaretha8efa3752018-07-24 17:46:43 +0200530 if (this.cbi == null) return false;
Nils Diewald01b4ce32013-12-05 22:39:25 +0000531
Akronfd05f502015-07-30 18:34:26 +0200532 return this.cbi.isNegative();
Nils Diewaldea969502015-02-16 21:10:54 +0000533 };
534
535
536 /**
Akronb59f40e2018-08-23 17:15:43 +0200537 * Generate a string representation of the virtual collection.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000538 *
539 * <strong>Warning</strong>: This currently does not generate a
540 * valid
Nils Diewaldea969502015-02-16 21:10:54 +0000541 * KoralQuery string, so this may change in a future version.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000542 *
Nils Diewaldea969502015-02-16 21:10:54 +0000543 * @return A string representation of the virtual collection.
544 */
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000545 public String toString () {
Akronb59f40e2018-08-23 17:15:43 +0200546 try {
547 Filter filter = this.toFilter();
548 if (filter == null) return "";
549 return (this.isNegative() ? "-" : "") + filter.toString();
550 }
551 catch (QueryException qe) {
552 log.warn(qe.getLocalizedMessage());
553 };
554 return "";
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000555 };
556
Nils Diewaldea969502015-02-16 21:10:54 +0000557
Nils Diewald7cbcfe92014-09-22 22:01:51 +0000558 /**
Akronbb5d1732015-06-22 01:22:40 +0200559 * Return the associated KoralQuery collection object
560 * as a {@link JsonNode}. This won't work,
561 * if the object was build using a CollectionBuilder,
562 * therefore it is limited to mirror a deserialized KoralQuery
563 * object.
564 *
565 * @return The {@link JsonNode} representing the collection object
566 * of a deserialized KoralQuery object.
567 */
568 public JsonNode toJsonNode () {
569 return this.json;
570 };
571
572
Nils Diewaldea969502015-02-16 21:10:54 +0000573 /**
574 * Create a bit vector representing the live documents of the
575 * virtual collection to be used in searches.
Akron176c9b12015-07-29 19:53:40 +0200576 * This will respect deleted documents.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000577 *
578 * @param The
Akron700c1eb2015-09-25 16:57:30 +0200579 * {@link LeafReaderContext} to search in.
Nils Diewaldea969502015-02-16 21:10:54 +0000580 * @return A bit vector representing the live documents of the
581 * virtual collection.
582 * @throws IOException
583 */
Akronb59f40e2018-08-23 17:15:43 +0200584 public FixedBitSet bits (LeafReaderContext atomic) throws IOException, QueryException {
margaretha85ee2ac2018-07-25 17:58:09 +0200585
margaretha5a8abea2021-11-08 16:57:51 +0100586 // EM: really need a fixedBitset?
587 // maybe better use org.apache.lucene.util.BitDocIdSet.Builder
588 // for automatic sparse bitset support
589 // appears possible by implementing a SparseDocBits class extending
590 // SparseFixedBitSet and implementing Serializable (only as marker interface)
Akron700c1eb2015-09-25 16:57:30 +0200591 LeafReader r = atomic.reader();
Akron176c9b12015-07-29 19:53:40 +0200592 FixedBitSet bitset = new FixedBitSet(r.maxDoc());
margaretha5a8abea2021-11-08 16:57:51 +0100593
594 if (DEBUG) {
595 start = System.currentTimeMillis();
596 }
597 DocIdSet docids = null;
598 try {
599 docids = this.getDocIdSet(atomic, (Bits) r.getLiveDocs());
600 }
601 catch (RuntimeException e) {
602 Throwable t = e.getCause();
603 if (t instanceof IOException) {
604 throw new IOException(t);
605 }
606 else if (t instanceof QueryException) {
607 throw new QueryException(((QueryException) t).getErrorCode(), t.getLocalizedMessage());
608 }
margaretha05a4bc12022-02-11 10:55:43 +0100609 else {
610 throw e;
611 }
margaretha5a8abea2021-11-08 16:57:51 +0100612 }
613
614 if (DEBUG) {
615 end = System.currentTimeMillis();
616 log.info("getDocIdSet in bits: " + (end - start));
617 }
Nils Diewaldbb33da22015-03-04 16:24:25 +0000618
Akron6b0be132019-09-16 19:01:59 +0200619
Akronaa74ec62015-07-31 17:22:55 +0200620 if (docids == null) {
621 if (this.cbi != null) {
622 bitset.clear(0, bitset.length());
623 }
624 else {
625 bitset.set(0, bitset.length());
626 };
627 }
Akron6b0be132019-09-16 19:01:59 +0200628 else {
Akronaa74ec62015-07-31 17:22:55 +0200629 bitset.or(docids.iterator());
Akron6b0be132019-09-16 19:01:59 +0200630 }
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000631
Nils Diewald44d5fa12015-01-15 21:31:52 +0000632 return bitset;
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000633 };
634
Nils Diewaldea969502015-02-16 21:10:54 +0000635
636 /**
Akron176c9b12015-07-29 19:53:40 +0200637 * Return the {@link DocIdSet} representing the documents of the
638 * virtual collection to be used in searches.
639 * This will respect deleted documents.
640 *
641 * @param atomic
Akron700c1eb2015-09-25 16:57:30 +0200642 * The {@link LeafReaderContext} to search in.
Akron176c9b12015-07-29 19:53:40 +0200643 * @param accepted
644 * {@link Bits} vector of accepted documents.
645 * @throws IOException
646 */
Akron700c1eb2015-09-25 16:57:30 +0200647 public DocIdSet getDocIdSet (LeafReaderContext atomic, Bits acceptDocs)
Akronb59f40e2018-08-23 17:15:43 +0200648 throws IOException, QueryException {
Akron176c9b12015-07-29 19:53:40 +0200649
650 int maxDoc = atomic.reader().maxDoc();
651 FixedBitSet bitset = new FixedBitSet(maxDoc);
652
margaretha5a8abea2021-11-08 16:57:51 +0100653 final Filter filter = this.toFilter();
654
655 if (filter == null) {
656 if (acceptDocs == null)
657 return null;
658 bitset.set(0, maxDoc);
659 }
Akronb59f40e2018-08-23 17:15:43 +0200660 else {
Akron176c9b12015-07-29 19:53:40 +0200661
Akronb59f40e2018-08-23 17:15:43 +0200662 // Init vector
663 DocIdSet docids = filter.getDocIdSet(atomic, null);
Akron6b0be132019-09-16 19:01:59 +0200664
Akronb59f40e2018-08-23 17:15:43 +0200665 DocIdSetIterator filterIter =
666 (docids == null) ? null : docids.iterator();
667
668 if (filterIter == null) {
Akron6b0be132019-09-16 19:01:59 +0200669
Akronb59f40e2018-08-23 17:15:43 +0200670 if (!this.cbi.isNegative()) return null;
Akron176c9b12015-07-29 19:53:40 +0200671
Akronb59f40e2018-08-23 17:15:43 +0200672 bitset.set(0, maxDoc);
673 }
674 else {
Akron6b0be132019-09-16 19:01:59 +0200675
676 // Or bit set
Akronb59f40e2018-08-23 17:15:43 +0200677 bitset.or(filterIter);
678
679 // Revert for negation
680 if (this.cbi.isNegative()) bitset.flip(0, maxDoc);
681 };
682 };
Akronaa74ec62015-07-31 17:22:55 +0200683
Akronb59f40e2018-08-23 17:15:43 +0200684 if (DEBUG) {
685 log.debug("Bit set is {}", _bits(bitset));
686 log.debug("Livedocs is {}", _bits(acceptDocs));
Akron176c9b12015-07-29 19:53:40 +0200687 };
688
689 // Remove deleted docs
Eliza Margaretha6f989202016-10-14 21:48:29 +0200690 return (DocIdSet) BitsFilteredDocIdSet
Akronb59f40e2018-08-23 17:15:43 +0200691 .wrap((DocIdSet) new BitDocIdSet(bitset), acceptDocs);
Akron176c9b12015-07-29 19:53:40 +0200692 };
693
Akron40550172015-08-04 03:06:12 +0200694
Akron176c9b12015-07-29 19:53:40 +0200695 public long numberOf (String type) throws IOException {
696 return this.numberOf("tokens", type);
697 };
698
Akron40550172015-08-04 03:06:12 +0200699
Akron176c9b12015-07-29 19:53:40 +0200700 /**
Nils Diewaldea969502015-02-16 21:10:54 +0000701 * Search for the number of occurrences of different types,
702 * e.g. <i>documents</i>, <i>sentences</i> etc. in the virtual
703 * collection.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000704 *
705 * @param field
706 * The field containing the textual data and the
707 * annotations as a string.
708 * @param type
709 * The type of meta information,
710 * e.g. <i>documents</i> or <i>sentences</i> as a
711 * string.
Nils Diewaldea969502015-02-16 21:10:54 +0000712 * @return The number of the occurrences.
713 * @throws IOException
Nils Diewalda14ecd62015-02-26 21:00:20 +0000714 * @see KrillIndex#numberOf
Nils Diewaldea969502015-02-16 21:10:54 +0000715 */
716 public long numberOf (String field, String type) throws IOException {
Akron176c9b12015-07-29 19:53:40 +0200717
718 // No index defined
margaretha8efa3752018-07-24 17:46:43 +0200719 if (this.index == null) return (long) -1;
Nils Diewaldc925b492013-12-03 23:56:10 +0000720
Akronaa74ec62015-07-31 17:22:55 +0200721 // No reader (inex is empty)
margaretha8efa3752018-07-24 17:46:43 +0200722 if (this.index.reader() == null) return (long) 0;
Akronaa74ec62015-07-31 17:22:55 +0200723
Akron176c9b12015-07-29 19:53:40 +0200724 // This is redundant to index stuff
725 if (type.equals("documents") || type.equals("base/texts")) {
Akronfd05f502015-07-30 18:34:26 +0200726 if (this.cbi == null) {
margaretha8efa3752018-07-24 17:46:43 +0200727 if (this.index.reader() == null) return (long) 0;
Akron176c9b12015-07-29 19:53:40 +0200728 return (long) this.index.reader().numDocs();
Akronfd05f502015-07-30 18:34:26 +0200729 }
Akron176c9b12015-07-29 19:53:40 +0200730 else
731 return this.docCount();
732 };
Akron40550172015-08-04 03:06:12 +0200733
Akron176c9b12015-07-29 19:53:40 +0200734 // Create search term
735 // This may be prefixed by foundries
736 Term term = new Term(field, "-:" + type);
737
margaretha8efa3752018-07-24 17:46:43 +0200738 if (DEBUG) log.debug("Iterate for {}/{}", field, type);
Akronfd05f502015-07-30 18:34:26 +0200739
Akron176c9b12015-07-29 19:53:40 +0200740 long occurrences = 0;
741 try {
742 // Iterate over all atomic readers and collect occurrences
Akron700c1eb2015-09-25 16:57:30 +0200743 for (LeafReaderContext atomic : this.index.reader().leaves()) {
Akronfd05f502015-07-30 18:34:26 +0200744 Bits bits = this.bits(atomic);
Akronaa74ec62015-07-31 17:22:55 +0200745
margaretha8efa3752018-07-24 17:46:43 +0200746 if (DEBUG) log.debug("Final bits {}", _bits(bits));
Akronaa74ec62015-07-31 17:22:55 +0200747
748 occurrences += this._numberOfAtomic(bits, atomic, term);
margaretha8efa3752018-07-24 17:46:43 +0200749 if (DEBUG) log.debug("Added up to {} for {}/{}", occurrences,
750 field, type);
Akron176c9b12015-07-29 19:53:40 +0200751 };
752 }
Akron40550172015-08-04 03:06:12 +0200753
Akron176c9b12015-07-29 19:53:40 +0200754 // Something went wrong
Akronaa74ec62015-07-31 17:22:55 +0200755 catch (IOException e) {
Akronb59f40e2018-08-23 17:15:43 +0200756 log.warn(e.getLocalizedMessage());
757 }
758
759 // E.g. reference corpus not found
760 catch (QueryException e) {
761 log.warn(e.getLocalizedMessage());
Akron176c9b12015-07-29 19:53:40 +0200762 };
763
764 return occurrences;
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000765 };
766
Nils Diewaldea969502015-02-16 21:10:54 +0000767
Akron176c9b12015-07-29 19:53:40 +0200768 // Search for meta information in term vectors
769 // This will create the sum of all numerical payloads
770 // of the term in the document vector
Akron700c1eb2015-09-25 16:57:30 +0200771 private long _numberOfAtomic (Bits docvec, LeafReaderContext atomic,
Akron176c9b12015-07-29 19:53:40 +0200772 Term term) throws IOException {
773
774 // This reimplements docsAndPositionsEnum with payloads
775 final Terms terms = atomic.reader().fields().terms(term.field());
776
777 // No terms were found
778 if (terms != null) {
779 // Todo: Maybe reuse a termsEnum!
780 final TermsEnum termsEnum = terms.iterator(null);
781
margaretha8efa3752018-07-24 17:46:43 +0200782 // Set the position in the iterator to the term that is
783 // seeked
Akron176c9b12015-07-29 19:53:40 +0200784 if (termsEnum.seekExact(term.bytes())) {
785
Akronaa74ec62015-07-31 17:22:55 +0200786 // TODO: Reuse a DocsAndPositionsEnum!!
787
Akron176c9b12015-07-29 19:53:40 +0200788 // Start an iterator to fetch all payloads of the term
Akron40550172015-08-04 03:06:12 +0200789 DocsAndPositionsEnum docs = termsEnum.docsAndPositions(docvec,
790 null, DocsAndPositionsEnum.FLAG_PAYLOADS);
Akronaa74ec62015-07-31 17:22:55 +0200791
Akron176c9b12015-07-29 19:53:40 +0200792
793 // The iterator is empty
794 // This may even be an error, but we return 0
margaretha8efa3752018-07-24 17:46:43 +0200795 if (docs.docID() == DocsAndPositionsEnum.NO_MORE_DOCS) return 0;
Akron176c9b12015-07-29 19:53:40 +0200796
797 // Init some variables for data copying
798 long occurrences = 0;
799 BytesRef payload;
800
801 // Init nextDoc()
802 while (docs.nextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS) {
803
margaretha8efa3752018-07-24 17:46:43 +0200804 if (docs.freq() < 1) continue;
Akronaa74ec62015-07-31 17:22:55 +0200805
Akron176c9b12015-07-29 19:53:40 +0200806 // Initialize (go to first term)
807 docs.nextPosition();
808
809 // Copy payload with the offset of the BytesRef
810 payload = docs.getPayload();
Akronaa74ec62015-07-31 17:22:55 +0200811 if (payload != null) {
Akron40550172015-08-04 03:06:12 +0200812 System.arraycopy(payload.bytes, payload.offset, pl, 0,
813 4);
Akron176c9b12015-07-29 19:53:40 +0200814
Akronaa74ec62015-07-31 17:22:55 +0200815 // Add payload as integer
margaretha85ee2ac2018-07-25 17:58:09 +0200816 occurrences += ByteBuffer.wrap(pl).getInt();
Akronaa74ec62015-07-31 17:22:55 +0200817
margaretha8efa3752018-07-24 17:46:43 +0200818 if (DEBUG) log.debug(
819 "Value for {} incremented by {} to {} in {}",
margaretha85ee2ac2018-07-25 17:58:09 +0200820 term, ByteBuffer.wrap(pl).getInt(), occurrences,
margaretha8efa3752018-07-24 17:46:43 +0200821 docs.docID());
Akronaa74ec62015-07-31 17:22:55 +0200822 };
Akron176c9b12015-07-29 19:53:40 +0200823 };
824
825 // Return the sum of all occurrences
826 return occurrences;
827 };
828 };
829
830 // Nothing found
831 return 0;
Akron40550172015-08-04 03:06:12 +0200832 };
Akron176c9b12015-07-29 19:53:40 +0200833
834
Nils Diewaldea969502015-02-16 21:10:54 +0000835 /**
Akron176c9b12015-07-29 19:53:40 +0200836 * Return the number of documents in the virtual
837 * collection.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000838 *
Nils Diewaldea969502015-02-16 21:10:54 +0000839 * @return The number of the occurrences.
Akron176c9b12015-07-29 19:53:40 +0200840 * @see #numberOf
Nils Diewaldea969502015-02-16 21:10:54 +0000841 */
Akron176c9b12015-07-29 19:53:40 +0200842 public long docCount () {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000843
Akron176c9b12015-07-29 19:53:40 +0200844 // No index defined
margaretha8efa3752018-07-24 17:46:43 +0200845 if (this.index == null) return (long) 0;
Akron176c9b12015-07-29 19:53:40 +0200846
847 // TODO: Caching!
848
849 long docCount = 0;
850 try {
851 FixedBitSet bitset;
Akron700c1eb2015-09-25 16:57:30 +0200852 for (LeafReaderContext atomic : this.index.reader().leaves()) {
Akron176c9b12015-07-29 19:53:40 +0200853 if ((bitset = this.bits(atomic)) != null)
854 docCount += bitset.cardinality();
855 };
856 }
857 catch (IOException e) {
858 log.warn(e.getLocalizedMessage());
Akronb59f40e2018-08-23 17:15:43 +0200859 }
860 catch (QueryException e) {
861 log.warn(e.getLocalizedMessage());
Akron176c9b12015-07-29 19:53:40 +0200862 };
863 return docCount;
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000864 };
865
Nils Diewaldea969502015-02-16 21:10:54 +0000866
Akronaa74ec62015-07-31 17:22:55 +0200867 private static String _bits (Bits bitset) {
868 String str = "";
869 for (int i = 0; i < bitset.length(); i++) {
870 str += bitset.get(i) ? "1" : "0";
871 };
872 return str;
873 };
margaretha2ac95e32021-11-29 15:31:14 +0100874
margaretha5a8abea2021-11-08 16:57:51 +0100875 @Override
876 public Set<String> getAllLeafFingerprints () {
877 return index.getAllLeafFingerprints();
margaretha85ee2ac2018-07-25 17:58:09 +0200878 }
margarethafe252802018-07-30 14:59:50 +0200879
Akron176c9b12015-07-29 19:53:40 +0200880 /*
Akron176c9b12015-07-29 19:53:40 +0200881 * Analyze how terms relate
882 */
883 /*
Nils Diewald7cbcfe92014-09-22 22:01:51 +0000884 @Deprecated
Akron176c9b12015-07-29 19:53:40 +0200885 public HashMap getTermRelation (KrillCollection kc, String field)
886 throws Exception {
887 HashMap<String, Long> map = new HashMap<>(100);
888 long docNumber = 0, checkNumber = 0;
Eliza Margaretha6f989202016-10-14 21:48:29 +0200889
Nils Diewald44d5fa12015-01-15 21:31:52 +0000890 try {
Akron176c9b12015-07-29 19:53:40 +0200891 if (kc.getCount() <= 0) {
892 checkNumber = (long) this.reader().numDocs();
893 };
Eliza Margaretha6f989202016-10-14 21:48:29 +0200894
Akron700c1eb2015-09-25 16:57:30 +0200895 for (LeafReaderContext atomic : this.reader().leaves()) {
Akron176c9b12015-07-29 19:53:40 +0200896 HashMap<String, FixedBitSet> termVector = new HashMap<>(20);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200897
Akron176c9b12015-07-29 19:53:40 +0200898 FixedBitSet docvec = kc.bits(atomic);
899 if (docvec != null) {
900 docNumber += docvec.cardinality();
901 };
Eliza Margaretha6f989202016-10-14 21:48:29 +0200902
Akron176c9b12015-07-29 19:53:40 +0200903 Terms terms = atomic.reader().fields().terms(field);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200904
Akron176c9b12015-07-29 19:53:40 +0200905 if (terms == null) {
906 continue;
907 };
Eliza Margaretha6f989202016-10-14 21:48:29 +0200908
Akron176c9b12015-07-29 19:53:40 +0200909 int docLength = atomic.reader().maxDoc();
910 FixedBitSet bitset = new FixedBitSet(docLength);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200911
Akron176c9b12015-07-29 19:53:40 +0200912 // Iterate over all tokens in this field
913 TermsEnum termsEnum = terms.iterator(null);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200914
Akron176c9b12015-07-29 19:53:40 +0200915 while (termsEnum.next() != null) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200916
Akron176c9b12015-07-29 19:53:40 +0200917 String termString = termsEnum.term().utf8ToString();
Eliza Margaretha6f989202016-10-14 21:48:29 +0200918
Akron176c9b12015-07-29 19:53:40 +0200919 bitset.clear(0, docLength);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200920
Akron176c9b12015-07-29 19:53:40 +0200921 // Get frequency
922 bitset.or((DocIdSetIterator) termsEnum.docs((Bits) docvec,
923 null));
Eliza Margaretha6f989202016-10-14 21:48:29 +0200924
Akron176c9b12015-07-29 19:53:40 +0200925 long value = 0;
926 if (map.containsKey(termString))
927 value = map.get(termString);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200928
Akron176c9b12015-07-29 19:53:40 +0200929 map.put(termString, value + bitset.cardinality());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200930
Akron176c9b12015-07-29 19:53:40 +0200931 termVector.put(termString, bitset.clone());
932 };
Eliza Margaretha6f989202016-10-14 21:48:29 +0200933
Akron176c9b12015-07-29 19:53:40 +0200934 int keySize = termVector.size();
935 String[] keys = termVector.keySet()
936 .toArray(new String[keySize]);
937 java.util.Arrays.sort(keys);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200938
Akron176c9b12015-07-29 19:53:40 +0200939 if (keySize > maxTermRelations) {
940 throw new Exception("termRelations are limited to "
941 + maxTermRelations + " sets"
942 + " (requested were at least " + keySize + " sets)");
943 };
Eliza Margaretha6f989202016-10-14 21:48:29 +0200944
Akron176c9b12015-07-29 19:53:40 +0200945 for (int i = 0; i < keySize; i++) {
946 for (int j = i + 1; j < keySize; j++) {
947 FixedBitSet comby = termVector.get(keys[i]).clone();
948 comby.and(termVector.get(keys[j]));
Eliza Margaretha6f989202016-10-14 21:48:29 +0200949
Akron176c9b12015-07-29 19:53:40 +0200950 StringBuilder sb = new StringBuilder();
951 sb.append("#__").append(keys[i]).append(":###:")
952 .append(keys[j]);
953 String combString = sb.toString();
Eliza Margaretha6f989202016-10-14 21:48:29 +0200954
Akron176c9b12015-07-29 19:53:40 +0200955 long cap = (long) comby.cardinality();
956 if (map.containsKey(combString)) {
957 cap += map.get(combString);
958 };
959 map.put(combString, cap);
960 };
Nils Diewald44d5fa12015-01-15 21:31:52 +0000961 };
962 };
Akron176c9b12015-07-29 19:53:40 +0200963 map.put("-docs", checkNumber != 0 ? checkNumber : docNumber);
Nils Diewald44d5fa12015-01-15 21:31:52 +0000964 }
Akron176c9b12015-07-29 19:53:40 +0200965 catch (IOException e) {
966 log.warn(e.getMessage());
Nils Diewald44d5fa12015-01-15 21:31:52 +0000967 };
Akron176c9b12015-07-29 19:53:40 +0200968 return map;
Nils Diewald2276e1c2014-04-10 15:01:59 +0000969 };
Akron176c9b12015-07-29 19:53:40 +0200970 */
Nils Diewald65894bd2015-02-16 21:36:53 +0000971
972
Nils Diewald01b4ce32013-12-05 22:39:25 +0000973};