blob: ffa99cdea584bdf32765e30f09c29f95de8e3aba [file] [log] [blame]
Nils Diewaldbaf68c52013-11-20 13:22:19 +00001package de.ids_mannheim.korap;
2
margarethafe252802018-07-30 14:59:50 +02003import java.io.File;
4import java.io.FileInputStream;
Nils Diewaldbaf68c52013-11-20 13:22:19 +00005import java.io.IOException;
margaretha7e31ca92021-12-13 10:48:44 +01006import java.io.InputStream;
margaretha8efa3752018-07-24 17:46:43 +02007import java.nio.ByteBuffer;
margaretha4dfe3c52018-08-13 17:07:50 +02008import java.util.Properties;
margaretha5a8abea2021-11-08 16:57:51 +01009import java.util.Set;
margarethac20a9212018-08-21 14:32:09 +020010import java.util.zip.GZIPInputStream;
Nils Diewaldc925b492013-12-03 23:56:10 +000011
margarethafe252802018-07-30 14:59:50 +020012import org.apache.commons.io.IOUtils;
margarethac20a9212018-08-21 14:32:09 +020013import org.apache.commons.io.output.ByteArrayOutputStream;
margaretha8efa3752018-07-24 17:46:43 +020014import org.apache.lucene.index.DocsAndPositionsEnum;
15import org.apache.lucene.index.LeafReader;
16import org.apache.lucene.index.LeafReaderContext;
17import org.apache.lucene.index.Term;
18import org.apache.lucene.index.Terms;
19import org.apache.lucene.index.TermsEnum;
20import org.apache.lucene.search.BitsFilteredDocIdSet;
21import org.apache.lucene.search.DocIdSet;
22import org.apache.lucene.search.DocIdSetIterator;
23import org.apache.lucene.search.Filter;
24import org.apache.lucene.util.BitDocIdSet;
25import org.apache.lucene.util.Bits;
26import org.apache.lucene.util.BytesRef;
27import org.apache.lucene.util.FixedBitSet;
28import org.slf4j.Logger;
29import org.slf4j.LoggerFactory;
30
31import com.fasterxml.jackson.databind.JsonNode;
32import com.fasterxml.jackson.databind.ObjectMapper;
33
Nils Diewaldea969502015-02-16 21:10:54 +000034import de.ids_mannheim.korap.collection.CollectionBuilder;
Nils Diewaldc471b182014-11-19 22:51:15 +000035import de.ids_mannheim.korap.response.Notifications;
margaretha4dfe3c52018-08-13 17:07:50 +020036import de.ids_mannheim.korap.util.KrillProperties;
Akron176c9b12015-07-29 19:53:40 +020037import de.ids_mannheim.korap.util.QueryException;
margaretha78f397a2017-06-29 13:44:46 +020038import de.ids_mannheim.korap.util.StatusCodes;
Nils Diewaldbaf68c52013-11-20 13:22:19 +000039
Nils Diewaldc471b182014-11-19 22:51:15 +000040/**
Nils Diewaldea969502015-02-16 21:10:54 +000041 * Create a Virtual Collection of documents by means of a KoralQuery
Nils Diewald2d5f8102015-02-26 21:07:54 +000042 * collection object.
Nils Diewaldbb33da22015-03-04 16:24:25 +000043 *
Nils Diewaldea969502015-02-16 21:10:54 +000044 * <blockquote><pre>
Nils Diewaldbb33da22015-03-04 16:24:25 +000045 * KrillCollection kc = new KrillCollection(json);
Nils Diewaldea969502015-02-16 21:10:54 +000046 * </pre></blockquote>
Nils Diewaldbb33da22015-03-04 16:24:25 +000047 *
Nils Diewaldafab8f32015-01-26 19:11:32 +000048 * @author diewald
Nils Diewaldc471b182014-11-19 22:51:15 +000049 */
Nils Diewaldea969502015-02-16 21:10:54 +000050/*
51 * TODO: Make a cache for the bits
52 * Delete it in case of an extension or a filter
53 * TODO: Maybe use randomaccessfilterstrategy
54 * TODO: Maybe a constantScoreQuery can make things faster?
55 * See http://mail-archives.apache.org/mod_mbox/lucene-java-user/
56 * 200805.mbox/%3C17080852.post@talk.nabble.com%3E
57 */
margaretha5a8abea2021-11-08 16:57:51 +010058public final class KrillCollection extends Notifications implements IndexInfo {
Nils Diewalda14ecd62015-02-26 21:00:20 +000059 private KrillIndex index;
Akronbb5d1732015-06-22 01:22:40 +020060 private JsonNode json;
margaretha5a8abea2021-11-08 16:57:51 +010061 private final CollectionBuilder cb = new CollectionBuilder(this);
Akron60dfa7e2015-08-03 22:15:17 +020062 private CollectionBuilder.Interface cbi;
Akron176c9b12015-07-29 19:53:40 +020063 private byte[] pl = new byte[4];
Akronb59f40e2018-08-23 17:15:43 +020064
Akron65d57e92018-08-24 19:25:56 +020065 private ObjectMapper mapper = new ObjectMapper();
66
Akronb59f40e2018-08-23 17:15:43 +020067 private Filter prefiltered = null;
margaretha85ee2ac2018-07-25 17:58:09 +020068 // private static ByteBuffer bb = ByteBuffer.allocate(4);
Nils Diewaldbb33da22015-03-04 16:24:25 +000069
Nils Diewaldbaf68c52013-11-20 13:22:19 +000070 // Logger
margaretha5a8abea2021-11-08 16:57:51 +010071 private final static Logger log =
margaretha8efa3752018-07-24 17:46:43 +020072 LoggerFactory.getLogger(KrillCollection.class);
Nils Diewald7cbcfe92014-09-22 22:01:51 +000073 // This advices the java compiler to ignore all loggings
Akron3ba74f22015-07-24 18:46:17 +020074 public static final boolean DEBUG = false;
margaretha5a8abea2021-11-08 16:57:51 +010075 private double start, end; // for debugging
margaretha85ee2ac2018-07-25 17:58:09 +020076
Nils Diewald65894bd2015-02-16 21:36:53 +000077 /**
Akron176c9b12015-07-29 19:53:40 +020078 * Construct a new KrillCollection.
79 *
80 */
margaretha5a8abea2021-11-08 16:57:51 +010081 public KrillCollection () {};
Akron176c9b12015-07-29 19:53:40 +020082
83
/**
 * Construct a new KrillCollection bound to a KrillIndex.
 *
 * @param index
 *            The {@link KrillIndex} object.
 */
public KrillCollection (KrillIndex index) {
    super();
    this.index = index;
}
93
margaretha85ee2ac2018-07-25 17:58:09 +020094 /**
Nils Diewald2d5f8102015-02-26 21:07:54 +000095 * Construct a new KrillCollection by passing a KoralQuery.
Nils Diewaldbb33da22015-03-04 16:24:25 +000096 *
Akron176c9b12015-07-29 19:53:40 +020097 * @param json
98 * The KoralQuery document as a JSON string.
Nils Diewald33fcb5d2014-11-07 23:27:03 +000099 */
Nils Diewald2d5f8102015-02-26 21:07:54 +0000100 public KrillCollection (String jsonString) {
Nils Diewald44d5fa12015-01-15 21:31:52 +0000101 try {
102 JsonNode json = mapper.readTree(jsonString);
margaretha8efa3752018-07-24 17:46:43 +0200103
104 if (json.has("errors") && json.get("errors").size() > 0) {
105 this.addError(StatusCodes.INVALID_QUERY, "Json has errors.");
margarethaf2c31502017-06-26 17:57:16 +0200106 }
margaretha8efa3752018-07-24 17:46:43 +0200107 else if (json.has("collection")) {
Akron850b46e2016-06-08 10:08:55 +0200108 this.fromKoral(json.get("collection"));
margarethaf2c31502017-06-26 17:57:16 +0200109 }
margaretha8efa3752018-07-24 17:46:43 +0200110 else if (json.has("collections")) {
Akron40550172015-08-04 03:06:12 +0200111 this.addError(899,
112 "Collections are not supported anymore in favour of a single collection");
margarethaf2c31502017-06-26 17:57:16 +0200113 }
margaretha8efa3752018-07-24 17:46:43 +0200114 else {
115 this.addError(StatusCodes.MISSING_COLLECTION,
Akronb59f40e2018-08-23 17:15:43 +0200116 "Collection is not found");
margaretha8efa3752018-07-24 17:46:43 +0200117 this.fromBuilder(this.build().nothing());
margarethaf2c31502017-06-26 17:57:16 +0200118 }
Nils Diewald44d5fa12015-01-15 21:31:52 +0000119 }
Akron176c9b12015-07-29 19:53:40 +0200120
121 // Query Exception
Nils Diewald44d5fa12015-01-15 21:31:52 +0000122 catch (QueryException qe) {
Nils Diewaldea969502015-02-16 21:10:54 +0000123 this.addError(qe.getErrorCode(), qe.getMessage());
margaretha8efa3752018-07-24 17:46:43 +0200124 this.fromBuilder(this.build().nothing());
Nils Diewald44d5fa12015-01-15 21:31:52 +0000125 }
Akron176c9b12015-07-29 19:53:40 +0200126
127 // JSON exception
Nils Diewald44d5fa12015-01-15 21:31:52 +0000128 catch (IOException e) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000129 this.addError(621, "Unable to parse JSON", "KrillCollection",
130 e.getLocalizedMessage());
margaretha8efa3752018-07-24 17:46:43 +0200131 this.fromBuilder(this.build().nothing());
Nils Diewald44d5fa12015-01-15 21:31:52 +0000132 };
Nils Diewald01b4ce32013-12-05 22:39:25 +0000133 };
134
margaretha5a8abea2021-11-08 16:57:51 +0100135
Nils Diewaldea969502015-02-16 21:10:54 +0000136 /**
Akron176c9b12015-07-29 19:53:40 +0200137 * Set the {@link KrillIndex} the virtual collection refers to.
138 *
139 * @param index
140 * The {@link KrillIndex} the virtual collection refers
141 * to.
Nils Diewaldea969502015-02-16 21:10:54 +0000142 */
Akron176c9b12015-07-29 19:53:40 +0200143 public void setIndex (KrillIndex index) {
144 this.index = index;
Nils Diewaldc925b492013-12-03 23:56:10 +0000145 };
146
Nils Diewald33fcb5d2014-11-07 23:27:03 +0000147
Nils Diewaldea969502015-02-16 21:10:54 +0000148 /**
149 * Import the "collection" part of a KoralQuery.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000150 *
151 * @param jsonString
152 * The "collection" part of a KoralQuery.
Nils Diewaldea969502015-02-16 21:10:54 +0000153 * @throws QueryException
154 */
Akron850b46e2016-06-08 10:08:55 +0200155 public KrillCollection fromKoral (String jsonString) throws QueryException {
margaretha5a8abea2021-11-08 16:57:51 +0100156 this.prefiltered = null;
Nils Diewald44d5fa12015-01-15 21:31:52 +0000157 try {
Akron850b46e2016-06-08 10:08:55 +0200158 this.fromKoral((JsonNode) mapper.readTree(jsonString));
Nils Diewald44d5fa12015-01-15 21:31:52 +0000159 }
160 catch (Exception e) {
Nils Diewald2d5f8102015-02-26 21:07:54 +0000161 this.addError(621, "Unable to parse JSON", "KrillCollection");
margaretha8efa3752018-07-24 17:46:43 +0200162 this.fromBuilder(this.build().nothing());
Nils Diewald44d5fa12015-01-15 21:31:52 +0000163 };
Nils Diewald3aa9e692015-02-20 22:20:11 +0000164
165 return this;
Nils Diewalde3645702014-11-07 21:15:20 +0000166 };
Nils Diewald78993522014-10-27 17:51:22 +0000167
Nils Diewald33fcb5d2014-11-07 23:27:03 +0000168
Akron65d57e92018-08-24 19:25:56 +0200169 public KrillCollection fromStore (String ref) throws QueryException {
Akronb59f40e2018-08-23 17:15:43 +0200170 this.prefiltered = null;
margaretha88258da2024-06-07 12:19:51 +0200171 String namedVCPath = KrillProperties.namedVCPath;
Akronb59f40e2018-08-23 17:15:43 +0200172 if (!namedVCPath.endsWith("/")) {
173 namedVCPath += "/";
174 };
175
176 String fileName = namedVCPath + ref + ".jsonld";
177 File file;
178 String json = null;
margaretha7e31ca92021-12-13 10:48:44 +0100179 InputStream is = null;
Akronb59f40e2018-08-23 17:15:43 +0200180 if ((file= new File(fileName)).exists()) {
181 try (FileInputStream fis = new FileInputStream(file)) {
182 json = IOUtils.toString(fis,"utf-8");
183 }
184 catch (IOException e) {
185 this.addError(StatusCodes.READING_COLLECTION_FAILED,
186 e.getMessage());
187 return this;
188 }
189 }
190 // slower than plain text, but save space
191 else if ((file = new File(fileName + ".gz")).exists()){
192 try (GZIPInputStream gzipInputStream =
193 new GZIPInputStream(new FileInputStream(file));
194 ByteArrayOutputStream bos =
195 new ByteArrayOutputStream(512);) {
196 bos.write(gzipInputStream);
197 json = bos.toString("utf-8");
198 }
199 catch (IOException e) {
200 this.addError(StatusCodes.READING_COLLECTION_FAILED,
201 e.getMessage());
202 return this;
203 }
204 }
margaretha7e31ca92021-12-13 10:48:44 +0100205 // for testing
margaretha88258da2024-06-07 12:19:51 +0200206 else if (KrillProperties.isTest
margaretha7e31ca92021-12-13 10:48:44 +0100207 && (is = retrieveInputStreamFromClasspath(fileName)) != null) {
208 try {
209 json = IOUtils.toString(is, "utf-8");
210 }
211 catch (IOException e) {
212 this.addError(StatusCodes.READING_COLLECTION_FAILED,
213 e.getMessage());
214 return this;
215 }
216 }
Akronb59f40e2018-08-23 17:15:43 +0200217 else{
218 this.addError(StatusCodes.MISSING_COLLECTION,
219 "Collection is not found " + fileName);
220 return this;
221 };
222
223 return this.fromKoral(json);
224 };
225
226
margaretha7e31ca92021-12-13 10:48:44 +0100227 private InputStream retrieveInputStreamFromClasspath (String fileName) {
228 if (!fileName.startsWith("/")) {
229 fileName = "/"+fileName;
230 }
231 return KrillCollection.class.getResourceAsStream(fileName);
232 }
233
234
Nils Diewaldea969502015-02-16 21:10:54 +0000235 /**
236 * Import the "collection" part of a KoralQuery.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000237 *
238 * @param json
239 * The "collection" part of a KoralQuery
240 * as a {@link JsonNode} object.
Nils Diewaldea969502015-02-16 21:10:54 +0000241 * @throws QueryException
242 */
Akron850b46e2016-06-08 10:08:55 +0200243 public KrillCollection fromKoral (JsonNode json) throws QueryException {
Akronbb5d1732015-06-22 01:22:40 +0200244 this.json = json;
Akronb59f40e2018-08-23 17:15:43 +0200245 this.prefiltered = null;
Akron850b46e2016-06-08 10:08:55 +0200246 return this.fromBuilder(this._fromKoral(json));
Nils Diewald65894bd2015-02-16 21:36:53 +0000247 };
248
249
Akrond5ca00a2016-06-08 14:29:00 +0200250 // Create collection from KoralQuery
Akron850b46e2016-06-08 10:08:55 +0200251 private CollectionBuilder.Interface _fromKoral (JsonNode json)
Akronb59f40e2018-08-23 17:15:43 +0200252 throws QueryException {
253
254 if (json.has("collection")) {
255 return this._fromKoral(json.at("/collection"));
256 };
Nils Diewald65894bd2015-02-16 21:36:53 +0000257
Akronc63697c2015-06-17 22:32:02 +0200258 if (!json.has("@type")) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000259 throw new QueryException(701,
260 "JSON-LD group has no @type attribute");
Akronc63697c2015-06-17 22:32:02 +0200261 };
Nils Diewald65894bd2015-02-16 21:36:53 +0000262
263 String type = json.get("@type").asText();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000264
Nils Diewaldcec40f92015-02-19 22:20:02 +0000265 if (type.equals("koral:doc")) {
Nils Diewald65894bd2015-02-16 21:36:53 +0000266
margaretha85ee2ac2018-07-25 17:58:09 +0200267 // default key
Nils Diewaldbb33da22015-03-04 16:24:25 +0000268 String key = "tokens";
Nils Diewald65894bd2015-02-16 21:36:53 +0000269 String valtype = "type:string";
Nils Diewaldbb33da22015-03-04 16:24:25 +0000270 String match = "match:eq";
Nils Diewald65894bd2015-02-16 21:36:53 +0000271
margaretha8efa3752018-07-24 17:46:43 +0200272 if (json.has("key")) key = json.get("key").asText();
Nils Diewaldbb33da22015-03-04 16:24:25 +0000273
margaretha8efa3752018-07-24 17:46:43 +0200274 if (json.has("type")) valtype = json.get("type").asText();
Nils Diewald65894bd2015-02-16 21:36:53 +0000275
276 // Filter based on date
277 if (valtype.equals("type:date")) {
278
279 if (!json.has("value"))
Akron176c9b12015-07-29 19:53:40 +0200280 throw new QueryException(820, "Dates require value fields");
Nils Diewald65894bd2015-02-16 21:36:53 +0000281
282 String dateStr = json.get("value").asText();
Akron176c9b12015-07-29 19:53:40 +0200283
margaretha8efa3752018-07-24 17:46:43 +0200284 if (json.has("match")) match = json.get("match").asText();
Nils Diewald65894bd2015-02-16 21:36:53 +0000285
286 // TODO: This isn't stable yet
287 switch (match) {
Akron40550172015-08-04 03:06:12 +0200288 case "match:eq":
289 return this.cb.date(key, dateStr);
290 case "match:ne":
291 return this.cb.date(key, dateStr).not();
292 case "match:geq":
293 return this.cb.since(key, dateStr);
294 case "match:leq":
295 return this.cb.till(key, dateStr);
Nils Diewald65894bd2015-02-16 21:36:53 +0000296 };
Akron48937e92015-06-26 01:49:02 +0200297
Eliza Margaretha6f989202016-10-14 21:48:29 +0200298 throw new QueryException(841,
299 "Match relation unknown for type");
Nils Diewald65894bd2015-02-16 21:36:53 +0000300 }
301
Akron48937e92015-06-26 01:49:02 +0200302 // Filter based on string
Nils Diewald65894bd2015-02-16 21:36:53 +0000303 else if (valtype.equals("type:string")) {
margarethaecddb0b2018-07-31 15:23:38 +0200304 if (json.get("value").size() > 1){
Akrone64cc162019-01-08 18:40:37 +0100305 if (DEBUG) {
306 log.debug("koral:doc size " + json.get("value").size());
307 };
margarethadf0e9d12018-07-30 16:22:59 +0200308 if (json.has("match")) {
309 match = json.get("match").asText();
310 }
311
margaretha8a8c4272018-08-21 17:39:27 +0200312 CollectionBuilder.Group group = this.cb.orGroup();
313 for (JsonNode value : json.get("value")) {
314 group.with(cb.term(key, value.asText()));
margarethadf0e9d12018-07-30 16:22:59 +0200315 }
margaretha8a8c4272018-08-21 17:39:27 +0200316
317 if (match.equals("match:ne")) {
318 return group.not();
margarethadf0e9d12018-07-30 16:22:59 +0200319 }
320 return group;
321 }
322
margaretha8efa3752018-07-24 17:46:43 +0200323 if (json.has("match")) match = json.get("match").asText();
Nils Diewald65894bd2015-02-16 21:36:53 +0000324
Akron176c9b12015-07-29 19:53:40 +0200325 switch (match) {
326
Akron40550172015-08-04 03:06:12 +0200327 case "match:eq":
328 return this.cb.term(key, json.get("value").asText());
329 case "match:ne":
330 return this.cb.term(key, json.get("value").asText())
331 .not();
Akron176c9b12015-07-29 19:53:40 +0200332
margaretha8efa3752018-07-24 17:46:43 +0200333 // Contains and containsnot (or excludes) is only
334 // effective on text fields and ineffective on
335 // string fields
Akron40550172015-08-04 03:06:12 +0200336 case "match:contains":
margaretha8efa3752018-07-24 17:46:43 +0200337 return this.cb.text(key, json.get("value").asText());
Akron176c9b12015-07-29 19:53:40 +0200338
Akron40550172015-08-04 03:06:12 +0200339 case "match:containsnot":
margaretha8efa3752018-07-24 17:46:43 +0200340 return this.cb.text(key, json.get("value").asText())
341 .not();
Akron176c9b12015-07-29 19:53:40 +0200342
Eliza Margaretha6f989202016-10-14 21:48:29 +0200343 // <LEGACY>
Akron40550172015-08-04 03:06:12 +0200344 case "match:excludes":
margaretha8efa3752018-07-24 17:46:43 +0200345 return this.cb.text(key, json.get("value").asText())
346 .not();
Eliza Margaretha6f989202016-10-14 21:48:29 +0200347 // </LEGACY>
Akron48937e92015-06-26 01:49:02 +0200348 };
Nils Diewald65894bd2015-02-16 21:36:53 +0000349
Eliza Margaretha6f989202016-10-14 21:48:29 +0200350 throw new QueryException(841,
351 "Match relation unknown for type");
Akron48937e92015-06-26 01:49:02 +0200352 }
353
354 // Filter based on regex
355 else if (valtype.equals("type:regex")) {
Akron176c9b12015-07-29 19:53:40 +0200356
margaretha8efa3752018-07-24 17:46:43 +0200357 if (json.has("match")) match = json.get("match").asText();
Akron48937e92015-06-26 01:49:02 +0200358
359 if (match.equals("match:eq")) {
Akronfd05f502015-07-30 18:34:26 +0200360 return this.cb.re(key, json.get("value").asText());
Akron48937e92015-06-26 01:49:02 +0200361 }
362 else if (match.equals("match:ne")) {
Akronfd05f502015-07-30 18:34:26 +0200363 return this.cb.re(key, json.get("value").asText()).not();
Akron176c9b12015-07-29 19:53:40 +0200364 }
Akron27469702018-04-05 12:46:18 +0200365
margaretha8efa3752018-07-24 17:46:43 +0200366 // Contains and containsnot (or excludes) is
367 // identical to eq and ne in case of regexes for the
368 // moment,
369 // though it may be beneficial to circumfix these
370 // with .*
Akron176c9b12015-07-29 19:53:40 +0200371 else if (match.equals("match:contains")) {
Akronfd05f502015-07-30 18:34:26 +0200372 return this.cb.re(key, json.get("value").asText());
Akron176c9b12015-07-29 19:53:40 +0200373 }
Akron27469702018-04-05 12:46:18 +0200374 else if (match.equals("match:containsnot")) {
375 return this.cb.re(key, json.get("value").asText());
376 }
margaretha8efa3752018-07-24 17:46:43 +0200377 // <LEGACY>
Akron176c9b12015-07-29 19:53:40 +0200378 else if (match.equals("match:excludes")) {
Akronfd05f502015-07-30 18:34:26 +0200379 return this.cb.re(key, json.get("value").asText()).not();
Akron48937e92015-06-26 01:49:02 +0200380 };
margaretha8efa3752018-07-24 17:46:43 +0200381 // </LEGACY>
Akron48937e92015-06-26 01:49:02 +0200382
Eliza Margaretha6f989202016-10-14 21:48:29 +0200383 throw new QueryException(841,
384 "Match relation unknown for type");
margaretha85ee2ac2018-07-25 17:58:09 +0200385 }
386
Akron176c9b12015-07-29 19:53:40 +0200387 throw new QueryException(843, "Document type is not supported");
Nils Diewald65894bd2015-02-16 21:36:53 +0000388 }
389
390 // nested group
Akronb59f40e2018-08-23 17:15:43 +0200391 else if (type.equals("koral:docGroup")) {
Akron176c9b12015-07-29 19:53:40 +0200392
Nils Diewald65894bd2015-02-16 21:36:53 +0000393 if (!json.has("operands") || !json.get("operands").isArray())
Akron40550172015-08-04 03:06:12 +0200394 throw new QueryException(842,
395 "Document group needs operand list");
Akron176c9b12015-07-29 19:53:40 +0200396
Akron60dfa7e2015-08-03 22:15:17 +0200397 CollectionBuilder.Group group;
Nils Diewald65894bd2015-02-16 21:36:53 +0000398
399 String operation = "operation:and";
400 if (json.has("operation"))
Akron40550172015-08-04 03:06:12 +0200401 operation = json.get("operation").asText();
Nils Diewald65894bd2015-02-16 21:36:53 +0000402
Akron176c9b12015-07-29 19:53:40 +0200403 if (operation.equals("operation:or"))
Akronfd05f502015-07-30 18:34:26 +0200404 group = this.cb.orGroup();
Akron176c9b12015-07-29 19:53:40 +0200405 else if (operation.equals("operation:and"))
Akronfd05f502015-07-30 18:34:26 +0200406 group = this.cb.andGroup();
Akron176c9b12015-07-29 19:53:40 +0200407 else
Akron40550172015-08-04 03:06:12 +0200408 throw new QueryException(810,
409 "Unknown document group operation");
410
Nils Diewald65894bd2015-02-16 21:36:53 +0000411 for (JsonNode operand : json.get("operands")) {
Akron6b0be132019-09-16 19:01:59 +0200412
413 // TODO:
414 // Potentially bed here, when operand is a group inside a group
415 // with the same operator (and not negative)
Akron850b46e2016-06-08 10:08:55 +0200416 group.with(this._fromKoral(operand));
Nils Diewald65894bd2015-02-16 21:36:53 +0000417 };
Akron176c9b12015-07-29 19:53:40 +0200418 return group;
Nils Diewald65894bd2015-02-16 21:36:53 +0000419 }
Akronb59f40e2018-08-23 17:15:43 +0200420
margaretha8efa3752018-07-24 17:46:43 +0200421 // vc reference
422 else if (type.equals("koral:docGroupRef")) {
Akronb59f40e2018-08-23 17:15:43 +0200423
margaretha8efa3752018-07-24 17:46:43 +0200424 if (!json.has("ref")) {
425 throw new QueryException(StatusCodes.MISSING_VC_REFERENCE,
426 "ref is not found");
427 }
margaretha85ee2ac2018-07-25 17:58:09 +0200428
margaretha8efa3752018-07-24 17:46:43 +0200429 String ref = json.get("ref").asText();
margaretha85ee2ac2018-07-25 17:58:09 +0200430 if (ref.isEmpty()) {
margaretha8efa3752018-07-24 17:46:43 +0200431 throw new QueryException(StatusCodes.MISSING_VC_REFERENCE,
432 "ref is empty");
Akronb59f40e2018-08-23 17:15:43 +0200433 };
margaretha85ee2ac2018-07-25 17:58:09 +0200434
Akronb59f40e2018-08-23 17:15:43 +0200435 return this.cb.referTo(ref);
margaretha8efa3752018-07-24 17:46:43 +0200436 }
437
Nils Diewald65894bd2015-02-16 21:36:53 +0000438
439 // Unknown type
Akron176c9b12015-07-29 19:53:40 +0200440 throw new QueryException(813, "Collection type is not supported");
Akron40550172015-08-04 03:06:12 +0200441 };
442
Akron176c9b12015-07-29 19:53:40 +0200443 /**
444 * Set the collection from a {@link CollectionBuilder} object.
445 *
Akron40550172015-08-04 03:06:12 +0200446 * @param cb
447 * The CollectionBuilder object.
Akron176c9b12015-07-29 19:53:40 +0200448 */
Akron60dfa7e2015-08-03 22:15:17 +0200449 public KrillCollection fromBuilder (CollectionBuilder.Interface cbi) {
Akronb59f40e2018-08-23 17:15:43 +0200450 this.prefiltered = null;
Akronfd05f502015-07-30 18:34:26 +0200451 this.cbi = cbi;
Nils Diewald3aa9e692015-02-20 22:20:11 +0000452 return this;
Nils Diewalde3645702014-11-07 21:15:20 +0000453 };
454
Akron40550172015-08-04 03:06:12 +0200455
Akron60dfa7e2015-08-03 22:15:17 +0200456 public CollectionBuilder.Interface getBuilder () {
Akronfd05f502015-07-30 18:34:26 +0200457 return this.cbi;
458 };
459
460
461 public CollectionBuilder build () {
Akron176c9b12015-07-29 19:53:40 +0200462 return this.cb;
Nils Diewald01b4ce32013-12-05 22:39:25 +0000463 };
464
Akron40550172015-08-04 03:06:12 +0200465
Akron60dfa7e2015-08-03 22:15:17 +0200466 public KrillCollection filter (CollectionBuilder.Interface filter) {
Akronfd05f502015-07-30 18:34:26 +0200467 return this.fromBuilder(this.cb.andGroup().with(this.cbi).with(filter));
468 };
469
Akron40550172015-08-04 03:06:12 +0200470
Akron60dfa7e2015-08-03 22:15:17 +0200471 public KrillCollection extend (CollectionBuilder.Interface extension) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200472 return this
473 .fromBuilder(this.cb.orGroup().with(this.cbi).with(extension));
Akronfd05f502015-07-30 18:34:26 +0200474 };
475
476
477
Nils Diewaldea969502015-02-16 21:10:54 +0000478 /**
479 * Add a filter based on a list of unique document identifiers.
480 * UIDs may be indexed in the field "UID".
Nils Diewaldbb33da22015-03-04 16:24:25 +0000481 *
Nils Diewaldea969502015-02-16 21:10:54 +0000482 * This filter is not part of the legacy API!
Nils Diewaldbb33da22015-03-04 16:24:25 +0000483 *
484 * @param uids
485 * The list of unique document identifier.
Nils Diewald2d5f8102015-02-26 21:07:54 +0000486 * @return The {@link KrillCollection} object for chaining.
Nils Diewaldea969502015-02-16 21:10:54 +0000487 */
Nils Diewald2d5f8102015-02-26 21:07:54 +0000488 public KrillCollection filterUIDs (String ... uids) {
Akronb59f40e2018-08-23 17:15:43 +0200489 this.prefiltered = null;
Akron60dfa7e2015-08-03 22:15:17 +0200490 CollectionBuilder.Group cbg = this.cb.orGroup();
Akronfd05f502015-07-30 18:34:26 +0200491 for (String uid : uids) {
492 cbg.with(this.cb.term("UID", uid));
493 };
Akron60dfa7e2015-08-03 22:15:17 +0200494 return this.filter(cbg);
Nils Diewaldd723d812014-09-23 18:50:52 +0000495 };
496
497
Nils Diewaldea969502015-02-16 21:10:54 +0000498 /**
Akron176c9b12015-07-29 19:53:40 +0200499 * Serialize collection to a {@link Filter} object.
Nils Diewaldea969502015-02-16 21:10:54 +0000500 */
Akronb59f40e2018-08-23 17:15:43 +0200501 public Filter toFilter () throws QueryException {
502 if (this.cbi == null)
503 return null;
504
505 if (this.prefiltered != null)
506 return this.prefiltered;
507
508 this.prefiltered = this.cbi.toFilter();
509 return this.prefiltered;
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000510 };
511
Nils Diewaldea969502015-02-16 21:10:54 +0000512
513 /**
Akron176c9b12015-07-29 19:53:40 +0200514 * Boolean value if the collection should work inverted or
515 * not.
Nils Diewaldea969502015-02-16 21:10:54 +0000516 */
Akron176c9b12015-07-29 19:53:40 +0200517 public boolean isNegative () {
margaretha8efa3752018-07-24 17:46:43 +0200518 if (this.cbi == null) return false;
Nils Diewald01b4ce32013-12-05 22:39:25 +0000519
Akronfd05f502015-07-30 18:34:26 +0200520 return this.cbi.isNegative();
Nils Diewaldea969502015-02-16 21:10:54 +0000521 };
522
523
524 /**
Akronb59f40e2018-08-23 17:15:43 +0200525 * Generate a string representation of the virtual collection.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000526 *
527 * <strong>Warning</strong>: This currently does not generate a
528 * valid
Nils Diewaldea969502015-02-16 21:10:54 +0000529 * KoralQuery string, so this may change in a future version.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000530 *
Nils Diewaldea969502015-02-16 21:10:54 +0000531 * @return A string representation of the virtual collection.
532 */
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000533 public String toString () {
Akronb59f40e2018-08-23 17:15:43 +0200534 try {
535 Filter filter = this.toFilter();
536 if (filter == null) return "";
537 return (this.isNegative() ? "-" : "") + filter.toString();
538 }
539 catch (QueryException qe) {
540 log.warn(qe.getLocalizedMessage());
541 };
542 return "";
Nils Diewaldb1c3b652013-12-28 22:47:00 +0000543 };
544
Nils Diewaldea969502015-02-16 21:10:54 +0000545
Nils Diewald7cbcfe92014-09-22 22:01:51 +0000546 /**
Akronbb5d1732015-06-22 01:22:40 +0200547 * Return the associated KoralQuery collection object
548 * as a {@link JsonNode}. This won't work,
549 * if the object was build using a CollectionBuilder,
550 * therefore it is limited to mirror a deserialized KoralQuery
551 * object.
552 *
553 * @return The {@link JsonNode} representing the collection object
554 * of a deserialized KoralQuery object.
555 */
556 public JsonNode toJsonNode () {
557 return this.json;
558 };
559
560
Nils Diewaldea969502015-02-16 21:10:54 +0000561 /**
562 * Create a bit vector representing the live documents of the
563 * virtual collection to be used in searches.
Akron176c9b12015-07-29 19:53:40 +0200564 * This will respect deleted documents.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000565 *
566 * @param The
Akron700c1eb2015-09-25 16:57:30 +0200567 * {@link LeafReaderContext} to search in.
Nils Diewaldea969502015-02-16 21:10:54 +0000568 * @return A bit vector representing the live documents of the
569 * virtual collection.
570 * @throws IOException
571 */
Akronb59f40e2018-08-23 17:15:43 +0200572 public FixedBitSet bits (LeafReaderContext atomic) throws IOException, QueryException {
margaretha85ee2ac2018-07-25 17:58:09 +0200573
margaretha5a8abea2021-11-08 16:57:51 +0100574 // EM: really need a fixedBitset?
575 // maybe better use org.apache.lucene.util.BitDocIdSet.Builder
576 // for automatic sparse bitset support
577 // appears possible by implementing a SparseDocBits class extending
578 // SparseFixedBitSet and implementing Serializable (only as marker interface)
Akron700c1eb2015-09-25 16:57:30 +0200579 LeafReader r = atomic.reader();
Akron176c9b12015-07-29 19:53:40 +0200580 FixedBitSet bitset = new FixedBitSet(r.maxDoc());
margaretha5a8abea2021-11-08 16:57:51 +0100581
582 if (DEBUG) {
583 start = System.currentTimeMillis();
584 }
585 DocIdSet docids = null;
586 try {
587 docids = this.getDocIdSet(atomic, (Bits) r.getLiveDocs());
588 }
589 catch (RuntimeException e) {
590 Throwable t = e.getCause();
591 if (t instanceof IOException) {
592 throw new IOException(t);
593 }
594 else if (t instanceof QueryException) {
595 throw new QueryException(((QueryException) t).getErrorCode(), t.getLocalizedMessage());
596 }
margaretha05a4bc12022-02-11 10:55:43 +0100597 else {
598 throw e;
599 }
margaretha5a8abea2021-11-08 16:57:51 +0100600 }
601
602 if (DEBUG) {
603 end = System.currentTimeMillis();
604 log.info("getDocIdSet in bits: " + (end - start));
605 }
Nils Diewaldbb33da22015-03-04 16:24:25 +0000606
Akron6b0be132019-09-16 19:01:59 +0200607
Akronaa74ec62015-07-31 17:22:55 +0200608 if (docids == null) {
609 if (this.cbi != null) {
610 bitset.clear(0, bitset.length());
611 }
612 else {
613 bitset.set(0, bitset.length());
614 };
615 }
Akron6b0be132019-09-16 19:01:59 +0200616 else {
Akronaa74ec62015-07-31 17:22:55 +0200617 bitset.or(docids.iterator());
Akron6b0be132019-09-16 19:01:59 +0200618 }
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000619
Nils Diewald44d5fa12015-01-15 21:31:52 +0000620 return bitset;
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000621 };
622
Nils Diewaldea969502015-02-16 21:10:54 +0000623
624 /**
Akron176c9b12015-07-29 19:53:40 +0200625 * Return the {@link DocIdSet} representing the documents of the
626 * virtual collection to be used in searches.
627 * This will respect deleted documents.
628 *
629 * @param atomic
Akron700c1eb2015-09-25 16:57:30 +0200630 * The {@link LeafReaderContext} to search in.
Akron176c9b12015-07-29 19:53:40 +0200631 * @param accepted
632 * {@link Bits} vector of accepted documents.
633 * @throws IOException
634 */
Akron700c1eb2015-09-25 16:57:30 +0200635 public DocIdSet getDocIdSet (LeafReaderContext atomic, Bits acceptDocs)
Akronb59f40e2018-08-23 17:15:43 +0200636 throws IOException, QueryException {
Akron176c9b12015-07-29 19:53:40 +0200637
638 int maxDoc = atomic.reader().maxDoc();
639 FixedBitSet bitset = new FixedBitSet(maxDoc);
640
margaretha5a8abea2021-11-08 16:57:51 +0100641 final Filter filter = this.toFilter();
642
643 if (filter == null) {
644 if (acceptDocs == null)
645 return null;
646 bitset.set(0, maxDoc);
647 }
Akronb59f40e2018-08-23 17:15:43 +0200648 else {
Akron176c9b12015-07-29 19:53:40 +0200649
Akronb59f40e2018-08-23 17:15:43 +0200650 // Init vector
651 DocIdSet docids = filter.getDocIdSet(atomic, null);
Akron6b0be132019-09-16 19:01:59 +0200652
Akronb59f40e2018-08-23 17:15:43 +0200653 DocIdSetIterator filterIter =
654 (docids == null) ? null : docids.iterator();
655
656 if (filterIter == null) {
Akron6b0be132019-09-16 19:01:59 +0200657
Akronb59f40e2018-08-23 17:15:43 +0200658 if (!this.cbi.isNegative()) return null;
Akron176c9b12015-07-29 19:53:40 +0200659
Akronb59f40e2018-08-23 17:15:43 +0200660 bitset.set(0, maxDoc);
661 }
662 else {
Akron6b0be132019-09-16 19:01:59 +0200663
664 // Or bit set
Akronb59f40e2018-08-23 17:15:43 +0200665 bitset.or(filterIter);
666
667 // Revert for negation
668 if (this.cbi.isNegative()) bitset.flip(0, maxDoc);
669 };
670 };
Akronaa74ec62015-07-31 17:22:55 +0200671
Akronb59f40e2018-08-23 17:15:43 +0200672 if (DEBUG) {
673 log.debug("Bit set is {}", _bits(bitset));
674 log.debug("Livedocs is {}", _bits(acceptDocs));
Akron176c9b12015-07-29 19:53:40 +0200675 };
676
677 // Remove deleted docs
Eliza Margaretha6f989202016-10-14 21:48:29 +0200678 return (DocIdSet) BitsFilteredDocIdSet
Akronb59f40e2018-08-23 17:15:43 +0200679 .wrap((DocIdSet) new BitDocIdSet(bitset), acceptDocs);
Akron176c9b12015-07-29 19:53:40 +0200680 };
681
Akron40550172015-08-04 03:06:12 +0200682
Akron176c9b12015-07-29 19:53:40 +0200683 public long numberOf (String type) throws IOException {
684 return this.numberOf("tokens", type);
685 };
686
Akron40550172015-08-04 03:06:12 +0200687
Akron176c9b12015-07-29 19:53:40 +0200688 /**
Nils Diewaldea969502015-02-16 21:10:54 +0000689 * Search for the number of occurrences of different types,
690 * e.g. <i>documents</i>, <i>sentences</i> etc. in the virtual
691 * collection.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000692 *
693 * @param field
694 * The field containing the textual data and the
695 * annotations as a string.
696 * @param type
697 * The type of meta information,
698 * e.g. <i>documents</i> or <i>sentences</i> as a
699 * string.
Nils Diewaldea969502015-02-16 21:10:54 +0000700 * @return The number of the occurrences.
701 * @throws IOException
Nils Diewalda14ecd62015-02-26 21:00:20 +0000702 * @see KrillIndex#numberOf
Nils Diewaldea969502015-02-16 21:10:54 +0000703 */
704 public long numberOf (String field, String type) throws IOException {
Akron176c9b12015-07-29 19:53:40 +0200705
706 // No index defined
margaretha8efa3752018-07-24 17:46:43 +0200707 if (this.index == null) return (long) -1;
Nils Diewaldc925b492013-12-03 23:56:10 +0000708
Akronaa74ec62015-07-31 17:22:55 +0200709 // No reader (inex is empty)
margaretha8efa3752018-07-24 17:46:43 +0200710 if (this.index.reader() == null) return (long) 0;
Akronaa74ec62015-07-31 17:22:55 +0200711
Akron176c9b12015-07-29 19:53:40 +0200712 // This is redundant to index stuff
713 if (type.equals("documents") || type.equals("base/texts")) {
Akronfd05f502015-07-30 18:34:26 +0200714 if (this.cbi == null) {
margaretha8efa3752018-07-24 17:46:43 +0200715 if (this.index.reader() == null) return (long) 0;
Akron176c9b12015-07-29 19:53:40 +0200716 return (long) this.index.reader().numDocs();
Akronfd05f502015-07-30 18:34:26 +0200717 }
Akron176c9b12015-07-29 19:53:40 +0200718 else
719 return this.docCount();
720 };
Akron40550172015-08-04 03:06:12 +0200721
Akron176c9b12015-07-29 19:53:40 +0200722 // Create search term
723 // This may be prefixed by foundries
724 Term term = new Term(field, "-:" + type);
725
margaretha8efa3752018-07-24 17:46:43 +0200726 if (DEBUG) log.debug("Iterate for {}/{}", field, type);
Akronfd05f502015-07-30 18:34:26 +0200727
Akron176c9b12015-07-29 19:53:40 +0200728 long occurrences = 0;
729 try {
730 // Iterate over all atomic readers and collect occurrences
Akron700c1eb2015-09-25 16:57:30 +0200731 for (LeafReaderContext atomic : this.index.reader().leaves()) {
Akronfd05f502015-07-30 18:34:26 +0200732 Bits bits = this.bits(atomic);
Akronaa74ec62015-07-31 17:22:55 +0200733
margaretha8efa3752018-07-24 17:46:43 +0200734 if (DEBUG) log.debug("Final bits {}", _bits(bits));
Akronaa74ec62015-07-31 17:22:55 +0200735
736 occurrences += this._numberOfAtomic(bits, atomic, term);
margaretha8efa3752018-07-24 17:46:43 +0200737 if (DEBUG) log.debug("Added up to {} for {}/{}", occurrences,
738 field, type);
Akron176c9b12015-07-29 19:53:40 +0200739 };
740 }
Akron40550172015-08-04 03:06:12 +0200741
Akron176c9b12015-07-29 19:53:40 +0200742 // Something went wrong
Akronaa74ec62015-07-31 17:22:55 +0200743 catch (IOException e) {
Akronb59f40e2018-08-23 17:15:43 +0200744 log.warn(e.getLocalizedMessage());
745 }
746
747 // E.g. reference corpus not found
748 catch (QueryException e) {
749 log.warn(e.getLocalizedMessage());
Akron176c9b12015-07-29 19:53:40 +0200750 };
751
752 return occurrences;
Nils Diewaldbaf68c52013-11-20 13:22:19 +0000753 };
754
Nils Diewaldea969502015-02-16 21:10:54 +0000755
Akron176c9b12015-07-29 19:53:40 +0200756 // Search for meta information in term vectors
757 // This will create the sum of all numerical payloads
758 // of the term in the document vector
Akron700c1eb2015-09-25 16:57:30 +0200759 private long _numberOfAtomic (Bits docvec, LeafReaderContext atomic,
Akron176c9b12015-07-29 19:53:40 +0200760 Term term) throws IOException {
761
762 // This reimplements docsAndPositionsEnum with payloads
763 final Terms terms = atomic.reader().fields().terms(term.field());
764
765 // No terms were found
766 if (terms != null) {
767 // Todo: Maybe reuse a termsEnum!
768 final TermsEnum termsEnum = terms.iterator(null);
769
margaretha8efa3752018-07-24 17:46:43 +0200770 // Set the position in the iterator to the term that is
771 // seeked
Akron176c9b12015-07-29 19:53:40 +0200772 if (termsEnum.seekExact(term.bytes())) {
773
Akronaa74ec62015-07-31 17:22:55 +0200774 // TODO: Reuse a DocsAndPositionsEnum!!
775
Akron176c9b12015-07-29 19:53:40 +0200776 // Start an iterator to fetch all payloads of the term
Akron40550172015-08-04 03:06:12 +0200777 DocsAndPositionsEnum docs = termsEnum.docsAndPositions(docvec,
778 null, DocsAndPositionsEnum.FLAG_PAYLOADS);
Akronaa74ec62015-07-31 17:22:55 +0200779
Akron176c9b12015-07-29 19:53:40 +0200780
781 // The iterator is empty
782 // This may even be an error, but we return 0
margaretha8efa3752018-07-24 17:46:43 +0200783 if (docs.docID() == DocsAndPositionsEnum.NO_MORE_DOCS) return 0;
Akron176c9b12015-07-29 19:53:40 +0200784
785 // Init some variables for data copying
786 long occurrences = 0;
787 BytesRef payload;
788
789 // Init nextDoc()
790 while (docs.nextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS) {
791
margaretha8efa3752018-07-24 17:46:43 +0200792 if (docs.freq() < 1) continue;
Akronaa74ec62015-07-31 17:22:55 +0200793
Akron176c9b12015-07-29 19:53:40 +0200794 // Initialize (go to first term)
795 docs.nextPosition();
796
797 // Copy payload with the offset of the BytesRef
798 payload = docs.getPayload();
Akronaa74ec62015-07-31 17:22:55 +0200799 if (payload != null) {
Akron40550172015-08-04 03:06:12 +0200800 System.arraycopy(payload.bytes, payload.offset, pl, 0,
801 4);
Akron176c9b12015-07-29 19:53:40 +0200802
Akronaa74ec62015-07-31 17:22:55 +0200803 // Add payload as integer
margaretha85ee2ac2018-07-25 17:58:09 +0200804 occurrences += ByteBuffer.wrap(pl).getInt();
Akronaa74ec62015-07-31 17:22:55 +0200805
margaretha8efa3752018-07-24 17:46:43 +0200806 if (DEBUG) log.debug(
807 "Value for {} incremented by {} to {} in {}",
margaretha85ee2ac2018-07-25 17:58:09 +0200808 term, ByteBuffer.wrap(pl).getInt(), occurrences,
margaretha8efa3752018-07-24 17:46:43 +0200809 docs.docID());
Akronaa74ec62015-07-31 17:22:55 +0200810 };
Akron176c9b12015-07-29 19:53:40 +0200811 };
812
813 // Return the sum of all occurrences
814 return occurrences;
815 };
816 };
817
818 // Nothing found
819 return 0;
Akron40550172015-08-04 03:06:12 +0200820 };
Akron176c9b12015-07-29 19:53:40 +0200821
822
Nils Diewaldea969502015-02-16 21:10:54 +0000823 /**
Akron176c9b12015-07-29 19:53:40 +0200824 * Return the number of documents in the virtual
825 * collection.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000826 *
Nils Diewaldea969502015-02-16 21:10:54 +0000827 * @return The number of the occurrences.
Akron176c9b12015-07-29 19:53:40 +0200828 * @see #numberOf
Nils Diewaldea969502015-02-16 21:10:54 +0000829 */
Akron176c9b12015-07-29 19:53:40 +0200830 public long docCount () {
Nils Diewaldafab8f32015-01-26 19:11:32 +0000831
Akron176c9b12015-07-29 19:53:40 +0200832 // No index defined
margaretha8efa3752018-07-24 17:46:43 +0200833 if (this.index == null) return (long) 0;
Akron176c9b12015-07-29 19:53:40 +0200834
835 // TODO: Caching!
836
837 long docCount = 0;
838 try {
839 FixedBitSet bitset;
Akron700c1eb2015-09-25 16:57:30 +0200840 for (LeafReaderContext atomic : this.index.reader().leaves()) {
Akron176c9b12015-07-29 19:53:40 +0200841 if ((bitset = this.bits(atomic)) != null)
842 docCount += bitset.cardinality();
843 };
844 }
845 catch (IOException e) {
846 log.warn(e.getLocalizedMessage());
Akronb59f40e2018-08-23 17:15:43 +0200847 }
848 catch (QueryException e) {
849 log.warn(e.getLocalizedMessage());
Akron176c9b12015-07-29 19:53:40 +0200850 };
851 return docCount;
Nils Diewalddfb21ea2013-11-21 14:26:47 +0000852 };
853
Nils Diewaldea969502015-02-16 21:10:54 +0000854
Akronaa74ec62015-07-31 17:22:55 +0200855 private static String _bits (Bits bitset) {
856 String str = "";
857 for (int i = 0; i < bitset.length(); i++) {
858 str += bitset.get(i) ? "1" : "0";
859 };
860 return str;
861 };
margaretha2ac95e32021-11-29 15:31:14 +0100862
margaretha5a8abea2021-11-08 16:57:51 +0100863 @Override
864 public Set<String> getAllLeafFingerprints () {
865 return index.getAllLeafFingerprints();
margaretha85ee2ac2018-07-25 17:58:09 +0200866 }
margarethafe252802018-07-30 14:59:50 +0200867
Akron176c9b12015-07-29 19:53:40 +0200868 /*
Akron176c9b12015-07-29 19:53:40 +0200869 * Analyze how terms relate
870 */
871 /*
Nils Diewald7cbcfe92014-09-22 22:01:51 +0000872 @Deprecated
Akron176c9b12015-07-29 19:53:40 +0200873 public HashMap getTermRelation (KrillCollection kc, String field)
874 throws Exception {
875 HashMap<String, Long> map = new HashMap<>(100);
876 long docNumber = 0, checkNumber = 0;
Eliza Margaretha6f989202016-10-14 21:48:29 +0200877
Nils Diewald44d5fa12015-01-15 21:31:52 +0000878 try {
Akron176c9b12015-07-29 19:53:40 +0200879 if (kc.getCount() <= 0) {
880 checkNumber = (long) this.reader().numDocs();
881 };
Eliza Margaretha6f989202016-10-14 21:48:29 +0200882
Akron700c1eb2015-09-25 16:57:30 +0200883 for (LeafReaderContext atomic : this.reader().leaves()) {
Akron176c9b12015-07-29 19:53:40 +0200884 HashMap<String, FixedBitSet> termVector = new HashMap<>(20);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200885
Akron176c9b12015-07-29 19:53:40 +0200886 FixedBitSet docvec = kc.bits(atomic);
887 if (docvec != null) {
888 docNumber += docvec.cardinality();
889 };
Eliza Margaretha6f989202016-10-14 21:48:29 +0200890
Akron176c9b12015-07-29 19:53:40 +0200891 Terms terms = atomic.reader().fields().terms(field);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200892
Akron176c9b12015-07-29 19:53:40 +0200893 if (terms == null) {
894 continue;
895 };
Eliza Margaretha6f989202016-10-14 21:48:29 +0200896
Akron176c9b12015-07-29 19:53:40 +0200897 int docLength = atomic.reader().maxDoc();
898 FixedBitSet bitset = new FixedBitSet(docLength);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200899
Akron176c9b12015-07-29 19:53:40 +0200900 // Iterate over all tokens in this field
901 TermsEnum termsEnum = terms.iterator(null);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200902
Akron176c9b12015-07-29 19:53:40 +0200903 while (termsEnum.next() != null) {
Eliza Margaretha6f989202016-10-14 21:48:29 +0200904
Akron176c9b12015-07-29 19:53:40 +0200905 String termString = termsEnum.term().utf8ToString();
Eliza Margaretha6f989202016-10-14 21:48:29 +0200906
Akron176c9b12015-07-29 19:53:40 +0200907 bitset.clear(0, docLength);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200908
Akron176c9b12015-07-29 19:53:40 +0200909 // Get frequency
910 bitset.or((DocIdSetIterator) termsEnum.docs((Bits) docvec,
911 null));
Eliza Margaretha6f989202016-10-14 21:48:29 +0200912
Akron176c9b12015-07-29 19:53:40 +0200913 long value = 0;
914 if (map.containsKey(termString))
915 value = map.get(termString);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200916
Akron176c9b12015-07-29 19:53:40 +0200917 map.put(termString, value + bitset.cardinality());
Eliza Margaretha6f989202016-10-14 21:48:29 +0200918
Akron176c9b12015-07-29 19:53:40 +0200919 termVector.put(termString, bitset.clone());
920 };
Eliza Margaretha6f989202016-10-14 21:48:29 +0200921
Akron176c9b12015-07-29 19:53:40 +0200922 int keySize = termVector.size();
923 String[] keys = termVector.keySet()
924 .toArray(new String[keySize]);
925 java.util.Arrays.sort(keys);
Eliza Margaretha6f989202016-10-14 21:48:29 +0200926
Akron176c9b12015-07-29 19:53:40 +0200927 if (keySize > maxTermRelations) {
928 throw new Exception("termRelations are limited to "
929 + maxTermRelations + " sets"
930 + " (requested were at least " + keySize + " sets)");
931 };
Eliza Margaretha6f989202016-10-14 21:48:29 +0200932
Akron176c9b12015-07-29 19:53:40 +0200933 for (int i = 0; i < keySize; i++) {
934 for (int j = i + 1; j < keySize; j++) {
935 FixedBitSet comby = termVector.get(keys[i]).clone();
936 comby.and(termVector.get(keys[j]));
Eliza Margaretha6f989202016-10-14 21:48:29 +0200937
Akron176c9b12015-07-29 19:53:40 +0200938 StringBuilder sb = new StringBuilder();
939 sb.append("#__").append(keys[i]).append(":###:")
940 .append(keys[j]);
941 String combString = sb.toString();
Eliza Margaretha6f989202016-10-14 21:48:29 +0200942
Akron176c9b12015-07-29 19:53:40 +0200943 long cap = (long) comby.cardinality();
944 if (map.containsKey(combString)) {
945 cap += map.get(combString);
946 };
947 map.put(combString, cap);
948 };
Nils Diewald44d5fa12015-01-15 21:31:52 +0000949 };
950 };
Akron176c9b12015-07-29 19:53:40 +0200951 map.put("-docs", checkNumber != 0 ? checkNumber : docNumber);
Nils Diewald44d5fa12015-01-15 21:31:52 +0000952 }
Akron176c9b12015-07-29 19:53:40 +0200953 catch (IOException e) {
954 log.warn(e.getMessage());
Nils Diewald44d5fa12015-01-15 21:31:52 +0000955 };
Akron176c9b12015-07-29 19:53:40 +0200956 return map;
Nils Diewald2276e1c2014-04-10 15:01:59 +0000957 };
Akron176c9b12015-07-29 19:53:40 +0200958 */
Nils Diewald65894bd2015-02-16 21:36:53 +0000959
960
Nils Diewald01b4ce32013-12-05 22:39:25 +0000961};