| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap; |
| 2 | |
| margaretha | fe25280 | 2018-07-30 14:59:50 +0200 | [diff] [blame] | 3 | import java.io.File; |
| 4 | import java.io.FileInputStream; |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 5 | import java.io.IOException; |
| margaretha | 7e31ca9 | 2021-12-13 10:48:44 +0100 | [diff] [blame] | 6 | import java.io.InputStream; |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 7 | import java.nio.ByteBuffer; |
| margaretha | 4dfe3c5 | 2018-08-13 17:07:50 +0200 | [diff] [blame] | 8 | import java.util.Properties; |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 9 | import java.util.Set; |
| margaretha | c20a921 | 2018-08-21 14:32:09 +0200 | [diff] [blame] | 10 | import java.util.zip.GZIPInputStream; |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 11 | |
| margaretha | fe25280 | 2018-07-30 14:59:50 +0200 | [diff] [blame] | 12 | import org.apache.commons.io.IOUtils; |
| margaretha | c20a921 | 2018-08-21 14:32:09 +0200 | [diff] [blame] | 13 | import org.apache.commons.io.output.ByteArrayOutputStream; |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 14 | import org.apache.lucene.index.DocsAndPositionsEnum; |
| 15 | import org.apache.lucene.index.LeafReader; |
| 16 | import org.apache.lucene.index.LeafReaderContext; |
| 17 | import org.apache.lucene.index.Term; |
| 18 | import org.apache.lucene.index.Terms; |
| 19 | import org.apache.lucene.index.TermsEnum; |
| 20 | import org.apache.lucene.search.BitsFilteredDocIdSet; |
| 21 | import org.apache.lucene.search.DocIdSet; |
| 22 | import org.apache.lucene.search.DocIdSetIterator; |
| 23 | import org.apache.lucene.search.Filter; |
| 24 | import org.apache.lucene.util.BitDocIdSet; |
| 25 | import org.apache.lucene.util.Bits; |
| 26 | import org.apache.lucene.util.BytesRef; |
| 27 | import org.apache.lucene.util.FixedBitSet; |
| 28 | import org.slf4j.Logger; |
| 29 | import org.slf4j.LoggerFactory; |
| 30 | |
| 31 | import com.fasterxml.jackson.databind.JsonNode; |
| 32 | import com.fasterxml.jackson.databind.ObjectMapper; |
| 33 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 34 | import de.ids_mannheim.korap.collection.CollectionBuilder; |
| Nils Diewald | c471b18 | 2014-11-19 22:51:15 +0000 | [diff] [blame] | 35 | import de.ids_mannheim.korap.response.Notifications; |
| margaretha | 4dfe3c5 | 2018-08-13 17:07:50 +0200 | [diff] [blame] | 36 | import de.ids_mannheim.korap.util.KrillProperties; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 37 | import de.ids_mannheim.korap.util.QueryException; |
| margaretha | 78f397a | 2017-06-29 13:44:46 +0200 | [diff] [blame] | 38 | import de.ids_mannheim.korap.util.StatusCodes; |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 39 | |
| Nils Diewald | c471b18 | 2014-11-19 22:51:15 +0000 | [diff] [blame] | 40 | /** |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 41 | * Create a Virtual Collection of documents by means of a KoralQuery |
| Nils Diewald | 2d5f810 | 2015-02-26 21:07:54 +0000 | [diff] [blame] | 42 | * collection object. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 43 | * |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 44 | * <blockquote><pre> |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 45 | * KrillCollection kc = new KrillCollection(json); |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 46 | * </pre></blockquote> |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 47 | * |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 48 | * @author diewald |
| Nils Diewald | c471b18 | 2014-11-19 22:51:15 +0000 | [diff] [blame] | 49 | */ |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 50 | /* |
| 51 | * TODO: Make a cache for the bits |
| 52 | * Delete it in case of an extension or a filter |
| 53 | * TODO: Maybe use randomaccessfilterstrategy |
| 54 | * TODO: Maybe a constantScoreQuery can make things faster? |
| 55 | * See http://mail-archives.apache.org/mod_mbox/lucene-java-user/ |
| 56 | * 200805.mbox/%3C17080852.post@talk.nabble.com%3E |
| 57 | */ |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 58 | public final class KrillCollection extends Notifications implements IndexInfo { |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 59 | private KrillIndex index; |
| Akron | bb5d173 | 2015-06-22 01:22:40 +0200 | [diff] [blame] | 60 | private JsonNode json; |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 61 | private final CollectionBuilder cb = new CollectionBuilder(this); |
| Akron | 60dfa7e | 2015-08-03 22:15:17 +0200 | [diff] [blame] | 62 | private CollectionBuilder.Interface cbi; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 63 | private byte[] pl = new byte[4]; |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 64 | |
| Akron | 65d57e9 | 2018-08-24 19:25:56 +0200 | [diff] [blame] | 65 | private ObjectMapper mapper = new ObjectMapper(); |
| 66 | |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 67 | private Filter prefiltered = null; |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 68 | // private static ByteBuffer bb = ByteBuffer.allocate(4); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 69 | |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 70 | // Logger |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 71 | private final static Logger log = |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 72 | LoggerFactory.getLogger(KrillCollection.class); |
| Nils Diewald | 7cbcfe9 | 2014-09-22 22:01:51 +0000 | [diff] [blame] | 73 | // This advises the Java compiler to ignore all logging |
| Akron | 3ba74f2 | 2015-07-24 18:46:17 +0200 | [diff] [blame] | 74 | public static final boolean DEBUG = false; |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 75 | private double start, end; // for debugging |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 76 | |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 77 | /** |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 78 | * Construct a new KrillCollection. |
| 79 | * |
| 80 | */ |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 81 | public KrillCollection () {}; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 82 | |
| 83 | |
| 84 | /** |
| Nils Diewald | 2d5f810 | 2015-02-26 21:07:54 +0000 | [diff] [blame] | 85 | * Construct a new KrillCollection by passing a KrillIndex. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 86 | * |
| 87 | * @param index |
| 88 | * The {@link KrillIndex} object. |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 89 | */ |
public KrillCollection (KrillIndex index) {
    // Bind the collection to the given index right away
    this.index = index;
}
| 93 | |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 94 | /** |
| Nils Diewald | 2d5f810 | 2015-02-26 21:07:54 +0000 | [diff] [blame] | 95 | * Construct a new KrillCollection by passing a KoralQuery. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 96 | * |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 97 | * @param jsonString |
| 98 | * The KoralQuery document as a JSON string. |
| Nils Diewald | 33fcb5d | 2014-11-07 23:27:03 +0000 | [diff] [blame] | 99 | */ |
| Nils Diewald | 2d5f810 | 2015-02-26 21:07:54 +0000 | [diff] [blame] | 100 | public KrillCollection (String jsonString) { |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 101 | try { |
| 102 | JsonNode json = mapper.readTree(jsonString); |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 103 | |
| 104 | if (json.has("errors") && json.get("errors").size() > 0) { |
| 105 | this.addError(StatusCodes.INVALID_QUERY, "Json has errors."); |
| margaretha | f2c3150 | 2017-06-26 17:57:16 +0200 | [diff] [blame] | 106 | } |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 107 | else if (json.has("collection")) { |
| Akron | 850b46e | 2016-06-08 10:08:55 +0200 | [diff] [blame] | 108 | this.fromKoral(json.get("collection")); |
| margaretha | f2c3150 | 2017-06-26 17:57:16 +0200 | [diff] [blame] | 109 | } |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 110 | else if (json.has("collections")) { |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 111 | this.addError(899, |
| 112 | "Collections are not supported anymore in favour of a single collection"); |
| margaretha | f2c3150 | 2017-06-26 17:57:16 +0200 | [diff] [blame] | 113 | } |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 114 | else { |
| 115 | this.addError(StatusCodes.MISSING_COLLECTION, |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 116 | "Collection is not found"); |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 117 | this.fromBuilder(this.build().nothing()); |
| margaretha | f2c3150 | 2017-06-26 17:57:16 +0200 | [diff] [blame] | 118 | } |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 119 | } |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 120 | |
| 121 | // Query Exception |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 122 | catch (QueryException qe) { |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 123 | this.addError(qe.getErrorCode(), qe.getMessage()); |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 124 | this.fromBuilder(this.build().nothing()); |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 125 | } |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 126 | |
| 127 | // JSON exception |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 128 | catch (IOException e) { |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 129 | this.addError(621, "Unable to parse JSON", "KrillCollection", |
| 130 | e.getLocalizedMessage()); |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 131 | this.fromBuilder(this.build().nothing()); |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 132 | }; |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 133 | }; |
| 134 | |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 135 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 136 | /** |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 137 | * Set the {@link KrillIndex} the virtual collection refers to. |
| 138 | * |
| 139 | * @param index |
| 140 | * The {@link KrillIndex} the virtual collection refers |
| 141 | * to. |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 142 | */ |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 143 | public void setIndex (KrillIndex index) { |
| 144 | this.index = index; |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 145 | }; |
| 146 | |
| Nils Diewald | 33fcb5d | 2014-11-07 23:27:03 +0000 | [diff] [blame] | 147 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 148 | /** |
| 149 | * Import the "collection" part of a KoralQuery. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 150 | * |
| 151 | * @param jsonString |
| 152 | * The "collection" part of a KoralQuery. |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 153 | * @throws QueryException |
| 154 | */ |
| Akron | 850b46e | 2016-06-08 10:08:55 +0200 | [diff] [blame] | 155 | public KrillCollection fromKoral (String jsonString) throws QueryException { |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 156 | this.prefiltered = null; |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 157 | try { |
| Akron | 850b46e | 2016-06-08 10:08:55 +0200 | [diff] [blame] | 158 | this.fromKoral((JsonNode) mapper.readTree(jsonString)); |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 159 | } |
| 160 | catch (Exception e) { |
| Nils Diewald | 2d5f810 | 2015-02-26 21:07:54 +0000 | [diff] [blame] | 161 | this.addError(621, "Unable to parse JSON", "KrillCollection"); |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 162 | this.fromBuilder(this.build().nothing()); |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 163 | }; |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 164 | |
| 165 | return this; |
| Nils Diewald | e364570 | 2014-11-07 21:15:20 +0000 | [diff] [blame] | 166 | }; |
| Nils Diewald | 7899352 | 2014-10-27 17:51:22 +0000 | [diff] [blame] | 167 | |
| Nils Diewald | 33fcb5d | 2014-11-07 23:27:03 +0000 | [diff] [blame] | 168 | |
| Akron | 65d57e9 | 2018-08-24 19:25:56 +0200 | [diff] [blame] | 169 | public KrillCollection fromStore (String ref) throws QueryException { |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 170 | this.prefiltered = null; |
| margaretha | 88258da | 2024-06-07 12:19:51 +0200 | [diff] [blame] | 171 | String namedVCPath = KrillProperties.namedVCPath; |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 172 | if (!namedVCPath.endsWith("/")) { |
| 173 | namedVCPath += "/"; |
| 174 | }; |
| 175 | |
| 176 | String fileName = namedVCPath + ref + ".jsonld"; |
| 177 | File file; |
| 178 | String json = null; |
| margaretha | 7e31ca9 | 2021-12-13 10:48:44 +0100 | [diff] [blame] | 179 | InputStream is = null; |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 180 | if ((file= new File(fileName)).exists()) { |
| 181 | try (FileInputStream fis = new FileInputStream(file)) { |
| 182 | json = IOUtils.toString(fis,"utf-8"); |
| 183 | } |
| 184 | catch (IOException e) { |
| 185 | this.addError(StatusCodes.READING_COLLECTION_FAILED, |
| 186 | e.getMessage()); |
| 187 | return this; |
| 188 | } |
| 189 | } |
| 190 | // slower than plain text, but save space |
| 191 | else if ((file = new File(fileName + ".gz")).exists()){ |
| 192 | try (GZIPInputStream gzipInputStream = |
| 193 | new GZIPInputStream(new FileInputStream(file)); |
| 194 | ByteArrayOutputStream bos = |
| 195 | new ByteArrayOutputStream(512);) { |
| 196 | bos.write(gzipInputStream); |
| 197 | json = bos.toString("utf-8"); |
| 198 | } |
| 199 | catch (IOException e) { |
| 200 | this.addError(StatusCodes.READING_COLLECTION_FAILED, |
| 201 | e.getMessage()); |
| 202 | return this; |
| 203 | } |
| 204 | } |
| margaretha | 7e31ca9 | 2021-12-13 10:48:44 +0100 | [diff] [blame] | 205 | // for testing |
| margaretha | 88258da | 2024-06-07 12:19:51 +0200 | [diff] [blame] | 206 | else if (KrillProperties.isTest |
| margaretha | 7e31ca9 | 2021-12-13 10:48:44 +0100 | [diff] [blame] | 207 | && (is = retrieveInputStreamFromClasspath(fileName)) != null) { |
| 208 | try { |
| 209 | json = IOUtils.toString(is, "utf-8"); |
| 210 | } |
| 211 | catch (IOException e) { |
| 212 | this.addError(StatusCodes.READING_COLLECTION_FAILED, |
| 213 | e.getMessage()); |
| 214 | return this; |
| 215 | } |
| 216 | } |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 217 | else{ |
| 218 | this.addError(StatusCodes.MISSING_COLLECTION, |
| 219 | "Collection is not found " + fileName); |
| 220 | return this; |
| 221 | }; |
| 222 | |
| 223 | return this.fromKoral(json); |
| 224 | }; |
| 225 | |
| 226 | |
| margaretha | 7e31ca9 | 2021-12-13 10:48:44 +0100 | [diff] [blame] | 227 | private InputStream retrieveInputStreamFromClasspath (String fileName) { |
| 228 | if (!fileName.startsWith("/")) { |
| 229 | fileName = "/"+fileName; |
| 230 | } |
| 231 | return KrillCollection.class.getResourceAsStream(fileName); |
| 232 | } |
| 233 | |
| 234 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 235 | /** |
| 236 | * Import the "collection" part of a KoralQuery. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 237 | * |
| 238 | * @param json |
| 239 | * The "collection" part of a KoralQuery |
| 240 | * as a {@link JsonNode} object. |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 241 | * @throws QueryException |
| 242 | */ |
| Akron | 850b46e | 2016-06-08 10:08:55 +0200 | [diff] [blame] | 243 | public KrillCollection fromKoral (JsonNode json) throws QueryException { |
| Akron | bb5d173 | 2015-06-22 01:22:40 +0200 | [diff] [blame] | 244 | this.json = json; |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 245 | this.prefiltered = null; |
| Akron | 850b46e | 2016-06-08 10:08:55 +0200 | [diff] [blame] | 246 | return this.fromBuilder(this._fromKoral(json)); |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 247 | }; |
| 248 | |
| 249 | |
| Akron | d5ca00a | 2016-06-08 14:29:00 +0200 | [diff] [blame] | 250 | // Create collection from KoralQuery |
| Akron | 850b46e | 2016-06-08 10:08:55 +0200 | [diff] [blame] | 251 | private CollectionBuilder.Interface _fromKoral (JsonNode json) |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 252 | throws QueryException { |
| 253 | |
| 254 | if (json.has("collection")) { |
| 255 | return this._fromKoral(json.at("/collection")); |
| 256 | }; |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 257 | |
| Akron | c63697c | 2015-06-17 22:32:02 +0200 | [diff] [blame] | 258 | if (!json.has("@type")) { |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 259 | throw new QueryException(701, |
| 260 | "JSON-LD group has no @type attribute"); |
| Akron | c63697c | 2015-06-17 22:32:02 +0200 | [diff] [blame] | 261 | }; |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 262 | |
| 263 | String type = json.get("@type").asText(); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 264 | |
| Nils Diewald | cec40f9 | 2015-02-19 22:20:02 +0000 | [diff] [blame] | 265 | if (type.equals("koral:doc")) { |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 266 | |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 267 | // default key |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 268 | String key = "tokens"; |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 269 | String valtype = "type:string"; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 270 | String match = "match:eq"; |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 271 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 272 | if (json.has("key")) key = json.get("key").asText(); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 273 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 274 | if (json.has("type")) valtype = json.get("type").asText(); |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 275 | |
| 276 | // Filter based on date |
| 277 | if (valtype.equals("type:date")) { |
| 278 | |
| 279 | if (!json.has("value")) |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 280 | throw new QueryException(820, "Dates require value fields"); |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 281 | |
| 282 | String dateStr = json.get("value").asText(); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 283 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 284 | if (json.has("match")) match = json.get("match").asText(); |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 285 | |
| 286 | // TODO: This isn't stable yet |
| 287 | switch (match) { |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 288 | case "match:eq": |
| 289 | return this.cb.date(key, dateStr); |
| 290 | case "match:ne": |
| 291 | return this.cb.date(key, dateStr).not(); |
| 292 | case "match:geq": |
| 293 | return this.cb.since(key, dateStr); |
| 294 | case "match:leq": |
| 295 | return this.cb.till(key, dateStr); |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 296 | }; |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 297 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 298 | throw new QueryException(841, |
| 299 | "Match relation unknown for type"); |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 300 | } |
| 301 | |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 302 | // Filter based on string |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 303 | else if (valtype.equals("type:string")) { |
| margaretha | ecddb0b | 2018-07-31 15:23:38 +0200 | [diff] [blame] | 304 | if (json.get("value").size() > 1){ |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 305 | if (DEBUG) { |
| 306 | log.debug("koral:doc size " + json.get("value").size()); |
| 307 | }; |
| margaretha | df0e9d1 | 2018-07-30 16:22:59 +0200 | [diff] [blame] | 308 | if (json.has("match")) { |
| 309 | match = json.get("match").asText(); |
| 310 | } |
| 311 | |
| margaretha | 8a8c427 | 2018-08-21 17:39:27 +0200 | [diff] [blame] | 312 | CollectionBuilder.Group group = this.cb.orGroup(); |
| 313 | for (JsonNode value : json.get("value")) { |
| 314 | group.with(cb.term(key, value.asText())); |
| margaretha | df0e9d1 | 2018-07-30 16:22:59 +0200 | [diff] [blame] | 315 | } |
| margaretha | 8a8c427 | 2018-08-21 17:39:27 +0200 | [diff] [blame] | 316 | |
| 317 | if (match.equals("match:ne")) { |
| 318 | return group.not(); |
| margaretha | df0e9d1 | 2018-07-30 16:22:59 +0200 | [diff] [blame] | 319 | } |
| 320 | return group; |
| 321 | } |
| 322 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 323 | if (json.has("match")) match = json.get("match").asText(); |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 324 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 325 | switch (match) { |
| 326 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 327 | case "match:eq": |
| 328 | return this.cb.term(key, json.get("value").asText()); |
| 329 | case "match:ne": |
| 330 | return this.cb.term(key, json.get("value").asText()) |
| 331 | .not(); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 332 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 333 | // Contains and containsnot (or excludes) is only |
| 334 | // effective on text fields and ineffective on |
| 335 | // string fields |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 336 | case "match:contains": |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 337 | return this.cb.text(key, json.get("value").asText()); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 338 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 339 | case "match:containsnot": |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 340 | return this.cb.text(key, json.get("value").asText()) |
| 341 | .not(); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 342 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 343 | // <LEGACY> |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 344 | case "match:excludes": |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 345 | return this.cb.text(key, json.get("value").asText()) |
| 346 | .not(); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 347 | // </LEGACY> |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 348 | }; |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 349 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 350 | throw new QueryException(841, |
| 351 | "Match relation unknown for type"); |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 352 | } |
| 353 | |
| 354 | // Filter based on regex |
| 355 | else if (valtype.equals("type:regex")) { |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 356 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 357 | if (json.has("match")) match = json.get("match").asText(); |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 358 | |
| 359 | if (match.equals("match:eq")) { |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 360 | return this.cb.re(key, json.get("value").asText()); |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 361 | } |
| 362 | else if (match.equals("match:ne")) { |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 363 | return this.cb.re(key, json.get("value").asText()).not(); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 364 | } |
| Akron | 2746970 | 2018-04-05 12:46:18 +0200 | [diff] [blame] | 365 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 366 | // Contains and containsnot (or excludes) is |
| 367 | // identical to eq and ne in case of regexes for the |
| 368 | // moment, |
| 369 | // though it may be beneficial to circumfix these |
| 370 | // with .* |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 371 | else if (match.equals("match:contains")) { |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 372 | return this.cb.re(key, json.get("value").asText()); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 373 | } |
| Akron | 2746970 | 2018-04-05 12:46:18 +0200 | [diff] [blame] | 374 | else if (match.equals("match:containsnot")) { |
| 375 | return this.cb.re(key, json.get("value").asText()); |
| 376 | } |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 377 | // <LEGACY> |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 378 | else if (match.equals("match:excludes")) { |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 379 | return this.cb.re(key, json.get("value").asText()).not(); |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 380 | }; |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 381 | // </LEGACY> |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 382 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 383 | throw new QueryException(841, |
| 384 | "Match relation unknown for type"); |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 385 | } |
| 386 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 387 | throw new QueryException(843, "Document type is not supported"); |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 388 | } |
| 389 | |
| 390 | // nested group |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 391 | else if (type.equals("koral:docGroup")) { |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 392 | |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 393 | if (!json.has("operands") || !json.get("operands").isArray()) |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 394 | throw new QueryException(842, |
| 395 | "Document group needs operand list"); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 396 | |
| Akron | 60dfa7e | 2015-08-03 22:15:17 +0200 | [diff] [blame] | 397 | CollectionBuilder.Group group; |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 398 | |
| 399 | String operation = "operation:and"; |
| 400 | if (json.has("operation")) |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 401 | operation = json.get("operation").asText(); |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 402 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 403 | if (operation.equals("operation:or")) |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 404 | group = this.cb.orGroup(); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 405 | else if (operation.equals("operation:and")) |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 406 | group = this.cb.andGroup(); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 407 | else |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 408 | throw new QueryException(810, |
| 409 | "Unknown document group operation"); |
| 410 | |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 411 | for (JsonNode operand : json.get("operands")) { |
| Akron | 6b0be13 | 2019-09-16 19:01:59 +0200 | [diff] [blame] | 412 | |
| 413 | // TODO: |
| 414 | // Potentially bed here, when operand is a group inside a group |
| 415 | // with the same operator (and not negative) |
| Akron | 850b46e | 2016-06-08 10:08:55 +0200 | [diff] [blame] | 416 | group.with(this._fromKoral(operand)); |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 417 | }; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 418 | return group; |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 419 | } |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 420 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 421 | // vc reference |
| 422 | else if (type.equals("koral:docGroupRef")) { |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 423 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 424 | if (!json.has("ref")) { |
| 425 | throw new QueryException(StatusCodes.MISSING_VC_REFERENCE, |
| 426 | "ref is not found"); |
| 427 | } |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 428 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 429 | String ref = json.get("ref").asText(); |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 430 | if (ref.isEmpty()) { |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 431 | throw new QueryException(StatusCodes.MISSING_VC_REFERENCE, |
| 432 | "ref is empty"); |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 433 | }; |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 434 | |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 435 | return this.cb.referTo(ref); |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 436 | } |
| 437 | |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 438 | |
| 439 | // Unknown type |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 440 | throw new QueryException(813, "Collection type is not supported"); |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 441 | }; |
| 442 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 443 | /** |
| 444 | * Set the collection from a {@link CollectionBuilder} object. |
| 445 | * |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 446 | * @param cbi |
| 447 | * The {@link CollectionBuilder.Interface} object. |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 448 | */ |
| Akron | 60dfa7e | 2015-08-03 22:15:17 +0200 | [diff] [blame] | 449 | public KrillCollection fromBuilder (CollectionBuilder.Interface cbi) { |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 450 | this.prefiltered = null; |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 451 | this.cbi = cbi; |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 452 | return this; |
| Nils Diewald | e364570 | 2014-11-07 21:15:20 +0000 | [diff] [blame] | 453 | }; |
| 454 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 455 | |
| Akron | 60dfa7e | 2015-08-03 22:15:17 +0200 | [diff] [blame] | 456 | public CollectionBuilder.Interface getBuilder () { |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 457 | return this.cbi; |
| 458 | }; |
| 459 | |
| 460 | |
| 461 | public CollectionBuilder build () { |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 462 | return this.cb; |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 463 | }; |
| 464 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 465 | |
| Akron | 60dfa7e | 2015-08-03 22:15:17 +0200 | [diff] [blame] | 466 | public KrillCollection filter (CollectionBuilder.Interface filter) { |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 467 | return this.fromBuilder(this.cb.andGroup().with(this.cbi).with(filter)); |
| 468 | }; |
| 469 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 470 | |
| Akron | 60dfa7e | 2015-08-03 22:15:17 +0200 | [diff] [blame] | 471 | public KrillCollection extend (CollectionBuilder.Interface extension) { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 472 | return this |
| 473 | .fromBuilder(this.cb.orGroup().with(this.cbi).with(extension)); |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 474 | }; |
| 475 | |
| 476 | |
| 477 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 478 | /** |
| 479 | * Add a filter based on a list of unique document identifiers. |
| 480 | * UIDs may be indexed in the field "UID". |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 481 | * |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 482 | * This filter is not part of the legacy API! |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 483 | * |
| 484 | * @param uids |
| 485 | * The list of unique document identifier. |
| Nils Diewald | 2d5f810 | 2015-02-26 21:07:54 +0000 | [diff] [blame] | 486 | * @return The {@link KrillCollection} object for chaining. |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 487 | */ |
| Nils Diewald | 2d5f810 | 2015-02-26 21:07:54 +0000 | [diff] [blame] | 488 | public KrillCollection filterUIDs (String ... uids) { |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 489 | this.prefiltered = null; |
| Akron | 60dfa7e | 2015-08-03 22:15:17 +0200 | [diff] [blame] | 490 | CollectionBuilder.Group cbg = this.cb.orGroup(); |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 491 | for (String uid : uids) { |
| 492 | cbg.with(this.cb.term("UID", uid)); |
| 493 | }; |
| Akron | 60dfa7e | 2015-08-03 22:15:17 +0200 | [diff] [blame] | 494 | return this.filter(cbg); |
| Nils Diewald | d723d81 | 2014-09-23 18:50:52 +0000 | [diff] [blame] | 495 | }; |
| 496 | |
| 497 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 498 | /** |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 499 | * Serialize collection to a {@link Filter} object. |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 500 | */ |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 501 | public Filter toFilter () throws QueryException { |
| 502 | if (this.cbi == null) |
| 503 | return null; |
| 504 | |
| 505 | if (this.prefiltered != null) |
| 506 | return this.prefiltered; |
| 507 | |
| 508 | this.prefiltered = this.cbi.toFilter(); |
| 509 | return this.prefiltered; |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 510 | }; |
| 511 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 512 | |
| 513 | /** |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 514 | * Boolean value if the collection should work inverted or |
| 515 | * not. |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 516 | */ |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 517 | public boolean isNegative () { |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 518 | if (this.cbi == null) return false; |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 519 | |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 520 | return this.cbi.isNegative(); |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 521 | }; |
| 522 | |
| 523 | |
| 524 | /** |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 525 | * Generate a string representation of the virtual collection. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 526 | * |
| 527 | * <strong>Warning</strong>: This currently does not generate a |
| 528 | * valid |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 529 | * KoralQuery string, so this may change in a future version. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 530 | * |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 531 | * @return A string representation of the virtual collection. |
| 532 | */ |
| Nils Diewald | b1c3b65 | 2013-12-28 22:47:00 +0000 | [diff] [blame] | 533 | public String toString () { |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 534 | try { |
| 535 | Filter filter = this.toFilter(); |
| 536 | if (filter == null) return ""; |
| 537 | return (this.isNegative() ? "-" : "") + filter.toString(); |
| 538 | } |
| 539 | catch (QueryException qe) { |
| 540 | log.warn(qe.getLocalizedMessage()); |
| 541 | }; |
| 542 | return ""; |
| Nils Diewald | b1c3b65 | 2013-12-28 22:47:00 +0000 | [diff] [blame] | 543 | }; |
| 544 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 545 | |
| Nils Diewald | 7cbcfe9 | 2014-09-22 22:01:51 +0000 | [diff] [blame] | 546 | /** |
| Akron | bb5d173 | 2015-06-22 01:22:40 +0200 | [diff] [blame] | 547 | * Return the associated KoralQuery collection object |
| 548 | * as a {@link JsonNode}. This won't work, |
| 549 | * if the object was build using a CollectionBuilder, |
| 550 | * therefore it is limited to mirror a deserialized KoralQuery |
| 551 | * object. |
| 552 | * |
| 553 | * @return The {@link JsonNode} representing the collection object |
| 554 | * of a deserialized KoralQuery object. |
| 555 | */ |
| 556 | public JsonNode toJsonNode () { |
| 557 | return this.json; |
| 558 | }; |
| 559 | |
| 560 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 561 | /** |
| 562 | * Create a bit vector representing the live documents of the |
| 563 | * virtual collection to be used in searches. |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 564 | * This will respect deleted documents. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 565 | * |
| 566 | * @param The |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 567 | * {@link LeafReaderContext} to search in. |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 568 | * @return A bit vector representing the live documents of the |
| 569 | * virtual collection. |
| 570 | * @throws IOException |
| 571 | */ |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 572 | public FixedBitSet bits (LeafReaderContext atomic) throws IOException, QueryException { |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 573 | |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 574 | // EM: really need a fixedBitset? |
| 575 | // maybe better use org.apache.lucene.util.BitDocIdSet.Builder |
| 576 | // for automatic sparse bitset support |
| 577 | // appears possible by implementing a SparseDocBits class extending |
| 578 | // SparseFixedBitSet and implementing Serializable (only as marker interface) |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 579 | LeafReader r = atomic.reader(); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 580 | FixedBitSet bitset = new FixedBitSet(r.maxDoc()); |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 581 | |
| 582 | if (DEBUG) { |
| 583 | start = System.currentTimeMillis(); |
| 584 | } |
| 585 | DocIdSet docids = null; |
| 586 | try { |
| 587 | docids = this.getDocIdSet(atomic, (Bits) r.getLiveDocs()); |
| 588 | } |
| 589 | catch (RuntimeException e) { |
| 590 | Throwable t = e.getCause(); |
| 591 | if (t instanceof IOException) { |
| 592 | throw new IOException(t); |
| 593 | } |
| 594 | else if (t instanceof QueryException) { |
| 595 | throw new QueryException(((QueryException) t).getErrorCode(), t.getLocalizedMessage()); |
| 596 | } |
| margaretha | 05a4bc1 | 2022-02-11 10:55:43 +0100 | [diff] [blame] | 597 | else { |
| 598 | throw e; |
| 599 | } |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 600 | } |
| 601 | |
| 602 | if (DEBUG) { |
| 603 | end = System.currentTimeMillis(); |
| 604 | log.info("getDocIdSet in bits: " + (end - start)); |
| 605 | } |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 606 | |
| Akron | 6b0be13 | 2019-09-16 19:01:59 +0200 | [diff] [blame] | 607 | |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 608 | if (docids == null) { |
| 609 | if (this.cbi != null) { |
| 610 | bitset.clear(0, bitset.length()); |
| 611 | } |
| 612 | else { |
| 613 | bitset.set(0, bitset.length()); |
| 614 | }; |
| 615 | } |
| Akron | 6b0be13 | 2019-09-16 19:01:59 +0200 | [diff] [blame] | 616 | else { |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 617 | bitset.or(docids.iterator()); |
| Akron | 6b0be13 | 2019-09-16 19:01:59 +0200 | [diff] [blame] | 618 | } |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 619 | |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 620 | return bitset; |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 621 | }; |
| 622 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 623 | |
| 624 | /** |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 625 | * Return the {@link DocIdSet} representing the documents of the |
| 626 | * virtual collection to be used in searches. |
| 627 | * This will respect deleted documents. |
| 628 | * |
| 629 | * @param atomic |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 630 | * The {@link LeafReaderContext} to search in. |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 631 | * @param accepted |
| 632 | * {@link Bits} vector of accepted documents. |
| 633 | * @throws IOException |
| 634 | */ |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 635 | public DocIdSet getDocIdSet (LeafReaderContext atomic, Bits acceptDocs) |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 636 | throws IOException, QueryException { |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 637 | |
| 638 | int maxDoc = atomic.reader().maxDoc(); |
| 639 | FixedBitSet bitset = new FixedBitSet(maxDoc); |
| 640 | |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 641 | final Filter filter = this.toFilter(); |
| 642 | |
| 643 | if (filter == null) { |
| 644 | if (acceptDocs == null) |
| 645 | return null; |
| 646 | bitset.set(0, maxDoc); |
| 647 | } |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 648 | else { |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 649 | |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 650 | // Init vector |
| 651 | DocIdSet docids = filter.getDocIdSet(atomic, null); |
| Akron | 6b0be13 | 2019-09-16 19:01:59 +0200 | [diff] [blame] | 652 | |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 653 | DocIdSetIterator filterIter = |
| 654 | (docids == null) ? null : docids.iterator(); |
| 655 | |
| 656 | if (filterIter == null) { |
| Akron | 6b0be13 | 2019-09-16 19:01:59 +0200 | [diff] [blame] | 657 | |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 658 | if (!this.cbi.isNegative()) return null; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 659 | |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 660 | bitset.set(0, maxDoc); |
| 661 | } |
| 662 | else { |
| Akron | 6b0be13 | 2019-09-16 19:01:59 +0200 | [diff] [blame] | 663 | |
| 664 | // Or bit set |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 665 | bitset.or(filterIter); |
| 666 | |
| 667 | // Revert for negation |
| 668 | if (this.cbi.isNegative()) bitset.flip(0, maxDoc); |
| 669 | }; |
| 670 | }; |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 671 | |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 672 | if (DEBUG) { |
| 673 | log.debug("Bit set is {}", _bits(bitset)); |
| 674 | log.debug("Livedocs is {}", _bits(acceptDocs)); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 675 | }; |
| 676 | |
| 677 | // Remove deleted docs |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 678 | return (DocIdSet) BitsFilteredDocIdSet |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 679 | .wrap((DocIdSet) new BitDocIdSet(bitset), acceptDocs); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 680 | }; |
| 681 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 682 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 683 | public long numberOf (String type) throws IOException { |
| 684 | return this.numberOf("tokens", type); |
| 685 | }; |
| 686 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 687 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 688 | /** |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 689 | * Search for the number of occurrences of different types, |
| 690 | * e.g. <i>documents</i>, <i>sentences</i> etc. in the virtual |
| 691 | * collection. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 692 | * |
| 693 | * @param field |
| 694 | * The field containing the textual data and the |
| 695 | * annotations as a string. |
| 696 | * @param type |
| 697 | * The type of meta information, |
| 698 | * e.g. <i>documents</i> or <i>sentences</i> as a |
| 699 | * string. |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 700 | * @return The number of the occurrences. |
| 701 | * @throws IOException |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 702 | * @see KrillIndex#numberOf |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 703 | */ |
| 704 | public long numberOf (String field, String type) throws IOException { |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 705 | |
| 706 | // No index defined |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 707 | if (this.index == null) return (long) -1; |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 708 | |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 709 | // No reader (inex is empty) |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 710 | if (this.index.reader() == null) return (long) 0; |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 711 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 712 | // This is redundant to index stuff |
| 713 | if (type.equals("documents") || type.equals("base/texts")) { |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 714 | if (this.cbi == null) { |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 715 | if (this.index.reader() == null) return (long) 0; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 716 | return (long) this.index.reader().numDocs(); |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 717 | } |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 718 | else |
| 719 | return this.docCount(); |
| 720 | }; |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 721 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 722 | // Create search term |
| 723 | // This may be prefixed by foundries |
| 724 | Term term = new Term(field, "-:" + type); |
| 725 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 726 | if (DEBUG) log.debug("Iterate for {}/{}", field, type); |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 727 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 728 | long occurrences = 0; |
| 729 | try { |
| 730 | // Iterate over all atomic readers and collect occurrences |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 731 | for (LeafReaderContext atomic : this.index.reader().leaves()) { |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 732 | Bits bits = this.bits(atomic); |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 733 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 734 | if (DEBUG) log.debug("Final bits {}", _bits(bits)); |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 735 | |
| 736 | occurrences += this._numberOfAtomic(bits, atomic, term); |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 737 | if (DEBUG) log.debug("Added up to {} for {}/{}", occurrences, |
| 738 | field, type); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 739 | }; |
| 740 | } |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 741 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 742 | // Something went wrong |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 743 | catch (IOException e) { |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 744 | log.warn(e.getLocalizedMessage()); |
| 745 | } |
| 746 | |
| 747 | // E.g. reference corpus not found |
| 748 | catch (QueryException e) { |
| 749 | log.warn(e.getLocalizedMessage()); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 750 | }; |
| 751 | |
| 752 | return occurrences; |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 753 | }; |
| 754 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 755 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 756 | // Search for meta information in term vectors |
| 757 | // This will create the sum of all numerical payloads |
| 758 | // of the term in the document vector |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 759 | private long _numberOfAtomic (Bits docvec, LeafReaderContext atomic, |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 760 | Term term) throws IOException { |
| 761 | |
| 762 | // This reimplements docsAndPositionsEnum with payloads |
| 763 | final Terms terms = atomic.reader().fields().terms(term.field()); |
| 764 | |
| 765 | // No terms were found |
| 766 | if (terms != null) { |
| 767 | // Todo: Maybe reuse a termsEnum! |
| 768 | final TermsEnum termsEnum = terms.iterator(null); |
| 769 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 770 | // Set the position in the iterator to the term that is |
| 771 | // seeked |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 772 | if (termsEnum.seekExact(term.bytes())) { |
| 773 | |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 774 | // TODO: Reuse a DocsAndPositionsEnum!! |
| 775 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 776 | // Start an iterator to fetch all payloads of the term |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 777 | DocsAndPositionsEnum docs = termsEnum.docsAndPositions(docvec, |
| 778 | null, DocsAndPositionsEnum.FLAG_PAYLOADS); |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 779 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 780 | |
| 781 | // The iterator is empty |
| 782 | // This may even be an error, but we return 0 |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 783 | if (docs.docID() == DocsAndPositionsEnum.NO_MORE_DOCS) return 0; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 784 | |
| 785 | // Init some variables for data copying |
| 786 | long occurrences = 0; |
| 787 | BytesRef payload; |
| 788 | |
| 789 | // Init nextDoc() |
| 790 | while (docs.nextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS) { |
| 791 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 792 | if (docs.freq() < 1) continue; |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 793 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 794 | // Initialize (go to first term) |
| 795 | docs.nextPosition(); |
| 796 | |
| 797 | // Copy payload with the offset of the BytesRef |
| 798 | payload = docs.getPayload(); |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 799 | if (payload != null) { |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 800 | System.arraycopy(payload.bytes, payload.offset, pl, 0, |
| 801 | 4); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 802 | |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 803 | // Add payload as integer |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 804 | occurrences += ByteBuffer.wrap(pl).getInt(); |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 805 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 806 | if (DEBUG) log.debug( |
| 807 | "Value for {} incremented by {} to {} in {}", |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 808 | term, ByteBuffer.wrap(pl).getInt(), occurrences, |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 809 | docs.docID()); |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 810 | }; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 811 | }; |
| 812 | |
| 813 | // Return the sum of all occurrences |
| 814 | return occurrences; |
| 815 | }; |
| 816 | }; |
| 817 | |
| 818 | // Nothing found |
| 819 | return 0; |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 820 | }; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 821 | |
| 822 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 823 | /** |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 824 | * Return the number of documents in the virtual |
| 825 | * collection. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 826 | * |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 827 | * @return The number of the occurrences. |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 828 | * @see #numberOf |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 829 | */ |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 830 | public long docCount () { |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 831 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 832 | // No index defined |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 833 | if (this.index == null) return (long) 0; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 834 | |
| 835 | // TODO: Caching! |
| 836 | |
| 837 | long docCount = 0; |
| 838 | try { |
| 839 | FixedBitSet bitset; |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 840 | for (LeafReaderContext atomic : this.index.reader().leaves()) { |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 841 | if ((bitset = this.bits(atomic)) != null) |
| 842 | docCount += bitset.cardinality(); |
| 843 | }; |
| 844 | } |
| 845 | catch (IOException e) { |
| 846 | log.warn(e.getLocalizedMessage()); |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 847 | } |
| 848 | catch (QueryException e) { |
| 849 | log.warn(e.getLocalizedMessage()); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 850 | }; |
| 851 | return docCount; |
| Nils Diewald | dfb21ea | 2013-11-21 14:26:47 +0000 | [diff] [blame] | 852 | }; |
| 853 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 854 | |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 855 | private static String _bits (Bits bitset) { |
| 856 | String str = ""; |
| 857 | for (int i = 0; i < bitset.length(); i++) { |
| 858 | str += bitset.get(i) ? "1" : "0"; |
| 859 | }; |
| 860 | return str; |
| 861 | }; |
| margaretha | 2ac95e3 | 2021-11-29 15:31:14 +0100 | [diff] [blame] | 862 | |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 863 | @Override |
| 864 | public Set<String> getAllLeafFingerprints () { |
| 865 | return index.getAllLeafFingerprints(); |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 866 | } |
| margaretha | fe25280 | 2018-07-30 14:59:50 +0200 | [diff] [blame] | 867 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 868 | /* |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 869 | * Analyze how terms relate |
| 870 | */ |
| 871 | /* |
| Nils Diewald | 7cbcfe9 | 2014-09-22 22:01:51 +0000 | [diff] [blame] | 872 | @Deprecated |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 873 | public HashMap getTermRelation (KrillCollection kc, String field) |
| 874 | throws Exception { |
| 875 | HashMap<String, Long> map = new HashMap<>(100); |
| 876 | long docNumber = 0, checkNumber = 0; |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 877 | |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 878 | try { |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 879 | if (kc.getCount() <= 0) { |
| 880 | checkNumber = (long) this.reader().numDocs(); |
| 881 | }; |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 882 | |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 883 | for (LeafReaderContext atomic : this.reader().leaves()) { |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 884 | HashMap<String, FixedBitSet> termVector = new HashMap<>(20); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 885 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 886 | FixedBitSet docvec = kc.bits(atomic); |
| 887 | if (docvec != null) { |
| 888 | docNumber += docvec.cardinality(); |
| 889 | }; |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 890 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 891 | Terms terms = atomic.reader().fields().terms(field); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 892 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 893 | if (terms == null) { |
| 894 | continue; |
| 895 | }; |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 896 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 897 | int docLength = atomic.reader().maxDoc(); |
| 898 | FixedBitSet bitset = new FixedBitSet(docLength); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 899 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 900 | // Iterate over all tokens in this field |
| 901 | TermsEnum termsEnum = terms.iterator(null); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 902 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 903 | while (termsEnum.next() != null) { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 904 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 905 | String termString = termsEnum.term().utf8ToString(); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 906 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 907 | bitset.clear(0, docLength); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 908 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 909 | // Get frequency |
| 910 | bitset.or((DocIdSetIterator) termsEnum.docs((Bits) docvec, |
| 911 | null)); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 912 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 913 | long value = 0; |
| 914 | if (map.containsKey(termString)) |
| 915 | value = map.get(termString); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 916 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 917 | map.put(termString, value + bitset.cardinality()); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 918 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 919 | termVector.put(termString, bitset.clone()); |
| 920 | }; |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 921 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 922 | int keySize = termVector.size(); |
| 923 | String[] keys = termVector.keySet() |
| 924 | .toArray(new String[keySize]); |
| 925 | java.util.Arrays.sort(keys); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 926 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 927 | if (keySize > maxTermRelations) { |
| 928 | throw new Exception("termRelations are limited to " |
| 929 | + maxTermRelations + " sets" |
| 930 | + " (requested were at least " + keySize + " sets)"); |
| 931 | }; |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 932 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 933 | for (int i = 0; i < keySize; i++) { |
| 934 | for (int j = i + 1; j < keySize; j++) { |
| 935 | FixedBitSet comby = termVector.get(keys[i]).clone(); |
| 936 | comby.and(termVector.get(keys[j])); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 937 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 938 | StringBuilder sb = new StringBuilder(); |
| 939 | sb.append("#__").append(keys[i]).append(":###:") |
| 940 | .append(keys[j]); |
| 941 | String combString = sb.toString(); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 942 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 943 | long cap = (long) comby.cardinality(); |
| 944 | if (map.containsKey(combString)) { |
| 945 | cap += map.get(combString); |
| 946 | }; |
| 947 | map.put(combString, cap); |
| 948 | }; |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 949 | }; |
| 950 | }; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 951 | map.put("-docs", checkNumber != 0 ? checkNumber : docNumber); |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 952 | } |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 953 | catch (IOException e) { |
| 954 | log.warn(e.getMessage()); |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 955 | }; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 956 | return map; |
| Nils Diewald | 2276e1c | 2014-04-10 15:01:59 +0000 | [diff] [blame] | 957 | }; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 958 | */ |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 959 | |
| 960 | |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 961 | }; |