| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 1 | package de.ids_mannheim.korap; |
| 2 | |
| margaretha | fe25280 | 2018-07-30 14:59:50 +0200 | [diff] [blame] | 3 | import java.io.File; |
| 4 | import java.io.FileInputStream; |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 5 | import java.io.IOException; |
| margaretha | 7e31ca9 | 2021-12-13 10:48:44 +0100 | [diff] [blame] | 6 | import java.io.InputStream; |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 7 | import java.nio.ByteBuffer; |
| margaretha | 4dfe3c5 | 2018-08-13 17:07:50 +0200 | [diff] [blame] | 8 | import java.util.Properties; |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 9 | import java.util.Set; |
| margaretha | c20a921 | 2018-08-21 14:32:09 +0200 | [diff] [blame] | 10 | import java.util.zip.GZIPInputStream; |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 11 | |
| margaretha | fe25280 | 2018-07-30 14:59:50 +0200 | [diff] [blame] | 12 | import org.apache.commons.io.IOUtils; |
| margaretha | c20a921 | 2018-08-21 14:32:09 +0200 | [diff] [blame] | 13 | import org.apache.commons.io.output.ByteArrayOutputStream; |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 14 | import org.apache.lucene.index.DocsAndPositionsEnum; |
| 15 | import org.apache.lucene.index.LeafReader; |
| 16 | import org.apache.lucene.index.LeafReaderContext; |
| 17 | import org.apache.lucene.index.Term; |
| 18 | import org.apache.lucene.index.Terms; |
| 19 | import org.apache.lucene.index.TermsEnum; |
| 20 | import org.apache.lucene.search.BitsFilteredDocIdSet; |
| 21 | import org.apache.lucene.search.DocIdSet; |
| 22 | import org.apache.lucene.search.DocIdSetIterator; |
| 23 | import org.apache.lucene.search.Filter; |
| 24 | import org.apache.lucene.util.BitDocIdSet; |
| 25 | import org.apache.lucene.util.Bits; |
| 26 | import org.apache.lucene.util.BytesRef; |
| 27 | import org.apache.lucene.util.FixedBitSet; |
| 28 | import org.slf4j.Logger; |
| 29 | import org.slf4j.LoggerFactory; |
| 30 | |
| 31 | import com.fasterxml.jackson.databind.JsonNode; |
| 32 | import com.fasterxml.jackson.databind.ObjectMapper; |
| 33 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 34 | import de.ids_mannheim.korap.collection.CollectionBuilder; |
| Nils Diewald | c471b18 | 2014-11-19 22:51:15 +0000 | [diff] [blame] | 35 | import de.ids_mannheim.korap.response.Notifications; |
| margaretha | 4dfe3c5 | 2018-08-13 17:07:50 +0200 | [diff] [blame] | 36 | import de.ids_mannheim.korap.util.KrillProperties; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 37 | import de.ids_mannheim.korap.util.QueryException; |
| margaretha | 78f397a | 2017-06-29 13:44:46 +0200 | [diff] [blame] | 38 | import de.ids_mannheim.korap.util.StatusCodes; |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 39 | |
| Nils Diewald | c471b18 | 2014-11-19 22:51:15 +0000 | [diff] [blame] | 40 | /** |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 41 | * Create a Virtual Collection of documents by means of a KoralQuery |
| Nils Diewald | 2d5f810 | 2015-02-26 21:07:54 +0000 | [diff] [blame] | 42 | * collection object. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 43 | * |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 44 | * <blockquote><pre> |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 45 | * KrillCollection kc = new KrillCollection(json); |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 46 | * </pre></blockquote> |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 47 | * |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 48 | * @author diewald |
| Nils Diewald | c471b18 | 2014-11-19 22:51:15 +0000 | [diff] [blame] | 49 | */ |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 50 | /* |
| 51 | * TODO: Make a cache for the bits |
| 52 | * Delete it in case of an extension or a filter |
| 53 | * TODO: Maybe use randomaccessfilterstrategy |
| 54 | * TODO: Maybe a constantScoreQuery can make things faster? |
| 55 | * See http://mail-archives.apache.org/mod_mbox/lucene-java-user/ |
| 56 | * 200805.mbox/%3C17080852.post@talk.nabble.com%3E |
| 57 | */ |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 58 | public final class KrillCollection extends Notifications implements IndexInfo { |
// Index the virtual collection refers to
private KrillIndex index;

// Most recently imported "collection" node of a KoralQuery
private JsonNode json;

// Builder bound to this collection (declared before the other
// builder state, as it is constructed with a reference to this)
private final CollectionBuilder cb = new CollectionBuilder(this);

// Builder result that currently defines the collection
private CollectionBuilder.Interface cbi;

// Four byte buffer
// NOTE(review): readers and writers are outside of this section
private byte[] pl = new byte[4];

// Parser for serialized KoralQuery documents
private ObjectMapper mapper = new ObjectMapper();

// Reset to null whenever the collection definition changes
private Filter prefiltered = null;
// private static ByteBuffer bb = ByteBuffer.allocate(4);

// Logger
private static final Logger log =
        LoggerFactory.getLogger(KrillCollection.class);

// Compile-time constant: while false, code guarded by
// "if (DEBUG)" never runs
public static final boolean DEBUG = false;

// for debugging
private double start, end;
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 76 | |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 77 | /** |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 78 | * Construct a new KrillCollection. |
| 79 | * |
| 80 | */ |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 81 | public KrillCollection () {}; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 82 | |
| 83 | |
/**
 * Create a KrillCollection that refers to the given index.
 *
 * @param index
 *            The {@link KrillIndex} object the collection
 *            refers to.
 */
public KrillCollection (final KrillIndex index) {
    this.setIndex(index);
}
| 93 | |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 94 | /** |
| Nils Diewald | 2d5f810 | 2015-02-26 21:07:54 +0000 | [diff] [blame] | 95 | * Construct a new KrillCollection by passing a KoralQuery. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 96 | * |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 97 | * @param json |
| 98 | * The KoralQuery document as a JSON string. |
| Nils Diewald | 33fcb5d | 2014-11-07 23:27:03 +0000 | [diff] [blame] | 99 | */ |
| Nils Diewald | 2d5f810 | 2015-02-26 21:07:54 +0000 | [diff] [blame] | 100 | public KrillCollection (String jsonString) { |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 101 | try { |
| 102 | JsonNode json = mapper.readTree(jsonString); |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 103 | |
| 104 | if (json.has("errors") && json.get("errors").size() > 0) { |
| 105 | this.addError(StatusCodes.INVALID_QUERY, "Json has errors."); |
| margaretha | f2c3150 | 2017-06-26 17:57:16 +0200 | [diff] [blame] | 106 | } |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 107 | else if (json.has("collection")) { |
| Akron | 850b46e | 2016-06-08 10:08:55 +0200 | [diff] [blame] | 108 | this.fromKoral(json.get("collection")); |
| margaretha | f2c3150 | 2017-06-26 17:57:16 +0200 | [diff] [blame] | 109 | } |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 110 | else if (json.has("collections")) { |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 111 | this.addError(899, |
| 112 | "Collections are not supported anymore in favour of a single collection"); |
| margaretha | f2c3150 | 2017-06-26 17:57:16 +0200 | [diff] [blame] | 113 | } |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 114 | else { |
| 115 | this.addError(StatusCodes.MISSING_COLLECTION, |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 116 | "Collection is not found"); |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 117 | this.fromBuilder(this.build().nothing()); |
| margaretha | f2c3150 | 2017-06-26 17:57:16 +0200 | [diff] [blame] | 118 | } |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 119 | } |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 120 | |
| 121 | // Query Exception |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 122 | catch (QueryException qe) { |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 123 | this.addError(qe.getErrorCode(), qe.getMessage()); |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 124 | this.fromBuilder(this.build().nothing()); |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 125 | } |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 126 | |
| 127 | // JSON exception |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 128 | catch (IOException e) { |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 129 | this.addError(621, "Unable to parse JSON", "KrillCollection", |
| 130 | e.getLocalizedMessage()); |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 131 | this.fromBuilder(this.build().nothing()); |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 132 | }; |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 133 | }; |
| 134 | |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 135 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 136 | /** |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 137 | * Set the {@link KrillIndex} the virtual collection refers to. |
| 138 | * |
| 139 | * @param index |
| 140 | * The {@link KrillIndex} the virtual collection refers |
| 141 | * to. |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 142 | */ |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 143 | public void setIndex (KrillIndex index) { |
| 144 | this.index = index; |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 145 | }; |
| 146 | |
| Nils Diewald | 33fcb5d | 2014-11-07 23:27:03 +0000 | [diff] [blame] | 147 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 148 | /** |
| 149 | * Import the "collection" part of a KoralQuery. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 150 | * |
| 151 | * @param jsonString |
| 152 | * The "collection" part of a KoralQuery. |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 153 | * @throws QueryException |
| 154 | */ |
| Akron | 850b46e | 2016-06-08 10:08:55 +0200 | [diff] [blame] | 155 | public KrillCollection fromKoral (String jsonString) throws QueryException { |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 156 | this.prefiltered = null; |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 157 | try { |
| Akron | 850b46e | 2016-06-08 10:08:55 +0200 | [diff] [blame] | 158 | this.fromKoral((JsonNode) mapper.readTree(jsonString)); |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 159 | } |
| 160 | catch (Exception e) { |
| Nils Diewald | 2d5f810 | 2015-02-26 21:07:54 +0000 | [diff] [blame] | 161 | this.addError(621, "Unable to parse JSON", "KrillCollection"); |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 162 | this.fromBuilder(this.build().nothing()); |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 163 | }; |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 164 | |
| 165 | return this; |
| Nils Diewald | e364570 | 2014-11-07 21:15:20 +0000 | [diff] [blame] | 166 | }; |
| Nils Diewald | 7899352 | 2014-10-27 17:51:22 +0000 | [diff] [blame] | 167 | |
| Nils Diewald | 33fcb5d | 2014-11-07 23:27:03 +0000 | [diff] [blame] | 168 | |
| Akron | 65d57e9 | 2018-08-24 19:25:56 +0200 | [diff] [blame] | 169 | public KrillCollection fromStore (String ref) throws QueryException { |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 170 | Properties prop = KrillProperties.loadDefaultProperties(); |
| 171 | this.prefiltered = null; |
| 172 | |
| 173 | if (prop == null) { |
| 174 | this.addError(StatusCodes.MISSING_KRILL_PROPERTIES, |
| 175 | "krill.properties is not found."); |
| 176 | return null; |
| 177 | } |
| 178 | |
| margaretha | 7e31ca9 | 2021-12-13 10:48:44 +0100 | [diff] [blame] | 179 | String p = prop.getProperty("krill.test", "false"); |
| 180 | boolean isTest = Boolean.parseBoolean(p); |
| 181 | |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 182 | String namedVCPath = prop.getProperty("krill.namedVC"); |
| 183 | |
| 184 | if (!namedVCPath.endsWith("/")) { |
| 185 | namedVCPath += "/"; |
| 186 | }; |
| 187 | |
| 188 | String fileName = namedVCPath + ref + ".jsonld"; |
| 189 | File file; |
| 190 | String json = null; |
| margaretha | 7e31ca9 | 2021-12-13 10:48:44 +0100 | [diff] [blame] | 191 | InputStream is = null; |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 192 | if ((file= new File(fileName)).exists()) { |
| 193 | try (FileInputStream fis = new FileInputStream(file)) { |
| 194 | json = IOUtils.toString(fis,"utf-8"); |
| 195 | } |
| 196 | catch (IOException e) { |
| 197 | this.addError(StatusCodes.READING_COLLECTION_FAILED, |
| 198 | e.getMessage()); |
| 199 | return this; |
| 200 | } |
| 201 | } |
| 202 | // slower than plain text, but save space |
| 203 | else if ((file = new File(fileName + ".gz")).exists()){ |
| 204 | try (GZIPInputStream gzipInputStream = |
| 205 | new GZIPInputStream(new FileInputStream(file)); |
| 206 | ByteArrayOutputStream bos = |
| 207 | new ByteArrayOutputStream(512);) { |
| 208 | bos.write(gzipInputStream); |
| 209 | json = bos.toString("utf-8"); |
| 210 | } |
| 211 | catch (IOException e) { |
| 212 | this.addError(StatusCodes.READING_COLLECTION_FAILED, |
| 213 | e.getMessage()); |
| 214 | return this; |
| 215 | } |
| 216 | } |
| margaretha | 7e31ca9 | 2021-12-13 10:48:44 +0100 | [diff] [blame] | 217 | // for testing |
| 218 | else if (isTest |
| 219 | && (is = retrieveInputStreamFromClasspath(fileName)) != null) { |
| 220 | try { |
| 221 | json = IOUtils.toString(is, "utf-8"); |
| 222 | } |
| 223 | catch (IOException e) { |
| 224 | this.addError(StatusCodes.READING_COLLECTION_FAILED, |
| 225 | e.getMessage()); |
| 226 | return this; |
| 227 | } |
| 228 | } |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 229 | else{ |
| 230 | this.addError(StatusCodes.MISSING_COLLECTION, |
| 231 | "Collection is not found " + fileName); |
| 232 | return this; |
| 233 | }; |
| 234 | |
| 235 | return this.fromKoral(json); |
| 236 | }; |
| 237 | |
| 238 | |
| margaretha | 7e31ca9 | 2021-12-13 10:48:44 +0100 | [diff] [blame] | 239 | private InputStream retrieveInputStreamFromClasspath (String fileName) { |
| 240 | if (!fileName.startsWith("/")) { |
| 241 | fileName = "/"+fileName; |
| 242 | } |
| 243 | return KrillCollection.class.getResourceAsStream(fileName); |
| 244 | } |
| 245 | |
| 246 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 247 | /** |
| 248 | * Import the "collection" part of a KoralQuery. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 249 | * |
| 250 | * @param json |
| 251 | * The "collection" part of a KoralQuery |
| 252 | * as a {@link JsonNode} object. |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 253 | * @throws QueryException |
| 254 | */ |
| Akron | 850b46e | 2016-06-08 10:08:55 +0200 | [diff] [blame] | 255 | public KrillCollection fromKoral (JsonNode json) throws QueryException { |
| Akron | bb5d173 | 2015-06-22 01:22:40 +0200 | [diff] [blame] | 256 | this.json = json; |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 257 | this.prefiltered = null; |
| Akron | 850b46e | 2016-06-08 10:08:55 +0200 | [diff] [blame] | 258 | return this.fromBuilder(this._fromKoral(json)); |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 259 | }; |
| 260 | |
| 261 | |
| Akron | d5ca00a | 2016-06-08 14:29:00 +0200 | [diff] [blame] | 262 | // Create collection from KoralQuery |
| Akron | 850b46e | 2016-06-08 10:08:55 +0200 | [diff] [blame] | 263 | private CollectionBuilder.Interface _fromKoral (JsonNode json) |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 264 | throws QueryException { |
| 265 | |
| 266 | if (json.has("collection")) { |
| 267 | return this._fromKoral(json.at("/collection")); |
| 268 | }; |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 269 | |
| Akron | c63697c | 2015-06-17 22:32:02 +0200 | [diff] [blame] | 270 | if (!json.has("@type")) { |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 271 | throw new QueryException(701, |
| 272 | "JSON-LD group has no @type attribute"); |
| Akron | c63697c | 2015-06-17 22:32:02 +0200 | [diff] [blame] | 273 | }; |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 274 | |
| 275 | String type = json.get("@type").asText(); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 276 | |
| Nils Diewald | cec40f9 | 2015-02-19 22:20:02 +0000 | [diff] [blame] | 277 | if (type.equals("koral:doc")) { |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 278 | |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 279 | // default key |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 280 | String key = "tokens"; |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 281 | String valtype = "type:string"; |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 282 | String match = "match:eq"; |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 283 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 284 | if (json.has("key")) key = json.get("key").asText(); |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 285 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 286 | if (json.has("type")) valtype = json.get("type").asText(); |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 287 | |
| 288 | // Filter based on date |
| 289 | if (valtype.equals("type:date")) { |
| 290 | |
| 291 | if (!json.has("value")) |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 292 | throw new QueryException(820, "Dates require value fields"); |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 293 | |
| 294 | String dateStr = json.get("value").asText(); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 295 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 296 | if (json.has("match")) match = json.get("match").asText(); |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 297 | |
| 298 | // TODO: This isn't stable yet |
| 299 | switch (match) { |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 300 | case "match:eq": |
| 301 | return this.cb.date(key, dateStr); |
| 302 | case "match:ne": |
| 303 | return this.cb.date(key, dateStr).not(); |
| 304 | case "match:geq": |
| 305 | return this.cb.since(key, dateStr); |
| 306 | case "match:leq": |
| 307 | return this.cb.till(key, dateStr); |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 308 | }; |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 309 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 310 | throw new QueryException(841, |
| 311 | "Match relation unknown for type"); |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 312 | } |
| 313 | |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 314 | // Filter based on string |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 315 | else if (valtype.equals("type:string")) { |
| margaretha | ecddb0b | 2018-07-31 15:23:38 +0200 | [diff] [blame] | 316 | if (json.get("value").size() > 1){ |
| Akron | e64cc16 | 2019-01-08 18:40:37 +0100 | [diff] [blame] | 317 | if (DEBUG) { |
| 318 | log.debug("koral:doc size " + json.get("value").size()); |
| 319 | }; |
| margaretha | df0e9d1 | 2018-07-30 16:22:59 +0200 | [diff] [blame] | 320 | if (json.has("match")) { |
| 321 | match = json.get("match").asText(); |
| 322 | } |
| 323 | |
| margaretha | 8a8c427 | 2018-08-21 17:39:27 +0200 | [diff] [blame] | 324 | CollectionBuilder.Group group = this.cb.orGroup(); |
| 325 | for (JsonNode value : json.get("value")) { |
| 326 | group.with(cb.term(key, value.asText())); |
| margaretha | df0e9d1 | 2018-07-30 16:22:59 +0200 | [diff] [blame] | 327 | } |
| margaretha | 8a8c427 | 2018-08-21 17:39:27 +0200 | [diff] [blame] | 328 | |
| 329 | if (match.equals("match:ne")) { |
| 330 | return group.not(); |
| margaretha | df0e9d1 | 2018-07-30 16:22:59 +0200 | [diff] [blame] | 331 | } |
| 332 | return group; |
| 333 | } |
| 334 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 335 | if (json.has("match")) match = json.get("match").asText(); |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 336 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 337 | switch (match) { |
| 338 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 339 | case "match:eq": |
| 340 | return this.cb.term(key, json.get("value").asText()); |
| 341 | case "match:ne": |
| 342 | return this.cb.term(key, json.get("value").asText()) |
| 343 | .not(); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 344 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 345 | // Contains and containsnot (or excludes) is only |
| 346 | // effective on text fields and ineffective on |
| 347 | // string fields |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 348 | case "match:contains": |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 349 | return this.cb.text(key, json.get("value").asText()); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 350 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 351 | case "match:containsnot": |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 352 | return this.cb.text(key, json.get("value").asText()) |
| 353 | .not(); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 354 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 355 | // <LEGACY> |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 356 | case "match:excludes": |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 357 | return this.cb.text(key, json.get("value").asText()) |
| 358 | .not(); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 359 | // </LEGACY> |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 360 | }; |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 361 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 362 | throw new QueryException(841, |
| 363 | "Match relation unknown for type"); |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 364 | } |
| 365 | |
| 366 | // Filter based on regex |
| 367 | else if (valtype.equals("type:regex")) { |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 368 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 369 | if (json.has("match")) match = json.get("match").asText(); |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 370 | |
| 371 | if (match.equals("match:eq")) { |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 372 | return this.cb.re(key, json.get("value").asText()); |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 373 | } |
| 374 | else if (match.equals("match:ne")) { |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 375 | return this.cb.re(key, json.get("value").asText()).not(); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 376 | } |
| Akron | 2746970 | 2018-04-05 12:46:18 +0200 | [diff] [blame] | 377 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 378 | // Contains and containsnot (or excludes) is |
| 379 | // identical to eq and ne in case of regexes for the |
| 380 | // moment, |
| 381 | // though it may be beneficial to circumfix these |
| 382 | // with .* |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 383 | else if (match.equals("match:contains")) { |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 384 | return this.cb.re(key, json.get("value").asText()); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 385 | } |
| Akron | 2746970 | 2018-04-05 12:46:18 +0200 | [diff] [blame] | 386 | else if (match.equals("match:containsnot")) { |
| 387 | return this.cb.re(key, json.get("value").asText()); |
| 388 | } |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 389 | // <LEGACY> |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 390 | else if (match.equals("match:excludes")) { |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 391 | return this.cb.re(key, json.get("value").asText()).not(); |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 392 | }; |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 393 | // </LEGACY> |
| Akron | 48937e9 | 2015-06-26 01:49:02 +0200 | [diff] [blame] | 394 | |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 395 | throw new QueryException(841, |
| 396 | "Match relation unknown for type"); |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 397 | } |
| 398 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 399 | throw new QueryException(843, "Document type is not supported"); |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 400 | } |
| 401 | |
| 402 | // nested group |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 403 | else if (type.equals("koral:docGroup")) { |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 404 | |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 405 | if (!json.has("operands") || !json.get("operands").isArray()) |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 406 | throw new QueryException(842, |
| 407 | "Document group needs operand list"); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 408 | |
| Akron | 60dfa7e | 2015-08-03 22:15:17 +0200 | [diff] [blame] | 409 | CollectionBuilder.Group group; |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 410 | |
| 411 | String operation = "operation:and"; |
| 412 | if (json.has("operation")) |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 413 | operation = json.get("operation").asText(); |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 414 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 415 | if (operation.equals("operation:or")) |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 416 | group = this.cb.orGroup(); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 417 | else if (operation.equals("operation:and")) |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 418 | group = this.cb.andGroup(); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 419 | else |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 420 | throw new QueryException(810, |
| 421 | "Unknown document group operation"); |
| 422 | |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 423 | for (JsonNode operand : json.get("operands")) { |
| Akron | 6b0be13 | 2019-09-16 19:01:59 +0200 | [diff] [blame] | 424 | |
| 425 | // TODO: |
| 426 | // Potentially bed here, when operand is a group inside a group |
| 427 | // with the same operator (and not negative) |
| Akron | 850b46e | 2016-06-08 10:08:55 +0200 | [diff] [blame] | 428 | group.with(this._fromKoral(operand)); |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 429 | }; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 430 | return group; |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 431 | } |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 432 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 433 | // vc reference |
| 434 | else if (type.equals("koral:docGroupRef")) { |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 435 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 436 | if (!json.has("ref")) { |
| 437 | throw new QueryException(StatusCodes.MISSING_VC_REFERENCE, |
| 438 | "ref is not found"); |
| 439 | } |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 440 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 441 | String ref = json.get("ref").asText(); |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 442 | if (ref.isEmpty()) { |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 443 | throw new QueryException(StatusCodes.MISSING_VC_REFERENCE, |
| 444 | "ref is empty"); |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 445 | }; |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 446 | |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 447 | return this.cb.referTo(ref); |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 448 | } |
| 449 | |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 450 | |
| 451 | // Unknown type |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 452 | throw new QueryException(813, "Collection type is not supported"); |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 453 | }; |
| 454 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 455 | /** |
| 456 | * Set the collection from a {@link CollectionBuilder} object. |
| 457 | * |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 458 | * @param cb |
| 459 | * The CollectionBuilder object. |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 460 | */ |
| Akron | 60dfa7e | 2015-08-03 22:15:17 +0200 | [diff] [blame] | 461 | public KrillCollection fromBuilder (CollectionBuilder.Interface cbi) { |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 462 | this.prefiltered = null; |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 463 | this.cbi = cbi; |
| Nils Diewald | 3aa9e69 | 2015-02-20 22:20:11 +0000 | [diff] [blame] | 464 | return this; |
| Nils Diewald | e364570 | 2014-11-07 21:15:20 +0000 | [diff] [blame] | 465 | }; |
| 466 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 467 | |
| Akron | 60dfa7e | 2015-08-03 22:15:17 +0200 | [diff] [blame] | 468 | public CollectionBuilder.Interface getBuilder () { |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 469 | return this.cbi; |
| 470 | }; |
| 471 | |
| 472 | |
| 473 | public CollectionBuilder build () { |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 474 | return this.cb; |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 475 | }; |
| 476 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 477 | |
| Akron | 60dfa7e | 2015-08-03 22:15:17 +0200 | [diff] [blame] | 478 | public KrillCollection filter (CollectionBuilder.Interface filter) { |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 479 | return this.fromBuilder(this.cb.andGroup().with(this.cbi).with(filter)); |
| 480 | }; |
| 481 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 482 | |
| Akron | 60dfa7e | 2015-08-03 22:15:17 +0200 | [diff] [blame] | 483 | public KrillCollection extend (CollectionBuilder.Interface extension) { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 484 | return this |
| 485 | .fromBuilder(this.cb.orGroup().with(this.cbi).with(extension)); |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 486 | }; |
| 487 | |
| 488 | |
| 489 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 490 | /** |
| 491 | * Add a filter based on a list of unique document identifiers. |
| 492 | * UIDs may be indexed in the field "UID". |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 493 | * |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 494 | * This filter is not part of the legacy API! |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 495 | * |
| 496 | * @param uids |
| 497 | * The list of unique document identifier. |
| Nils Diewald | 2d5f810 | 2015-02-26 21:07:54 +0000 | [diff] [blame] | 498 | * @return The {@link KrillCollection} object for chaining. |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 499 | */ |
| Nils Diewald | 2d5f810 | 2015-02-26 21:07:54 +0000 | [diff] [blame] | 500 | public KrillCollection filterUIDs (String ... uids) { |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 501 | this.prefiltered = null; |
| Akron | 60dfa7e | 2015-08-03 22:15:17 +0200 | [diff] [blame] | 502 | CollectionBuilder.Group cbg = this.cb.orGroup(); |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 503 | for (String uid : uids) { |
| 504 | cbg.with(this.cb.term("UID", uid)); |
| 505 | }; |
| Akron | 60dfa7e | 2015-08-03 22:15:17 +0200 | [diff] [blame] | 506 | return this.filter(cbg); |
| Nils Diewald | d723d81 | 2014-09-23 18:50:52 +0000 | [diff] [blame] | 507 | }; |
| 508 | |
| 509 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 510 | /** |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 511 | * Serialize collection to a {@link Filter} object. |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 512 | */ |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 513 | public Filter toFilter () throws QueryException { |
| 514 | if (this.cbi == null) |
| 515 | return null; |
| 516 | |
| 517 | if (this.prefiltered != null) |
| 518 | return this.prefiltered; |
| 519 | |
| 520 | this.prefiltered = this.cbi.toFilter(); |
| 521 | return this.prefiltered; |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 522 | }; |
| 523 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 524 | |
| 525 | /** |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 526 | * Boolean value if the collection should work inverted or |
| 527 | * not. |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 528 | */ |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 529 | public boolean isNegative () { |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 530 | if (this.cbi == null) return false; |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 531 | |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 532 | return this.cbi.isNegative(); |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 533 | }; |
| 534 | |
| 535 | |
| 536 | /** |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 537 | * Generate a string representation of the virtual collection. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 538 | * |
| 539 | * <strong>Warning</strong>: This currently does not generate a |
| 540 | * valid |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 541 | * KoralQuery string, so this may change in a future version. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 542 | * |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 543 | * @return A string representation of the virtual collection. |
| 544 | */ |
| Nils Diewald | b1c3b65 | 2013-12-28 22:47:00 +0000 | [diff] [blame] | 545 | public String toString () { |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 546 | try { |
| 547 | Filter filter = this.toFilter(); |
| 548 | if (filter == null) return ""; |
| 549 | return (this.isNegative() ? "-" : "") + filter.toString(); |
| 550 | } |
| 551 | catch (QueryException qe) { |
| 552 | log.warn(qe.getLocalizedMessage()); |
| 553 | }; |
| 554 | return ""; |
| Nils Diewald | b1c3b65 | 2013-12-28 22:47:00 +0000 | [diff] [blame] | 555 | }; |
| 556 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 557 | |
| Nils Diewald | 7cbcfe9 | 2014-09-22 22:01:51 +0000 | [diff] [blame] | 558 | /** |
| Akron | bb5d173 | 2015-06-22 01:22:40 +0200 | [diff] [blame] | 559 | * Return the associated KoralQuery collection object |
| 560 | * as a {@link JsonNode}. This won't work, |
| 561 | * if the object was build using a CollectionBuilder, |
| 562 | * therefore it is limited to mirror a deserialized KoralQuery |
| 563 | * object. |
| 564 | * |
| 565 | * @return The {@link JsonNode} representing the collection object |
| 566 | * of a deserialized KoralQuery object. |
| 567 | */ |
| 568 | public JsonNode toJsonNode () { |
| 569 | return this.json; |
| 570 | }; |
| 571 | |
| 572 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 573 | /** |
| 574 | * Create a bit vector representing the live documents of the |
| 575 | * virtual collection to be used in searches. |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 576 | * This will respect deleted documents. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 577 | * |
| 578 | * @param The |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 579 | * {@link LeafReaderContext} to search in. |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 580 | * @return A bit vector representing the live documents of the |
| 581 | * virtual collection. |
| 582 | * @throws IOException |
| 583 | */ |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 584 | public FixedBitSet bits (LeafReaderContext atomic) throws IOException, QueryException { |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 585 | |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 586 | // EM: really need a fixedBitset? |
| 587 | // maybe better use org.apache.lucene.util.BitDocIdSet.Builder |
| 588 | // for automatic sparse bitset support |
| 589 | // appears possible by implementing a SparseDocBits class extending |
| 590 | // SparseFixedBitSet and implementing Serializable (only as marker interface) |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 591 | LeafReader r = atomic.reader(); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 592 | FixedBitSet bitset = new FixedBitSet(r.maxDoc()); |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 593 | |
| 594 | if (DEBUG) { |
| 595 | start = System.currentTimeMillis(); |
| 596 | } |
| 597 | DocIdSet docids = null; |
| 598 | try { |
| 599 | docids = this.getDocIdSet(atomic, (Bits) r.getLiveDocs()); |
| 600 | } |
| 601 | catch (RuntimeException e) { |
| 602 | Throwable t = e.getCause(); |
| 603 | if (t instanceof IOException) { |
| 604 | throw new IOException(t); |
| 605 | } |
| 606 | else if (t instanceof QueryException) { |
| 607 | throw new QueryException(((QueryException) t).getErrorCode(), t.getLocalizedMessage()); |
| 608 | } |
| margaretha | 05a4bc1 | 2022-02-11 10:55:43 +0100 | [diff] [blame] | 609 | else { |
| 610 | throw e; |
| 611 | } |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 612 | } |
| 613 | |
| 614 | if (DEBUG) { |
| 615 | end = System.currentTimeMillis(); |
| 616 | log.info("getDocIdSet in bits: " + (end - start)); |
| 617 | } |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 618 | |
| Akron | 6b0be13 | 2019-09-16 19:01:59 +0200 | [diff] [blame] | 619 | |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 620 | if (docids == null) { |
| 621 | if (this.cbi != null) { |
| 622 | bitset.clear(0, bitset.length()); |
| 623 | } |
| 624 | else { |
| 625 | bitset.set(0, bitset.length()); |
| 626 | }; |
| 627 | } |
| Akron | 6b0be13 | 2019-09-16 19:01:59 +0200 | [diff] [blame] | 628 | else { |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 629 | bitset.or(docids.iterator()); |
| Akron | 6b0be13 | 2019-09-16 19:01:59 +0200 | [diff] [blame] | 630 | } |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 631 | |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 632 | return bitset; |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 633 | }; |
| 634 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 635 | |
| 636 | /** |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 637 | * Return the {@link DocIdSet} representing the documents of the |
| 638 | * virtual collection to be used in searches. |
| 639 | * This will respect deleted documents. |
| 640 | * |
| 641 | * @param atomic |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 642 | * The {@link LeafReaderContext} to search in. |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 643 | * @param accepted |
| 644 | * {@link Bits} vector of accepted documents. |
| 645 | * @throws IOException |
| 646 | */ |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 647 | public DocIdSet getDocIdSet (LeafReaderContext atomic, Bits acceptDocs) |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 648 | throws IOException, QueryException { |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 649 | |
| 650 | int maxDoc = atomic.reader().maxDoc(); |
| 651 | FixedBitSet bitset = new FixedBitSet(maxDoc); |
| 652 | |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 653 | final Filter filter = this.toFilter(); |
| 654 | |
| 655 | if (filter == null) { |
| 656 | if (acceptDocs == null) |
| 657 | return null; |
| 658 | bitset.set(0, maxDoc); |
| 659 | } |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 660 | else { |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 661 | |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 662 | // Init vector |
| 663 | DocIdSet docids = filter.getDocIdSet(atomic, null); |
| Akron | 6b0be13 | 2019-09-16 19:01:59 +0200 | [diff] [blame] | 664 | |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 665 | DocIdSetIterator filterIter = |
| 666 | (docids == null) ? null : docids.iterator(); |
| 667 | |
| 668 | if (filterIter == null) { |
| Akron | 6b0be13 | 2019-09-16 19:01:59 +0200 | [diff] [blame] | 669 | |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 670 | if (!this.cbi.isNegative()) return null; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 671 | |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 672 | bitset.set(0, maxDoc); |
| 673 | } |
| 674 | else { |
| Akron | 6b0be13 | 2019-09-16 19:01:59 +0200 | [diff] [blame] | 675 | |
| 676 | // Or bit set |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 677 | bitset.or(filterIter); |
| 678 | |
| 679 | // Revert for negation |
| 680 | if (this.cbi.isNegative()) bitset.flip(0, maxDoc); |
| 681 | }; |
| 682 | }; |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 683 | |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 684 | if (DEBUG) { |
| 685 | log.debug("Bit set is {}", _bits(bitset)); |
| 686 | log.debug("Livedocs is {}", _bits(acceptDocs)); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 687 | }; |
| 688 | |
| 689 | // Remove deleted docs |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 690 | return (DocIdSet) BitsFilteredDocIdSet |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 691 | .wrap((DocIdSet) new BitDocIdSet(bitset), acceptDocs); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 692 | }; |
| 693 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 694 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 695 | public long numberOf (String type) throws IOException { |
| 696 | return this.numberOf("tokens", type); |
| 697 | }; |
| 698 | |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 699 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 700 | /** |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 701 | * Search for the number of occurrences of different types, |
| 702 | * e.g. <i>documents</i>, <i>sentences</i> etc. in the virtual |
| 703 | * collection. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 704 | * |
| 705 | * @param field |
| 706 | * The field containing the textual data and the |
| 707 | * annotations as a string. |
| 708 | * @param type |
| 709 | * The type of meta information, |
| 710 | * e.g. <i>documents</i> or <i>sentences</i> as a |
| 711 | * string. |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 712 | * @return The number of the occurrences. |
| 713 | * @throws IOException |
| Nils Diewald | a14ecd6 | 2015-02-26 21:00:20 +0000 | [diff] [blame] | 714 | * @see KrillIndex#numberOf |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 715 | */ |
| 716 | public long numberOf (String field, String type) throws IOException { |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 717 | |
| 718 | // No index defined |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 719 | if (this.index == null) return (long) -1; |
| Nils Diewald | c925b49 | 2013-12-03 23:56:10 +0000 | [diff] [blame] | 720 | |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 721 | // No reader (inex is empty) |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 722 | if (this.index.reader() == null) return (long) 0; |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 723 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 724 | // This is redundant to index stuff |
| 725 | if (type.equals("documents") || type.equals("base/texts")) { |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 726 | if (this.cbi == null) { |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 727 | if (this.index.reader() == null) return (long) 0; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 728 | return (long) this.index.reader().numDocs(); |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 729 | } |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 730 | else |
| 731 | return this.docCount(); |
| 732 | }; |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 733 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 734 | // Create search term |
| 735 | // This may be prefixed by foundries |
| 736 | Term term = new Term(field, "-:" + type); |
| 737 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 738 | if (DEBUG) log.debug("Iterate for {}/{}", field, type); |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 739 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 740 | long occurrences = 0; |
| 741 | try { |
| 742 | // Iterate over all atomic readers and collect occurrences |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 743 | for (LeafReaderContext atomic : this.index.reader().leaves()) { |
| Akron | fd05f50 | 2015-07-30 18:34:26 +0200 | [diff] [blame] | 744 | Bits bits = this.bits(atomic); |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 745 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 746 | if (DEBUG) log.debug("Final bits {}", _bits(bits)); |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 747 | |
| 748 | occurrences += this._numberOfAtomic(bits, atomic, term); |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 749 | if (DEBUG) log.debug("Added up to {} for {}/{}", occurrences, |
| 750 | field, type); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 751 | }; |
| 752 | } |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 753 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 754 | // Something went wrong |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 755 | catch (IOException e) { |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 756 | log.warn(e.getLocalizedMessage()); |
| 757 | } |
| 758 | |
| 759 | // E.g. reference corpus not found |
| 760 | catch (QueryException e) { |
| 761 | log.warn(e.getLocalizedMessage()); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 762 | }; |
| 763 | |
| 764 | return occurrences; |
| Nils Diewald | baf68c5 | 2013-11-20 13:22:19 +0000 | [diff] [blame] | 765 | }; |
| 766 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 767 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 768 | // Search for meta information in term vectors |
| 769 | // This will create the sum of all numerical payloads |
| 770 | // of the term in the document vector |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 771 | private long _numberOfAtomic (Bits docvec, LeafReaderContext atomic, |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 772 | Term term) throws IOException { |
| 773 | |
| 774 | // This reimplements docsAndPositionsEnum with payloads |
| 775 | final Terms terms = atomic.reader().fields().terms(term.field()); |
| 776 | |
| 777 | // No terms were found |
| 778 | if (terms != null) { |
| 779 | // Todo: Maybe reuse a termsEnum! |
| 780 | final TermsEnum termsEnum = terms.iterator(null); |
| 781 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 782 | // Set the position in the iterator to the term that is |
| 783 | // seeked |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 784 | if (termsEnum.seekExact(term.bytes())) { |
| 785 | |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 786 | // TODO: Reuse a DocsAndPositionsEnum!! |
| 787 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 788 | // Start an iterator to fetch all payloads of the term |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 789 | DocsAndPositionsEnum docs = termsEnum.docsAndPositions(docvec, |
| 790 | null, DocsAndPositionsEnum.FLAG_PAYLOADS); |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 791 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 792 | |
| 793 | // The iterator is empty |
| 794 | // This may even be an error, but we return 0 |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 795 | if (docs.docID() == DocsAndPositionsEnum.NO_MORE_DOCS) return 0; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 796 | |
| 797 | // Init some variables for data copying |
| 798 | long occurrences = 0; |
| 799 | BytesRef payload; |
| 800 | |
| 801 | // Init nextDoc() |
| 802 | while (docs.nextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS) { |
| 803 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 804 | if (docs.freq() < 1) continue; |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 805 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 806 | // Initialize (go to first term) |
| 807 | docs.nextPosition(); |
| 808 | |
| 809 | // Copy payload with the offset of the BytesRef |
| 810 | payload = docs.getPayload(); |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 811 | if (payload != null) { |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 812 | System.arraycopy(payload.bytes, payload.offset, pl, 0, |
| 813 | 4); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 814 | |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 815 | // Add payload as integer |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 816 | occurrences += ByteBuffer.wrap(pl).getInt(); |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 817 | |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 818 | if (DEBUG) log.debug( |
| 819 | "Value for {} incremented by {} to {} in {}", |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 820 | term, ByteBuffer.wrap(pl).getInt(), occurrences, |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 821 | docs.docID()); |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 822 | }; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 823 | }; |
| 824 | |
| 825 | // Return the sum of all occurrences |
| 826 | return occurrences; |
| 827 | }; |
| 828 | }; |
| 829 | |
| 830 | // Nothing found |
| 831 | return 0; |
| Akron | 4055017 | 2015-08-04 03:06:12 +0200 | [diff] [blame] | 832 | }; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 833 | |
| 834 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 835 | /** |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 836 | * Return the number of documents in the virtual |
| 837 | * collection. |
| Nils Diewald | bb33da2 | 2015-03-04 16:24:25 +0000 | [diff] [blame] | 838 | * |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 839 | * @return The number of the occurrences. |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 840 | * @see #numberOf |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 841 | */ |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 842 | public long docCount () { |
| Nils Diewald | afab8f3 | 2015-01-26 19:11:32 +0000 | [diff] [blame] | 843 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 844 | // No index defined |
| margaretha | 8efa375 | 2018-07-24 17:46:43 +0200 | [diff] [blame] | 845 | if (this.index == null) return (long) 0; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 846 | |
| 847 | // TODO: Caching! |
| 848 | |
| 849 | long docCount = 0; |
| 850 | try { |
| 851 | FixedBitSet bitset; |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 852 | for (LeafReaderContext atomic : this.index.reader().leaves()) { |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 853 | if ((bitset = this.bits(atomic)) != null) |
| 854 | docCount += bitset.cardinality(); |
| 855 | }; |
| 856 | } |
| 857 | catch (IOException e) { |
| 858 | log.warn(e.getLocalizedMessage()); |
| Akron | b59f40e | 2018-08-23 17:15:43 +0200 | [diff] [blame] | 859 | } |
| 860 | catch (QueryException e) { |
| 861 | log.warn(e.getLocalizedMessage()); |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 862 | }; |
| 863 | return docCount; |
| Nils Diewald | dfb21ea | 2013-11-21 14:26:47 +0000 | [diff] [blame] | 864 | }; |
| 865 | |
| Nils Diewald | ea96950 | 2015-02-16 21:10:54 +0000 | [diff] [blame] | 866 | |
| Akron | aa74ec6 | 2015-07-31 17:22:55 +0200 | [diff] [blame] | 867 | private static String _bits (Bits bitset) { |
| 868 | String str = ""; |
| 869 | for (int i = 0; i < bitset.length(); i++) { |
| 870 | str += bitset.get(i) ? "1" : "0"; |
| 871 | }; |
| 872 | return str; |
| 873 | }; |
| margaretha | 2ac95e3 | 2021-11-29 15:31:14 +0100 | [diff] [blame] | 874 | |
| margaretha | 5a8abea | 2021-11-08 16:57:51 +0100 | [diff] [blame] | 875 | @Override |
| 876 | public Set<String> getAllLeafFingerprints () { |
| 877 | return index.getAllLeafFingerprints(); |
| margaretha | 85ee2ac | 2018-07-25 17:58:09 +0200 | [diff] [blame] | 878 | } |
| margaretha | fe25280 | 2018-07-30 14:59:50 +0200 | [diff] [blame] | 879 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 880 | /* |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 881 | * Analyze how terms relate |
| 882 | */ |
| 883 | /* |
| Nils Diewald | 7cbcfe9 | 2014-09-22 22:01:51 +0000 | [diff] [blame] | 884 | @Deprecated |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 885 | public HashMap getTermRelation (KrillCollection kc, String field) |
| 886 | throws Exception { |
| 887 | HashMap<String, Long> map = new HashMap<>(100); |
| 888 | long docNumber = 0, checkNumber = 0; |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 889 | |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 890 | try { |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 891 | if (kc.getCount() <= 0) { |
| 892 | checkNumber = (long) this.reader().numDocs(); |
| 893 | }; |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 894 | |
| Akron | 700c1eb | 2015-09-25 16:57:30 +0200 | [diff] [blame] | 895 | for (LeafReaderContext atomic : this.reader().leaves()) { |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 896 | HashMap<String, FixedBitSet> termVector = new HashMap<>(20); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 897 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 898 | FixedBitSet docvec = kc.bits(atomic); |
| 899 | if (docvec != null) { |
| 900 | docNumber += docvec.cardinality(); |
| 901 | }; |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 902 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 903 | Terms terms = atomic.reader().fields().terms(field); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 904 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 905 | if (terms == null) { |
| 906 | continue; |
| 907 | }; |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 908 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 909 | int docLength = atomic.reader().maxDoc(); |
| 910 | FixedBitSet bitset = new FixedBitSet(docLength); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 911 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 912 | // Iterate over all tokens in this field |
| 913 | TermsEnum termsEnum = terms.iterator(null); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 914 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 915 | while (termsEnum.next() != null) { |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 916 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 917 | String termString = termsEnum.term().utf8ToString(); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 918 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 919 | bitset.clear(0, docLength); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 920 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 921 | // Get frequency |
| 922 | bitset.or((DocIdSetIterator) termsEnum.docs((Bits) docvec, |
| 923 | null)); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 924 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 925 | long value = 0; |
| 926 | if (map.containsKey(termString)) |
| 927 | value = map.get(termString); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 928 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 929 | map.put(termString, value + bitset.cardinality()); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 930 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 931 | termVector.put(termString, bitset.clone()); |
| 932 | }; |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 933 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 934 | int keySize = termVector.size(); |
| 935 | String[] keys = termVector.keySet() |
| 936 | .toArray(new String[keySize]); |
| 937 | java.util.Arrays.sort(keys); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 938 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 939 | if (keySize > maxTermRelations) { |
| 940 | throw new Exception("termRelations are limited to " |
| 941 | + maxTermRelations + " sets" |
| 942 | + " (requested were at least " + keySize + " sets)"); |
| 943 | }; |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 944 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 945 | for (int i = 0; i < keySize; i++) { |
| 946 | for (int j = i + 1; j < keySize; j++) { |
| 947 | FixedBitSet comby = termVector.get(keys[i]).clone(); |
| 948 | comby.and(termVector.get(keys[j])); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 949 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 950 | StringBuilder sb = new StringBuilder(); |
| 951 | sb.append("#__").append(keys[i]).append(":###:") |
| 952 | .append(keys[j]); |
| 953 | String combString = sb.toString(); |
| Eliza Margaretha | 6f98920 | 2016-10-14 21:48:29 +0200 | [diff] [blame] | 954 | |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 955 | long cap = (long) comby.cardinality(); |
| 956 | if (map.containsKey(combString)) { |
| 957 | cap += map.get(combString); |
| 958 | }; |
| 959 | map.put(combString, cap); |
| 960 | }; |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 961 | }; |
| 962 | }; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 963 | map.put("-docs", checkNumber != 0 ? checkNumber : docNumber); |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 964 | } |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 965 | catch (IOException e) { |
| 966 | log.warn(e.getMessage()); |
| Nils Diewald | 44d5fa1 | 2015-01-15 21:31:52 +0000 | [diff] [blame] | 967 | }; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 968 | return map; |
| Nils Diewald | 2276e1c | 2014-04-10 15:01:59 +0000 | [diff] [blame] | 969 | }; |
| Akron | 176c9b1 | 2015-07-29 19:53:40 +0200 | [diff] [blame] | 970 | */ |
| Nils Diewald | 65894bd | 2015-02-16 21:36:53 +0000 | [diff] [blame] | 971 | |
| 972 | |
| Nils Diewald | 01b4ce3 | 2013-12-05 22:39:25 +0000 | [diff] [blame] | 973 | }; |