blob: 3e7095c0a93205a744c8ecdf38e7f2905d44f8b8 [file] [log] [blame]
Michael Hanlca740d72015-06-16 10:04:58 +02001// Connector to the Lucene Backend
2package de.ids_mannheim.korap.web;
Michael Hanl8abaf9e2016-05-23 16:46:35 +02003
margaretha61471cc2017-04-20 18:42:23 +02004import java.io.File;
5import java.io.IOException;
6import java.nio.file.Paths;
7import java.util.List;
margarethaf68daa62017-09-21 02:11:24 +02008import java.util.Properties;
margarethaa76ed242017-05-24 17:48:22 +02009import java.util.regex.Matcher;
10import java.util.regex.Pattern;
margaretha61471cc2017-04-20 18:42:23 +020011
12import org.apache.lucene.store.MMapDirectory;
13import org.slf4j.Logger;
14import org.slf4j.LoggerFactory;
15
Michael Hanlca740d72015-06-16 10:04:58 +020016import de.ids_mannheim.korap.Krill;
17import de.ids_mannheim.korap.KrillCollection;
18import de.ids_mannheim.korap.KrillIndex;
margarethaa76ed242017-05-24 17:48:22 +020019import de.ids_mannheim.korap.exceptions.KustvaktException;
20import de.ids_mannheim.korap.exceptions.StatusCodes;
Michael Hanlca740d72015-06-16 10:04:58 +020021import de.ids_mannheim.korap.response.Match;
22import de.ids_mannheim.korap.response.Result;
23import de.ids_mannheim.korap.util.QueryException;
Michael Hanl8abaf9e2016-05-23 16:46:35 +020024
/**
 * The SearchKrill class allows for searching in the
 * Lucene-based Krill backend by applying KoralQuery.
 *
 * @author Nils Diewald
 */
Akronb99b70c2015-06-19 20:14:28 +020031public class SearchKrill {
Michael Hanl8abaf9e2016-05-23 16:46:35 +020032 private final static Logger jlog = LoggerFactory
Michael Hanlf0785322015-11-13 16:14:45 +010033 .getLogger(SearchKrill.class);
Akron78e2d202016-10-13 14:17:11 +020034
35 // Temporary - shouldn't be here.
Michael Hanlca740d72015-06-16 10:04:58 +020036 String indexDir = "/data/prep_corpus/index/";
37 String i = "/Users/hanl/Projects/prep_corpus";
38 String klinux10 = "/vol/work/hanl/indices";
Michael Hanlca740d72015-06-16 10:04:58 +020039 private KrillIndex index;
margarethaf68daa62017-09-21 02:11:24 +020040
/**
 * Creates a search connector for the Krill index at the given location.
 *
 * <p>
 * The special value {@code ":temp:"} creates a {@link KrillIndex} via its
 * no-argument constructor instead of opening a directory. Any other value
 * is treated as a file system path and opened through an
 * {@link MMapDirectory}. If nothing exists at that path, the error is
 * logged and the JVM is terminated with exit code -1. If opening the
 * index fails with an {@link IOException}, the failure is logged
 * (including the stack trace) and the instance is left without an index,
 * so the other methods of this class report a missing index.
 *
 * @param path
 *            {@code ":temp:"} or the path of the index directory;
 *            must not be {@code null}
 */
// todo: use korap.config to get index location
public SearchKrill (String path) {
    try {
        if (path.equals(":temp:")) {
            this.index = new KrillIndex();
        }
        else {
            File f = new File(path);
            jlog.info("Loading index from {}", path);
            if (!f.exists()) {
                // Deliberate fail-fast: without an index there is
                // nothing this service could answer.
                jlog.error("Index not found: {}!", path);
                System.exit(-1);
            }
            this.index = new KrillIndex(new MMapDirectory(Paths.get(path)));
        }
    }
    catch (IOException e) {
        // The trailing throwable makes SLF4J log the stack trace as well,
        // not just the message.
        jlog.error("Unable to load index from {}: {}", path,
                e.getMessage(), e);
    }
}
Michael Hanl8abaf9e2016-05-23 16:46:35 +020065
Michael Hanl19390652016-01-16 11:01:24 +010066 public KrillIndex getIndex () {
67 return this.index;
Akron78e2d202016-10-13 14:17:11 +020068 };
Michael Hanl8abaf9e2016-05-23 16:46:35 +020069
70
Michael Hanlca740d72015-06-16 10:04:58 +020071 /**
72 * Search in the Lucene index.
Michael Hanl8abaf9e2016-05-23 16:46:35 +020073 *
74 * @param json
75 * JSON-LD string with search and potential meta
76 * filters.
Michael Hanlca740d72015-06-16 10:04:58 +020077 */
Michael Hanl8abaf9e2016-05-23 16:46:35 +020078 public String search (String json) {
margaretha61471cc2017-04-20 18:42:23 +020079 jlog.trace(json);
Michael Hanlca740d72015-06-16 10:04:58 +020080 if (this.index != null)
Akronb99b70c2015-06-19 20:14:28 +020081 return new Krill(json).apply(this.index).toJsonString();
Michael Hanlca740d72015-06-16 10:04:58 +020082 Result kr = new Result();
Akronb99b70c2015-06-19 20:14:28 +020083 kr.addError(601, "Unable to find index");
Michael Hanlca740d72015-06-16 10:04:58 +020084 return kr.toJsonString();
Akron78e2d202016-10-13 14:17:11 +020085 };
Michael Hanl8abaf9e2016-05-23 16:46:35 +020086
87
Michael Hanlca740d72015-06-16 10:04:58 +020088 /**
89 * Search in the Lucene index and return matches as token lists.
Michael Hanl8abaf9e2016-05-23 16:46:35 +020090 *
91 * @param json
92 * JSON-LD string with search and potential meta
93 * filters.
Michael Hanlca740d72015-06-16 10:04:58 +020094 */
Akronb99b70c2015-06-19 20:14:28 +020095 @Deprecated
Michael Hanl8abaf9e2016-05-23 16:46:35 +020096 public String searchTokenList (String json) {
margaretha61471cc2017-04-20 18:42:23 +020097 jlog.trace(json);
Michael Hanlca740d72015-06-16 10:04:58 +020098 if (this.index != null)
Akronb99b70c2015-06-19 20:14:28 +020099 return new Krill(json).apply(this.index).toTokenListJsonString();
Michael Hanlca740d72015-06-16 10:04:58 +0200100 Result kr = new Result();
Akronb99b70c2015-06-19 20:14:28 +0200101 kr.addError(601, "Unable to find index");
Michael Hanlca740d72015-06-16 10:04:58 +0200102 return kr.toJsonString();
Akron78e2d202016-10-13 14:17:11 +0200103 };
Michael Hanl8abaf9e2016-05-23 16:46:35 +0200104
105
Michael Hanlca740d72015-06-16 10:04:58 +0200106 /**
Michael Hanl8abaf9e2016-05-23 16:46:35 +0200107 * Get info on a match - by means of a richly annotated html
108 * snippet.
109 *
110 * @param id
111 * match id
margarethaa76ed242017-05-24 17:48:22 +0200112 * @param availabilityList
113 * @throws KustvaktException
Michael Hanlca740d72015-06-16 10:04:58 +0200114 */
margarethaa76ed242017-05-24 17:48:22 +0200115 public String getMatch (String id, Pattern licensePattern) {
116 Match km;
Michael Hanlca740d72015-06-16 10:04:58 +0200117 if (this.index != null) {
118 try {
margarethaa76ed242017-05-24 17:48:22 +0200119 km = this.index.getMatch(id);
120 String availability = km.getAvailability();
margaretha698d9532017-06-27 10:53:27 +0200121 if (licensePattern!=null && availability != null){
margarethaa76ed242017-05-24 17:48:22 +0200122 Matcher m = licensePattern.matcher(availability);
123 if (!m.matches()){
margarethaaec93f72017-05-29 16:51:41 +0200124 km = new Match();
margarethaa76ed242017-05-24 17:48:22 +0200125 km.addError(StatusCodes.ACCESS_DENIED,
margaretha65ca5fb2017-06-29 15:01:57 +0200126 "Retrieving match info with ID "+id+" is not allowed.", id);
margarethaa76ed242017-05-24 17:48:22 +0200127 }
128 }
Michael Hanl19390652016-01-16 11:01:24 +0100129 }
130 catch (QueryException qe) {
margarethaa76ed242017-05-24 17:48:22 +0200131 km = new Match();
Akronb99b70c2015-06-19 20:14:28 +0200132 km.addError(qe.getErrorCode(), qe.getMessage());
Michael Hanlca740d72015-06-16 10:04:58 +0200133 }
margarethaa76ed242017-05-24 17:48:22 +0200134 }
135 else{
136 km = new Match();
137 km.addError(601, "Unable to find index");
138 }
Michael Hanlca740d72015-06-16 10:04:58 +0200139 return km.toJsonString();
Akron78e2d202016-10-13 14:17:11 +0200140 };
Michael Hanl8abaf9e2016-05-23 16:46:35 +0200141
142
143 public String getMatch (String id, List<String> foundries,
Michael Hanlca740d72015-06-16 10:04:58 +0200144 List<String> layers, boolean includeSpans,
margarethaa76ed242017-05-24 17:48:22 +0200145 boolean includeHighlights, boolean sentenceExpansion,
146 Pattern licensePattern) {
147 Match km;
Michael Hanlca740d72015-06-16 10:04:58 +0200148 if (this.index != null) {
149 try {
margarethaa76ed242017-05-24 17:48:22 +0200150 km = this.index.getMatchInfo(id, "tokens", true, foundries,
Michael Hanl8abaf9e2016-05-23 16:46:35 +0200151 layers, includeSpans, includeHighlights,
margarethaa76ed242017-05-24 17:48:22 +0200152 sentenceExpansion);
153 String availability = km.getAvailability();
margaretha698d9532017-06-27 10:53:27 +0200154 if (licensePattern !=null && availability != null){
margarethaa76ed242017-05-24 17:48:22 +0200155 Matcher m = licensePattern.matcher(availability);
156 if (!m.matches()){
margaretha65b67142017-05-29 16:23:16 +0200157 km = new Match();
margarethaa76ed242017-05-24 17:48:22 +0200158 km.addError(StatusCodes.ACCESS_DENIED,
margaretha65ca5fb2017-06-29 15:01:57 +0200159 "Retrieving match info with ID "+id+" is not allowed.", id);
margarethaa76ed242017-05-24 17:48:22 +0200160 }
161 }
162
Michael Hanl8abaf9e2016-05-23 16:46:35 +0200163 }
164 catch (QueryException qe) {
margarethaa76ed242017-05-24 17:48:22 +0200165 km = new Match();
Akronb99b70c2015-06-19 20:14:28 +0200166 km.addError(qe.getErrorCode(), qe.getMessage());
Michael Hanlca740d72015-06-16 10:04:58 +0200167 }
margarethaa76ed242017-05-24 17:48:22 +0200168 }
169 else{
170 km = new Match();
171 km.addError(601, "Unable to find index");
172 }
Michael Hanlca740d72015-06-16 10:04:58 +0200173 return km.toJsonString();
Akron78e2d202016-10-13 14:17:11 +0200174 };
Michael Hanl8abaf9e2016-05-23 16:46:35 +0200175
176
Michael Hanlca740d72015-06-16 10:04:58 +0200177 /**
Michael Hanl8abaf9e2016-05-23 16:46:35 +0200178 * Get info on a match - by means of a richly annotated html
179 * snippet.
180 *
181 * @param id
182 * match id
183 * @param foundry
184 * the foundry of interest - may be null
185 * @param layer
186 * the layer of interest - may be null
187 * @param includeSpans
188 * Should spans be included (or only token infos)?
189 * @param includeHighlights
190 * Should highlight markup be included?
Michael Hanlca740d72015-06-16 10:04:58 +0200191 */
Michael Hanl8abaf9e2016-05-23 16:46:35 +0200192 public String getMatch (String id, String foundry, String layer,
Michael Hanlca740d72015-06-16 10:04:58 +0200193 boolean includeSpans, boolean includeHighlights,
194 boolean sentenceExpansion) {
195
196 if (this.index != null) {
197 try {
Michael Hanl19390652016-01-16 11:01:24 +0100198 /*
199 For multiple foundries/layers use
200 String idString,
201 "tokens",
202 true,
203 ArrayList<String> foundry,
204 ArrayList<String> layer,
205 boolean includeSpans,
206 boolean includeHighlights,
207 boolean extendToSentence
208 */
Michael Hanlca740d72015-06-16 10:04:58 +0200209 return this.index.getMatchInfo(id, "tokens", foundry, layer,
Michael Hanl8abaf9e2016-05-23 16:46:35 +0200210 includeSpans, includeHighlights, sentenceExpansion)
Michael Hanlca740d72015-06-16 10:04:58 +0200211 .toJsonString();
Michael Hanl19390652016-01-16 11:01:24 +0100212 }
213 catch (QueryException qe) {
Michael Hanlca740d72015-06-16 10:04:58 +0200214 Match km = new Match();
Akronb99b70c2015-06-19 20:14:28 +0200215 km.addError(qe.getErrorCode(), qe.getMessage());
Michael Hanlca740d72015-06-16 10:04:58 +0200216 return km.toJsonString();
217 }
Michael Hanl19390652016-01-16 11:01:24 +0100218 };
Michael Hanlca740d72015-06-16 10:04:58 +0200219 Match km = new Match();
Akronb99b70c2015-06-19 20:14:28 +0200220 km.addError(601, "Unable to find index");
Michael Hanlca740d72015-06-16 10:04:58 +0200221 return km.toJsonString();
Akron78e2d202016-10-13 14:17:11 +0200222 };
Michael Hanlca740d72015-06-16 10:04:58 +0200223
Michael Hanlca740d72015-06-16 10:04:58 +0200224
225 /**
226 * Get statistics on (virtual) collections.
Michael Hanl8abaf9e2016-05-23 16:46:35 +0200227 *
228 * @param json
229 * JSON-LD string with potential meta filters.
Michael Hanlca740d72015-06-16 10:04:58 +0200230 */
Akronb99b70c2015-06-19 20:14:28 +0200231 @Deprecated
Michael Hanl8abaf9e2016-05-23 16:46:35 +0200232 public String getStatistics (String json) {
margaretha61471cc2017-04-20 18:42:23 +0200233 jlog.trace(json);
Michael Hanlca740d72015-06-16 10:04:58 +0200234 if (this.index == null) {
235 return "{\"documents\" : -1, error\" : \"No index given\" }";
Michael Hanl19390652016-01-16 11:01:24 +0100236 };
Michael Hanlca740d72015-06-16 10:04:58 +0200237 // Create Virtual collection from json search
238 KrillCollection kc = new KrillCollection(json);
Michael Hanlca740d72015-06-16 10:04:58 +0200239 // Set index
240 kc.setIndex(this.index);
Michael Hanl8abaf9e2016-05-23 16:46:35 +0200241 long docs = 0, tokens = 0, sentences = 0, paragraphs = 0;
Michael Hanlca740d72015-06-16 10:04:58 +0200242 // Get numbers from index (currently slow)
243 try {
244 docs = kc.numberOf("documents");
Akrona3afa7d2017-07-04 16:13:22 +0200245 if (docs > 0) {
246 tokens = kc.numberOf("tokens");
247 sentences = kc.numberOf("base/sentences");
248 paragraphs = kc.numberOf("base/paragraphs");
249 };
Michael Hanlca740d72015-06-16 10:04:58 +0200250 }
Michael Hanl19390652016-01-16 11:01:24 +0100251 catch (IOException e) {
252 e.printStackTrace();
253 };
Michael Hanlca740d72015-06-16 10:04:58 +0200254 // Build json response
255 StringBuilder sb = new StringBuilder("{");
256 sb.append("\"documents\":").append(docs).append(",\"tokens\":")
257 .append(tokens).append(",\"sentences\":").append(sentences)
258 .append(",\"paragraphs\":").append(paragraphs).append("}");
259 return sb.toString();
Akron78e2d202016-10-13 14:17:11 +0200260 };
Michael Hanl8abaf9e2016-05-23 16:46:35 +0200261
margaretha61471cc2017-04-20 18:42:23 +0200262
Akron78e2d202016-10-13 14:17:11 +0200263 /**
margaretha61471cc2017-04-20 18:42:23 +0200264 * Return the match identifier as a string.
265 * This is a convenient method to deal with legacy instantiation
266 * of the
267 * code.
268 */
269 public String getMatchId (String corpusID, String docID, String textID,
270 String matchID) {
271 // Create a string representation of the match
272 StringBuilder sb = new StringBuilder();
273 sb.append("match-").append(corpusID).append('/').append(docID)
274 .append('/').append(textID).append('-').append(matchID);
275 return sb.toString();
Akron78e2d202016-10-13 14:17:11 +0200276 };
277};