blob: e4de9dce67a411c96ac7e7947f6b1d3e59a5ed38 [file] [log] [blame]
// Connector to the Lucene Backend
package de.ids_mannheim.korap.web;
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.fasterxml.jackson.databind.JsonNode;
import de.ids_mannheim.korap.Krill;
import de.ids_mannheim.korap.KrillCollection;
import de.ids_mannheim.korap.KrillIndex;
import de.ids_mannheim.korap.exceptions.KustvaktException;
import de.ids_mannheim.korap.exceptions.StatusCodes;
import de.ids_mannheim.korap.response.Match;
import de.ids_mannheim.korap.response.MetaFields;
import de.ids_mannheim.korap.response.Result;
import de.ids_mannheim.korap.util.QueryException;
/**
 * The SearchKrill class allows for searching in the
 * Lucene based Krill backend by applying KoralQuery.
 *
 * <p>
 * The index is kept in the public static field {@link #index}, which is
 * assigned by the constructor. All instances therefore read the same
 * index reference. Most methods answer with an error document
 * (code 601) instead of throwing when no index has been set.
 *
 * @author Nils Diewald
 */
public class SearchKrill {

    private final static Logger jlog = LogManager.getLogger(SearchKrill.class);

    private static final boolean DEBUG = false;

    // Error reported by all methods when no index has been loaded
    private static final int NO_INDEX_CODE = 601;
    private static final String NO_INDEX_MESSAGE = "Unable to find index";

    // Shared by all instances; assigned in the constructor.
    public static KrillIndex index;


    /**
     * Constructor
     *
     * Opens the index at the given location and stores it in
     * {@link #index}. The special value {@code ":temp:"} creates a
     * {@link KrillIndex} with its no-argument constructor instead of
     * opening a directory.
     *
     * If the path does not exist, the error is logged and the JVM is
     * terminated with exit code -1. An {@link IOException} while
     * opening the index is logged only; {@link #index} is not assigned
     * in that case.
     *
     * @param path
     *            location of the index directory, or {@code ":temp:"}
     */
    // todo: use korap.config to get index location
    public SearchKrill (String path) {
        try {
            if (path.equals(":temp:")) {
                index = new KrillIndex();
            }
            else {
                File f = new File(path);
                jlog.info("Loading index from " + path);
                if (!f.exists()) {
                    jlog.error("Index not found: " + path + "!");
                    System.exit(-1);
                }
                index = new KrillIndex(Paths.get(path));
            }
        }
        catch (IOException e) {
            // Pass the exception itself so the stack trace is not lost
            jlog.error("Unable to load index: " + e.getMessage(), e);
        }
    }


    /**
     * Return the index this class searches in.
     *
     * @return the shared index, or {@code null} if none has been set
     */
    public KrillIndex getIndex () {
        return index;
    }


    /**
     * Close the reader of the index. Does nothing if no index has
     * been set.
     *
     * @throws KustvaktException
     *             with code 500 if closing the reader fails
     */
    public void closeIndexReader () throws KustvaktException {
        if (index == null) {
            return;
        }
        try {
            index.closeReader();
        }
        catch (IOException e) {
            // The thrown exception carries no cause, so keep the
            // original failure in the log.
            jlog.error("Failed closing index reader", e);
            throw new KustvaktException(500, "Failed closing index reader");
        }
    }


    /**
     * Search in the Lucene index.
     *
     * @param json
     *            JSON-LD string with search and potential meta
     *            filters.
     * @return the search result as a JSON string, or a result
     *         carrying error 601 if no index has been set
     */
    public String search (String json) {
        if (DEBUG) {
            jlog.debug(json);
        }
        if (index == null) {
            return noIndexResult();
        }
        String result = new Krill(json).apply(index).toJsonString();
        if (DEBUG) {
            jlog.debug(result);
        }
        return result;
    }


    /**
     * Search in the Lucene index and return matches as token lists.
     *
     * @param json
     *            JSON-LD string with search and potential meta
     *            filters.
     * @return the search result as a token list JSON string, or a
     *         result carrying error 601 if no index has been set
     */
    @Deprecated
    public String searchTokenList (String json) {
        if (DEBUG) {
            jlog.debug(json);
        }
        if (index == null) {
            return noIndexResult();
        }
        return new Krill(json).apply(index).toTokenListJsonString();
    }


    /**
     * Get info on a match - by means of a richly annotated html
     * snippet.
     *
     * @param id
     *            match id
     * @param licensePattern
     *            pattern the availability of the match has to fully
     *            match; {@code null} disables the check
     * @return the match as a JSON string; query errors and a missing
     *         index are reported inside that JSON
     * @throws KustvaktException
     *             if the availability of the match does not match
     *             the license pattern
     */
    public String getMatch (String id, Pattern licensePattern)
            throws KustvaktException {
        if (index == null) {
            return noIndexMatch().toJsonString();
        }
        Match km;
        try {
            km = index.getMatch(id);
            checkAvailability(licensePattern, km.getAvailability(), id);
        }
        catch (QueryException qe) {
            km = errorMatch(qe);
        }
        return km.toJsonString();
    }


    /**
     * Verify that the given availability is covered by the license
     * pattern. The check is skipped if either argument is
     * {@code null}.
     *
     * @param licensePattern
     *            pattern the availability has to fully match
     * @param availability
     *            availability value of the requested resource
     * @param id
     *            id of the requested resource, used in error messages
     * @throws KustvaktException
     *             {@code MISSING_ATTRIBUTE} if the availability is
     *             empty and does not match,
     *             {@code AUTHORIZATION_FAILED} if a non-empty
     *             availability does not match
     */
    private void checkAvailability (Pattern licensePattern, String availability,
            String id) throws KustvaktException {
        if (DEBUG) {
            // Plain concatenation is null-safe, unlike calling
            // toString() on a pattern that may be null.
            jlog.debug("pattern: " + licensePattern + ", availability: "
                    + availability);
        }
        if (licensePattern == null || availability == null) {
            return;
        }
        Matcher m = licensePattern.matcher(availability);
        if (m.matches()) {
            return;
        }
        if (availability.isEmpty()) {
            throw new KustvaktException(StatusCodes.MISSING_ATTRIBUTE,
                    "Availability for " + id + " is empty.", id);
        }
        throw new KustvaktException(StatusCodes.AUTHORIZATION_FAILED,
                "Retrieving resource with ID " + id + " is not allowed.", id);
    }


    /**
     * Retrieve the meta fields for a certain document.
     *
     * @param id
     *            id passed to the index to look up the fields
     * @param fields
     *            fields to retrieve; if {@code null} the index is
     *            asked without a field list
     * @param licensePattern
     *            currently unused, see the note in the method body
     * @return the meta fields as a JSON string, carrying error 601 if
     *         no index has been set
     * @throws KustvaktException
     *             declared for callers; not thrown by the current
     *             implementation
     */
    public String getFields (String id, List<String> fields,
            Pattern licensePattern) throws KustvaktException {
        MetaFields meta;

        // No index found
        if (index == null) {
            meta = new MetaFields(id);
            meta.addError(NO_INDEX_CODE, NO_INDEX_MESSAGE);
        }
        // Index available, no field restriction
        else if (fields == null) {
            meta = index.getFields(id);
        }
        // Index available, restricted to the requested fields
        else {
            meta = index.getFields(id, fields);
        }

        // EM: this approach forbids the whole metadata
        // this should be refined by filtering out only the restricted
        // metadata fields
        // String availability = meta.getFieldValue("availability");
        // checkAvailability(licensePattern, availability, id);
        return meta.toJsonString();
    }


    /**
     * Get info on a match for multiple foundries and layers.
     *
     * All arguments from {@code info} to {@code sentenceExpansion}
     * are forwarded unchanged to the index.
     *
     * @param id
     *            match id
     * @param info
     *            forwarded to the index
     * @param foundries
     *            the foundries of interest
     * @param layers
     *            the layers of interest
     * @param includeSpans
     *            forwarded to the index
     * @param includeSnippet
     *            forwarded to the index
     * @param includeTokens
     *            forwarded to the index
     * @param includeHighlights
     *            forwarded to the index
     * @param sentenceExpansion
     *            forwarded to the index
     * @param licensePattern
     *            pattern the availability of the match has to fully
     *            match; {@code null} disables the check
     * @param isDeprecated
     *            if {@code true}, a deprecation warning is added to
     *            the response
     * @return the match as a JSON string; query errors and a missing
     *         index are reported inside that JSON
     * @throws KustvaktException
     *             if the availability of the match does not match
     *             the license pattern
     */
    public String getMatch (String id, boolean info, List<String> foundries,
            List<String> layers, boolean includeSpans, boolean includeSnippet,
            boolean includeTokens, boolean includeHighlights,
            boolean sentenceExpansion, Pattern licensePattern,
            boolean isDeprecated)
            throws KustvaktException {
        Match km;
        if (index == null) {
            km = noIndexMatch();
        }
        else {
            try {
                km = index.getMatchInfo(id, "tokens", info, foundries, layers,
                        includeSpans, includeSnippet, includeTokens,
                        includeHighlights, sentenceExpansion);
                checkAvailability(licensePattern, km.getAvailability(), id);
            }
            catch (QueryException qe) {
                km = errorMatch(qe);
            }
        }

        // The warning is added to error responses as well
        if (isDeprecated) {
            km.addWarning(StatusCodes.DEPRECATED,
                    "This service is deprecated. Please use the following service"
                            + " URL instead: {version}/corpus/{corpusId}/{docId}/"
                            + "{textId}/{matchId}");
        }
        return km.toJsonString();
    }


    /**
     * Get info on a match - by means of a richly annotated html
     * snippet.
     *
     * This variant accepts a single foundry and layer and performs
     * no availability check.
     *
     * @param id
     *            match id
     * @param foundry
     *            the foundry of interest - may be null
     * @param layer
     *            the layer of interest - may be null
     * @param includeSpans
     *            Should spans be included (or only token infos)?
     * @param includeHighlights
     *            Should highlight markup be included?
     * @param sentenceExpansion
     *            forwarded to the index
     * @return the match as a JSON string; query errors and a missing
     *         index are reported inside that JSON
     */
    public String getMatch (String id, String foundry, String layer,
            boolean includeSpans, boolean includeHighlights,
            boolean sentenceExpansion) {
        if (index == null) {
            return noIndexMatch().toJsonString();
        }
        try {
            /*
              For multiple foundries/layers use
              String idString,
              "tokens",
              true,
              ArrayList<String> foundry,
              ArrayList<String> layer,
              boolean includeSpans,
              boolean includeHighlights,
              boolean extendToSentence
            */
            return index.getMatchInfo(id, "tokens", foundry, layer,
                    includeSpans, includeHighlights, sentenceExpansion)
                    .toJsonString();
        }
        catch (QueryException qe) {
            return errorMatch(qe).toJsonString();
        }
    }


    /**
     * Get statistics on (virtual) collections.
     *
     * EM: might be changed later
     *
     * @param json
     *            JSON-LD string with potential meta filters. If
     *            {@code null} or empty, a collection without filters
     *            is used.
     * @return a JSON object with the number of documents, tokens,
     *         sentences and paragraphs; if no index has been set, a
     *         JSON object with {@code documents} -1 and an
     *         {@code error} message
     * @throws KustvaktException
     *             if the collection reports errors
     */
    public String getStatistics (String json) throws KustvaktException {
        if (index == null) {
            return "{\"documents\" : -1, \"error\" : \"No index given\" }";
        }

        // Define a virtual corpus
        KrillCollection kc;
        if (json == null || json.isEmpty()) {
            // There is no json string defined:
            // Create Virtual collection of everything
            kc = new KrillCollection();
        }
        else {
            if (DEBUG) {
                jlog.debug(json);
            }
            // Create Virtual collection from json search
            kc = new KrillCollection(json);
        }

        // Set index
        kc.setIndex(index);

        long docs = 0, tokens = 0, sentences = 0, paragraphs = 0;

        // Get numbers from index (currently slow)
        try {
            docs = kc.numberOf("documents");
            // The remaining counts are only requested for
            // non-empty collections
            if (docs > 0) {
                tokens = kc.numberOf("tokens");
                sentences = kc.numberOf("base/sentences");
                paragraphs = kc.numberOf("base/paragraphs");
            }
        }
        catch (IOException e) {
            // Best effort: counts not retrieved so far stay 0
            jlog.error("Unable to retrieve statistics from index", e);
        }

        if (kc.hasErrors()) {
            throw new KustvaktException(
                    "{\"errors\":" + kc.getErrors().toJsonString() + "}");
        }

        // Build json response
        return "{\"documents\":" + docs + ",\"tokens\":" + tokens
                + ",\"sentences\":" + sentences + ",\"paragraphs\":"
                + paragraphs + "}";
    }


    /**
     * Return the match identifier as a string.
     * This is a convenient method to deal with legacy instantiation
     * of the code.
     *
     * @return {@code match-<corpusID>/<docID>/<textID>-<matchID>}
     */
    public String getMatchId (String corpusID, String docID, String textID,
            String matchID) {
        return "match-" + corpusID + '/' + docID + '/' + textID + '-'
                + matchID;
    }


    /**
     * Return the text sigle as a string.
     *
     * @return {@code <corpusID>/<docID>/<textID>}
     */
    public String getTextSigle (String corpusID, String docID, String textID) {
        return corpusID + '/' + docID + '/' + textID;
    }


    /**
     * Return the fingerprint of the latest index revision.
     *
     * @return the fingerprint reported by the index, or the string
     *         {@code "null"} if no index has been set
     */
    public String getIndexFingerprint () {
        if (index == null) {
            return "null";
        }
        return index.getFingerprint();
    }


    /**
     * Retrieve the values of a field for a virtual corpus.
     *
     * NOTE(review): unlike the other methods, the index is passed on
     * without a null check - confirm how Krill handles a missing
     * index.
     *
     * @param koralQuery
     *            query passed to Krill
     * @param fieldName
     *            name of the field whose values are requested
     * @return the response of {@code Krill.retrieveFieldValues}
     */
    public JsonNode getFieldValuesForVC (String koralQuery, String fieldName) {
        return new Krill().retrieveFieldValues(koralQuery, index, fieldName);
    }


    // Serialized search result reporting that no index is available
    private static String noIndexResult () {
        Result kr = new Result();
        kr.addError(NO_INDEX_CODE, NO_INDEX_MESSAGE);
        return kr.toJsonString();
    }


    // Empty match reporting that no index is available
    private static Match noIndexMatch () {
        Match km = new Match();
        km.addError(NO_INDEX_CODE, NO_INDEX_MESSAGE);
        return km;
    }


    // Empty match carrying the code and message of a failed query
    private static Match errorMatch (QueryException qe) {
        Match km = new Match();
        km.addError(qe.getErrorCode(), qe.getMessage());
        return km;
    }
}