| package de.ids_mannheim.korap; |
| import java.util.*; |
| import java.io.*; |
| |
| import java.lang.StringBuffer; |
| import java.nio.ByteBuffer; |
| |
| import com.fasterxml.jackson.annotation.*; |
| import com.fasterxml.jackson.annotation.JsonInclude.Include; |
| import com.fasterxml.jackson.databind.ObjectMapper; |
| import com.fasterxml.jackson.databind.JsonNode; |
| import com.fasterxml.jackson.databind.node.*; |
| |
| import de.ids_mannheim.korap.index.PositionsToOffset; |
| import de.ids_mannheim.korap.index.SearchContext; |
| import de.ids_mannheim.korap.document.KorapPrimaryData; |
| |
| import static de.ids_mannheim.korap.util.KorapHTML.*; |
| import de.ids_mannheim.korap.index.MatchIdentifier; |
| import de.ids_mannheim.korap.index.PosIdentifier; |
| import de.ids_mannheim.korap.query.SpanElementQuery; |
| |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import org.apache.lucene.index.AtomicReaderContext; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.index.TermContext; |
| import org.apache.lucene.util.FixedBitSet; |
| import org.apache.lucene.util.Bits; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.search.spans.Spans; |
| |
| /* |
| Todo: The implemented classes and private names are horrible! |
| Refactor, future-me! |
| |
| The number based Highlighttype is ugly - UGLY! |
| */ |
| |
| /** |
| * Representation of Matches in a KorapResult. |
| * |
| * @author Nils Diewald |
| * @see KorapResult |
| */ |
| @JsonInclude(Include.NON_NULL) |
| public class KorapMatch extends KorapDocument { |
| |
| // Logger |
| private final static Logger log = LoggerFactory.getLogger(KorapMatch.class); |
| |
| // This advices the java compiler to ignore all loggings |
| public static final boolean DEBUG = false; |
| |
| // Mapper for JSON serialization |
| ObjectMapper mapper = new ObjectMapper(); |
| |
| // Snippet information |
| @JsonIgnore |
| public SearchContext context; |
| |
| // Should be deprecated, but used wildly in tests! |
| @JsonIgnore |
| public int startPos, endPos; |
| |
| @JsonIgnore |
| public int potentialStartPosChar = -1, |
| potentialEndPosChar = -1; |
| |
| private String error = null; |
| private String version; |
| |
| // TEMPORARILY |
| @JsonIgnore |
| public int localDocID = -1; |
| |
| HashMap<Integer, String> annotationNumber = new HashMap<>(16); |
| HashMap<Integer, Relation> relationNumber = new HashMap<>(16); |
| HashMap<Integer, Integer> identifierNumber = new HashMap<>(16); |
| |
| // -1 is match highlight |
| int annotationNumberCounter = 256; |
| int relationNumberCounter = 2048; |
| int identifierNumberCounter = -2; |
| |
| private String tempSnippet, |
| snippetHTML, |
| snippetBrackets, |
| identifier; |
| |
| private HighlightCombinator snippetStack; |
| |
| public boolean startMore = true, |
| endMore = true; |
| |
| private Collection<byte[]> payload; |
| private ArrayList<Highlight> highlight; |
| private LinkedList<int[]> span; |
| |
| private PositionsToOffset positionsToOffset; |
| private boolean processed = false; |
| |
/**
 * Creates a match that is bound to a document and a positional range.
 * Todo: Maybe that's not necessary!
 *
 * @param pto The PositionsToOffset object, containing relevant
 *            positional information for highlighting.
 * @param localDocID Document ID based on the atomic reader.
 * @param startPos Start position of the match in the document.
 * @param endPos End position of the match in the document.
 *
 * @see #snippetHTML()
 * @see #snippetBrackets()
 * @see PositionsToOffset
 */
public KorapMatch (PositionsToOffset pto, int localDocID, int startPos, int endPos) {
    this.startPos = startPos;
    this.endPos = endPos;
    this.localDocID = localDocID;
    this.positionsToOffset = pto;
}
| |
| |
| /** |
| * Constructs a new KorapMatch object. |
| */ |
| public KorapMatch () {}; |
| |
| |
| /** |
| * Constructs a new KorapMatch object. |
| * |
| * @param idString Match identifier string as provided by KorapResult. |
| * @param includeHighlights Boolean value indicating if possible provided |
| * highlight information should be ignored or not. |
| */ |
| public KorapMatch (String idString, boolean includeHighlights) { |
| MatchIdentifier id = new MatchIdentifier(idString); |
| this.setCorpusID(id.getCorpusID()); |
| this.setDocID(id.getDocID()); |
| this.setStartPos(id.getStartPos()); |
| this.setEndPos(id.getEndPos()); |
| |
| if (includeHighlights) |
| for (int[] pos : id.getPos()) { |
| if (pos[0] < id.getStartPos() || pos[1] > id.getEndPos()) |
| continue; |
| |
| this.addHighlight(pos[0], pos[1], pos[2]); |
| }; |
| }; |
| |
| |
| /** |
| * Private class of highlights. |
| */ |
| private class Highlight { |
| public int start, end; |
| public int number = -1; |
| |
| // Relational highlight |
| public Highlight (int start, int end, String annotation, int ref) { |
| this.start = start; |
| this.end = end; |
| // TODO: This can overflow! |
| this.number = relationNumberCounter++; |
| relationNumber.put(this.number, new Relation(annotation, ref)); |
| }; |
| |
| // Span highlight |
| public Highlight (int start, int end, String annotation) { |
| this.start = start; |
| this.end = end; |
| // TODO: This can overflow! |
| if (annotationNumberCounter < 2048) { |
| this.number = annotationNumberCounter++; |
| annotationNumber.put(this.number, annotation); |
| }; |
| }; |
| |
| // Simple highlight |
| public Highlight (int start, int end, int number) { |
| this.start = start; |
| this.end = end; |
| this.number = number; |
| }; |
| }; |
| |
| |
| /** |
| * Private class of relations. |
| */ |
| private class Relation { |
| public int ref; |
| public String annotation; |
| public Relation (String annotation, int ref) { |
| this.annotation = annotation; |
| this.ref = ref; |
| }; |
| }; |
| |
| |
| /** |
| * Insert a highlight for the snippet view by means of positional |
| * offsets and an optional class number. |
| * |
| * @param start Integer value of a span's positional start offset. |
| * @param end Integer value of a span's positional end offset. |
| * @param number Optional class number of the highlight. |
| */ |
| public void addHighlight (int start, int end) { |
| this.addHighlight(new Highlight(start, end, (int) 0)); |
| }; |
| |
| public void addHighlight (int start, int end, byte number) { |
| this.addHighlight(new Highlight(start, end, (int) number)); |
| }; |
| |
| public void addHighlight (int start, int end, short number) { |
| this.addHighlight(new Highlight(start, end, (int) number)); |
| }; |
| |
| public void addHighlight (int start, int end, int number) { |
| this.addHighlight(new Highlight(start, end, number)); |
| }; |
| |
| |
| /** |
| * Insert a highlight for the snippet view. |
| * |
| * @param hl A highlight object to add to the match. |
| */ |
| public void addHighlight (Highlight hl) { |
| |
| if (this.highlight == null) |
| this.highlight = new ArrayList<Highlight>(16); |
| |
| if (DEBUG) |
| log.trace("Add highlight from pos {}-{} of class {}", |
| hl.start, hl.end, hl.number); |
| |
| // Reset the fetched match data |
| this._reset(); |
| |
| this.highlight.add(hl); |
| }; |
| |
| |
| /** |
| * Insert a textual annotation for the snippet view by |
| * means of positional offsets and an annotation string. |
| * |
| * @param start Integer value of a span's positional start offset. |
| * @param end Integer value of a span's positional end offset. |
| * @param annotation Annotation string. |
| */ |
| public void addAnnotation (int start, int end, String annotation) { |
| this.addHighlight(new Highlight(start, end, annotation)); |
| }; |
| |
| |
| /** |
| * Insert an annotated relation for the snippet view by |
| * means of relational participant positions and an annotation string. |
| * |
| * @param src Integer value of a span's positional source object. |
| * @param target Integer value of a span's positional target object. |
| * @param annotation Annotation string. |
| */ |
| public void addRelation (int src, int target, String annotation) { |
| this.addHighlight(new Highlight(src, src, annotation, target)); |
| int id = identifierNumberCounter--; |
| identifierNumber.put(id, target); |
| this.addHighlight(new Highlight(target, target, id)); |
| }; |
| |
| |
| /** |
| * Populate document meta information with information coming from the index. |
| * |
| * @param doc Document object. |
| * @param field Primary data field. |
| * @param fields Hash object with all supported fields. |
| */ |
| public void populateDocument (Document doc, String field, HashSet<String> fields) { |
| this.setField(field); |
| this.setPrimaryData( new KorapPrimaryData(doc.get(field)) ); |
| if (fields.contains("corpusID")) |
| this.setCorpusID(doc.get("corpusID")); |
| if (fields.contains("ID")) |
| this.setDocID(doc.get("ID")); |
| if (fields.contains("author")) |
| this.setAuthor(doc.get("author")); |
| if (fields.contains("textClass")) |
| this.setTextClass(doc.get("textClass")); |
| if (fields.contains("title")) |
| this.setTitle(doc.get("title")); |
| if (fields.contains("subTitle")) |
| this.setSubTitle(doc.get("subTitle")); |
| if (fields.contains("pubDate")) |
| this.setPubDate(doc.get("pubDate")); |
| if (fields.contains("pubPlace")) |
| this.setPubPlace(doc.get("pubPlace")); |
| |
| // Temporary (later meta fields in term vector) |
| if (fields.contains("foundries")) |
| this.setFoundries(doc.get("foundries")); |
| if (fields.contains("tokenization")) |
| this.setTokenization(doc.get("tokenization")); |
| if (fields.contains("layerInfo")) |
| this.setLayerInfo(doc.get("layerInfo")); |
| }; |
| |
| |
| /** |
| * Get document id. |
| */ |
| @JsonProperty("docID") |
| public String getDocID () { |
| return super.getID(); |
| }; |
| |
| |
| /** |
| * Set document id. |
| * |
| * @param id String representation of document ID. |
| */ |
| public void setDocID (String id) { |
| super.setID(id); |
| }; |
| |
| |
| /** |
| * Set version of the index |
| */ |
| @JsonIgnore |
| public String getVersion () { |
| if (this.version == null) |
| return null; |
| StringBuilder sb = new StringBuilder("lucene-backend-"); |
| return sb.append(this.version).toString(); |
| }; |
| |
| |
| /** |
| * Set version number. |
| * |
| * @param version The version number of the index as |
| * a string representation. |
| */ |
| @JsonIgnore |
| public void setVersion (String version) { |
| this.version = version; |
| }; |
| |
| |
| /** |
| * Get the positional start offset of the match. |
| */ |
| @JsonIgnore |
| public int getStartPos() { |
| return this.startPos; |
| }; |
| |
| |
| /** |
| * Set the positional start offset of the match. |
| * |
| * @param pos The positional offset. |
| */ |
| @JsonIgnore |
| public void setStartPos(int pos) { |
| this.startPos = pos; |
| }; |
| |
| |
| /** |
| * Get the positional end offset of the match. |
| */ |
| @JsonIgnore |
| public int getEndPos() { |
| return this.endPos; |
| }; |
| |
| |
| /** |
| * Set the positional end offset of the match. |
| * |
| * @param pos The positional offset. |
| */ |
| @JsonIgnore |
| public void setEndPos(int pos) { |
| this.endPos = pos; |
| }; |
| |
| |
| /** |
| * Get the local (i.e. Lucene given) ID of the document. |
| */ |
| @JsonIgnore |
| public int getLocalDocID () { |
| return this.localDocID; |
| }; |
| |
| |
| /** |
| * Set the local (i.e. Lucene given) ID of the document. |
| * |
| * @param id The id of the document. |
| */ |
| @JsonIgnore |
| public void setLocalDocID (int id) { |
| this.localDocID = id; |
| }; |
| |
| |
| /** |
| * Get the PositionsToOffset object. |
| * |
| * @see PositionsToOffset |
| */ |
| @JsonIgnore |
| public PositionsToOffset getPositionsToOffset () { |
| return this.positionsToOffset; |
| }; |
| |
| |
| /** |
| * Set the PositionsToOffset object. |
| * |
| * @param pto The PositionsToOffset object |
| * @see PositionsToOffset |
| */ |
| @JsonIgnore |
| public void setPositionsToOffset (PositionsToOffset pto) { |
| this.positionsToOffset = pto; |
| }; |
| |
| |
| /** |
| * Get match ID (for later retrieval). |
| * |
| * @see MatchIdentifier |
| */ |
| @Override |
| @JsonProperty("ID") |
| public String getID () { |
| |
| // Identifier already given |
| if (this.identifier != null) |
| return this.identifier; |
| |
| // No, nada, nix |
| if (this.localDocID == -1) |
| return null; |
| |
| MatchIdentifier id = new MatchIdentifier(); |
| |
| // Get prefix string corpus/doc |
| id.setCorpusID(this.getCorpusID()); |
| id.setDocID(this.getDocID()); |
| id.setStartPos(startPos); |
| id.setEndPos(endPos); |
| |
| // There are highlights to integrate |
| if (this.highlight != null) { |
| for (Highlight h : this.highlight) { |
| if (h.number >= 256) |
| continue; |
| |
| // Add highlight to the snippet |
| id.addPos(h.start, h.end, h.number); |
| }; |
| }; |
| |
| return (this.identifier = id.toString()); |
| }; |
| |
| |
| /** |
| * Get identifier for a specific position. |
| * |
| * @param int Position to get identifier on. |
| */ |
| @JsonIgnore |
| public String getPosID (int pos) { |
| |
| // Identifier already given |
| if (this.identifier != null) |
| return this.identifier; |
| |
| // Nothing here |
| if (this.localDocID == -1) |
| return null; |
| |
| PosIdentifier id = new PosIdentifier(); |
| |
| // Get prefix string corpus/doc |
| id.setCorpusID(this.getCorpusID()); |
| id.setDocID(this.getDocID()); |
| id.setPos(pos); |
| |
| return id.toString(); |
| }; |
| |
| /** |
| * Get possible error message. |
| */ |
| // Identical to KorapResult |
| public String getError () { |
| return this.error; |
| }; |
| |
| /** |
| * Set error message. |
| * |
| * @param msg The error message. |
| */ |
| public void setError (String msg) { |
| this.error = msg; |
| }; |
| |
| |
| public KorapMatch setContext (SearchContext context) { |
| this.context = context; |
| return this; |
| }; |
| |
| @JsonIgnore |
| public SearchContext getContext () { |
| if (this.context == null) |
| this.context = new SearchContext(); |
| return this.context; |
| }; |
| |
| |
| // Expand the context to a span |
| public int[] expandContextToSpan (String element) { |
| |
| // TODO: THE BITS HAVE TO BE SET! |
| |
| if (this.positionsToOffset != null) |
| return this.expandContextToSpan( |
| this.positionsToOffset.getAtomicReader(), |
| (Bits) null, |
| "tokens", |
| element |
| ); |
| return new int[]{0,0,0,0}; |
| }; |
| |
| // Expand the context to a span |
| // THIS IS NOT VERY CLEVER - MAKE IT MORE CLEVER! |
| public int[] expandContextToSpan (AtomicReaderContext atomic, |
| Bits bitset, |
| String field, |
| String element) { |
| |
| try { |
| // Store character offsets in ByteBuffer |
| ByteBuffer bb = ByteBuffer.allocate(8); |
| |
| SpanElementQuery cquery = |
| new SpanElementQuery(field, element); |
| |
| Spans contextSpans = cquery.getSpans( |
| atomic, |
| bitset, |
| new HashMap<Term, TermContext>() |
| ); |
| |
| int newStart = -1, |
| newEnd = -1; |
| int newStartChar = -1, |
| newEndChar = -1; |
| |
| if (DEBUG) |
| log.trace("Extend match to context boundary with {} in {}", |
| cquery.toString(), |
| this.localDocID); |
| |
| while (true) { |
| |
| // Game over |
| if (contextSpans.next() != true) |
| break; |
| |
| if (contextSpans.doc() != this.localDocID) { |
| contextSpans.skipTo(this.localDocID); |
| if (contextSpans.doc() != this.localDocID) |
| break; |
| }; |
| |
| // There's a <context> found -- I'm curious, |
| // if it's closer to the match than everything before |
| if (contextSpans.start() <= this.getStartPos() && |
| contextSpans.end() >= this.getStartPos()) { |
| |
| // Set as newStart |
| newStart = contextSpans.start() > newStart ? |
| contextSpans.start() : newStart; |
| |
| if (DEBUG) |
| log.trace("NewStart is at {}", newStart); |
| |
| // Get character offset (start) |
| if (contextSpans.isPayloadAvailable()) { |
| try { |
| bb.rewind(); |
| for (byte[] b : contextSpans.getPayload()) { |
| |
| // Not an element span |
| if (b.length != 8) |
| continue; |
| |
| bb.put(b); |
| bb.rewind(); |
| newStartChar = bb.getInt(); |
| newEndChar = bb.getInt(); |
| break; |
| }; |
| } |
| catch (Exception e) { |
| log.warn(e.getMessage()); |
| }; |
| }; |
| } |
| else { |
| // Has to be resettet to avoid multiple readings of the payload |
| newEndChar = 0; |
| }; |
| |
| // There's an s found, that ends after the match |
| if (contextSpans.end() >= this.getEndPos()) { |
| newEnd = contextSpans.end(); |
| |
| // Get character offset (end) |
| if (newEndChar == 0 && contextSpans.isPayloadAvailable()) { |
| try { |
| bb.rewind(); |
| for (byte[] b : contextSpans.getPayload()) { |
| |
| // Not an element span |
| if (b.length != 8) |
| continue; |
| |
| bb.put(b); |
| bb.rewind(); |
| newEndChar = bb.getInt(1); |
| break; |
| }; |
| } |
| catch (Exception e) { |
| log.warn(e.getMessage()); |
| }; |
| }; |
| break; |
| }; |
| }; |
| |
| // We have a new match surrounding |
| if (DEBUG) |
| log.trace("New match spans from {}-{}/{}-{}", newStart, newEnd, newStartChar, newEndChar); |
| |
| return new int[]{newStart, newEnd, newStartChar, newEndChar}; |
| } |
| catch (IOException e) { |
| log.error(e.getMessage()); |
| }; |
| |
| return new int[]{-1,-1,-1,-1}; |
| }; |
| |
| |
| // Reset all internal data |
| private void _reset () { |
| this.processed = false; |
| this.snippetHTML = null; |
| this.snippetBrackets = null; |
| this.identifier = null; |
| |
| // Delete all spans |
| if (this.span != null) |
| this.span.clear(); |
| }; |
| |
| |
| // Start building highlighted snippets |
| private boolean _processHighlight () { |
| if (processed) |
| return true; |
| |
| // Relevant details are missing |
| if (this.positionsToOffset == null || this.localDocID == -1) { |
| log.warn("You have to define " + |
| "positionsToOffset and localDocID first " + |
| "before"); |
| return false; |
| }; |
| |
| if (DEBUG) |
| log.trace("--- Start highlight processing ..."); |
| |
| // Get pto object |
| PositionsToOffset pto = this.positionsToOffset; |
| pto.add(this.localDocID, this.getStartPos()); |
| pto.add(this.localDocID, this.getEndPos() - 1); |
| |
| if (DEBUG) |
| log.trace("PTO will retrieve {} & {} (Match boundary)", |
| this.getStartPos(), |
| this.getEndPos()); |
| |
| // Add all highlights for character retrieval |
| if (this.highlight != null) { |
| for (Highlight hl : this.highlight) { |
| pto.add(this.localDocID, hl.start); |
| pto.add(this.localDocID, hl.end); |
| |
| if (DEBUG) |
| log.trace("PTO will retrieve {} & {} (Highlight boundary)", |
| hl.start, hl.end); |
| }; |
| }; |
| |
| // Get the list of spans for matches and highlighting |
| if (this.span == null || this.span.size() == 0) { |
| if (!this._processHighlightSpans()) |
| return false; |
| }; |
| |
| // Create a stack for highlighted elements |
| // (opening and closing elements) |
| ArrayList<int[]> stack = this._processHighlightStack(); |
| |
| if (DEBUG) |
| log.trace("The snippet is {}", this.tempSnippet); |
| |
| |
| // The temporary snippet is empty, nothing to do |
| if (this.tempSnippet == null) { |
| processed = true; |
| return false; |
| }; |
| |
| // Merge the element stack with the primary textual data |
| this._processHighlightSnippet(this.tempSnippet, stack); |
| |
| // Match is processed - done |
| return (processed = true); |
| }; |
| |
| |
| /* |
| Comparator class for opening tags |
| */ |
| private class OpeningTagComparator implements Comparator<int[]> { |
| @Override |
| public int compare (int[] arg0, int[] arg1) { |
| // Check start positions |
| if (arg0[0] > arg1[0]) { |
| return 1; |
| } |
| else if (arg0[0] == arg1[0]) { |
| // Check endpositions |
| if (arg0[1] > arg1[1]) |
| return -1; |
| return 1; |
| }; |
| return -1; |
| }; |
| }; |
| |
| /* |
| Comparator class for closing tags |
| */ |
| private class ClosingTagComparator implements Comparator<int[]> { |
| @Override |
| public int compare (int[] arg0, int[] arg1) { |
| // Check end positions |
| if (arg0[1] > arg1[1]) { |
| return 1; |
| } |
| else if (arg0[1] == arg1[1]) { |
| // Check start positions |
| if (arg0[0] < arg1[0]) |
| return 1; |
| return -1; |
| }; |
| return -1; |
| }; |
| }; |
| |
| /* |
| Private class for elements with highlighting information |
| */ |
| private class HighlightCombinatorElement { |
| |
| // Type 0: Textual data |
| // Type 1: Opening |
| // Type 2: Closing |
| private byte type; |
| |
| private int number = 0; |
| |
| private String characters; |
| private boolean terminal = true; |
| |
| // Constructor for highlighting elements |
| public HighlightCombinatorElement (byte type, int number) { |
| this.type = type; |
| this.number = number; |
| }; |
| |
| // Constructor for highlighting elements, |
| // that may not be terminal, i.e. they were closed and will |
| // be reopened for overlapping issues. |
| public HighlightCombinatorElement (byte type, int number, boolean terminal) { |
| this.type = type; |
| this.number = number; |
| this.terminal = terminal; |
| }; |
| |
| // Constructor for textual data |
| public HighlightCombinatorElement (String characters) { |
| this.type = (byte) 0; |
| this.characters = characters; |
| }; |
| |
| // Return html fragment for this combinator element |
| public String toHTML (KorapMatch match, FixedBitSet level, byte[] levelCache) { |
| // Opening |
| if (this.type == 1) { |
| StringBuilder sb = new StringBuilder(); |
| if (this.number == -1) { |
| sb.append("<span class=\"match\">"); |
| } |
| |
| else if (this.number < -1) { |
| sb.append("<span xml:id=\"") |
| .append(match.getPosID( |
| identifierNumber.get(this.number))) |
| .append("\">"); |
| } |
| |
| else if (this.number >= 256) { |
| sb.append("<span "); |
| if (this.number < 2048) { |
| sb.append("title=\"") |
| .append(annotationNumber.get(this.number)) |
| .append('"'); |
| } |
| else { |
| Relation rel = relationNumber.get(this.number); |
| sb.append("xlink:title=\"") |
| .append(rel.annotation) |
| .append('"'); |
| sb.append(" xlink:type=\"simple\""); |
| sb.append(" xlink:href=\"#"); |
| sb.append(match.getPosID(rel.ref)); |
| sb.append('"'); |
| }; |
| sb.append('>'); |
| } |
| else { |
| // Get the first free level slot |
| byte pos; |
| if (levelCache[this.number] != '\0') { |
| pos = levelCache[this.number]; |
| } |
| else { |
| pos = (byte) level.nextSetBit(0); |
| level.clear(pos); |
| levelCache[this.number] = pos; |
| }; |
| sb.append("<em class=\"class-") |
| .append(this.number) |
| .append(" level-") |
| .append(pos) |
| .append("\">"); |
| }; |
| return sb.toString(); |
| } |
| // Closing |
| else if (this.type == 2) { |
| if (this.number <= -1 || this.number >= 256) |
| return "</span>"; |
| |
| if (this.terminal) |
| level.set((int) levelCache[this.number]); |
| return "</em>"; |
| }; |
| |
| // HTML encode primary data |
| return encodeHTML(this.characters); |
| }; |
| |
| // Return bracket fragment for this combinator element |
| public String toBrackets () { |
| if (this.type == 1) { |
| StringBuilder sb = new StringBuilder(); |
| |
| // Match |
| if (this.number == -1) { |
| sb.append("["); |
| } |
| |
| // Identifier |
| else if (this.number < -1) { |
| sb.append("{#"); |
| sb.append(identifierNumber.get(this.number)); |
| sb.append(':'); |
| } |
| |
| // Highlight, Relation, Span |
| else { |
| sb.append("{"); |
| if (this.number >= 256) { |
| if (this.number < 2048) |
| sb.append(annotationNumber.get(this.number)); |
| else { |
| Relation rel = relationNumber.get(this.number); |
| sb.append(rel.annotation); |
| sb.append('>').append(rel.ref); |
| }; |
| sb.append(':'); |
| } |
| else if (this.number != 0) |
| sb.append(this.number).append(':'); |
| }; |
| return sb.toString(); |
| } |
| else if (this.type == 2) { |
| if (this.number == -1) |
| return "]"; |
| return "}"; |
| }; |
| return this.characters; |
| }; |
| }; |
| |
| /* |
| Private class for combining highlighting elements |
| */ |
| private class HighlightCombinator { |
| private LinkedList<HighlightCombinatorElement> combine; |
| private LinkedList<Integer> balanceStack = new LinkedList<>(); |
| private ArrayList<Integer> tempStack = new ArrayList<>(32); |
| |
| // Empty constructor |
| public HighlightCombinator () { |
| this.combine = new LinkedList<>(); |
| }; |
| |
| // Return the combination stack |
| public LinkedList<HighlightCombinatorElement> stack () { |
| return this.combine; |
| }; |
| |
| // get the first element (without removing) |
| public HighlightCombinatorElement getFirst () { |
| return this.combine.getFirst(); |
| }; |
| |
| // get the last element (without removing) |
| public HighlightCombinatorElement getLast () { |
| return this.combine.getLast(); |
| }; |
| |
| // get an element by index (without removing) |
| public HighlightCombinatorElement get (int index) { |
| return this.combine.get(index); |
| }; |
| |
| // Get the size of te combinator stack |
| public short size () { |
| return (short) this.combine.size(); |
| }; |
| |
| // Add primary data to the stack |
| public void addString (String characters) { |
| this.combine.add(new HighlightCombinatorElement(characters)); |
| }; |
| |
| // Add opening highlight combinator to the stack |
| public void addOpen (int number) { |
| this.combine.add(new HighlightCombinatorElement((byte) 1, number)); |
| this.balanceStack.add(number); |
| }; |
| |
| // Add closing highlight combinator to the stack |
| public void addClose (int number) { |
| HighlightCombinatorElement lastComb; |
| this.tempStack.clear(); |
| |
| // Shouldn't happen |
| if (this.balanceStack.size() == 0) { |
| if (DEBUG) |
| log.trace("The balance stack is empty"); |
| return; |
| }; |
| |
| if (DEBUG) { |
| StringBuilder sb = new StringBuilder( |
| "Stack for checking with class " |
| ); |
| sb.append(number).append(" is "); |
| for (int s : this.balanceStack) { |
| sb.append('[').append(s).append(']'); |
| }; |
| log.trace(sb.toString()); |
| }; |
| |
| // class number of the last element |
| int eold = this.balanceStack.removeLast(); |
| |
| // the closing element is not balanced |
| while (eold != number) { |
| |
| // Retrieve last combinator on stack |
| lastComb = this.combine.peekLast(); |
| |
| if (DEBUG) |
| log.trace("Closing element is unbalanced - {} " + |
| "!= {} with lastComb {}|{}|{}", |
| eold, |
| number, |
| lastComb.type, |
| lastComb.number, |
| lastComb.characters); |
| |
| // combinator is opening and the number is not equal to the last |
| // element on the balanceStack |
| if (lastComb.type == 1 && lastComb.number == eold) { |
| |
| // Remove the last element - it's empty and uninteresting! |
| this.combine.removeLast(); |
| } |
| |
| // combinator is either closing (??) or another opener |
| else { |
| |
| if (DEBUG) |
| log.trace("close element a) {}", eold); |
| |
| // Add a closer for the old element (this has following elements) |
| this.combine.add(new HighlightCombinatorElement((byte) 2, eold, false)); |
| }; |
| |
| // add this element number temporarily on the stack |
| tempStack.add(eold); |
| |
| // Check next element |
| eold = this.balanceStack.removeLast(); |
| }; |
| |
| // Get last combinator on the stack |
| lastComb = this.combine.peekLast(); |
| |
| if (DEBUG) { |
| log.trace("LastComb: " + lastComb.type + '|' + lastComb.number + '|' + lastComb.characters + " for " + number); |
| log.trace("Stack for checking 2: {}|{}|{}|{}", lastComb.type, lastComb.number, lastComb.characters, number); |
| }; |
| |
| if (lastComb.type == 1 && lastComb.number == number) { |
| while (lastComb.type == 1 && lastComb.number == number) { |
| // Remove the damn thing - It's empty and uninteresting! |
| this.combine.removeLast(); |
| lastComb = this.combine.peekLast(); |
| }; |
| } |
| else { |
| if (DEBUG) |
| log.trace("close element b) {}", number); |
| |
| // Add a closer |
| this.combine.add(new HighlightCombinatorElement((byte) 2, number)); |
| }; |
| |
| |
| // Fetch everything from the tempstack and reopen it |
| for (int e : tempStack) { |
| if (DEBUG) |
| log.trace("Reopen element {}", e); |
| combine.add(new HighlightCombinatorElement((byte) 1, e)); |
| balanceStack.add(e); |
| }; |
| }; |
| |
| // Get all combined elements as a string |
| public String toString () { |
| StringBuilder sb = new StringBuilder(); |
| for (HighlightCombinatorElement e : combine) { |
| sb.append(e.toString()).append("\n"); |
| }; |
| return sb.toString(); |
| }; |
| }; |
| |
| private void _processHighlightSnippet (String clean, |
| ArrayList<int[]> stack) { |
| |
| if (DEBUG) |
| log.trace("--- Process Highlight snippet"); |
| |
| int pos = 0, |
| oldPos = 0; |
| |
| this.snippetStack = new HighlightCombinator(); |
| |
| for (int[] element : stack) { |
| pos = element[3] != 0 ? element[0] : element[1]; |
| |
| if (pos > oldPos) { |
| |
| if (pos > clean.length()) { |
| pos = clean.length() - 1; |
| }; |
| |
| snippetStack.addString(clean.substring(oldPos, pos)); |
| |
| oldPos = pos; |
| }; |
| |
| if (element[3] != 0) { |
| snippetStack.addOpen(element[2]); |
| } |
| else { |
| snippetStack.addClose(element[2]); |
| }; |
| }; |
| |
| if (clean.length() > pos) { |
| snippetStack.addString(clean.substring(pos)); |
| }; |
| }; |
| |
| @Deprecated |
| public String snippetHTML () { |
| return this.getSnippetHTML(); |
| }; |
| |
| @JsonProperty("snippet") |
| public String getSnippetHTML () { |
| |
| if (!this._processHighlight()) |
| return null; |
| |
| if (this.processed && this.snippetHTML != null) |
| return this.snippetHTML; |
| |
| if (DEBUG) |
| log.trace("Create HTML Snippet"); |
| |
| StringBuilder sb = new StringBuilder(); |
| |
| short start = (short) 0; |
| short end = this.snippetStack.size(); |
| FixedBitSet level = new FixedBitSet(16); |
| level.set(0, 15); |
| byte[] levelCache = new byte[16]; |
| |
| HighlightCombinatorElement elem = this.snippetStack.getFirst(); |
| |
| // Create context |
| sb.append("<span class=\"context-left\">"); |
| if (startMore) |
| sb.append("<span class=\"more\"></span>"); |
| |
| if (elem.type == 0) { |
| sb.append(elem.toHTML(this, level, levelCache)); |
| start++; |
| }; |
| sb.append("</span>"); |
| |
| elem = this.snippetStack.getLast(); |
| |
| StringBuilder rightContext = new StringBuilder(); |
| |
| // Create context, if trhere is any |
| rightContext.append("<span class=\"context-right\">"); |
| if (elem != null && elem.type == 0) { |
| rightContext.append(elem.toHTML(this, level, levelCache)); |
| end--; |
| }; |
| if (endMore) |
| rightContext.append("<span class=\"more\"></span>"); |
| rightContext.append("</span>"); |
| |
| for (short i = start; i < end; i++) { |
| sb.append(this.snippetStack.get(i).toHTML(this, level,levelCache)); |
| }; |
| |
| sb.append(rightContext); |
| |
| return (this.snippetHTML = sb.toString()); |
| }; |
| |
| @Deprecated |
| public String snippetBrackets () { |
| return this.getSnippetBrackets(); |
| }; |
| |
| @JsonIgnore |
| public String getSnippetBrackets () { |
| |
| if (!this._processHighlight()) |
| return null; |
| |
| if (this.processed && this.snippetBrackets != null) |
| return this.snippetBrackets; |
| |
| StringBuilder sb = new StringBuilder(); |
| |
| if (startMore) |
| sb.append("... "); |
| |
| for (HighlightCombinatorElement hce : this.snippetStack.stack()) { |
| sb.append(hce.toBrackets()); |
| }; |
| |
| if (endMore) |
| sb.append(" ..."); |
| |
| return (this.snippetBrackets = sb.toString()); |
| }; |
| |
| |
| // This sorts all highlight and match spans to make them nesting correctly, |
| // even in case they overlap |
| // TODO: Not very fast - improve! |
| private ArrayList<int[]> _processHighlightStack () { |
| if (DEBUG) |
| log.trace("--- Process Highlight stack"); |
| |
| LinkedList<int[]> openList = new LinkedList<int[]>(); |
| LinkedList<int[]> closeList = new LinkedList<int[]>(); |
| |
| // Filter multiple identifiers, that may be introduced and would |
| // result in invalid xml |
| this._filterMultipleIdentifiers(); |
| |
| // Add highlight spans to balance lists |
| openList.addAll(this.span); |
| closeList.addAll(this.span); |
| |
| // Sort balance lists |
| Collections.sort(openList, new OpeningTagComparator()); |
| Collections.sort(closeList, new ClosingTagComparator()); |
| |
| // New stack array |
| ArrayList<int[]> stack = new ArrayList<>(openList.size() * 2); |
| |
| // Create stack unless both lists are empty |
| while (!openList.isEmpty() || !closeList.isEmpty()) { |
| |
| if (openList.isEmpty()) { |
| stack.addAll(closeList); |
| break; |
| } |
| |
| // Not sure about this, but it can happen |
| else if (closeList.isEmpty()) { |
| break; |
| }; |
| |
| if (openList.peekFirst()[0] < closeList.peekFirst()[1]) { |
| int[] e = openList.removeFirst().clone(); |
| e[3] = 1; |
| stack.add(e); |
| } |
| else { |
| stack.add(closeList.removeFirst()); |
| }; |
| }; |
| return stack; |
| }; |
| |
| /** |
| * This will retrieve character offsets for all spans. |
| */ |
| private boolean _processHighlightSpans () { |
| |
| if (DEBUG) |
| log.trace("--- Process Highlight spans"); |
| |
| // Local document ID |
| int ldid = this.localDocID; |
| |
| int startPosChar = -1, endPosChar = -1; |
| |
| // No positionsToOffset object found |
| if (this.positionsToOffset == null) |
| return false; |
| |
| // Match position |
| startPosChar = this.positionsToOffset.start(ldid, this.startPos); |
| |
| if (DEBUG) |
| log.trace("Unaltered startPosChar is {}", startPosChar); |
| |
| // Check potential differing start characters |
| // e.g. from element spans |
| if (potentialStartPosChar != -1 && |
| (startPosChar > this.potentialStartPosChar)) |
| startPosChar = this.potentialStartPosChar; |
| |
| endPosChar = this.positionsToOffset.end(ldid, this.endPos - 1); |
| |
| if (DEBUG) |
| log.trace("Unaltered endPosChar is {}", endPosChar); |
| |
| // Potential end characters may come from spans with |
| // defined character offsets like sentences including .", ... etc. |
| if (endPosChar < potentialEndPosChar) |
| endPosChar = potentialEndPosChar; |
| |
| if (DEBUG) |
| log.trace("Refined: Match offset is pos {}-{} (chars {}-{})", |
| this.startPos, |
| this.endPos, |
| startPosChar, |
| endPosChar); |
| |
| this.identifier = null; |
| |
| // No spans yet |
| if (this.span == null) |
| this.span = new LinkedList<int[]>(); |
| |
| // Process offset char findings |
| int[] intArray = this._processOffsetChars(ldid, startPosChar, endPosChar); |
| |
| // Recalculate startOffsetChar |
| int startOffsetChar = startPosChar - intArray[0]; |
| |
| // Add match span |
| this.span.add(intArray); |
| |
| // highlights |
| // -- I'm not sure about this. |
| if (this.highlight != null) { |
| if (DEBUG) |
| log.trace("There are highlights!"); |
| |
| for (Highlight highlight : this.highlight) { |
| int start = this.positionsToOffset.start( |
| ldid, highlight.start |
| ); |
| |
| int end = this.positionsToOffset.end( |
| ldid, |
| highlight.end |
| ); |
| |
| if (DEBUG) |
| log.trace("PTO has retrieved {}-{} for class {}", |
| start, |
| end, |
| highlight.number); |
| |
| start -= startOffsetChar; |
| end -= startOffsetChar; |
| |
| if (start < 0 || end < 0) |
| continue; |
| |
| // Create intArray for highlight |
| intArray = new int[]{ |
| start, |
| end, |
| highlight.number, |
| 0 // Dummy value for later |
| }; |
| |
| this.span.add(intArray); |
| }; |
| }; |
| return true; |
| }; |
| |
| |
| // Pass the local docid to retrieve character positions for the offset |
| private int[] _processOffsetChars (int ldid, int startPosChar, int endPosChar) { |
| |
| int startOffsetChar = -1, endOffsetChar = -1; |
| int startOffset = -1, endOffset = -1; |
| |
| // The offset is defined by a span |
| if (this.getContext().isSpanDefined()) { |
| |
| if (DEBUG) |
| log.trace("Try to expand to <{}>", |
| this.context.getSpanContext()); |
| |
| this.startMore = false; |
| this.endMore = false; |
| |
| int [] spanContext = this.expandContextToSpan( |
| this.positionsToOffset.getAtomicReader(), |
| (Bits) null, |
| "tokens", |
| this.context.getSpanContext() |
| ); |
| startOffset = spanContext[0]; |
| endOffset = spanContext[1]; |
| startOffsetChar = spanContext[2]; |
| endOffsetChar = spanContext[3]; |
| if (DEBUG) |
| log.trace("Got context is based from span {}-{}/{}-{}", |
| startOffset, endOffset, startOffsetChar, endOffsetChar); |
| }; |
| |
| // The offset is defined by tokens or characters |
| if (endOffset == -1) { |
| |
| PositionsToOffset pto = this.positionsToOffset; |
| |
| // The left offset is defined by tokens |
| if (this.context.left.isToken()) { |
| startOffset = this.startPos - this.context.left.getLength(); |
| if (DEBUG) |
| log.trace("PTO will retrieve {} (Left context)", startOffset); |
| pto.add(ldid, startOffset); |
| } |
| |
| // The left offset is defined by characters |
| else { |
| startOffsetChar = startPosChar - this.context.left.getLength(); |
| }; |
| |
| // The right context is defined by tokens |
| if (this.context.right.isToken()) { |
| endOffset = this.endPos + this.context.right.getLength() -1; |
| if (DEBUG) |
| log.trace("PTO will retrieve {} (Right context)", endOffset); |
| pto.add(ldid, endOffset); |
| |
| } |
| |
| // The right context is defined by characters |
| else { |
| endOffsetChar = (endPosChar == -1) ? -1 : |
| endPosChar + this.context.right.getLength(); |
| }; |
| |
| if (startOffset != -1) |
| startOffsetChar = pto.start(ldid, startOffset); |
| |
| if (endOffset != -1) |
| endOffsetChar = pto.end(ldid, endOffset); |
| }; |
| |
| if (DEBUG) |
| log.trace("Premature found offsets at {}-{}", |
| startOffsetChar, |
| endOffsetChar); |
| |
| |
| // This can happen in case of non-token characters |
| // in the match and null offsets |
| if (startOffsetChar > startPosChar) |
| startOffsetChar = startPosChar; |
| else if (startOffsetChar < 0) |
| startOffsetChar = 0; |
| |
| // No "..." at the beginning |
| if (startOffsetChar == 0) |
| this.startMore = false; |
| |
| if (endOffsetChar != -1 && endOffsetChar < endPosChar) |
| endOffsetChar = endPosChar; |
| |
| if (DEBUG) |
| log.trace("The context spans from chars {}-{}", |
| startOffsetChar, endOffsetChar); |
| |
| // Get snippet information from the primary data |
| if (endOffsetChar > -1 && |
| (endOffsetChar < this.getPrimaryDataLength())) { |
| this.tempSnippet = this.getPrimaryData( |
| startOffsetChar, |
| endOffsetChar |
| ); |
| } |
| else { |
| this.tempSnippet = this.getPrimaryData(startOffsetChar); |
| this.endMore = false; |
| }; |
| |
| if (DEBUG) |
| log.trace("Snippet: '" + this.tempSnippet + "'"); |
| |
| if (DEBUG) |
| log.trace("The match entry is {}-{} ({}-{}) with absolute offsetChars {}-{}", |
| startPosChar - startOffsetChar, |
| endPosChar - startOffsetChar, |
| startPosChar, |
| endPosChar, |
| startOffsetChar, |
| endOffsetChar); |
| |
| // TODO: Simplify |
| return new int[]{ |
| startPosChar - startOffsetChar, |
| endPosChar - startOffsetChar, |
| -1, |
| 0}; |
| }; |
| |
| |
| // Identical to KorapResult! |
| public String toJSON () { |
| ObjectNode json = (ObjectNode) mapper.valueToTree(this); |
| |
| // Match was no match |
| if (json.size() == 0) |
| return "{}"; |
| |
| json.put("context", this.getContext().toJSON()); |
| |
| if (this.version != null) |
| json.put("version", this.getVersion()); |
| |
| try { |
| return mapper.writeValueAsString(json); |
| } |
| catch (Exception e) { |
| log.warn(e.getLocalizedMessage()); |
| }; |
| |
| return "{}"; |
| }; |
| |
| |
| // Remove duplicate identifiers |
| // Yeah ... I mean ... why not? |
| private void _filterMultipleIdentifiers () { |
| ArrayList<Integer> removeDuplicate = new ArrayList<>(10); |
| HashSet<Integer> identifiers = new HashSet<>(20); |
| for (int i = 0; i < this.span.size(); i++) { |
| // span is an int array: [Start, End, Number, Dummy] |
| int highlightNumber = this.span.get(i)[2]; |
| |
| // Number is an identifier |
| if (highlightNumber < -1) { |
| |
| // Get the real identifier |
| int idNumber = identifierNumber.get(highlightNumber); |
| if (identifiers.contains(idNumber)) { |
| removeDuplicate.add(i); |
| } |
| else { |
| identifiers.add(idNumber); |
| }; |
| }; |
| }; |
| |
| // Order the duplicates to filter from the tail |
| Collections.sort(removeDuplicate); |
| Collections.reverse(removeDuplicate); |
| |
| // Delete all duplicate identifiers |
| for (int delete : removeDuplicate) { |
| this.span.remove(delete); |
| }; |
| }; |
| }; |