blob: 1beee5a330aca6ca978c4ee2b45a620223e573cd [file] [log] [blame]
Nils Diewald392bcf32015-02-26 20:01:17 +00001package de.ids_mannheim.korap.response;
Nils Diewaldbb33da22015-03-04 16:24:25 +00002
margaretha50c76332015-03-19 10:10:39 +01003import java.io.IOException;
Nils Diewald2cd1c3d2014-01-08 22:53:08 +00004import java.nio.ByteBuffer;
Akron6590c322015-07-02 16:08:13 +02005import java.util.*;
Nils Diewaldf399a672013-11-18 17:55:22 +00006
Akron700c1eb2015-09-25 16:57:30 +02007import org.apache.lucene.index.LeafReaderContext;
Nils Diewald1e5d5942014-05-20 13:29:53 +00008import org.apache.lucene.index.Term;
9import org.apache.lucene.index.TermContext;
Nils Diewald1e5d5942014-05-20 13:29:53 +000010import org.apache.lucene.search.spans.Spans;
Akronf815b592017-02-09 16:54:59 +010011import org.apache.lucene.search.spans.SpanTermQuery;
margaretha50c76332015-03-19 10:10:39 +010012import org.apache.lucene.util.Bits;
13import org.apache.lucene.util.FixedBitSet;
14import org.slf4j.Logger;
15import org.slf4j.LoggerFactory;
16
17import com.fasterxml.jackson.annotation.JsonIgnore;
18import com.fasterxml.jackson.annotation.JsonInclude;
19import com.fasterxml.jackson.annotation.JsonInclude.Include;
20import com.fasterxml.jackson.annotation.JsonProperty;
21import com.fasterxml.jackson.databind.ObjectMapper;
Akron7d45e6b2015-06-26 17:23:42 +020022import com.fasterxml.jackson.databind.JsonNode;
margaretha50c76332015-03-19 10:10:39 +010023import com.fasterxml.jackson.databind.node.ArrayNode;
24import com.fasterxml.jackson.databind.node.ObjectNode;
25
Akrona7b936d2016-03-04 13:40:54 +010026import static de.ids_mannheim.korap.util.KrillByte.*;
margaretha50c76332015-03-19 10:10:39 +010027import de.ids_mannheim.korap.index.AbstractDocument;
28import de.ids_mannheim.korap.index.PositionsToOffset;
29import de.ids_mannheim.korap.query.SpanElementQuery;
30import de.ids_mannheim.korap.response.match.HighlightCombinator;
31import de.ids_mannheim.korap.response.match.HighlightCombinatorElement;
32import de.ids_mannheim.korap.response.match.MatchIdentifier;
33import de.ids_mannheim.korap.response.match.PosIdentifier;
34import de.ids_mannheim.korap.response.match.Relation;
Nils Diewald8c221782013-12-13 19:52:58 +000035
Nils Diewaldf399a672013-11-18 17:55:22 +000036/*
Akron35c2d0d2017-02-15 11:16:22 +010037 * The snippet building algorithm is quite complicated for now
38 * and should probably be refactored.
39 * It works like this:
40 *
41 * 1. For all spans and highlights, pagebreaks etc. all necessary
42 * positions are collected (processHighlight)
43 * 2. For all collected positions the character offsets are retrieved
44 * and based on that for all spans and highlights a list
45 * is created with arrays of the spans with the structure
46 * [startchar, endchar, highlightClass] (processHighlightSpans)
47 * 2.1 The primary data and optional context information is retrieved
48 * (processOffsetChars)
49 * 3. Based on the collected spans 2 lists are created for opening and
50 * closing tags (pretty much clones of the initial span list),
51 * sorted for opening resp. closing, and processed in parallel
52 * to form an open/close stack. The new structure on the stack is
Akrond4b19332017-02-15 18:36:24 +010053 * [startchar, endchar, highlightclass, close=0/open=1/empty=2]
Akron35c2d0d2017-02-15 11:16:22 +010054 * (processHighlightStack)
55 * 3.1. If the element is a relation with an identifier, this may
56 * be removed if duplicate (filterMultipleIdentifiers)
57 * 4. Based on the stack and the primary data the snippet is created.
58 * (processHighlightSnippet)
Akrond4b19332017-02-15 18:36:24 +010059 * 4.1. To avoid unbalanced elements, all open/close/empty tags
60 * are balanced (i.e. closed and reopened if overlaps occur).
61 * (Highlightcombinator)
Akron35c2d0d2017-02-15 11:16:22 +010062 */
Nils Diewald345bdc02014-01-21 21:48:57 +000063
Akron35c2d0d2017-02-15 11:16:22 +010064/*
65 * Todo: The implemented classes and private names are horrible!
66 * Refactor, future-me!
67 *
68 * The number based Highlighttype is ugly - UGLY!
69 *
70 * substrings may be out of range - e.g. if snippets are not lifted!
71 */
Nils Diewaldf399a672013-11-18 17:55:22 +000072
73/**
Nils Diewald884dbcf2015-02-27 17:02:28 +000074 * Representation of Matches in a Result.
Akron75ee2b82016-06-20 21:20:34 +020075 * <strong>Warning:</strong> This is currently highly dependent
Akron3e0403f2015-06-24 20:59:13 +020076 * on DeReKo data and will change in the future.
Nils Diewaldbb33da22015-03-04 16:24:25 +000077 *
Nils Diewald498d5982014-03-03 20:09:22 +000078 * @author Nils Diewald
Nils Diewald884dbcf2015-02-27 17:02:28 +000079 * @see Result
Nils Diewaldf399a672013-11-18 17:55:22 +000080 */
Nils Diewaldcde69082014-01-16 15:46:48 +000081@JsonInclude(Include.NON_NULL)
Nils Diewald392bcf32015-02-26 20:01:17 +000082public class Match extends AbstractDocument {
Nils Diewald82a4b862014-02-20 21:17:41 +000083
// Logger
private final static Logger log = LoggerFactory.getLogger(Match.class);

// Maximum number of tokens between start and end of a match;
// setStartPos/setEndPos clip endPos to startPos + MAX_MATCH_TOKENS
// and set the cutted flag when the limit is exceeded
private static final int MAX_MATCH_TOKENS = 50;

// End marker of highlights that are pagebreaks
private static final int PB_MARKER = -99999;

// Textual elements that are in context
private static final int CONTEXT = -99998;

// Compile-time switch: as long as this is false, all logging
// guarded by "if (DEBUG)" is skipped
public static final boolean DEBUG = false;

// Mapper for JSON serialization
ObjectMapper mapper = new ObjectMapper();

// Snippet information
@JsonIgnore
public SearchContext context;

// Public, while used wildly in tests!
// Both positions start out as -1 ("not set"). Each declarator needs
// its own initializer: with "startPos, endPos = -1" only endPos was
// initialized and startPos silently defaulted to 0, which made
// setEndPos() clip the end of a match whose start was not set yet.
@JsonIgnore
public int startPos = -1, endPos = -1;

// NOTE(review): only innerMatchEndPos is initialized to -1 here,
// innerMatchStartPos defaults to 0 - confirm whether that is intended
@JsonIgnore
private int innerMatchStartPos, innerMatchEndPos = -1;

@JsonIgnore
public int potentialStartPosChar = -1, potentialEndPosChar = -1;

// Set as soon as the match was clipped to MAX_MATCH_TOKENS
@JsonIgnore
public boolean cutted = false;

private String version;

// TEMPORARILY
@JsonIgnore
public int localDocID = -1;

// Lookup tables from the numbers stored in Highlight objects to the
// annotation string, relation or identifier they stand for
private HashMap<Integer, String> annotationNumber = new HashMap<>(16);
private HashMap<Integer, Relation> relationNumber = new HashMap<>(16);
private HashMap<Integer, String> identifierNumber = new HashMap<>(16);

// -1 is match highlight
int annotationNumberCounter = 256;
int relationNumberCounter = 2048;
int identifierNumberCounter = -2;

private int startPage = -1;
private int endPage = -1;

private String tempSnippet,
    snippetHTML,
    snippetBrackets,
    identifier,
    mirrorIdentifier;

private HighlightCombinator snippetArray;

public boolean startMore = true, endMore = true;

private Collection<byte[]> payload;
private ArrayList<Highlight> highlight;
private LinkedList<int[]> span;

private PositionsToOffset positionsToOffset;
private boolean processed = false;
152
Nils Diewaldbb33da22015-03-04 16:24:25 +0000153
/**
 * Constructs a new Match object for a span of a document.
 * Todo: Maybe that's not necessary!
 *
 * @param pto
 *        The PositionsToOffset object, containing relevant
 *        positional information for highlighting
 * @param localDocID
 *        Document ID based on the atomic reader.
 * @param startPos
 *        Start position of the match in the document.
 * @param endPos
 *        End position of the match in the document. Passed through
 *        {@link #setEndPos(int)}, so it may be clipped to
 *        MAX_MATCH_TOKENS tokens after the start.
 *
 * @see #snippetHTML()
 * @see #snippetBrackets()
 * @see PositionsToOffset
 */
public Match (PositionsToOffset pto, int localDocID, int startPos,
        int endPos) {
    this.localDocID = localDocID;
    this.positionsToOffset = pto;

    // The start is set before the end, as setEndPos() clips
    // relative to the start position
    setStartPos(startPos);
    setEndPos(endPos);
}
179
Nils Diewaldbb33da22015-03-04 16:24:25 +0000180
Nils Diewaldf399a672013-11-18 17:55:22 +0000181 /**
Nils Diewald392bcf32015-02-26 20:01:17 +0000182 * Constructs a new Match object.
Nils Diewaldbfe554b2014-01-09 19:35:05 +0000183 */
Nils Diewald392bcf32015-02-26 20:01:17 +0000184 public Match () {};
Nils Diewaldbfe554b2014-01-09 19:35:05 +0000185
Nils Diewaldbb33da22015-03-04 16:24:25 +0000186
/**
 * Constructs a new Match object from a match identifier string.
 * If the parsed identifier has no start position (i.e. a value
 * below 0), the object is left untouched.
 *
 * @param idString
 *        Match identifier string as provided by Result.
 * @param includeHighlights
 *        If true, the highlight positions carried by the
 *        identifier are added to the match, as long as they
 *        lie inside the start and end position of the identifier.
 */
public Match (String idString, boolean includeHighlights) {
    MatchIdentifier id = new MatchIdentifier(idString);

    // Nothing to take over from the identifier
    if (id.getStartPos() <= -1) {
        return;
    }

    // Remember the identifier as it was given
    this.mirrorIdentifier = id.toString();

    if (id.getTextSigle() != null) {
        setTextSigle(id.getTextSigle());
    }

    // <legacy>
    setCorpusID(id.getCorpusID());
    setDocID(id.getDocID());
    // </legacy>

    setStartPos(id.getStartPos());
    setEndPos(id.getEndPos());

    if (!includeHighlights) {
        return;
    }

    for (int[] pos : id.getPos()) {
        // Only accept highlights inside of the identifier's span
        if (pos[0] >= id.getStartPos() && pos[1] <= id.getEndPos()) {
            addHighlight(pos[0], pos[1], pos[2]);
        }
    }
}
221
Nils Diewaldbb33da22015-03-04 16:24:25 +0000222
Nils Diewald498d5982014-03-03 20:09:22 +0000223 /**
224 * Private class of highlights.
Akronb98c2662017-02-14 19:38:05 +0100225 * TODO: This should probably be renamed, as it not only contains highlights
Akron99220ea2018-01-30 19:09:20 +0100226 * but also annotations, pagebreaks and relations
Nils Diewaldbb33da22015-03-04 16:24:25 +0000227 */
Nils Diewaldcde69082014-01-16 15:46:48 +0000228 private class Highlight {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000229 public int start, end;
230 public int number = -1;
Nils Diewaldcde69082014-01-16 15:46:48 +0000231
Nils Diewaldbb33da22015-03-04 16:24:25 +0000232 // Relational highlight
Akron652e4362017-09-18 20:14:44 +0200233 public Highlight (int start, int end, String annotation, int refStart, int refEnd) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000234 this.start = start;
235 this.end = end;
236 // TODO: This can overflow!
237 this.number = relationNumberCounter++;
Akron99220ea2018-01-30 19:09:20 +0100238
239 if (DEBUG) {
240 log.trace("Add relation (2) '{}': source={}-{} >> target={}-{}",
241 annotation, start, end, refStart, refEnd);
242 };
243
Akron652e4362017-09-18 20:14:44 +0200244 relationNumber.put(this.number, new Relation(annotation, refStart, refEnd));
Nils Diewaldbb33da22015-03-04 16:24:25 +0000245 };
Nils Diewaldcde69082014-01-16 15:46:48 +0000246
Nils Diewaldbb33da22015-03-04 16:24:25 +0000247
248 // Span highlight
249 public Highlight (int start, int end, String annotation) {
250 this.start = start;
251 this.end = end;
Akroncb1093a2016-07-28 16:27:59 +0200252
Nils Diewaldbb33da22015-03-04 16:24:25 +0000253 // TODO: This can overflow!
254 if (annotationNumberCounter < 2048) {
255 this.number = annotationNumberCounter++;
256 annotationNumber.put(this.number, annotation);
257 };
258 };
259
260
261 // Simple highlight
262 public Highlight (int start, int end, int number) {
263 this.start = start;
264 this.end = end;
265 this.number = number;
266 };
Akronb98c2662017-02-14 19:38:05 +0100267
268 // Pagebreak
269 public Highlight (int start, int pagenumber) {
270 this.start = start;
271 this.end = PB_MARKER;
272 this.number = pagenumber;
273 };
Nils Diewald345bdc02014-01-21 21:48:57 +0000274 };
275
Nils Diewaldbb33da22015-03-04 16:24:25 +0000276
Nils Diewaldc7b60632014-09-05 19:59:01 +0000277 // TODO: Here are offsets and highlight offsets!
278 // <> payloads have 12 bytes (iii) or 8!?
279 // highlightoffsets have 11 bytes (iis)!
Nils Diewald67f54042014-09-27 14:53:38 +0000280 public void addPayload (List<byte[]> payload) {
281
Nils Diewaldbb33da22015-03-04 16:24:25 +0000282 if (DEBUG)
283 log.trace("Add payloads to match");
Nils Diewalda206b2e2014-11-05 17:24:47 +0000284
Nils Diewaldbb33da22015-03-04 16:24:25 +0000285 // Reverse to make embedding of highlights correct
286 Collections.reverse(payload);
287 try {
Akron6cc7b7b2016-01-14 21:39:18 +0100288
289 ByteBuffer bb = ByteBuffer.allocate(24);
Nils Diewald67f54042014-09-27 14:53:38 +0000290
Nils Diewaldbb33da22015-03-04 16:24:25 +0000291 // TODO: REVERSE ITERATOR!
292 for (byte[] b : payload) {
Nils Diewaldc7b60632014-09-05 19:59:01 +0000293
Nils Diewaldbb33da22015-03-04 16:24:25 +0000294 if (DEBUG)
Akron6d2c4692016-02-03 18:29:10 +0100295 log.trace("Found a payload of pti {}", b[0]);
Nils Diewaldc7b60632014-09-05 19:59:01 +0000296
Nils Diewaldbb33da22015-03-04 16:24:25 +0000297 // Todo element searches!
Nils Diewaldc7b60632014-09-05 19:59:01 +0000298
Akrona7b936d2016-03-04 13:40:54 +0100299 // Highlights! This is a class PTI
Akron5f044032015-12-18 00:35:38 +0100300 if (b[0] == 0) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000301 bb.put(b);
Akron5f044032015-12-18 00:35:38 +0100302 bb.position(1); // Ignore PTI
Nils Diewaldbb33da22015-03-04 16:24:25 +0000303 int start = bb.getInt();
304 int end = bb.getInt();
305 byte number = bb.get();
Nils Diewaldc7b60632014-09-05 19:59:01 +0000306
Nils Diewaldbb33da22015-03-04 16:24:25 +0000307 if (DEBUG)
308 log.trace(
309 "Have a highlight of class {} in {}-{} inside of {}-{}",
Akron6759b042016-04-28 01:25:00 +0200310 unsignedByte(number), start, end,
311 this.getStartPos(), this.getEndPos());
Nils Diewaldc7b60632014-09-05 19:59:01 +0000312
Nils Diewaldbb33da22015-03-04 16:24:25 +0000313 // Ignore classes out of match range and set by the system
Akron6d2c4692016-02-03 18:29:10 +0100314 // TODO: This may be decidable by PTI!
Akron6759b042016-04-28 01:25:00 +0200315 if (unsignedByte(number) <= 128
316 && start >= this.getStartPos()
Akron42993552016-02-04 13:24:24 +0100317 && end <= this.getEndPos()) {
Akron63cd32f2016-04-21 17:56:06 +0200318
319 if (DEBUG) {
Akron430703a2017-11-16 18:32:54 +0100320 log.trace("Add highlight with class/relationnr {}!",
Akron6759b042016-04-28 01:25:00 +0200321 unsignedByte(number));
Akron63cd32f2016-04-21 17:56:06 +0200322 };
323
Nils Diewaldbb33da22015-03-04 16:24:25 +0000324 this.addHighlight(start, end - 1, number);
Akron6d2c4692016-02-03 18:29:10 +0100325 }
326 else if (DEBUG) {
Akron6759b042016-04-28 01:25:00 +0200327 log.trace("Don't add highlight of class {}!",
328 unsignedByte(number));
Akron6d2c4692016-02-03 18:29:10 +0100329 };
Nils Diewaldbb33da22015-03-04 16:24:25 +0000330 }
Nils Diewaldc7b60632014-09-05 19:59:01 +0000331
Nils Diewaldbb33da22015-03-04 16:24:25 +0000332 // Element payload for match!
333 // This MAY BE the correct match
Akron6cc7b7b2016-01-14 21:39:18 +0100334 else if (b[0] == (byte) 64) {
335
Nils Diewaldbb33da22015-03-04 16:24:25 +0000336 bb.put(b);
Akron6d2c4692016-02-03 18:29:10 +0100337 bb.position(1); // Ignore pti
Akron6cc7b7b2016-01-14 21:39:18 +0100338
Akron6d2c4692016-02-03 18:29:10 +0100339 // Wasn't set before
Nils Diewaldbb33da22015-03-04 16:24:25 +0000340 if (this.potentialStartPosChar == -1) {
Akron6cc7b7b2016-01-14 21:39:18 +0100341 this.potentialStartPosChar = bb.getInt(1);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000342 }
343 else {
344 if (bb.getInt(0) < this.potentialStartPosChar)
Akron6cc7b7b2016-01-14 21:39:18 +0100345 this.potentialStartPosChar = bb.getInt(1);
Nils Diewaldbb33da22015-03-04 16:24:25 +0000346 };
Nils Diewaldc7b60632014-09-05 19:59:01 +0000347
Akron70ce0c02018-05-25 23:44:26 +0200348 if (bb.getInt(4) > this.potentialEndPosChar && !this.cutted)
Akron6cc7b7b2016-01-14 21:39:18 +0100349 this.potentialEndPosChar = bb.getInt(5);
Nils Diewaldc7b60632014-09-05 19:59:01 +0000350
Nils Diewaldbb33da22015-03-04 16:24:25 +0000351 if (DEBUG)
352 log.trace("Element payload from {} to {}",
353 this.potentialStartPosChar,
354 this.potentialEndPosChar);
355 };
356
357 // Clear bytebuffer
358 bb.clear();
359 };
360 }
361
362 catch (Exception e) {
363 log.error(e.getMessage());
364 }
Nils Diewaldc7b60632014-09-05 19:59:01 +0000365 };
366
Nils Diewaldcde69082014-01-16 15:46:48 +0000367
368 /**
Nils Diewaldf399a672013-11-18 17:55:22 +0000369 * Insert a highlight for the snippet view by means of positional
370 * offsets and an optional class number.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000371 *
372 * @param start
373 * Integer value of a span's positional start offset.
374 * @param end
375 * Integer value of a span's positional end offset.
376 * @param number
377 * Optional class number of the highlight.
Nils Diewaldf399a672013-11-18 17:55:22 +0000378 */
Nils Diewaldcde69082014-01-16 15:46:48 +0000379 public void addHighlight (int start, int end) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000380 this.addHighlight(new Highlight(start, end, (int) 0));
Nils Diewaldcde69082014-01-16 15:46:48 +0000381 };
382
Nils Diewaldbb33da22015-03-04 16:24:25 +0000383
Nils Diewaldf399a672013-11-18 17:55:22 +0000384 public void addHighlight (int start, int end, byte number) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000385 this.addHighlight(new Highlight(start, end, (int) number));
Nils Diewaldf399a672013-11-18 17:55:22 +0000386 };
387
Nils Diewaldbb33da22015-03-04 16:24:25 +0000388
Nils Diewaldf399a672013-11-18 17:55:22 +0000389 public void addHighlight (int start, int end, short number) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000390 this.addHighlight(new Highlight(start, end, (int) number));
Nils Diewaldf399a672013-11-18 17:55:22 +0000391 };
392
Nils Diewaldbb33da22015-03-04 16:24:25 +0000393
Nils Diewaldf399a672013-11-18 17:55:22 +0000394 public void addHighlight (int start, int end, int number) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000395 this.addHighlight(new Highlight(start, end, number));
Nils Diewaldcde69082014-01-16 15:46:48 +0000396 };
397
Nils Diewaldbb33da22015-03-04 16:24:25 +0000398
Nils Diewald498d5982014-03-03 20:09:22 +0000399 /**
400 * Insert a highlight for the snippet view.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000401 *
402 * @param hl
403 * A highlight object to add to the match.
Nils Diewald498d5982014-03-03 20:09:22 +0000404 */
Nils Diewaldcde69082014-01-16 15:46:48 +0000405 public void addHighlight (Highlight hl) {
406
Nils Diewaldbb33da22015-03-04 16:24:25 +0000407 if (this.highlight == null)
408 this.highlight = new ArrayList<Highlight>(16);
Nils Diewald82a4b862014-02-20 21:17:41 +0000409
Nils Diewaldbb33da22015-03-04 16:24:25 +0000410 if (DEBUG)
411 log.trace("Add highlight from pos {}-{} of class {}", hl.start,
412 hl.end, hl.number);
Nils Diewaldf399a672013-11-18 17:55:22 +0000413
Nils Diewaldbb33da22015-03-04 16:24:25 +0000414 // Reset the fetched match data
415 this._reset();
Nils Diewald833fe7e2013-12-14 16:06:33 +0000416
Nils Diewaldbb33da22015-03-04 16:24:25 +0000417 this.highlight.add(hl);
Nils Diewaldf399a672013-11-18 17:55:22 +0000418 };
419
Nils Diewaldcde69082014-01-16 15:46:48 +0000420
Nils Diewald498d5982014-03-03 20:09:22 +0000421 /**
422 * Insert a textual annotation for the snippet view by
423 * means of positional offsets and an annotation string.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000424 *
425 * @param start
426 * Integer value of a span's positional start offset.
427 * @param end
428 * Integer value of a span's positional end offset.
429 * @param annotation
430 * Annotation string.
Nils Diewald498d5982014-03-03 20:09:22 +0000431 */
Nils Diewaldcde69082014-01-16 15:46:48 +0000432 public void addAnnotation (int start, int end, String annotation) {
Akron9ebdfab2018-02-19 16:38:17 +0100433
434 if (DEBUG && start > end)
435 log.warn("Annotation span is negative: {}, {} for {}", start, end, annotation);
436
Nils Diewaldbb33da22015-03-04 16:24:25 +0000437 this.addHighlight(new Highlight(start, end, annotation));
Nils Diewaldcde69082014-01-16 15:46:48 +0000438 };
439
Nils Diewaldbb33da22015-03-04 16:24:25 +0000440
Nils Diewald498d5982014-03-03 20:09:22 +0000441 /**
442 * Insert an annotated relation for the snippet view by
Nils Diewaldbb33da22015-03-04 16:24:25 +0000443 * means of relational participant positions and an annotation
444 * string.
445 *
446 * @param src
447 * Integer value of a span's positional source object.
448 * @param target
449 * Integer value of a span's positional target object.
450 * @param annotation
451 * Annotation string.
Nils Diewald498d5982014-03-03 20:09:22 +0000452 */
Akronfae2c682017-09-18 18:47:49 +0200453 public void addRelation (int srcStart,
454 int srcEnd,
455 int targetStart,
456 int targetEnd,
457 String annotation) {
Akron47929692017-09-12 14:41:26 +0200458
459 if (DEBUG)
Akron99220ea2018-01-30 19:09:20 +0100460 log.trace("Add relation (1) '{}': source={}-{} >> target={}-{}",
Akrona82cee22017-09-18 14:52:12 +0200461 annotation, srcStart, srcEnd, targetStart, targetEnd);
Akron47929692017-09-12 14:41:26 +0200462
Akron22d119d2017-11-15 16:53:02 +0100463 // Add source token
Akron430703a2017-11-16 18:32:54 +0100464 if (srcEnd == -1) { // || srcStart == srcEnd) {
465 this.addHighlight(
466 new Highlight(srcStart, srcStart, annotation, targetStart, targetEnd)
467 );
Akronfae2c682017-09-18 18:47:49 +0200468 }
Akron22d119d2017-11-15 16:53:02 +0100469 // Add source span
Akronfae2c682017-09-18 18:47:49 +0200470 else {
Akron430703a2017-11-16 18:32:54 +0100471 this.addHighlight(
472 new Highlight(srcStart, srcEnd, annotation, targetStart, targetEnd)
473 );
Akronfae2c682017-09-18 18:47:49 +0200474 };
475
Nils Diewaldbb33da22015-03-04 16:24:25 +0000476 int id = identifierNumberCounter--;
Akron99220ea2018-01-30 19:09:20 +0100477
478 // Here is probably the problem: the identifier-number
479 // needs to incorporate targetEnd as well
Akronfae2c682017-09-18 18:47:49 +0200480
Akron22d119d2017-11-15 16:53:02 +0100481 // Add target token
Akron99220ea2018-01-30 19:09:20 +0100482 // (The last part was previously commented
483 // out for unknown reason)
484 if (targetEnd == -1 || targetStart == targetEnd) {
Akronfae2c682017-09-18 18:47:49 +0200485 this.addHighlight(new Highlight(targetStart, targetStart, id));
Akron99220ea2018-01-30 19:09:20 +0100486
487 identifierNumber.put(id, String.valueOf(targetStart));
Akronfae2c682017-09-18 18:47:49 +0200488 }
Akron22d119d2017-11-15 16:53:02 +0100489
490 // Add target span
Akronfae2c682017-09-18 18:47:49 +0200491 else {
492 this.addHighlight(new Highlight(targetStart, targetEnd, id));
Akron99220ea2018-01-30 19:09:20 +0100493 identifierNumber.put(id, targetStart + "-" + targetEnd);
494
Akronfae2c682017-09-18 18:47:49 +0200495 };
Nils Diewald345bdc02014-01-21 21:48:57 +0000496 };
497
Akron40550172015-08-04 03:06:12 +0200498
Akron35c2d0d2017-02-15 11:16:22 +0100499 public void addPagebreak (int start, int pagenumber) {
500 this.addHighlight(new Highlight(start, pagenumber));
Akronb98c2662017-02-14 19:38:05 +0100501 };
502
Akron6590c322015-07-02 16:08:13 +0200503 /**
Nils Diewald498d5982014-03-03 20:09:22 +0000504 * Get document id.
505 */
Nils Diewald010c10f2013-12-17 01:58:31 +0000506 @JsonProperty("docID")
507 public String getDocID () {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000508 return super.getID();
Nils Diewald010c10f2013-12-17 01:58:31 +0000509 };
510
Nils Diewaldbb33da22015-03-04 16:24:25 +0000511
Akron79d51d42017-02-13 21:28:27 +0100512 /**
513 * Get start page.
514 */
515 @JsonIgnore
516 public int getStartPage () {
517 return this.startPage;
518 };
519
520
521 /**
522 * Get end page.
523 */
524 @JsonIgnore
525 public int getEndPage () {
526 return this.endPage;
527 };
528
529
Nils Diewald498d5982014-03-03 20:09:22 +0000530 /**
531 * Set document id.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000532 *
533 * @param id
534 * String representation of document ID.
Nils Diewald498d5982014-03-03 20:09:22 +0000535 */
Nils Diewald364eb642013-12-22 15:03:01 +0000536 public void setDocID (String id) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000537 super.setID(id);
Nils Diewald364eb642013-12-22 15:03:01 +0000538 };
539
Nils Diewaldbb33da22015-03-04 16:24:25 +0000540
Nils Diewald498d5982014-03-03 20:09:22 +0000541 /**
542 * Get the positional start offset of the match.
543 */
Nils Diewaldcdd465b2014-02-24 18:47:38 +0000544 @JsonIgnore
Nils Diewaldbb33da22015-03-04 16:24:25 +0000545 public int getStartPos () {
546 return this.startPos;
Nils Diewaldbfe554b2014-01-09 19:35:05 +0000547 };
548
Nils Diewald498d5982014-03-03 20:09:22 +0000549
550 /**
Nils Diewald99d7f8a2014-09-17 14:49:42 +0000551 * Get the positional start offset of the class.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000552 *
553 * @param number
554 * Class number of the highlight.
Nils Diewald99d7f8a2014-09-17 14:49:42 +0000555 */
556 @JsonIgnore
557 public int getStartPos (int number) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000558 if (number > 256 || this.highlight == null)
559 return -1;
Nils Diewald99d7f8a2014-09-17 14:49:42 +0000560
Nils Diewaldbb33da22015-03-04 16:24:25 +0000561 // Iterate over highlights to find matching class
562 for (Highlight h : this.highlight) {
Akronb98c2662017-02-14 19:38:05 +0100563 if (h.number == number && h.end != PB_MARKER)
Nils Diewaldbb33da22015-03-04 16:24:25 +0000564 return h.start;
565 };
Nils Diewald99d7f8a2014-09-17 14:49:42 +0000566
Nils Diewaldbb33da22015-03-04 16:24:25 +0000567 return -1;
Nils Diewald99d7f8a2014-09-17 14:49:42 +0000568 };
569
570
571 /**
Nils Diewald498d5982014-03-03 20:09:22 +0000572 * Set the positional start offset of the match.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000573 *
574 * @param pos
575 * The positional offset.
Nils Diewald498d5982014-03-03 20:09:22 +0000576 */
Nils Diewaldbfe554b2014-01-09 19:35:05 +0000577 @JsonIgnore
Nils Diewaldbb33da22015-03-04 16:24:25 +0000578 public void setStartPos (int pos) {
579 this.startPos = pos;
Akron70ce0c02018-05-25 23:44:26 +0200580 if (this.endPos != -1 && (this.endPos - pos) > MAX_MATCH_TOKENS) {
581 this.endPos = pos + MAX_MATCH_TOKENS;
582 this.cutted = true;
583 };
Nils Diewaldbfe554b2014-01-09 19:35:05 +0000584 };
585
Nils Diewald498d5982014-03-03 20:09:22 +0000586
587 /**
588 * Get the positional end offset of the match.
589 */
Nils Diewaldbfe554b2014-01-09 19:35:05 +0000590 @JsonIgnore
Nils Diewald99d7f8a2014-09-17 14:49:42 +0000591 public int getEndPos () {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000592 return this.endPos;
Nils Diewaldbfe554b2014-01-09 19:35:05 +0000593 };
594
Nils Diewald498d5982014-03-03 20:09:22 +0000595
596 /**
Nils Diewald99d7f8a2014-09-17 14:49:42 +0000597 * Get the positional end offset of the class.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000598 *
599 * @param number
600 * Class number of the highlight.
Nils Diewald99d7f8a2014-09-17 14:49:42 +0000601 */
602 @JsonIgnore
603 public int getEndPos (int number) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000604 if (number > 256 || this.highlight == null)
605 return -1;
Nils Diewald99d7f8a2014-09-17 14:49:42 +0000606
Nils Diewaldbb33da22015-03-04 16:24:25 +0000607 // Iterate over highlights to find matching class
608 for (Highlight h : this.highlight) {
Nils Diewald99d7f8a2014-09-17 14:49:42 +0000609
Nils Diewaldbb33da22015-03-04 16:24:25 +0000610 // Get the number (incremented by 1)
Akronb98c2662017-02-14 19:38:05 +0100611 if (h.number == number && h.end != PB_MARKER)
Nils Diewaldbb33da22015-03-04 16:24:25 +0000612 return h.end + 1;
613 };
Nils Diewald99d7f8a2014-09-17 14:49:42 +0000614
Nils Diewaldbb33da22015-03-04 16:24:25 +0000615 return -1;
Nils Diewald99d7f8a2014-09-17 14:49:42 +0000616 };
617
618
619 /**
Nils Diewald498d5982014-03-03 20:09:22 +0000620 * Set the positional end offset of the match.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000621 *
622 * @param pos
623 * The positional offset.
Nils Diewald498d5982014-03-03 20:09:22 +0000624 */
Nils Diewaldbfe554b2014-01-09 19:35:05 +0000625 @JsonIgnore
Nils Diewaldbb33da22015-03-04 16:24:25 +0000626 public void setEndPos (int pos) {
Akron70ce0c02018-05-25 23:44:26 +0200627 if (this.startPos != -1 && (pos - this.startPos) > MAX_MATCH_TOKENS) {
628 pos = this.startPos + MAX_MATCH_TOKENS;
629 this.cutted = true;
630 };
Nils Diewaldbb33da22015-03-04 16:24:25 +0000631 this.endPos = pos;
Nils Diewaldbfe554b2014-01-09 19:35:05 +0000632 };
633
Nils Diewaldbb33da22015-03-04 16:24:25 +0000634
Nils Diewald498d5982014-03-03 20:09:22 +0000635 /**
636 * Get the local (i.e. Lucene given) ID of the document.
637 */
Nils Diewaldbfe554b2014-01-09 19:35:05 +0000638 @JsonIgnore
639 public int getLocalDocID () {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000640 return this.localDocID;
Nils Diewaldbfe554b2014-01-09 19:35:05 +0000641 };
642
Nils Diewald498d5982014-03-03 20:09:22 +0000643
644 /**
645 * Set the local (i.e. Lucene given) ID of the document.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000646 *
647 * @param id
648 * The id of the document.
Nils Diewald498d5982014-03-03 20:09:22 +0000649 */
Nils Diewaldbfe554b2014-01-09 19:35:05 +0000650 @JsonIgnore
651 public void setLocalDocID (int id) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000652 this.localDocID = id;
Nils Diewaldbfe554b2014-01-09 19:35:05 +0000653 };
654
Nils Diewaldbb33da22015-03-04 16:24:25 +0000655
Nils Diewald498d5982014-03-03 20:09:22 +0000656 /**
657 * Get the PositionsToOffset object.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000658 *
Nils Diewald498d5982014-03-03 20:09:22 +0000659 * @see PositionsToOffset
660 */
Nils Diewaldbfe554b2014-01-09 19:35:05 +0000661 @JsonIgnore
662 public PositionsToOffset getPositionsToOffset () {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000663 return this.positionsToOffset;
Nils Diewaldbfe554b2014-01-09 19:35:05 +0000664 };
665
Nils Diewald498d5982014-03-03 20:09:22 +0000666
667 /**
668 * Set the PositionsToOffset object.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000669 *
670 * @param pto
671 * The PositionsToOffset object
Nils Diewald498d5982014-03-03 20:09:22 +0000672 * @see PositionsToOffset
673 */
674 @JsonIgnore
675 public void setPositionsToOffset (PositionsToOffset pto) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000676 this.positionsToOffset = pto;
Nils Diewald498d5982014-03-03 20:09:22 +0000677 };
678
679
680 /**
681 * Get match ID (for later retrieval).
Nils Diewaldbb33da22015-03-04 16:24:25 +0000682 *
Nils Diewald498d5982014-03-03 20:09:22 +0000683 * @see MatchIdentifier
684 */
Nils Diewald010c10f2013-12-17 01:58:31 +0000685 @Override
Akron48937e92015-06-26 01:49:02 +0200686 @JsonProperty("matchID")
Nils Diewald010c10f2013-12-17 01:58:31 +0000687 public String getID () {
Akron7e750972018-03-23 14:21:21 +0100688
689 // Return identifier as given
690 if (this.mirrorIdentifier != null) {
691 return this.mirrorIdentifier;
692 };
Nils Diewald2cd1c3d2014-01-08 22:53:08 +0000693
Akron7e750972018-03-23 14:21:21 +0100694 // Identifier already created
695 if (this.identifier != null) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000696 return this.identifier;
Akron7e750972018-03-23 14:21:21 +0100697 };
Nils Diewald2cd1c3d2014-01-08 22:53:08 +0000698
Nils Diewaldbb33da22015-03-04 16:24:25 +0000699 // No, nada, nix
700 if (this.localDocID == -1)
701 return null;
Nils Diewaldcde69082014-01-16 15:46:48 +0000702
Akron8f6f7a32015-06-25 01:03:15 +0200703
Nils Diewaldbb33da22015-03-04 16:24:25 +0000704 MatchIdentifier id = this.getMatchIdentifier();
Nils Diewald2cd1c3d2014-01-08 22:53:08 +0000705
Nils Diewaldbb33da22015-03-04 16:24:25 +0000706 // Get prefix string corpus/doc
Akron8f6f7a32015-06-25 01:03:15 +0200707 if (this.getTextSigle() != null) {
Akron640458c2015-06-25 12:36:15 +0200708 id.setTextSigle(this.getTextSigle());
Akron8f6f7a32015-06-25 01:03:15 +0200709 }
710 // LEGACY
711 else {
712 id.setCorpusID(this.getCorpusID());
713 id.setDocID(this.getDocID());
714 };
Nils Diewald6aa929e2014-09-17 13:30:34 +0000715
Nils Diewaldbb33da22015-03-04 16:24:25 +0000716 return (this.identifier = id.toString());
Nils Diewald6aa929e2014-09-17 13:30:34 +0000717 };
718
Nils Diewaldbb33da22015-03-04 16:24:25 +0000719
Nils Diewald6aa929e2014-09-17 13:30:34 +0000720 @JsonIgnore
721 public MatchIdentifier getMatchIdentifier () {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000722 MatchIdentifier id = new MatchIdentifier();
Nils Diewald6aa929e2014-09-17 13:30:34 +0000723
Nils Diewaldbb33da22015-03-04 16:24:25 +0000724 id.setStartPos(startPos);
725 id.setEndPos(endPos);
Nils Diewald2cd1c3d2014-01-08 22:53:08 +0000726
Nils Diewaldbb33da22015-03-04 16:24:25 +0000727 // There are highlights to integrate
728 if (this.highlight != null) {
729 for (Highlight h : this.highlight) {
Akronb98c2662017-02-14 19:38:05 +0100730 if (h.number >= 256 || h.end == PB_MARKER)
Nils Diewaldbb33da22015-03-04 16:24:25 +0000731 continue;
Nils Diewaldcde69082014-01-16 15:46:48 +0000732
Nils Diewaldbb33da22015-03-04 16:24:25 +0000733 // Add highlight to the snippet
734 id.addPos(h.start, h.end, h.number);
735 };
736 };
Nils Diewalda115a332014-01-07 13:59:09 +0000737
Nils Diewaldbb33da22015-03-04 16:24:25 +0000738 return id;
Nils Diewald010c10f2013-12-17 01:58:31 +0000739 };
740
Nils Diewald498d5982014-03-03 20:09:22 +0000741 /**
742 * Get identifier for a specific position.
Nils Diewaldbb33da22015-03-04 16:24:25 +0000743 *
Eliza Margaretha6f989202016-10-14 21:48:29 +0200744 * @param int
745 * Position to get identifier on.
Nils Diewald498d5982014-03-03 20:09:22 +0000746 */
Akron652e4362017-09-18 20:14:44 +0200747 @JsonIgnore
Nils Diewald345bdc02014-01-21 21:48:57 +0000748 public String getPosID (int pos) {
Akron652e4362017-09-18 20:14:44 +0200749 return this.getPosID(pos, -1);
750 };
751
Akron99220ea2018-01-30 19:09:20 +0100752
753 /**
754 * Get identifier for a specific position.
755 *
756 * @param String
757 * Start and optional end position to get
758 * identifier on, separated by a dash.
759 */
760 @JsonIgnore
761 public String getPosID (String pos) {
762
763 String[] startEnd = pos.split("-");
764 if (startEnd.length == 2) {
765 return this.getPosID(
766 Integer.parseInt(startEnd[0]),
767 Integer.parseInt(startEnd[1])
768 );
769 }
770 return this.getPosID(Integer.parseInt(startEnd[0]), -1);
771 };
772
773
774
Akron652e4362017-09-18 20:14:44 +0200775 /**
776 * Get identifier for a specific position.
777 *
778 * @param int
779 * Start position to get identifier on.
780 * @param int
781 * End position to get identifier on.
782 */
783 @JsonIgnore
Akron99220ea2018-01-30 19:09:20 +0100784 public String getPosID (int start, int end) {
Nils Diewald498d5982014-03-03 20:09:22 +0000785
Akron47929692017-09-12 14:41:26 +0200786 if (DEBUG)
Akron99220ea2018-01-30 19:09:20 +0100787 log.trace("Retrieve identifier for position {}-{}", start, end);
Akron47929692017-09-12 14:41:26 +0200788
Nils Diewaldbb33da22015-03-04 16:24:25 +0000789 // Identifier already given
790 if (this.identifier != null)
791 return this.identifier;
Nils Diewald345bdc02014-01-21 21:48:57 +0000792
Nils Diewaldbb33da22015-03-04 16:24:25 +0000793 // Nothing here
794 if (this.localDocID == -1)
795 return null;
Nils Diewald345bdc02014-01-21 21:48:57 +0000796
Nils Diewaldbb33da22015-03-04 16:24:25 +0000797 PosIdentifier id = new PosIdentifier();
Nils Diewald345bdc02014-01-21 21:48:57 +0000798
Nils Diewaldbb33da22015-03-04 16:24:25 +0000799 // Get prefix string corpus/doc
Akron47929692017-09-12 14:41:26 +0200800 // <legacy>
Nils Diewaldbb33da22015-03-04 16:24:25 +0000801 id.setCorpusID(this.getCorpusID());
802 id.setDocID(this.getDocID());
Akron47929692017-09-12 14:41:26 +0200803 // </legacy>
804 id.setTextSigle(this.getTextSigle());
Akron652e4362017-09-18 20:14:44 +0200805 id.setStart(start);
806 id.setEnd(end);
Nils Diewald345bdc02014-01-21 21:48:57 +0000807
Akron47929692017-09-12 14:41:26 +0200808 if (DEBUG)
809 log.trace(
Akron99220ea2018-01-30 19:09:20 +0100810 "Identifier is {} in {} ({}-{}) {}",
Akron47929692017-09-12 14:41:26 +0200811 id.toString(),
812 this.getTextSigle(),
813 this.getCorpusID(),
814 this.getDocID(),
Akron652e4362017-09-18 20:14:44 +0200815 start
Akron47929692017-09-12 14:41:26 +0200816 );
817
Nils Diewaldbb33da22015-03-04 16:24:25 +0000818 return id.toString();
Nils Diewald345bdc02014-01-21 21:48:57 +0000819 };
820
Nils Diewaldbb33da22015-03-04 16:24:25 +0000821
Nils Diewald392bcf32015-02-26 20:01:17 +0000822 public Match setContext (SearchContext context) {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000823 this.context = context;
824 return this;
Nils Diewald1e5d5942014-05-20 13:29:53 +0000825 };
826
Nils Diewaldbb33da22015-03-04 16:24:25 +0000827
Nils Diewald1e5d5942014-05-20 13:29:53 +0000828 @JsonIgnore
829 public SearchContext getContext () {
Nils Diewaldbb33da22015-03-04 16:24:25 +0000830 if (this.context == null)
831 this.context = new SearchContext();
832 return this.context;
Nils Diewald1e5d5942014-05-20 13:29:53 +0000833 };
Nils Diewaldbb33da22015-03-04 16:24:25 +0000834
Nils Diewald1e5d5942014-05-20 13:29:53 +0000835
Akronf815b592017-02-09 16:54:59 +0100836
837 // Retrieve pagebreaks in a certain area
838 public List<int[]> retrievePagebreaks (String pb) {
Akron79d51d42017-02-13 21:28:27 +0100839 if (this.positionsToOffset != null) {
Akronf815b592017-02-09 16:54:59 +0100840 return this.retrievePagebreaks(
841 this.positionsToOffset.getLeafReader(),
842 (Bits) null,
Akrond8f88612017-02-15 19:26:54 +0100843 "tokens",
844 pb
Akronf815b592017-02-09 16:54:59 +0100845 );
Akron79d51d42017-02-13 21:28:27 +0100846 };
Akronf815b592017-02-09 16:54:59 +0100847
848 return null;
849 };
850
    /**
     * Retrieve pagebreaks in the area of the match.
     *
     * Iterates over all spans of the term <code>pb</code> in
     * <code>field</code>, remembers the last pagebreak at or before
     * the match start, and collects pagebreaks up to the match end.
     * Each collected entry is an int array of the form
     * <code>{charOffset, pagenumber}</code>. Collected pagebreaks are
     * also registered via addPagebreak() (the remembered one only
     * if it starts at the match start). Finally startPage is set
     * from the first entry and endPage from the last entry, if there
     * is more than one.
     *
     * Any exception raised while iterating is logged as a warning
     * and the list collected so far is returned.
     *
     * THIS IS NOT VERY CLEVER - MAKE IT MORE CLEVER!
     *
     * NOTE(review): a pagebreak remembered in <code>b</code> is only
     * added once a later pagebreak behind the match start is seen.
     * If the span iteration ends before that, it looks like the
     * remembered pagebreak is dropped - confirm.
     *
     * @param atomic
     *            The leaf reader context to search in.
     * @param bitset
     *            Bits passed on to the span query (may be null).
     * @param field
     *            The field to search the pagebreak term in.
     * @param pb
     *            The term marking pagebreaks.
     * @return The list of relevant pagebreaks (possibly empty).
     */
    public List<int[]> retrievePagebreaks (LeafReaderContext atomic,
                                           Bits bitset,
                                           String field,
                                           String pb) {

        // List of relevant pagebreaks
        List<int[]> pagebreaks = new ArrayList<>(24);

        // Values of the most recently decoded payload and the token
        // position of the remembered pagebreak
        int charOffset = 0, pagenumber = 0, start = 0;

        if (DEBUG)
            log.debug("Retrieve pagebreaks between {}-{}",
                      this.getStartPos(),
                      this.getEndPos());

        try {

            // Store character offsets in ByteBuffer
            ByteBuffer bb = ByteBuffer.allocate(16);

            // Store last relevant pagebreak in byte array
            byte[] b = null;

            SpanTermQuery stq = new SpanTermQuery(new Term(field, pb));

            if (DEBUG)
                log.trace("Check pagebreaks with {}", stq.toString());

            Spans pagebreakSpans = stq.getSpans(
                atomic, bitset, new HashMap<Term, TermContext>()
                );

            // Iterate over all pagebreaks
            while (pagebreakSpans.next() == true) {

                if (DEBUG) {
                    log.debug("There is a pagebreak at {}/{}",
                              pagebreakSpans.doc(),
                              pagebreakSpans.start());
                };

                // Current pagebreak is not in the correct document
                if (pagebreakSpans.doc() != this.localDocID) {
                    pagebreakSpans.skipTo(this.localDocID);

                    // No pagebreaks in this document
                    if (pagebreakSpans.doc() != this.localDocID)
                        break;
                };

                if (DEBUG)
                    log.debug("The pagebreak occurs in the document");

                // There is a pagebreak found - check,
                // if it is in the correct area
                if (pagebreakSpans.start() <= this.getStartPos()) {

                    if (DEBUG)
                        log.debug("PB start position is before match at {}",
                                  pagebreakSpans.start());

                    // Only the first payload is relevant.
                    // Later pagebreaks at or before the match start
                    // overwrite the remembered one.
                    b = pagebreakSpans.getPayload().iterator().next();
                    start = pagebreakSpans.start();
                }

                // This is the first pagebreak behind the match start
                else {

                    // A pagebreak at or before the match start
                    // was remembered - decode and add it first
                    if (b != null) {
                        bb.rewind();
                        bb.put(b);
                        bb.rewind();

                        // The payload is read as two ints:
                        // page number first, character offset second
                        pagenumber = bb.getInt();
                        charOffset = bb.getInt();

                        if (DEBUG)
                            log.debug("Add pagebreak to list: {}-{}", charOffset, pagenumber);

                        // This is the first pagebreak!
                        pagebreaks.add(new int[]{charOffset, pagenumber});

                        // As start was only assigned from spans with
                        // start <= match start, this holds only if the
                        // pagebreak is exactly at the match start
                        if (start >= this.getStartPos()) {

                            if (DEBUG)
                                log.debug("Add pagebreak to rendering: {}-{}",
                                          charOffset,
                                          pagenumber);
                            this.addPagebreak(charOffset, pagenumber);
                        };
                    }

                    // The current pagebreak is inside the match
                    if (pagebreakSpans.start() <= this.getEndPos()) {

                        // Set new pagebreak
                        // Only the first payload is relevant
                        b = pagebreakSpans.getPayload().iterator().next();
                        bb.rewind();
                        bb.put(b);
                        bb.rewind();

                        pagenumber = bb.getInt();
                        charOffset = bb.getInt();

                        // Add the pagebreak to list and rendering
                        pagebreaks.add(new int[]{charOffset, pagenumber});
                        this.addPagebreak(charOffset,pagenumber);
                    }

                    // Pagebreak beyond the current position
                    else {
                        break;
                    };

                    // Reset byte, so the pagebreak is not added twice
                    b = null;
                };
            };
        }
        catch (Exception e) {
            // All exceptions are reported with this message,
            // not only ByteBuffer related ones
            log.warn("Some problems with ByteBuffer: {}", e.getMessage());
        };

        // Derive the page range from the first and last entry
        if (pagebreaks.size() > 0) {
            this.startPage = pagebreaks.get(0)[1];
            if (pagebreaks.size() > 1 && pagebreaks.get(pagebreaks.size()-1) != null)
                this.endPage = pagebreaks.get(pagebreaks.size()-1)[1];
        }

        return pagebreaks;
    };
986
987
Nils Diewald1e5d5942014-05-20 13:29:53 +0000988 // Expand the context to a span
989 public int[] expandContextToSpan (String element) {
990
Nils Diewaldbb33da22015-03-04 16:24:25 +0000991 // TODO: THE BITS HAVE TO BE SET!
992
993 if (this.positionsToOffset != null)
994 return this.expandContextToSpan(
Akron700c1eb2015-09-25 16:57:30 +0200995 this.positionsToOffset.getLeafReader(), (Bits) null,
Nils Diewaldbb33da22015-03-04 16:24:25 +0000996 "tokens", element);
997 return new int[] { 0, 0, 0, 0 };
Nils Diewald1e5d5942014-05-20 13:29:53 +0000998 };
999
Akronf815b592017-02-09 16:54:59 +01001000
Nils Diewaldbb33da22015-03-04 16:24:25 +00001001
Nils Diewald1e5d5942014-05-20 13:29:53 +00001002 // Expand the context to a span
Nils Diewald84934372014-05-20 13:48:18 +00001003 // THIS IS NOT VERY CLEVER - MAKE IT MORE CLEVER!
Akron700c1eb2015-09-25 16:57:30 +02001004 public int[] expandContextToSpan (LeafReaderContext atomic, Bits bitset,
Nils Diewaldbb33da22015-03-04 16:24:25 +00001005 String field, String element) {
Nils Diewald1e5d5942014-05-20 13:29:53 +00001006
Nils Diewaldbb33da22015-03-04 16:24:25 +00001007 try {
1008 // Store character offsets in ByteBuffer
Akron499c94c2016-02-04 13:13:43 +01001009 ByteBuffer bb = ByteBuffer.allocate(24);
Nils Diewald1e5d5942014-05-20 13:29:53 +00001010
Nils Diewaldbb33da22015-03-04 16:24:25 +00001011 SpanElementQuery cquery = new SpanElementQuery(field, element);
Nils Diewald1e5d5942014-05-20 13:29:53 +00001012
Nils Diewaldbb33da22015-03-04 16:24:25 +00001013 Spans contextSpans = cquery.getSpans(atomic, bitset,
1014 new HashMap<Term, TermContext>());
Nils Diewald1e5d5942014-05-20 13:29:53 +00001015
Nils Diewaldbb33da22015-03-04 16:24:25 +00001016 int newStart = -1, newEnd = -1;
1017 int newStartChar = -1, newEndChar = -1;
Nils Diewald1e5d5942014-05-20 13:29:53 +00001018
Nils Diewaldbb33da22015-03-04 16:24:25 +00001019 if (DEBUG)
Akron42993552016-02-04 13:24:24 +01001020 log.trace(
1021 "Extend match to context boundary with {} in docID {}",
Nils Diewaldbb33da22015-03-04 16:24:25 +00001022 cquery.toString(), this.localDocID);
Nils Diewald1e5d5942014-05-20 13:29:53 +00001023
Nils Diewaldbb33da22015-03-04 16:24:25 +00001024 while (true) {
Nils Diewald1e5d5942014-05-20 13:29:53 +00001025
Nils Diewaldbb33da22015-03-04 16:24:25 +00001026 // Game over
1027 if (contextSpans.next() != true)
1028 break;
Nils Diewald1e5d5942014-05-20 13:29:53 +00001029
Nils Diewaldbb33da22015-03-04 16:24:25 +00001030 if (contextSpans.doc() != this.localDocID) {
1031 contextSpans.skipTo(this.localDocID);
1032 if (contextSpans.doc() != this.localDocID)
1033 break;
1034 };
Nils Diewald1e5d5942014-05-20 13:29:53 +00001035
Nils Diewaldbb33da22015-03-04 16:24:25 +00001036 // There's a <context> found -- I'm curious,
1037 // if it's closer to the match than everything before
1038 if (contextSpans.start() <= this.getStartPos()
1039 && contextSpans.end() >= this.getStartPos()) {
Nils Diewald1e5d5942014-05-20 13:29:53 +00001040
Nils Diewaldbb33da22015-03-04 16:24:25 +00001041 // Set as newStart
Eliza Margaretha6f989202016-10-14 21:48:29 +02001042 newStart = contextSpans.start() > newStart
1043 ? contextSpans.start() : newStart;
Nils Diewald1e5d5942014-05-20 13:29:53 +00001044
Nils Diewaldbb33da22015-03-04 16:24:25 +00001045 if (DEBUG)
1046 log.trace("NewStart is at {}", newStart);
Nils Diewald84934372014-05-20 13:48:18 +00001047
Nils Diewaldbb33da22015-03-04 16:24:25 +00001048 // Get character offset (start)
1049 if (contextSpans.isPayloadAvailable()) {
1050 try {
1051 bb.rewind();
1052 for (byte[] b : contextSpans.getPayload()) {
Nils Diewald1e5d5942014-05-20 13:29:53 +00001053
Nils Diewaldbb33da22015-03-04 16:24:25 +00001054 // Not an element span
Akron499c94c2016-02-04 13:13:43 +01001055 if (b[0] != (byte) 64)
Nils Diewaldbb33da22015-03-04 16:24:25 +00001056 continue;
Nils Diewald1e5d5942014-05-20 13:29:53 +00001057
Nils Diewaldbb33da22015-03-04 16:24:25 +00001058 bb.rewind();
Akron499c94c2016-02-04 13:13:43 +01001059 bb.put(b);
1060 bb.position(1);
Nils Diewaldbb33da22015-03-04 16:24:25 +00001061 newStartChar = bb.getInt();
1062 newEndChar = bb.getInt();
1063 break;
1064 };
1065 }
1066 catch (Exception e) {
Akronf815b592017-02-09 16:54:59 +01001067 log.warn("Some problems with ByteBuffer: {}",
1068 e.getMessage());
Nils Diewaldbb33da22015-03-04 16:24:25 +00001069 };
1070 };
1071 }
1072 else {
1073 // Has to be resettet to avoid multiple readings of the payload
1074 newEndChar = 0;
1075 };
Nils Diewald1e5d5942014-05-20 13:29:53 +00001076
Nils Diewaldbb33da22015-03-04 16:24:25 +00001077 // There's an s found, that ends after the match
1078 if (contextSpans.end() >= this.getEndPos()) {
1079 newEnd = contextSpans.end();
Nils Diewald1e5d5942014-05-20 13:29:53 +00001080
Nils Diewaldbb33da22015-03-04 16:24:25 +00001081 // Get character offset (end)
1082 if (newEndChar == 0 && contextSpans.isPayloadAvailable()) {
1083 try {
1084 bb.rewind();
1085 for (byte[] b : contextSpans.getPayload()) {
Nils Diewald1e5d5942014-05-20 13:29:53 +00001086
Nils Diewaldbb33da22015-03-04 16:24:25 +00001087 // Not an element span
Akron499c94c2016-02-04 13:13:43 +01001088 if (b[0] != (byte) 64)
Nils Diewaldbb33da22015-03-04 16:24:25 +00001089 continue;
Nils Diewald1e5d5942014-05-20 13:29:53 +00001090
Nils Diewaldbb33da22015-03-04 16:24:25 +00001091 bb.rewind();
Akron499c94c2016-02-04 13:13:43 +01001092 bb.put(b);
1093 bb.position(1);
Nils Diewaldbb33da22015-03-04 16:24:25 +00001094 newEndChar = bb.getInt(1);
1095 break;
1096 };
1097 }
1098 catch (Exception e) {
1099 log.warn(e.getMessage());
1100 };
1101 };
1102 break;
1103 };
1104 };
1105
1106 // We have a new match surrounding
1107 if (DEBUG)
1108 log.trace("New match spans from {}-{}/{}-{}", newStart, newEnd,
1109 newStartChar, newEndChar);
1110
1111 return new int[] { newStart, newEnd, newStartChar, newEndChar };
1112 }
1113 catch (IOException e) {
1114 log.error(e.getMessage());
1115 };
1116
1117 return new int[] { -1, -1, -1, -1 };
Nils Diewald1e5d5942014-05-20 13:29:53 +00001118 };
1119
Nils Diewaldbb33da22015-03-04 16:24:25 +00001120
Nils Diewald498d5982014-03-03 20:09:22 +00001121 // Reset all internal data
Nils Diewald833fe7e2013-12-14 16:06:33 +00001122 private void _reset () {
Nils Diewaldbb33da22015-03-04 16:24:25 +00001123 this.processed = false;
1124 this.snippetHTML = null;
1125 this.snippetBrackets = null;
Akron7e750972018-03-23 14:21:21 +01001126 this.identifier = null;
Nils Diewald498d5982014-03-03 20:09:22 +00001127
Nils Diewaldbb33da22015-03-04 16:24:25 +00001128 // Delete all spans
1129 if (this.span != null)
1130 this.span.clear();
Nils Diewaldf399a672013-11-18 17:55:22 +00001131 };
Nils Diewaldf399a672013-11-18 17:55:22 +00001132
Nils Diewaldbb33da22015-03-04 16:24:25 +00001133
    /*
     * Start building highlighted snippets.
     *
     * Registers the match and highlight boundaries at the
     * PositionsToOffset object, retrieves the highlight spans
     * (if not yet available), creates the stack of opening and
     * closing elements and merges it with the temporary snippet.
     *
     * Returns true, if the match was processed (now or before),
     * and false, if positionsToOffset or localDocID are missing,
     * if the highlight spans can't be processed, or if there is
     * no temporary snippet. In the latter case the match is
     * nevertheless flagged as processed, so a subsequent call
     * returns true.
     */
    private boolean _processHighlight () {

        // Nothing to do, the work was already done
        if (processed)
            return true;

        // Relevant details are missing
        if (this.positionsToOffset == null || this.localDocID == -1) {
            log.warn("You have to define "
                    + "positionsToOffset and localDocID first before");
            return false;
        };

        if (DEBUG)
            log.trace("--- Start highlight processing ...");

        // Get pto object and register the match boundaries
        // (the end is registered as end position minus one)
        PositionsToOffset pto = this.positionsToOffset;
        pto.add(this.localDocID, this.getStartPos());
        pto.add(this.localDocID, this.getEndPos() - 1);

        // The trace reports the end position, not end - 1
        if (DEBUG)
            log.trace("PTO will retrieve {} & {} (Match boundary)",
                    this.getStartPos(), this.getEndPos());

        // Set inner match
        // NOTE(review): the comparison is against 1 - this may have
        // been meant as -1 (an "unset" marker); confirm against
        // the field's default value
        if (this.innerMatchEndPos != 1)
            this.addHighlight(this.innerMatchStartPos, this.innerMatchEndPos,
                    -1);

        // Add all highlights for character retrieval
        if (this.highlight != null) {
            for (Highlight hl : this.highlight) {

                // Only highlights inside the match boundaries
                if (hl.start >= this.getStartPos()
                        && hl.end <= this.getEndPos()) {

                    // Highlight is no pagebreak
                    if (hl.end != PB_MARKER) {
                        pto.add(this.localDocID, hl.start);
                        pto.add(this.localDocID, hl.end);

                        if (DEBUG)
                            log.trace(
                                    "PTO will retrieve {} & {} (Highlight boundary)",
                                    hl.start, hl.end);

                    }

                    else if (DEBUG) {
                        log.trace("Highlight is a pagebreak - do not retrieve PTO");
                    };
                };
            };
        };

        // Get the list of spans for matches and highlighting
        if (this.span == null || this.span.size() == 0) {
            if (!this._processHighlightSpans())
                return false;
        };

        // Create a stack for highlighted elements
        // (opening and closing elements)
        ArrayList<int[]> stack = this._processHighlightStack();

        if (DEBUG)
            log.trace("The snippet is {}", this.tempSnippet);

        // The temporary snippet is empty, nothing to do
        if (this.tempSnippet == null) {
            processed = true;
            return false;
        };

        // Merge the element stack with the primary textual data
        this._processHighlightSnippet(this.tempSnippet, stack);

        // Match is processed - done
        return (processed = true);
    };
1214
Nils Diewald498d5982014-03-03 20:09:22 +00001215
Nils Diewald833fe7e2013-12-14 16:06:33 +00001216 /*
1217 Comparator class for opening tags
1218 */
Nils Diewaldf399a672013-11-18 17:55:22 +00001219 private class OpeningTagComparator implements Comparator<int[]> {
Nils Diewaldbb33da22015-03-04 16:24:25 +00001220 @Override
1221 public int compare (int[] arg0, int[] arg1) {
1222 // Check start positions
1223 if (arg0[0] > arg1[0]) {
1224 return 1;
1225 }
1226 else if (arg0[0] == arg1[0]) {
1227 // Check endpositions
1228 if (arg0[1] > arg1[1]) {
1229 return -1;
1230 }
1231 else if (arg0[1] == arg1[1]) {
Akronf05fde62016-08-03 23:46:17 +02001232
Akron08f4ceb2016-08-03 23:53:32 +02001233 // Compare class number
1234 if (arg0[2] > arg1[2])
1235 return 1;
Akron417eaa92017-01-13 18:00:15 +01001236 else if (arg0[2] < arg1[2])
Akron08f4ceb2016-08-03 23:53:32 +02001237 return -1;
1238 return 0;
Akronf05fde62016-08-03 23:46:17 +02001239
Nils Diewaldbb33da22015-03-04 16:24:25 +00001240 }
1241 return 1;
1242 };
1243 return -1;
1244 };
Nils Diewaldf399a672013-11-18 17:55:22 +00001245 };
1246
Nils Diewald833fe7e2013-12-14 16:06:33 +00001247 /*
Akronf05fde62016-08-03 23:46:17 +02001248 * Comparator class for closing tags
Nils Diewald833fe7e2013-12-14 16:06:33 +00001249 */
Nils Diewaldf399a672013-11-18 17:55:22 +00001250 private class ClosingTagComparator implements Comparator<int[]> {
Nils Diewaldbb33da22015-03-04 16:24:25 +00001251 @Override
1252 public int compare (int[] arg0, int[] arg1) {
1253 // Check end positions
1254 if (arg0[1] > arg1[1]) {
1255 return 1;
1256 }
1257 else if (arg0[1] == arg1[1]) {
Akronf05fde62016-08-03 23:46:17 +02001258
Akron08f4ceb2016-08-03 23:53:32 +02001259 // Check start positions
1260 if (arg0[0] < arg1[0]) {
1261 return 1;
1262 }
1263 else if (arg0[0] == arg1[0]) {
1264 return 0;
1265 };
1266 return -1;
Nils Diewaldbb33da22015-03-04 16:24:25 +00001267 };
1268 return -1;
1269 };
Nils Diewaldf399a672013-11-18 17:55:22 +00001270 };
1271
Nils Diewald833fe7e2013-12-14 16:06:33 +00001272
Akrona7b936d2016-03-04 13:40:54 +01001273 /*
1274 * This takes a clean string and the tag stack
1275 * to decorate the string with annotations.
1276 */
Eliza Margaretha6f989202016-10-14 21:48:29 +02001277 private void _processHighlightSnippet (String clean,
1278 ArrayList<int[]> stack) {
Nils Diewaldf399a672013-11-18 17:55:22 +00001279
Akron22d119d2017-11-15 16:53:02 +01001280 if (DEBUG) {
Nils Diewaldbb33da22015-03-04 16:24:25 +00001281 log.trace("--- Process Highlight snippet");
Akron22d119d2017-11-15 16:53:02 +01001282 log.trace("--- Snippet: {}", clean);
1283 };
Nils Diewald498d5982014-03-03 20:09:22 +00001284
Nils Diewaldbb33da22015-03-04 16:24:25 +00001285 int pos = 0, oldPos = 0;
Nils Diewaldf399a672013-11-18 17:55:22 +00001286
Nils Diewaldbb33da22015-03-04 16:24:25 +00001287 this.snippetArray = new HighlightCombinator();
Nils Diewaldf399a672013-11-18 17:55:22 +00001288
Akrona7b936d2016-03-04 13:40:54 +01001289 // Iterate over all elements of the stack
Nils Diewaldbb33da22015-03-04 16:24:25 +00001290 for (int[] element : stack) {
Akrona7b936d2016-03-04 13:40:54 +01001291
Akrond4b19332017-02-15 18:36:24 +01001292 // The position is the start position for opening and
1293 // empty elements and the end position for closing elements
Nils Diewaldbb33da22015-03-04 16:24:25 +00001294 pos = element[3] != 0 ? element[0] : element[1];
Nils Diewaldf399a672013-11-18 17:55:22 +00001295
Akron22d119d2017-11-15 16:53:02 +01001296 if (DEBUG)
1297 log.trace("Add tag at position {} (was {})",
1298 pos,
1299 oldPos);
1300
1301
Akronb98c2662017-02-14 19:38:05 +01001302 // The new position is behind the old position
Nils Diewaldbb33da22015-03-04 16:24:25 +00001303 if (pos > oldPos) {
Nils Diewaldda1722b2014-02-17 00:12:05 +00001304
Akronb98c2662017-02-14 19:38:05 +01001305 // The position is behind the string length,
1306 // which may end when an element ends beyond
Nils Diewaldbb33da22015-03-04 16:24:25 +00001307 if (pos > clean.length()) {
Akronb98c2662017-02-14 19:38:05 +01001308
1309 // Reposition to the end
Akron22d119d2017-11-15 16:53:02 +01001310 pos = clean.length();
1311
1312 if (DEBUG)
1313 log.trace("Position exceeds string, now {}",
1314 pos);
1315
Nils Diewaldbb33da22015-03-04 16:24:25 +00001316 };
Nils Diewaldda1722b2014-02-17 00:12:05 +00001317
Akronb98c2662017-02-14 19:38:05 +01001318 // Add partial string
Akrondd31e8d2017-11-15 16:22:45 +01001319 if (pos > 0 && pos > oldPos) {
Akron57d57aa2017-11-13 18:56:33 +01001320 snippetArray.addString(clean.substring(oldPos, pos));
Akrondd31e8d2017-11-15 16:22:45 +01001321 };
Nils Diewaldf399a672013-11-18 17:55:22 +00001322
Akronb98c2662017-02-14 19:38:05 +01001323 // Remember the new position
Akrondd31e8d2017-11-15 16:22:45 +01001324 oldPos = pos;
Nils Diewaldbb33da22015-03-04 16:24:25 +00001325 };
Nils Diewaldf399a672013-11-18 17:55:22 +00001326
Akronb98c2662017-02-14 19:38:05 +01001327 // close tag
1328 if (element[3] == 0) {
Akrond4b19332017-02-15 18:36:24 +01001329
1330 // Add close
Nils Diewaldbb33da22015-03-04 16:24:25 +00001331 snippetArray.addClose(element[2]);
Akronb98c2662017-02-14 19:38:05 +01001332 }
1333
Akron99220ea2018-01-30 19:09:20 +01001334 // empty tag
Akrond4b19332017-02-15 18:36:24 +01001335 else if (element[3] == 2) {
1336
1337 // Add Empty (pagebreak)
1338 snippetArray.addEmpty(element[2]);
1339 }
1340
1341
Akronb98c2662017-02-14 19:38:05 +01001342 // open tag
1343 else {
1344 snippetArray.addOpen(element[2]);
Nils Diewaldbb33da22015-03-04 16:24:25 +00001345 };
1346 };
Nils Diewaldf399a672013-11-18 17:55:22 +00001347
Akron1dd062d2016-11-11 23:21:46 +01001348 if (clean.length() > pos && pos >= 0) {
Nils Diewaldbb33da22015-03-04 16:24:25 +00001349 snippetArray.addString(clean.substring(pos));
1350 };
Nils Diewaldf399a672013-11-18 17:55:22 +00001351 };
1352
Nils Diewaldf399a672013-11-18 17:55:22 +00001353
    /**
     * Get the snippet of the match as an HTML string.
     *
     * The snippet consists of a span with the class "context-left",
     * a span with the class "match" and a span with the class
     * "context-right" (see the note on short snippets below).
     * Spans with the class "more" are added to the contexts if
     * startMore or endMore are set, and a span with the class
     * "cutted" is added to the end of the match if the match
     * was cut. The result is cached and reused on later calls.
     *
     * @return The HTML snippet, or <code>null</code> if the
     *         highlights can't be processed.
     */
    @JsonProperty("snippet")
    public String getSnippetHTML () {

        if (!this._processHighlight())
            return null;

        // Reuse the cached snippet
        if (this.processed && this.snippetHTML != null)
            return this.snippetHTML;

        if (DEBUG)
            log.trace("Create HTML Snippet");

        // sb collects left context and match,
        // rightContext is appended to sb at the very end
        StringBuilder sb = new StringBuilder();
        StringBuilder rightContext = new StringBuilder();

        // Remember ids already defined to
        // have joined elements
        HashSet<String> joins = new HashSet<>(100);

        // Snippet stack sizes
        short start = (short) 0;
        short end = this.snippetArray.size();

        // Create context
        sb.append("<span class=\"context-left\">");
        if (this.startMore)
            sb.append("<span class=\"more\"></span>");

        // Set levels for highlights
        // (all bits from 0 to 254 are set initially;
        // level, levelCache and joins are shared by all
        // toHTML() calls below, so the order of these
        // calls is presumably relevant - TODO confirm)
        FixedBitSet level = new FixedBitSet(255);
        level.set(0, 255);
        byte[] levelCache = new byte[255];

        HighlightCombinatorElement elem;

        // end is now the index of the last element
        end--;

        // NOTE(review): With less than two elements this block
        // is skipped, so "context-left" is not closed here and
        // "context-right" is not opened - the closing tag appended
        // to rightContext below then closes "context-left" after
        // the match. Confirm that this is intended.
        if (end > 0) {

            // First element of sorted array
            elem = this.snippetArray.getFirst();

            // First element is textual
            if (elem.type == 0) {
                sb.append(elem.toHTML(this, level, levelCache, joins));
                // Move start position
                start++;
            };
            sb.append("</span>");

            // Last element of sorted array
            elem = this.snippetArray.getLast();

            // Create right context, if there is any
            rightContext.append("<span class=\"context-right\">");

            // Last element is textual
            if (elem != null && elem.type == 0) {
                rightContext.append(
                    elem.toHTML(this, level, levelCache, joins)
                    );

                // decrement end
                end--;
            };
        };

        if (this.endMore)
            rightContext.append("<span class=\"more\"></span>");

        rightContext.append("</span>");

        // Iterate through all remaining elements
        // (those not consumed by the left or right context)
        sb.append("<span class=\"match\">");
        for (short i = start; i <= end; i++) {

            elem = this.snippetArray.get(i);
            // UNTESTED
            if (elem != null) {
                String elemString = elem.toHTML(
                    this, level, levelCache, joins
                    );
                if (DEBUG) {
                    log.trace("Add node {}", elemString);
                };
                sb.append(elemString);
            }
        };

        // Mark matches that were cut
        if (this.cutted) {
            sb.append("<span class=\"cutted\"></span>");
        };
        sb.append("</span>");
        sb.append(rightContext);

        // Cache the snippet for later calls
        return (this.snippetHTML = sb.toString());
    };
Nils Diewaldbb33da22015-03-04 16:24:25 +00001449
1450
Nils Diewaldf399a672013-11-18 17:55:22 +00001451 @JsonIgnore
1452 public String getSnippetBrackets () {
Nils Diewald3caa00d2013-12-13 02:24:04 +00001453
Nils Diewaldbb33da22015-03-04 16:24:25 +00001454 if (!this._processHighlight())
1455 return null;
Nils Diewald3caa00d2013-12-13 02:24:04 +00001456
Nils Diewaldbb33da22015-03-04 16:24:25 +00001457 if (this.processed && this.snippetBrackets != null)
1458 return this.snippetBrackets;
Nils Diewaldf399a672013-11-18 17:55:22 +00001459
Akronf05fde62016-08-03 23:46:17 +02001460 // Snippet stack sizes
1461 short start = (short) 0;
1462 short end = this.snippetArray.size();
1463 end--;
Akron08f4ceb2016-08-03 23:53:32 +02001464
Nils Diewaldbb33da22015-03-04 16:24:25 +00001465 StringBuilder sb = new StringBuilder();
Nils Diewaldf399a672013-11-18 17:55:22 +00001466
Nils Diewaldbb33da22015-03-04 16:24:25 +00001467 if (this.startMore)
1468 sb.append("... ");
Nils Diewaldf399a672013-11-18 17:55:22 +00001469
Akron08f4ceb2016-08-03 23:53:32 +02001470 // First element of sorted array
Akronf05fde62016-08-03 23:46:17 +02001471 HighlightCombinatorElement elem = this.snippetArray.getFirst();
Akron08f4ceb2016-08-03 23:53:32 +02001472 if (elem.type == 0) {
1473 sb.append(elem.toBrackets(this));
1474 start++;
1475 };
Nils Diewaldf399a672013-11-18 17:55:22 +00001476
Akron08f4ceb2016-08-03 23:53:32 +02001477 sb.append("[");
1478
1479 // Last element of sorted array
Akronf05fde62016-08-03 23:46:17 +02001480 elem = this.snippetArray.getLast();
1481 StringBuilder rightContext = new StringBuilder();
1482
Akron08f4ceb2016-08-03 23:53:32 +02001483 // Last element is textual
Akronf05fde62016-08-03 23:46:17 +02001484 if (elem != null && elem.type == 0) {
1485 rightContext.append(elem.toBrackets(this));
1486 // decrement end
1487 end--;
1488 };
1489
Akron08f4ceb2016-08-03 23:53:32 +02001490 for (short i = start; i <= end; i++) {
1491 sb.append(this.snippetArray.get(i).toBrackets(this));
1492 };
Akronf05fde62016-08-03 23:46:17 +02001493
Akron70ce0c02018-05-25 23:44:26 +02001494 if (this.cutted) {
1495 sb.append("<!>");
1496 };
Akron08f4ceb2016-08-03 23:53:32 +02001497 sb.append("]");
1498 sb.append(rightContext);
1499
Nils Diewaldbb33da22015-03-04 16:24:25 +00001500 if (this.endMore)
1501 sb.append(" ...");
Nils Diewaldf399a672013-11-18 17:55:22 +00001502
Nils Diewaldbb33da22015-03-04 16:24:25 +00001503 return (this.snippetBrackets = sb.toString());
Nils Diewaldf399a672013-11-18 17:55:22 +00001504 };
1505
1506
Nils Diewald3caa00d2013-12-13 02:24:04 +00001507 // This sorts all highlight and match spans to make them nesting correctly,
1508 // even in case they overlap
1509 // TODO: Not very fast - improve!
Nils Diewald2cd1c3d2014-01-08 22:53:08 +00001510 private ArrayList<int[]> _processHighlightStack () {
Nils Diewaldbb33da22015-03-04 16:24:25 +00001511 if (DEBUG)
1512 log.trace("--- Process Highlight stack");
Nils Diewaldf399a672013-11-18 17:55:22 +00001513
Nils Diewaldbb33da22015-03-04 16:24:25 +00001514 LinkedList<int[]> openList = new LinkedList<int[]>();
1515 LinkedList<int[]> closeList = new LinkedList<int[]>();
Nils Diewaldf399a672013-11-18 17:55:22 +00001516
Nils Diewaldbb33da22015-03-04 16:24:25 +00001517 // Filter multiple identifiers, that may be introduced and would
1518 // result in invalid xml
1519 this._filterMultipleIdentifiers();
Nils Diewald50389b02014-04-11 16:27:52 +00001520
Nils Diewaldbb33da22015-03-04 16:24:25 +00001521 // Add highlight spans to balance lists
1522 openList.addAll(this.span);
1523 closeList.addAll(this.span);
Nils Diewaldf399a672013-11-18 17:55:22 +00001524
Nils Diewaldbb33da22015-03-04 16:24:25 +00001525 // Sort balance lists
1526 Collections.sort(openList, new OpeningTagComparator());
1527 Collections.sort(closeList, new ClosingTagComparator());
Nils Diewaldf399a672013-11-18 17:55:22 +00001528
Nils Diewaldbb33da22015-03-04 16:24:25 +00001529 // New stack array
1530 ArrayList<int[]> stack = new ArrayList<>(openList.size() * 2);
Nils Diewaldf399a672013-11-18 17:55:22 +00001531
Nils Diewaldbb33da22015-03-04 16:24:25 +00001532 // Create stack unless both lists are empty
1533 while (!openList.isEmpty() || !closeList.isEmpty()) {
Nils Diewaldf399a672013-11-18 17:55:22 +00001534
Akronb98c2662017-02-14 19:38:05 +01001535 // Nothing more to open -- close all
Nils Diewaldbb33da22015-03-04 16:24:25 +00001536 if (openList.isEmpty()) {
Akrond4b19332017-02-15 18:36:24 +01001537
1538 if (DEBUG)
1539 log.debug("No more open tags -- close all non pagebreaks");
1540
1541 if (closeList.peekFirst()[1] != PB_MARKER) {
1542 stack.add(closeList.removeFirst());
1543 }
1544 else if (DEBUG) {
1545 if (DEBUG)
1546 log.debug("Close is pagebreak -- ignore (1)");
1547 };
1548
1549 continue;
Nils Diewaldbb33da22015-03-04 16:24:25 +00001550 }
Nils Diewald20607ab2014-03-20 23:28:36 +00001551
Nils Diewaldbb33da22015-03-04 16:24:25 +00001552 // Not sure about this, but it can happen
1553 else if (closeList.isEmpty()) {
1554 break;
1555 };
Nils Diewaldf399a672013-11-18 17:55:22 +00001556
Akrond4b19332017-02-15 18:36:24 +01001557 // Closener is pagebreak
1558 if (closeList.peekFirst()[1] == PB_MARKER) {
1559
1560 if (DEBUG)
1561 log.debug("Close is pagebreak -- ignore (2)");
1562
1563 // Remove closing pagebreak
1564 closeList.removeFirst();
1565 }
1566
1567 // Opener is pagebreak
1568 else if (openList.peekFirst()[1] == PB_MARKER) {
1569 int[] e = openList.removeFirst().clone();
1570
1571 if (DEBUG)
1572 log.debug("Open is pagebreak");
1573
1574 // Mark as empty
1575 e[1] = e[0]; // Remove pagebreak marker
1576 e[3] = 2;
1577
1578 // Add empty pagebreak
1579 stack.add(e);
1580 }
1581
Akronb98c2662017-02-14 19:38:05 +01001582 // check if the opener is smaller than the closener
Akrond4b19332017-02-15 18:36:24 +01001583 else if (openList.peekFirst()[0] < closeList.peekFirst()[1]) {
1584
1585 if (DEBUG)
Akron99220ea2018-01-30 19:09:20 +01001586 log.debug("Open tag starts before close tag ends");
Akrond4b19332017-02-15 18:36:24 +01001587
Nils Diewaldbb33da22015-03-04 16:24:25 +00001588 int[] e = openList.removeFirst().clone();
Akronb98c2662017-02-14 19:38:05 +01001589
1590 // Mark as opener
Akron12cd2582018-02-17 12:58:38 +01001591 e[3] = 1;
Akronb98c2662017-02-14 19:38:05 +01001592
Akron99220ea2018-01-30 19:09:20 +01001593 if (DEBUG) {
1594
1595 // -1: match
1596 // < -1: relation target
Akron12cd2582018-02-17 12:58:38 +01001597 // -99998: context
Akron99220ea2018-01-30 19:09:20 +01001598 // >= 2048: relation source
1599 // >= 256: annotation
1600
1601 log.trace(
Akron12cd2582018-02-17 12:58:38 +01001602 "Add open with number {} to stack at {}-{} as {}",
1603 e[2], e[0], e[1], e[3]
Akron99220ea2018-01-30 19:09:20 +01001604 );
1605 };
1606
Akronb98c2662017-02-14 19:38:05 +01001607 // Add opener to stack
Nils Diewaldbb33da22015-03-04 16:24:25 +00001608 stack.add(e);
1609 }
Akrond4b19332017-02-15 18:36:24 +01001610
Akron35c2d0d2017-02-15 11:16:22 +01001611 else {
Akron99220ea2018-01-30 19:09:20 +01001612 int[] e = closeList.removeFirst();
1613
1614 if (DEBUG) {
Akrond4b19332017-02-15 18:36:24 +01001615 log.debug("Close ends before open");
1616
Akron99220ea2018-01-30 19:09:20 +01001617 log.trace(
1618 "Add close with number {} to stack at {}-{}",
1619 e[2], e[0], e[1]
1620 );
1621 };
1622
Akronb98c2662017-02-14 19:38:05 +01001623 // Add closener to stack
Akron99220ea2018-01-30 19:09:20 +01001624 stack.add(e);
Nils Diewaldbb33da22015-03-04 16:24:25 +00001625 };
1626 };
1627 return stack;
Nils Diewaldf399a672013-11-18 17:55:22 +00001628 };
1629
Akronf05fde62016-08-03 23:46:17 +02001630
Akron08f4ceb2016-08-03 23:53:32 +02001631 /**
1632 * Sometimes the match start and end positions are inside the
1633 * matching region, e.g. when the match was expanded.
1634 * This will override the original matching positions
1635 * And matrk the real matching.
1636 */
1637 public void overrideMatchPosition (int start, int end) {
1638 if (DEBUG)
1639 log.trace("--- Override match position");
1640
1641 this.innerMatchStartPos = start;
1642 this.innerMatchEndPos = end;
1643 };
Akronf05fde62016-08-03 23:46:17 +02001644
Nils Diewaldbb33da22015-03-04 16:24:25 +00001645
Nils Diewald498d5982014-03-03 20:09:22 +00001646 /**
1647 * This will retrieve character offsets for all spans.
1648 */
Nils Diewald1e5d5942014-05-20 13:29:53 +00001649 private boolean _processHighlightSpans () {
Nils Diewald498d5982014-03-03 20:09:22 +00001650
Nils Diewaldbb33da22015-03-04 16:24:25 +00001651 if (DEBUG)
1652 log.trace("--- Process Highlight spans");
Nils Diewald498d5982014-03-03 20:09:22 +00001653
Nils Diewaldbb33da22015-03-04 16:24:25 +00001654 // Local document ID
1655 int ldid = this.localDocID;
Nils Diewaldf399a672013-11-18 17:55:22 +00001656
Nils Diewaldbb33da22015-03-04 16:24:25 +00001657 int startPosChar = -1, endPosChar = -1;
Nils Diewald1e5d5942014-05-20 13:29:53 +00001658
Nils Diewaldbb33da22015-03-04 16:24:25 +00001659 // No positionsToOffset object found
1660 if (this.positionsToOffset == null)
1661 return false;
Nils Diewaldcde69082014-01-16 15:46:48 +00001662
Nils Diewaldbb33da22015-03-04 16:24:25 +00001663 // Match position
1664 startPosChar = this.positionsToOffset.start(ldid, this.startPos);
Nils Diewald498d5982014-03-03 20:09:22 +00001665
Nils Diewaldbb33da22015-03-04 16:24:25 +00001666 if (DEBUG)
1667 log.trace("Unaltered startPosChar is {}", startPosChar);
Nils Diewald20607ab2014-03-20 23:28:36 +00001668
Nils Diewaldbb33da22015-03-04 16:24:25 +00001669 // Check potential differing start characters
1670 // e.g. from element spans
1671 if (potentialStartPosChar != -1
1672 && (startPosChar > this.potentialStartPosChar))
1673 startPosChar = this.potentialStartPosChar;
Nils Diewaldf399a672013-11-18 17:55:22 +00001674
Nils Diewaldbb33da22015-03-04 16:24:25 +00001675 endPosChar = this.positionsToOffset.end(ldid, this.endPos - 1);
Nils Diewald20607ab2014-03-20 23:28:36 +00001676
Nils Diewaldbb33da22015-03-04 16:24:25 +00001677 if (DEBUG)
1678 log.trace("Unaltered endPosChar is {}", endPosChar);
Nils Diewald20607ab2014-03-20 23:28:36 +00001679
Nils Diewaldbb33da22015-03-04 16:24:25 +00001680 // Potential end characters may come from spans with
1681 // defined character offsets like sentences including .", ... etc.
1682 if (endPosChar < potentialEndPosChar)
1683 endPosChar = potentialEndPosChar;
Nils Diewald20607ab2014-03-20 23:28:36 +00001684
Nils Diewaldbb33da22015-03-04 16:24:25 +00001685 if (DEBUG)
1686 log.trace("Refined: Match offset is pos {}-{} (chars {}-{})",
1687 this.startPos, this.endPos, startPosChar, endPosChar);
Nils Diewaldcde69082014-01-16 15:46:48 +00001688
Nils Diewaldbb33da22015-03-04 16:24:25 +00001689 this.identifier = null;
Nils Diewald498d5982014-03-03 20:09:22 +00001690
Nils Diewaldbb33da22015-03-04 16:24:25 +00001691 // No spans yet
1692 if (this.span == null)
1693 this.span = new LinkedList<int[]>();
Nils Diewald2cd1c3d2014-01-08 22:53:08 +00001694
Nils Diewaldbb33da22015-03-04 16:24:25 +00001695 // Process offset char findings
1696 int[] intArray = this._processOffsetChars(ldid, startPosChar,
1697 endPosChar);
Nils Diewaldf399a672013-11-18 17:55:22 +00001698
Nils Diewaldbb33da22015-03-04 16:24:25 +00001699 // Recalculate startOffsetChar
1700 int startOffsetChar = startPosChar - intArray[0];
Nils Diewald20607ab2014-03-20 23:28:36 +00001701
Akronf05fde62016-08-03 23:46:17 +02001702 // Add match span, in case no inner match is defined
Akron35c2d0d2017-02-15 11:16:22 +01001703 if (this.innerMatchEndPos == -1) {
1704 if (DEBUG)
Akron12cd2582018-02-17 12:58:38 +01001705 log.debug("Added array to match span with {} (1)", intArray);
Akron08f4ceb2016-08-03 23:53:32 +02001706 this.span.add(intArray);
Akron35c2d0d2017-02-15 11:16:22 +01001707 };
Nils Diewaldf399a672013-11-18 17:55:22 +00001708
Akron12cd2582018-02-17 12:58:38 +01001709 // Add context highlight
1710 this.span.add(new int[]{intArray[0], intArray[1], CONTEXT, 0});
1711
Nils Diewaldbb33da22015-03-04 16:24:25 +00001712 // highlights
1713 // -- I'm not sure about this.
1714 if (this.highlight != null) {
1715 if (DEBUG)
1716 log.trace("There are highlights!");
Nils Diewaldf399a672013-11-18 17:55:22 +00001717
Nils Diewaldbb33da22015-03-04 16:24:25 +00001718 for (Highlight highlight : this.highlight) {
Akron9ebdfab2018-02-19 16:38:17 +01001719 if (DEBUG && highlight.start > highlight.end) {
1720 log.warn("Start position is before end position {}-{}!",
1721 highlight.start,
1722 highlight.end);
1723 };
1724
1725
Akronb98c2662017-02-14 19:38:05 +01001726 int start = -1;
1727 int end = -1;
Nils Diewaldf399a672013-11-18 17:55:22 +00001728
Akronb98c2662017-02-14 19:38:05 +01001729 // Highlight is a pagebreak
1730 if (highlight.end != PB_MARKER) {
1731 start = this.positionsToOffset.start(ldid, highlight.start);
1732 end = this.positionsToOffset.end(ldid, highlight.end);
1733 }
1734 else {
1735
1736 if (DEBUG)
1737 log.trace("Highlight is pagebreak -- do not retrieve offset");
1738
1739 // In pagebreak highlights
1740 // there is already a character
1741 start = highlight.start;
Akrond4b19332017-02-15 18:36:24 +01001742 end = highlight.end;
Akronb98c2662017-02-14 19:38:05 +01001743 };
Nils Diewald3ef9a472013-12-02 16:06:09 +00001744
Nils Diewaldbb33da22015-03-04 16:24:25 +00001745 if (DEBUG)
1746 log.trace("PTO has retrieved {}-{} for class {}", start,
Akronb98c2662017-02-14 19:38:05 +01001747 end, highlight.number);
Nils Diewaldbb33da22015-03-04 16:24:25 +00001748
1749 start -= startOffsetChar;
Nils Diewaldbb33da22015-03-04 16:24:25 +00001750
Akronb98c2662017-02-14 19:38:05 +01001751 // Keep end equal -1
1752 if (end != PB_MARKER) {
1753 end -= startOffsetChar;
Akrond4b19332017-02-15 18:36:24 +01001754 }
1755 else if (DEBUG) {
1756 log.debug("Pagebreak keeps end position");
Akronb98c2662017-02-14 19:38:05 +01001757 };
1758
1759 if (start < 0 || (end < 0 && end != PB_MARKER))
Nils Diewaldbb33da22015-03-04 16:24:25 +00001760 continue;
1761
1762 // Create intArray for highlight
Akronb98c2662017-02-14 19:38:05 +01001763 intArray = new int[] {
1764 start,
1765 end,
1766 highlight.number,
1767 0 // Dummy value for later use
Nils Diewaldbb33da22015-03-04 16:24:25 +00001768 };
1769
Akron35c2d0d2017-02-15 11:16:22 +01001770 if (DEBUG)
1771 log.debug("Added array to span with {} (2)", intArray);
1772
Nils Diewaldbb33da22015-03-04 16:24:25 +00001773 this.span.add(intArray);
1774 };
1775 };
1776 return true;
Nils Diewaldcde69082014-01-16 15:46:48 +00001777 };
1778
Nils Diewaldbfe554b2014-01-09 19:35:05 +00001779
Nils Diewald1e5d5942014-05-20 13:29:53 +00001780 // Pass the local docid to retrieve character positions for the offset
Nils Diewaldbb33da22015-03-04 16:24:25 +00001781 private int[] _processOffsetChars (int ldid, int startPosChar,
1782 int endPosChar) {
Nils Diewald1e5d5942014-05-20 13:29:53 +00001783
Nils Diewaldbb33da22015-03-04 16:24:25 +00001784 int startOffsetChar = -1, endOffsetChar = -1;
1785 int startOffset = -1, endOffset = -1;
Nils Diewald1e5d5942014-05-20 13:29:53 +00001786
Nils Diewaldbb33da22015-03-04 16:24:25 +00001787 // The offset is defined by a span
1788 if (this.getContext().isSpanDefined()) {
Nils Diewald1e5d5942014-05-20 13:29:53 +00001789
Nils Diewaldbb33da22015-03-04 16:24:25 +00001790 if (DEBUG)
1791 log.trace("Try to expand to <{}>",
1792 this.context.getSpanContext());
Nils Diewald1e5d5942014-05-20 13:29:53 +00001793
Nils Diewaldbb33da22015-03-04 16:24:25 +00001794 this.startMore = false;
1795 this.endMore = false;
Nils Diewald1e5d5942014-05-20 13:29:53 +00001796
Nils Diewaldbb33da22015-03-04 16:24:25 +00001797 int[] spanContext = this.expandContextToSpan(
Akron700c1eb2015-09-25 16:57:30 +02001798 this.positionsToOffset.getLeafReader(), (Bits) null,
Nils Diewaldbb33da22015-03-04 16:24:25 +00001799 "tokens", this.context.getSpanContext());
1800 startOffset = spanContext[0];
1801 endOffset = spanContext[1];
1802 startOffsetChar = spanContext[2];
1803 endOffsetChar = spanContext[3];
1804 if (DEBUG)
Akronc27b8112018-02-16 17:08:55 +01001805 log.trace("Got context based on span {}-{}/{}-{}",
Nils Diewaldbb33da22015-03-04 16:24:25 +00001806 startOffset, endOffset, startOffsetChar, endOffsetChar);
1807 };
Nils Diewald1e5d5942014-05-20 13:29:53 +00001808
Nils Diewaldbb33da22015-03-04 16:24:25 +00001809 // The offset is defined by tokens or characters
1810 if (endOffset == -1) {
Nils Diewald1e5d5942014-05-20 13:29:53 +00001811
Nils Diewaldbb33da22015-03-04 16:24:25 +00001812 PositionsToOffset pto = this.positionsToOffset;
Nils Diewald1e5d5942014-05-20 13:29:53 +00001813
Nils Diewaldbb33da22015-03-04 16:24:25 +00001814 // The left offset is defined by tokens
1815 if (this.context.left.isToken()) {
1816 startOffset = this.startPos - this.context.left.getLength();
1817 if (DEBUG)
1818 log.trace("PTO will retrieve {} (Left context)",
1819 startOffset);
1820 pto.add(ldid, startOffset);
1821 }
Nils Diewald1e5d5942014-05-20 13:29:53 +00001822
Nils Diewaldbb33da22015-03-04 16:24:25 +00001823 // The left offset is defined by characters
1824 else {
1825 startOffsetChar = startPosChar - this.context.left.getLength();
1826 };
Nils Diewald1e5d5942014-05-20 13:29:53 +00001827
Nils Diewaldbb33da22015-03-04 16:24:25 +00001828 // The right context is defined by tokens
1829 if (this.context.right.isToken()) {
1830 endOffset = this.endPos + this.context.right.getLength() - 1;
1831 if (DEBUG)
Eliza Margaretha6f989202016-10-14 21:48:29 +02001832 log.trace("PTO will retrieve {} (Right context)",
1833 endOffset);
Nils Diewaldbb33da22015-03-04 16:24:25 +00001834 pto.add(ldid, endOffset);
Nils Diewald1e5d5942014-05-20 13:29:53 +00001835
Nils Diewaldbb33da22015-03-04 16:24:25 +00001836 }
Nils Diewald1e5d5942014-05-20 13:29:53 +00001837
Nils Diewaldbb33da22015-03-04 16:24:25 +00001838 // The right context is defined by characters
1839 else {
Eliza Margaretha6f989202016-10-14 21:48:29 +02001840 endOffsetChar = (endPosChar == -1) ? -1
1841 : endPosChar + this.context.right.getLength();
Nils Diewaldbb33da22015-03-04 16:24:25 +00001842 };
Nils Diewald1e5d5942014-05-20 13:29:53 +00001843
Nils Diewaldbb33da22015-03-04 16:24:25 +00001844 if (startOffset != -1)
1845 startOffsetChar = pto.start(ldid, startOffset);
Nils Diewald1e5d5942014-05-20 13:29:53 +00001846
Nils Diewaldbb33da22015-03-04 16:24:25 +00001847 if (endOffset != -1)
1848 endOffsetChar = pto.end(ldid, endOffset);
1849 };
Nils Diewald1e5d5942014-05-20 13:29:53 +00001850
Nils Diewaldbb33da22015-03-04 16:24:25 +00001851 if (DEBUG)
1852 log.trace("Premature found offsets at {}-{}", startOffsetChar,
1853 endOffsetChar);
Nils Diewald1e5d5942014-05-20 13:29:53 +00001854
Nils Diewald1e5d5942014-05-20 13:29:53 +00001855
Nils Diewaldbb33da22015-03-04 16:24:25 +00001856 // This can happen in case of non-token characters
1857 // in the match and null offsets
1858 if (startOffsetChar > startPosChar)
1859 startOffsetChar = startPosChar;
1860 else if (startOffsetChar < 0)
1861 startOffsetChar = 0;
Nils Diewald1e5d5942014-05-20 13:29:53 +00001862
Nils Diewaldbb33da22015-03-04 16:24:25 +00001863 // No "..." at the beginning
1864 if (startOffsetChar == 0)
1865 this.startMore = false;
Nils Diewald1e5d5942014-05-20 13:29:53 +00001866
Nils Diewaldbb33da22015-03-04 16:24:25 +00001867 if (endOffsetChar != -1 && endOffsetChar < endPosChar)
1868 endOffsetChar = endPosChar;
Nils Diewald1e5d5942014-05-20 13:29:53 +00001869
Nils Diewaldbb33da22015-03-04 16:24:25 +00001870 if (DEBUG)
1871 log.trace("The context spans from chars {}-{}", startOffsetChar,
1872 endOffsetChar);
Nils Diewald1e5d5942014-05-20 13:29:53 +00001873
Nils Diewaldbb33da22015-03-04 16:24:25 +00001874 // Get snippet information from the primary data
Eliza Margaretha6f989202016-10-14 21:48:29 +02001875 if (endOffsetChar > -1
1876 && (endOffsetChar < this.getPrimaryDataLength())) {
Nils Diewaldbb33da22015-03-04 16:24:25 +00001877 this.tempSnippet = this.getPrimaryData(startOffsetChar,
1878 endOffsetChar);
1879 }
1880 else {
1881 this.tempSnippet = this.getPrimaryData(startOffsetChar);
1882 this.endMore = false;
1883 };
Nils Diewald1e5d5942014-05-20 13:29:53 +00001884
Nils Diewaldbb33da22015-03-04 16:24:25 +00001885 if (DEBUG)
1886 log.trace("Snippet: '" + this.tempSnippet + "'");
1887
1888 if (DEBUG)
1889 log.trace(
1890 "The match entry is {}-{} ({}-{}) with absolute offsetChars {}-{}",
Eliza Margaretha6f989202016-10-14 21:48:29 +02001891 startPosChar - startOffsetChar,
1892 endPosChar - startOffsetChar, startPosChar, endPosChar,
Nils Diewaldbb33da22015-03-04 16:24:25 +00001893 startOffsetChar, endOffsetChar);
1894
1895 // TODO: Simplify
1896 return new int[] { startPosChar - startOffsetChar,
1897 endPosChar - startOffsetChar, -1, 0 };
Nils Diewald1e5d5942014-05-20 13:29:53 +00001898 };
Nils Diewaldbb33da22015-03-04 16:24:25 +00001899
Nils Diewald1e5d5942014-05-20 13:29:53 +00001900
Nils Diewald884dbcf2015-02-27 17:02:28 +00001901 // Identical to Result!
Akron7d45e6b2015-06-26 17:23:42 +02001902 public JsonNode toJsonNode () {
1903 // ObjectNode json = (ObjectNode) mapper.valueToTree(this);
1904 ObjectNode json = (ObjectNode) super.toJsonNode();
Nils Diewaldcde69082014-01-16 15:46:48 +00001905
Nils Diewaldbb33da22015-03-04 16:24:25 +00001906 if (this.context != null)
1907 json.put("context", this.getContext().toJsonNode());
Nils Diewaldbfe554b2014-01-09 19:35:05 +00001908
Nils Diewaldbb33da22015-03-04 16:24:25 +00001909 if (this.version != null)
1910 json.put("version", this.getVersion());
Nils Diewaldcdd465b2014-02-24 18:47:38 +00001911
Akron79d51d42017-02-13 21:28:27 +01001912 if (this.startPage != -1) {
1913 ArrayNode pages = mapper.createArrayNode();
1914 pages.add(this.startPage);
1915 if (this.endPage != -1 && this.endPage != this.startPage)
1916 pages.add(this.endPage);
1917
1918 json.put("pages", pages);
1919 };
1920
Akron7d45e6b2015-06-26 17:23:42 +02001921 return json;
1922 };
1923
1924
1925 public String toJsonString () {
1926 JsonNode json = (JsonNode) this.toJsonNode();
1927
1928 // Match was no match
1929 if (json.size() == 0)
1930 return "{}";
Nils Diewaldbb33da22015-03-04 16:24:25 +00001931 try {
1932 return mapper.writeValueAsString(json);
1933 }
1934 catch (Exception e) {
1935 log.warn(e.getLocalizedMessage());
1936 };
Nils Diewaldbfe554b2014-01-09 19:35:05 +00001937
Nils Diewaldbb33da22015-03-04 16:24:25 +00001938 return "{}";
Nils Diewaldbfe554b2014-01-09 19:35:05 +00001939 };
Nils Diewald50389b02014-04-11 16:27:52 +00001940
Nils Diewaldbb33da22015-03-04 16:24:25 +00001941
Nils Diewald277e9ce2014-11-06 03:42:11 +00001942 // Return match as token list
Akron48937e92015-06-26 01:49:02 +02001943 // TODO: This will be retrieved in case "tokenList" is
1944 // requested in "fields"
Nils Diewald277e9ce2014-11-06 03:42:11 +00001945 public ObjectNode toTokenList () {
Nils Diewaldbb33da22015-03-04 16:24:25 +00001946 ObjectNode json = mapper.createObjectNode();
Nils Diewald277e9ce2014-11-06 03:42:11 +00001947
Nils Diewaldbb33da22015-03-04 16:24:25 +00001948 if (this.getDocID() != null)
1949 json.put("textSigle", this.getDocID());
1950 else if (this.getTextSigle() != null)
1951 json.put("textSigle", this.getTextSigle());
Nils Diewald277e9ce2014-11-06 03:42:11 +00001952
Nils Diewaldbb33da22015-03-04 16:24:25 +00001953 ArrayNode tokens = json.putArray("tokens");
Nils Diewald277e9ce2014-11-06 03:42:11 +00001954
Nils Diewaldbb33da22015-03-04 16:24:25 +00001955 // Get pto object
1956 PositionsToOffset pto = this.positionsToOffset;
Nils Diewald277e9ce2014-11-06 03:42:11 +00001957
Nils Diewaldbb33da22015-03-04 16:24:25 +00001958 // Add for position retrieval
1959 for (int i = this.getStartPos(); i < this.getEndPos(); i++) {
1960 pto.add(this.localDocID, i);
1961 };
Nils Diewald277e9ce2014-11-06 03:42:11 +00001962
Nils Diewaldbb33da22015-03-04 16:24:25 +00001963 // Retrieve positions
1964 for (int i = this.getStartPos(); i < this.getEndPos(); i++) {
1965 ArrayNode token = tokens.addArray();
1966 for (int offset : pto.span(this.localDocID, i)) {
1967 token.add(offset);
1968 };
1969 };
Nils Diewald277e9ce2014-11-06 03:42:11 +00001970
Nils Diewaldbb33da22015-03-04 16:24:25 +00001971 return json;
Nils Diewald277e9ce2014-11-06 03:42:11 +00001972 };
1973
Nils Diewald50389b02014-04-11 16:27:52 +00001974
1975 // Remove duplicate identifiers
1976 // Yeah ... I mean ... why not?
1977 private void _filterMultipleIdentifiers () {
Nils Diewaldbb33da22015-03-04 16:24:25 +00001978 ArrayList<Integer> removeDuplicate = new ArrayList<>(10);
Akron99220ea2018-01-30 19:09:20 +01001979 HashSet<String> identifiers = new HashSet<>(20);
Nils Diewaldbb33da22015-03-04 16:24:25 +00001980 for (int i = 0; i < this.span.size(); i++) {
Akronb98c2662017-02-14 19:38:05 +01001981
Nils Diewaldbb33da22015-03-04 16:24:25 +00001982 // span is an int array: [Start, End, Number, Dummy]
1983 int highlightNumber = this.span.get(i)[2];
Nils Diewald50389b02014-04-11 16:27:52 +00001984
Nils Diewaldbb33da22015-03-04 16:24:25 +00001985 // Number is an identifier
1986 if (highlightNumber < -1) {
Nils Diewaldd216a032014-04-30 17:40:19 +00001987
Nils Diewaldbb33da22015-03-04 16:24:25 +00001988 // Get the real identifier
Akron99220ea2018-01-30 19:09:20 +01001989 String idNumber =
1990 identifierNumber.get(highlightNumber);
Nils Diewaldbb33da22015-03-04 16:24:25 +00001991 if (identifiers.contains(idNumber)) {
1992 removeDuplicate.add(i);
1993 }
1994 else {
1995 identifiers.add(idNumber);
1996 };
1997 };
1998 };
Nils Diewald50389b02014-04-11 16:27:52 +00001999
Nils Diewaldbb33da22015-03-04 16:24:25 +00002000 // Order the duplicates to filter from the tail
2001 Collections.sort(removeDuplicate);
2002 Collections.reverse(removeDuplicate);
Nils Diewald50389b02014-04-11 16:27:52 +00002003
Nils Diewaldbb33da22015-03-04 16:24:25 +00002004 // Delete all duplicate identifiers
2005 for (int delete : removeDuplicate) {
2006 this.span.remove(delete);
2007 };
Nils Diewald50389b02014-04-11 16:27:52 +00002008 };
Nils Diewald79f6c4d2014-09-17 17:34:01 +00002009
2010
2011 /*
2012 * Get identifier based on class number
2013 */
Akrond504f212015-06-20 00:27:54 +02002014 @JsonIgnore
Akron99220ea2018-01-30 19:09:20 +01002015 public String getClassID (int nr) {
Nils Diewaldbb33da22015-03-04 16:24:25 +00002016 return this.identifierNumber.get(nr);
Nils Diewald79f6c4d2014-09-17 17:34:01 +00002017 };
2018
Nils Diewaldbb33da22015-03-04 16:24:25 +00002019
Nils Diewald79f6c4d2014-09-17 17:34:01 +00002020 /*
2021 * Get annotation based on id
2022 */
Akrond504f212015-06-20 00:27:54 +02002023 @JsonIgnore
Nils Diewald79f6c4d2014-09-17 17:34:01 +00002024 public String getAnnotationID (int nr) {
Nils Diewaldbb33da22015-03-04 16:24:25 +00002025 return this.annotationNumber.get(nr);
Nils Diewald79f6c4d2014-09-17 17:34:01 +00002026 };
2027
2028
2029 /*
2030 * Get relation based on id
2031 */
Akrond504f212015-06-20 00:27:54 +02002032 @JsonIgnore
Nils Diewald79f6c4d2014-09-17 17:34:01 +00002033 public Relation getRelationID (int nr) {
Nils Diewaldbb33da22015-03-04 16:24:25 +00002034 return this.relationNumber.get(nr);
Nils Diewald79f6c4d2014-09-17 17:34:01 +00002035 };
Nils Diewaldf399a672013-11-18 17:55:22 +00002036};