MatchIdentifier and MatchInfo retrieval
diff --git a/CHANGES b/CHANGES
index 77d7ffb..2666f1b 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,9 @@
+0.26 2014-01-16
+ - Introduced standalone SpanSegmentQueries (margaretha)
+ - [bugfix] SpanNextQueries (margaretha)
+ - Support for Match Identifiers (diewald)
+ - Support for distinct Match retrieval (diewald)
+
0.25.3 2014-01-10
- Updated Lucene to 4.3.1.
diff --git a/pom.xml b/pom.xml
index 36ec331..979b146 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,7 +9,7 @@
<groupId>KorAP-modules</groupId>
<artifactId>KorAP-lucene-index</artifactId>
- <version>0.25.3</version>
+ <version>0.26</version>
<packaging>jar</packaging>
<name>KorAP-lucene-index</name>
diff --git a/src/main/java/de/ids_mannheim/korap/KorapDocument.java b/src/main/java/de/ids_mannheim/korap/KorapDocument.java
index 9c6dc5a..4b434c9 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapDocument.java
@@ -66,7 +66,9 @@
@JsonProperty("pubDate")
public String getPubDateString () {
- return this.pubDate.toDisplay();
+ if (this.pubDate != null)
+ return this.pubDate.toDisplay();
+ return null;
};
public void setAuthor (String author) {
diff --git a/src/main/java/de/ids_mannheim/korap/KorapIndex.java b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
index 1bdad67..6b4ca02 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
@@ -74,6 +74,7 @@
import de.ids_mannheim.korap.index.PositionsToOffset;
import de.ids_mannheim.korap.index.TermInfo;
import de.ids_mannheim.korap.index.SpanInfo;
+import de.ids_mannheim.korap.index.MatchIdentifier;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -453,49 +454,40 @@
public KorapMatch getMatch (String id) {
- return this.getMatchInfo(id, false, null, null, false, true);
+ return this.getMatchInfo(id, "tokens", false, null, null, false, true);
};
- public KorapMatch getMatchInfo (String id, String foundry, String layer, boolean includeSpans, boolean includeHighlights) {
- return this.getMatchInfo(id, true, foundry, layer, includeSpans, includeHighlights);
+ public KorapMatch getMatchInfo (String id,
+ String field,
+ String foundry,
+ String layer,
+ boolean includeSpans,
+ boolean includeHighlights) {
+ return this.getMatchInfo(id, field, true, foundry, layer, includeSpans, includeHighlights);
};
/**
* Get a match.
* BE AWARE - THIS IS STILL A PLAYGROUND!
*/
- // TODO: collect all information based on a prefix (like cnx/p etc.)
- // TODO: Generate a meaningful structure (e.g. a tree)
/*
KorapInfo is associated with a KorapMatch and has an array with all informations
per position in the match.
-
- public KorapInfo infoOf (KorapMatch km, String prefix);
*/
- public KorapMatch getMatchInfo (String id, boolean info, String foundry, String layer, boolean includeSpans, boolean includeHighlights) {
+ public KorapMatch getMatchInfo (String idString,
+ String field,
+ boolean info,
+ String foundry,
+ String layer,
+ boolean includeSpans,
+ boolean includeHighlights) {
- // List of terms to populate
- SpanInfo termList = new SpanInfo();
-
- KorapMatch match = new KorapMatch();
-
- // That's purely temporary
- // From ID:
- String corpusID = "WPD";
- String docID = "WPD_AAA.00003";
- int startPos = 25;
- int endPos = 30;
-
- foundry = "mate";
- layer = "l";
- includeSpans = true;
-
- String field = "tokens"; // text field
+ KorapMatch match = new KorapMatch(idString, includeHighlights);
// Create a filter based on the corpusID and the docID
BooleanQuery bool = new BooleanQuery();
- bool.add(new TermQuery(new Term("ID", docID)), BooleanClause.Occur.MUST);
- bool.add(new TermQuery(new Term("corpusID", corpusID)), BooleanClause.Occur.MUST);
+ bool.add(new TermQuery(new Term("ID", match.getDocID())), BooleanClause.Occur.MUST);
+ bool.add(new TermQuery(new Term("corpusID", match.getCorpusID())), BooleanClause.Occur.MUST);
Filter filter = (Filter) new QueryWrapperFilter(bool);
CompiledAutomaton fst = null;
@@ -508,23 +500,25 @@
*/
StringBuffer regex = new StringBuffer();
+ // Todo: Only support one direction!
if (includeSpans)
- regex.append("(((\"<>\"|\"<\"|\">\")\":\")?");
- else
- regex.append("[^<>-]");
+ regex.append("((\"<>\"|\"<\"|\">\")\":\")?");
if (foundry != null) {
regex.append(foundry).append('/');
if (layer != null)
regex.append(layer).append(":");
}
else if (includeSpans) {
- regex.append("[^-]");
+ regex.append("([^-is]+?|[-is][^:])");
+ }
+ else {
+ regex.append("([^-is<>]+?|([-is<>]|\"<>\")[^:])");
};
- regex.append("(.){1,})|_[0-9]+");
+ regex.append("(.){1,}|_[0-9]+");
+ log.trace("The final regex is {}", regex.toString());
RegExp regexObj = new RegExp(regex.toString());
fst = new CompiledAutomaton(regexObj.toAutomaton());
- log.trace("The final regex is {}", regex.toString());
};
@@ -541,13 +535,20 @@
// Create a bitset for the correct document
Bits bitset = filterSet.bits();
+ DocIdSetIterator filterIterator = filterSet.iterator();
+
+ // No document found
+ if (filterIterator == null)
+ continue;
+
// Go to the matching doc - and remember its ID
- int localDocID = filterSet.iterator().nextDoc();
+ int localDocID = filterIterator.nextDoc();
if (localDocID == DocIdSetIterator.NO_MORE_DOCS)
continue;
// We've found the correct document! Hurray!
+ log.trace("We've found a matching document");
HashSet<String> fieldsToLoadLocal = new HashSet<>(fieldsToLoad);
fieldsToLoadLocal.add(field);
@@ -558,22 +559,23 @@
Document doc = atomic.reader().document(localDocID, fieldsToLoadLocal);
// Put some more information to the match
- match.setPositionsToOffset(new PositionsToOffset(atomic, field));
+ PositionsToOffset pto = new PositionsToOffset(atomic, field);
+ match.setPositionsToOffset(pto);
match.setLocalDocID(localDocID);
- match.setStartPos(startPos);
- match.setEndPos(endPos);
match.populateDocument(doc, field, fieldsToLoadLocal);
- log.trace("The document is called '{}'", match.getTitle());
+ log.trace("The document has the id '{}'", match.getDocID());
- if (!info)
- break;
+ if (!info) break;
// Limit the terms to all the terms of interest
TermsEnum termsEnum = docTerms.intersect(fst, null);
DocsAndPositionsEnum docs = null;
+ // List of terms to populate
+ SpanInfo termList = new SpanInfo(pto, localDocID);
+
// Iterate over all terms in the document
while (termsEnum.next() != null) {
@@ -607,7 +609,7 @@
int pos = docs.nextPosition();
// Check, if the position of the term is in the interesting area
- if (pos >= startPos && pos < endPos) {
+ if (pos >= match.getStartPos() && pos < match.getEndPos()) {
log.trace(
">> {}: {}-{}-{}",
@@ -632,15 +634,28 @@
};
};
};
+
+ // Add annotations based on the retrieved infos
+ for (TermInfo t : termList.getTerms()) {
+   log.trace("Add term {}/{}:{} to {}({})-{}({})",
+             t.getFoundry(),
+             t.getLayer(),
+             t.getValue(),
+             t.getStartChar(),
+             t.getStartPos(),
+             t.getEndChar(),
+             t.getEndPos());
+   // Compare the type by value - "==" on strings only tests object identity
+   if ("term".equals(t.getType()) || "span".equals(t.getType()))
+     match.addAnnotation(t.getStartPos(), t.getEndPos(), t.getAnnotation());
+ };
+
break;
};
}
catch (IOException e) {
- // ...
- };
-
- for (TermInfo t : termList.getTerms()) {
- log.trace("Add term {}/{}:{} to {}-{}", t.getFoundry(), t.getLayer(), t.getValue(), t.getStartChar(), t.getEndChar());
+ log.warn(e.getLocalizedMessage());
+ match.setError(e.getLocalizedMessage());
};
return match;
diff --git a/src/main/java/de/ids_mannheim/korap/KorapMatch.java b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
index d7b8e79..ad1a017 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapMatch.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
@@ -4,6 +4,7 @@
import java.nio.ByteBuffer;
import com.fasterxml.jackson.annotation.*;
+import com.fasterxml.jackson.annotation.JsonInclude.Include;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.*;
@@ -12,7 +13,7 @@
import de.ids_mannheim.korap.document.KorapPrimaryData;
import static de.ids_mannheim.korap.util.KorapHTML.*;
-
+import de.ids_mannheim.korap.index.MatchIdentifier;
// import org.apache.commons.codec.binary.Base64;
import org.slf4j.Logger;
@@ -32,6 +33,7 @@
* @see KorapResult
* @author ndiewald
*/
+@JsonInclude(Include.NON_NULL)
public class KorapMatch extends KorapDocument {
ObjectMapper mapper = new ObjectMapper();
@@ -49,9 +51,14 @@
public int potentialStartPosChar = -1,
potentialEndPosChar = -1;
- private int startOffsetChar = 0;
+ private String error = null;
- private int localDocID = -1;
+ // TEMPORARILY
+ @JsonIgnore
+ public int localDocID = -1;
+
+ HashMap<Integer, String> annotationNumber = new HashMap<>(16);
+ int annotationNumberCounter = 256;
@JsonIgnore
public boolean leftTokenContext,
@@ -68,7 +75,7 @@
endMore = true;
private Collection<byte[]> payload;
- private ArrayList<int[]> highlight;
+ private ArrayList<Highlight> highlight;
private LinkedList<int[]> span;
private PositionsToOffset positionsToOffset;
@@ -79,6 +86,7 @@
/**
* Constructs a new KorapMatch object.
+ * TODO: Maybe that's not necessary!
*
* @param pto The PositionsToOffset object, containing relevant
* positional information for highlighting
@@ -91,10 +99,6 @@
this.localDocID = localDocID;
this.startPos = startPos;
this.endPos = endPos;
-
- // Preprocess matching
- pto.add(localDocID, startPos);
- pto.add(localDocID, endPos - 1);
};
/**
@@ -103,6 +107,41 @@
public KorapMatch () {};
/**
+ * Constructs a new KorapMatch object from a match identifier string; the identifier's highlight positions are only added if includeHighlights is true.
+ */
+ public KorapMatch (String idString, boolean includeHighlights) {
+ MatchIdentifier id = new MatchIdentifier(idString);
+ this.setCorpusID(id.getCorpusID());
+ this.setDocID(id.getDocID());
+ this.setStartPos(id.getStartPos());
+ this.setEndPos(id.getEndPos());
+
+ if (includeHighlights)
+ for (int[] pos : id.getPos())
+ this.addHighlight(pos[0], pos[1], pos[2]);
+ };
+ // A highlight from start to end, identified by a number
+ private class Highlight {
+ public int start, end;
+ public int number = -1;
+ // Annotation highlight: takes the next value of annotationNumberCounter as its number and maps that number to the annotation string
+ public Highlight (int start, int end, String annotation) {
+ this.start = start;
+ this.end = end;
+ // TODO: This can overflow!
+ this.number = annotationNumberCounter++;
+ log.trace("Add annotation: {} ({})", annotation, this.number);
+ annotationNumber.put(this.number, annotation);
+ };
+ // Highlight with an explicitly given number
+ public Highlight (int start, int end, int number) {
+ this.start = start;
+ this.end = end;
+ this.number = number;
+ };
+ }
+
+ /**
* Insert a highlight for the snippet view by means of positional
* offsets and an optional class number.
*
@@ -110,32 +149,39 @@
* @param end Integer value of a span's positional end offset.
* @param number Optional class number of the highlight.
*/
+ public void addHighlight (int start, int end) {
+ this.addHighlight(new Highlight(start, end, (int) 0));
+ };
+
public void addHighlight (int start, int end, byte number) {
- this.addHighlight(start, end, (int) number);
+ this.addHighlight(new Highlight(start, end, (int) number));
};
public void addHighlight (int start, int end, short number) {
- this.addHighlight(start, end, (int) number);
- };
-
- public void addHighlight (int start, int end) {
- this.addHighlight(start, end, (int) 0);
+ this.addHighlight(new Highlight(start, end, (int) number));
};
public void addHighlight (int start, int end, int number) {
+ this.addHighlight(new Highlight(start, end, number));
+ };
+
+ public void addHighlight (Highlight hl) {
+
if (this.highlight == null)
- this.highlight = new ArrayList<int[]>(16);
- log.trace("Add highlight of class {} from {} to {}", number, start, end);
+ this.highlight = new ArrayList<Highlight>(16);
+ log.trace("Add highlight {} from {} to {}", hl.number, hl.start, hl.end);
this._reset();
- // Add this for offset search
- this.positionsToOffset.add(this.localDocID, start);
- this.positionsToOffset.add(this.localDocID, end);
-
- this.highlight.add(new int[]{ start, end, number});
+ this.highlight.add(hl);
};
+
+ public void addAnnotation (int start, int end, String annotation) {
+ this.addHighlight(new Highlight(start, end, annotation));
+ };
+
+
public void populateDocument (Document doc, String field, HashSet<String> fields) {
this.setField(field);
@@ -187,16 +233,6 @@
@JsonIgnore
public void setStartPos(int pos) {
this.startPos = pos;
-
- if (this.positionsToOffset == null || this.localDocID == -1) {
- log.warn("You have to define " +
- "positionsToOffset and localDocID first " +
- "before adding position information");
- return;
- };
-
- // Preprocess matching
- this.positionsToOffset.add(this.localDocID, pos);
};
@JsonIgnore
@@ -207,16 +243,6 @@
@JsonIgnore
public void setEndPos(int pos) {
this.endPos = pos;
-
- if (this.positionsToOffset == null || this.localDocID == -1) {
- log.warn("You have to define " +
- "positionsToOffset and localDocID first " +
- "before adding position information");
- return;
- };
-
- // Preprocess matching
- this.positionsToOffset.add(this.localDocID, pos - 1);
};
@JsonIgnore
@@ -246,44 +272,27 @@
if (this.identifier != null)
return this.identifier;
- StringBuffer sb = new StringBuffer("match-");
+ if (this.localDocID == -1)
+ return null;
+
+ MatchIdentifier id = new MatchIdentifier();
// Get prefix string corpus/doc
- if (this.getCorpusID() != null) {
- sb.append(this.getCorpusID());
-
- if (this.getDocID() != null) {
- sb.append('-');
- sb.append(this.getDocID());
- };
- }
- else {
- sb.append(this.localDocID);
- };
-
- sb.append("-p");
-
- // Get Position information
- sb.append(startPos).append('-').append(endPos);
+ id.setCorpusID(this.getCorpusID());
+ id.setDocID(this.getDocID());
+ id.setStartPos(startPos);
+ id.setEndPos(endPos);
if (this.highlight != null) {
- for (int[] h : this.highlight) {
- sb.append('(').append(h[2]).append(')');
- sb.append(h[0]).append('-').append(h[1]);
+ for (Highlight h : this.highlight) {
+ if (h.number >= 256)
+ continue;
+
+ id.addPos(h.start, h.end, h.number);
};
};
- if (this.processed) {
- sb.append('c');
- for (int[] s : this.span) {
- if (s[2] != -1)
- sb.append('(').append(s[2]).append(')');
- sb.append(s[0] + this.startOffsetChar);
- sb.append('-');
- sb.append(s[1] + this.startOffsetChar);
- };
- };
- return (this.identifier = sb.toString());
+ return (this.identifier = id.toString());
};
private void _reset () {
@@ -296,49 +305,58 @@
};
// Start building highlighted snippets
- private void _processHighlight () {
+ private boolean _processHighlight () {
if (processed)
- return;
+ return true;
+
+ if (this.positionsToOffset == null || this.localDocID == -1) {
+ log.warn("You have to define " +
+ "positionsToOffset and localDocID first " +
+ "before");
+ return false;
+ };
log.trace("Start highlight processing ...");
+
+ PositionsToOffset pto = this.positionsToOffset;
+ pto.add(this.localDocID, this.getStartPos());
+ pto.add(this.localDocID, this.getEndPos() - 1);
+
+ log.trace("PTO now has start and end positions {}-{}", this.getStartPos(), this.getEndPos());
+
+ if (this.highlight != null) {
+ for (Highlight hl : this.highlight) {
+ pto.add(this.localDocID, hl.start);
+ pto.add(this.localDocID, hl.end);
+ };
+ };
+ log.trace("All highlights are added");
+
// Get the list of spans for matches and highlighting
if (this.span == null || this.span.size() == 0) {
- this._processHighlightSpans(
- leftTokenContext,
- rightTokenContext
- );
+ if (!this._processHighlightSpans(
+ leftTokenContext,
+ rightTokenContext
+ ))
+ return false;
};
- /*
- for (int[] s : spans) {
- log.trace(" >> [Spans] Start: {}, End: {}, Class: {}, Dummy: {}",
- s[0], s[1], s[2], s[3]);
- };
- */
-
// Create a stack for highlighted elements (opening and closing elements)
ArrayList<int[]> stack = this._processHighlightStack();
- /*
- for (int[] s : stack) {
- log.trace(" >> [Stack] Start: {}, End: {}, Class: {}, Dummy: {}",
- s[0], s[1], s[2], s[3]);
- };
- */
-
// The temparary snippet is empty, nothing to do
if (this.tempSnippet == null) {
processed = true;
- return;
+ return false;
};
// Merge the element stack with the primary textual data
this._processHighlightSnippet(this.tempSnippet, stack);
// Match is processed - done
- processed = true;
+ return (processed = true);
};
/*
@@ -392,7 +410,6 @@
private byte type;
private int number = 0;
- // TODO: Should be possibly a short (as for the -1)
private String characters;
private boolean terminal = true;
@@ -426,6 +443,11 @@
if (this.number == -1) {
sb.append("<span class=\"match\">");
}
+ else if (this.number >= 256) {
+ sb.append("<span title=\"")
+ .append(annotationNumber.get(this.number))
+ .append("\">");
+ }
else {
// Get the first free level slot
byte pos;
@@ -447,7 +469,7 @@
}
// Closing
else if (this.type == 2) {
- if (this.number == -1)
+ if (this.number == -1 || this.number >= 256)
return "</span>";
if (this.terminal)
@@ -468,7 +490,9 @@
}
else {
sb.append("{");
- if (this.number != 0)
+ if (this.number >= 256)
+ sb.append(annotationNumber.get(this.number)).append(':');
+ else if (this.number != 0)
sb.append(this.number).append(':');
};
return sb.toString();
@@ -480,7 +504,6 @@
};
return this.characters;
};
-
};
/*
@@ -553,7 +576,9 @@
// Retrieve last combinator on stack
lastComb = this.combine.peekLast();
- log.trace("Closing element is unbalanced - {} != {} with lastComb {}|{}|{}", eold, number, lastComb.type, lastComb.number, lastComb.characters);
+ log.trace("Closing element is unbalanced - {} " +
+ "!= {} with lastComb {}|{}|{}",
+ eold, number, lastComb.type, lastComb.number, lastComb.characters);
// combinator is opening and the number is not equal to the last
// element on the balanceStack
@@ -583,18 +608,6 @@
lastComb = this.combine.peekLast();
log.trace("LastComb: " + lastComb.type + '|' + lastComb.number + '|' + lastComb.characters + " for " + number);
- /*
- // The last combinator is opening and identical to the current one
- if (lastComb.type == 1 && lastComb.number == number) {
- // Remove the damn thing - It's empty and uninteresting!
- this.combine.removeLast();
- }
- else {
- // Add a closer
- this.combine.add(new HighlightCombinatorElement((byte) 2, number));
- };
- */
-
log.trace("Stack for checking 2: {}|{}|{}|{}", lastComb.type, lastComb.number, lastComb.characters, number);
if (lastComb.type == 1 && lastComb.number == number) {
@@ -667,7 +680,9 @@
@JsonProperty("snippet")
public String getSnippetHTML () {
- this._processHighlight();
+
+ if (!this._processHighlight())
+ return null;
if (this.processed && this.snippetHTML != null)
return this.snippetHTML;
@@ -726,7 +741,8 @@
@JsonIgnore
public String getSnippetBrackets () {
- this._processHighlight();
+ if (!this._processHighlight())
+ return null;
if (this.processed && this.snippetBrackets != null)
return this.snippetBrackets;
@@ -786,7 +802,7 @@
};
- private void _processHighlightSpans (boolean leftTokenContext,
+ private boolean _processHighlightSpans (boolean leftTokenContext,
boolean rightTokenContext) {
int startOffsetChar,
endOffsetChar,
@@ -797,15 +813,20 @@
int ldid = this.localDocID;
+ if (this.positionsToOffset == null)
+ return false;
+
// Match position
startPosChar = this.positionsToOffset.start(ldid, this.startPos);
-
// Check potential differing start characters
// e.g. from element spans
if (potentialStartPosChar != -1 && startPosChar > potentialStartPosChar)
startPosChar = potentialStartPosChar;
endPosChar = this.positionsToOffset.end(ldid, this.endPos - 1);
+ log.trace("startPosChar for PTO is {}({})", startPosChar, this.startPos);
+ log.trace("endPosChar for PTO is {}({})", endPosChar, this.endPos);
+
if (endPosChar < potentialEndPosChar)
endPosChar = potentialEndPosChar;
@@ -853,9 +874,6 @@
if (endOffsetChar != -1 && endOffsetChar < endPosChar)
endOffsetChar = endPosChar;
- this.startOffsetChar = startOffsetChar;
-
-
log.trace("Offsetposition {} till {} with contexts {} and {}", startOffsetChar, endOffsetChar, leftContextOffset, rightContextOffset);
if (endOffsetChar > -1 && endOffsetChar < this.getPrimaryDataLength()) {
@@ -866,8 +884,6 @@
endMore = false;
};
- // log.trace("Temporary snippet is \"{}\"", this.tempSnippet);
-
if (this.span == null)
this.span = new LinkedList<int[]>();
@@ -881,9 +897,9 @@
// highlights
// -- I'm not sure about this.
if (this.highlight != null) {
- for (int[] highlight : this.highlight) {
- int start = this.positionsToOffset.start(ldid, highlight[0]) - startOffsetChar;
- int end = this.positionsToOffset.end(ldid, highlight[1]) - startOffsetChar;
+ for (Highlight highlight : this.highlight) {
+ int start = this.positionsToOffset.start(ldid, highlight.start) - startOffsetChar;
+ int end = this.positionsToOffset.end(ldid, highlight.end) - startOffsetChar;
if (start < 0 || end < 0)
continue;
@@ -891,7 +907,7 @@
intArray = new int[]{
start,
end,
- highlight[2],
+ highlight.number,
0 // Dummy value for later
};
@@ -902,6 +918,16 @@
this.span.add(intArray);
};
};
+ return true;
+ };
+
+ // Identical to KorapResult
+ public String getError () {
+ return this.error;
+ };
+
+ public void setError (String msg) {
+ this.error = msg;
};
@@ -909,6 +935,10 @@
public String toJSON () {
ObjectNode json = (ObjectNode) mapper.valueToTree(this);
+ // Match was no match
+ if (json.size() == 0)
+ return "{}";
+
ArrayNode leftContext = mapper.createArrayNode();
leftContext.add(this.leftTokenContext ? "token" : "char");
leftContext.add(this.leftContextOffset);
diff --git a/src/main/java/de/ids_mannheim/korap/index/MatchIdentifier.java b/src/main/java/de/ids_mannheim/korap/index/MatchIdentifier.java
new file mode 100644
index 0000000..c5a65f1
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/index/MatchIdentifier.java
@@ -0,0 +1,156 @@
+package de.ids_mannheim.korap.index;
+import java.util.*;
+import java.util.regex.*;
+
+
+/**
+ * Parses and serializes match identifiers of the form
+ * <code>match-[corpusID!]docID-pSTART-END[(CLASS)START-END]*</code>.
+ *
+ * Identifier strings that can not be parsed leave the object without
+ * a document ID, in which case {@link #toString()} returns null.
+ */
+public class MatchIdentifier {
+    private String corpusID, docID;
+    private int startPos, endPos = 0;
+
+    private ArrayList<int[]> pos = new ArrayList<>(8);
+
+    // Compiled once per class instead of once per identifier
+    static final Pattern idRegex = Pattern.compile(
+        "^match-(?:([^!]+?)!)?" +
+        "([^!]+)-p([0-9]+)-([0-9]+)" +
+        "((?:\\(-?[0-9]+\\)-?[0-9]+--?[0-9]+)*)" +
+        "(?:c.+?)?$");
+    static final Pattern posRegex = Pattern.compile(
+        "\\(([0-9]+)\\)([0-9]+)-([0-9]+)");
+
+    public MatchIdentifier () {};
+
+    /**
+     * Parse a match identifier string.
+     * Null, malformed or numerically overflowing identifiers
+     * are ignored and leave the object empty.
+     *
+     * @param id The serialized match identifier.
+     */
+    public MatchIdentifier (String id) {
+        if (id == null)
+            return;
+
+        Matcher matcher = idRegex.matcher(id);
+        if (!matcher.matches())
+            return;
+
+        try {
+            // Parse both numbers first, so nothing is set on failure
+            int start = Integer.parseInt(matcher.group(3));
+            int end = Integer.parseInt(matcher.group(4));
+            this.setCorpusID(matcher.group(1));
+            this.setDocID(matcher.group(2));
+            this.setStartPos(start);
+            this.setEndPos(end);
+        }
+        catch (NumberFormatException e) {
+            // Position exceeds the int range - treat the id as invalid
+            return;
+        };
+
+        if (matcher.group(5) == null)
+            return;
+
+        // Only well-formed (class)start-end triples are accepted
+        matcher = posRegex.matcher(matcher.group(5));
+        while (matcher.find()) {
+            try {
+                this.addPos(
+                    Integer.parseInt(matcher.group(2)),
+                    Integer.parseInt(matcher.group(3)),
+                    Integer.parseInt(matcher.group(1))
+                );
+            }
+            catch (NumberFormatException e) {
+                // Skip triples with numbers exceeding the int range
+                continue;
+            };
+        };
+    };
+
+    public String getCorpusID () {
+        return this.corpusID;
+    };
+
+    // Null and ids containing the separator '!' are ignored
+    public void setCorpusID (String id) {
+        if (id != null && !id.contains("!"))
+            this.corpusID = id;
+    };
+
+    public String getDocID () {
+        return this.docID;
+    };
+
+    // Null and ids containing the separator '!' are ignored
+    public void setDocID (String id) {
+        if (id != null && !id.contains("!"))
+            this.docID = id;
+    };
+
+    public int getStartPos () {
+        return this.startPos;
+    };
+
+    // Negative positions are ignored
+    public void setStartPos (int pos) {
+        if (pos >= 0)
+            this.startPos = pos;
+    };
+
+    public int getEndPos () {
+        return this.endPos;
+    };
+
+    // Negative positions are ignored
+    public void setEndPos (int pos) {
+        if (pos >= 0)
+            this.endPos = pos;
+    };
+
+    // Triples with a negative start, end or class number are ignored
+    public void addPos(int start, int end, int number) {
+        if (start >= 0 && end >= 0 && number >= 0)
+            this.pos.add(new int[]{start, end, number});
+    };
+
+    public ArrayList<int[]> getPos () {
+        return this.pos;
+    };
+
+    /**
+     * Serialize the identifier.
+     *
+     * @return The identifier string, or null if no document ID is set.
+     */
+    public String toString () {
+        if (this.docID == null) return null;
+
+        StringBuilder sb = new StringBuilder("match-");
+
+        // Get prefix string corpus/doc
+        if (this.corpusID != null)
+            sb.append(this.corpusID).append('!');
+        sb.append(this.docID);
+
+        sb.append("-p");
+        sb.append(this.startPos).append('-').append(this.endPos);
+
+        // Get Position information
+        for (int[] i : this.pos) {
+            sb.append('(').append(i[2]).append(')');
+            sb.append(i[0]).append('-').append(i[1]);
+        };
+
+        // Note: the trailing "c..." part tolerated by idRegex is not emitted
+        return sb.toString();
+    };
+};
diff --git a/src/main/java/de/ids_mannheim/korap/index/PositionsToOffset.java b/src/main/java/de/ids_mannheim/korap/index/PositionsToOffset.java
index c817449..1fa1b35 100644
--- a/src/main/java/de/ids_mannheim/korap/index/PositionsToOffset.java
+++ b/src/main/java/de/ids_mannheim/korap/index/PositionsToOffset.java
@@ -133,6 +133,15 @@
return this.offsets.get(ptoa);
};
+ public void addOffset (int docID,
+ int pos,
+ int startOffset,
+ int endOffset) {
+ offsets.put(
+ new PositionsToOffsetArray(docID, pos),
+ new Integer[]{startOffset, endOffset}
+ );
+ };
public HashMap<PositionsToOffsetArray, Integer[]> offsets () {
if (processed)
@@ -170,7 +179,10 @@
if (termsEnum.seekExact(term.bytes(), true)) {
- log.trace("Search for {} in doc {} with pos {}", term.toString(), posDoc.docID, posDoc.pos);
+ log.trace("Search for {} in doc {} with pos {}",
+ term.toString(),
+ posDoc.docID,
+ posDoc.pos);
// Start an iterator to fetch all payloads of the term
DocsAndPositionsEnum docs = termsEnum.docsAndPositions(
diff --git a/src/main/java/de/ids_mannheim/korap/index/SpanInfo.java b/src/main/java/de/ids_mannheim/korap/index/SpanInfo.java
new file mode 100644
index 0000000..ffd795f
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/index/SpanInfo.java
@@ -0,0 +1,98 @@
+package de.ids_mannheim.korap.index;
+import de.ids_mannheim.korap.index.TermInfo;
+import de.ids_mannheim.korap.KorapMatch;
+import de.ids_mannheim.korap.index.PositionsToOffset;
+
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.*;
+
+/**
+ * Collects the term information of a span and supplements missing
+ * character offsets from the collected position terms.
+ */
+public class SpanInfo {
+    ArrayList<TermInfo> terms;
+    HashMap<Integer,Integer> startChar, endChar;
+    PositionsToOffset pto;
+    int localDocID;
+
+    // Logger
+    // NOTE(review): registered under KorapMatch.class - presumably copied from there, confirm
+    private final static Logger log = LoggerFactory.getLogger(KorapMatch.class);
+
+    public SpanInfo (PositionsToOffset pto, int localDocID) {
+        this.terms = new ArrayList<TermInfo>(64);
+        this.startChar = new HashMap<Integer,Integer>(16);
+        this.endChar = new HashMap<Integer,Integer>(16);
+        this.pto = pto;
+        this.localDocID = localDocID;
+    };
+
+    /**
+     * Add a term. Terms of type "pos" only contribute their character
+     * offsets, all other terms are stored for retrieval by getTerms().
+     */
+    public void add (TermInfo info) {
+        info.analyze();
+        // Compare by value - "!=" on strings only tests object identity
+        if (!"pos".equals(info.getType())) {
+            this.terms.add(info);
+        }
+        else {
+            this.startChar.put(info.getStartPos(), info.getStartChar());
+            this.endChar.put(info.getEndPos(), info.getEndChar());
+        };
+    };
+
+    /**
+     * Get the sorted list of terms. Terms lacking character offsets
+     * receive them from the "pos" terms, if available.
+     */
+    public ArrayList<TermInfo> getTerms () {
+
+        // Sort terms (this will also analyze them!)
+        Collections.sort(this.terms);
+        boolean found;
+
+        // Add character offset information to terms that are
+        // missing this information
+        for (TermInfo t : this.terms) {
+            log.trace("Check offsets for {} and {}", t.getStartPos(), t.getEndPos());
+            found = true;
+            if (t.getStartChar() == -1) {
+                if (this.startChar.containsKey(t.getStartPos()))
+                    t.setStartChar(this.startChar.get(t.getStartPos()));
+                else
+                    found = false;
+            };
+            if (t.getEndChar() == -1) {
+                if (this.endChar.containsKey(t.getEndPos()))
+                    t.setEndChar(this.endChar.get(t.getEndPos()));
+                else
+                    found = false;
+            };
+
+            // Single-position terms with known offsets are stored directly
+            if (found && t.getStartPos() == t.getEndPos()) {
+                this.pto.addOffset(
+                    this.localDocID,
+                    t.getStartPos(),
+                    t.getStartChar(),
+                    t.getEndChar()
+                );
+            }
+            // Otherwise register both boundary positions with the PositionsToOffset object
+            else {
+                if (!found)
+                    log.trace("{} can't be found!", t.getAnnotation());
+                this.pto.add(this.localDocID, t.getStartPos());
+                this.pto.add(this.localDocID, t.getEndPos());
+            };
+        };
+
+        return this.terms;
+    };
+};
diff --git a/src/main/java/de/ids_mannheim/korap/index/TermInfo.java b/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
index 92bbb53..38659af 100644
--- a/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
+++ b/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
@@ -2,6 +2,7 @@
import java.util.*;
import java.nio.ByteBuffer;
+import java.lang.StringBuffer;
import java.util.regex.*;
import de.ids_mannheim.korap.KorapMatch;
@@ -13,7 +14,7 @@
// Logger
private final static Logger log = LoggerFactory.getLogger(KorapMatch.class);
- private String foundry, layer, value, term, type;
+ private String foundry, layer, value, term, type, annotation;
// type can be "term", "pos", "span", "rel-src", "rel-target"
private int pos = 0;
@@ -33,7 +34,7 @@
public TermInfo (String term, int pos, ByteBuffer payload) {
this.term = term;
this.startPos = pos;
- this.endPos = pos + 1;
+ this.endPos = pos;
this.payload = payload;
};
@@ -84,6 +85,7 @@
log.trace("Check {} for {}", tterm, prefixRegex.toString());
matcher = prefixRegex.matcher(tterm);
if (matcher.matches() && matcher.groupCount() == 3) {
+ this.annotation = tterm;
this.foundry = matcher.group(1);
this.layer = matcher.group(2);
this.value = matcher.group(3);
@@ -105,7 +107,8 @@
// for spans and relations
if (ttype > 1)
- this.endPos = this.payload.getInt();
+ // Unsure if this is correct
+ this.endPos = this.payload.getInt() -1;
if (ttype == 2 && this.payload.hasRemaining()) {
this.depth = this.payload.get();
@@ -160,11 +163,18 @@
return this.value;
};
+ public String getAnnotation () {
+ return this.annotation;
+ };
+
@Override
public int compareTo (TermInfo obj) {
this.analyze();
obj.analyze();
+ // TODO: This sorting does not seem to work!
+ // although it might only be important for depth stuff.
+
if (this.startChar < obj.startChar) {
return -1;
}
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java b/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java
index d46564f..f6cface 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java
@@ -183,12 +183,14 @@
System.err.println(kr.toJSON());
*/
- kr = ki.search(query, 0, (short) 5, true, (short) 2, false, (short) 5);
+ kr = ki.search(query, 0, (short) 50, true, (short) 2, false, (short) 5);
+
+// System.err.println(kr.toJSON());
// System.out.println(query.toString());
// System.out.println(kr.match(37));
assertEquals(38, kr.totalResults());
- assertEquals(5, kr.itemsPerPage());
+ assertEquals(50, kr.itemsPerPage());
assertEquals("... Buchstabe des [{1:{2:lateinischen} Alphabets}] und ...", kr.match(0).getSnippetBrackets());
assertEquals("... Texten eine [{1:{2:durchschnittliche} Häufigkeit}] von ...", kr.match(1).getSnippetBrackets());
assertEquals("... damit der [{1:{2:sechsthäufigste} Buchstabe}] in d ...", kr.match(2).getSnippetBrackets());
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
index bce6751..e0eee30 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
@@ -7,10 +7,13 @@
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
+import de.ids_mannheim.korap.index.MatchIdentifier;
+
import de.ids_mannheim.korap.KorapIndex;
import de.ids_mannheim.korap.KorapQuery;
import de.ids_mannheim.korap.KorapSearch;
import de.ids_mannheim.korap.KorapResult;
+import de.ids_mannheim.korap.KorapMatch;
import de.ids_mannheim.korap.index.FieldDocument;
@@ -18,28 +21,60 @@
public class TestMatchIdentifier {
@Test
+ public void identifierExample1 () throws IOException {
+ // Checks parsing and serialisation of match identifiers of the form "match-<corpus>!<doc>-p<start>-<end>"
+ MatchIdentifier id = new MatchIdentifier("match-c1!d1-p4-20");
+ assertEquals("c1", id.getCorpusID());
+ assertEquals("d1", id.getDocID());
+ assertEquals(4, id.getStartPos());
+ assertEquals(20, id.getEndPos());
+ // addPos(start, end, number) is expected to append "(number)start-end" to the serialised form
+ assertEquals("match-c1!d1-p4-20", id.toString());
+ id.addPos(10,14,2);
+ assertEquals("match-c1!d1-p4-20(2)10-14", id.toString());
+ id.addPos(11,12,5);
+ assertEquals("match-c1!d1-p4-20(2)10-14(5)11-12", id.toString());
+ // Positions with a negative start, end or number are expected to be ignored
+ id.addPos(11,12,-8);
+ assertEquals("match-c1!d1-p4-20(2)10-14(5)11-12", id.toString());
+ id.addPos(11,-12,8);
+ assertEquals("match-c1!d1-p4-20(2)10-14(5)11-12", id.toString());
+ id.addPos(-11,12,8);
+ assertEquals("match-c1!d1-p4-20(2)10-14(5)11-12", id.toString());
+ // Malformed input: a wrong prefix or missing IDs give a null toString(); a missing corpus ID alone only leaves getCorpusID() null
+ id = new MatchIdentifier("matc-c1!d1-p4-20");
+ assertNull(id.toString());
+ id = new MatchIdentifier("match-d1-p4-20");
+ assertNull(id.getCorpusID());
+ assertEquals("d1", id.getDocID());
+ id = new MatchIdentifier("match-p4-20");
+ assertNull(id.toString());
+ // Round trips: inner positions containing negative values are expected to be dropped on parsing
+ id = new MatchIdentifier("match-c1!d1-p4-20");
+ assertEquals("match-c1!d1-p4-20", id.toString());
+ id = new MatchIdentifier("match-c1!d1-p4-20(5)7-8");
+ assertEquals("match-c1!d1-p4-20(5)7-8", id.toString());
+
+ id = new MatchIdentifier("match-c1!d1-p4-20(5)7-8(-2)9-10");
+ assertEquals("match-c1!d1-p4-20(5)7-8", id.toString());
+
+ id = new MatchIdentifier("match-c1!d1-p4-20(5)7-8(-2)9-10(2)3-4(3)-5-6");
+ assertEquals("match-c1!d1-p4-20(5)7-8(2)3-4", id.toString());
+
+ id = new MatchIdentifier("match-c1!d1-p4-20(5)7-8(-2)9-10(2)3-4(3)-5-6(4)7-8");
+ assertEquals("match-c1!d1-p4-20(5)7-8(2)3-4(4)7-8", id.toString());
+
+ id = new MatchIdentifier("match-c1!d1-p4-20(5)7-8(-2)9-10(2)3-4(3)-5-6(4)7-8(5)9--10");
+ assertEquals("match-c1!d1-p4-20(5)7-8(2)3-4(4)7-8", id.toString());
+ };
+
+ @Test
public void indexExample1 () throws IOException {
KorapIndex ki = new KorapIndex();
-
- // abcabcabac
- FieldDocument fd = new FieldDocument();
- fd.addTV("base",
- "abcabcabac",
- "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]" +
- "[(1-2)s:b|i:b|_1#1-2]" +
- "[(2-3)s:c|i:c|_2#2-3]" +
- "[(3-4)s:a|i:a|_3#3-4]" +
- "[(4-5)s:b|i:b|_4#4-5]" +
- "[(5-6)s:c|i:c|_5#5-6]" +
- "[(6-7)s:a|i:a|_6#6-7]" +
- "[(7-8)s:b|i:b|_7#7-8]" +
- "[(8-9)s:a|i:a|_8#8-9]" +
- "[(9-10)s:c|i:c|_9#9-10]");
- ki.addDoc(fd);
-
+ ki.addDoc(createSimpleFieldDoc());
ki.commit();
- KorapQuery kq = new KorapQuery("base");
+ KorapQuery kq = new KorapQuery("tokens");
KorapSearch ks = new KorapSearch(kq._(2,kq.seq(kq.seg("s:b")).append(kq._(kq.seg("s:a")))));
KorapResult kr = ki.search(ks);
@@ -47,14 +82,29 @@
assertEquals("StartPos (0)", 7, kr.match(0).startPos);
assertEquals("EndPos (0)", 9, kr.match(0).endPos);
- assertEquals("SnippetBrackets (0)", "... bcabca[{2:b{a}}]c", kr.match(0).snippetBrackets());
+ KorapMatch km = kr.match(0);
- assertEquals("ID (0)", "match-0-p7-9(0)8-8(2)7-8c7-9(0)8-9(2)7-9", kr.match(0).getID());
+ assertEquals("SnippetBrackets (0)", "... bcabca[{2:b{a}}]c", km.snippetBrackets());
+ assertEquals("ID (0)", "match-c1!d1-p7-9(0)8-8(2)7-8", km.getID());
};
-
@Test
public void indexExample2 () throws IOException {
+ final KorapIndex index = new KorapIndex();
+ index.addDoc(createSimpleFieldDoc());
+ index.commit();
+ // Fetch the match purely from its identifier, without running a search first
+ final String matchID = "match-c1!d1-p7-9(0)8-8(2)7-8";
+ final KorapMatch match = index.getMatch(matchID);
+ // The retrieved match must report the expected span, snippet and identifier
+ assertEquals("StartPos (0)", 7, match.getStartPos());
+ assertEquals("EndPos (0)", 9, match.getEndPos());
+ assertEquals("SnippetBrackets (0)", "... [{2:b{a}}] ...", match.snippetBrackets());
+ assertEquals("ID (0)", matchID, match.getID());
+ };
+
+ @Test // smoke test: constructs an index from the test files and commits it
+ public void indexExample3 () throws IOException {
// Construct index
KorapIndex ki = new KorapIndex();
// Indexing test files
@@ -64,7 +114,26 @@
);
};
ki.commit();
- // System.err.println(ki.getMatchInfo("test", "", "", true, true).toJSON());
+ // System.err.println(ki.getMatchInfo("xxx", null, null, true, true).toJSON());
};
+
+ private FieldDocument createSimpleFieldDoc(){
+ // Sample document "d1" of corpus "c1": primary text "abcabcabac" with one bracketed token entry per character
+ final String tokenStream = "[(0-1)s:a|i:a|f/m:eins|_0#0-1|-:t$<i>10]" +
+ "[(1-2)s:b|i:b|f/m:zwei|_1#1-2]" +
+ "[(2-3)s:c|i:c|f/m:drei|_2#2-3]" +
+ "[(3-4)s:a|i:a|f/m:vier|_3#3-4]" +
+ "[(4-5)s:b|i:b|f/m:fuenf|_4#4-5]" +
+ "[(5-6)s:c|i:c|f/m:sechs|_5#5-6]" +
+ "[(6-7)s:a|i:a|f/m:sieben|_6#6-7]" +
+ "[(7-8)s:b|i:b|f/m:acht|_7#7-8]" +
+ "[(8-9)s:a|i:a|f/m:neun|_8#8-9]" +
+ "[(9-10)s:c|i:c|f/m:zehn|_9#9-10]";
+ final FieldDocument doc = new FieldDocument();
+ doc.addString("corpusID", "c1");
+ doc.addString("ID", "d1");
+ doc.addTV("tokens", "abcabcabac", tokenStream);
+ return doc;
+ };
};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestTermInfo.java b/src/test/java/de/ids_mannheim/korap/index/TestTermInfo.java
index b51c5c9..a52455b 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestTermInfo.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestTermInfo.java
@@ -31,7 +31,7 @@
assertEquals("foundry", term.getFoundry(), "mate");
assertEquals("layer", term.getLayer(), "p");
assertEquals("startPos", term.getStartPos(), 4);
- assertEquals("endPos", term.getEndPos(), 7);
+ assertEquals("endPos", term.getEndPos(), 6);
assertEquals("startChar", term.getStartChar(), 20);
assertEquals("endChar", term.getEndChar(), 25);
assertEquals("depth", term.getDepth(), (byte) 4);
@@ -43,7 +43,7 @@
assertEquals("foundry", term.getFoundry(), "mate");
assertEquals("layer", term.getLayer(), "p");
assertEquals("startPos", term.getStartPos(), 9);
- assertEquals("endPos", term.getEndPos(), 10);
+ assertEquals("endPos", term.getEndPos(), 9);
assertEquals("startChar", term.getStartChar(), -1);
assertEquals("endChar", term.getEndChar(), -1);
assertEquals("depth", term.getDepth(), 0);
@@ -56,7 +56,7 @@
assertEquals("foundry", term.getFoundry(), "xip");
assertEquals("layer", term.getLayer(), "p");
assertEquals("startPos", term.getStartPos(), 11);
- assertEquals("endPos", term.getEndPos(), 17);
+ assertEquals("endPos", term.getEndPos(), 16);
assertEquals("startChar", term.getStartChar(), -1);
assertEquals("endChar", term.getEndChar(), -1);
assertEquals("depth", term.getDepth(), 0);
@@ -69,7 +69,7 @@
assertEquals("foundry", term.getFoundry(), "xip");
assertEquals("layer", term.getLayer(), "m");
assertEquals("startPos", term.getStartPos(), 20);
- assertEquals("endPos", term.getEndPos(), 24);
+ assertEquals("endPos", term.getEndPos(), 23);
assertEquals("startChar", term.getStartChar(), -1);
assertEquals("endChar", term.getEndChar(), -1);
assertEquals("depth", term.getDepth(), 0);
@@ -82,11 +82,9 @@
assertNull("foundry", term.getFoundry());
assertNull("layer", term.getLayer());
assertEquals("startPos", term.getStartPos(), 30);
- assertEquals("endPos", term.getEndPos(), 31);
+ assertEquals("endPos", term.getEndPos(), 30);
assertEquals("startChar", term.getStartChar(), 240);
assertEquals("endChar", term.getEndChar(), 400);
assertEquals("depth", term.getDepth(), 0);
-
-
};
};
\ No newline at end of file