Added spans and dependency relations to match retrieval
diff --git a/src/main/java/de/ids_mannheim/korap/KorapIndex.java b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
index 095d2d5..037452a 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
@@ -502,22 +502,23 @@
// Todo: Only support one direction!
if (includeSpans)
- regex.append("((\"<>\"|\"<\"|\">\")\":\")?");
+ regex.append("((\">\"|\"<\"\">\"?)\":\")?");
if (foundry != null) {
regex.append(foundry).append('/');
if (layer != null)
regex.append(layer).append(":");
}
else if (includeSpans) {
- regex.append("([^-is]+?|[-is][^:])");
+ regex.append("([^-is]|[-is][^:])");
}
else {
- regex.append("([^-is<>]+?|([-is<>]|\"<>\")[^:])");
+ regex.append("([^-is<>]|([-is>][^:])|<[^:>])");
};
regex.append("(.){1,}|_[0-9]+");
+
log.trace("The final regexString is {}", regex.toString());
- RegExp regexObj = new RegExp(regex.toString());
+ RegExp regexObj = new RegExp(regex.toString(), RegExp.COMPLEMENT);
fst = new CompiledAutomaton(regexObj.toAutomaton());
log.trace("The final regexObj is {}", regexObj.toString());
};
@@ -600,6 +601,8 @@
// How often does this term occur in the document?
int termOccurrences = docs.freq();
+ // log.trace("I found {} documents with this term", termOccurrences);
+
// String representation of the term
String termString = termsEnum.term().utf8ToString();
@@ -610,13 +613,17 @@
int pos = docs.nextPosition();
// Check, if the position of the term is in the interesting area
+
+ // log.trace("Check position!");
+
if (pos >= match.getStartPos() && pos < match.getEndPos()) {
log.trace(
">> {}: {}-{}-{}",
termString,
docs.freq(),
- pos, docs.getPayload()
+ pos,
+ docs.getPayload()
);
BytesRef payload = docs.getPayload();
@@ -630,8 +637,11 @@
payload.length
);
};
-
- termList.add(new TermInfo(termString, pos, bbTerm));
+ TermInfo ti = new TermInfo(termString, pos, bbTerm).analyze();
+ if (ti.getEndPos() < match.getEndPos()) {
+ log.trace("Add {}", ti.toString());
+ termList.add(ti);
+ };
};
};
};
@@ -649,6 +659,10 @@
if (t.getType() == "term" || t.getType() == "span")
match.addAnnotation(t.getStartPos(), t.getEndPos(), t.getAnnotation());
+ // Terms of type "relSrc" are added as relations. The type is compared
+ // by value, as == only holds for identical String objects.
+ else if ("relSrc".equals(t.getType()))
+ match.addRelation(t.getStartPos(), t.getEndPos(), t.getAnnotation());
};
break;
diff --git a/src/main/java/de/ids_mannheim/korap/KorapMatch.java b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
index 15627de..31ccd5d 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapMatch.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
@@ -14,6 +14,7 @@
import static de.ids_mannheim.korap.util.KorapHTML.*;
import de.ids_mannheim.korap.index.MatchIdentifier;
+import de.ids_mannheim.korap.index.PosIdentifier;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -24,6 +25,8 @@
/*
Todo: The implemented classes and private names are horrible!
Refactor, future-me!
+
+ The number based Highlighttype is ugly - UGLY!
*/
/**
@@ -56,8 +59,14 @@
@JsonIgnore
public int localDocID = -1;
- HashMap<Integer, String> annotationNumber = new HashMap<>(16);
+ HashMap<Integer, String> annotationNumber = new HashMap<>(16);
+ HashMap<Integer, Relation> relationNumber = new HashMap<>(16);
+ HashMap<Integer, Integer> identifierNumber = new HashMap<>(16);
+
+ // -1 is match highlight
int annotationNumberCounter = 256;
+ int relationNumberCounter = 2048;
+ int identifierNumberCounter = -2;
@JsonIgnore
public boolean leftTokenContext,
@@ -116,29 +125,55 @@
this.setEndPos(id.getEndPos());
if (includeHighlights)
- for (int[] pos : id.getPos())
+ for (int[] pos : id.getPos()) {
+ if (pos[0] < id.getStartPos() || pos[1] > id.getEndPos())
+ continue;
+
this.addHighlight(pos[0], pos[1], pos[2]);
+ };
};
private class Highlight {
public int start, end;
public int number = -1;
+ // Relational highlight
+ public Highlight (int start, int end, String annotation, int ref) {
+ this.start = start;
+ this.end = end;
+ // TODO: This can overflow!
+ this.number = relationNumberCounter++;
+ relationNumber.put(this.number, new Relation(annotation, ref));
+ };
+
+ // Span highlight
public Highlight (int start, int end, String annotation) {
this.start = start;
this.end = end;
// TODO: This can overflow!
- this.number = annotationNumberCounter++;
- log.trace("Add annotation: {} ({})", annotation, this.number);
- annotationNumber.put(this.number, annotation);
+ if (annotationNumberCounter < 2048) {
+ this.number = annotationNumberCounter++;
+ annotationNumber.put(this.number, annotation);
+ };
};
+ // Simple highlight
public Highlight (int start, int end, int number) {
this.start = start;
this.end = end;
this.number = number;
};
- }
+ };
+
+ private class Relation {
+ public int ref;
+ public String annotation;
+ public Relation (String annotation, int ref) {
+ this.annotation = annotation;
+ this.ref = ref;
+ };
+ };
+
/**
* Insert a highlight for the snippet view by means of positional
@@ -180,6 +215,13 @@
this.addHighlight(new Highlight(start, end, annotation));
};
+ public void addRelation (int src, int target, String annotation) {
+ this.addHighlight(new Highlight(src, src, annotation, target));
+ int id = identifierNumberCounter--;
+ identifierNumber.put(id, target);
+ this.addHighlight(new Highlight(target, target, id));
+ };
+
public void populateDocument (Document doc, String field, HashSet<String> fields) {
@@ -294,6 +336,31 @@
return (this.identifier = id.toString());
};
+ /**
+  * Get the identifier of a single position in the document
+  * of this match.
+  *
+  * @param pos the position to identify
+  * @return the position identifier string, or null if the match
+  *         has no local document ID
+  */
+ @JsonIgnore
+ public String getPosID (int pos) {
+ // Do not short-circuit on this.identifier here: that field caches
+ // the match identifier (see the assignment above), which would be
+ // returned for every position once it is set.
+ if (this.localDocID == -1)
+ return null;
+
+ PosIdentifier id = new PosIdentifier();
+
+ // Get prefix string corpus/doc
+ id.setCorpusID(this.getCorpusID());
+ id.setDocID(this.getDocID());
+ id.setPos(pos);
+
+ return id.toString();
+ };
+
private void _reset () {
this.processed = false;
this.snippetHTML = null;
@@ -435,18 +495,40 @@
};
// Return html fragment for this combinator element
- public String toHTML (FixedBitSet level, byte[] levelCache) {
+ public String toHTML (KorapMatch match, FixedBitSet level, byte[] levelCache) {
// Opening
if (this.type == 1) {
StringBuilder sb = new StringBuilder();
if (this.number == -1) {
sb.append("<span class=\"match\">");
}
- else if (this.number >= 256) {
- sb.append("<span title=\"")
- .append(annotationNumber.get(this.number))
+
+ else if (this.number < -1) {
+ sb.append("<span xml:id=\"")
+ .append(match.getPosID(
+ identifierNumber.get(this.number)))
.append("\">");
}
+
+ else if (this.number >= 256) {
+ sb.append("<span ");
+ if (this.number < 2048) {
+ sb.append("title=\"")
+ .append(annotationNumber.get(this.number))
+ .append('"');
+ }
+ else {
+ Relation rel = relationNumber.get(this.number);
+ sb.append("xlink:title=\"")
+ .append(rel.annotation)
+ .append('"');
+ sb.append(" xlink:type=\"simple\"");
+ sb.append(" xlink:href=\"#");
+ sb.append(match.getPosID(rel.ref));
+ sb.append('"');
+ };
+ sb.append('>');
+ }
else {
// Get the first free level slot
byte pos;
@@ -468,7 +550,7 @@
}
// Closing
else if (this.type == 2) {
- if (this.number == -1 || this.number >= 256)
+ if (this.number <= -1 || this.number >= 256)
return "</span>";
if (this.terminal)
@@ -484,13 +566,32 @@
public String toBrackets () {
if (this.type == 1) {
StringBuilder sb = new StringBuilder();
+
+ // Match
if (this.number == -1) {
sb.append("[");
}
+
+ // Identifier
+ else if (this.number < -1) {
+ sb.append("{#");
+ sb.append(identifierNumber.get(this.number));
+ sb.append(':');
+ }
+
+ // Highlight, Relation, Span
else {
sb.append("{");
- if (this.number >= 256)
- sb.append(annotationNumber.get(this.number)).append(':');
+ if (this.number >= 256) {
+ if (this.number < 2048)
+ sb.append(annotationNumber.get(this.number));
+ else {
+ Relation rel = relationNumber.get(this.number);
+ sb.append(rel.annotation);
+ sb.append('>').append(rel.ref);
+ };
+ sb.append(':');
+ }
else if (this.number != 0)
sb.append(this.number).append(':');
};
@@ -704,7 +805,7 @@
sb.append("<span class=\"more\"></span>");
if (elem.type == 0) {
- sb.append(elem.toHTML(level, levelCache));
+ sb.append(elem.toHTML(this, level, levelCache));
start++;
};
sb.append("</span>");
@@ -716,7 +817,7 @@
// Create context, if trhere is any
rightContext.append("<span class=\"context-right\">");
if (elem != null && elem.type == 0) {
- rightContext.append(elem.toHTML(level, levelCache));
+ rightContext.append(elem.toHTML(this, level, levelCache));
end--;
};
if (endMore)
@@ -724,7 +825,7 @@
rightContext.append("</span>");
for (short i = start; i < end; i++) {
- sb.append(this.snippetStack.get(i).toHTML(level,levelCache));
+ sb.append(this.snippetStack.get(i).toHTML(this, level,levelCache));
};
sb.append(rightContext);
diff --git a/src/main/java/de/ids_mannheim/korap/index/DocIdentifier.java b/src/main/java/de/ids_mannheim/korap/index/DocIdentifier.java
new file mode 100644
index 0000000..b66d06a
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/index/DocIdentifier.java
@@ -0,0 +1,50 @@
+package de.ids_mannheim.korap.index;
+import java.util.*;
+import java.util.regex.*;
+
+
+/**
+ * Identifier of a document, consisting of a corpus ID and a document ID.
+ *
+ * The setters silently ignore null values and values containing '!',
+ * keeping the previously set value.
+ */
+public class DocIdentifier {
+    protected String corpusID, docID;
+
+    /**
+     * @return the corpus ID, or null if none was set
+     */
+    public String getCorpusID () {
+        return this.corpusID;
+    };
+
+    /**
+     * Set the corpus ID.
+     *
+     * @param id the new corpus ID; ignored if null or containing '!'
+     */
+    public void setCorpusID (String id) {
+        if (id != null && !id.contains("!"))
+            this.corpusID = id;
+    };
+
+    /**
+     * @return the document ID, or null if none was set
+     */
+    public String getDocID () {
+        return this.docID;
+    };
+
+    /**
+     * Set the document ID.
+     *
+     * @param id the new document ID; ignored if null or containing '!'
+     */
+    public void setDocID (String id) {
+        // Guard against null like setCorpusID does, instead of
+        // failing with a NullPointerException
+        if (id != null && !id.contains("!"))
+            this.docID = id;
+    };
+};
diff --git a/src/main/java/de/ids_mannheim/korap/index/MatchIdentifier.java b/src/main/java/de/ids_mannheim/korap/index/MatchIdentifier.java
index c5a65f1..6b22ad9 100644
--- a/src/main/java/de/ids_mannheim/korap/index/MatchIdentifier.java
+++ b/src/main/java/de/ids_mannheim/korap/index/MatchIdentifier.java
@@ -1,10 +1,10 @@
package de.ids_mannheim.korap.index;
import java.util.*;
import java.util.regex.*;
+import de.ids_mannheim.korap.index.DocIdentifier;
-public class MatchIdentifier {
- private String corpusID, docID;
+public class MatchIdentifier extends DocIdentifier {
private int startPos, endPos = 0;
private ArrayList<int[]> pos = new ArrayList<>(8);
@@ -40,24 +40,6 @@
};
};
- public String getCorpusID () {
- return this.corpusID;
- };
-
- public void setCorpusID (String id) {
- if (id != null && !id.contains("!"))
- this.corpusID = id;
- };
-
- public String getDocID () {
- return this.docID;
- };
-
- public void setDocID (String id) {
- if (!id.contains("!"))
- this.docID = id;
- };
-
public int getStartPos () {
return this.startPos;
};
diff --git a/src/main/java/de/ids_mannheim/korap/index/PosIdentifier.java b/src/main/java/de/ids_mannheim/korap/index/PosIdentifier.java
new file mode 100644
index 0000000..0bf6e90
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/index/PosIdentifier.java
@@ -0,0 +1,56 @@
+package de.ids_mannheim.korap.index;
+import java.util.*;
+import de.ids_mannheim.korap.index.DocIdentifier;
+
+/**
+ * Identifier of a single position inside a document.
+ */
+public class PosIdentifier extends DocIdentifier {
+    private int pos;
+
+    public PosIdentifier () {};
+
+    /**
+     * Set the position.
+     *
+     * @param pos the position; negative values are ignored
+     */
+    public void setPos (int pos) {
+        if (pos >= 0)
+            this.pos = pos;
+    };
+
+    /**
+     * @return the position (0 if none was set)
+     */
+    public int getPos () {
+        return this.pos;
+    };
+
+    /**
+     * Serialize the identifier as
+     * <tt>word-[corpusID!]docID-p[pos]</tt>.
+     *
+     * @return the identifier string, or null if no document ID is set
+     */
+    @Override
+    public String toString () {
+
+        if (this.docID == null) return null;
+
+        // The buffer never leaves this method, so the unsynchronized
+        // StringBuilder is sufficient
+        StringBuilder sb = new StringBuilder("word-");
+
+        // Get prefix string corpus/doc
+        if (this.corpusID != null) {
+            sb.append(this.corpusID).append('!');
+        };
+        sb.append(this.docID);
+
+        sb.append("-p");
+        sb.append(this.pos);
+
+        return sb.toString();
+    };
+};
\ No newline at end of file
diff --git a/src/main/java/de/ids_mannheim/korap/index/TermInfo.java b/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
index 38659af..b9a894a 100644
--- a/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
+++ b/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
@@ -28,7 +28,9 @@
private byte depth = (byte) 0;
- private Pattern prefixRegex = Pattern.compile("([^/]+)/([^:]+):(.+?)");
+ // Compiled once for the class instead of once per TermInfo instance;
+ // matches an optional "foundry/" prefix, the layer and an optional ":value"
+ private static final Pattern prefixRegex = Pattern.compile("(?:([^/]+)/)?([^:/]+)(?::(.+?))?");
private Matcher matcher;
public TermInfo (String term, int pos, ByteBuffer payload) {
@@ -44,6 +44,7 @@
int ttype = 0;
String tterm = this.term;
+ int lastPos = this.payload.position();
this.payload.rewind();
switch (tterm.charAt(0)) {
@@ -86,7 +87,10 @@
matcher = prefixRegex.matcher(tterm);
if (matcher.matches() && matcher.groupCount() == 3) {
this.annotation = tterm;
- this.foundry = matcher.group(1);
+ if (matcher.group(1) != null)
+ this.foundry = matcher.group(1);
+ else
+ this.foundry = "base";
this.layer = matcher.group(2);
this.value = matcher.group(3);
};
@@ -110,7 +114,7 @@
// Unsure if this is correct
this.endPos = this.payload.getInt() -1;
- if (ttype == 2 && this.payload.hasRemaining()) {
+ if (ttype == 2 && this.payload.position() < lastPos) {
this.depth = this.payload.get();
};
@@ -167,6 +171,35 @@
return this.annotation;
};
+ /**
+  * Serialize the term information as
+  * <tt>&lt;type&gt;foundry/layer[:value][(depth)][startPos-endPos][startChar-endChar]</tt>.
+  * The value is omitted if it is null, the depth if it is 0.
+  *
+  * @return the string representation of this term
+  */
+ @Override
+ public String toString () {
+ this.analyze();
+
+ StringBuilder sb = new StringBuilder();
+ sb.append('<').append(this.getType()).append('>');
+ sb.append(this.getFoundry()).append('/').append(this.getLayer());
+
+ if (this.getValue() != null)
+ sb.append(':').append(this.getValue());
+
+ if (this.getDepth() != (byte) 0)
+ sb.append('(').append(this.getDepth()).append(')');
+
+ sb.append('[').append(this.getStartPos());
+ sb.append('-').append(this.getEndPos()).append(']');
+ sb.append('[').append(this.getStartChar());
+ sb.append('-').append(this.getEndChar()).append(']');
+
+ return sb.toString();
+ };
+
@Override
public int compareTo (TermInfo obj) {
this.analyze();
diff --git a/src/main/resources/log4j.properties b/src/main/resources/log4j.properties
index 54e64d8..3e26bdd 100644
--- a/src/main/resources/log4j.properties
+++ b/src/main/resources/log4j.properties
@@ -9,8 +9,8 @@
#log4j.logger.de.ids_mannheim.korap.query.spans.SimpleSpans = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.query.spans.KorapTermSpan = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.query.spans.ClassSpans = TRACE, stdout
-#log4j.logger.de.ids_mannheim.korap.query.spans.MatchSpans = TRACE, stdout
-# log4j.logger.de.ids_mannheim.korap.KorapIndex = TRACE, stdout
+# log4j.logger.de.ids_mannheim.korap.query.spans.MatchSpans = TRACE, stdout
+#log4j.logger.de.ids_mannheim.korap.KorapIndex = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.KorapMatch = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.KorapFilter = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.KorapCollection = TRACE, stdout