Half-finished info retriever
diff --git a/src/main/java/de/ids_mannheim/korap/KorapIndex.java b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
index 9b740ea..1bdad67 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
@@ -73,6 +73,7 @@
import de.ids_mannheim.korap.index.FieldDocument;
import de.ids_mannheim.korap.index.PositionsToOffset;
import de.ids_mannheim.korap.index.TermInfo;
+import de.ids_mannheim.korap.index.SpanInfo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -116,9 +117,9 @@
private ObjectMapper mapper = new ObjectMapper();
- private static ByteBuffer bb = ByteBuffer.allocate(4);
+ private static ByteBuffer bb = ByteBuffer.allocate(4);
private static ByteBuffer bbOffset = ByteBuffer.allocate(8);
-
+ private static ByteBuffer bbTerm = ByteBuffer.allocate(16);
private byte[] pl = new byte[4];
@@ -450,6 +451,15 @@
return this.numberOf("tokens");
};
+
+ public KorapMatch getMatch (String id) {
+ return this.getMatchInfo(id, false, null, null, false, true);
+ };
+
+ public KorapMatch getMatchInfo (String id, String foundry, String layer, boolean includeSpans, boolean includeHighlights) {
+ return this.getMatchInfo(id, true, foundry, layer, includeSpans, includeHighlights);
+ };
+
/**
* Get a match.
* BE AWARE - THIS IS STILL A PLAYGROUND!
@@ -462,22 +472,25 @@
public KorapInfo infoOf (KorapMatch km, String prefix);
*/
- public KorapMatch getMatch (String id) {
+ public KorapMatch getMatchInfo (String id, boolean info, String foundry, String layer, boolean includeSpans, boolean includeHighlights) {
// List of terms to populate
- LinkedList<TermInfo> termList = new LinkedList<TermInfo>();
+ SpanInfo termList = new SpanInfo();
KorapMatch match = new KorapMatch();
// That's purely temporary
+ // From ID:
String corpusID = "WPD";
String docID = "WPD_AAA.00003";
- String field = "tokens"; // text field
- String foundry = "mate";
- String layer = "l";
int startPos = 25;
int endPos = 30;
- Boolean includeSpans = true;
+
+ foundry = "mate";
+ layer = "l";
+ includeSpans = true;
+
+ String field = "tokens"; // text field
// Create a filter based on the corpusID and the docID
BooleanQuery bool = new BooleanQuery();
@@ -485,114 +498,106 @@
bool.add(new TermQuery(new Term("corpusID", corpusID)), BooleanClause.Occur.MUST);
Filter filter = (Filter) new QueryWrapperFilter(bool);
- // Create an automaton for prefixed terms of interest based on a Regex
- // Todo: Ignore -: stuff!
- StringBuffer regex = new StringBuffer();
- if (includeSpans)
- regex.append("(((\"<>\"|\"<\"|\">\")\":\")?");
- else
- regex.append("[^<>]");
- if (foundry != null)
- regex.append(foundry).append('/');
- if (layer != null)
- regex.append(layer).append(":");
- regex.append("(.){1,})|_[0-9]+");
+ CompiledAutomaton fst = null;
- RegExp regexObj = new RegExp(regex.toString());
- CompiledAutomaton fst = new CompiledAutomaton(regexObj.toAutomaton());
- log.trace("The final regex is {}", regex.toString());
+ if (info) {
+ /* Create an automaton for prefixed terms of interest.
+ * You can define the necessary foundry, the necessary layer,
+ * in case the foundry is given, and if span annotations
+ * are of interest.
+ */
+ StringBuffer regex = new StringBuffer();
+
+ if (includeSpans)
+ regex.append("(((\"<>\"|\"<\"|\">\")\":\")?");
+ else
+ regex.append("[^<>-]");
+ if (foundry != null) {
+ regex.append(foundry).append('/');
+ if (layer != null)
+ regex.append(layer).append(":");
+ }
+ else if (includeSpans) {
+ regex.append("[^-]");
+ };
+ regex.append("(.){1,})|_[0-9]+");
+
+ RegExp regexObj = new RegExp(regex.toString());
+ fst = new CompiledAutomaton(regexObj.toAutomaton());
+ log.trace("The final regex is {}", regex.toString());
+ };
+
try {
-
// Iterate over all atomic indices and find the matching document
for (AtomicReaderContext atomic : this.reader().leaves()) {
- /*
- DocIdSetIterator filterIter = filter.getDocIdSet(
- atomic,
- atomic.reader().getLiveDocs()
- ).iterator();
- */
+ // Retrieve the single document of interest
DocIdSet filterSet = filter.getDocIdSet(
atomic,
atomic.reader().getLiveDocs()
);
// Create a bitset for the correct document
- // Yeah ... I know ... it could've been easier probably
- /*
- FixedBitSet bitset = new FixedBitSet(atomic.reader().numDocs());
- bitset.or(filterIter);
- */
Bits bitset = filterSet.bits();
- // Go to the matching doc
- // int localDocID = bitset.iterator().nextDoc();
+ // Go to the matching doc - and remember its ID
int localDocID = filterSet.iterator().nextDoc();
- // log.trace("Found documents {} with the docID {}", bitset.cardinality(), localDocID);
-
if (localDocID == DocIdSetIterator.NO_MORE_DOCS)
continue;
- // We've found the correct document!
+ // We've found the correct document! Hurray!
HashSet<String> fieldsToLoadLocal = new HashSet<>(fieldsToLoad);
fieldsToLoadLocal.add(field);
// Get terms from the document
Terms docTerms = atomic.reader().getTermVector(localDocID, field);
- /* ---
- *
- */
- log.trace("docTerms has Payloads: {}", docTerms.hasPayloads());
- log.trace("docTerms has Positions: {}", docTerms.hasPositions());
-
// Load the necessary fields of the document
Document doc = atomic.reader().document(localDocID, fieldsToLoadLocal);
// Put some more information to the match
match.setPositionsToOffset(new PositionsToOffset(atomic, field));
match.setLocalDocID(localDocID);
-
- log.trace("pto and localDocID defined");
-
match.setStartPos(startPos);
match.setEndPos(endPos);
match.populateDocument(doc, field, fieldsToLoadLocal);
- log.trace("We have found the correct document: {}", match.getTitle());
- // log.trace("The match is: {}", doc.get("tokens"));
+ log.trace("The document is called '{}'", match.getTitle());
- // A termsEnum object could be reused here
+ if (!info)
+ break;
+
+ // Limit the terms to all the terms of interest
TermsEnum termsEnum = docTerms.intersect(fst, null);
- DocsAndPositionsEnum docs = (DocsAndPositionsEnum) null;
- // DocsAndPositionsEnum docs;
+ DocsAndPositionsEnum docs = null;
// Iterate over all terms in the document
while (termsEnum.next() != null) {
+ // Get the positions and payloads of the term in the document
+ // The bitvector may look different (don't know why)
+ // and so the local ID may differ.
+ // That's why the requesting bitset is null.
docs = termsEnum.docsAndPositions(
- null, //bitset.bits(),
+ null,
docs,
DocsAndPositionsEnum.FLAG_PAYLOADS
);
+ // Init document iterator
docs.nextDoc();
- // log.trace("Check for '{}'({}) in document {}({}) from {}", termsEnum.term().utf8ToString(), termsEnum.totalTermFreq(), docs.docID(), localDocID, bitset.cardinality());
+ // Should never happen ... but hell.
if (docs.docID() == DocIdSetIterator.NO_MORE_DOCS)
continue;
- // Init docs
- /*
- if (docs.advance(localDocID) == DocIdSetIterator.NO_MORE_DOCS || docs.docID() != localDocID)
- continue;
- */
-
// How often does this term occur in the document?
int termOccurrences = docs.freq();
+
+ // String representation of the term
String termString = termsEnum.term().utf8ToString();
// Iterate over all occurrences
@@ -601,24 +606,29 @@
// Init positions and get the current
int pos = docs.nextPosition();
- log.trace(">> {}: {}-{}-{}!",
- termString, docs.freq(), pos, docs.getPayload());
-
- BytesRef payload = docs.getPayload();
-
- byte[] pl = new byte[12];
-
- if (payload != null)
- System.arraycopy(payload.bytes, payload.offset, pl, 0, payload.length);
-
-
// Check, if the position of the term is in the interesting area
- if (pos >= startPos && pos <= endPos) {
- termList.add(new TermInfo(
- termString,
- pos,
- pl
- ));
+ if (pos >= startPos && pos < endPos) {
+
+ log.trace(
+ ">> {}: {}-{}-{}",
+ termString,
+ docs.freq(),
+ pos, docs.getPayload()
+ );
+
+ BytesRef payload = docs.getPayload();
+
+ // Copy the payload
+ bbTerm.clear();
+ if (payload != null) {
+ bbTerm.put(
+ payload.bytes,
+ payload.offset,
+ payload.length
+ );
+ };
+
+ termList.add(new TermInfo(termString, pos, bbTerm));
};
};
};
@@ -629,6 +639,10 @@
// ...
};
+ for (TermInfo t : termList.getTerms()) {
+ log.trace("Add term {}/{}:{} to {}-{}", t.getFoundry(), t.getLayer(), t.getValue(), t.getStartChar(), t.getEndChar());
+ };
+
return match;
};
diff --git a/src/main/java/de/ids_mannheim/korap/KorapMatch.java b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
index cc4fa60..d7b8e79 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapMatch.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
@@ -261,7 +261,7 @@
sb.append(this.localDocID);
};
- sb.append('p');
+ sb.append("-p");
// Get Position information
sb.append(startPos).append('-').append(endPos);
diff --git a/src/main/java/de/ids_mannheim/korap/index/TermInfo.java b/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
index 379a57e..92bbb53 100644
--- a/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
+++ b/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
@@ -1,19 +1,182 @@
package de.ids_mannheim.korap.index;
import java.util.*;
+import java.nio.ByteBuffer;
+import java.util.regex.*;
+import de.ids_mannheim.korap.KorapMatch;
-public class TermInfo {
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
- private String prefix, foundry, layer, value;
+public class TermInfo implements Comparable<TermInfo> {
+
+ // Logger
+ private final static Logger log = LoggerFactory.getLogger(KorapMatch.class);
+
+ private String foundry, layer, value, term, type;
+ // type can be "term", "pos", "span", "rel-src", "rel-target"
+
private int pos = 0;
- private byte[] payload;
+ private ByteBuffer payload;
+ private boolean analyzed = false;
- // Temporary:
- private String name;
+ private int startChar = -1,
+ endChar = -1,
+ startPos = -1,
+ endPos = -1;
- public TermInfo (String name, int pos, byte[] payload) {
- this.name = name;
- this.pos = pos;
- this.payload = payload;
+ private byte depth = (byte) 0;
+
+ private Pattern prefixRegex = Pattern.compile("([^/]+)/([^:]+):(.+?)");
+ private Matcher matcher;
+
+ public TermInfo (String term, int pos, ByteBuffer payload) {
+ this.term = term;
+ this.startPos = pos;
+ this.endPos = pos + 1;
+ this.payload = payload;
+ };
+
+ public TermInfo analyze () {
+ if (analyzed)
+ return this;
+
+ int ttype = 0;
+ String tterm = this.term;
+ this.payload.rewind();
+
+ switch (tterm.charAt(0)) {
+ case '<':
+ // "<>:mate/l:..."
+ if (tterm.charAt(1) == '>') {
+ // span
+ this.type = "span";
+ tterm = tterm.substring(3);
+ ttype = 2;
+ }
+ // rel-target
+ else {
+ this.type = "relTarget";
+ tterm = tterm.substring(2);
+ ttype = 3;
+ };
+ break;
+ case '>':
+ // rel-src
+ this.type = "relSrc";
+ tterm = tterm.substring(2);
+ ttype = 3;
+ break;
+
+ case '_':
+ // pos
+ this.type = "pos";
+ ttype = 1;
+ tterm = tterm.substring(1);
+ break;
+ default:
+ // term
+ this.type = "term";
+ };
+
+ // Analyze term value
+ if (ttype != 1) {
+ log.trace("Check {} for {}", tterm, prefixRegex.toString());
+ matcher = prefixRegex.matcher(tterm);
+ if (matcher.matches() && matcher.groupCount() == 3) {
+ this.foundry = matcher.group(1);
+ this.layer = matcher.group(2);
+ this.value = matcher.group(3);
+ };
+ }
+
+ // for positions
+ else {
+ this.value = tterm;
+ this.startChar = this.payload.getInt();
+ this.endChar = this.payload.getInt();
+ };
+
+ // for spans
+ if (ttype == 2) {
+ this.startChar = this.payload.getInt();
+ this.endChar = this.payload.getInt();
+ };
+
+ // for spans and relations
+ if (ttype > 1)
+ this.endPos = this.payload.getInt();
+
+ if (ttype == 2 && this.payload.hasRemaining()) {
+ this.depth = this.payload.get();
+ };
+
+ // payloads can have different meaning
+ analyzed = true;
+ return this;
+ };
+
+ public String getType () {
+ return this.type;
+ };
+
+ public int getStartChar () {
+ return this.startChar;
+ };
+
+ public void setStartChar (int pos) {
+ this.startChar = pos;
+ };
+
+ public int getEndChar () {
+ return this.endChar;
+ };
+
+ public void setEndChar (int pos) {
+ this.endChar = pos;
+ };
+
+ public int getStartPos () {
+ return this.startPos;
+ };
+
+ public int getEndPos () {
+ return this.endPos;
+ };
+
+ public byte getDepth () {
+ return this.depth;
+ };
+
+ public String getFoundry () {
+ return this.foundry;
+ };
+
+ public String getLayer () {
+ return this.layer;
+ };
+
+ public String getValue () {
+ return this.value;
+ };
+
+ @Override
+ public int compareTo (TermInfo obj) {
+ this.analyze();
+ obj.analyze();
+
+ if (this.startChar < obj.startChar) {
+ return -1;
+ }
+ else if (this.startChar > obj.startChar) {
+ return 1;
+ }
+ else if (this.depth < obj.depth) {
+ return 1;
+ }
+ else if (this.depth > obj.depth) {
+ return -1;
+ };
+ return 0;
};
};
diff --git a/src/main/resources/log4j.properties b/src/main/resources/log4j.properties
index addc5d8..1bb4035 100644
--- a/src/main/resources/log4j.properties
+++ b/src/main/resources/log4j.properties
@@ -15,7 +15,7 @@
#log4j.logger.de.ids_mannheim.korap.KorapCollection = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.index.PositionsToOffset = TRACE, stdout
-# log4j.logger.de.ids_mannheim.korap.analysis.MultiTermTokenStream = TRACE, stdout
+#log4j.logger.de.ids_mannheim.korap.analysis.MultiTermTokenStream = TRACE, stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
index acf1c2f..bce6751 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
@@ -49,7 +49,7 @@
assertEquals("SnippetBrackets (0)", "... bcabca[{2:b{a}}]c", kr.match(0).snippetBrackets());
- assertEquals("ID (0)", "match-0p7-9(0)8-8(2)7-8c7-9(0)8-9(2)7-9", kr.match(0).getID());
+ assertEquals("ID (0)", "match-0-p7-9(0)8-8(2)7-8c7-9(0)8-9(2)7-9", kr.match(0).getID());
};
@@ -64,7 +64,7 @@
);
};
ki.commit();
- // System.err.println(ki.getMatch("test").toJSON());
+ // System.err.println(ki.getMatchInfo("test", "", "", true, true).toJSON());
};
};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestTermInfo.java b/src/test/java/de/ids_mannheim/korap/index/TestTermInfo.java
new file mode 100644
index 0000000..b51c5c9
--- /dev/null
+++ b/src/test/java/de/ids_mannheim/korap/index/TestTermInfo.java
@@ -0,0 +1,92 @@
+import java.util.*;
+import java.io.*;
+
+import static org.junit.Assert.*;
+import org.junit.Test;
+import org.junit.Ignore;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import de.ids_mannheim.korap.index.TermInfo;
+import java.nio.ByteBuffer;
+
+
+
+@RunWith(JUnit4.class)
+public class TestTermInfo {
+ @Test
+ public void termExample1 () throws IOException {
+
+ byte[] b = new byte[16];
+ ByteBuffer bb = ByteBuffer.allocate(16);
+ bb.putInt(20); // startOffset
+ bb.putInt(25); // endOffset
+ bb.putInt(7); // endPos
+ bb.put((byte) 4);
+
+ TermInfo term = new TermInfo("<>:mate/p:NN", 4, bb).analyze();
+
+ assertEquals("type", term.getType(), "span");
+ assertEquals("value", term.getValue(), "NN");
+ assertEquals("foundry", term.getFoundry(), "mate");
+ assertEquals("layer", term.getLayer(), "p");
+ assertEquals("startPos", term.getStartPos(), 4);
+ assertEquals("endPos", term.getEndPos(), 7);
+ assertEquals("startChar", term.getStartChar(), 20);
+ assertEquals("endChar", term.getEndChar(), 25);
+ assertEquals("depth", term.getDepth(), (byte) 4);
+
+ bb.clear();
+ term = new TermInfo("mate/p:NN", 9, bb).analyze();
+ assertEquals("type", term.getType(), "term");
+ assertEquals("value", term.getValue(), "NN");
+ assertEquals("foundry", term.getFoundry(), "mate");
+ assertEquals("layer", term.getLayer(), "p");
+ assertEquals("startPos", term.getStartPos(), 9);
+ assertEquals("endPos", term.getEndPos(), 10);
+ assertEquals("startChar", term.getStartChar(), -1);
+ assertEquals("endChar", term.getEndChar(), -1);
+ assertEquals("depth", term.getDepth(), 0);
+
+ bb.clear();
+ bb.putInt(17).put((byte) 2);
+ term = new TermInfo(">:xip/p:ADJ", 11, bb).analyze();
+ assertEquals("type", term.getType(), "relSrc");
+ assertEquals("value", term.getValue(), "ADJ");
+ assertEquals("foundry", term.getFoundry(), "xip");
+ assertEquals("layer", term.getLayer(), "p");
+ assertEquals("startPos", term.getStartPos(), 11);
+ assertEquals("endPos", term.getEndPos(), 17);
+ assertEquals("startChar", term.getStartChar(), -1);
+ assertEquals("endChar", term.getEndChar(), -1);
+ assertEquals("depth", term.getDepth(), 0);
+
+ bb.clear();
+ bb.putInt(24);
+ term = new TermInfo("<:xip/m:number:pl", 20, bb).analyze();
+ assertEquals("type", term.getType(), "relTarget");
+ assertEquals("value", term.getValue(), "number:pl");
+ assertEquals("foundry", term.getFoundry(), "xip");
+ assertEquals("layer", term.getLayer(), "m");
+ assertEquals("startPos", term.getStartPos(), 20);
+ assertEquals("endPos", term.getEndPos(), 24);
+ assertEquals("startChar", term.getStartChar(), -1);
+ assertEquals("endChar", term.getEndChar(), -1);
+ assertEquals("depth", term.getDepth(), 0);
+
+ bb.clear();
+ bb.putInt(240).putInt(400);
+ term = new TermInfo("_30", 30, bb).analyze();
+ assertEquals("type", term.getType(), "pos");
+ assertEquals("value", term.getValue(), "30");
+ assertNull("foundry", term.getFoundry());
+ assertNull("layer", term.getLayer());
+ assertEquals("startPos", term.getStartPos(), 30);
+ assertEquals("endPos", term.getEndPos(), 31);
+ assertEquals("startChar", term.getStartChar(), 240);
+ assertEquals("endChar", term.getEndChar(), 400);
+ assertEquals("depth", term.getDepth(), 0);
+
+
+ };
+};
\ No newline at end of file