Introduced a term collector based on matches; doesn't do anything meaningful yet, other than lifting the SLOC
diff --git a/pom.xml b/pom.xml
index 81ed84b..e07c7c6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -85,6 +85,15 @@
<version>1.0</version>
</dependency>
+ <!-- among others Base64 support -->
+ <!--
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <version>1.4</version>
+ </dependency>
+ -->
+
<!-- Jackson -->
<!-- see https://github.com/FasterXML/jackson-core -->
<!-- https://github.com/FasterXML/jackson-databind -->
diff --git a/src/main/java/de/ids_mannheim/korap/KorapCollection.java b/src/main/java/de/ids_mannheim/korap/KorapCollection.java
index 1f65da2..c41d7f9 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapCollection.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapCollection.java
@@ -31,6 +31,7 @@
// TODO: Make a cache for the bits!!! DELETE IT IN CASE OF AN EXTENSION OR A FILTER!
+// TODO: Maybe a constantScoreQuery can make things faster?
// accepts as first parameter the index
// THIS MAY CHANGE for stuff like combining virtual collections
diff --git a/src/main/java/de/ids_mannheim/korap/KorapDocument.java b/src/main/java/de/ids_mannheim/korap/KorapDocument.java
index 2a86bcf..9c6dc5a 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapDocument.java
@@ -28,8 +28,8 @@
private String author, textClass, corpusID,
pubPlace, ID, title, subTitle,
foundries, tokenization,
- layerInfo;
-
+ layerInfo, field;
+
private KorapDate pubDate;
/**
@@ -178,4 +178,12 @@
public String getLayerInfo () {
return this.layerInfo;
};
+
+ public void setField (String field) {
+ this.field = field;
+ };
+
+ public String getField () {
+ return this.field;
+ };
};
diff --git a/src/main/java/de/ids_mannheim/korap/KorapIndex.java b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
index 612fe1f..9679d31 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
@@ -13,6 +13,13 @@
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Query;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.BooleanClause;
+
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.search.QueryWrapperFilter;
+
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
@@ -49,6 +56,12 @@
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.OpenBitSet;
+import org.apache.lucene.util.FixedBitSet;
+
+// Automata
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.RegExp;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
import com.fasterxml.jackson.annotation.*;
import com.fasterxml.jackson.databind.ObjectMapper;
@@ -59,6 +72,7 @@
import de.ids_mannheim.korap.KorapSearch;
import de.ids_mannheim.korap.index.FieldDocument;
import de.ids_mannheim.korap.index.PositionsToOffset;
+import de.ids_mannheim.korap.index.TermInfo;
import de.ids_mannheim.korap.document.KorapPrimaryData;
import org.slf4j.Logger;
@@ -140,7 +154,6 @@
fieldsToLoad.add("foundries");
fieldsToLoad.add("layerInfo");
fieldsToLoad.add("tokenization");
- // don't load tokenization
// Base analyzer for searching and indexing
// StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
@@ -153,7 +166,6 @@
analyzerPerField
);
-
// Create configuration with base analyzer
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer);
@@ -307,6 +319,7 @@
// Go to first term (initialization phase)
// TODO: THIS MAY BE WRONG!
+// TODO: Delete this and test!
docs.nextPosition();
// Copy payload with the offset of the BytesRef
@@ -331,10 +344,10 @@
* Search for the number of occurrences of different types,
* e.g. "documents", "sentences" etc.
*
- * @param foundry The foundry to search in.
+ * @param field The field containing the textual data and the annotations.
* @param type The type of meta information, e.g. "documents" or "sentences".
*/
- public long numberOf (KorapCollection collection, String foundry, String type) throws IOException {
+ public long numberOf (KorapCollection collection, String field, String type) throws IOException {
// Short cut for documents
if (type.equals("documents")) {
if (collection.getCount() <= 0) {
@@ -354,8 +367,8 @@
};
// Create search term
- Term term = new Term(foundry, "-:" + type);
- // System.err.println(">> Search for -:" + type + " in " + foundry);
+ Term term = new Term(field, "-:" + type);
+ // System.err.println(">> Search for -:" + type + " in " + field);
long occurrences = 0;
try {
@@ -377,8 +390,8 @@
return occurrences;
};
- public long numberOf (String foundry, String type) throws IOException {
- return this.numberOf(new KorapCollection(this), foundry, type);
+ public long numberOf (String field, String type) throws IOException {
+ return this.numberOf(new KorapCollection(this), field, type);
};
@@ -400,12 +413,12 @@
* e.g. "documents", "sentences" etc., in a specific set of documents.
*
* @param docvec The document vector for filtering the search space.
- * @param foundry The foundry to search in.
+ * @param field The field containing the textual data and the annotations.
* @param type The type of meta information, e.g. "documents" or "sentences".
*
* @see #numberOf(String, String)
*/
- public long numberOf (Bits docvec, String foundry, String type) throws IOException {
+ public long numberOf (Bits docvec, String field, String type) throws IOException {
// Shortcut for documents
if (type.equals("documents")) {
@@ -413,7 +426,7 @@
return os.cardinality();
};
- Term term = new Term(foundry, "-:" + type);
+ Term term = new Term(field, "-:" + type);
int occurrences = 0;
try {
@@ -428,20 +441,6 @@
return occurrences;
};
-
- /*
- Accepts a KorapInfo (with startPos, endPos, docID ... etc.)
- everything that comes from an ID
- and collects all information based on a prefix (like cnx/p etc.)
-
- KorapInfo is associated with a KorapMatch and has an array with all informations
- per position in the match.
-
- public KorapInfo infoOf (KorapMatch km, String prefix) {
-
- };
- */
-
@Deprecated
public long countDocuments () throws IOException {
log.warn("countDocuments() is DEPRECATED in favor of numberOf(\"documents\")!");
@@ -455,9 +454,141 @@
return this.numberOf("tokens");
};
+ /**
+ * Get a match.
+ * BE AWARE - THIS IS STILL A PLAYGROUND!
+ */
+ public KorapMatch getMatch (String id) {
+
+ String corpusID = "WPD";
+ String docID = "WPD_AAA.00003";
+ String field = "tokens"; // text field
+ String foundry = "mate";
+ String layer = "l";
+ int startPos = 20;
+ int endPos = 30;
+ Boolean includeSpans = true;
+
+ KorapMatch km = (KorapMatch) null;
+ LinkedList<TermInfo> termList = new LinkedList<TermInfo>();
+
+ StringBuffer regex = new StringBuffer();
+
+ // Todo: Ignore -: stuff!
+
+ if (includeSpans)
+ regex.append("((<>|<|>):)?");
+ else
+ regex.append("[^<>]");
+ if (foundry != null)
+ regex.append(foundry).append('/');
+ if (layer != null)
+ regex.append(layer).append(":");
+ regex.append(".+?");
+
+ BooleanQuery bool = new BooleanQuery();
+ bool.add(new TermQuery(new Term("ID", docID)), BooleanClause.Occur.MUST);
+ bool.add(new TermQuery(new Term("corpusID", corpusID)), BooleanClause.Occur.MUST);
+
+ Filter filter = (Filter) new QueryWrapperFilter(bool);
+
+ // Create an automaton for prefixed terms of interest:
+ CompiledAutomaton fst = new CompiledAutomaton(
+ new RegExp(regex.toString()).toAutomaton()
+ );
+
+ try {
+ for (AtomicReaderContext atomic : this.reader().leaves()) {
+ DocIdSetIterator filterIter = filter.getDocIdSet(
+ atomic,
+ atomic.reader().getLiveDocs()
+ ).iterator();
+
+ // Go to the matching doc
+ int localDocID = filterIter.nextDoc();
+ if (localDocID == DocIdSetIterator.NO_MORE_DOCS)
+ continue;
+
+ // We've found the correct document!
+ HashSet<String> fieldsToLoadLocal = new HashSet<>(fieldsToLoad);
+ fieldsToLoadLocal.add(field);
+
+ // Load the necessary fields of the document
+ Document doc = atomic.reader().document(localDocID, fieldsToLoadLocal);
+ // Get terms from the document
+ Terms docTerms = atomic.reader().getTermVector(localDocID, field);
+
+ km = new KorapMatch(
+ new PositionsToOffset(atomic, field),
+ localDocID,
+ startPos,
+ endPos
+ );
+
+ // A termsEnum object could be reused here
+ final TermsEnum termsEnum = docTerms.intersect(fst, null);
+
+ // Create a bitset for the correct document
+ // Yeah ... I know ... it could've been easier probably
+ FixedBitSet bitset = new FixedBitSet(atomic.reader().numDocs());
+ bitset.or(filterIter);
+
+ DocsAndPositionsEnum docs = (DocsAndPositionsEnum) null;
+
+ // Iterate over all terms in the document
+ while (termsEnum.next() != null) {
+ docs = termsEnum.docsAndPositions(
+ bitset,
+ docs,
+ DocsAndPositionsEnum.FLAG_PAYLOADS
+ );
+
+ // Init docs
+ docs.nextDoc();
+
+ // How often does this term occur in the document?
+ int termOccurrences = docs.freq();
+
+ // Iterate over all occurrences
+ for (int i = 0; i < termOccurrences; i++) {
+
+ // Init positions and get the current
+ int pos = docs.nextPosition();
+
+ // Check, if the position of the term is in the interesting area
+ if (pos >= startPos && pos <= endPos) {
+ termList.add(new TermInfo(
+ termsEnum.term().utf8ToString(),
+ pos,
+ docs.getPayload()
+ ));
+ };
+ };
+ };
+
+ break;
+ };
+ }
+ catch (IOException e) {
+ // ...
+ };
+
+ return km;
+ };
+
+
+ // TODO: collect all information based on a prefix (like cnx/p etc.)
+ // TODO: Generate a meaningful structure (e.g. a tree)
+ /*
+ KorapInfo is associated with a KorapMatch and has an array with all information
+ per position in the match.
+
+ public KorapInfo infoOf (KorapMatch km, String prefix);
+ */
+
/**
- * search
+ * Search in the index.
*/
public KorapResult search (SpanQuery query) {
return this.search(new KorapCollection(this), new KorapSearch(query));
@@ -516,7 +647,9 @@
this.termContexts = new HashMap<Term, TermContext>();
SpanQuery query = ks.getQuery();
- String foundry = query.getField();
+
+ // Get the field of textual data and annotations
+ String field = query.getField();
// Todo: Make kr subclassing ks - so ks has a method for a new KorapResult!
KorapResult kr = new KorapResult(
@@ -530,7 +663,7 @@
);
HashSet<String> fieldsToLoadLocal = new HashSet<>(fieldsToLoad);
- fieldsToLoadLocal.add(foundry);
+ fieldsToLoadLocal.add(field);
int i = 0;
long t1 = 0, t2 = 0;
@@ -564,7 +697,7 @@
// Use OpenBitSet;
Bits bitset = collection.bits(atomic);
- PositionsToOffset pto = new PositionsToOffset(atomic, foundry);
+ PositionsToOffset pto = new PositionsToOffset(atomic, field);
// Spans spans = NearSpansOrdered();
Spans spans = query.getSpans(atomic, (Bits) bitset, termContexts);
@@ -677,8 +810,7 @@
match.internalDocID = docID;
- // match.foundry = foundry; // This is "tokens" or "base" or so
-
+ match.setField(field);
match.setAuthor(doc.get("author"));
match.setTextClass(doc.get("textClass"));
match.setDocID(doc.get("ID"));
@@ -688,7 +820,7 @@
match.setCorpusID(doc.get("corpusID"));
match.setPubDate(doc.get("pubDate"));
- log.trace("I've got a match in {} of {}", match.getID(), count);
+ log.trace("I've got a match in {} of {}", match.getDocID(), count);
// Temporary (later meta fields in term vector)
match.setFoundries(doc.get("foundries"));
@@ -697,7 +829,7 @@
match.setLayerInfo(doc.get("layerInfo"));
match.setPrimaryData(
- new KorapPrimaryData(doc.get(foundry))
+ new KorapPrimaryData(doc.get(field))
);
atomicMatches.add(match);
};
diff --git a/src/main/java/de/ids_mannheim/korap/KorapMatch.java b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
index 8b48239..8f54f5e 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapMatch.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
@@ -1,6 +1,7 @@
package de.ids_mannheim.korap;
import java.util.*;
import java.lang.StringBuffer;
+import java.nio.ByteBuffer;
import com.fasterxml.jackson.annotation.*;
import com.fasterxml.jackson.databind.ObjectMapper;
@@ -8,6 +9,8 @@
import de.ids_mannheim.korap.index.PositionsToOffset;
import static de.ids_mannheim.korap.util.KorapHTML.*;
+// import org.apache.commons.codec.binary.Base64;
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -30,7 +33,7 @@
// Snippet information
@JsonIgnore
public short leftContext,
- rightContext;
+ rightContext;
@JsonIgnore
public int startPos,
@@ -40,13 +43,16 @@
public int potentialStartPosChar = -1,
potentialEndPosChar = -1;
+ private int startOffsetChar = 0;
+
@JsonIgnore
public boolean leftTokenContext,
rightTokenContext;
private String tempSnippet,
snippetHTML,
- snippetBrackets;
+ snippetBrackets,
+ identifier;
private HighlightCombinator snippetStack;
@@ -55,6 +61,7 @@
private Collection<byte[]> payload;
private ArrayList<int[]> highlight;
+ private LinkedList<int[]> span;
private PositionsToOffset positionsToOffset;
private boolean processed = false;
@@ -104,7 +111,7 @@
public void addHighlight (int start, int end, int number) {
if (this.highlight == null)
- this.highlight = new ArrayList<int[]>();
+ this.highlight = new ArrayList<int[]>(16);
log.trace("Add highlight of class {} from {} to {}", number, start, end);
this._reset();
@@ -128,25 +135,57 @@
@Override
@JsonProperty("ID")
public String getID () {
- StringBuffer sb = new StringBuffer();
- if (this.getDocID() != null)
- sb.append(this.getDocID());
- sb.append('#');
+
+ if (this.identifier != null)
+ return this.identifier;
+
+ StringBuffer sb = new StringBuffer("match-");
+
+ // Get prefix string corpus/doc
+ if (this.getCorpusID() != null) {
+ sb.append(this.getCorpusID());
+
+ if (this.getDocID() != null) {
+ sb.append('-');
+ sb.append(this.getDocID());
+ };
+ }
+ else {
+ sb.append(this.localDocID);
+ };
+
+ sb.append('p');
+
+ // Get Position information
sb.append(startPos).append('-').append(endPos);
+
if (this.highlight != null) {
for (int[] h : this.highlight) {
- sb.append(',').append(h[2]).append(':');
+ sb.append('(').append(h[2]).append(')');
sb.append(h[0]).append('-').append(h[1]);
};
};
- return sb.toString();
+ if (this.processed) {
+ sb.append('c');
+ for (int[] s : this.span) {
+ if (s[2] != -1)
+ sb.append('(').append(s[2]).append(')');
+ sb.append(s[0] + this.startOffsetChar);
+ sb.append('-');
+ sb.append(s[1] + this.startOffsetChar);
+ };
+ };
+ return (this.identifier = sb.toString());
};
private void _reset () {
this.processed = false;
this.snippetHTML = null;
this.snippetBrackets = null;
+ this.identifier = null;
+ if (this.span != null)
+ this.span.clear();
};
// Start building highlighted snippets
@@ -158,10 +197,12 @@
log.trace("Start highlight processing ...");
// Get the list of spans for matches and highlighting
- LinkedList<int[]> spans = this._processHighlightSpans(
- leftTokenContext,
- rightTokenContext
- );
+ if (this.span == null || this.span.size() == 0) {
+ this._processHighlightSpans(
+ leftTokenContext,
+ rightTokenContext
+ );
+ };
/*
for (int[] s : spans) {
@@ -171,7 +212,7 @@
*/
// Create a stack for highlighted elements (opening and closing elements)
- ArrayList<int[]> stack = this._processHighlightStack(spans);
+ ArrayList<int[]> stack = this._processHighlightStack();
/*
for (int[] s : stack) {
@@ -603,15 +644,15 @@
// This sorts all highlight and match spans to make them nesting correctly,
// even in case they overlap
// TODO: Not very fast - improve!
- private ArrayList<int[]> _processHighlightStack (LinkedList<int[]> spans) {
+ private ArrayList<int[]> _processHighlightStack () {
log.trace("Create Stack");
LinkedList<int[]> openList = new LinkedList<int[]>();
LinkedList<int[]> closeList = new LinkedList<int[]>();
- openList.addAll(spans);
- closeList.addAll(spans);
+ openList.addAll(span);
+ closeList.addAll(span);
Collections.sort(openList, new OpeningTagComparator());
Collections.sort(closeList, new ClosingTagComparator());
@@ -639,8 +680,8 @@
};
- private LinkedList<int[]> _processHighlightSpans (boolean leftTokenContext,
- boolean rightTokenContext) {
+ private void _processHighlightSpans (boolean leftTokenContext,
+ boolean rightTokenContext) {
int startOffsetChar,
endOffsetChar,
startPosChar,
@@ -675,7 +716,10 @@
// right context
if (rightTokenContext) {
- endOffsetChar = this.positionsToOffset.end(ldid, this.endPos + this.rightContext - 1);
+ endOffsetChar = this.positionsToOffset.end(
+ ldid,
+ this.endPos + this.rightContext - 1
+ );
log.trace("For endOffset {} ({}+{}-1) pto returns {}", (this.endPos + this.rightContext - 1), this.endPos, this.rightContext, endOffsetChar);
}
else {
@@ -703,10 +747,11 @@
if (endOffsetChar != -1 && endOffsetChar < endPosChar)
endOffsetChar = endPosChar;
+ this.startOffsetChar = startOffsetChar;
+
log.trace("Offsetposition {} till {} with contexts {} and {}", startOffsetChar, endOffsetChar, leftContext, rightContext);
-
if (endOffsetChar > -1 && endOffsetChar < this.getPrimaryDataLength()) {
this.tempSnippet = this.getPrimaryData(startOffsetChar, endOffsetChar);
}
@@ -717,12 +762,15 @@
log.trace("Temporary snippet is \"{}\"", this.tempSnippet);
- LinkedList<int[]> spans = new LinkedList<int[]>();
+ if (this.span == null)
+ this.span = new LinkedList<int[]>();
+
+ this.identifier = null;
// Todo: Simplify
int[] intArray = new int[]{ startPosChar - startOffsetChar, endPosChar - startOffsetChar, -1, 0};
log.trace("IntArray: {}", intArray);
- spans.add(intArray);
+ this.span.add(intArray);
// highlights
// -- I'm not sure about this.
@@ -745,11 +793,8 @@
log.trace("PTO-start: {}", start + startOffsetChar);
log.trace("PTO-end: {}", end + startOffsetChar);
- spans.add(intArray);
+ this.span.add(intArray);
};
};
-
- return spans;
};
-
};
diff --git a/src/main/java/de/ids_mannheim/korap/filter/BooleanFilter.java b/src/main/java/de/ids_mannheim/korap/filter/BooleanFilter.java
index b9bf69d..a447ccd 100644
--- a/src/main/java/de/ids_mannheim/korap/filter/BooleanFilter.java
+++ b/src/main/java/de/ids_mannheim/korap/filter/BooleanFilter.java
@@ -2,12 +2,13 @@
import java.util.*;
+import org.apache.lucene.index.Term;
+
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.RegexpQuery;
-import org.apache.lucene.index.Term;
import org.apache.lucene.search.NumericRangeQuery;
import de.ids_mannheim.korap.util.KorapDate;
diff --git a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
index 99a0cab..292592c 100644
--- a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
@@ -15,6 +15,7 @@
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
import java.util.*;
@@ -58,6 +59,7 @@
keywords.setStoreTermVectorPositions(false);
keywords.setStoreTermVectorPayloads(false);
keywords.setStoreTermVectorOffsets(false);
+ keywords.setIndexOptions(IndexOptions.DOCS_ONLY);
}
// see http://www.cowtowncoder.com/blog/archives/2011/07/entry_457.html
diff --git a/src/main/java/de/ids_mannheim/korap/index/TermInfo.java b/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
new file mode 100644
index 0000000..6859c8f
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/index/TermInfo.java
@@ -0,0 +1,20 @@
+package de.ids_mannheim.korap.index;
+
+import java.util.*;
+import org.apache.lucene.util.BytesRef;
+
+public class TermInfo {
+
+ private String prefix, foundry, layer, value;
+ private int pos = 0;
+ private BytesRef payload;
+
+ // Temporary:
+ private String name;
+
+ public TermInfo (String name, int pos, BytesRef payload) {
+ this.name = name;
+ this.pos = pos;
+ this.payload = payload;
+ };
+};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
index 13e3871..121ef3c 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
@@ -49,6 +49,6 @@
assertEquals("SnippetBrackets (0)", "... bcabca[{2:b{a}}]c", kr.match(0).snippetBrackets());
- assertEquals("ID (0)", "#7-9,0:8-8,2:7-8", kr.match(0).getID());
+ assertEquals("ID (0)", "match-0p7-9(0)8-8(2)7-8c7-9(0)8-9(2)7-9", kr.match(0).getID());
};
-};
\ No newline at end of file
+};