Clean up the index
diff --git a/src/main/java/de/ids_mannheim/korap/KorapIndex.java b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
index 5082be9..def2e5b 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
@@ -10,13 +10,15 @@
import java.util.zip.GZIPInputStream;
import java.io.FileInputStream;
-import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
+
import org.apache.lucene.document.Document;
+
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
@@ -25,32 +27,36 @@
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
+
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
+
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
+
+import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.DocIdSetIterator;
-import com.fasterxml.jackson.annotation.*;
-import com.fasterxml.jackson.databind.ObjectMapper;
-
import org.apache.lucene.util.Version;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.OpenBitSet;
-import org.apache.lucene.search.DocIdSet;
-import de.ids_mannheim.korap.index.FieldDocument;
+import com.fasterxml.jackson.annotation.*;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
import de.ids_mannheim.korap.KorapResult;
import de.ids_mannheim.korap.KorapMatch;
import de.ids_mannheim.korap.KorapCollection;
import de.ids_mannheim.korap.KorapSearch;
+import de.ids_mannheim.korap.index.FieldDocument;
import de.ids_mannheim.korap.index.PositionsToOffset;
import de.ids_mannheim.korap.document.KorapPrimaryData;
@@ -66,6 +72,8 @@
http://invertedindex.blogspot.co.il/2009/04/lucene-dociduid-mapping-and-payload.html
see korap/search.java -> retrieveTokens
+ Support a callback for interrupts (to stop the searching)!
+
Support multiple indices.
Support frequency search with regular expressions, so multiple bookkeeping:
@@ -91,7 +99,7 @@
private IndexSearcher searcher;
private boolean readerOpen = false;
private int commitCounter = 0;
- private int autoCommit = 500;
+ private int autoCommit = 500; // Todo: Use configuration
private HashMap termContexts;
private ObjectMapper mapper = new ObjectMapper();
@@ -437,18 +445,14 @@
* search
*/
public KorapResult search (SpanQuery query) {
- return this.search((Bits) null, query, 0, (short) 10, true, (short) 6, true, (short) 6);
+ return this.search(new KorapCollection(this), new KorapSearch(query)); // NOTE(review): count (10) and token context (6) are no longer passed explicitly; whatever new KorapSearch(query) defaults to applies now - confirm the defaults are intended
};
- public KorapResult search (SpanQuery query,
- short count) {
- return this.search((Bits) null, query, 0, count, true, (short) 6, true, (short) 6);
- };
-
- public KorapResult search (Bits bitset,
- SpanQuery query,
- short count) {
- return this.search((Bits) bitset, query, 0, count, true, (short) 6, true, (short) 6);
+ public KorapResult search (SpanQuery query, short count) {
+ return this.search(
+ new KorapCollection(this),
+ new KorapSearch(query).setCount(count)
+ );
};
public KorapResult search (SpanQuery query,
@@ -458,43 +462,23 @@
short leftContext,
boolean rightTokenContext,
short rightContext) {
- return this.search((Bits) null, query, startIndex, count,
- leftTokenContext, leftContext, rightTokenContext, rightContext);
- };
-
- // This is just a fallback! Delete!
- @Deprecated
- public KorapResult search (Bits bitset,
- SpanQuery query,
- int startIndex,
- short count,
- boolean leftTokenContext,
- short leftContext,
- boolean rightTokenContext,
- short rightContext) {
- // TODO: This might leak as hell!!!
- return this.search(new KorapCollection(this), query, startIndex, count, leftTokenContext, leftContext, rightTokenContext, rightContext);
- };
-
- public KorapResult search (KorapCollection kc, KorapSearch ks) {
- return this.search(kc,
- ks.getQuery(),
- ks.getStartIndex(),
- ks.getCount(),
- ks.leftContext.isToken(),
- ks.leftContext.getLength(),
- ks.rightContext.isToken(),
- ks.rightContext.getLength()
- );
+ return this.search(
+ new KorapCollection(this),
+ query,
+ startIndex,
+ count,
+ leftTokenContext,
+ leftContext,
+ rightTokenContext,
+ rightContext
+ );
};
public KorapResult search (KorapSearch ks) {
+ // TODO: A new KorapCollection is created on every call here - this might leak as hell!!!
return this.search(new KorapCollection(this), ks);
};
-
-
- // old: Bits bitset
public KorapResult search (KorapCollection collection,
SpanQuery query,
int startIndex,
@@ -503,39 +487,57 @@
short leftContext,
boolean rightTokenContext,
short rightContext) {
+ KorapSearch ks = new KorapSearch(query);
+ ks.setStartIndex(startIndex).setCount(count);
+ ks.leftContext.setToken(leftTokenContext).setLength(leftContext);
+ ks.rightContext.setToken(rightTokenContext).setLength(rightContext);
+ return this.search(collection, ks);
+ };
+ public KorapResult search (KorapCollection collection, KorapSearch ks) {
log.trace("Start search");
this.termContexts = new HashMap<Term, TermContext>();
+ SpanQuery query = ks.getQuery();
String foundry = query.getField();
+ // Todo: Make kr subclassing ks - so ks has a method for a new KorapResult!
KorapResult kr = new KorapResult(
query.toString(),
- startIndex,
- count,
- leftTokenContext,
- leftContext,
- rightTokenContext,
- rightContext
- );
+ ks.getStartIndex(),
+ ks.getCount(),
+ ks.leftContext.isToken(),
+ ks.leftContext.getLength(),
+ ks.rightContext.isToken(),
+ ks.rightContext.getLength()
+ );
HashSet<String> fieldsToLoadLocal = new HashSet<>(fieldsToLoad);
fieldsToLoadLocal.add(foundry);
+ int i = 0;
+ long t1 = 0, t2 = 0;
+ int startIndex = kr.getStartIndex();
+ int count = kr.getItemsPerPage();
+ int hits = kr.itemsPerPage() + startIndex;
+ int limit = ks.getLimit();
+ boolean cutoff = ks.doCutOff();
+
+ if (limit > 0) {
+ if (hits > limit)
+ hits = limit;
+
+ if (limit < startIndex)
+ return kr;
+ };
+
+ ArrayList<KorapMatch> atomicMatches = new ArrayList<KorapMatch>(kr.itemsPerPage());
+
try {
- int i = 0;
- long t1 = 0;
- long t2 = 0;
-
- int hits = kr.itemsPerPage() + startIndex;
-
- ArrayList<KorapMatch> atomicMatches = new ArrayList<KorapMatch>(kr.itemsPerPage());
for (AtomicReaderContext atomic : this.reader().leaves()) {
- log.trace("NUKULAR!");
-
// Use OpenBitSet;
Bits bitset = collection.bits(atomic);
@@ -555,10 +557,11 @@
log.trace("Match Nr {}/{}", i, count);
- if (spans.next() != true) {
+ // There are no more spans to find
+ if (spans.next() != true)
break;
- };
+ // The next matches are not yet part of the result
if (startIndex > i)
continue;
@@ -566,8 +569,11 @@
int docID = atomic.docBase + localDocID;
// Document doc = lreader.document(docID, fieldsToLoadLocal);
+
+
+ // Do not load all of this, in case the doc is the same!
Document doc = lreader.document(localDocID, fieldsToLoadLocal);
- KorapMatch match = new KorapMatch();
+ KorapMatch match = kr.addMatch(); // new KorapMatch();
match.startPos = spans.start();
match.endPos = spans.end();
@@ -576,29 +582,28 @@
pto.add(localDocID, match.startPos);
pto.add(localDocID, match.endPos - 1);
+ /*
match.leftContext = leftContext;
match.rightContext = rightContext;
match.leftTokenContext = leftTokenContext;
match.rightTokenContext = rightTokenContext;
-
+ */
// Add pos for context
- if (leftTokenContext) {
- pto.add(localDocID, match.startPos - leftContext);
+ if (match.leftTokenContext) {
+ pto.add(localDocID, match.startPos - match.leftContext);
};
// Add pos for context
- if (rightTokenContext) {
- pto.add(localDocID, match.endPos + rightContext - 1);
+ if (match.rightTokenContext) {
+ pto.add(localDocID, match.endPos + match.rightContext - 1);
};
if (spans.isPayloadAvailable()) {
// TODO: Here are offsets and highlight offsets!
// <> payloads have 12 bytes (iii) or 8!?
- // highlightoffsets have 10 bytes (iis)!
-
- // 11 bytes!!!
+ // highlightoffsets have 11 bytes (iis)!
/*
int[] offsets = getOffsetsFromPayload(spans.getPayload());
@@ -662,10 +667,10 @@
bb.clear();
};
-
}
catch (Exception e) {
+ log.error(e.getMessage());
}
// match.payload(spans.getPayload());
@@ -694,7 +699,7 @@
new KorapPrimaryData(doc.get(foundry))
);
atomicMatches.add(match);
- kr.add(match);
+ // kr.add(match);
};
// Benchmark till now
@@ -705,7 +710,9 @@
};
// Can be disabled TEMPORARILY
- while (spans.next()) {
+ while (!cutoff && spans.next()) { // keep counting the remaining spans (i is later stored as total results) unless cutoff is requested
+ if (limit > 0 && i >= limit) // a limit is set and has been reached: stop counting
+ break;
i++;
};
@@ -721,15 +728,6 @@
kr.setBenchmarkSearchResults(t2, t1);
};
kr.setTotalResults(i);
-
-
- // if (spans.isPayloadAvailable()) {
- // for (byte[] payload : spans.getPayload()) {
- // // retrieve payload for current matching span
- // payloadString.append(new String(payload));
- // payloadString.append(" | ");
- // };
- // };
}
catch (IOException e) {
kr.setError("There was an IO error");
diff --git a/src/main/java/de/ids_mannheim/korap/KorapResult.java b/src/main/java/de/ids_mannheim/korap/KorapResult.java
index 2f8def8..3abb51d 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapResult.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapResult.java
@@ -65,6 +65,17 @@
this.matches.add(km);
};
+ public KorapMatch addMatch () { // creates a match, copies this result's context settings into it, registers it via add() and returns it
+     final KorapMatch match = new KorapMatch();
+     // Temporary: the match carries its own copy of the context settings
+     match.leftTokenContext = this.leftTokenContext;
+     match.leftContext = this.leftContextOffset;
+     match.rightTokenContext = this.rightTokenContext;
+     match.rightContext = this.rightContextOffset;
+     this.add(match);
+     return match;
+ };
+
public void setTotalResults (int i) {
this.totalResults = i;
};
diff --git a/src/main/java/de/ids_mannheim/korap/KorapSearch.java b/src/main/java/de/ids_mannheim/korap/KorapSearch.java
index 092c5a7..3d36344 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapSearch.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapSearch.java
@@ -15,15 +15,18 @@
// Todo: Use configuration file
/*
-KorapResult = new KorapSearch(String json).run(KorapIndex ki);
-startPage!!!
+ Todo: Let this class extend KorapResult!
+ Todo: implement an empty new Result Thingy!
+ KorapResult = new KorapSearch(String json).run(KorapIndex ki);
+ startPage!!!
*/
public class KorapSearch {
private int startIndex;
private short count = 25;
private short countMax = 50;
- // private int limit = -1;
+ private int limit = 0;
+ private boolean cutoff = false;
private SpanQuery query;
public KorapSearchContext leftContext, rightContext;
private KorapCollection collection;
@@ -191,6 +194,25 @@
return this.countMax;
};
+ public int getLimit () { // current limit; stays at the field's initial value until setLimit stores a positive value
+     return limit;
+ };
+
+ public KorapSearch setLimit (int limit) { // stores a positive limit; zero or negative values leave the current limit untouched
+     if (limit <= 0) return this;
+     this.limit = limit;
+     return this;
+ };
+
+ public boolean doCutOff () { // current value of the cutoff flag
+     return cutoff;
+ };
+
+ public KorapSearch setCutOff (boolean cutoff) { // stores the cutoff flag
+ this.cutoff = cutoff;
+ return this; // returns this object so calls can be chained
+ };
+
public KorapSearch setCount (int value) {
// Todo: Maybe update startIndex with known startPage!
this.setCount((short) value);
diff --git a/src/test/java/de/ids_mannheim/korap/filter/TestKorapCollection.java b/src/test/java/de/ids_mannheim/korap/filter/TestKorapCollection.java
index 8b880f7..95f7090 100644
--- a/src/test/java/de/ids_mannheim/korap/filter/TestKorapCollection.java
+++ b/src/test/java/de/ids_mannheim/korap/filter/TestKorapCollection.java
@@ -25,8 +25,8 @@
KorapIndex ki = new KorapIndex();
// Indexing test files
for (String i : new String[] {"00001", "00002", "00003", "00004", "00005", "00006", "02439"}) {
- FieldDocument fd = ki.addDocFile(
- getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
+ ki.addDocFile(
+ getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
);
};
ki.commit();
@@ -64,6 +64,7 @@
SpanQuery query = kq.seg("opennlp/p:NN").with("tt/p:NN").toQuery();
KorapResult kr = kc.search(query);
+
assertEquals(70, kr.totalResults());
kc.extend( kf.and("textClass", "uninteresting") );
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java b/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java
index 9b07d77..2472b2a 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java
@@ -126,8 +126,11 @@
KorapQuery kq = new KorapQuery("tokens");
KorapResult kr = ki.search((SpanQuery) kq.seq(kq._(3, kq.seg("s:b"))).toQuery());
+
KorapMatch km = kr.getMatch(0);
+ System.err.println("----");
+
assertEquals(km.getPrimaryData(),"abc");
assertEquals(km.getCorpusID(),"WPD");
assertEquals(km.getID(),"WPD-AAA-00001");