Added unique IDs and fitting KorapCollection constructor
diff --git a/src/main/java/de/ids_mannheim/korap/KorapCollection.java b/src/main/java/de/ids_mannheim/korap/KorapCollection.java
index fdb0ff3..bebd668 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapCollection.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapCollection.java
@@ -50,6 +50,9 @@
// Logger
private final static Logger log = LoggerFactory.getLogger(KorapCollection.class);
+ // This advises the Java compiler to ignore all debug logging
+ public static final boolean DEBUG = false;
+
// user?
public KorapCollection (KorapIndex ki) {
this.index = ki;
@@ -63,7 +66,9 @@
try {
JsonNode json = mapper.readValue(jsonString, JsonNode.class);
if (json.has("collections")) {
- log.trace("Add meta collection");
+ if (DEBUG)
+ log.trace("Add meta collection");
+
for (JsonNode collection : json.get("collections")) {
this.fromJSON(collection);
};
@@ -78,15 +83,31 @@
this.filter = new ArrayList<FilterOperation>(5);
};
+ // Create a collection based on UIDs
+ // (the debug output must not assume a minimum number of UIDs)
+ public KorapCollection (String ... uids) {
+ this.filter = new ArrayList<FilterOperation>(5);
+ BooleanFilter filter = new BooleanFilter();
+ if (DEBUG)
+ log.debug("UID based collection: {}", java.util.Arrays.toString(uids));
+ filter.or("UID", uids);
+ if (DEBUG)
+ log.debug("UID based filter: {}", filter.toString());
+ this.filter(filter);
+ };
+
public void fromJSON(JsonNode json) throws QueryException {
String type = json.get("@type").asText();
if (type.equals("korap:meta-filter")) {
- log.trace("Add Filter");
+ if (DEBUG)
+ log.trace("Add Filter");
this.filter(new KorapFilter(json.get("@value")));
}
+
else if (type.equals("korap:meta-extend")) {
- log.trace("Add Extend");
+ if (DEBUG)
+ log.trace("Add Extend");
this.extend(new KorapFilter(json.get("@value")));
};
};
@@ -101,7 +122,8 @@
// The checks asre not necessary
public KorapCollection filter (BooleanFilter filter) {
- log.trace("Added filter: {}", filter.toString());
+ if (DEBUG)
+ log.trace("Added filter: {}", filter.toString());
if (filter == null) {
log.warn("No filter is given");
return this;
@@ -111,7 +133,7 @@
log.warn("Filter can't be wrapped");
return this;
};
- FilterOperation fo = new FilterOperation(f,false);
+ FilterOperation fo = new FilterOperation(f, false);
if (fo == null) {
log.warn("Filter operation invalid");
return this;
@@ -127,7 +149,8 @@
public KorapCollection extend (BooleanFilter filter) {
- log.trace("Added extension: {}", filter.toString());
+ if (DEBUG)
+ log.trace("Added extension: {}", filter.toString());
this.filter.add(
new FilterOperation(
(Filter) new QueryWrapperFilter(filter.toQuery()),
@@ -160,7 +183,10 @@
return sb.toString();
};
- // DEPRECATED BUT USED IN TEST CASES
+ /**
+ * Search in the virtual collection. This is just used for
+ * testing purposes and not recommended for serious usage.
+ */
public KorapResult search (SpanQuery query) {
return this.index.search(this, query, 0, (short) 20, true, (short) 5, true, (short) 5);
};
@@ -180,7 +206,8 @@
ArrayList<FilterOperation> filters = (ArrayList<FilterOperation>) this.filter.clone();
FilterOperation kcInit = filters.remove(0);
- log.trace("FILTER: {}", kcInit);
+ if (DEBUG)
+ log.trace("FILTER: {}", kcInit);
// Init vector
DocIdSet docids = kcInit.filter.getDocIdSet(atomic, null);
@@ -188,17 +215,18 @@
DocIdSetIterator filterIter = docids.iterator();
if (filterIter != null) {
- log.trace("InitFilter has effect");
- // System.err.println("Init has an effect");
+ if (DEBUG)
+ log.trace("InitFilter has effect");
bitset.or(filterIter);
noDoc = false;
};
if (!noDoc) {
for (FilterOperation kc : filters) {
- log.trace("FILTER: {}", kc);
+ if (DEBUG)
+ log.trace("FILTER: {}", kc);
- // BUG!!!
+ // TODO: BUG!!!!!!!!!!
docids = kc.filter.getDocIdSet(atomic, kc.isExtension() ? null : bitset);
filterIter = docids.iterator();
@@ -217,7 +245,6 @@
};
if (kc.isExtension()) {
// System.err.println("Term found!");
- // log.trace("Extend filter");
// System.err.println("Old Card:" + bitset.cardinality());
bitset.or(filterIter);
// System.err.println("New Card:" + bitset.cardinality());
@@ -260,6 +287,7 @@
};
// This is only for testing purposes!
+ @Deprecated
public HashMap getTermRelation(String field) throws Exception {
if (this.index == null) {
HashMap<String,Long> map = new HashMap<>(1);
@@ -270,6 +298,7 @@
return this.index.getTermRelation(this, field);
};
+ @Deprecated
public String getTermRelationJSON(String field) throws IOException {
ObjectMapper mapper = new ObjectMapper();
StringWriter sw = new StringWriter();
diff --git a/src/main/java/de/ids_mannheim/korap/KorapDocument.java b/src/main/java/de/ids_mannheim/korap/KorapDocument.java
index 3857ecc..e8792b4 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapDocument.java
@@ -106,20 +106,26 @@
this.ID = ID;
};
+ @JsonProperty("ID")
+ public String getID () {
+ return this.ID;
+ };
+
public void setUID (int UID) {
this.UID = UID;
};
+ public void setUID (String UID) {
+ if (UID != null)
+ this.UID = Integer.parseInt(UID);
+ };
+
+
@JsonProperty("UID")
public int getUID () {
return this.UID;
};
- @JsonProperty("ID")
- public String getID () {
- return this.ID;
- };
-
public void setTitle (String title) {
this.title = title;
};
diff --git a/src/main/java/de/ids_mannheim/korap/KorapFilter.java b/src/main/java/de/ids_mannheim/korap/KorapFilter.java
index d97dd6e..f9052d9 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapFilter.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapFilter.java
@@ -225,6 +225,8 @@
};
public BooleanFilter or (String type, String ... terms) {
+ if (DEBUG)
+ log.debug("Got some terms here");
BooleanFilter bf = new BooleanFilter();
bf.or(type, terms);
return bf;
diff --git a/src/main/java/de/ids_mannheim/korap/KorapIndex.java b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
index cbcbd1a..f99da41 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
@@ -187,6 +187,7 @@
fieldsToLoad = new HashSet<String>(16);
fieldsToLoad.add("author");
fieldsToLoad.add("ID");
+ fieldsToLoad.add("UID");
fieldsToLoad.add("title");
fieldsToLoad.add("subTitle");
fieldsToLoad.add("textClass");
@@ -230,6 +231,7 @@
public IndexReader reader () {
if (!readerOpen)
this.openReader();
+ // Todo: Maybe use DirectoryReader.openIfChanged(DirectoryReader)
return this.reader;
};
@@ -277,19 +279,32 @@
};
};
+ /*
+ * Some of these addDoc methods will probably be DEPRECATED,
+ * as they were added while the API changed slowly.
+ */
+
// Add document to index as FieldDocument
- public FieldDocument addDoc (FieldDocument fd) throws IOException {
-
- // Open writer if not already opened
- if (this.writer == null)
- this.writer = new IndexWriter(this.directory, this.config);
+ public FieldDocument addDoc (FieldDocument fd) {
- // Add document to writer
- this.writer.addDocument( fd.doc );
- if (++commitCounter > autoCommit) {
- this.commit();
- commitCounter = 0;
+ try {
+
+ // Open writer if not already opened
+ if (this.writer == null)
+ this.writer = new IndexWriter(this.directory, this.config);
+
+ // Add document to writer
+ this.writer.addDocument( fd.doc );
+ if (++commitCounter > autoCommit) {
+ this.commit();
+ commitCounter = 0;
+ };
+ }
+
+ // Failed to add document
+ catch (IOException e) {
+ log.error("Failed to add document to index", e);
+ };
return fd;
};
@@ -310,31 +325,35 @@
// Add document to index as JSON file
- public FieldDocument addDoc (File json) throws IOException {
- FieldDocument fd = this.mapper.readValue(json, FieldDocument.class);
- return this.addDoc(fd);
+ public FieldDocument addDoc (File json) {
+ try {
+ FieldDocument fd = this.mapper.readValue(json, FieldDocument.class);
+ return this.addDoc(fd);
+ }
+ catch (IOException e) {
+ log.error("File json not parseable", e);
+ };
+ return (FieldDocument) null;
};
// Add document to index as JSON file
- public FieldDocument addDocFile(String json) throws IOException {
+ public FieldDocument addDocFile(String json) {
return this.addDocFile(json, false);
};
-
- // Add document to index as JSON file (possibly gzipped)
- public FieldDocument addDocFile(String json, boolean gzip) {
+ private FieldDocument _addDocfromFile (String json, boolean gzip) {
try {
if (gzip) {
// Create json field document
FieldDocument fd = this.mapper.readValue(
- new GZIPInputStream(new FileInputStream(json)),
+ new GZIPInputStream(new FileInputStream(json)),
FieldDocument.class
);
- return this.addDoc(fd);
+ return fd;
};
- return this.addDoc(json);
+ return this.mapper.readValue(new File(json), FieldDocument.class); // json is a file path, not JSON content
}
// Fail to add json object
@@ -344,6 +363,20 @@
return (FieldDocument) null;
};
+ // Add document to index as JSON file (possibly gzipped).
+ // Nothing is added if the file could not be turned into a document.
+ public FieldDocument addDocFile(String json, boolean gzip) {
+ FieldDocument fd = this._addDocfromFile(json, gzip);
+ return (fd == null) ? fd : this.addDoc(fd);
+ };
+
+ // Add document with the given UID to index as JSON file (possibly gzipped)
+ public FieldDocument addDocFile(int uid, String json, boolean gzip) {
+ FieldDocument fd = this._addDocfromFile(json, gzip);
+ if (fd != null)
+ fd.setUID(uid);
+ return (fd == null) ? fd : this.addDoc(fd);
+ };
// Commit changes to the index
public void commit () throws IOException {
@@ -1065,20 +1098,22 @@
HashSet<String> fieldsToLoadLocal = new HashSet<>(fieldsToLoad);
fieldsToLoadLocal.add(field);
- int i = 0;
- long t1 = 0, t2 = 0;
- int startIndex = kr.getStartIndex();
- int count = kr.getItemsPerPage();
- int hits = kr.itemsPerPage() + startIndex;
- int limit = ks.getLimit();
- boolean cutoff = ks.doCutOff();
+ int i = 0;
+ long t1 = 0,
+ t2 = 0;
+ int startIndex = kr.getStartIndex();
+ int count = kr.getItemsPerPage();
+ int hits = kr.itemsPerPage() + startIndex;
+ int limit = ks.getLimit();
+ boolean cutoff = ks.doCutOff();
short itemsPerResource = ks.getItemsPerResource();
-
+ // Check if there is work to do at all
if (limit > 0) {
if (hits > limit)
hits = limit;
+ // Nah - nothing to do! \o/
if (limit < startIndex)
return kr;
};
@@ -1100,7 +1135,10 @@
int oldLocalDocID = -1;
- // Use OpenBitSet;
+ /*
+ * Todo: There may be a way to know early if the bitset is empty
+ * by using OpenBitSet - but this may not be as fast as I think.
+ */
Bits bitset = collection.bits(atomic);
PositionsToOffset pto = new PositionsToOffset(atomic, field);
@@ -1261,7 +1299,7 @@
// Only load ID
HashSet<String> fieldsToLoadLocal = new HashSet<>();
- fieldsToLoadLocal.add("ID");
+ fieldsToLoadLocal.add("UID");
// List<KorapMatch> atomicMatches = new ArrayList<KorapMatch>(10);
@@ -1275,6 +1313,7 @@
};
int matchcount = 0;
+ String uniqueDocIDString;;
int uniqueDocID = -1;
for (AtomicReaderContext atomic : this.reader().leaves()) {
@@ -1313,10 +1352,12 @@
};
// Read document id from index
- /*
- uniqueDocID = lreader.document(localDocID, fieldsToLoadLocal).get("ID");
- */
- uniqueDocID = localDocID;
+ uniqueDocIDString =
+ lreader.document(localDocID, fieldsToLoadLocal).get("UID");
+
+ if (uniqueDocIDString != null)
+ uniqueDocID = Integer.parseInt(uniqueDocIDString);
+
previousDocID = localDocID;
}
else {
diff --git a/src/main/java/de/ids_mannheim/korap/KorapMatch.java b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
index d46fa0e..16e69d9 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapMatch.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
@@ -351,6 +351,8 @@
this.setCorpusID(doc.get("corpusID"));
if (fields.contains("ID"))
this.setDocID(doc.get("ID"));
+ if (fields.contains("UID"))
+ this.setUID(doc.get("UID"));
if (fields.contains("author"))
this.setAuthor(doc.get("author"));
if (fields.contains("textClass"))
diff --git a/src/main/java/de/ids_mannheim/korap/KorapNode.java b/src/main/java/de/ids_mannheim/korap/KorapNode.java
index 0c88f75..7227085 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapNode.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapNode.java
@@ -40,6 +40,13 @@
private static String name = "tanja";
+ /*
+ * Todo: Add shutdown hook,
+ * see: https://10.0.10.12/trac/korap/browser/KorAP-modules/KorAP-REST/src/main/java/de/ids_mannheim/korap/web/Application.java
+ * https://10.0.10.12/trac/korap/browser/KorAP-modules/KorAP-REST/src/main/java/de/ids_mannheim/korap/web/ShutdownHook.java
+ */
+
+
/**
* Starts Grizzly HTTP server exposing JAX-RS resources defined in this application.
* @return Grizzly HTTP server.
diff --git a/src/main/java/de/ids_mannheim/korap/filter/BooleanFilter.java b/src/main/java/de/ids_mannheim/korap/filter/BooleanFilter.java
index 57392bf..fecfd46 100644
--- a/src/main/java/de/ids_mannheim/korap/filter/BooleanFilter.java
+++ b/src/main/java/de/ids_mannheim/korap/filter/BooleanFilter.java
@@ -40,6 +40,9 @@
// Logger
private final static Logger log = LoggerFactory.getLogger(KorapFilter.class);
+ // This advises the Java compiler to ignore all debug logging
+ public static final boolean DEBUG = false;
+
private BooleanQuery bool;
private String error;
@@ -49,6 +52,10 @@
public BooleanFilter or (String type, String ... terms) {
for (String term : terms) {
+
+ if (DEBUG)
+ log.trace("Filter: OR {}={}", type, term);
+
bool.add(
new TermQuery(new Term(type, term)),
BooleanClause.Occur.SHOULD
diff --git a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
index e7ad688..7a845be 100644
--- a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
@@ -236,12 +236,12 @@
@Override
public void setUID (int ID) {
super.setUID(ID);
- this.addInt("UID", ID);
+ this.addString("UID", Integer.toString(ID)); // UID is added in its decimal string form
};
@Override
public void setLayerInfo (String layerInfo) {
- System.err.println(layerInfo);
+ // System.err.println(layerInfo);
super.setLayerInfo(layerInfo);
this.addStored("layerInfo", layerInfo);
};
diff --git a/src/main/java/de/ids_mannheim/korap/server/Resource.java b/src/main/java/de/ids_mannheim/korap/server/Resource.java
index 2ac540a..0577d93 100644
--- a/src/main/java/de/ids_mannheim/korap/server/Resource.java
+++ b/src/main/java/de/ids_mannheim/korap/server/Resource.java
@@ -23,6 +23,7 @@
import de.ids_mannheim.korap.KorapNode;
import de.ids_mannheim.korap.KorapIndex;
import de.ids_mannheim.korap.KorapSearch;
+import de.ids_mannheim.korap.KorapCollection;
import de.ids_mannheim.korap.KorapMatch;
import de.ids_mannheim.korap.KorapResult;
import de.ids_mannheim.korap.server.KorapResponse;
@@ -77,7 +78,7 @@
* http://stackoverflow.com/questions/19765582/how-to-make-jersey-use-gzip-compression-for-the-response-message-body
*/
@PUT
- @Path("/{textID}")
+ @Path("/index/{textID}")
@Produces(MediaType.APPLICATION_JSON)
@Consumes(MediaType.APPLICATION_JSON)
public String add (@PathParam("textID") Integer uid,
@@ -124,6 +125,7 @@
* Commit data changes to the index
*/
@POST
+ @Path("/index")
@Produces(MediaType.APPLICATION_JSON)
public String commit () {
@@ -135,21 +137,57 @@
version = index.getVersion();
// There are documents to commit
- if (index.getUnstaged() != 0) {
- try {
- index.commit();
- }
- catch (IOException e) {
- // Set HTTP to ???
- return kresp.setErrstr(e.getMessage()).toJSON();
- };
-
+ try {
+ index.commit();
+ }
+ catch (IOException e) {
// Set HTTP to ???
- return kresp.setMsg("Unstaged data was committed").toJSON();
+ return kresp.setErrstr(e.getMessage()).toJSON();
};
// Set HTTP to ???
- return kresp.setMsg("No unstaged data available").toJSON();
+ return kresp.setMsg("Unstaged data was committed").toJSON();
+ };
+
+ /**
+ * Find matches in the lucene index and return them as results.
+ *
+ * @param json The serialized query, passed on to KorapSearch
+ * @param uri Request URI; its "uid" query parameters select the texts
+ */
+ @POST
+ @Produces(MediaType.APPLICATION_JSON)
+ @Consumes(MediaType.APPLICATION_JSON)
+ public String find (String json, @Context UriInfo uri) {
+
+ // Get index and query parameters
+ KorapIndex index = KorapNode.getIndex();
+ MultivaluedMap<String,String> qp = uri.getQueryParameters();
+
+ // Search index - but only if there are uids to build a collection from
+ if (index != null && qp.get("uid") != null) {
+ KorapSearch ks = new KorapSearch(json);
+
+ // Build Collection based on a list of uids
+ KorapCollection kc = new KorapCollection(
+ qp.get("uid").toArray(new String[0])
+ );
+
+ // TODO: RESTRICT COLLECTION TO ONLY RESPECT SELF DOCS (REPLICATION)
+
+ // Override old collection
+ ks.setCollection(kc);
+
+ // Only return the first match per text
+ ks.setItemsPerResource(1);
+
+ return ks.run(index).toJSON();
+ };
+
+ // Response with error message
+ KorapResult kr = new KorapResult();
+ kr.setError(index == null ? "Index not found" : "No uid given");
+ return kr.toJSON();
+ };
diff --git a/src/test/java/de/ids_mannheim/korap/filter/TestKorapCollection.java b/src/test/java/de/ids_mannheim/korap/filter/TestKorapCollection.java
index 020c081..ba62e86 100644
--- a/src/test/java/de/ids_mannheim/korap/filter/TestKorapCollection.java
+++ b/src/test/java/de/ids_mannheim/korap/filter/TestKorapCollection.java
@@ -7,6 +7,12 @@
import de.ids_mannheim.korap.KorapResult;
import de.ids_mannheim.korap.KorapQuery;
import de.ids_mannheim.korap.filter.BooleanFilter;
+
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.SpanQuery;
import static org.junit.Assert.*;
@@ -82,7 +88,7 @@
// System.err.println(kr.toJSON());
};
- @Ignore
+ @Test
public void filterExampleAtomic () throws IOException {
// That's exactly the same test class, but with multiple atomic indices
@@ -202,6 +208,45 @@
assertEquals(87, kr.totalResults());
// System.out.println(kr.toJSON());
};
+
+
+ @Test
+ public void uidCollection () throws IOException {
+
+ // Construct index
+ KorapIndex ki = new KorapIndex();
+ // Indexing test files
+ int uid = 1;
+ for (String i : new String[] {"00001",
+ "00002",
+ "00003",
+ "00004",
+ "00005",
+ "00006",
+ "02439"}) {
+ FieldDocument fd = ki.addDocFile(
+ uid++,
+ getClass().getResource("/wiki/" + i + ".json.gz").getFile(),
+ true
+ );
+ };
+ ki.commit();
+
+ assertEquals("Documents", 7, ki.numberOf("documents"));
+ assertEquals("Sentences", 281, ki.numberOf("sentences"));
+
+ SpanQuery sq = new SpanTermQuery(new Term("tokens", "s:der"));
+ KorapResult kr = ki.search(sq, (short) 10);
+ assertEquals(86,kr.getTotalResults());
+
+ // Create Virtual collections:
+ KorapCollection kc = new KorapCollection(new String[]{"2", "3", "4"});
+ kc.setIndex(ki);
+ assertEquals("Documents", 3, kc.numberOf("documents"));
+
+ kr = kc.search(sq);
+ assertEquals(39,kr.getTotalResults());
+ };
};
diff --git a/src/test/java/de/ids_mannheim/korap/server/ResourceTest.java b/src/test/java/de/ids_mannheim/korap/server/ResourceTest.java
index 5f2365f..ee84419 100644
--- a/src/test/java/de/ids_mannheim/korap/server/ResourceTest.java
+++ b/src/test/java/de/ids_mannheim/korap/server/ResourceTest.java
@@ -71,7 +71,7 @@
}) {
String json = StringfromFile(getClass().getResource("/wiki/" + i + ".json").getFile());
- KorapResponse kresp = target.path("/" + i).
+ KorapResponse kresp = target.path("/index/" + i).
request("application/json").
put(Entity.json(json), KorapResponse.class);
@@ -84,7 +84,7 @@
assertEquals(kresp.getVersion(), "0.42");
};
- KorapResponse kresp = target.path("/").
+ KorapResponse kresp = target.path("/index").
request("application/json").
post(Entity.text(""), KorapResponse.class);
assertEquals(kresp.getNode(), "milena");