Added getDoc() method to KrillIndex for field retrieval
Change-Id: I17628024f74081e86f400d9fc52c031fbb0df815
diff --git a/.gitignore b/.gitignore
index 27469c6..b855d70 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,7 @@
/.classpath
/todo.org
/wiki.org
+/misc/web-api.md
*~
.*
!.gitignore
diff --git a/Changes b/Changes
index 64087df..3b39a83 100644
--- a/Changes
+++ b/Changes
@@ -8,6 +8,7 @@
- [documentation] Refer to KoralQuery instead of Koral (diewald)
- [cleanup] Removed deprecated method in KrillCollection:
getCount() (diewald)
+ - [feature] Added getDoc() method to KrillIndex (diewald)
0.55.5 2016-05-02
- [performance] Changed to a dynamic window for sorting in FocusSpans (margaretha)
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index c603f6d..2706d8c 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -774,6 +774,68 @@
};
+ /**
+  * Retrieve the fields of the document with the given unique
+  * identifier.
+  *
+  * Iterates over all leaves of the index reader, filters each one
+  * by the term "UID" = uid (restricted to the live docs of the
+  * leaf) and populates a {@link Text} from the first document
+  * found.
+  *
+  * @param uid
+  *        The unique identifier, matched against the "UID" field.
+  * @return A {@link Text} populated with the fields of the first
+  *         matching document. If reading the index fails, the
+  *         {@link Text} carries error 600; if no document matches,
+  *         it carries error 830.
+  */
+ public Text getDoc (String uid) {
+     // This is very similar to getMatchInfo
+
+     Text text = new Text();
+
+     Filter filter = new QueryWrapperFilter(
+             new TermQuery(new Term("UID", uid)));
+
+     try {
+
+         // Iterate over all atomic indices and find the matching document
+         for (LeafReaderContext atomic : this.reader().leaves()) {
+
+             // Retrieve the single document of interest
+             DocIdSet filterSet = filter.getDocIdSet(atomic,
+                     atomic.reader().getLiveDocs());
+
+             // A filter is allowed to return null if it accepts
+             // no document in this leaf
+             if (filterSet == null)
+                 continue;
+
+             if (DEBUG)
+                 log.trace("Checking document in {} with {}", filterSet,
+                         filterSet.bits());
+
+             DocIdSetIterator filterIterator = filterSet.iterator();
+
+             // No document found
+             if (filterIterator == null)
+                 continue;
+
+             // Go to the matching doc - and remember its ID
+             int localDocID = filterIterator.nextDoc();
+
+             if (localDocID == DocIdSetIterator.NO_MORE_DOCS)
+                 continue;
+
+             // We've found the correct document! Hurray!
+             if (DEBUG)
+                 log.trace("We've found a matching document");
+
+             // Load the fields of the document
+
+             // TODO: Probably use
+             // document(int docID, StoredFieldVisitor visitor)
+             Document doc = atomic.reader().document(localDocID);
+             text.populateFields(doc);
+
+             return text;
+         };
+     }
+     catch (IOException e) {
+         text.addError(600, "Unable to read index", e.getLocalizedMessage());
+         log.warn(e.getLocalizedMessage(), e);
+
+         // The lookup failed - it did not come up empty,
+         // so do not report error 830 on top of error 600
+         return text;
+     };
+
+     // No leaf contained a matching document
+     text.addError(830, "Filter was empty");
+
+     return text;
+ };
public String getMatchIDWithContext (String id) {
/* No includeHighlights */
diff --git a/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java b/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java
index 2d3053b..e7712c8 100644
--- a/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java
@@ -6,6 +6,9 @@
import de.ids_mannheim.korap.index.FieldDocument;
import de.ids_mannheim.korap.response.Response;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.IndexableField;
+
import com.fasterxml.jackson.annotation.*;
import com.fasterxml.jackson.annotation.JsonInclude.Include;
import com.fasterxml.jackson.databind.JsonNode;
@@ -72,6 +75,151 @@
// Meta information regarding annotations
tokenSource, layerInfos;
+ /**
+  * Populate the document, including its primary data, with every
+  * field present in the index document.
+  *
+  * Collects the names of all fields of the index document and
+  * delegates to
+  * {@link #populateDocument(Document, String, Collection)}.
+  *
+  * @param doc
+  *        Document object.
+  * @param field
+  *        Primary data field.
+  */
+ public void populateDocument (Document doc, String field) {
+     HashSet<String> names = new HashSet<>(32);
+     for (IndexableField docField : doc.getFields())
+         names.add(docField.name());
+
+     this.populateDocument(doc, field, names);
+ };
+
+ /**
+  * Populate document meta information with every field present
+  * in the index document.
+  *
+  * Collects the names of all fields of the index document and
+  * delegates to {@link #populateFields(Document, Collection)}.
+  *
+  * @param doc
+  *        Document object.
+  */
+ public void populateFields (Document doc) {
+
+     HashSet<String> names = new HashSet<>(32);
+     for (IndexableField docField : doc.getFields())
+         names.add(docField.name());
+
+     this.populateFields(doc, names);
+ };
+
+ /**
+  * Populate document meta information with the requested fields
+  * coming from the index.
+  *
+  * For every supported field name contained in <code>fields</code>
+  * the corresponding setter is called with the value the index
+  * document returns for that name. Neither the primary data field
+  * nor the primary data is set here.
+  *
+  * @param doc
+  *        Document object.
+  * @param fields
+  *        Names of the fields to populate. Supported fields that
+  *        are not contained are skipped.
+  */
+ public void populateFields (Document doc, Collection<String> fields) {
+
+     // Remember - never serialize "tokens"
+
+     // LEGACY
+     if (fields.contains("corpusID"))
+         this.setCorpusID(doc.get("corpusID"));
+     // NOTE(review): the Match implementation this code was moved
+     // from called setDocID() here - confirm setID() is equivalent
+     if (fields.contains("ID"))
+         this.setID(doc.get("ID"));
+     if (fields.contains("tokenization"))
+         this.setTokenization(doc.get("tokenization"));
+     if (fields.contains("layerInfo"))
+         this.setLayerInfo(doc.get("layerInfo"));
+
+     // valid
+     if (fields.contains("UID"))
+         this.setUID(doc.get("UID"));
+     if (fields.contains("author"))
+         this.setAuthor(doc.get("author"));
+     if (fields.contains("textClass"))
+         this.setTextClass(doc.get("textClass"));
+     if (fields.contains("title"))
+         this.setTitle(doc.get("title"));
+     if (fields.contains("subTitle"))
+         this.setSubTitle(doc.get("subTitle"));
+     if (fields.contains("pubDate"))
+         this.setPubDate(doc.get("pubDate"));
+     if (fields.contains("pubPlace"))
+         this.setPubPlace(doc.get("pubPlace"));
+
+     // Temporary (later meta fields in term vector)
+     if (fields.contains("foundries"))
+         this.setFoundries(doc.get("foundries"));
+
+     // New fields
+     if (fields.contains("textSigle"))
+         this.setTextSigle(doc.get("textSigle"));
+     if (fields.contains("docSigle"))
+         this.setDocSigle(doc.get("docSigle"));
+     if (fields.contains("corpusSigle"))
+         this.setCorpusSigle(doc.get("corpusSigle"));
+     if (fields.contains("layerInfos"))
+         this.setLayerInfos(doc.get("layerInfos"));
+     if (fields.contains("tokenSource"))
+         this.setTokenSource(doc.get("tokenSource"));
+     if (fields.contains("editor"))
+         this.setEditor(doc.get("editor"));
+
+     if (fields.contains("corpusAuthor"))
+         this.setCorpusAuthor(doc.get("corpusAuthor"));
+     if (fields.contains("corpusEditor"))
+         this.setCorpusEditor(doc.get("corpusEditor"));
+     if (fields.contains("corpusTitle"))
+         this.setCorpusTitle(doc.get("corpusTitle"));
+     if (fields.contains("corpusSubTitle"))
+         this.setCorpusSubTitle(doc.get("corpusSubTitle"));
+
+     if (fields.contains("docAuthor"))
+         this.setDocAuthor(doc.get("docAuthor"));
+     if (fields.contains("docEditor"))
+         this.setDocEditor(doc.get("docEditor"));
+     if (fields.contains("docTitle"))
+         this.setDocTitle(doc.get("docTitle"));
+     if (fields.contains("docSubTitle"))
+         this.setDocSubTitle(doc.get("docSubTitle"));
+
+     if (fields.contains("publisher"))
+         this.setPublisher(doc.get("publisher"));
+     if (fields.contains("reference"))
+         this.setReference(doc.get("reference"));
+     if (fields.contains("creationDate"))
+         this.setCreationDate(doc.get("creationDate"));
+     if (fields.contains("keywords"))
+         this.setKeywords(doc.get("keywords"));
+     // "textClass" is already handled with the valid fields above
+     if (fields.contains("textColumn"))
+         this.setTextColumn(doc.get("textColumn"));
+     if (fields.contains("textDomain"))
+         this.setTextDomain(doc.get("textDomain"));
+     if (fields.contains("textType"))
+         this.setTextType(doc.get("textType"));
+     if (fields.contains("textTypeArt"))
+         this.setTextTypeArt(doc.get("textTypeArt"));
+     if (fields.contains("textTypeRef"))
+         this.setTextTypeRef(doc.get("textTypeRef"));
+     if (fields.contains("language"))
+         this.setLanguage(doc.get("language"));
+     if (fields.contains("license"))
+         this.setLicense(doc.get("license"));
+     if (fields.contains("pages"))
+         this.setPages(doc.get("pages"));
+
+     if (fields.contains("biblEditionStatement"))
+         this.setBiblEditionStatement(doc.get("biblEditionStatement"));
+     if (fields.contains("fileEditionStatement"))
+         this.setFileEditionStatement(doc.get("fileEditionStatement"));
+ };
+
+ /**
+  * Populate the document with its primary data and the requested
+  * meta information coming from the index.
+  *
+  * Sets the primary data field name, reads the primary data from
+  * that field of the index document and then delegates to
+  * {@link #populateFields(Document, Collection)}.
+  *
+  * @param doc
+  *        Document object.
+  * @param field
+  *        Primary data field.
+  * @param fields
+  *        Names of the meta fields to populate.
+  */
+ public void populateDocument (Document doc, String field,
+         Collection<String> fields) {
+     this.setField(field);
+
+     // The primary data is the content of the primary data field
+     String primaryData = doc.get(field);
+     this.setPrimaryData(primaryData);
+
+     this.populateFields(doc, fields);
+ };
+
/**
* Get the publication date of the document
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index 47806af..a3aeee7 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -4,14 +4,12 @@
import java.nio.ByteBuffer;
import java.util.*;
-import org.apache.lucene.document.Document;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
-import org.apache.lucene.index.IndexableField;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -46,7 +44,7 @@
/**
* Representation of Matches in a Result.
- * <strong>Warning:</strong> This is currently highliy dependent
+ * <strong>Warning:</strong> This is currently highly dependent
* on DeReKo data and will change in the future.
*
* @author Nils Diewald
@@ -392,139 +390,6 @@
/**
- * Populate document meta information with information coming from
- * the index.
- *
- * @param doc
- * Document object.
- * @param field
- * Primary data field.
- */
- public void populateDocument (Document doc, String field) {
- HashSet<String> fieldList = new HashSet<>(32);
- Iterator<IndexableField> fieldIterator = doc.getFields().iterator();
- while (fieldIterator.hasNext())
- fieldList.add(fieldIterator.next().name());
-
- this.populateDocument(doc, field, fieldList);
- };
-
-
- /**
- * Populate document meta information with information coming from
- * the index.
- *
- * @param doc
- * Document object.
- * @param field
- * Primary data field.
- * @param fields
- * Hash object with all supported fields.
- */
- public void populateDocument (Document doc, String field,
- Collection<String> fields) {
- this.setField(field);
- this.setPrimaryData(doc.get(field));
-
- // Remember - never serialize "tokens"
-
- // LEGACY
- if (fields.contains("corpusID"))
- this.setCorpusID(doc.get("corpusID"));
- if (fields.contains("ID"))
- this.setDocID(doc.get("ID"));
- if (fields.contains("tokenization"))
- this.setTokenization(doc.get("tokenization"));
- if (fields.contains("layerInfo"))
- this.setLayerInfo(doc.get("layerInfo"));
-
- // valid
- if (fields.contains("UID"))
- this.setUID(doc.get("UID"));
- if (fields.contains("author"))
- this.setAuthor(doc.get("author"));
- if (fields.contains("textClass"))
- this.setTextClass(doc.get("textClass"));
- if (fields.contains("title"))
- this.setTitle(doc.get("title"));
- if (fields.contains("subTitle"))
- this.setSubTitle(doc.get("subTitle"));
- if (fields.contains("pubDate"))
- this.setPubDate(doc.get("pubDate"));
- if (fields.contains("pubPlace"))
- this.setPubPlace(doc.get("pubPlace"));
-
- // Temporary (later meta fields in term vector)
- if (fields.contains("foundries"))
- this.setFoundries(doc.get("foundries"));
-
- // New fields
- if (fields.contains("textSigle"))
- this.setTextSigle(doc.get("textSigle"));
- if (fields.contains("docSigle"))
- this.setDocSigle(doc.get("docSigle"));
- if (fields.contains("corpusSigle"))
- this.setCorpusSigle(doc.get("corpusSigle"));
- if (fields.contains("layerInfos"))
- this.setLayerInfos(doc.get("layerInfos"));
- if (fields.contains("tokenSource"))
- this.setTokenSource(doc.get("tokenSource"));
- if (fields.contains("editor"))
- this.setEditor(doc.get("editor"));
-
- if (fields.contains("corpusAuthor"))
- this.setCorpusAuthor(doc.get("corpusAuthor"));
- if (fields.contains("corpusEditor"))
- this.setCorpusEditor(doc.get("corpusEditor"));
- if (fields.contains("corpusTitle"))
- this.setCorpusTitle(doc.get("corpusTitle"));
- if (fields.contains("corpusSubTitle"))
- this.setCorpusSubTitle(doc.get("corpusSubTitle"));
-
- if (fields.contains("docAuthor"))
- this.setDocAuthor(doc.get("docAuthor"));
- if (fields.contains("docEditor"))
- this.setDocEditor(doc.get("docEditor"));
- if (fields.contains("docTitle"))
- this.setDocTitle(doc.get("docTitle"));
- if (fields.contains("docSubTitle"))
- this.setDocSubTitle(doc.get("docSubTitle"));
-
- if (fields.contains("publisher"))
- this.setPublisher(doc.get("publisher"));
- if (fields.contains("reference"))
- this.setReference(doc.get("reference"));
- if (fields.contains("creationDate"))
- this.setCreationDate(doc.get("creationDate"));
- if (fields.contains("keywords"))
- this.setKeywords(doc.get("keywords"));
- if (fields.contains("textClass"))
- this.setTextClass(doc.get("textClass"));
- if (fields.contains("textColumn"))
- this.setTextColumn(doc.get("textColumn"));
- if (fields.contains("textDomain"))
- this.setTextDomain(doc.get("textDomain"));
- if (fields.contains("textType"))
- this.setTextType(doc.get("textType"));
- if (fields.contains("textTypeArt"))
- this.setTextTypeArt(doc.get("textTypeArt"));
- if (fields.contains("textTypeRef"))
- this.setTextTypeRef(doc.get("textTypeRef"));
- if (fields.contains("language"))
- this.setLanguage(doc.get("language"));
- if (fields.contains("license"))
- this.setLicense(doc.get("license"));
- if (fields.contains("pages"))
- this.setPages(doc.get("pages"));
-
- if (fields.contains("biblEditionStatement"))
- this.setBiblEditionStatement(doc.get("biblEditionStatement"));
- if (fields.contains("fileEditionStatement"))
- this.setFileEditionStatement(doc.get("fileEditionStatement"));
- };
-
-
- /**
* Get document id.
*/
@JsonProperty("docID")
diff --git a/src/main/java/de/ids_mannheim/korap/response/Text.java b/src/main/java/de/ids_mannheim/korap/response/Text.java
new file mode 100644
index 0000000..437f24c
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/response/Text.java
@@ -0,0 +1,53 @@
+package de.ids_mannheim.korap.response;
+
+import java.util.*;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonInclude.Include;
+
+import de.ids_mannheim.korap.index.AbstractDocument;
+
+/**
+ * Representation of Texts in a Result.
+ * <strong>Warning:</strong> This is currently highly dependent
+ * on DeReKo data and will change in the future.
+ *
+ * @author Nils Diewald
+ * @see Result
+ */
+@JsonInclude(Include.NON_NULL)
+public class Text extends AbstractDocument {
+
+    // Logger
+    private final static Logger log = LoggerFactory.getLogger(Text.class);
+
+    // This advises the java compiler to ignore all logging
+    public static final boolean DEBUG = false;
+
+    // Mapper for JSON serialization - shared by all instances,
+    // so that not every Text allocates a mapper of its own
+    final static ObjectMapper mapper = new ObjectMapper();
+
+    /**
+     * Construct a new, empty text object.
+     */
+    public Text () {};
+
+    /**
+     * Serialize the text as a JSON string.
+     *
+     * @return The JSON string representation of the text, or
+     *         <code>{}</code> if the JSON node is empty or the
+     *         serialization fails.
+     */
+    public String toJsonString () {
+        JsonNode json = (JsonNode) this.toJsonNode();
+
+        // Nothing to serialize
+        if (json.size() == 0)
+            return "{}";
+        try {
+            return mapper.writeValueAsString(json);
+        }
+        catch (Exception e) {
+            // Keep the stack trace, as the caller only sees "{}"
+            log.warn(e.getLocalizedMessage(), e);
+        };
+
+        return "{}";
+    };
+};
diff --git a/src/main/resources/log4j.properties b/src/main/resources/log4j.properties
index 7e03f21..b46d82c 100644
--- a/src/main/resources/log4j.properties
+++ b/src/main/resources/log4j.properties
@@ -1,4 +1,4 @@
-# log4j.rootLogger = ERROR, stdout
+log4j.rootLogger = ERROR, stdout
# Queries:
# log4j.logger.de.ids_mannheim.korap.query.SpanNextQuery = TRACE, stdout
@@ -17,6 +17,8 @@
# log4j.logger.de.ids_mannheim.korap.query.spans.FocusSpans = TRACE, stdout
# log4j.logger.de.ids_mannheim.korap.query.spans.MatchSpans = TRACE, stdout
+# log4j.logger.org.glassfish.grizzly.http.server.NetworkListener = TRACE, stdout
+
# Wrappers:
# log4j.logger.de.ids_mannheim.korap.KrillQuery = TRACE, stdout
# log4j.logger.de.ids_mannheim.korap.query.wrap.SpanSequenceQueryWrapper = TRACE, stdout
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestKrillIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestKrillIndex.java
index 653549a..15079c2 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestKrillIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestKrillIndex.java
@@ -106,4 +106,20 @@
// hasDeletions, hasPendingMerges
};
+
+ /**
+  * Index a single document and read its fields back
+  * by UID using getDoc().
+  */
+ @Test
+ public void indexFieldInfo () throws IOException {
+     KrillIndex index = new KrillIndex();
+
+     // Add and commit one document with a title and a UID
+     FieldDocument peter = new FieldDocument();
+     peter.setTitle("Peter");
+     peter.setUID(22);
+     index.addDoc(peter);
+     index.commit();
+
+     assertEquals(1, index.numberOf("base", "documents"));
+
+     // The document is looked up by the string form of its UID
+     assertEquals("Peter", index.getDoc("22").getTitle());
+     assertEquals(22, index.getDoc("22").getUID());
+ };
};
diff --git a/src/test/java/de/ids_mannheim/korap/server/TestResource.java b/src/test/java/de/ids_mannheim/korap/server/TestResource.java
index 2f1c128..3181245 100644
--- a/src/test/java/de/ids_mannheim/korap/server/TestResource.java
+++ b/src/test/java/de/ids_mannheim/korap/server/TestResource.java
@@ -76,15 +76,9 @@
Node.closeDBPool();
t4 = System.nanoTime();
- double startup = (double) (t2 - t1) / 1000000000.0;
- double action = (double) (t3 - t2) / 1000000000.0;
+ double startup = (double) (t2 - t1) / 1000000000.0;
+ double action = (double) (t3 - t2) / 1000000000.0;
double shutdown = (double) (t4 - t3) / 1000000000.0;
-
- /*
- System.err.println("Startup: " + startup + ", " +
- "Action: " + action + ", " +
- "Shutdown: " + shutdown);
- */
};
@@ -129,6 +123,7 @@
res = mapper.readTree(resp);
assertEquals("milena", res.at("/meta/node").asText());
+ assertEquals(681, res.at("/messages/0/0").asInt());
}
catch (Exception e) {
fail("Server response failed " + e.getMessage()
@@ -150,6 +145,7 @@
// Check mirroring
assertEquals(2439, res.at("/text/UID").asInt());
assertEquals("milena", res.at("/meta/node").asText());
+ assertEquals(681, res.at("/messages/0/0").asInt());
}
catch (Exception e) {
fail("Server response failed " + e.getMessage() + " (Known issue)");
@@ -160,9 +156,18 @@
.post(Entity.text(""), String.class);
res = mapper.readTree(resp);
assertEquals("milena", res.at("/meta/node").asText());
+
+ // Staged data committed
assertEquals(683, res.at("/messages/0/0").asInt());
};
+ /*
+ @Test
+ public void testRemoving () throws IOException {
+ resp = target.path("/index/" + i).request("application/json")
+ .put(jsonE, String.class);
+ };
+ */
@Test
public void testCollection () throws IOException {