Added simple fields support
diff --git a/CHANGES b/CHANGES
index 2eddfc4..cfac631 100644
--- a/CHANGES
+++ b/CHANGES
@@ -8,6 +8,8 @@
- [bugfix] Queries with regular expressions in spanNext() are now
correctly rewritten (diewald)
- [bugfix] Ignore foundry for orth layer (diewald)
+ - [feature] Support fields in meta (diewald)
+ - [sigh] Support more legacy APIs (diewald)
0.47 2014-11-05
- [feature] Support new index format with more metadata (diewald)
diff --git a/src/main/java/de/ids_mannheim/korap/KorapCollection.java b/src/main/java/de/ids_mannheim/korap/KorapCollection.java
index ff0ef24..5dcb81d 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapCollection.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapCollection.java
@@ -83,12 +83,32 @@
this.filter = new ArrayList<FilterOperation>(5);
};
+ public void fromJSON (String jsonString) throws QueryException { // String overload: parses jsonString and delegates to fromJSON(JsonNode)
+ ObjectMapper mapper = new ObjectMapper(); // a fresh mapper is created on every call
+ try {
+ this.fromJSON((JsonNode) mapper.readValue(jsonString, JsonNode.class)); // parse to a JsonNode tree, then hand it to the JsonNode overload
+ }
+ catch (Exception e) { // NOTE(review): if QueryException is a subclass of Exception it is caught here as well, so the throws clause may never fire - confirm
+ this.error = e.getMessage(); // only the message text is stored in this.error; nothing is rethrown from this handler
+ };
+ };
public void fromJSON (JsonNode json) throws QueryException {
this.filter(new KorapFilter(json));
};
+ public void fromJSONLegacy (String jsonString) throws QueryException { // String overload: parses jsonString and delegates to fromJSONLegacy(JsonNode)
+ ObjectMapper mapper = new ObjectMapper(); // a fresh mapper is created on every call
+ try {
+ this.fromJSONLegacy((JsonNode) mapper.readValue(jsonString, JsonNode.class)); // parse to a JsonNode tree, then hand it to the JsonNode overload
+ }
+ catch (Exception e) { // NOTE(review): if QueryException is a subclass of Exception it is caught here as well, so the throws clause may never fire - confirm
+ this.error = e.getMessage(); // only the message text is stored in this.error; nothing is rethrown from this handler
+ };
+ };
+
+
/**
* Legacy API for collection filters.
*/
diff --git a/src/main/java/de/ids_mannheim/korap/KorapIndex.java b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
index 7278877..7072d87 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
@@ -103,10 +103,6 @@
private static ByteBuffer bb = ByteBuffer.allocate(4),
bbOffset = ByteBuffer.allocate(8),
bbTerm = ByteBuffer.allocate(16);
-
-
- private Set<String> fieldsToLoad;
-
// Logger
private final static Logger log = LoggerFactory.getLogger(KorapIndex.class);
@@ -145,24 +141,10 @@
public KorapIndex (Directory directory) throws IOException {
this.directory = directory;
- fieldsToLoad = new HashSet<String>(16);
- fieldsToLoad.add("author");
- fieldsToLoad.add("ID");
- fieldsToLoad.add("textSigle");
- fieldsToLoad.add("UID");
- fieldsToLoad.add("title");
- fieldsToLoad.add("subTitle");
- fieldsToLoad.add("textClass");
- fieldsToLoad.add("pubPlace");
- fieldsToLoad.add("pubDate");
- fieldsToLoad.add("corpusID");
- fieldsToLoad.add("foundries");
- fieldsToLoad.add("layerInfo");
- fieldsToLoad.add("tokenization");
-
// Base analyzer for searching and indexing
// StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
+ // TODO: Why is this here?
Map<String,Analyzer> analyzerPerField = new HashMap<String,Analyzer>();
analyzerPerField.put("textClass", new WhitespaceAnalyzer(Version.LUCENE_CURRENT));
analyzerPerField.put("foundries", new WhitespaceAnalyzer(Version.LUCENE_CURRENT));
@@ -767,20 +749,22 @@
// We've found the correct document! Hurray!
if (DEBUG)
log.trace("We've found a matching document");
- HashSet<String> fieldsToLoadLocal = new HashSet<>(fieldsToLoad);
- fieldsToLoadLocal.add(field);
+
+ HashSet<String> fields = (HashSet<String>)
+ new KorapSearch().getFields().clone();
+ fields.add(field);
// Get terms from the document
Terms docTerms = atomic.reader().getTermVector(localDocID, field);
// Load the necessary fields of the document
- Document doc = atomic.reader().document(localDocID, fieldsToLoadLocal);
+ Document doc = atomic.reader().document(localDocID, fields);
// Put some more information to the match
PositionsToOffset pto = new PositionsToOffset(atomic, field);
match.setPositionsToOffset(pto);
match.setLocalDocID(localDocID);
- match.populateDocument(doc, field, fieldsToLoadLocal);
+ match.populateDocument(doc, field, fields);
if (DEBUG)
log.trace("The document has the id '{}'", match.getDocID());
@@ -1086,8 +1070,8 @@
kr.setVersion(this.getVersion());
// The following fields should be lifted for matches
- HashSet<String> fieldsToLoadLocal = new HashSet<>(fieldsToLoad);
- fieldsToLoadLocal.add(field);
+ HashSet<String> fields = (HashSet<String>) ks.getFields().clone();
+ fields.add(field);
// Some initializations ...
int i = 0,
@@ -1183,11 +1167,8 @@
int docID = atomic.docBase + localDocID;
- // Document doc = lreader.document(docID, fieldsToLoadLocal);
-
-
// Do not load all of this, in case the doc is the same!
- Document doc = lreader.document(localDocID, fieldsToLoadLocal);
+ Document doc = lreader.document(localDocID, fields);
KorapMatch match = kr.addMatch(
pto,
localDocID,
@@ -1199,7 +1180,7 @@
match.addPayload((List<byte[]>) spans.getPayload());
match.internalDocID = docID;
- match.populateDocument(doc, field, fieldsToLoadLocal);
+ match.populateDocument(doc, field, fields);
if (DEBUG) {
if (match.getDocID() != null)
@@ -1289,8 +1270,8 @@
long t1 = System.nanoTime();
// Only load UID
- HashSet<String> fieldsToLoadLocal = new HashSet<>();
- fieldsToLoadLocal.add("UID");
+ HashSet<String> fields = new HashSet<>(1);
+ fields.add("UID");
// List<KorapMatch> atomicMatches = new ArrayList<KorapMatch>(10);
@@ -1344,7 +1325,7 @@
// Read document id from index
uniqueDocIDString =
- lreader.document(localDocID, fieldsToLoadLocal).get("UID");
+ lreader.document(localDocID, fields).get("UID");
if (uniqueDocIDString != null)
uniqueDocID = Integer.parseInt(uniqueDocIDString);
diff --git a/src/main/java/de/ids_mannheim/korap/KorapSearch.java b/src/main/java/de/ids_mannheim/korap/KorapSearch.java
index b56a6d0..adb574c 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapSearch.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapSearch.java
@@ -47,6 +47,25 @@
{
context = new SearchContext();
+
+ // Lift legacy fields per default; the set is emptied again when a request's meta object contains "fields"
+ fields = new HashSet<String>(16); // initial capacity 16; the 13 names listed below are added
+ for (String field : new String[]{
+ "ID",
+ "UID",
+ "textSigle",
+ "corpusID",
+ "author",
+ "title",
+ "subTitle",
+ "textClass",
+ "pubPlace",
+ "pubDate",
+ "foundries",
+ "layerInfo",
+ "tokenization"}) {
+ fields.add(field); // same names the removed KorapIndex.fieldsToLoad set used to hold
+ };
};
public KorapSearch (String jsonString) {
@@ -127,6 +146,22 @@
// Defined resource count
if (meta.has("itemsPerResource"))
this.setItemsPerResource(meta.get("itemsPerResource").asInt());
+
+ // Only lift a limited number of fields from the metadata
+ if (meta.has("fields")) {
+
+ // Remove legacy default fields
+ this.fields.clear();
+
+ // Add fields
+ if (meta.get("fields").isArray()) {
+ for (JsonNode field : (JsonNode) meta.get("fields")) {
+ this.addField(field.asText());
+ };
+ }
+ else
+ this.addField(meta.get("fields").asText());
+ };
};
};
}
@@ -284,11 +319,15 @@
return this.itemsPerResource;
};
- // Get map of fields to lift
+ // Add field to set of fields
+ public KorapSearch addField (String field) { // adds one field name to the set returned by getFields()
+ this.fields.add(field); // fields is a HashSet, so adding a name twice has no further effect
+ return this; // returns this instance so calls can be chained
+ };
+
+ // Get set of fields
public HashSet<String> getFields () {
- if (this.fields != null)
- return this.fields;
- return (HashSet<String>) null;
+ return this.fields; // returns the internal set itself, not a copy; the KorapIndex callers in this patch clone it before adding to it
};
public KorapSearch setCollection (KorapCollection kc) {
diff --git a/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java b/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java
new file mode 100644
index 0000000..0677fc1
--- /dev/null
+++ b/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java
@@ -0,0 +1,101 @@
+package de.ids_mannheim.korap.search;
+
+import java.util.*;
+import java.io.*;
+
+import static de.ids_mannheim.korap.TestSimple.*;
+
+import de.ids_mannheim.korap.KorapSearch;
+import de.ids_mannheim.korap.KorapCollection;
+import de.ids_mannheim.korap.KorapQuery;
+import de.ids_mannheim.korap.KorapIndex;
+import de.ids_mannheim.korap.index.FieldDocument;
+import de.ids_mannheim.korap.index.SearchContext;
+import de.ids_mannheim.korap.KorapFilter;
+import de.ids_mannheim.korap.KorapResult;
+import java.nio.file.Files;
+import java.nio.file.FileSystem;
+import java.nio.file.Path;
+import java.nio.charset.StandardCharsets;
+import java.nio.ByteBuffer;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.JsonNode;
+
+import static org.junit.Assert.*;
+import org.junit.Test;
+import org.junit.Ignore;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+@RunWith(JUnit4.class)
+public class TestMetaFields { // checks that the "fields" entry of a request's meta object limits which document fields appear on matches
+
+ @Test
+ public void searchMetaFields () throws IOException {
+
+ // Construct index
+ KorapIndex ki = new KorapIndex();
+ // Indexing test files
+ for (String i : new String[] {"00001",
+ "00002"}) {
+ ki.addDocFile(
+ getClass().getResource("/wiki/" + i + ".json.gz").getFile(), true
+ );
+ };
+ ki.commit();
+
+ String json = getString(
+ getClass().getResource("/queries/metas/fields.jsonld").getFile()
+ ); // fixture sets "fields":["UID","corpusID"] and "count":9
+
+ KorapSearch ks = new KorapSearch(json);
+ KorapResult kr = ks.run(ki);
+ assertEquals(17, kr.getTotalResults());
+ assertEquals(0, kr.getStartIndex());
+ assertEquals(9, kr.getItemsPerPage()); // equals "count":9 from the fixture
+
+ ObjectMapper mapper = new ObjectMapper();
+ JsonNode res = mapper.readTree(kr.toJSON());
+ assertEquals(0, res.at("/matches/0/UID").asInt()); // requested field
+ assertEquals("WPD", res.at("/matches/0/corpusID").asText()); // requested field
+ assertEquals("", res.at("/matches/0/docID").asText()); // from here on: fields not listed in the fixture are asserted to read as ""
+ assertEquals("", res.at("/matches/0/textSigle").asText());
+ assertEquals("", res.at("/matches/0/ID").asText());
+ assertEquals("", res.at("/matches/0/author").asText());
+ assertEquals("", res.at("/matches/0/title").asText());
+ assertEquals("", res.at("/matches/0/subTitle").asText());
+ assertEquals("", res.at("/matches/0/textClass").asText());
+ assertEquals("", res.at("/matches/0/pubPlace").asText());
+ assertEquals("", res.at("/matches/0/pubDate").asText());
+ assertEquals("", res.at("/matches/0/foundries").asText());
+ assertEquals("", res.at("/matches/0/layerInfo").asText());
+ assertEquals("", res.at("/matches/0/tokenization").asText());
+
+ json = getString(
+ getClass().getResource("/queries/metas/fields_2.jsonld").getFile()
+ ); // fixture sets "fields":["ID","docID","author","title","subTitle"] and "count":2
+ ks = new KorapSearch(json);
+ kr = ks.run(ki);
+ assertEquals(17, kr.getTotalResults()); // same query and collection as before, so the same total is expected
+ assertEquals(0, kr.getStartIndex());
+ assertEquals(2, kr.getItemsPerPage()); // equals "count":2 from the fixture
+
+ mapper = new ObjectMapper();
+ res = mapper.readTree(kr.toJSON());
+ assertEquals(0, res.at("/matches/0/UID").asInt()); // UID is not requested in this fixture
+ assertEquals("", res.at("/matches/0/corpusID").asText()); // corpusID is not requested in this fixture
+ assertEquals("Ruru,Jens.Ol,Aglarech", res.at("/matches/0/author").asText());
+ assertEquals("A", res.at("/matches/0/title").asText());
+ assertEquals("WPD_AAA.00001", res.at("/matches/0/docID").asText());
+ assertEquals("", res.at("/matches/0/textSigle").asText());
+ assertEquals("match-WPD_AAA.00001-p6-7", res.at("/matches/0/ID").asText());
+ assertEquals("", res.at("/matches/0/subTitle").asText()); // subTitle is requested yet asserted empty - presumably the indexed document has no subTitle; verify against the test data
+ assertEquals("", res.at("/matches/0/textClass").asText());
+ assertEquals("", res.at("/matches/0/pubPlace").asText());
+ assertEquals("", res.at("/matches/0/pubDate").asText());
+ assertEquals("", res.at("/matches/0/foundries").asText());
+ assertEquals("", res.at("/matches/0/layerInfo").asText());
+ assertEquals("", res.at("/matches/0/tokenization").asText());
+ };
+};
diff --git a/src/test/resources/queries/metas/fields.jsonld b/src/test/resources/queries/metas/fields.jsonld
new file mode 100644
index 0000000..42966ac
--- /dev/null
+++ b/src/test/resources/queries/metas/fields.jsonld
@@ -0,0 +1,30 @@
+{
+ "@context" : "http://ids-mannheim.de/ns/KorAP/json-ld/v0.2/context.jsonld",
+ "announcements" : [],
+ "collections" : [
+ {
+ "@type" : "korap:meta-filter",
+ "@value" : {
+ "@field" : "korap:field#corpusID",
+ "@type" : "korap:term",
+ "@value" : "WPD"
+ }
+ }
+ ],
+ "errors" : [],
+ "meta" : {
+ "fields":["UID","corpusID"],
+ "count":9
+ },
+ "query" : {
+ "@type" : "korap:token",
+ "wrap" : {
+ "@type" : "korap:term",
+ "key" : "Buchstabe",
+ "foundry" : "tt",
+ "layer" : "lemma",
+ "match" : "match:eq"
+ }
+ },
+ "warnings" : []
+}
diff --git a/src/test/resources/queries/metas/fields_2.jsonld b/src/test/resources/queries/metas/fields_2.jsonld
new file mode 100644
index 0000000..c141962
--- /dev/null
+++ b/src/test/resources/queries/metas/fields_2.jsonld
@@ -0,0 +1,30 @@
+{
+ "@context" : "http://ids-mannheim.de/ns/KorAP/json-ld/v0.2/context.jsonld",
+ "announcements" : [],
+ "collections" : [
+ {
+ "@type" : "korap:meta-filter",
+ "@value" : {
+ "@field" : "korap:field#corpusID",
+ "@type" : "korap:term",
+ "@value" : "WPD"
+ }
+ }
+ ],
+ "errors" : [],
+ "meta" : {
+ "fields":["ID","docID","author", "title", "subTitle"],
+ "count":2
+ },
+ "query" : {
+ "@type" : "korap:token",
+ "wrap" : {
+ "@type" : "korap:term",
+ "key" : "Buchstabe",
+ "foundry" : "tt",
+ "layer" : "lemma",
+ "match" : "match:eq"
+ }
+ },
+ "warnings" : []
+}