Support for arbitrary metadata fields (fixes #47)
Change-Id: I3b02195699633665c3afb995dd6dc7a922e6cd45
diff --git a/Changes b/Changes
index 8fa53fd..8c82163 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.58.4 2019-01-16
+0.58.4 2019-01-17
- [cleanup] Remove deprecated methods setLicense/getLicense,
setTokenization/getTokenization, setLayerInfo/getLayerInfo,
setField/getField (including json serialization)
@@ -7,6 +7,8 @@
AbstractDocument and FieldDocument (diewald)
- [cleanup] Remove hardwired (de)serialization of legacy metadata
fields (diewald)
+ - [feature] Support for arbitrary metadata fields (fixes #47)
+ (diewald)
0.58.3 2018-12-17
- [feature] Introduced attachements as meta data fields
diff --git a/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java b/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java
index 444d862..8856d98 100644
--- a/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java
@@ -142,7 +142,6 @@
public void populateFields (Document doc) {
-
HashSet<String> fieldList = new HashSet<>(32);
Iterator<IndexableField> fieldIterator = doc.getFields().iterator();
while (fieldIterator.hasNext())
@@ -554,6 +553,10 @@
return;
KrillDate date = new KrillDate(value);
+
+ if (date == null)
+ return;
+
mFields.add(
new MetaField(
key,
@@ -562,4 +565,9 @@
)
);
};
+    /** Adds a date field whose value is given as an integer (e.g. 20051210); delegates to addDate(String, String). */
+    @JsonIgnore
+    public void addDate (String key, int value) {
+        this.addDate(key, Integer.toString(value));
+    };
};
diff --git a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
index 4ca1ae6..de988ed 100644
--- a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
@@ -20,6 +20,7 @@
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.IntField;
+import org.apache.lucene.document.DoubleField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
@@ -70,7 +71,7 @@
private FieldType tvField = new FieldType(TextField.TYPE_STORED);
private FieldType tvNoField = new FieldType(TextField.TYPE_NOT_STORED);
private FieldType keywordField = new FieldType(TextField.TYPE_STORED);
-
+
{
tvField.setStoreTermVectors(true);
tvField.setStoreTermVectorPositions(true);
@@ -101,7 +102,7 @@
case "type:integer":
try {
int val = Integer.parseInt(mf.values.get(0));
- doc.add(new IntField(mf.key, val, Field.Store.YES));
+ doc.add(new DoubleField(mf.key, (double) val, Field.Store.YES));
}
catch (NumberFormatException ne) {
continue;
@@ -112,8 +113,7 @@
KrillDate date = new KrillDate(mf.values.get(0));
if (date != null) {
try {
- int dateInt = date.toInteger();
- doc.add(new IntField(mf.key, dateInt, Field.Store.YES));
+ doc.add(new IntField(mf.key, date.toInteger(), Field.Store.YES));
}
catch (NumberFormatException ne) {
continue;
@@ -276,6 +276,9 @@
};
this.addKeywords(key, sb.toString());
}
+ else if (type.equals("type:keywords")) {
+ this.addKeywords(key, field.get("value").asText());
+ }
else {
this.addString(key, field.get("value").asText());
};
@@ -291,6 +294,12 @@
this.addInt(key, field.get("value").asInt());
}
+ // Add store field
+ else if (type.equals("type:store")) {
+ value = field.get("value").asText();
+ this.addStored(key, value);
+ }
+
// Add attachement field
else if (type.equals("type:attachement")) {
value = field.get("value").asText();
@@ -302,12 +311,6 @@
// Add date field
else if (type.equals("type:date")) {
this.addDate(key, field.get("value").asText());
- /*
- KrillDate date = new KrillDate(field.get("value").asText());
- if (date != null) {
- this.addInt(key, date.toString());
- };
- */
}
// Unknown
diff --git a/src/main/java/de/ids_mannheim/korap/response/MetaField.java b/src/main/java/de/ids_mannheim/korap/response/MetaField.java
index e6d6ca2..94052d7 100644
--- a/src/main/java/de/ids_mannheim/korap/response/MetaField.java
+++ b/src/main/java/de/ids_mannheim/korap/response/MetaField.java
@@ -53,7 +53,7 @@
ObjectNode json = mapper.createObjectNode();
json.put("@type", "koral:field");
json.put("type", this.type);
- json.put("key", this.key);
+ json.put("key", this.key);
// Value is numerical
if (this.type.equals("type:integer")) {
@@ -74,9 +74,10 @@
};
}
+
// Value is textual or keywords
else {
- // Value is a list
+ // Value is a list
if (this.values.size() > 1) {
ArrayNode list = json.putArray("value");
diff --git a/src/main/java/de/ids_mannheim/korap/response/MetaFieldsObj.java b/src/main/java/de/ids_mannheim/korap/response/MetaFieldsObj.java
index 049fc3a..ea992b2 100644
--- a/src/main/java/de/ids_mannheim/korap/response/MetaFieldsObj.java
+++ b/src/main/java/de/ids_mannheim/korap/response/MetaFieldsObj.java
@@ -25,6 +25,7 @@
import java.util.regex.*;
import org.apache.lucene.index.*;
+import org.apache.lucene.document.FieldType;
public class MetaFieldsObj implements Iterable<MetaField> {
@@ -34,10 +35,7 @@
// This advices the java compiler to ignore all loggings
public static final boolean DEBUG = false;
- // TODO:
- // This is a temporary indicator to check
- // whether a date field is a date
- private static final Pattern dateKeyPattern = Pattern.compile(".*Date$");
+ private static final Pattern dateValuePattern = Pattern.compile("^([0-9]{8})$");
// Mapper for JSON serialization
ObjectMapper mapper = new ObjectMapper();
@@ -100,9 +98,10 @@
if (n != null) {
// Check if key indicates a date
- Matcher dateMatcher = dateKeyPattern.matcher(iField.name());
- if (dateMatcher.matches()) {
+ Matcher dateMatcher = dateValuePattern.matcher(n.toString());
+ if (dateMatcher.matches()) {
mf.type = "type:date";
+
KrillDate date = new KrillDate(n.toString());
if (date != null) {
@@ -114,7 +113,7 @@
// Field is a number
else {
mf.type = "type:integer";
- mf.values.add(n.toString());
+ mf.values.add(new Integer(n.intValue()).toString());
};
}
diff --git a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
index def7b39..7cdfb85 100644
--- a/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/collection/TestKrillCollectionIndex.java
@@ -1389,7 +1389,7 @@
fd.addString("ID", "doc-1");
fd.addString("author", "Frank");
fd.addKeywords("textClass", "Nachricht Kultur Reisen");
- fd.addInt("pubDate", 20051210);
+ fd.addDate("pubDate", 20051210);
fd.addText("text", "Der alte Mann ging über die Straße");
fd.addTV("tokens", "a b c", "[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>3]"
+ "[(2-3)s:b|i:b|_1$<i>2<i>3]" + "[(4-5)s:c|i:c|_2$<i>4<i>5]");
@@ -1403,7 +1403,7 @@
fd.addString("ID", "doc-2");
fd.addString("author", "Peter");
fd.addKeywords("textClass", "Kultur Reisen");
- fd.addInt("pubDate", 20051207);
+ fd.addDate("pubDate", 20051207);
fd.addText("text", "Der junge Mann hatte keine andere Wahl");
fd.addTV("tokens", "a c d", "[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>3]"
+ "[(2-3)s:c|i:c|_1$<i>2<i>3]" + "[(4-5)s:d|i:d|_2$<i>4<i>5]");
@@ -1417,7 +1417,7 @@
fd.addString("ID", "doc-3");
fd.addString("author", "Sebastian");
fd.addKeywords("textClass", "Reisen Finanzen");
- fd.addInt("pubDate", 20051216);
+ fd.addDate("pubDate", 20051216);
fd.addText("text", "Die Frau und der Mann küssten sich");
fd.addTV("tokens", "a d e", "[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>3]"
+ "[(2-3)s:d|i:d|_1$<i>2<i>3]" + "[(4-5)s:e|i:e|_2$<i>4<i>5]");
@@ -1430,7 +1430,7 @@
fd.addString("ID", "doc-5000");
fd.addString("author", "Sebastian");
fd.addKeywords("textClass", "Kultur Finanzen");
- fd.addInt("pubDate", 20180202);
+ fd.addDate("pubDate", 20180202);
fd.addText("text", "Die Frau und der Mann küssten sich");
fd.addTV("tokens", "a d e", "[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>3]"
+ "[(2-3)s:d|i:d|_1$<i>2<i>3]" + "[(4-5)s:e|i:e|_2$<i>4<i>5]");
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java b/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java
index dcaa5c3..141e60b 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestFieldDocument.java
@@ -26,6 +26,7 @@
import de.ids_mannheim.korap.response.Match;
import de.ids_mannheim.korap.response.Result;
import de.ids_mannheim.korap.util.QueryException;
+import static de.ids_mannheim.korap.response.MetaFieldsObj.*;
import org.apache.lucene.document.Document;
@@ -41,12 +42,13 @@
fd.addString("ID", "WPD-AAA-00001");
fd.addText("textClass", "music entertainment");
fd.addText("author", "Peter Frankenfeld");
- fd.addInt("pubDate", 20130617);
+ fd.addDate("pubDate", 20130617);
+ fd.addInt("justanumber", 12345678);
fd.addText("title", "Wikipedia");
fd.addText("subTitle", "Die freie Enzyklopädie");
fd.addStored("layerInfo", "opennlp/p=pos");
fd.addString("pubPlace", "Bochum");
- fd.addInt("lastModified", 20130717);
+ fd.addDate("lastModified", 20130717);
fd.addTV("tokens", "abc", "[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>10]"
+ "[(1-2)s:b|i:b|_1$<i>1<i>2]" + "[(2-3)s:c|i:c|_2$<i>2<i>3]");
fd.addAttachement("Wikilink", "data:application/x.korap-link,https://de.wikipedia.org/wiki/Beispiel");
@@ -55,7 +57,7 @@
assertEquals(doc.getField("title").name(), "title");
assertEquals(doc.getField("title").stringValue(), "Wikipedia");
-
+
assertEquals(doc.getField("corpusID").name(), "corpusID");
assertEquals(doc.getField("corpusID").stringValue(), "WPD");
@@ -90,6 +92,9 @@
assertEquals(doc.getField("Wikilink").stringValue(),
"data:application/x.korap-link,https://de.wikipedia.org/wiki/Beispiel"
);
+
+ assertEquals(doc.getField("justanumber").numericValue().intValue(), 12345678);
+
};
@@ -435,4 +440,187 @@
};
};
};
+
+    /** Indexes a document with arbitrary metadata fields and checks the Lucene document as well as the serialized field response. */
+    @Test
+    public void indexArbitraryMetaData () throws Exception {
+        String json =
+            "{"
+            + " \"fields\" : ["
+            + " { "
+            + " \"primaryData\" : \"abc\""
+            + " },"
+            + " {"
+            + " \"name\" : \"tokens\","
+            + " \"data\" : ["
+            + " [ \"s:a\", \"i:a\", \"_0$<i>0<i>1\", \"-:t$<i>3\"],"
+            + " [ \"s:b\", \"i:b\", \"_1$<i>1<i>2\" ],"
+            + " [ \"s:c\", \"i:c\", \"_2$<i>2<i>3\" ]"
+            + " ]"
+            + " }"
+            + " ],"
+            + " \"metaFields\" : ["
+            + " {"
+            + " \"@type\" : \"koral:field\","
+            + " \"type\" : \"type:string\","
+            + " \"key\" : \"textSigle\","
+            + " \"value\" : \"aa/bb/cc\""
+            + " },"
+            + " {"
+            + " \"@type\" : \"koral:field\","
+            + " \"type\" : \"type:integer\","
+            + " \"key\" : \"alter\","
+            + " \"value\" : 40"
+            + " },"
+            + " {"
+            + " \"@type\" : \"koral:field\","
+            + " \"type\" : \"type:string\","
+            + " \"key\" : \"name\","
+            + " \"value\" : \"Frank\""
+            + " },"
+            + " {"
+            + " \"@type\" : \"koral:field\","
+            + " \"type\" : \"type:string\","
+            + " \"key\" : \"name\","
+            + " \"value\" : \"Julian\""
+            + " },"
+            + " {"
+            + " \"@type\" : \"koral:field\","
+            + " \"type\" : \"type:string\","
+            + " \"key\" : \"schluesselwoerter\","
+            + " \"value\" : [\"musik\",\"unterhaltung\"]"
+            + " },"
+            + " {"
+            + " \"@type\" : \"koral:field\","
+            + " \"type\" : \"type:keywords\","
+            + " \"key\" : \"tags\","
+            + " \"value\" : \"nachrichten feuilleton\""
+            + " },"
+            + " {"
+            + " \"@type\" : \"koral:field\","
+            + " \"type\" : \"type:keywords\","
+            + " \"key\" : \"tags\","
+            + " \"value\" : [\"sport\",\"raetsel\"]"
+            + " },"
+            + " {"
+            + " \"@type\" : \"koral:field\","
+            + " \"type\" : \"type:text\","
+            + " \"key\" : \"titel\","
+            + " \"value\" : \"Der alte Baum\""
+            + " },"
+            + " {"
+            + " \"@type\" : \"koral:field\","
+            + " \"type\" : \"type:attachement\","
+            + " \"key\" : \"anhang\","
+            + " \"value\" : \"data:application/x.korap-link,http://spiegel.de/\""
+            + " },"
+            + " {"
+            + " \"@type\" : \"koral:field\","
+            + " \"type\" : \"type:store\","
+            + " \"key\" : \"referenz\","
+            + " \"value\" : \"So war das\""
+            + " },"
+            + " {"
+            + " \"@type\" : \"koral:field\","
+            + " \"type\" : \"type:date\","
+            + " \"key\" : \"datum\","
+            + " \"value\" : \"2018-04-03\""
+            + " }"
+            + " ]"
+            + "}";
+
+        KrillIndex ki = new KrillIndex();
+        FieldDocument fd = ki.addDoc(json);
+        ki.commit();
+
+        assertEquals("abc", fd.getPrimaryData());
+        assertEquals("40.0", fd.doc.getField("alter").stringValue()); // type:integer is indexed as a DoubleField
+        assertEquals("Frank", fd.doc.getField("name").stringValue()); // first of the two "name" entries
+        assertEquals("musik unterhaltung", fd.doc.getField("schluesselwoerter").stringValue());
+        assertEquals("nachrichten feuilleton sport raetsel", fd.doc.getField("tags").stringValue()); // both "tags" entries end up in one field
+        assertEquals("Der alte Baum", fd.doc.getField("titel").stringValue());
+        assertEquals("data:application/x.korap-link,http://spiegel.de/", fd.doc.getField("anhang").stringValue());
+        assertEquals("So war das", fd.doc.getField("referenz").stringValue());
+        assertEquals("20180403", fd.doc.getField("datum").stringValue()); // dates are stored as an 8-digit number
+
+        JsonNode res = ki.getFields("aa/bb/cc").toJsonNode();
+        Iterator<JsonNode> fieldIter = res.at("/document/fields").elements();
+
+        int checkC = 0;
+        while (fieldIter.hasNext()) {
+            JsonNode field = fieldIter.next();
+            String key = field.at("/key").asText();
+
+            switch (key) {
+                case "textSigle":
+                    assertEquals("type:string", field.at("/type").asText());
+                    assertEquals("koral:field", field.at("/@type").asText());
+                    assertEquals("aa/bb/cc", field.at("/value").asText());
+                    checkC++;
+                    break;
+
+                case "alter":
+                    assertEquals("type:integer", field.at("/type").asText());
+                    assertEquals("koral:field", field.at("/@type").asText());
+                    assertEquals(40, field.at("/value").asInt());
+                    checkC++;
+                    break;
+
+                case "name":
+                    assertEquals("type:string", field.at("/type").asText());
+                    assertEquals("koral:field", field.at("/@type").asText());
+                    assertEquals("Frank", field.at("/value").asText());
+                    checkC++;
+                    break;
+
+                case "schluesselwoerter":
+                    assertEquals("type:keywords", field.at("/type").asText());
+                    assertEquals("koral:field", field.at("/@type").asText());
+                    assertEquals("musik", field.at("/value/0").asText());
+                    assertEquals("unterhaltung", field.at("/value/1").asText());
+                    checkC++;
+                    break;
+
+                case "tags":
+                    assertEquals("type:keywords", field.at("/type").asText());
+                    assertEquals("koral:field", field.at("/@type").asText());
+                    assertEquals("nachrichten", field.at("/value/0").asText());
+                    assertEquals("feuilleton", field.at("/value/1").asText());
+                    assertEquals("sport", field.at("/value/2").asText());
+                    assertEquals("raetsel", field.at("/value/3").asText());
+                    checkC++;
+                    break;
+
+                case "titel":
+                    assertEquals("type:text", field.at("/type").asText());
+                    assertEquals("koral:field", field.at("/@type").asText());
+                    assertEquals("Der alte Baum", field.at("/value").asText());
+                    checkC++;
+                    break;
+
+                case "anhang":
+                    assertEquals("type:attachement", field.at("/type").asText());
+                    assertEquals("koral:field", field.at("/@type").asText());
+                    assertEquals("data:application/x.korap-link,http://spiegel.de/", field.at("/value").asText());
+                    checkC++;
+                    break;
+
+                case "referenz":
+                    assertEquals("type:store", field.at("/type").asText());
+                    assertEquals("koral:field", field.at("/@type").asText());
+                    assertEquals("So war das", field.at("/value").asText());
+                    checkC++;
+                    break;
+
+                case "datum":
+                    assertEquals("type:date", field.at("/type").asText());
+                    assertEquals("koral:field", field.at("/@type").asText());
+                    assertEquals("2018-04-03", field.at("/value").asText());
+                    checkC++;
+                    break;
+            };
+        };
+        // All nine metadata keys must have been visited; otherwise the loop passes even on an empty response
+        assertEquals(9, checkC);
+    };
};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
index 7b2df00..a346f79 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
@@ -1219,7 +1219,7 @@
fd.addString("corpusSigle", "c1");
fd.addString("docSigle", "c1/d1");
fd.addString("textSigle", "c1/d1/t1");
- fd.addInt("UID", 1);
+ fd.setUID(1);
fd.addTV("tokens", "abcabcabac",
"[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>10]"
+ "[(1-2)s:b|i:b|_1$<i>1<i>2]"
@@ -1239,7 +1239,7 @@
fd.addString("corpusSigle", "c1");
fd.addString("docSigle", "c1/d1");
fd.addString("textSigle", "c1/d1/t1");
- fd.addInt("UID", 2);
+ fd.setUID(2);
fd.addTV("tokens", "abcabcabac",
"[(0-1)s:a|i:a|_0$<i>0<i>1|-:t$<i>10]"
+ "[(1-2)s:b|i:b|_1$<i>1<i>2]"
diff --git a/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java b/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java
index edede59..c61a1f2 100644
--- a/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java
+++ b/src/test/java/de/ids_mannheim/korap/search/TestMetaFields.java
@@ -328,7 +328,7 @@
fd.addText("title", "Die Wahlverwandtschaften");
fd.addText("author", "Johann Wolfgang von Goethe");
fd.addKeywords("textClass", "reisen wissenschaft");
- fd.addInt("pubDate", 20130617);
+ fd.addDate("pubDate", 20130617);
fd.addTV("tokens", "abc", "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]"
+ "[(1-2)s:b|i:b|_1#1-2]" + "[(2-3)s:c|i:c|_2#2-3]");
fd.addAttachement("WikiLink", "data:application/x.korap-link,https://de.wikipedia.org/wiki/Beispiel");