Serialize field value lists (fixes #81)
Change-Id: I7901e679f9168668025730ce6b460c52a076f5e3
diff --git a/Changes b/Changes
index 17f1622..48d360c 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,6 @@
0.60.2 2022-01-03
- [security] More log4j updates (diewald)
+ - [feature] Support for field value vector method (fixes #81; diewald)
0.60.1 2021-12-17
- [feature] Added vc loading from classpath (margaretha)
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 819a713..40addf9 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -28,6 +28,7 @@
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
@@ -1875,5 +1876,74 @@
}
return fingerprints;
}
-
+
+
+    /**
+     * Return a vector of the values of a certain field, one entry per
+     * visited document with a non-empty value (no deduplication).
+     * This simplified "group" API should in the future be superseded by group.
+     *
+     * @param field The name of the field to read.
+     * @param collection The collection restricting the documents (gets bound to this index).
+     * @return The collected values; on an IOException or QueryException a warning
+     *         is logged and the values collected so far are returned.
+     */
+    public List<String> getFieldVector (String field, KrillCollection collection) {
+        collection.setIndex(this);
+
+        final List<String> fieldValues = new ArrayList<String>();
+        String fieldValue;
+
+        try {
+            final Filter filter = collection.toFilter();
+
+            // Get from filtered index
+            if (filter != null) {
+                // Iterate over all atomic readers and collect occurrences
+                for (LeafReaderContext atomic : this.reader().leaves()) {
+                    LeafReader lreader = atomic.reader();
+                    DocIdSet docids = filter.getDocIdSet(atomic, null);
+                    DocIdSetIterator docs = (docids == null) ? null : docids.iterator();
+                    // Nothing to iterate in this segment
+                    if (docs == null)
+                        continue;
+
+                    while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+                        fieldValue = lreader.document(docs.docID()).get(field);
+                        // Compare by content, as "!=" only compares references
+                        if (fieldValue != null && !fieldValue.isEmpty())
+                            fieldValues.add(fieldValue);
+                    }
+                }
+            }
+
+            // Get from unfiltered index
+            else {
+                // Iterate over all atomic readers and collect occurrences
+                for (LeafReaderContext atomic : this.reader().leaves()) {
+                    LeafReader lreader = atomic.reader();
+                    Bits live = lreader.getLiveDocs();
+
+                    for (int i = 0; i < lreader.maxDoc(); i++) {
+                        // Skip documents not flagged as live (null means no flags)
+                        if (live != null && !live.get(i))
+                            continue;
+                        fieldValue = lreader.document(i).get(field);
+                        if (fieldValue != null && !fieldValue.isEmpty())
+                            fieldValues.add(fieldValue);
+                    }
+                }
+            }
+        }
+
+        // Something went wrong
+        catch (IOException e) {
+            log.warn(e.getLocalizedMessage());
+        }
+        // E.g. reference corpus not found
+        catch (QueryException e) {
+            log.warn(e.getLocalizedMessage());
+        }
+        return fieldValues;
+    }
};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestKrillIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestKrillIndex.java
index 0e44573..2b224c3 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestKrillIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestKrillIndex.java
@@ -20,6 +20,8 @@
import de.ids_mannheim.korap.index.MultiTermTokenStream;
import de.ids_mannheim.korap.response.Result;
import de.ids_mannheim.korap.util.QueryException;
+import de.ids_mannheim.korap.collection.CollectionBuilder;
+import de.ids_mannheim.korap.KrillCollection;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.JsonNode;
@@ -375,4 +377,57 @@
assertEquals(1, checkC);
}
+
+    @Test
+    public void indexFieldVector () throws IOException {
+        KrillIndex ki = new KrillIndex();
+
+        FieldDocument fd = new FieldDocument();
+        fd.addString("textSigle", "aaaa");
+        ki.addDoc(fd);
+
+        fd = new FieldDocument();
+        fd.addString("textSigle", "bbbb");
+        fd.setUID("05678");
+        ki.addDoc(fd);
+
+        ki.commit();
+
+        CollectionBuilder cb = new CollectionBuilder();
+        KrillCollection kcn = new KrillCollection(ki);
+        // Collection without a builder: values of both documents are expected
+        List<String> fieldValues = ki.getFieldVector("textSigle", kcn);
+        assertEquals(2, fieldValues.size());
+        assertEquals("aaaa", fieldValues.get(0));
+        assertEquals("bbbb", fieldValues.get(1));
+        // Only the second document has a UID, expected back without the leading zero
+        fieldValues = ki.getFieldVector("UID", kcn);
+        assertEquals(1, fieldValues.size());
+        assertEquals("5678", fieldValues.get(0));
+        // Restrict the collection to a single textSigle
+        kcn.fromBuilder(cb.term("textSigle","bbbb"));
+        fieldValues = ki.getFieldVector("textSigle", kcn);
+        assertEquals(1, fieldValues.size());
+        assertEquals("bbbb", fieldValues.get(0));
+
+        // Add a third document in a further commit
+        fd = new FieldDocument();
+        fd.addString("textSigle", "cccc");
+        ki.addDoc(fd);
+
+        ki.commit();
+
+        kcn.fromBuilder(null);
+        fieldValues = ki.getFieldVector("textSigle", kcn);
+        assertEquals(3, fieldValues.size());
+        assertEquals("aaaa", fieldValues.get(0));
+        assertEquals("bbbb", fieldValues.get(1));
+        assertEquals("cccc", fieldValues.get(2));
+
+        kcn.fromBuilder(cb.orGroup().with(cb.term("textSigle","aaaa")).with(cb.term("textSigle","cccc")));
+        fieldValues = ki.getFieldVector("textSigle", kcn);
+        assertEquals(2, fieldValues.size());
+        assertEquals("aaaa", fieldValues.get(0));
+        assertEquals("cccc", fieldValues.get(1));
+    }
};