Protect against primary data breaches at a lower level
Change-Id: Ibb74fb94c67c325fd337fc6976779ca69f8788dc
diff --git a/Changes b/Changes
index 05d459e..d16a087 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.64.7 2026-05-05
+0.64.7 2026-05-06
- [bugfix] Keep highlights that extend beyond a cut match
(diewald; fixes #177; diewald; AI-assisted Claude Opus 4.6)
- [bugfix] Correctly handle foundry and layer in attribute groups
@@ -18,6 +18,8 @@
- [feature] Add krill.kwic.max.token as a convenience property
to cap total KWIC width; derives maxShrink values automatically
(diewald; AI-assisted Claude Opus 4.6)
+ - [enhancement] Improve low-level protection against primary data leaks
+ (diewald)
0.64.6 2026-03-09
- [performance] Add leaf cache. (diewald)
diff --git a/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java b/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java
index 2979822..8212ffb 100644
--- a/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java
@@ -119,8 +119,13 @@
public void populateDocument (Document doc, String field) {
List<String> fieldList = new ArrayList<>(32);
Iterator<IndexableField> fieldIterator = doc.getFields().iterator();
- while (fieldIterator.hasNext())
- fieldList.add(fieldIterator.next().name());
+ String name;
+ while (fieldIterator.hasNext()) {
+ name = fieldIterator.next().name();
+ if (name.equals("tokens") || name.equals("base"))
+ continue;
+ fieldList.add(name);
+ }
this.populateDocument(doc, field, fieldList);
};
@@ -140,6 +145,7 @@
List<String> fields) {
if (field != null)
this.setPrimaryData(doc.get(field));
+
this.populateFields(doc, fields);
};
@@ -147,8 +153,12 @@
public void populateFields (Document doc) {
ArrayList<String> fieldList = new ArrayList<>(32);
Iterator<IndexableField> fieldIterator = doc.getFields().iterator();
+ String name;
while (fieldIterator.hasNext()) {
- fieldList.add(fieldIterator.next().name());
+ name = fieldIterator.next().name();
+ if (name.equals("tokens") || name.equals("base"))
+ continue;
+ fieldList.add(name);
};
// TODO: Sort alphabetically!
@@ -171,7 +181,7 @@
String name = fieldsIter.next();
// Remember - never serialize "tokens"
- if (name.equals("tokens") || name.equals("UID"))
+ if (name.equals("tokens") || name.equals("base") || name.equals("UID"))
continue;
mFields.fieldsOrder.add(name);
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestPrimaryDataProtection.java b/src/test/java/de/ids_mannheim/korap/index/TestPrimaryDataProtection.java
new file mode 100644
index 0000000..11a6421
--- /dev/null
+++ b/src/test/java/de/ids_mannheim/korap/index/TestPrimaryDataProtection.java
@@ -0,0 +1,405 @@
+package de.ids_mannheim.korap.index;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import static de.ids_mannheim.korap.TestSimple.*;
+
+import de.ids_mannheim.korap.Krill;
+import de.ids_mannheim.korap.KrillIndex;
+import de.ids_mannheim.korap.query.QueryBuilder;
+import de.ids_mannheim.korap.response.Match;
+import de.ids_mannheim.korap.response.MetaFields;
+import de.ids_mannheim.korap.response.Result;
+import de.ids_mannheim.korap.response.Text;
+import de.ids_mannheim.korap.util.QueryException;
+
+
+/**
+ * Tests to verify that primary data stored in index fields
+ * ("tokens", "base") is never leaked through metadata APIs.
+ *
+ * The "tokens" and "base" fields use TextField.TYPE_STORED, so the
+ * full primary text is persisted in Lucene alongside term vectors.
+ * Without proper filtering, these fields would be serialized into
+ * JSON responses, exposing the complete text to API consumers.
+ */
+@RunWith(JUnit4.class)
+public class TestPrimaryDataProtection {
+
+ private static final String PRIMARY_TEXT = "abc def ghi";
+
+ private FieldDocument createDocWithTokensField () {
+ FieldDocument fd = new FieldDocument();
+ fd.addString("corpusSigle", "TST");
+ fd.addString("docSigle", "TST-001");
+ fd.addString("textSigle", "TST-001-0001");
+ fd.addText("title", "Test Document");
+ fd.addText("author", "Test Author");
+ fd.setUID(42);
+ fd.addTV("tokens", PRIMARY_TEXT,
+ "[(0-3)s:abc|i:abc|_0#0-3|-:t$<i>3]"
+ + "[(4-7)s:def|i:def|_1#4-7]"
+ + "[(8-11)s:ghi|i:ghi|_2#8-11]");
+ return fd;
+ };
+
+
+ private FieldDocument createDocWithBaseField () {
+ FieldDocument fd = new FieldDocument();
+ fd.addString("textSigle", "TST-002-0001");
+ fd.addText("title", "Base Field Document");
+ fd.addString("ID", "doc-base");
+ fd.setUID(43);
+ fd.addTV("base", PRIMARY_TEXT,
+ "[(0-3)s:abc|_0$<i>0<i>3|-:t$<i>3]"
+ + "[(4-7)s:def|_1$<i>4<i>7]"
+ + "[(8-11)s:ghi|_2$<i>8<i>11]");
+ return fd;
+ };
+
+
+ /**
+ * Test that getDoc() does not expose the "tokens" field value.
+ * getDoc() calls populateFields(doc) which must filter out "tokens".
+ */
+ @Test
+ public void testGetDocDoesNotLeakTokensField () throws IOException {
+ KrillIndex ki = new KrillIndex();
+ ki.addDoc(createDocWithTokensField());
+ ki.commit();
+
+ Text text = ki.getDoc("42");
+ String json = text.toJsonString();
+
+ assertFalse(
+ "JSON response from getDoc must not contain primary data",
+ json.contains(PRIMARY_TEXT)
+ );
+ assertFalse(
+ "JSON response from getDoc must not contain 'tokens' as a field key",
+ json.contains("\"key\":\"tokens\"")
+ );
+
+ assertEquals("Test Document", text.getFieldValue("title"));
+ assertEquals("Test Author", text.getFieldValue("author"));
+ };
+
+
+ /**
+ * Test that getDoc() does not expose the "base" field value.
+ */
+ @Test
+ public void testGetDocDoesNotLeakBaseField () throws IOException {
+ KrillIndex ki = new KrillIndex();
+ ki.addDoc(createDocWithBaseField());
+ ki.commit();
+
+ Text text = ki.getDoc("43");
+ String json = text.toJsonString();
+
+ assertFalse(
+ "JSON response from getDoc must not contain primary data from 'base' field",
+ json.contains(PRIMARY_TEXT)
+ );
+ assertFalse(
+ "JSON response must not contain 'base' as a field key",
+ json.contains("\"key\":\"base\"")
+ );
+ };
+
+
+ /**
+ * Test that getFields() with @all does not leak tokens/base fields.
+ */
+ @Test
+ public void testGetFieldsAllDoesNotLeakPrimaryData () throws IOException {
+ KrillIndex ki = new KrillIndex();
+ ki.addDoc(createDocWithTokensField());
+ ki.commit();
+
+ MetaFields mf = ki.getFields("TST-001-0001");
+ JsonNode res = mf.toJsonNode();
+ String json = res.toString();
+
+ assertFalse(
+ "getFields(@all) must not contain primary data",
+ json.contains(PRIMARY_TEXT)
+ );
+ assertFalse(
+ "getFields(@all) must not contain 'tokens' field key",
+ json.contains("\"key\":\"tokens\"")
+ );
+ assertFalse(
+ "getFields(@all) must not contain 'base' field key",
+ json.contains("\"key\":\"base\"")
+ );
+
+ Iterator<JsonNode> fieldIter = res.at("/document/fields").elements();
+ while (fieldIter.hasNext()) {
+ JsonNode field = fieldIter.next();
+ String key = field.at("/key").asText();
+ assertFalse(
+ "No field should be named 'tokens'",
+ key.equals("tokens")
+ );
+ assertFalse(
+ "No field should be named 'base'",
+ key.equals("base")
+ );
+ };
+ };
+
+
+ /**
+ * Test that getFields() with an explicit field list that includes
+ * "tokens" does not return the stored primary data.
+ */
+ @Test
+ public void testGetFieldsExplicitTokensDoesNotLeak () throws IOException {
+ KrillIndex ki = new KrillIndex();
+ ki.addDoc(createDocWithTokensField());
+ ki.commit();
+
+ ArrayList<String> fields = new ArrayList<>();
+ fields.add("tokens");
+ fields.add("title");
+
+ MetaFields mf = ki.getFields("TST-001-0001", fields);
+ JsonNode res = mf.toJsonNode();
+ String json = res.toString();
+
+ assertFalse(
+ "Explicitly requesting 'tokens' field must not leak primary data",
+ json.contains(PRIMARY_TEXT)
+ );
+ };
+
+
+ /**
+ * Test that search results with @all fields do not leak tokens/base.
+ */
+ @Test
+ public void testSearchResultsAllFieldsDoNotLeak () throws IOException, QueryException {
+ KrillIndex ki = new KrillIndex();
+ ki.addDoc(createDocWithTokensField());
+ ki.commit();
+
+ QueryBuilder kq = new QueryBuilder("tokens");
+ Krill ks = new Krill(kq.seg("i:abc").toQuery());
+ ks.getMeta().addField("@all");
+ ks.getMeta().setSnippets(true);
+
+ Result kr = ks.apply(ki);
+ assertEquals((long) 1, kr.getTotalResults());
+
+ ObjectMapper mapper = new ObjectMapper();
+ JsonNode res = mapper.readTree(kr.toJsonString());
+ String resultJson = kr.toJsonString();
+
+ assertFalse(
+ "Search results with @all fields must not leak primary data as a field value",
+ resultJson.contains("\"key\":\"tokens\"")
+ );
+
+ Iterator<JsonNode> matches = res.at("/matches").elements();
+ while (matches.hasNext()) {
+ JsonNode match = matches.next();
+ Iterator<JsonNode> fields = match.at("/fields").elements();
+ while (fields.hasNext()) {
+ JsonNode field = fields.next();
+ String key = field.at("/key").asText();
+ assertFalse(
+ "Match field should not be 'tokens'",
+ key.equals("tokens")
+ );
+ assertFalse(
+ "Match field should not be 'base'",
+ key.equals("base")
+ );
+ };
+ };
+ };
+
+
+ /**
+ * Test that search results with default fields do not leak primary data.
+ */
+ @Test
+ public void testSearchResultsDefaultFieldsDoNotLeak () throws IOException, QueryException {
+ KrillIndex ki = new KrillIndex();
+ ki.addDoc(createDocWithTokensField());
+ ki.commit();
+
+ QueryBuilder kq = new QueryBuilder("tokens");
+ Krill ks = new Krill(kq.seg("i:abc").toQuery());
+ ks.getMeta().setSnippets(true);
+
+ Result kr = ks.apply(ki);
+ assertEquals((long) 1, kr.getTotalResults());
+
+ String resultJson = kr.toJsonString();
+ assertFalse(
+ "Default search results must not contain primary data as a field",
+ resultJson.contains("\"key\":\"tokens\"")
+ );
+ assertFalse(
+ "Default search results must not contain primary data as a field",
+ resultJson.contains("\"key\":\"base\"")
+ );
+ };
+
+
+ /**
+ * Test that getFieldVector returns empty for tokens/base.
+ */
+ @Test
+ public void testGetFieldVectorProtectsTokensAndBase () throws IOException {
+ KrillIndex ki = new KrillIndex();
+ ki.addDoc(createDocWithTokensField());
+ ki.commit();
+
+ de.ids_mannheim.korap.KrillCollection kc =
+ new de.ids_mannheim.korap.KrillCollection(ki);
+
+ List<String> tokenValues = ki.getFieldVector("tokens", kc);
+ assertEquals(
+ "getFieldVector for 'tokens' must return empty list",
+ 0, tokenValues.size()
+ );
+
+ List<String> baseValues = ki.getFieldVector("base", kc);
+ assertEquals(
+ "getFieldVector for 'base' must return empty list",
+ 0, baseValues.size()
+ );
+ };
+
+
+ /**
+ * Test that a serialized search match does not expose tokens/base as metadata fields.
+ */
+ @Test
+ public void testMatchInfoDoesNotLeakPrimaryData () throws IOException, QueryException {
+ KrillIndex ki = new KrillIndex();
+ ki.addDoc(createDocWithTokensField());
+ ki.commit();
+
+ QueryBuilder kq = new QueryBuilder("tokens");
+ Krill ks = new Krill(kq.seg("i:abc").toQuery());
+ ks.getMeta().setSnippets(true);
+
+ Result kr = ks.apply(ki);
+ assertEquals((long) 1, kr.getTotalResults());
+
+ Match match = kr.getMatch(0);
+ String matchJson = match.toJsonString();
+
+ assertFalse(
+ "Match JSON must not contain 'tokens' as a metadata field key",
+ matchJson.contains("\"key\":\"tokens\"")
+ );
+ assertFalse(
+ "Match JSON must not contain 'base' as a metadata field key",
+ matchJson.contains("\"key\":\"base\"")
+ );
+ };
+
+
+ /**
+ * Test that populateFields called with a document containing both
+ * 'tokens' and 'base' fields does not include either in the result.
+ */
+ @Test
+ public void testPopulateFieldsFiltersBothTokensAndBase () throws IOException {
+ KrillIndex ki = new KrillIndex();
+
+ FieldDocument fd = new FieldDocument();
+ fd.addString("textSigle", "TST-003-0001");
+ fd.addText("title", "Dual Field Document");
+ fd.setUID(44);
+ fd.addTV("tokens", "primary text in tokens",
+ "[(0-7)s:primary|_0#0-7|-:t$<i>4]"
+ + "[(8-12)s:text|_1#8-12]"
+ + "[(13-15)s:in|_2#13-15]"
+ + "[(16-22)s:tokens|_3#16-22]");
+ fd.addTV("base", "primary text in base",
+ "[(0-7)s:primary|_0$<i>0<i>7|-:t$<i>4]"
+ + "[(8-12)s:text|_1$<i>8<i>12]"
+ + "[(13-15)s:in|_2$<i>13<i>15]"
+ + "[(16-20)s:base|_3$<i>16<i>20]");
+ ki.addDoc(fd);
+ ki.commit();
+
+ Text text = ki.getDoc("44");
+ String json = text.toJsonString();
+
+ assertFalse(
+ "Must not contain primary text from 'tokens' field",
+ json.contains("primary text in tokens")
+ );
+ assertFalse(
+ "Must not contain primary text from 'base' field",
+ json.contains("primary text in base")
+ );
+ assertFalse(
+ "Must not contain field key 'tokens'",
+ json.contains("\"key\":\"tokens\"")
+ );
+ assertFalse(
+ "Must not contain field key 'base'",
+ json.contains("\"key\":\"base\"")
+ );
+
+ assertEquals("Dual Field Document", text.getFieldValue("title"));
+ };
+
+
+ /**
+ * Test that a custom TV field (not named "tokens" or "base") with
+ * stored primary data WILL appear in the output. This documents
+ * the current behavior: only "tokens" and "base" are protected.
+ *
+ * If additional TV field names are used in the future to store
+ * primary data, they must be added to the filter list.
+ */
+ @Test
+ public void testCustomTvFieldIsNotFiltered () throws IOException {
+ KrillIndex ki = new KrillIndex();
+
+ FieldDocument fd = new FieldDocument();
+ fd.addString("textSigle", "TST-004-0001");
+ fd.addText("title", "Custom Field Document");
+ fd.setUID(45);
+ fd.addTV("customTokens", "leaked custom text",
+ "[(0-6)s:leaked|_0#0-6|-:t$<i>3]"
+ + "[(7-13)s:custom|_1#7-13]"
+ + "[(14-18)s:text|_2#14-18]");
+ ki.addDoc(fd);
+ ki.commit();
+
+ Text text = ki.getDoc("45");
+ String json = text.toJsonString();
+
+ assertTrue(
+ "Custom TV field IS exposed (not filtered) - this is documented behavior. "
+ + "Only 'tokens' and 'base' are filtered.",
+ json.contains("leaked custom text")
+ || json.contains("\"key\":\"customTokens\"")
+ );
+ };
+};