Add support for randomized page order
Resolves #182
Change-Id: I93ae7705cf4612f8e893dee2ccf52992272ea88b
diff --git a/src/main/java/de/ids_mannheim/korap/plkexport/Exporter.java b/src/main/java/de/ids_mannheim/korap/plkexport/Exporter.java
index 6a5357e..1b622ac 100644
--- a/src/main/java/de/ids_mannheim/korap/plkexport/Exporter.java
+++ b/src/main/java/de/ids_mannheim/korap/plkexport/Exporter.java
@@ -13,6 +13,7 @@
// Implemented by MatchAggregator
public boolean init (String s) throws IOException;
+ public boolean initMeta (String s) throws IOException;
public Exporter finish () throws IOException;
public void setMeta(JsonNode n);
public void setQuery(JsonNode n);
@@ -34,6 +35,8 @@
public int getTotalResults ();
public boolean hasTimeExceeded ();
public void setMaxResults (int m);
+ public void setSeed (Long s);
+ public Long getSeed ();
public void setSse (EventOutput sse);
public void forceFile ();
public String getExportID ();
diff --git a/src/main/java/de/ids_mannheim/korap/plkexport/JsonExporter.java b/src/main/java/de/ids_mannheim/korap/plkexport/JsonExporter.java
index c4bb60b..95e4cc2 100644
--- a/src/main/java/de/ids_mannheim/korap/plkexport/JsonExporter.java
+++ b/src/main/java/de/ids_mannheim/korap/plkexport/JsonExporter.java
@@ -62,6 +62,17 @@
.append(this.getCollection().toString());
};
+ // Add seed when randomization was used
+ if (this.getSeed() != null) {
+ if (header) {
+ w.append(',');
+ } else {
+ header = true;
+ };
+ w.append("\"seed\":")
+ .append(Long.toString(this.getSeed()));
+ };
+
if (header)
w.append(',');
diff --git a/src/main/java/de/ids_mannheim/korap/plkexport/MatchAggregator.java b/src/main/java/de/ids_mannheim/korap/plkexport/MatchAggregator.java
index 480f724..86cd462 100644
--- a/src/main/java/de/ids_mannheim/korap/plkexport/MatchAggregator.java
+++ b/src/main/java/de/ids_mannheim/korap/plkexport/MatchAggregator.java
@@ -54,6 +54,9 @@
maxResults = -1,
fetchedResults = 0;
+ // Seed for randomized page order (null = not randomized)
+ private Long seed = null;
+
// Event writer for progress
private EventOutput evOut;
@@ -249,6 +252,24 @@
return this.maxResults;
};
+
+ /**
+ * Set the seed used for randomized page order.
+ * A non-null value indicates randomization was active.
+ */
+ public void setSeed (Long seed) {
+ this.seed = seed;
+ };
+
+
+ /**
+ * Get the seed used for randomized page order.
+ * Returns null if randomization was not active.
+ */
+ public Long getSeed () {
+ return this.seed;
+ };
+
/**
* Get the export ID which is the pointer
@@ -392,6 +413,54 @@
return this.iterateThroughMatches(root.get("matches"));
};
+
+ /**
+ * Parse initial JSON file to get header information
+ * (meta data, query, collection) and write the export header,
+ * but do NOT process matches yet.
+ * This is used when randomizing page order, so that page 0's
+ * matches can be included in the shuffled sequence.
+ */
+ public boolean initMeta (String resp) throws IOException, JsonParseException {
+
+ if (resp == null)
+ return false;
+
+ JsonParser parser = mapper.getFactory().createParser(resp);
+ JsonNode root = mapper.readTree(parser);
+
+ if (root == null)
+ return false;
+
+ JsonNode meta = root.get("meta");
+ this.setMeta(meta);
+ this.setQuery(root.get("query"));
+ this.setCollection(root.get("collection"));
+
+ if (meta != null) {
+ if (meta.has("totalResults")) {
+ this.totalResults = meta.get("totalResults").asInt();
+ };
+ if (meta.has("timeExceeded")) {
+ this.timeExceeded = meta.get("timeExceeded").asBoolean();
+ };
+ };
+
+ // If no writer was set up yet (i.e. forceFile() was not issued),
+ // fall back to an in-memory string writer before writing the header
+ if (writer == null) {
+ this.file = null;
+ writer = new StringWriter();
+ };
+
+ // Write header to exporter
+ this.writeHeader(writer);
+
+ // Do NOT process matches - they will be added later
+ // via appendMatches() in shuffled page order
+ return true;
+ };
+
/**
* Finalize the export stream.
diff --git a/src/main/java/de/ids_mannheim/korap/plkexport/RtfExporter.java b/src/main/java/de/ids_mannheim/korap/plkexport/RtfExporter.java
index 62828a4..1ad9ec1 100644
--- a/src/main/java/de/ids_mannheim/korap/plkexport/RtfExporter.java
+++ b/src/main/java/de/ids_mannheim/korap/plkexport/RtfExporter.java
@@ -217,6 +217,11 @@
this.addInfoRow(w, "Backend-Version", this.getMeta().get("version").asText());
};
this.addInfoRow(w, "Export-Version", ExWSConf.version());
+
+ // Seed information (only when randomized page order was used)
+ if (this.getSeed() != null) {
+ this.addInfoRow(w, "Seed", Long.toString(this.getSeed()));
+ };
};
diff --git a/src/main/java/de/ids_mannheim/korap/plkexport/Service.java b/src/main/java/de/ids_mannheim/korap/plkexport/Service.java
index b88f8e0..61a4564 100644
--- a/src/main/java/de/ids_mannheim/korap/plkexport/Service.java
+++ b/src/main/java/de/ids_mannheim/korap/plkexport/Service.java
@@ -9,11 +9,16 @@
import java.util.Base64;
import java.util.ResourceBundle;
import java.util.Locale;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import jakarta.ws.rs.BadRequestException;
import jakarta.ws.rs.WebApplicationException;
+import jakarta.ws.rs.DefaultValue;
import jakarta.ws.rs.FormParam;
import jakarta.ws.rs.QueryParam;
import jakarta.ws.rs.PathParam;
@@ -136,7 +141,9 @@
String ql,
String cutoffStr,
int hitc,
- EventOutput eventOutput
+ EventOutput eventOutput,
+ boolean randomizePageOrder,
+ long seed
) throws WebApplicationException {
// These parameters are mandatory
@@ -238,6 +245,8 @@
exp.setMaxResults(maxResults);
exp.setQueryString(q);
exp.setCorpusQueryString(cq);
+ if (randomizePageOrder)
+ exp.setSeed(seed);
if (source != null)
exp.setSource(source);
else
@@ -256,13 +265,21 @@
exp.forceFile();
};
- // Initialize export with meta data
- // and first matches
+ // When randomizing, use initMeta() to extract header info
+ // without processing page 0's matches yet, so page 0 can
+ // be included in the shuffled page sequence.
+ // Save the initial response to replay page 0's matches later.
+ String initResp = resp;
+
try {
// TODO:
// Check return value.
- exp.init(resp);
+ if (randomizePageOrder) {
+ exp.initMeta(resp);
+ } else {
+ exp.init(resp);
+ }
}
catch (Exception e) {
@@ -293,7 +310,7 @@
// for a temporary export file, unless progress is
// requested. In case all matches are already fetched,
// stop here as well.
- if (cutoff || fetchCount <= pageSize) {
+ if (!randomizePageOrder && (cutoff || fetchCount <= pageSize)) {
try {
@@ -329,17 +346,52 @@
uri.queryParam("offset", "{offset}");
try {
-
- // Iterate over all results
- for (int i = pageSize; i <= fetchCount; i+=pageSize) {
- resource = client.target(uri.build(i));
- reqBuilder = resource.request(MediaType.APPLICATION_JSON);
- resp = authBuilder(reqBuilder, xff, auth).get(String.class);
+ if (randomizePageOrder) {
- // Stop when no more matches are allowed
- if (!exp.appendMatches(resp))
- break;
+ // When randomizing page order, compute all possible page offsets
+ // up to totalResults (not just fetchCount) to sample broadly.
+ // NOTE(review): allocates one Integer per page — confirm acceptable for huge result sets.
+ int totalForPages = exp.getTotalResults();
+ if (totalForPages < fetchCount)
+ totalForPages = fetchCount;
+
+ // Build list of ALL page offsets including page 0
+ List<Integer> pageOffsets = new ArrayList<>();
+ for (int i = 0; i < totalForPages; i += pageSize) {
+ pageOffsets.add(i);
+ }
+ Collections.shuffle(pageOffsets, new Random(seed));
+
+ // Fetch pages in random order until maxResults are collected
+ for (int offset : pageOffsets) {
+ if (offset == 0) {
+ // Use the already-fetched initial response for page 0
+ if (!exp.appendMatches(initResp))
+ break;
+ } else {
+ resource = client.target(uri.build(offset));
+ reqBuilder = resource.request(MediaType.APPLICATION_JSON);
+ resp = authBuilder(reqBuilder, xff, auth).get(String.class);
+
+ // Stop when no more matches are allowed
+ if (!exp.appendMatches(resp))
+ break;
+ }
+ }
+ }
+ else {
+ // Iterate over all results sequentially
+ for (int i = pageSize; i <= fetchCount; i+=pageSize) {
+
+ resource = client.target(uri.build(i));
+ reqBuilder = resource.request(MediaType.APPLICATION_JSON);
+ resp = authBuilder(reqBuilder, xff, auth).get(String.class);
+
+ // Stop when no more matches are allowed
+ if (!exp.appendMatches(resp))
+ break;
+ }
}
// Close all export writers
@@ -396,10 +448,14 @@
@FormParam("cq") String cq,
@FormParam("ql") String ql,
@FormParam("cutoff") String cutoffStr,
- @FormParam("hitc") int hitc
+ @FormParam("hitc") int hitc,
+ @FormParam("randomizePageOrder") String randomizePageOrderStr,
+ @DefaultValue("42") @FormParam("seed") long seed
) throws IOException {
- Exporter exp = export(fname, format, q, cq, ql, cutoffStr, hitc, null);
+ boolean randomize = "true".equals(randomizePageOrderStr);
+
+ Exporter exp = export(fname, format, q, cq, ql, cutoffStr, hitc, null, randomize, seed);
return exp.serve().build();
};
@@ -437,9 +493,13 @@
@QueryParam("cq") String cq,
@QueryParam("ql") String ql,
@QueryParam("cutoff") String cutoffStr,
- @QueryParam("hitc") int hitc
+ @QueryParam("hitc") int hitc,
+ @QueryParam("randomizePageOrder") String randomizePageOrderStr,
+ @DefaultValue("42") @QueryParam("seed") long seed
) throws InterruptedException {
+ boolean randomize = "true".equals(randomizePageOrderStr);
+
// See
// https://www.baeldung.com/java-ee-jax-rs-sse
// https://www.howopensource.com/2016/01/java-sse-chat-example/
@@ -463,7 +523,7 @@
eventBuilder.data("init");
eventOutput.write(eventBuilder.build());
Exporter exp = export(
- fname, format, q, cq, ql, cutoffStr, hitc, eventOutput
+ fname, format, q, cq, ql, cutoffStr, hitc, eventOutput, randomize, seed
);
if (eventOutput.isClosed())
@@ -706,6 +766,7 @@
String path = prop.getProperty("asset.path", "");
String defaultHitc = prop.getProperty("conf.default_hitc", "100");
int maxHitc = Integer.parseInt(prop.getProperty("conf.max_exp_limit", "10000"));
+ int pageSize = Integer.parseInt(prop.getProperty("conf.page_size", "5"));
UriBuilder uri = UriBuilder.fromPath("")
.host(host)
@@ -720,6 +781,7 @@
templateData.put("assetPath", uri.build());
templateData.put("defaultHitc", defaultHitc);
templateData.put("maxHitc", maxHitc);
+ templateData.put("pageSize", pageSize);
templateData.put("announcement", prop.getProperty("announcement"));
// There is an error code to pass
diff --git a/src/main/resources/assets/templates/export.ftl b/src/main/resources/assets/templates/export.ftl
index 19cce82..2b14a5b 100644
--- a/src/main/resources/assets/templates/export.ftl
+++ b/src/main/resources/assets/templates/export.ftl
@@ -68,6 +68,12 @@
width: 20%;
}
+ #hitc {
+ width: 8em;
+ }
+ .form-table input {
+ min-width: 0;
+ }
</style>
</head>
<body>
@@ -123,8 +129,27 @@
<fieldset class="form-line">
<legend>${dict.hitc}</legend>
- <input name="hitc" id="hitc" type="number" min="1" max="${maxHitc?c}" value="${defaultHitc}" />
- <p style="font-size: 80%; margin-top: .2em; margin-bottom: 0;">${dict.max_hitc} <tt>${maxHitc}</tt></p>
+ <div style="display: flex; margin-top: 0.5em; align-items: flex-start; flex-wrap: wrap; gap: 1em 0.3em;">
+ <div>
+ <input name="hitc" id="hitc" type="number" min="1" max="${maxHitc?c}" value="${defaultHitc}" />
+ <p style="font-size: 80%; margin-top: .2em; margin-bottom: 0;">${dict.max_hitc} <tt>${maxHitc}</tt></p>
+ </div>
+ <div style="display: flex; align-items: center; gap: 1em;">
+ <input type="checkbox"
+ id="randomizePageOrder"
+ name="randomizePageOrder"
+ value="true"
+ style="align-self: center; min-width: 0;" />
+ <label for="randomizePageOrder">
+ ${dict.randomize_page_order}
+ <span class="desc">${dict.randomize_page_order_desc?replace("{0}", pageSize?c)}</span>
+ </label>
+ </div>
+ <div style="display: flex; align-items: center;">
+ <label for="seed">${dict.seed}:</label>
+ <input name="seed" id="seed" type="number" min="0" value="42" style="width: 7em;" />
+ </div>
+ </div>
</fieldset>
<progress id="progress" value="0" max="100" style="display: none;">0%</progress>
diff --git a/src/main/resources/locales/export.properties b/src/main/resources/locales/export.properties
index 49dcbc9..d7f4008 100644
--- a/src/main/resources/locales/export.properties
+++ b/src/main/resources/locales/export.properties
@@ -6,4 +6,7 @@
banner = Experimental
with_ql = with
in_cq = in
-info = Depending on the settings and add-ons of your browser, it may not be possible to export corpus excerpts that require authentication.
\ No newline at end of file
+info = Depending on the settings and add-ons of your browser, it may not be possible to export corpus excerpts that require authentication.
+randomize_page_order = Randomized page order
+randomize_page_order_desc = (page size={0}).
+seed = Seed
\ No newline at end of file
diff --git a/src/main/resources/locales/export_de.properties b/src/main/resources/locales/export_de.properties
index 4a3aee5..3b76a34 100644
--- a/src/main/resources/locales/export_de.properties
+++ b/src/main/resources/locales/export_de.properties
@@ -10,3 +10,6 @@
with_ql = mit
in_cq = in
info = Abh\u00e4ngig von Ihren Browser-Einstellungen und -Erweiterungen kann der Export von Korpusbelegen, die eine Authentifizierung ben\u00f6tigen, nicht m\u00f6glich sein.
+randomize_page_order = Randomisierte Seitenreihenfolge
+randomize_page_order_desc = (Seitengr\u00f6\u00dfe={0}).
+seed = Seed
diff --git a/src/test/java/de/ids_mannheim/korap/plkexport/ServiceTest.java b/src/test/java/de/ids_mannheim/korap/plkexport/ServiceTest.java
index d56f703..0f4afec 100644
--- a/src/test/java/de/ids_mannheim/korap/plkexport/ServiceTest.java
+++ b/src/test/java/de/ids_mannheim/korap/plkexport/ServiceTest.java
@@ -4,6 +4,7 @@
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.fail;
// Mockserver tests
@@ -23,7 +24,10 @@
import java.io.InputStreamReader;
import java.net.URLDecoder;
+import java.util.ArrayList;
+import java.util.HashSet;
import java.util.LinkedList;
+import java.util.List;
import java.util.Properties;
import jakarta.ws.rs.client.Entity;
@@ -519,6 +523,238 @@
assertTrue("Export-Version", str.contains("Export-Version:" + CELLSPLIT));
}
+
+ @Test
+ public void testExportWsRTFPagingRandomized () {
+
+ mockClient.reset().when(
+ request()
+ .withMethod("GET")
+ .withPath("/api/v1.0/search")
+ .withQueryStringParameter("q", "Plagegeist")
+ .withQueryStringParameter("count", "5")
+ .withQueryStringParameter("offset", "5")
+ )
+ .respond(
+ response()
+ .withHeaders(new Header("Content-Type", "application/json; charset=UTF-8"))
+ .withBody(getFixture("response_plagegeist_2.json"))
+ .withStatusCode(200)
+ );
+
+ mockClient.when(
+ request()
+ .withMethod("GET")
+ .withPath("/api/v1.0/search")
+ .withQueryStringParameter("q", "Plagegeist")
+ )
+ .respond(
+ response()
+ .withHeaders(new Header("Content-Type", "application/json; charset=UTF-8"))
+ .withBody(getFixture("response_plagegeist_1.json"))
+ .withStatusCode(200)
+ );
+
+ MultivaluedHashMap<String, String> frmap = new MultivaluedHashMap<String, String>();
+ frmap.add("format", "rtf");
+ frmap.add("q", "Plagegeist");
+ frmap.add("ql", "poliqarp");
+ frmap.add("hitc", "30");
+ frmap.add("randomizePageOrder", "true");
+ String filenamer = "dateiPagingRtfRandom";
+ frmap.putSingle("fname", filenamer);
+
+ Response responsertf = target("/export").request()
+ .post(Entity.form(frmap));
+ assertEquals("Request RTF with randomize: Http Response should be 200: ",
+ Status.OK.getStatusCode(), responsertf.getStatus());
+
+ String str = responsertf.readEntity(String.class);
+ // Both pages should still be fetched (order is random but all content present)
+ assertTrue("Page 1 content", str.contains("Ironhoof"));
+ assertTrue("Page 2 content", str.contains("Sinologie"));
+ assertTrue("TotalResults", str.contains("Number of results:"));
+ }
+
+
+ @Test
+ public void testExportWsJsonPagingRandomized () throws IOException {
+
+ mockClient.reset().when(
+ request()
+ .withMethod("GET")
+ .withPath("/api/v1.0/search")
+ .withQueryStringParameter("q", "Plagegeist")
+ .withQueryStringParameter("count", "5")
+ .withQueryStringParameter("offset", "5")
+ )
+ .respond(
+ response()
+ .withHeaders(new Header("Content-Type", "application/json; charset=UTF-8"))
+ .withBody(getFixture("response_plagegeist_2.json"))
+ .withStatusCode(200)
+ );
+
+ mockClient.when(
+ request()
+ .withMethod("GET")
+ .withPath("/api/v1.0/search")
+ .withQueryStringParameter("q", "Plagegeist")
+ )
+ .respond(
+ response()
+ .withHeaders(new Header("Content-Type", "application/json; charset=UTF-8"))
+ .withBody(getFixture("response_plagegeist_1.json"))
+ .withStatusCode(200)
+ );
+
+ MultivaluedHashMap<String, String> frmap = new MultivaluedHashMap<String, String>();
+ frmap.add("format", "json");
+ frmap.add("q", "Plagegeist");
+ frmap.add("ql", "poliqarp");
+ frmap.add("hitc", "30");
+ frmap.add("randomizePageOrder", "true");
+ String filenamer = "dateiPagingJsonRandom";
+ frmap.putSingle("fname", filenamer);
+
+ Response responsejson = target("/export").request()
+ .post(Entity.form(frmap));
+ assertEquals("Request JSON with randomize: Http Response should be 200: ",
+ Status.OK.getStatusCode(), responsejson.getStatus());
+
+ String str = responsejson.readEntity(String.class);
+ JsonParser parser = mapper.getFactory().createParser(str);
+ JsonNode obj = mapper.readTree(parser);
+
+ assertEquals(obj.at("/query/@type").asText(),"koral:token");
+ assertEquals(obj.at("/meta/totalResults").asInt(),9);
+ // All 9 matches should be present
+ assertEquals("Has 9 matches", 9, obj.at("/matches").size());
+ }
+
+
+ /**
+ * Verify that page 0 is included in the shuffle, so randomization
+ * is effective even with just 2 pages. Uses page_size=2 with 5
+ * pages of distinct matches to confirm different seeds produce
+ * different match orderings, including a different first match.
+ */
+ @Test
+ public void testExportWsJsonRandomizeShufflesPage0 () throws IOException {
+
+ // Set page_size to 2 so we get 5 pages from 9 results
+ Properties properties = ExWSConf.properties(null);
+ String origPageSize = properties.getProperty("conf.page_size", "5");
+ properties.setProperty("conf.page_size", "2");
+
+ try {
+ // Mock 5 pages with distinct match IDs per page:
+ // offset 0 -> p1 (matches ending 4239, 737)
+ // offset 2 -> p2 (matches ending 19827, 10142)
+ // offset 4 -> p3 (matches ending 2701, 3804)
+ // offset 6 -> p4 (matches ending 16115, 16198)
+ // offset 8 -> p5 (match ending 16259)
+
+ mockClient.reset();
+
+ // Register specific offset mocks FIRST (most specific)
+ String[] fixtures = {"p2", "p3", "p4", "p5"};
+ int[] offsets = {2, 4, 6, 8};
+ for (int idx = 0; idx < offsets.length; idx++) {
+ mockClient.when(
+ request()
+ .withMethod("GET")
+ .withPath("/api/v1.0/search")
+ .withQueryStringParameter("q", "Plagegeist")
+ .withQueryStringParameter("count", "2")
+ .withQueryStringParameter("offset", String.valueOf(offsets[idx]))
+ )
+ .respond(
+ response()
+ .withHeaders(new Header("Content-Type", "application/json; charset=UTF-8"))
+ .withBody(getFixture("response_plagegeist_" + fixtures[idx] + ".json"))
+ .withStatusCode(200)
+ );
+ }
+
+ // Catch-all for the initial request (no offset param) -> page 0
+ mockClient.when(
+ request()
+ .withMethod("GET")
+ .withPath("/api/v1.0/search")
+ .withQueryStringParameter("q", "Plagegeist")
+ )
+ .respond(
+ response()
+ .withHeaders(new Header("Content-Type", "application/json; charset=UTF-8"))
+ .withBody(getFixture("response_plagegeist_p1.json"))
+ .withStatusCode(200)
+ );
+
+ // Export with seed=1
+ MultivaluedHashMap<String, String> frmap = new MultivaluedHashMap<>();
+ frmap.add("format", "json");
+ frmap.add("q", "Plagegeist");
+ frmap.add("ql", "poliqarp");
+ frmap.add("hitc", "30");
+ frmap.add("randomizePageOrder", "true");
+ frmap.add("seed", "1");
+ frmap.putSingle("fname", "shufflePage0Test1");
+
+ Response resp1 = target("/export").request().post(Entity.form(frmap));
+ assertEquals(Status.OK.getStatusCode(), resp1.getStatus());
+ String str1 = resp1.readEntity(String.class);
+ JsonNode obj1 = mapper.readTree(mapper.getFactory().createParser(str1));
+ assertEquals("All 9 matches with seed 1", 9, obj1.at("/matches").size());
+
+ // Collect match order for seed 1
+ List<String> order1 = new ArrayList<>();
+ for (JsonNode m : obj1.at("/matches")) {
+ order1.add(m.get("matchID").asText());
+ }
+
+ // Verify no duplicates with seed 1
+ assertEquals("No duplicate matches with seed 1",
+ order1.size(), new HashSet<>(order1).size());
+
+ // Export with seed=999
+ frmap.putSingle("seed", "999");
+ frmap.putSingle("fname", "shufflePage0Test999");
+ Response resp2 = target("/export").request().post(Entity.form(frmap));
+ assertEquals(Status.OK.getStatusCode(), resp2.getStatus());
+ String str2 = resp2.readEntity(String.class);
+ JsonNode obj2 = mapper.readTree(mapper.getFactory().createParser(str2));
+ assertEquals("All 9 matches with seed 999", 9, obj2.at("/matches").size());
+
+ // Collect match order for seed 999
+ List<String> order2 = new ArrayList<>();
+ for (JsonNode m : obj2.at("/matches")) {
+ order2.add(m.get("matchID").asText());
+ }
+
+ // Verify no duplicates with seed 999
+ assertEquals("No duplicate matches with seed 999",
+ order2.size(), new HashSet<>(order2).size());
+
+ // The two seeds must produce different orderings
+ assertNotEquals("Different seeds must produce different match order",
+ order1, order2);
+
+ // At least one of the two seeds must NOT start with page 0's first match,
+ // proving that page 0 is included in the shuffle
+ String page0FirstMatch = "match-WUD17/G59/34284-p4238-4239";
+ boolean seed1StartsWithPage0 = order1.get(0).equals(page0FirstMatch);
+ boolean seed2StartsWithPage0 = order2.get(0).equals(page0FirstMatch);
+ assertFalse(
+ "Page 0 should be shuffled - at least one seed should NOT start with page 0's first match",
+ seed1StartsWithPage0 && seed2StartsWithPage0
+ );
+ } finally {
+ // Restore original page_size
+ properties.setProperty("conf.page_size", origPageSize);
+ }
+ }
+
@Test
public void testExportWsRTFPagingWithTimeout () {
diff --git a/src/test/resources/fixtures/response_plagegeist_p1.json b/src/test/resources/fixtures/response_plagegeist_p1.json
new file mode 100644
index 0000000..c2218e2
--- /dev/null
+++ b/src/test/resources/fixtures/response_plagegeist_p1.json
@@ -0,0 +1 @@
+{"@context": "http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld", "meta": {"cutOff": false, "count": 2, "startIndex": 0, "timeout": 10000, "context": {"left": ["token", 40], "right": ["token", 40]}, "fields": ["ID", "UID", "textSigle", "corpusID", "author", "title", "subTitle", "textClass", "pubPlace", "pubDate", "availability", "layerInfos", "docSigle", "corpusSigle"], "version": "0.59.2", "benchmark": "0.330334022 s", "totalResults": 9, "serialQuery": "tokens:s:Plagegeist", "itemsPerPage": 5}, "query": {"@type": "koral:token", "wrap": {"@type": "koral:term", "match": "match:eq", "layer": "orth", "key": "Plagegeist", "foundry": "opennlp", "rewrites": [{"@type": "koral:rewrite", "src": "Kustvakt", "operation": "operation:injection", "scope": "foundry"}]}}, "collection": {"operands": [{"@type": "koral:doc", "match": "match:eq", "type": "type:regex", "value": "CC-BY.*", "key": "availability"}, {"@type": "koral:doc", "match": "match:eq", "value": "WUD17", "key": "corpusSigle"}], "@type": "koral:docGroup", "operation": "operation:and"}, "matches": [{"snippet": "<span class=\"context-left\"><span class=\"more\"></span>1 Tag gesperrt. 24h Urlaub.^^ LG;-- 17:40, 11. Jan. 2011 (CET) Danke ich habe die nahezu zeitgleichen VMs von Dir und Ironhoof gesehen. Ob es ein Grund zum Jubeln ist, sei dahin gestellt. Immerhin habe ich f\u00fcr 24 Stunden einen "</span><span class=\"match\"><mark>Plagegeist</mark></span><span class=\"context-right\">" weniger. Sag mal, zum K\u00f6lner Stammtisch isses doch nicht so weit ... wie w\u00e4r's ? Besten 17:49, 11. Jan. 2011 (CET) Er wurde gesperrt. Nach dem Theater hier zurecht. 
ABER: auch deine Beitr\u00e4ge hier, die er versuchte zu l\u00f6schen, sorgen nicht f\u00fcr<span class=\"more\"></span></span>", "matchID": "match-WUD17/G59/34284-p4238-4239", "UID": 0, "textClass": "staat-gesellschaft biographien-interviews", "textSigle": "WUD17/G59/34284", "author": "Umherirrender, u.a.", "docSigle": "WUD17/G59", "layerInfos": "corenlp/c=spans corenlp/p=tokens corenlp/s=spans dereko/s=spans malt/d=rels marmot/m=tokens marmot/p=tokens opennlp/p=tokens opennlp/s=spans tt/l=tokens tt/p=tokens", "pubPlace": "URL:http://de.wikipedia.org", "availability": "CC-BY-SA", "title": "Benutzer Diskussion:Gru\u00df Tom/Archiv/2011", "pubDate": "2017-07-01", "corpusSigle": "WUD17"}, {"snippet": "<span class=\"context-left\"><span class=\"more\"></span>Sitte, die heute nur mehr aus dem s\u00fcddeutschen Raum bekannt ist, war \u2013 nach Aussage vieler Sagen \u2013 auch im Rheinland \u00fcblich. e) Der St\u00fcpp als Aufhocker Von den 30 St\u00fcpp-Sagen enthalten nur 12 das Ph\u00e4nomen des Aufhockers. Ein Aufhocker ist ein </span><span class=\"match\"><mark>Plagegeist</mark></span><span class=\"context-right\">, der europaweit vorkommt. Er \u00fcberf\u00e4llt den Menschen immer auf dunklen Wegen (z.B. Hohlwegen) oder sehr oft nachts, wenn er an ber\u00fcchtigten, d.h. angsteinfl\u00f6\u00dfenden Orten vorbeikommt. Friedrich Ranke analysiert den Aufhocker als reales Erlebnis, entstanden aus der Angst. Das<span class=\"more\"></span></span>", "matchID": "match-WUD17/C53/60524-p736-737", "UID": 0, "textClass": "kultur literatur", "textSigle": "WUD17/C53/60524", "author": "Capitanezza, u.a.", "docSigle": "WUD17/C53", "layerInfos": "corenlp/c=spans corenlp/p=tokens corenlp/s=spans dereko/s=spans malt/d=rels marmot/m=tokens marmot/p=tokens opennlp/p=tokens opennlp/s=spans tt/l=tokens tt/p=tokens", "pubPlace": "URL:http://de.wikipedia.org", "availability": "CC-BY-SA", "title": "Benutzer Diskussion:Capitanezza", "pubDate": "2017-07-01", "corpusSigle": "WUD17"}]}
\ No newline at end of file
diff --git a/src/test/resources/fixtures/response_plagegeist_p2.json b/src/test/resources/fixtures/response_plagegeist_p2.json
new file mode 100644
index 0000000..a873510
--- /dev/null
+++ b/src/test/resources/fixtures/response_plagegeist_p2.json
@@ -0,0 +1 @@
+{"@context": "http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld", "meta": {"cutOff": false, "count": 2, "startIndex": 2, "timeout": 10000, "context": {"left": ["token", 40], "right": ["token", 40]}, "fields": ["ID", "UID", "textSigle", "corpusID", "author", "title", "subTitle", "textClass", "pubPlace", "pubDate", "availability", "layerInfos", "docSigle", "corpusSigle"], "version": "0.59.2", "benchmark": "0.330334022 s", "totalResults": 9, "serialQuery": "tokens:s:Plagegeist", "itemsPerPage": 5}, "query": {"@type": "koral:token", "wrap": {"@type": "koral:term", "match": "match:eq", "layer": "orth", "key": "Plagegeist", "foundry": "opennlp", "rewrites": [{"@type": "koral:rewrite", "src": "Kustvakt", "operation": "operation:injection", "scope": "foundry"}]}}, "collection": {"operands": [{"@type": "koral:doc", "match": "match:eq", "type": "type:regex", "value": "CC-BY.*", "key": "availability"}, {"@type": "koral:doc", "match": "match:eq", "value": "WUD17", "key": "corpusSigle"}], "@type": "koral:docGroup", "operation": "operation:and"}, "matches": [{"snippet": "<span class=\"context-left\"><span class=\"more\"></span>deshalb so glatt durch, weil ich einige Tage verreist war. Ich habe Wiederherstellung beantragt. Vielen Dank f\u00fcr Deine geduldigen Bem\u00fchungen und mfG 13:49, 28. Jun. 2008 (CEST) Zur Kenntnis: Analemma zieht "durch die Welt der WP-Benutzer" und meint, mich als </span><span class=\"match\"><mark>Plagegeist</mark></span><span class=\"context-right\"> oder anders bei anderen Nutzern titulieren zu m\u00fcssen... als weiteres Beispiel f\u00fcr andere "nette Aktionen" die letzte aktuelle Aktion: Nachtr\u00e4gliche Ver\u00e4nderung von meinem eigenen Diskussionsbeitrag...na danke auch ;-) Gru\u00df-- 18:21, 28. Jun. 
2008 (CEST) @NebMaatRe: Ich hab das "Umherziehen" nat\u00fcrlich<span class=\"more\"></span></span>", "matchID": "match-WUD17/J34/49397-p19826-19827", "UID": 0, "textClass": "staat-gesellschaft biographien-interviews", "textSigle": "WUD17/J34/49397", "author": "BlueC\u00fcc\u00fc, u.a.", "docSigle": "WUD17/J34", "layerInfos": "corenlp/c=spans corenlp/p=tokens corenlp/s=spans dereko/s=spans malt/d=rels marmot/m=tokens marmot/p=tokens opennlp/p=tokens opennlp/s=spans tt/l=tokens tt/p=tokens", "pubPlace": "URL:http://de.wikipedia.org", "availability": "CC-BY-SA", "title": "Benutzer Diskussion:Jesi/Archiv 2008/II", "pubDate": "2017-07-01", "corpusSigle": "WUD17"}, {"snippet": "<span class=\"context-left\"><span class=\"more\"></span>Vielen Dank f\u00fcr deine Aufmerksamkeit und M\u00fche, mit der du gestern den Holocaustleugner abgeschmettert hast. Ab sofort ist "Lump" in meinem Sprachgebrauch eine Ehrenbezeichnung geworden... Lieben Gru\u00df - 19:44, 20. Feb. 2008 (CET)timestamp Keine Ursache. Eine andere Ehrenbezeichnung war mal "</span><span class=\"match\"><mark>Plagegeist</mark></span><span class=\"context-right\">". Vielleich ist das genau die richtige Art, mit so was umzugehen. Liebe Gr\u00fc\u00dfe 21:30, 20. Feb. 2008 (CET)timestamp Michel Lippert Hoi Hozro, habe heute anhand meiner wenigen Infos und des franz. Wikiartikels die Type bearbeitet. Erg\u00e4nzt habe<span class=\"more\"></span></span>", "matchID": "match-WUD17/H29/31485-p10141-10142", "UID": 0, "textClass": "staat-gesellschaft biographien-interviews", "textSigle": "WUD17/H29/31485", "author": "L\u00f3melinde, u.a.", "docSigle": "WUD17/H29", "layerInfos": "corenlp/c=spans corenlp/p=tokens corenlp/s=spans dereko/s=spans malt/d=rels marmot/m=tokens marmot/p=tokens opennlp/p=tokens opennlp/s=spans tt/l=tokens tt/p=tokens", "pubPlace": "URL:http://de.wikipedia.org", "availability": "CC-BY-SA", "title": "Benutzer Diskussion:Hozro/Archiv0", "pubDate": "2017-07-01", "corpusSigle": "WUD17"}]}
\ No newline at end of file
diff --git a/src/test/resources/fixtures/response_plagegeist_p3.json b/src/test/resources/fixtures/response_plagegeist_p3.json
new file mode 100644
index 0000000..d3a748f
--- /dev/null
+++ b/src/test/resources/fixtures/response_plagegeist_p3.json
@@ -0,0 +1 @@
+{"@context": "http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld", "meta": {"cutOff": false, "count": 2, "startIndex": 4, "timeout": 10000, "context": {"left": ["token", 40], "right": ["token", 40]}, "fields": ["ID", "UID", "textSigle", "corpusID", "author", "title", "subTitle", "textClass", "pubPlace", "pubDate", "availability", "layerInfos", "docSigle", "corpusSigle"], "version": "0.59.2", "benchmark": "0.330334022 s", "totalResults": 9, "serialQuery": "tokens:s:Plagegeist", "itemsPerPage": 5}, "query": {"@type": "koral:token", "wrap": {"@type": "koral:term", "match": "match:eq", "layer": "orth", "key": "Plagegeist", "foundry": "opennlp", "rewrites": [{"@type": "koral:rewrite", "src": "Kustvakt", "operation": "operation:injection", "scope": "foundry"}]}}, "collection": {"operands": [{"@type": "koral:doc", "match": "match:eq", "type": "type:regex", "value": "CC-BY.*", "key": "availability"}, {"@type": "koral:doc", "match": "match:eq", "value": "WUD17", "key": "corpusSigle"}], "@type": "koral:docGroup", "operation": "operation:and"}, "matches": [{"snippet": "<span class=\"context-left\"><span class=\"more\"></span>Unterst\u00fctzung, habe gesehen, da\u00df LaPalma nun gesperrt ist. Inzwischen habe ich auch den haupts\u00e4chlich von Dir bearbeiteten Artikel \u00fcber Velikovsky gelesen. Sehr interessant. Den kannte ich noch gar nicht! Herzlicher Gru\u00df, -- 15:20, 28. Nov. 2007 (CET) Naja, mir ging der "</span><span class=\"match\"><mark>Plagegeist</mark></span><span class=\"context-right\">" auch geh\u00f6rig auf die Nerven. Der Artikel Velikovsky stammt nicht von mir, hatte ihn nur von \u00e4hnlichen Visionen eines anderen "Plagegeists" befreit. Ist zwar stilistisch immer noch \u00fcberarbeitungsbed\u00fcrftig, aber die "Science-Fiction" ist erstmal raus. Gru\u00df-- 19:33, 28. Nov. 
2007 (CET<span class=\"more\"></span></span>", "matchID": "match-WUD17/I78/98882-p2700-2701", "UID": 0, "textClass": "staat-gesellschaft biographien-interviews", "textSigle": "WUD17/I78/98882", "author": "Ingochina, u.a.", "docSigle": "WUD17/I78", "layerInfos": "corenlp/c=spans corenlp/p=tokens corenlp/s=spans dereko/s=spans malt/d=rels marmot/m=tokens marmot/p=tokens opennlp/p=tokens opennlp/s=spans tt/l=tokens tt/p=tokens", "pubPlace": "URL:http://de.wikipedia.org", "availability": "CC-BY-SA", "title": "Benutzer Diskussion:Ingochina/Archiv", "pubDate": "2017-07-01", "corpusSigle": "WUD17"}, {"snippet": "<span class=\"context-left\"><span class=\"more\"></span>und wie ich in einem Buch von Bernhard Karlgren gelesen habe, wird da eine alte Bedeutung "Blutegel" f\u00fcr dieses Zeichen angenommen, bzw. auch andere Ungeziefer konnten wohl gemeint sein. Der ma-Teil des Worts wurde also urspr\u00fcnglich wahrscheinlich im Sinne von "</span><span class=\"match\"><mark>Plagegeist</mark></span><span class=\"context-right\">" verwendet, folglich war \u8682\u8681 urspr\u00fcnglich frei \u00fcbersetzt eine "Sch...-Ameise" ;-) -- 18:21, 30. Apr. 2007 (CEST) Hallo Allgaeuer, mag sein, dass es f\u00fcr dich ein Hammer ist, aber es ist Stand der aktuellen Forschung in der Sinologie. Schriften von Karlgren u.<span class=\"more\"></span></span>", "matchID": "match-WUD17/N15/17364-p3803-3804", "UID": 0, "textClass": "staat-gesellschaft biographien-interviews", "textSigle": "WUD17/N15/17364", "author": "Ningling, u.a.", "docSigle": "WUD17/N15", "layerInfos": "corenlp/c=spans corenlp/p=tokens corenlp/s=spans dereko/s=spans malt/d=rels opennlp/p=tokens opennlp/s=spans tt/l=tokens tt/p=tokens", "pubPlace": "URL:http://de.wikipedia.org", "availability": "CC-BY-SA", "title": "Benutzer Diskussion:Ningling/Archiv2006-07", "pubDate": "2017-07-01", "corpusSigle": "WUD17"}]}
\ No newline at end of file
diff --git a/src/test/resources/fixtures/response_plagegeist_p4.json b/src/test/resources/fixtures/response_plagegeist_p4.json
new file mode 100644
index 0000000..1955cf3
--- /dev/null
+++ b/src/test/resources/fixtures/response_plagegeist_p4.json
@@ -0,0 +1 @@
+{"@context": "http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld", "meta": {"cutOff": false, "count": 2, "startIndex": 6, "timeout": 10000, "context": {"left": ["token", 40], "right": ["token", 40]}, "fields": ["ID", "UID", "textSigle", "corpusID", "author", "title", "subTitle", "textClass", "pubPlace", "pubDate", "availability", "layerInfos", "docSigle", "corpusSigle"], "version": "0.59.2", "benchmark": "0.330334022 s", "totalResults": 9, "serialQuery": "tokens:s:Plagegeist", "itemsPerPage": 5}, "query": {"@type": "koral:token", "wrap": {"@type": "koral:term", "match": "match:eq", "layer": "orth", "key": "Plagegeist", "foundry": "opennlp", "rewrites": [{"@type": "koral:rewrite", "src": "Kustvakt", "operation": "operation:injection", "scope": "foundry"}]}}, "collection": {"operands": [{"@type": "koral:doc", "match": "match:eq", "type": "type:regex", "value": "CC-BY.*", "key": "availability"}, {"@type": "koral:doc", "match": "match:eq", "value": "WUD17", "key": "corpusSigle"}], "@type": "koral:docGroup", "operation": "operation:and"}, "matches": [{"snippet": "<span class=\"context-left\"><span class=\"more\"></span>vielleicht eine neue Schloss-Einstein-Antragswelle unterbinden.-- 07:36, 23. Jun. 2008 (CEST) Mentor Lieber Kriddl, als ich mir die Liste der Mentoren anschaute, fiel mein Augenmerk auf Dich als Jurist. K\u00f6nntest Du mir jungen Wikipedianer (aber nicht jung an Jahren) helfen, einen </span><span class=\"match\"><mark>Plagegeist</mark></span><span class=\"context-right\">, der mich seit meiner ersten Teilnahme als IP mobbt, helfen? Wenn ja, so schau Dir doch als Einstieg bitte meinen Wiederherstellungs-Antrag zum Artikel Meton-Periode an: WP:LP, 26.Juni 08. 
Dort ist nicht nur der Sachverhalt, in den man sich nicht<span class=\"more\"></span></span>", "matchID": "match-WUD17/K35/39955-p16114-16115", "UID": 0, "textClass": "staat-gesellschaft biographien-interviews", "textSigle": "WUD17/K35/39955", "author": "TaxonBot, u.a.", "docSigle": "WUD17/K35", "layerInfos": "corenlp/c=spans corenlp/p=tokens corenlp/s=spans dereko/s=spans malt/d=rels opennlp/p=tokens opennlp/s=spans tt/l=tokens tt/p=tokens", "pubPlace": "URL:http://de.wikipedia.org", "availability": "CC-BY-SA", "title": "Benutzer Diskussion:Kriddl/Archiv", "pubDate": "2017-07-01", "corpusSigle": "WUD17"}, {"snippet": "<span class=\"context-left\"><span class=\"more\"></span>schnell einarbeiten kann, sondern auch etwas \u00fcber die zu schlichtenden Hintergr\u00fcnde von mir zu lesen. Solltest Du Dich zus\u00e4tzlich als Bearbeiter von L\u00f6schfragen angesprochen f\u00fchlen, w\u00e4re ich Dir doppelt dankbar. mfG 15:52, 28. Jun. 2008 (CEST) Hallo, ich bin der "</span><span class=\"match\"><mark>Plagegeist</mark></span><span class=\"context-right\">", der Analemma nicht "mobbt", sondern auf seine fachlichen und technischen Schwierigkeiten hingewiesen hatte. Mobbing ? Analemma bat mich sogar um einen Quellennachweis f\u00fcr die Lunationsdauer, die "er nirgendwo finden konnte"....hatte ich sodann erledigt. Ein Fachmann hat keine Schwierigkeiten, das zu<span class=\"more\"></span></span>", "matchID": "match-WUD17/K35/39955-p16197-16198", "UID": 0, "textClass": "staat-gesellschaft biographien-interviews", "textSigle": "WUD17/K35/39955", "author": "TaxonBot, u.a.", "docSigle": "WUD17/K35", "layerInfos": "corenlp/c=spans corenlp/p=tokens corenlp/s=spans dereko/s=spans malt/d=rels opennlp/p=tokens opennlp/s=spans tt/l=tokens tt/p=tokens", "pubPlace": "URL:http://de.wikipedia.org", "availability": "CC-BY-SA", "title": "Benutzer Diskussion:Kriddl/Archiv", "pubDate": "2017-07-01", "corpusSigle": "WUD17"}]}
\ No newline at end of file
diff --git a/src/test/resources/fixtures/response_plagegeist_p5.json b/src/test/resources/fixtures/response_plagegeist_p5.json
new file mode 100644
index 0000000..628d35f
--- /dev/null
+++ b/src/test/resources/fixtures/response_plagegeist_p5.json
@@ -0,0 +1 @@
+{"@context": "http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld", "meta": {"cutOff": false, "count": 2, "startIndex": 8, "timeout": 10000, "context": {"left": ["token", 40], "right": ["token", 40]}, "fields": ["ID", "UID", "textSigle", "corpusID", "author", "title", "subTitle", "textClass", "pubPlace", "pubDate", "availability", "layerInfos", "docSigle", "corpusSigle"], "version": "0.59.2", "benchmark": "0.330334022 s", "totalResults": 9, "serialQuery": "tokens:s:Plagegeist", "itemsPerPage": 5}, "query": {"@type": "koral:token", "wrap": {"@type": "koral:term", "match": "match:eq", "layer": "orth", "key": "Plagegeist", "foundry": "opennlp", "rewrites": [{"@type": "koral:rewrite", "src": "Kustvakt", "operation": "operation:injection", "scope": "foundry"}]}}, "collection": {"operands": [{"@type": "koral:doc", "match": "match:eq", "type": "type:regex", "value": "CC-BY.*", "key": "availability"}, {"@type": "koral:doc", "match": "match:eq", "value": "WUD17", "key": "corpusSigle"}], "@type": "koral:docGroup", "operation": "operation:and"}, "matches": [{"snippet": "<span class=\"context-left\"><span class=\"more\"></span>Quellennachweis f\u00fcr die Lunationsdauer, die "er nirgendwo finden konnte"....hatte ich sodann erledigt. Ein Fachmann hat keine Schwierigkeiten, das zu finden. Nunja, und f\u00fcr die Nachfragen nach Quellen (die er bis heute nicht liefern konnte), bekomme ich dann den Stempel "</span><span class=\"match\"><mark>Plagegeist</mark></span><span class=\"context-right\">, Mobber" etc. aufgedr\u00fcckt...womit er auch bei vielen Benutzern "hausieren" geht. Ich frage mich, wer hier mobbt ? ... als Beispiel die letzte aktuelle Aktion: Nachtr\u00e4gliche Ver\u00e4nderung von meinem eigenen Diskussionsbeitrag...na danke auch ;-)... Es gr\u00fc\u00dft -- 18:10, 28. Jun. 
2008 (CEST) Hm<span class=\"more\"></span></span>", "matchID": "match-WUD17/K35/39955-p16258-16259", "UID": 0, "textClass": "staat-gesellschaft biographien-interviews", "textSigle": "WUD17/K35/39955", "author": "TaxonBot, u.a.", "docSigle": "WUD17/K35", "layerInfos": "corenlp/c=spans corenlp/p=tokens corenlp/s=spans dereko/s=spans malt/d=rels opennlp/p=tokens opennlp/s=spans tt/l=tokens tt/p=tokens", "pubPlace": "URL:http://de.wikipedia.org", "availability": "CC-BY-SA", "title": "Benutzer Diskussion:Kriddl/Archiv", "pubDate": "2017-07-01", "corpusSigle": "WUD17"}]}
\ No newline at end of file