Improve paging mechanism to rely on offsets only
Also introduce hitc support to limit exported hits and
move the maximum export limit (MAX_EXP_LIMIT) to the configuration file as conf.max_exp_limit
Change-Id: Iccf9cc5f3497232dd76a917a059f67c9926b4942
diff --git a/plugin/src/main/java/de/ids_mannheim/korap/plkexport/ExWSConf.java b/plugin/src/main/java/de/ids_mannheim/korap/plkexport/ExWSConf.java
index 11413ae..72e9468 100644
--- a/plugin/src/main/java/de/ids_mannheim/korap/plkexport/ExWSConf.java
+++ b/plugin/src/main/java/de/ids_mannheim/korap/plkexport/ExWSConf.java
@@ -13,14 +13,7 @@
import java.util.Properties;
public class ExWSConf {
- /*
- * maximum hits to be exported
- * TODO: Define this constants after discussing it.
- * Maybe we need a distinction between users at the IDS and external users
- * See also: https://www.ids-mannheim.de/cosmas2/script-app/hilfe/sitzung.html
- */
- public static final int MAX_EXP_LIMIT = 10000;
-
+
// Version of Export Plugin
public static final int VERSION_MAJOR = 0;
public static final int VERSION_MINOR = 1;
diff --git a/plugin/src/main/java/de/ids_mannheim/korap/plkexport/Exporter.java b/plugin/src/main/java/de/ids_mannheim/korap/plkexport/Exporter.java
index b00ea47..a54460f 100644
--- a/plugin/src/main/java/de/ids_mannheim/korap/plkexport/Exporter.java
+++ b/plugin/src/main/java/de/ids_mannheim/korap/plkexport/Exporter.java
@@ -8,14 +8,14 @@
interface Exporter {
// Implemented by MatchAggregator
- public void init (String s) throws IOException;
+ public boolean init (String s) throws IOException;
public void setMeta(JsonNode n);
public void setQuery(JsonNode n);
public void setCollection(JsonNode n);
public JsonNode getMeta();
public JsonNode getQuery();
public JsonNode getCollection();
- public void appendMatches (String s) throws IOException;
+ public boolean appendMatches (String s) throws IOException;
public String getFileName ();
public void setFileName (String s);
public String getQueryString ();
@@ -24,6 +24,7 @@
public void setCorpusQueryString (String s);
public int getTotalResults ();
public boolean hasTimeExceeded ();
+ public void setMaxResults (int m);
// Implemented by Exporter
public ResponseBuilder serve();
diff --git a/plugin/src/main/java/de/ids_mannheim/korap/plkexport/IdsExportService.java b/plugin/src/main/java/de/ids_mannheim/korap/plkexport/IdsExportService.java
index e70790e..c36bb7c 100644
--- a/plugin/src/main/java/de/ids_mannheim/korap/plkexport/IdsExportService.java
+++ b/plugin/src/main/java/de/ids_mannheim/korap/plkexport/IdsExportService.java
@@ -48,15 +48,12 @@
/**
* TODO:
* - Delete the temp file of the export at the end
- * - Right now, the web service returns one page (cutoff=1) or
- * all pages.
* - Do not expect all meta data per match.
- * - Handle timeout results (with minimum total results).
- * - Use offset instead of page parameter
* - Add progress mechanism.
* - Add CSV export format.
* - Add table layout to RTF information.
* - Add loading marker.
+ * - Add hitc to form.
*/
@Path("/")
@@ -116,9 +113,9 @@
@FormParam("q") String q,
@FormParam("cq") String cq,
@FormParam("ql") String ql,
- @FormParam("cutoff") String cutoffStr
+ @FormParam("cutoff") String cutoffStr,
// @FormParam("islimit") String il,
- // @FormParam("hitc") int hitc
+ @FormParam("hitc") int hitc
) throws IOException {
// These parameters are required
@@ -138,24 +135,32 @@
+ params[i][0] + "\"" + " is missing or empty")
.build());
};
-
- int totalResults = -1;
// Retrieve cutoff value
boolean cutoff = false;
- if (cutoffStr != null && (cutoffStr.equals("true") || cutoffStr.equals("1"))) {
+ if (cutoffStr != null && (
+ cutoffStr.equals("true") ||
+ cutoffStr.equals("1"))
+ ) {
cutoff = true;
};
ResponseBuilder builder = null;
Client client = ClientBuilder.newClient();
- String scheme = properties.getProperty("api.scheme", "https");
- String port = properties.getProperty("api.port", "8089");
- String host = properties.getProperty("api.host", "localhost");
- String path = properties.getProperty("api.path", "");
- int pageSize = Integer.parseInt(properties.getProperty("conf.page_size", "5"));
+ // Load configuration values
+ String scheme = properties.getProperty("api.scheme", "https");
+ String port = properties.getProperty("api.port", "8089");
+ String host = properties.getProperty("api.host", "localhost");
+ String path = properties.getProperty("api.path", "");
+ int pageSize = Integer.parseInt(properties.getProperty("conf.page_size", "5"));
+ int maxResults = Integer.parseInt(properties.getProperty("conf.max_exp_limit", "10000"));
+ // Adjust the number of requested hits
+ if (hitc > 0 && hitc < maxResults) {
+ maxResults = hitc;
+ };
+
// Create initial search uri
UriBuilder uri = UriBuilder.fromPath("/api/v1.0/search")
.host(host)
@@ -169,12 +174,11 @@
if (cq != null)
uri = uri.queryParam("cq", cq);
-
if (path != "") {
uri = uri.path(path);
};
-
+
uri = uri.queryParam("count", pageSize);
// Get client IP, in case service is behind a proxy
@@ -183,9 +187,8 @@
String auth = "";
if (req != null) {
xff = getClientIP(req.getHeader("X-Forwarded-For"));
- if (xff == "") {
+ if (xff == "")
xff = req.getRemoteAddr();
- };
auth = authFromCookie(req);
};
@@ -208,16 +211,15 @@
Exporter exp;
// Choose the correct exporter
- if (format.equals("json")) {
+ if (format.equals("json"))
exp = new JsonExporter();
- }
- else {
+ else
exp = new RtfExporter();
- };
+ exp.setMaxResults(maxResults);
exp.setQueryString(q);
exp.setCorpusQueryString(cq);
-
+
// set filename based on query (if not already set)
if (fname != null) {
exp.setFileName(fname);
@@ -226,6 +228,7 @@
// Initialize exporter (with meta data and first matches)
try {
exp.init(resp);
+
} catch (Exception e) {
throw new WebApplicationException(
@@ -234,47 +237,48 @@
e.getMessage()
)
);
- }
+ };
+
+ /*
+ * Calculate how many results to fetch
+ */
+ int fetchCount = exp.getTotalResults();
+ if (exp.hasTimeExceeded() || fetchCount > maxResults) {
+ fetchCount = maxResults;
+ };
+
+ // The first page was already enough
+ if (fetchCount <= pageSize) {
+ cutoff = true;
+ };
+
// If only one page should be exported there is no need
// for a temporary export file
if (cutoff) {
builder = exp.serve();
}
- // Page through results
+ // Page through all results
else {
- /*
- * Get total results
- */
- totalResults = exp.getTotalResults();
+ // It's not important anymore to get totalResults
+ uri.queryParam("cutoff", "true");
- /*
- * Get number of pages and the number of hits
- * which should be exported at the last page
- */
- int pg = 1;
- if (totalResults % pageSize > 0) {
- pg = totalResults / pageSize + 1;
- }
- else {
- pg = totalResults / pageSize;
- }
-
+ // Set offset for paging as a template
uri.queryParam("offset", "{offset}");
try {
// Iterate over all results
- for (int i = 2; i <= pg; i++) {
- resource = client.target(
- uri.build((i * pageSize) - pageSize)
- );
-
+ for (int i = pageSize; i <= fetchCount; i+=pageSize) {
+ resource = client.target(uri.build(i));
reqBuilder = resource.request(MediaType.APPLICATION_JSON);
resp = authBuilder(reqBuilder, xff, auth).get(String.class);
- exp.appendMatches(resp);
+
+ // Stop when no more matches are allowed
+ if (!exp.appendMatches(resp))
+ break;
}
} catch (Exception e) {
throw new WebApplicationException(
diff --git a/plugin/src/main/java/de/ids_mannheim/korap/plkexport/MatchAggregator.java b/plugin/src/main/java/de/ids_mannheim/korap/plkexport/MatchAggregator.java
index c482526..26a96c0 100644
--- a/plugin/src/main/java/de/ids_mannheim/korap/plkexport/MatchAggregator.java
+++ b/plugin/src/main/java/de/ids_mannheim/korap/plkexport/MatchAggregator.java
@@ -44,6 +44,8 @@
private String fname, queryString, corpusQueryString;
private boolean timeExceeded = false;
private int totalResults = -1;
+ private int maxResults = -1;
+ private int fetchedResults = 0;
public String getMimeType() {
return "text/plain";
@@ -56,7 +58,7 @@
public int getTotalResults() {
return this.totalResults;
};
-
+
public boolean hasTimeExceeded() {
return this.timeExceeded;
};
@@ -102,6 +104,15 @@
this.query = query;
};
+ // Maximum number of matches to export (values <= 0 mean no limit); needs to be set before the first addMatch
+ public void setMaxResults (int maxResults) {
+ this.maxResults = maxResults;
+ };
+
+ public int getMaxResults () { // export limit as set via setMaxResults; the field starts at -1
+ return this.maxResults;
+ };
+
public JsonNode getQuery () {
return this.query;
};
@@ -123,20 +134,20 @@
* Create new match aggregator and parse initial Json
* file to get header information and initial matches.
*/
- public void init (String resp) throws IOException, JsonParseException {
+ public boolean init (String resp) throws IOException, JsonParseException {
this.file = null;
matches = new LinkedList();
if (resp == null)
- return;
-
+ return false;
+
JsonParser parser = mapper.getFactory().createParser(resp);
JsonNode actualObj = mapper.readTree(parser);
-
+
if (actualObj == null)
- return;
+ return false;
JsonNode meta = actualObj.get("meta");
this.setMeta(meta);
@@ -155,24 +166,17 @@
writer = new StringWriter();
this.writeHeader(writer);
-
- JsonNode mNodes = actualObj.get("matches");
- if (mNodes == null)
- return;
-
- // Iterate over the results of the current file
- Iterator<JsonNode> mNode = mNodes.elements();
- while (mNode.hasNext()) {
- this.addMatch(mNode.next(), writer);
- };
+ return this.iterateThroughMatches(
+ actualObj.get("matches")
+ );
};
/**
* Append more matches to the result set.
*/
- public void appendMatches (String resp) throws IOException {
+ public boolean appendMatches (String resp) throws IOException {
// Open a temp file if not already opened
if (this.file == null) {
@@ -196,19 +200,11 @@
JsonNode actualObj = mapper.readTree(parser);
if (actualObj == null)
- return;
+ return false;
- JsonNode mNodes = actualObj.get("matches");
-
- if (mNodes == null)
- return;
-
- Iterator<JsonNode> mNode = mNodes.elements();
-
- MatchExport match;
- while (mNode.hasNext()) {
- this.addMatch(mNode.next(), writer);
- };
+ return this.iterateThroughMatches(
+ actualObj.get("matches")
+ );
};
@@ -249,4 +245,23 @@
// Return exporter error
return Response.status(500).entity("error");
};
+
+
+ // Add all matches of one result page; returns false when paging should stop (no matches or limit reached)
+ private boolean iterateThroughMatches (JsonNode mNodes) throws IOException {
+ if (mNodes == null)
+ return false;
+
+ // Iterate over the results of the current file
+ Iterator<JsonNode> mNode = mNodes.elements();
+ while (mNode.hasNext()) {
+ this.addMatch(mNode.next(), writer);
+ this.fetchedResults++;
+ // Stop once exactly maxResults matches were added (maxResults <= 0 means no limit)
+ if (this.maxResults > 0 && this.fetchedResults >= this.maxResults) {
+ return false;
+ };
+ };
+ return true;
+ };
};
diff --git a/plugin/src/main/java/de/ids_mannheim/korap/plkexport/RtfExporter.java b/plugin/src/main/java/de/ids_mannheim/korap/plkexport/RtfExporter.java
index fc74d2e..b1f085f 100644
--- a/plugin/src/main/java/de/ids_mannheim/korap/plkexport/RtfExporter.java
+++ b/plugin/src/main/java/de/ids_mannheim/korap/plkexport/RtfExporter.java
@@ -103,6 +103,13 @@
w.append("\\par}\n");
};
+ if (this.getTotalResults() == -1 ||
+ this.getTotalResults() > this.getMaxResults()) {
+ w.append("{\\pard Fetched: \\f1 ");
+ w.append(Integer.toString(this.getMaxResults()));
+ w.append("\\par}\n");
+ };
+
// Add line
w.append("{\\pard\\brdrb\\brdrs\\brdrw2\\brsp20\\par}\n");
};
diff --git a/plugin/src/main/resources/exportPlugin.conf b/plugin/src/main/resources/exportPlugin.conf
index dfd029d..0d3374a 100644
--- a/plugin/src/main/resources/exportPlugin.conf
+++ b/plugin/src/main/resources/exportPlugin.conf
@@ -13,4 +13,10 @@
asset.scheme=https
# Default configuration
-conf.page_size=5
\ No newline at end of file
+conf.page_size=5
+
+# Maximum hits to be exported
+# TODO: Define this constant after discussing it.
+# Maybe we need a distinction between users at the IDS and external users
+# See also: https://www.ids-mannheim.de/cosmas2/script-app/hilfe/sitzung.html
+conf.max_exp_limit=10000
\ No newline at end of file
diff --git a/plugin/src/test/java/de/ids_mannheim/korap/plkexport/IdsExportServiceTest.java b/plugin/src/test/java/de/ids_mannheim/korap/plkexport/IdsExportServiceTest.java
index d356786..d0fe1dd 100644
--- a/plugin/src/test/java/de/ids_mannheim/korap/plkexport/IdsExportServiceTest.java
+++ b/plugin/src/test/java/de/ids_mannheim/korap/plkexport/IdsExportServiceTest.java
@@ -110,7 +110,7 @@
frmap.add("fname", filenamej);
frmap.add("format", "json");
frmap.add("q", "Wasser");
- frmap.add("cutoff", "true");
+ frmap.add("cutoff", "1");
frmap.add("ql", "poliqarp");
String message;
@@ -483,6 +483,7 @@
frmap.add("format", "rtf");
frmap.add("q", "Plagegeist");
frmap.add("ql", "poliqarp");
+ frmap.add("hitc", "30");
String filenamer = "dateiPagingRtf";
frmap.putSingle("fname", filenamer);
@@ -491,11 +492,27 @@
assertEquals("Request RTF: Http Response should be 200: ",
Status.OK.getStatusCode(), responsertf.getStatus());
+ // With maxResults
String str = responsertf.readEntity(String.class);
assertTrue("Page 1 content", str.contains("Ironhoof"));
assertTrue("Page 2 content", str.contains("Sinologie"));
assertTrue("Unicode handling", str.contains("Hintergr\\u252\\'fcnde"));
assertTrue("TotalResults", str.contains("Count: \\f1 9\\"));
+ assertFalse("Fetched", str.contains("Fetched:"));
+
+ frmap.putSingle("hitc", "7");
+
+ responsertf = target("/export").request()
+ .post(Entity.form(frmap));
+ assertEquals("Request RTF: Http Response should be 200: ",
+ Status.OK.getStatusCode(), responsertf.getStatus());
+
+ str = responsertf.readEntity(String.class);
+ assertTrue("Page 1 content", str.contains("Ironhoof"));
+ assertTrue("Page 2 content", str.contains("Sinologie"));
+ assertTrue("Unicode handling", str.contains("Hintergr\\u252\\'fcnde"));
+ assertTrue("TotalResults", str.contains("Count: \\f1 9\\"));
+ assertTrue("Fetched", str.contains("Fetched: \\f1 7\\"));
}
@@ -534,6 +551,7 @@
frmap.add("format", "rtf");
frmap.add("q", "Plagegeist");
frmap.add("ql", "poliqarp");
+ frmap.add("hitc", "30");
String filenamer = "dateiPagingRtf";
frmap.putSingle("fname", filenamer);
@@ -587,6 +605,7 @@
frmap.add("format", "json");
frmap.add("q", "Plagegeist");
frmap.add("ql", "poliqarp");
+ frmap.add("hitc", "30");
String filenamer = "dateiPagingJson";
frmap.putSingle("fname", filenamer);
@@ -607,6 +626,65 @@
assertTrue(obj.at("/matches/0/snippet").asText().contains("<span class=\"context-right\">""));
assertTrue(obj.at("/matches/0/snippet").asText().contains("wie wär's"));
}
+
+ @Test
+ public void testExportWsJsonWithMaxHitcFirstPage () throws IOException {
+
+ // Serve a broken fixture for the second page (offset=5) to check that page 2 is not loaded
+ mockClient.reset().when(
+ request()
+ .withMethod("GET")
+ .withPath("/api/v1.0/search")
+ .withQueryStringParameter("q", "Plagegeist")
+ .withQueryStringParameter("count", "5")
+ .withQueryStringParameter("offset", "5")
+ )
+ .respond(
+ response()
+ .withHeader("Content-Type: application/json; charset=utf-8")
+ .withBody(getFixture("response_broken.json"))
+ .withStatusCode(200)
+ );
+
+ mockClient.when(
+ request()
+ .withMethod("GET")
+ .withPath("/api/v1.0/search")
+ .withQueryStringParameter("q", "Plagegeist")
+ )
+ .respond(
+ response()
+ .withHeader("Content-Type: application/json; charset=utf-8")
+ .withBody(getFixture("response_plagegeist_1.json"))
+ .withStatusCode(200)
+ );
+
+ MultivaluedHashMap<String, String> frmap = new MultivaluedHashMap<String, String>();
+ frmap.add("format", "json");
+ frmap.add("q", "Plagegeist");
+ frmap.add("ql", "poliqarp");
+ frmap.add("hitc", "3"); // fewer hits than the mocked page size (count=5)
+ String filenamer = "dateiPagingJson";
+ frmap.putSingle("fname", filenamer);
+
+ Response responsejson = target("/export").request()
+ .post(Entity.form(frmap));
+ assertEquals("Request RTF: Http Response should be 200: ",
+ Status.OK.getStatusCode(), responsejson.getStatus());
+
+ String str = responsejson.readEntity(String.class);
+ JsonParser parser = mapper.getFactory().createParser(str);
+ JsonNode obj = mapper.readTree(parser);
+
+ assertEquals(obj.at("/query/@type").asText(),"koral:token");
+ assertEquals(obj.at("/meta/totalResults").asInt(),9);
+ assertEquals(obj.at("/matches/0/matchID").asText(),"match-WUD17/G59/34284-p4238-4239");
+ assertEquals(obj.at("/matches/1/matchID").asText(),"match-WUD17/C53/60524-p736-737");
+ assertEquals(obj.at("/matches/2/matchID").asText(),"match-WUD17/J34/49397-p19826-19827");
+ assertFalse(obj.has("/matches/3")); // NOTE(review): has() checks a field name, not a JSON pointer, so this is always false - obj.at("/matches/3").isMissingNode() would actually test the limit
+ assertTrue(obj.at("/matches/0/snippet").asText().contains("<span class=\"context-right\">""));
+ assertTrue(obj.at("/matches/0/snippet").asText().contains("wie wär's"));
+ }
@Test