blob: c36bb7cc404b5db202621fd60e9a374807dd6fcf [file] [log] [blame]
hebastae2bd6e42020-03-03 19:07:20 +01001package de.ids_mannheim.korap.plkexport;
2
hebastafed41942020-11-02 10:00:36 +01003import java.io.BufferedWriter;
4import java.io.File;
5import java.io.FileWriter;
hebastae0496762020-03-30 00:11:01 +02006import java.io.IOException;
Akronbb076c72020-09-28 14:39:15 +02007import java.io.StringWriter;
Akronb87af2f2020-09-24 14:38:02 +02008import java.lang.Thread;
9import java.io.InputStream;
hebastae0496762020-03-30 00:11:01 +020010import java.net.URLEncoder;
Akronbb076c72020-09-28 14:39:15 +020011import java.util.HashMap;
hebastae0496762020-03-30 00:11:01 +020012import java.util.ArrayList;
13import java.util.Collection;
14import java.util.Iterator;
15import java.util.LinkedList;
Akronc931cd02020-09-15 10:54:17 +020016import java.util.Properties;
Akron669d0bb2020-10-02 13:12:01 +020017import java.util.regex.Matcher;
18import java.util.regex.Pattern;
19import java.util.Base64;
hebastaff955d62020-05-27 15:59:14 +020020
21import javax.ws.rs.BadRequestException;
Akron328f9ea2020-09-22 17:11:22 +020022import javax.ws.rs.WebApplicationException;
hebastae0496762020-03-30 00:11:01 +020023import javax.ws.rs.FormParam;
24import javax.ws.rs.POST;
Akronb87af2f2020-09-24 14:38:02 +020025import javax.ws.rs.GET;
hebastae2bd6e42020-03-03 19:07:20 +010026import javax.ws.rs.Path;
27import javax.ws.rs.Produces;
hebastae0496762020-03-30 00:11:01 +020028import javax.ws.rs.client.Client;
29import javax.ws.rs.client.ClientBuilder;
30import javax.ws.rs.client.WebTarget;
Akronbe6777b2020-10-02 11:35:03 +020031import javax.ws.rs.client.Invocation;
32import javax.ws.rs.core.Context;
Akronbfd5fb82020-09-15 14:28:25 +020033import javax.ws.rs.core.UriBuilder;
hebastae2bd6e42020-03-03 19:07:20 +010034import javax.ws.rs.core.MediaType;
hebastae0496762020-03-30 00:11:01 +020035import javax.ws.rs.core.Response;
36import javax.ws.rs.core.Response.ResponseBuilder;
hebastaff955d62020-05-27 15:59:14 +020037import javax.ws.rs.core.Response.Status;
Akron669d0bb2020-10-02 13:12:01 +020038import javax.servlet.http.Cookie;
Akron328f9ea2020-09-22 17:11:22 +020039import java.net.ConnectException;
Akron669d0bb2020-10-02 13:12:01 +020040import javax.servlet.http.HttpServletRequest;
hebastaff955d62020-05-27 15:59:14 +020041
Akrone34bd4f2020-09-18 10:46:01 +020042import static de.ids_mannheim.korap.plkexport.Util.*;
43
Akronbb076c72020-09-28 14:39:15 +020044// Template engine
45import freemarker.template.Configuration;
46import freemarker.template.Template;
47
/**
 * TODO:
 * - Delete the temp file of the export at the end.
 * - Do not expect all meta data per match.
 * - Add progress mechanism.
 * - Add CSV export format.
 * - Add table layout to RTF information.
 * - Add loading marker.
 * - Add hitc to form.
 */
hebastaa66693a2020-07-19 16:51:28 +020058
hebastae0496762020-03-30 00:11:01 +020059@Path("/")
hebastae2bd6e42020-03-03 19:07:20 +010060public class IdsExportService {
61
// Service configuration as provided by ExWSConf
// (read below for the api.*, asset.*, conf.* and cookie.* keys).
Properties properties = ExWSConf.properties(null);

// Context class loader used to look up bundled assets
private final ClassLoader cl = Thread.currentThread().getContextClassLoader();

// The script served as "export.js"; read when the service
// instance is created
InputStream is = cl.getResourceAsStream("assets/export.js");
private final String exportJsStr = streamToString(is);

// Template engine configuration; templates are looked up
// relative to "/assets/templates"
Configuration cfg = new Configuration();
{
    cfg.setClassForTemplateLoading(IdsExportService.class, "/assets/templates");
    cfg.setDefaultEncoding("UTF-8");
}

// Regular expression fragment for a single octet of a dotted address
private final static String octets =
    "(?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})";

// Regular expression for four octets separated by dots
private final static String ipre =
    octets + "\\." + octets + "\\." + octets + "\\." + octets;

// Captures the value of the "auth" member in a decoded cookie payload.
// The pattern is shared by all instances and must never be reassigned,
// therefore it is declared final.
private static final Pattern authrep = Pattern.compile("\"auth\":\"([^\"]+?)\"");

// Decoder for the Base64 encoded part of the session cookie
private final static Base64.Decoder b64Dec = Base64.getDecoder();

// The current servlet request (injected)
@Context
private HttpServletRequest req;
Akronc931cd02020-09-15 10:54:17 +020087
Akrone57937b2020-11-17 08:49:31 +010088
hebastae0496762020-03-30 00:11:01 +020089 /**
90 * WebService calls Kustvakt Search Webservices and returns
Akroneedac912020-11-16 12:54:42 +010091 * response as json (all of the response) and
92 * as rtf (matches)
hebastae0496762020-03-30 00:11:01 +020093 *
hebastaff955d62020-05-27 15:59:14 +020094 * @param fname
95 * file name
96 * @param format
97 * the file format value rtf or json.
98 * @param q
99 * the query
100 * @param ql
101 * the query language
Akron96907962020-11-12 18:47:07 +0100102 * @param cutoff
103 * Export more than the first page
hebastaff955d62020-05-27 15:59:14 +0200104 *
105 *
hebastae0496762020-03-30 00:11:01 +0200106 */
107 @POST
108 @Path("export")
109 @Produces(MediaType.APPLICATION_OCTET_STREAM)
Akron96907962020-11-12 18:47:07 +0100110 public Response export (
Akronbe6777b2020-10-02 11:35:03 +0200111 @FormParam("fname") String fname,
112 @FormParam("format") String format,
113 @FormParam("q") String q,
Akrond2072ee2020-11-17 16:12:41 +0100114 @FormParam("cq") String cq,
Akronbe6777b2020-10-02 11:35:03 +0200115 @FormParam("ql") String ql,
Akron62d90a32020-11-18 20:45:38 +0100116 @FormParam("cutoff") String cutoffStr,
Akronb95c5a42020-11-17 13:44:17 +0100117 // @FormParam("islimit") String il,
Akron62d90a32020-11-18 20:45:38 +0100118 @FormParam("hitc") int hitc
Akronbe6777b2020-10-02 11:35:03 +0200119 ) throws IOException {
Akron232248d2020-11-11 18:55:10 +0100120
121 // These parameters are required
Akron90799f12020-09-17 17:10:34 +0200122 String[][] params = {
Akron90799f12020-09-17 17:10:34 +0200123 { "format", format },
124 { "q", q },
125 { "ql", ql }
126 };
hebastae0496762020-03-30 00:11:01 +0200127
Akron232248d2020-11-11 18:55:10 +0100128 // Check that all parameters are available
hebastaff955d62020-05-27 15:59:14 +0200129 for (int i = 0; i < params.length; i++) {
130 if (params[i][1] == null || params[i][1].trim().isEmpty())
Akron90799f12020-09-17 17:10:34 +0200131 throw new BadRequestException(
132 Response
133 .status(Status.BAD_REQUEST)
134 .entity("Parameter " + "\""
135 + params[i][0] + "\"" + " is missing or empty")
136 .build());
Akron232248d2020-11-11 18:55:10 +0100137 };
Akron232248d2020-11-11 18:55:10 +0100138
139 // Retrieve cutoff value
hebastafed41942020-11-02 10:00:36 +0100140 boolean cutoff = false;
Akron62d90a32020-11-18 20:45:38 +0100141 if (cutoffStr != null && (
142 cutoffStr.equals("true") ||
143 cutoffStr.equals("1"))
144 ) {
Akron232248d2020-11-11 18:55:10 +0100145 cutoff = true;
146 };
hebastafed41942020-11-02 10:00:36 +0100147
148 ResponseBuilder builder = null;
hebastae0496762020-03-30 00:11:01 +0200149 Client client = ClientBuilder.newClient();
150
Akron62d90a32020-11-18 20:45:38 +0100151 // Load configuration values
152 String scheme = properties.getProperty("api.scheme", "https");
153 String port = properties.getProperty("api.port", "8089");
154 String host = properties.getProperty("api.host", "localhost");
155 String path = properties.getProperty("api.path", "");
156 int pageSize = Integer.parseInt(properties.getProperty("conf.page_size", "5"));
157 int maxResults = Integer.parseInt(properties.getProperty("conf.max_exp_limit", "10000"));
hebastaa66693a2020-07-19 16:51:28 +0200158
Akron62d90a32020-11-18 20:45:38 +0100159 // Adjust the number of requested hits
160 if (hitc > 0 && hitc < maxResults) {
161 maxResults = hitc;
162 };
163
Akrone57937b2020-11-17 08:49:31 +0100164 // Create initial search uri
Akronbfd5fb82020-09-15 14:28:25 +0200165 UriBuilder uri = UriBuilder.fromPath("/api/v1.0/search")
166 .host(host)
167 .port(Integer.parseInt(port))
Akron480a9082020-09-16 13:14:09 +0200168 .scheme(scheme)
Akronbfd5fb82020-09-15 14:28:25 +0200169 .queryParam("q", q)
Akron1b201612020-10-02 17:40:32 +0200170 // .queryParam("context", "sentence")
Akron96907962020-11-12 18:47:07 +0100171 .queryParam("context", "40-t,40-t") // Not yet supported
Akronbfd5fb82020-09-15 14:28:25 +0200172 .queryParam("ql", ql)
Akronbfd5fb82020-09-15 14:28:25 +0200173 ;
Akron68f221b2020-09-29 23:15:43 +0200174
Akrond2072ee2020-11-17 16:12:41 +0100175 if (cq != null)
176 uri = uri.queryParam("cq", cq);
Akrond2072ee2020-11-17 16:12:41 +0100177
Akron68f221b2020-09-29 23:15:43 +0200178 if (path != "") {
179 uri = uri.path(path);
180 };
Akron62d90a32020-11-18 20:45:38 +0100181
Akron5abeab52020-11-12 15:08:48 +0100182 uri = uri.queryParam("count", pageSize);
Akronc931cd02020-09-15 10:54:17 +0200183
Akronbe6777b2020-10-02 11:35:03 +0200184 // Get client IP, in case service is behind a proxy
185 String xff = "";
Akron669d0bb2020-10-02 13:12:01 +0200186 // Get auth (temporarily) via Session riding
187 String auth = "";
Akronbe6777b2020-10-02 11:35:03 +0200188 if (req != null) {
189 xff = getClientIP(req.getHeader("X-Forwarded-For"));
Akron62d90a32020-11-18 20:45:38 +0100190 if (xff == "")
Akronbe6777b2020-10-02 11:35:03 +0200191 xff = req.getRemoteAddr();
Akron669d0bb2020-10-02 13:12:01 +0200192
Akron54b30ed2020-11-13 10:35:35 +0100193 auth = authFromCookie(req);
Akronbe6777b2020-10-02 11:35:03 +0200194 };
Akron669d0bb2020-10-02 13:12:01 +0200195
Akron328f9ea2020-09-22 17:11:22 +0200196 String resp;
hebastafed41942020-11-02 10:00:36 +0100197 WebTarget resource;
Akron696c0632020-11-12 15:30:51 +0100198 Invocation.Builder reqBuilder;
Akrond2072ee2020-11-17 16:12:41 +0100199
Akron328f9ea2020-09-22 17:11:22 +0200200 try {
hebastafed41942020-11-02 10:00:36 +0100201 resource = client.target(uri.build());
Akron696c0632020-11-12 15:30:51 +0100202 reqBuilder = resource.request(MediaType.APPLICATION_JSON);
203 resp = authBuilder(reqBuilder, xff, auth).get(String.class);
Akroneedac912020-11-16 12:54:42 +0100204
Akron328f9ea2020-09-22 17:11:22 +0200205 } catch (Exception e) {
206 throw new WebApplicationException(
Akron7f242bb2020-09-28 20:19:11 +0200207 responseForm(Status.BAD_GATEWAY, "Unable to reach Backend")
Akron328f9ea2020-09-22 17:11:22 +0200208 );
209 }
hebastae0496762020-03-30 00:11:01 +0200210
Akroneedac912020-11-16 12:54:42 +0100211 Exporter exp;
212
Akrone57937b2020-11-17 08:49:31 +0100213 // Choose the correct exporter
Akron62d90a32020-11-18 20:45:38 +0100214 if (format.equals("json"))
Akroneedac912020-11-16 12:54:42 +0100215 exp = new JsonExporter();
Akron62d90a32020-11-18 20:45:38 +0100216 else
Akroneedac912020-11-16 12:54:42 +0100217 exp = new RtfExporter();
Akrone57937b2020-11-17 08:49:31 +0100218
Akron62d90a32020-11-18 20:45:38 +0100219 exp.setMaxResults(maxResults);
Akron74122712020-11-17 09:41:21 +0100220 exp.setQueryString(q);
Akrond2072ee2020-11-17 16:12:41 +0100221 exp.setCorpusQueryString(cq);
Akron62d90a32020-11-18 20:45:38 +0100222
Akron876017d2020-11-17 09:19:24 +0100223 // set filename based on query (if not already set)
224 if (fname != null) {
Akron74122712020-11-17 09:41:21 +0100225 exp.setFileName(fname);
Akron876017d2020-11-17 09:19:24 +0100226 };
227
Akrone57937b2020-11-17 08:49:31 +0100228 // Initialize exporter (with meta data and first matches)
Akronacc9f7a2020-11-17 17:21:40 +0100229 try {
230 exp.init(resp);
Akron62d90a32020-11-18 20:45:38 +0100231
Akronacc9f7a2020-11-17 17:21:40 +0100232 } catch (Exception e) {
233
234 throw new WebApplicationException(
235 responseForm(
236 Status.INTERNAL_SERVER_ERROR,
237 e.getMessage()
238 )
239 );
Akron62d90a32020-11-18 20:45:38 +0100240 };
Akrond2072ee2020-11-17 16:12:41 +0100241
Akron62d90a32020-11-18 20:45:38 +0100242
243 /*
244 * Calculate how many results to fetch
245 */
246 int fetchCount = exp.getTotalResults();
247 if (exp.hasTimeExceeded() || fetchCount > maxResults) {
248 fetchCount = maxResults;
249 };
250
251 // The first page was already enough
252 if (fetchCount <= pageSize) {
253 cutoff = true;
254 };
255
Akrond3df5b02020-11-12 18:23:17 +0100256 // If only one page should be exported there is no need
257 // for a temporary export file
258 if (cutoff) {
Akroneedac912020-11-16 12:54:42 +0100259 builder = exp.serve();
hebastae0496762020-03-30 00:11:01 +0200260 }
261
Akron62d90a32020-11-18 20:45:38 +0100262 // Page through all results
hebastae0496762020-03-30 00:11:01 +0200263 else {
hebastaa66693a2020-07-19 16:51:28 +0200264
Akron62d90a32020-11-18 20:45:38 +0100265 // It's not important anymore to get totalResults
266 uri.queryParam("cutoff", "true");
Akrond3df5b02020-11-12 18:23:17 +0100267
Akron62d90a32020-11-18 20:45:38 +0100268 // Set offset for paging as a template
Akrond3df5b02020-11-12 18:23:17 +0100269 uri.queryParam("offset", "{offset}");
Akron96907962020-11-12 18:47:07 +0100270
Akronacc9f7a2020-11-17 17:21:40 +0100271 try {
272
273 // Iterate over all results
Akron62d90a32020-11-18 20:45:38 +0100274 for (int i = pageSize; i <= fetchCount; i+=pageSize) {
275 resource = client.target(uri.build(i));
Akronacc9f7a2020-11-17 17:21:40 +0100276 reqBuilder = resource.request(MediaType.APPLICATION_JSON);
277 resp = authBuilder(reqBuilder, xff, auth).get(String.class);
Akron62d90a32020-11-18 20:45:38 +0100278
279 // Stop when no more matches are allowed
280 if (!exp.appendMatches(resp))
281 break;
Akronacc9f7a2020-11-17 17:21:40 +0100282 }
283 } catch (Exception e) {
284 throw new WebApplicationException(
285 responseForm(
286 Status.INTERNAL_SERVER_ERROR,
287 e.getMessage()
288 )
Akrond3df5b02020-11-12 18:23:17 +0100289 );
Akronacc9f7a2020-11-17 17:21:40 +0100290 };
hebastafed41942020-11-02 10:00:36 +0100291
Akrone57937b2020-11-17 08:49:31 +0100292 builder = exp.serve();
hebastafed41942020-11-02 10:00:36 +0100293 };
Akron2c0184d2020-09-17 18:22:03 +0200294
Akrond3df5b02020-11-12 18:23:17 +0100295 return builder.build();
Akron96907962020-11-12 18:47:07 +0100296 };
297
hebastae2bd6e42020-03-03 19:07:20 +0100298
Akronb87af2f2020-09-24 14:38:02 +0200299 @GET
300 @Path("export")
301 @Produces(MediaType.TEXT_HTML)
302 public Response exportHTML () {
Akron7f242bb2020-09-28 20:19:11 +0200303 return responseForm();
304 };
Akronbb076c72020-09-28 14:39:15 +0200305
Akron7f242bb2020-09-28 20:19:11 +0200306
Akron96907962020-11-12 18:47:07 +0100307 @GET
308 @Path("export.js")
309 @Produces("application/javascript")
310 public Response exportJavascript () {
311 return Response
312 .ok(exportJsStr, "application/javascript")
313 .build();
314 };
315
Akron7f242bb2020-09-28 20:19:11 +0200316
Akron696c0632020-11-12 15:30:51 +0100317 // Decorate request with auth headers
Akron54b30ed2020-11-13 10:35:35 +0100318 private Invocation.Builder authBuilder (Invocation.Builder reqBuilder,
319 String xff,
320 String auth) {
Akron696c0632020-11-12 15:30:51 +0100321 if (xff != "") {
322 reqBuilder = reqBuilder.header("X-Forwarded-For", xff);
323 };
324 if (auth != "") {
325 reqBuilder = reqBuilder.header("Authorization", auth);
326 };
327
328 return reqBuilder;
329 };
330
Akron96907962020-11-12 18:47:07 +0100331
Akron54b30ed2020-11-13 10:35:35 +0100332 // Get authorization token from cookie
333 private String authFromCookie (HttpServletRequest r) {
334
335 // This is a temporary solution using session riding - only
336 // valid for the time being
337 Cookie[] cookies = r.getCookies();
338 String cookiePath = properties.getProperty("cookie.path", "");
339
340 // Iterate through all cookies for a Kalamar session
341 for (int i = 0; i < cookies.length; i++) {
342
343 // Check the valid path
344 if (cookiePath != "" && cookies[i].getPath() != cookiePath) {
345 continue;
346 };
347
348 // Ignore irrelevant cookies
349 if (!cookies[i].getName().matches("^kalamar(-.+?)?$")) {
350 continue;
351 };
352
353 // Get the value
354 String b64 = cookies[i].getValue();
355 String[] b64Parts = b64.split("--", 2);
356 if (b64Parts.length == 2) {
357 // Read the payload
358 String payload = new String(b64Dec.decode(b64Parts[0]));
359 if (payload != "") {
360 Matcher m = authrep.matcher(payload);
361 if (m.find()) {
362 return m.group(1);
363 };
364 };
365 };
366 };
367
368 return "";
369 };
370
371
Akron96907962020-11-12 18:47:07 +0100372 /*
373 * Response with form template.
374 */
375 private Response responseForm () {
376 return responseForm(null, null);
Akron54b30ed2020-11-13 10:35:35 +0100377 };
Akron96907962020-11-12 18:47:07 +0100378
379
380 /*
381 * Response with form template.
382 *
383 * Accepts an error code and message.
384 */
Akron7f242bb2020-09-28 20:19:11 +0200385 private Response responseForm (Status code, String msg) {
Akronbb076c72020-09-28 14:39:15 +0200386 StringWriter out = new StringWriter();
387 HashMap<String, Object> templateData = new HashMap<String, Object>();
388
389 String scheme = properties.getProperty("asset.scheme", "https");
390 String port = properties.getProperty("asset.port", "");
391 String host = properties.getProperty("asset.host", "korap.ids-mannheim.de");
Akron68f221b2020-09-29 23:15:43 +0200392 String path = properties.getProperty("asset.path", "");
Akronbb076c72020-09-28 14:39:15 +0200393
394 UriBuilder uri = UriBuilder.fromPath("")
395 .host(host)
Akron68f221b2020-09-29 23:15:43 +0200396 .scheme(scheme);
397
398 if (path != "") {
399 uri = uri.path(path);
400 };
Akronbb076c72020-09-28 14:39:15 +0200401
402 if (port != "") {
403 uri = uri.port(Integer.parseInt(port));
Akron68f221b2020-09-29 23:15:43 +0200404 };
Akronbb076c72020-09-28 14:39:15 +0200405
406 templateData.put("assetPath", uri.build());
407
Akron7f242bb2020-09-28 20:19:11 +0200408 if (code != null) {
409 templateData.put("code", code.getStatusCode());
410 templateData.put("msg", msg);
411 };
412
413 // Generate template
Akronbb076c72020-09-28 14:39:15 +0200414 try {
415 Template template = cfg.getTemplate("export.ftl");
416 template.process(templateData, out);
417 }
418 catch (Exception e) {
419 return Response
420 .ok(new String("Template not found"))
421 .status(Status.INTERNAL_SERVER_ERROR)
422 .build();
Akron54b30ed2020-11-13 10:35:35 +0100423 };
hebastae2bd6e42020-03-03 19:07:20 +0100424
Akron7f242bb2020-09-28 20:19:11 +0200425 ResponseBuilder resp = Response.ok(out.toString(), "text/html");
426
427 if (code != null) {
428 resp = resp.status(code);
429 };
430
431 return resp.build();
Akroneedac912020-11-16 12:54:42 +0100432 };
Akronbe6777b2020-10-02 11:35:03 +0200433
434
435 /*
436 * This function is a simplification of
437 * Mojolicious::Plugin::ClientIP
438 */
439 protected static String getClientIP (String xff) {
440 if (xff == null) {
441 return "";
442 };
443
444 String[] ips = xff.split("\\s*,\\s*");
445
446 for (int i = ips.length - 1; i >= 0; i--){
447 if (ips[i].matches(ipre)) {
448 return ips[i];
449 };
450 };
451
452 return "";
Akron232248d2020-11-11 18:55:10 +0100453 };
Akron232248d2020-11-11 18:55:10 +0100454};