blob: 9e85cae05b95f1812cd97da40236e3f6b8753655 [file] [log] [blame]
hebastae2bd6e42020-03-03 19:07:20 +01001package de.ids_mannheim.korap.plkexport;
2
hebastafed41942020-11-02 10:00:36 +01003import java.io.BufferedWriter;
4import java.io.File;
5import java.io.FileWriter;
hebastae0496762020-03-30 00:11:01 +02006import java.io.IOException;
Akronbb076c72020-09-28 14:39:15 +02007import java.io.StringWriter;
Akronb87af2f2020-09-24 14:38:02 +02008import java.lang.Thread;
9import java.io.InputStream;
hebastae0496762020-03-30 00:11:01 +020010import java.net.URLEncoder;
Akronbb076c72020-09-28 14:39:15 +020011import java.util.HashMap;
hebastae0496762020-03-30 00:11:01 +020012import java.util.ArrayList;
13import java.util.Collection;
14import java.util.Iterator;
15import java.util.LinkedList;
Akronc931cd02020-09-15 10:54:17 +020016import java.util.Properties;
Akron669d0bb2020-10-02 13:12:01 +020017import java.util.regex.Matcher;
18import java.util.regex.Pattern;
19import java.util.Base64;
hebastaff955d62020-05-27 15:59:14 +020020
21import javax.ws.rs.BadRequestException;
Akron328f9ea2020-09-22 17:11:22 +020022import javax.ws.rs.WebApplicationException;
hebastae0496762020-03-30 00:11:01 +020023import javax.ws.rs.FormParam;
24import javax.ws.rs.POST;
Akronb87af2f2020-09-24 14:38:02 +020025import javax.ws.rs.GET;
hebastae2bd6e42020-03-03 19:07:20 +010026import javax.ws.rs.Path;
27import javax.ws.rs.Produces;
hebastae0496762020-03-30 00:11:01 +020028import javax.ws.rs.client.Client;
29import javax.ws.rs.client.ClientBuilder;
30import javax.ws.rs.client.WebTarget;
Akronbe6777b2020-10-02 11:35:03 +020031import javax.ws.rs.client.Invocation;
32import javax.ws.rs.core.Context;
Akronbfd5fb82020-09-15 14:28:25 +020033import javax.ws.rs.core.UriBuilder;
hebastae2bd6e42020-03-03 19:07:20 +010034import javax.ws.rs.core.MediaType;
hebastae0496762020-03-30 00:11:01 +020035import javax.ws.rs.core.Response;
36import javax.ws.rs.core.Response.ResponseBuilder;
hebastaff955d62020-05-27 15:59:14 +020037import javax.ws.rs.core.Response.Status;
Akron669d0bb2020-10-02 13:12:01 +020038import javax.servlet.http.Cookie;
Akron328f9ea2020-09-22 17:11:22 +020039import java.net.ConnectException;
Akron669d0bb2020-10-02 13:12:01 +020040import javax.servlet.http.HttpServletRequest;
hebastaff955d62020-05-27 15:59:14 +020041
Akrone34bd4f2020-09-18 10:46:01 +020042import static de.ids_mannheim.korap.plkexport.Util.*;
43
Akronbb076c72020-09-28 14:39:15 +020044// Template engine
45import freemarker.template.Configuration;
46import freemarker.template.Template;
47
/**
 * TODO:
 * - Paging export works only for RTF; JSON has to be integrated
 * - Delete the temp file of the export at the end
 * - Get variable cutoff from URL
 * - Right now, the web service returns one page (cutoff=1) or
 *   all pages.
 * - Handle timeout results (with minimum total results).
 * - Use offset instead of page parameter
 * - Add mime type to exporters
 * - Add format to exporters
 * - Add file suffix to exporters
 * - Add "..." to snippets in RTF exporter
 * - Test snippet export with multiple classes.
 * - Test snippet export with cut-off matches.
 */
hebastaa66693a2020-07-19 16:51:28 +020064
hebastae0496762020-03-30 00:11:01 +020065@Path("/")
hebastae2bd6e42020-03-03 19:07:20 +010066public class IdsExportService {
67
// Service configuration as returned by ExWSConf
Properties properties = ExWSConf.properties(null);

// Class loader used to look up the bundled assets
private final ClassLoader cl = Thread.currentThread().getContextClassLoader();

// Client side script that is served by exportJavascript()
InputStream is = cl.getResourceAsStream("assets/export.js");
private final String exportJsStr = streamToString(is);

// Template engine configuration; templates are loaded
// relative to /assets/templates and read as UTF-8
Configuration cfg = new Configuration();
{
    cfg.setClassForTemplateLoading(IdsExportService.class, "/assets/templates");
    cfg.setDefaultEncoding("UTF-8");
}

// A single decimal octet of an IPv4 address
private final static String octets =
    "(?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})";

// A dotted IPv4 address made of four octets
private final static String ipre =
    octets + "\\." + octets + "\\." + octets + "\\." + octets;

// Extracts the value of the "auth" member from a serialized
// session payload. The pattern is never reassigned, so it is final.
private static final Pattern authrep = Pattern.compile("\"auth\":\"([^\"]+?)\"");

// Decoder for the base64 encoded part of the session cookie
private final static Base64.Decoder b64Dec = Base64.getDecoder();

// The current servlet request, injected by the JAX-RS runtime
@Context
private HttpServletRequest req;
Akronc931cd02020-09-15 10:54:17 +020093
hebastae0496762020-03-30 00:11:01 +020094 /**
95 * WebService calls Kustvakt Search Webservices and returns
Akroneedac912020-11-16 12:54:42 +010096 * response as json (all of the response) and
97 * as rtf (matches)
hebastae0496762020-03-30 00:11:01 +020098 *
hebastaff955d62020-05-27 15:59:14 +020099 * @param fname
100 * file name
101 * @param format
102 * the file format value rtf or json.
103 * @param q
104 * the query
105 * @param ql
106 * the query language
Akron96907962020-11-12 18:47:07 +0100107 * @param cutoff
108 * Export more than the first page
hebastaff955d62020-05-27 15:59:14 +0200109 *
110 *
hebastae0496762020-03-30 00:11:01 +0200111 */
112 @POST
113 @Path("export")
114 @Produces(MediaType.APPLICATION_OCTET_STREAM)
Akron96907962020-11-12 18:47:07 +0100115 public Response export (
Akronbe6777b2020-10-02 11:35:03 +0200116 @FormParam("fname") String fname,
117 @FormParam("format") String format,
118 @FormParam("q") String q,
119 @FormParam("ql") String ql,
120 @FormParam("islimit") String il,
Akron232248d2020-11-11 18:55:10 +0100121 @FormParam("cutoff") String cutoffStr,
Akron669d0bb2020-10-02 13:12:01 +0200122 @FormParam("hitc") int hitc
Akronbe6777b2020-10-02 11:35:03 +0200123 ) throws IOException {
Akron232248d2020-11-11 18:55:10 +0100124
125 // These parameters are required
Akron90799f12020-09-17 17:10:34 +0200126 String[][] params = {
Akron90799f12020-09-17 17:10:34 +0200127 { "format", format },
128 { "q", q },
129 { "ql", ql }
130 };
hebastae0496762020-03-30 00:11:01 +0200131
Akron232248d2020-11-11 18:55:10 +0100132 // Check that all parameters are available
hebastaff955d62020-05-27 15:59:14 +0200133 for (int i = 0; i < params.length; i++) {
134 if (params[i][1] == null || params[i][1].trim().isEmpty())
Akron90799f12020-09-17 17:10:34 +0200135 throw new BadRequestException(
136 Response
137 .status(Status.BAD_REQUEST)
138 .entity("Parameter " + "\""
139 + params[i][0] + "\"" + " is missing or empty")
140 .build());
Akron232248d2020-11-11 18:55:10 +0100141 };
hebastaff955d62020-05-27 15:59:14 +0200142
hebastafed41942020-11-02 10:00:36 +0100143
Akroneedac912020-11-16 12:54:42 +0100144 int totalhits = -1;
Akron232248d2020-11-11 18:55:10 +0100145
146 // Retrieve cutoff value
hebastafed41942020-11-02 10:00:36 +0100147 boolean cutoff = false;
Akron232248d2020-11-11 18:55:10 +0100148 if (cutoffStr != null && (cutoffStr.equals("true") || cutoffStr.equals("1"))) {
149 cutoff = true;
150 };
hebastafed41942020-11-02 10:00:36 +0100151
152 ResponseBuilder builder = null;
hebastae0496762020-03-30 00:11:01 +0200153 Client client = ClientBuilder.newClient();
154
Akron480a9082020-09-16 13:14:09 +0200155 String scheme = properties.getProperty("api.scheme", "https");
Akron5abeab52020-11-12 15:08:48 +0100156 String port = properties.getProperty("api.port", "8089");
157 String host = properties.getProperty("api.host", "localhost");
158 String path = properties.getProperty("api.path", "");
159 int pageSize = Integer.parseInt(properties.getProperty("conf.page_size", "5"));
hebastaa66693a2020-07-19 16:51:28 +0200160
Akronbfd5fb82020-09-15 14:28:25 +0200161 UriBuilder uri = UriBuilder.fromPath("/api/v1.0/search")
162 .host(host)
163 .port(Integer.parseInt(port))
Akron480a9082020-09-16 13:14:09 +0200164 .scheme(scheme)
Akronbfd5fb82020-09-15 14:28:25 +0200165 .queryParam("q", q)
Akron1b201612020-10-02 17:40:32 +0200166 // .queryParam("context", "sentence")
Akron96907962020-11-12 18:47:07 +0100167 .queryParam("context", "40-t,40-t") // Not yet supported
Akronbfd5fb82020-09-15 14:28:25 +0200168 .queryParam("ql", ql)
Akronbfd5fb82020-09-15 14:28:25 +0200169 ;
Akron68f221b2020-09-29 23:15:43 +0200170
171 if (path != "") {
172 uri = uri.path(path);
173 };
Akron669d0bb2020-10-02 13:12:01 +0200174
Akron5abeab52020-11-12 15:08:48 +0100175 uri = uri.queryParam("count", pageSize);
Akronc931cd02020-09-15 10:54:17 +0200176
Akronbe6777b2020-10-02 11:35:03 +0200177 // Get client IP, in case service is behind a proxy
178 String xff = "";
Akron669d0bb2020-10-02 13:12:01 +0200179 // Get auth (temporarily) via Session riding
180 String auth = "";
Akronbe6777b2020-10-02 11:35:03 +0200181 if (req != null) {
182 xff = getClientIP(req.getHeader("X-Forwarded-For"));
183 if (xff == "") {
184 xff = req.getRemoteAddr();
185 };
Akron669d0bb2020-10-02 13:12:01 +0200186
Akron54b30ed2020-11-13 10:35:35 +0100187 auth = authFromCookie(req);
Akronbe6777b2020-10-02 11:35:03 +0200188 };
Akron669d0bb2020-10-02 13:12:01 +0200189
Akron328f9ea2020-09-22 17:11:22 +0200190 String resp;
hebastafed41942020-11-02 10:00:36 +0100191 WebTarget resource;
Akron696c0632020-11-12 15:30:51 +0100192 Invocation.Builder reqBuilder;
193
Akron328f9ea2020-09-22 17:11:22 +0200194 try {
hebastafed41942020-11-02 10:00:36 +0100195 resource = client.target(uri.build());
Akron696c0632020-11-12 15:30:51 +0100196 reqBuilder = resource.request(MediaType.APPLICATION_JSON);
197 resp = authBuilder(reqBuilder, xff, auth).get(String.class);
Akroneedac912020-11-16 12:54:42 +0100198
Akron328f9ea2020-09-22 17:11:22 +0200199 } catch (Exception e) {
200 throw new WebApplicationException(
Akron7f242bb2020-09-28 20:19:11 +0200201 responseForm(Status.BAD_GATEWAY, "Unable to reach Backend")
Akron328f9ea2020-09-22 17:11:22 +0200202 );
203 }
hebastae0496762020-03-30 00:11:01 +0200204
Akron54b30ed2020-11-13 10:35:35 +0100205 // set filename based on query (if not already set)
Akron2c0184d2020-09-17 18:22:03 +0200206 if (fname == null) {
207 fname = q;
208 }
209
Akroneedac912020-11-16 12:54:42 +0100210 Exporter exp;
211
212 if (format.equals("json")) {
213 exp = new JsonExporter();
214 }
215 else {
216 exp = new RtfExporter();
217 };
218
219 exp.init(resp);
220
Akrond3df5b02020-11-12 18:23:17 +0100221 // If only one page should be exported there is no need
222 // for a temporary export file
223 if (cutoff) {
224
Akroneedac912020-11-16 12:54:42 +0100225 builder = exp.serve();
226
Akrond3df5b02020-11-12 18:23:17 +0100227 if (format.equals("json")) {
Akron232248d2020-11-11 18:55:10 +0100228 builder.type(MediaType.APPLICATION_JSON);
Akrond3df5b02020-11-12 18:23:17 +0100229 }
230
231 else {
Akrond3df5b02020-11-12 18:23:17 +0100232 builder.type("application/rtf");
233 format = "rtf";
Akron232248d2020-11-11 18:55:10 +0100234 };
hebastae0496762020-03-30 00:11:01 +0200235 }
236
Akrond3df5b02020-11-12 18:23:17 +0100237 // Page through results
hebastae0496762020-03-30 00:11:01 +0200238 else {
hebastaa66693a2020-07-19 16:51:28 +0200239
hebastafed41942020-11-02 10:00:36 +0100240 /*
241 * Get total results
242 */
Akroneedac912020-11-16 12:54:42 +0100243 if (exp.getMeta() != null) {
244 totalhits = exp.getMeta().get("totalResults").asInt();
245 };
Akrond3df5b02020-11-12 18:23:17 +0100246
Akrond3df5b02020-11-12 18:23:17 +0100247 /*
248 * Get number of pages and the number of hits
249 * which should be exported at the last page
250 */
251 int pg = 1;
Akroneedac912020-11-16 12:54:42 +0100252 if (totalhits % pageSize > 0) {
Akrond3df5b02020-11-12 18:23:17 +0100253 pg = totalhits / pageSize + 1;
254 }
255 else {
256 pg = totalhits / pageSize;
hebastae0496762020-03-30 00:11:01 +0200257 }
258
Akrond3df5b02020-11-12 18:23:17 +0100259 uri.queryParam("offset", "{offset}");
Akron96907962020-11-12 18:47:07 +0100260
Akroneedac912020-11-16 12:54:42 +0100261 // Iterate over all results
262 for (int i = 2; i <= pg; i++) {
Akrond3df5b02020-11-12 18:23:17 +0100263 resource = client.target(
264 uri.build((i * pageSize) - pageSize)
265 );
Akron696c0632020-11-12 15:30:51 +0100266
Akrond3df5b02020-11-12 18:23:17 +0100267 reqBuilder = resource.request(MediaType.APPLICATION_JSON);
268 resp = authBuilder(reqBuilder, xff, auth).get(String.class);
Akroneedac912020-11-16 12:54:42 +0100269 exp.appendMatches(resp);
hebastafed41942020-11-02 10:00:36 +0100270 }
Akroneedac912020-11-16 12:54:42 +0100271 // builder = Response.ok(expTmp);
272 builder = exp.serve();
hebastafed41942020-11-02 10:00:36 +0100273
Akroneedac912020-11-16 12:54:42 +0100274 if (format.equals("json")) {
275 builder.type(MediaType.APPLICATION_JSON);
276 }
277 else {
278 builder.type("application/rtf");
279 format = "rtf";
280 };
hebastafed41942020-11-02 10:00:36 +0100281 };
Akron2c0184d2020-09-17 18:22:03 +0200282
Akrona00142a2020-10-02 17:51:10 +0200283 builder.header(
284 "Content-Disposition",
285 "attachment; filename=" +
286 sanitizeFileName(fname) +
287 '.' +
288 format
289 );
Akrond3df5b02020-11-12 18:23:17 +0100290 return builder.build();
Akron96907962020-11-12 18:47:07 +0100291 };
292
hebastae2bd6e42020-03-03 19:07:20 +0100293
Akronb87af2f2020-09-24 14:38:02 +0200294 @GET
295 @Path("export")
296 @Produces(MediaType.TEXT_HTML)
297 public Response exportHTML () {
Akron7f242bb2020-09-28 20:19:11 +0200298 return responseForm();
299 };
Akronbb076c72020-09-28 14:39:15 +0200300
Akron7f242bb2020-09-28 20:19:11 +0200301
Akron96907962020-11-12 18:47:07 +0100302 @GET
303 @Path("export.js")
304 @Produces("application/javascript")
305 public Response exportJavascript () {
306 return Response
307 .ok(exportJsStr, "application/javascript")
308 .build();
309 };
310
Akron7f242bb2020-09-28 20:19:11 +0200311
Akron696c0632020-11-12 15:30:51 +0100312 // Decorate request with auth headers
Akron54b30ed2020-11-13 10:35:35 +0100313 private Invocation.Builder authBuilder (Invocation.Builder reqBuilder,
314 String xff,
315 String auth) {
Akron696c0632020-11-12 15:30:51 +0100316 if (xff != "") {
317 reqBuilder = reqBuilder.header("X-Forwarded-For", xff);
318 };
319 if (auth != "") {
320 reqBuilder = reqBuilder.header("Authorization", auth);
321 };
322
323 return reqBuilder;
324 };
325
Akron96907962020-11-12 18:47:07 +0100326
Akron54b30ed2020-11-13 10:35:35 +0100327 // Get authorization token from cookie
328 private String authFromCookie (HttpServletRequest r) {
329
330 // This is a temporary solution using session riding - only
331 // valid for the time being
332 Cookie[] cookies = r.getCookies();
333 String cookiePath = properties.getProperty("cookie.path", "");
334
335 // Iterate through all cookies for a Kalamar session
336 for (int i = 0; i < cookies.length; i++) {
337
338 // Check the valid path
339 if (cookiePath != "" && cookies[i].getPath() != cookiePath) {
340 continue;
341 };
342
343 // Ignore irrelevant cookies
344 if (!cookies[i].getName().matches("^kalamar(-.+?)?$")) {
345 continue;
346 };
347
348 // Get the value
349 String b64 = cookies[i].getValue();
350 String[] b64Parts = b64.split("--", 2);
351 if (b64Parts.length == 2) {
352 // Read the payload
353 String payload = new String(b64Dec.decode(b64Parts[0]));
354 if (payload != "") {
355 Matcher m = authrep.matcher(payload);
356 if (m.find()) {
357 return m.group(1);
358 };
359 };
360 };
361 };
362
363 return "";
364 };
365
366
Akron96907962020-11-12 18:47:07 +0100367 /*
368 * Response with form template.
369 */
370 private Response responseForm () {
371 return responseForm(null, null);
Akron54b30ed2020-11-13 10:35:35 +0100372 };
Akron96907962020-11-12 18:47:07 +0100373
374
375 /*
376 * Response with form template.
377 *
378 * Accepts an error code and message.
379 */
Akron7f242bb2020-09-28 20:19:11 +0200380 private Response responseForm (Status code, String msg) {
Akronbb076c72020-09-28 14:39:15 +0200381 StringWriter out = new StringWriter();
382 HashMap<String, Object> templateData = new HashMap<String, Object>();
383
384 String scheme = properties.getProperty("asset.scheme", "https");
385 String port = properties.getProperty("asset.port", "");
386 String host = properties.getProperty("asset.host", "korap.ids-mannheim.de");
Akron68f221b2020-09-29 23:15:43 +0200387 String path = properties.getProperty("asset.path", "");
Akronbb076c72020-09-28 14:39:15 +0200388
389 UriBuilder uri = UriBuilder.fromPath("")
390 .host(host)
Akron68f221b2020-09-29 23:15:43 +0200391 .scheme(scheme);
392
393 if (path != "") {
394 uri = uri.path(path);
395 };
Akronbb076c72020-09-28 14:39:15 +0200396
397 if (port != "") {
398 uri = uri.port(Integer.parseInt(port));
Akron68f221b2020-09-29 23:15:43 +0200399 };
Akronbb076c72020-09-28 14:39:15 +0200400
401 templateData.put("assetPath", uri.build());
402
Akron7f242bb2020-09-28 20:19:11 +0200403 if (code != null) {
404 templateData.put("code", code.getStatusCode());
405 templateData.put("msg", msg);
406 };
407
408 // Generate template
Akronbb076c72020-09-28 14:39:15 +0200409 try {
410 Template template = cfg.getTemplate("export.ftl");
411 template.process(templateData, out);
412 }
413 catch (Exception e) {
414 return Response
415 .ok(new String("Template not found"))
416 .status(Status.INTERNAL_SERVER_ERROR)
417 .build();
Akron54b30ed2020-11-13 10:35:35 +0100418 };
hebastae2bd6e42020-03-03 19:07:20 +0100419
Akron7f242bb2020-09-28 20:19:11 +0200420 ResponseBuilder resp = Response.ok(out.toString(), "text/html");
421
422 if (code != null) {
423 resp = resp.status(code);
424 };
425
426 return resp.build();
Akroneedac912020-11-16 12:54:42 +0100427 };
Akronbe6777b2020-10-02 11:35:03 +0200428
429
430 /*
431 * This function is a simplification of
432 * Mojolicious::Plugin::ClientIP
433 */
434 protected static String getClientIP (String xff) {
435 if (xff == null) {
436 return "";
437 };
438
439 String[] ips = xff.split("\\s*,\\s*");
440
441 for (int i = ips.length - 1; i >= 0; i--){
442 if (ips[i].matches(ipre)) {
443 return ips[i];
444 };
445 };
446
447 return "";
Akron232248d2020-11-11 18:55:10 +0100448 };
Akron232248d2020-11-11 18:55:10 +0100449};