blob: 8aa283d6ba6c3f7562dd80a9208cfddc8c0ae88b [file] [log] [blame]
Akronb329d272020-11-13 12:45:26 +01001package de.ids_mannheim.korap.plkexport;
2
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.io.Writer;

import java.nio.charset.StandardCharsets;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Properties;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.Version;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import javax.ws.rs.core.Response;
import javax.ws.rs.core.Response.ResponseBuilder;
import javax.ws.rs.core.Response.Status;
import javax.ws.rs.core.StreamingOutput;

import org.glassfish.jersey.media.sse.EventOutput;
import org.glassfish.jersey.media.sse.OutboundEvent;

import static de.ids_mannheim.korap.plkexport.Util.*;
35
Akron1d36eb52020-11-13 17:52:26 +010036/**
37 * Base class for collecting matches and header information
38 * for exporters implementing the Exporter interface.
39 */
Akron1d36eb52020-11-13 17:52:26 +010040public class MatchAggregator {
Akronb329d272020-11-13 12:45:26 +010041
Akron35881012020-11-24 20:05:06 +010042 private final Properties prop = ExWSConf.properties(null);
43
Akron984fe8f2020-11-25 15:21:37 +010044 private static final ObjectMapper mapper = new ObjectMapper();
Akronb329d272020-11-13 12:45:26 +010045
Akron984fe8f2020-11-25 15:21:37 +010046 // In-memory and persistant writer for data
Akron1d36eb52020-11-13 17:52:26 +010047 private Writer writer;
Akron1d36eb52020-11-13 17:52:26 +010048 private File file;
Akron984fe8f2020-11-25 15:21:37 +010049
50 // Meta information for result exports
Akronfddd0582020-11-17 09:49:54 +010051 private JsonNode meta, query, collection;
Akron820dc642020-11-19 13:11:50 +010052 private String fname, queryString, corpusQueryString, src;
Akronc1c18242020-11-18 18:24:12 +010053 private boolean timeExceeded = false;
Akron984fe8f2020-11-25 15:21:37 +010054
55 // Result calculations (partially for progress)
56 private int totalResults = -1,
57 maxResults = -1,
58 fetchedResults = 0;
59
60 // Event writer for progress
Akronaf145eb2020-11-24 16:55:47 +010061 private EventOutput evOut;
Akron984fe8f2020-11-25 15:21:37 +010062
63
64 /**
65 * MimeType of the exporter -
66 * defaults to "text/plain" but
67 * should be overwritten.
68 */
Akrone57937b2020-11-17 08:49:31 +010069 public String getMimeType() {
70 return "text/plain";
71 };
72
Akron984fe8f2020-11-25 15:21:37 +010073
74 /**
75 * Suffix of the exported file -
76 * defaults to "txt" but
77 * should be overwritten.
78 */
Akrone57937b2020-11-17 08:49:31 +010079 public String getSuffix() {
80 return "txt";
81 };
82
Akron984fe8f2020-11-25 15:21:37 +010083
84 /**
85 * Total results of exportable matches.
86 */
Akronc1c18242020-11-18 18:24:12 +010087 public int getTotalResults() {
88 return this.totalResults;
89 };
Akron984fe8f2020-11-25 15:21:37 +010090
Akron62d90a32020-11-18 20:45:38 +010091
Akron984fe8f2020-11-25 15:21:37 +010092 /**
93 * Indicator if time was exceeded when
94 * fetching all matches. This means
95 * that "totalResults" needs
96 * to be treated as a minimum value.
97 */
Akronc1c18242020-11-18 18:24:12 +010098 public boolean hasTimeExceeded() {
99 return this.timeExceeded;
100 };
Akron984fe8f2020-11-25 15:21:37 +0100101
Akronc1c18242020-11-18 18:24:12 +0100102
Akron984fe8f2020-11-25 15:21:37 +0100103 /**
104 * Set the file name of the file to
105 * be exported.
106 */
Akron74122712020-11-17 09:41:21 +0100107 public void setFileName (String fname) {
Akron876017d2020-11-17 09:19:24 +0100108 this.fname = fname;
109 };
110
Akron984fe8f2020-11-25 15:21:37 +0100111
112 /**
113 * Get the file name of the file to
114 * be exported.
115 */
Akron74122712020-11-17 09:41:21 +0100116 public String getFileName () {
117 String s = this.fname;
118 if (s == null)
119 s = this.queryString;
120 if (s == null)
121 return "export";
122 return sanitizeFileName(s);
Akron876017d2020-11-17 09:19:24 +0100123 };
124
Akron984fe8f2020-11-25 15:21:37 +0100125
126 /**
127 * Set the query string.
128 */
Akron74122712020-11-17 09:41:21 +0100129 public void setQueryString (String query) {
130 this.queryString = query;
131 };
132
Akron984fe8f2020-11-25 15:21:37 +0100133
134 /**
135 * Get the query string.
136 */
Akron74122712020-11-17 09:41:21 +0100137 public String getQueryString () {
138 return this.queryString;
139 };
Akrond2072ee2020-11-17 16:12:41 +0100140
Akron984fe8f2020-11-25 15:21:37 +0100141
142 /**
143 * Set the corpus query string.
144 */
Akrond2072ee2020-11-17 16:12:41 +0100145 public void setCorpusQueryString (String query) {
146 this.corpusQueryString = query;
147 };
148
Akron984fe8f2020-11-25 15:21:37 +0100149
150 /**
151 * Get the corpus query string.
152 */
Akrond2072ee2020-11-17 16:12:41 +0100153 public String getCorpusQueryString () {
154 return this.corpusQueryString;
155 };
Akron820dc642020-11-19 13:11:50 +0100156
Akron984fe8f2020-11-25 15:21:37 +0100157
158 /**
159 * Set the source information.
160 */
Akron820dc642020-11-19 13:11:50 +0100161 public void setSource (String host, String path) {
162 StringBuilder s = new StringBuilder(32);
163 if (host != null)
164 s.append(host);
165
166 if (path != null && path.length() > 0)
167 s.append('/').append(path);
168
169 this.src = s.toString();
170 };
171
Akron5405dec2020-11-26 20:39:24 +0100172
173 /**
Akron83f39942021-03-22 16:37:25 +0100174 * Set the source information verbatim.
Akron5405dec2020-11-26 20:39:24 +0100175 */
176 public void setSource (String src) {
177 this.src = src;
178 };
179
Akron984fe8f2020-11-25 15:21:37 +0100180
181 /**
182 * Get the source information.
183 */
Akron820dc642020-11-19 13:11:50 +0100184 public String getSource () {
185 return this.src;
186 };
Akron984fe8f2020-11-25 15:21:37 +0100187
Akron74122712020-11-17 09:41:21 +0100188
Akron984fe8f2020-11-25 15:21:37 +0100189 /**
190 * Set the meta JSON blob.
191 */
Akron1d36eb52020-11-13 17:52:26 +0100192 public void setMeta (JsonNode meta) {
193 this.meta = meta;
194 };
Akroneedac912020-11-16 12:54:42 +0100195
Akron984fe8f2020-11-25 15:21:37 +0100196
197 /**
198 * Get the meta JSON blob.
199 */
Akroneedac912020-11-16 12:54:42 +0100200 public JsonNode getMeta () {
201 return this.meta;
202 };
Akron984fe8f2020-11-25 15:21:37 +0100203
Akroneedac912020-11-16 12:54:42 +0100204
Akron984fe8f2020-11-25 15:21:37 +0100205 /**
206 * Set the query JSON blob.
207 */
Akron1d36eb52020-11-13 17:52:26 +0100208 public void setQuery (JsonNode query) {
209 this.query = query;
210 };
211
Akron62d90a32020-11-18 20:45:38 +0100212
Akron984fe8f2020-11-25 15:21:37 +0100213 /**
214 * Get the query JSON blob.
215 */
Akroneedac912020-11-16 12:54:42 +0100216 public JsonNode getQuery () {
217 return this.query;
218 };
219
Akron984fe8f2020-11-25 15:21:37 +0100220
221 /**
222 * Set the collection JSON blob.
223 */
Akron1d36eb52020-11-13 17:52:26 +0100224 public void setCollection (JsonNode collection) {
225 this.collection = collection;
226 };
227
Akron984fe8f2020-11-25 15:21:37 +0100228
229 /**
230 * Get the collection JSON blob.
231 */
Akroneedac912020-11-16 12:54:42 +0100232 public JsonNode getCollection () {
233 return this.collection;
234 };
235
Akron984fe8f2020-11-25 15:21:37 +0100236
237 /**
238 * Set the maximum results to be fetched.
239 *
240 * This needs to be set prior to the first
241 * "addMatch" so it can be taken into account.
242 */
243 public void setMaxResults (int maxResults) {
244 this.maxResults = maxResults;
245 };
246
247
248 /**
249 * Get the maximum results to be fetched.
250 */
251 public int getMaxResults () {
252 return this.maxResults;
253 };
254
255
256 /**
257 * Get the export ID which is the pointer
258 * to where the system can find the temporary
259 * generated file.
260 */
Akron35881012020-11-24 20:05:06 +0100261 public String getExportID () {
262 if (this.file == null)
263 return "";
264 return this.file.getName();
265 };
266
Akron984fe8f2020-11-25 15:21:37 +0100267
Akron35881012020-11-24 20:05:06 +0100268 /**
269 * Set the file based on the export ID
270 */
271 public void setFile (String exportID) {
272 this.file = new File(
273 this.getFileDirectory(),
274 exportID
275 );
276 }
Akron984fe8f2020-11-25 15:21:37 +0100277
Akron35881012020-11-24 20:05:06 +0100278
Akron984fe8f2020-11-25 15:21:37 +0100279 /**
280 * Write header for exportation.
281 *
282 * Should be overwritten.
283 */
Akron1d36eb52020-11-13 17:52:26 +0100284 public void writeHeader (Writer w) throws IOException { };
Akron984fe8f2020-11-25 15:21:37 +0100285
286
287 /**
288 * Write footer for exportation.
289 *
290 * Should be overwritten.
291 */
Akron1d36eb52020-11-13 17:52:26 +0100292 public void writeFooter (Writer w) throws IOException { };
Akron984fe8f2020-11-25 15:21:37 +0100293
294
295 /**
296 * Write a single match.
297 *
298 * Should be overwritten.
299 */
Akron1d36eb52020-11-13 17:52:26 +0100300 public void addMatch (JsonNode n, Writer w) throws IOException { };
Akrond0b1cfe2020-11-20 19:26:52 +0100301
Akron984fe8f2020-11-25 15:21:37 +0100302
303 /**
304 * Set the event stream for progress feedback.
305 */
Akronaf145eb2020-11-24 16:55:47 +0100306 public void setSse (EventOutput eventOutput) {
307 this.evOut = eventOutput;
Akrond0b1cfe2020-11-20 19:26:52 +0100308 };
Akron35881012020-11-24 20:05:06 +0100309
Akronaf145eb2020-11-24 16:55:47 +0100310
Akronb329d272020-11-13 12:45:26 +0100311 /**
Akron984fe8f2020-11-25 15:21:37 +0100312 * Force the creation of a file, even when only
313 * a few matches are requested.
Akron35881012020-11-24 20:05:06 +0100314 */
315 public void forceFile () {
316
317 // Open file if not already opened
318 if (this.file == null) {
319
320 try {
321
322 File dir = getFileDirectory();
323
324 // Create temporary file
325 this.file = File.createTempFile(
326 "idsexp-", "." + this.getSuffix(),
327 dir
328 );
329
Akron35881012020-11-24 20:05:06 +0100330 String s = null;
331
Akron984fe8f2020-11-25 15:21:37 +0100332 // Take temporary data from the in-memory writer
Akron35881012020-11-24 20:05:06 +0100333 if (writer != null)
334 s = writer.toString();
335
Akron984fe8f2020-11-25 15:21:37 +0100336 // Establish persistant writer
Akron35881012020-11-24 20:05:06 +0100337 writer = new BufferedWriter(new FileWriter(this.file, true));
338
Akron984fe8f2020-11-25 15:21:37 +0100339 // Add in-memory string
Akron35881012020-11-24 20:05:06 +0100340 if (s != null)
341 writer.write(s);
342
343 }
Akron35881012020-11-24 20:05:06 +0100344
Akron984fe8f2020-11-25 15:21:37 +0100345 // If data can't be stored on disk, the writer will
346 // rely on in-memory data, which may or may not work in
347 // different contexts.
348 catch (IOException e) {
Akron35881012020-11-24 20:05:06 +0100349 return;
350 };
351 };
Akron984fe8f2020-11-25 15:21:37 +0100352 };
Akron35881012020-11-24 20:05:06 +0100353
Akron984fe8f2020-11-25 15:21:37 +0100354
Akron35881012020-11-24 20:05:06 +0100355 /**
Akron984fe8f2020-11-25 15:21:37 +0100356 * Parse initial JSON file to get header information
357 * and initial matches.
Akronb329d272020-11-13 12:45:26 +0100358 */
Akron62d90a32020-11-18 20:45:38 +0100359 public boolean init (String resp) throws IOException, JsonParseException {
Akron1d36eb52020-11-13 17:52:26 +0100360
Akronc51327b2020-11-13 15:21:26 +0100361 if (resp == null)
Akron62d90a32020-11-18 20:45:38 +0100362 return false;
363
Akronb329d272020-11-13 12:45:26 +0100364 JsonParser parser = mapper.getFactory().createParser(resp);
Akron984fe8f2020-11-25 15:21:37 +0100365 JsonNode root = mapper.readTree(parser);
Akron62d90a32020-11-18 20:45:38 +0100366
Akron984fe8f2020-11-25 15:21:37 +0100367 if (root == null)
Akron62d90a32020-11-18 20:45:38 +0100368 return false;
Akronb329d272020-11-13 12:45:26 +0100369
Akron984fe8f2020-11-25 15:21:37 +0100370 JsonNode meta = root.get("meta");
Akronc1c18242020-11-18 18:24:12 +0100371 this.setMeta(meta);
Akron984fe8f2020-11-25 15:21:37 +0100372 this.setQuery(root.get("query"));
373 this.setCollection(root.get("collection"));
Akronb329d272020-11-13 12:45:26 +0100374
Akronc1c18242020-11-18 18:24:12 +0100375 if (meta != null) {
376 if (meta.has("totalResults")) {
377 this.totalResults = meta.get("totalResults").asInt();
378 if (meta.has("timeExceeded")) {
379 this.timeExceeded = meta.get("timeExceeded").asBoolean();
380 };
381 };
382 };
383
Akronba3ea112020-11-24 22:40:18 +0100384 // In case the writer is already set (e.g. forceFile() was issued),
385 // write in the header
386 if (writer == null) {
387 this.file = null;
388 writer = new StringWriter();
389 };
Akronb329d272020-11-13 12:45:26 +0100390
Akron984fe8f2020-11-25 15:21:37 +0100391 // Write header to exporter
Akron1d36eb52020-11-13 17:52:26 +0100392 this.writeHeader(writer);
Akronb329d272020-11-13 12:45:26 +0100393
Akron984fe8f2020-11-25 15:21:37 +0100394 // Go on by iterating through matches
395 return this.iterateThroughMatches(root.get("matches"));
Akronb329d272020-11-13 12:45:26 +0100396 };
397
Akron984fe8f2020-11-25 15:21:37 +0100398
399 /**
400 * Finalize the export stream.
401 */
402 public Exporter finish() throws IOException {
403 this.writeFooter(this.writer);
404 this.writer.close();
405 return (Exporter) this;
406 };
Akronc51327b2020-11-13 15:21:26 +0100407
Akron984fe8f2020-11-25 15:21:37 +0100408
Akronb329d272020-11-13 12:45:26 +0100409 /**
410 * Append more matches to the result set.
411 */
Akron62d90a32020-11-18 20:45:38 +0100412 public boolean appendMatches (String resp) throws IOException {
Akronb329d272020-11-13 12:45:26 +0100413
Akron35881012020-11-24 20:05:06 +0100414 // Demand creation of a file
415 this.forceFile();
Akronb329d272020-11-13 12:45:26 +0100416
417 JsonParser parser = mapper.getFactory().createParser(resp);
Akron984fe8f2020-11-25 15:21:37 +0100418 JsonNode root = mapper.readTree(parser);
Akronb329d272020-11-13 12:45:26 +0100419
Akron984fe8f2020-11-25 15:21:37 +0100420 if (root == null)
Akron62d90a32020-11-18 20:45:38 +0100421 return false;
Akron1d36eb52020-11-13 17:52:26 +0100422
Akron984fe8f2020-11-25 15:21:37 +0100423 return this.iterateThroughMatches(root.get("matches"));
Akronb329d272020-11-13 12:45:26 +0100424 };
425
Akron35881012020-11-24 20:05:06 +0100426
Akronb329d272020-11-13 12:45:26 +0100427 /**
Akron1d36eb52020-11-13 17:52:26 +0100428 * Serve response entity, either as a string or as a file.
Akronb329d272020-11-13 12:45:26 +0100429 */
Akron1d36eb52020-11-13 17:52:26 +0100430 public ResponseBuilder serve () {
Akronb329d272020-11-13 12:45:26 +0100431
Akron35881012020-11-24 20:05:06 +0100432 ResponseBuilder rb;
Akrone57937b2020-11-17 08:49:31 +0100433
Akron35881012020-11-24 20:05:06 +0100434 if (this.file == null) {
Akrone57937b2020-11-17 08:49:31 +0100435
Akron35881012020-11-24 20:05:06 +0100436 // Serve stream
437 rb = Response.ok(writer.toString());
Akron1d36eb52020-11-13 17:52:26 +0100438 }
Akron35881012020-11-24 20:05:06 +0100439 else if (this.file.exists()) {
Akron1d36eb52020-11-13 17:52:26 +0100440
Akron55def632020-11-26 16:00:02 +0100441 // Serve the file and delete after serving
442 final File expFile = this.file;
443 try {
444 final InputStream in = new FileInputStream(this.file);
445
446 // Remove file after output is streamed
447 StreamingOutput output = new StreamingOutput() {
448 @Override
449 public void write(OutputStream out)
450 throws IOException {
451
452 // Write file data in output stream
453 int length;
454 byte[] buffer = new byte[1024];
455 while ((length = in.read(buffer)) != -1) {
456 out.write(buffer, 0, length);
457 }
458 out.flush(); // Important!
459 in.close();
460
461 // When done, delete the file
462 expFile.delete();
463 }
464 };
465
466 // Serve file
467 rb = Response.ok(output);
468 }
469
470 catch (Exception e) {
471 // File problematic
Akrona3839bb2020-11-26 16:07:31 +0100472 return Response.status(Status.NOT_FOUND);
Akron55def632020-11-26 16:00:02 +0100473 };
Akron35881012020-11-24 20:05:06 +0100474 }
475 else {
476 // File doesn't exist
Akrona3839bb2020-11-26 16:07:31 +0100477 return Response.status(Status.NOT_FOUND);
Akronb329d272020-11-13 12:45:26 +0100478 };
Akrond0b1cfe2020-11-20 19:26:52 +0100479
Akron35881012020-11-24 20:05:06 +0100480 return rb
481 .type(this.getMimeType())
482 .header(
483 "Content-Disposition",
484 "attachment; filename=" +
485 this.getFileName() +
486 '.' +
487 this.getSuffix()
488 );
Akronb329d272020-11-13 12:45:26 +0100489 };
Akron62d90a32020-11-18 20:45:38 +0100490
Akron984fe8f2020-11-25 15:21:37 +0100491
492 /*
493 * Iterate through all matches
494 */
495 private boolean iterateThroughMatches (JsonNode mNodes)
496 throws IOException {
Akron62d90a32020-11-18 20:45:38 +0100497
Akron984fe8f2020-11-25 15:21:37 +0100498 // Send progress information
Akrond0b1cfe2020-11-20 19:26:52 +0100499 this.sendProgress();
500
Akron62d90a32020-11-18 20:45:38 +0100501 if (mNodes == null)
502 return false;
503
504 // Iterate over the results of the current file
505 Iterator<JsonNode> mNode = mNodes.elements();
506 while (mNode.hasNext()) {
Akron984fe8f2020-11-25 15:21:37 +0100507
508 // Stop if all relevant matches are fetched
Akron62d90a32020-11-18 20:45:38 +0100509 if (this.maxResults > 0 &&
Akron3524b8c2020-11-19 01:19:02 +0100510 this.fetchedResults >= this.maxResults) {
Akron62d90a32020-11-18 20:45:38 +0100511 return false;
512 };
Akron3524b8c2020-11-19 01:19:02 +0100513 this.addMatch(mNode.next(), writer);
514 this.fetchedResults++;
Akron62d90a32020-11-18 20:45:38 +0100515 };
516 return true;
517 };
Akron984fe8f2020-11-25 15:21:37 +0100518
519
520 /*
521 * Get the directory where all temporary files are stored.
522 */
523 private File getFileDirectory () {
524
525 String fileDir = prop.getProperty(
526 "conf.file_dir",
527 System.getProperty("java.io.tmpdir")
528 );
529
530 File dir = new File(fileDir);
531
532 // Create directory if not yet existing
533 if (!dir.exists()) {
534 dir.mkdir();
535 }
536
537 // Directory is unwritable - fallback
538 else if (!dir.canWrite()) {
539 fileDir = System.getProperty("java.io.tmpdir");
540 System.err.println("Unable to write to directory");
541 System.err.println("Fallback to " + fileDir);
542 dir = new File(fileDir);
543 };
544 return dir;
545 };
546
547
548 /*
549 * Send a single progress event to the event stream.
550 */
551 private void sendProgress () {
552
553 if (this.evOut == null || this.maxResults == 0)
554 return;
555
556 if (this.evOut.isClosed())
557 return;
558
559 int calc = (int) Math.ceil(((double) this.fetchedResults / this.maxResults) * 100);
560
561 final OutboundEvent.Builder eventBuilder = new OutboundEvent.Builder();
562 eventBuilder.name("Progress");
563 eventBuilder.data(String.valueOf(calc));
564
565 try {
566 this.evOut.write(eventBuilder.build());
567 }
568 catch (IOException e) {
569 return;
570 };
571 };
Akronb329d272020-11-13 12:45:26 +0100572};