blob: 7bf7ccb7a279d65bfa3af94496874eb99f61ac52 [file] [log] [blame]
margarethac8588902024-06-13 09:36:53 +02001package de.ids_mannheim.korap.init;
margaretha1b320452018-08-02 16:56:25 +02002
3import java.io.File;
margaretha6ad08b42018-08-22 18:33:54 +02004import java.io.FileInputStream;
margaretha1b320452018-08-02 16:56:25 +02005import java.io.IOException;
margaretha9e53bb22018-09-14 19:39:15 +02006import java.io.InputStream;
margarethae1228ab2021-02-22 11:51:38 +01007import java.util.Arrays;
margaretha6ad08b42018-08-22 18:33:54 +02008import java.util.zip.GZIPInputStream;
margaretha1b320452018-08-02 16:56:25 +02009
10import org.apache.commons.io.FileUtils;
margaretha9e53bb22018-09-14 19:39:15 +020011import org.apache.commons.io.IOUtils;
margaretha6ad08b42018-08-22 18:33:54 +020012import org.apache.commons.io.output.ByteArrayOutputStream;
margaretha1b320452018-08-02 16:56:25 +020013import org.apache.logging.log4j.LogManager;
14import org.apache.logging.log4j.Logger;
15import org.springframework.beans.factory.annotation.Autowired;
16import org.springframework.stereotype.Component;
17
margaretha7c1f4282021-11-29 17:27:53 +010018import de.ids_mannheim.korap.cache.VirtualCorpusCache;
margarethac8588902024-06-13 09:36:53 +020019import de.ids_mannheim.korap.config.FullConfiguration;
Akronda080152020-12-03 13:53:29 +010020import de.ids_mannheim.korap.constant.QueryType;
21import de.ids_mannheim.korap.constant.ResourceType;
margaretha05818b42024-04-10 13:10:02 +020022import de.ids_mannheim.korap.entity.QueryDO;
margaretha563aabe2018-09-13 20:39:45 +020023import de.ids_mannheim.korap.exceptions.KustvaktException;
margaretha7c1f4282021-11-29 17:27:53 +010024import de.ids_mannheim.korap.exceptions.StatusCodes;
margarethab097fb02021-02-22 19:28:33 +010025import de.ids_mannheim.korap.service.QueryService;
margarethac7196d22018-08-27 14:20:03 +020026import de.ids_mannheim.korap.util.QueryException;
margaretha1b320452018-08-02 16:56:25 +020027import de.ids_mannheim.korap.web.SearchKrill;
28
margaretha7c1f4282021-11-29 17:27:53 +010029/**
30 * <p>Loads predefined virtual corpora at server start up and caches
31 * them, if the VC have not been cached before. If there are changes
32 * in the index, the cache will be updated.
33 * </p>
34 *
35 * <p>
36 * All predefined VC are set as SYSTEM VC. The filenames are used as
37 * VC names. Acceptable file extensions are .jsonld.gz or .jsonld. The
margaretha35e1ca22023-11-16 22:00:01 +010038 * VC should be located at the folder indicated by
39 * <em>krill.namedVC</em>
margaretha7c1f4282021-11-29 17:27:53 +010040 * specified in kustvakt.conf.
41 * </p>
margaretha398f4722019-01-09 19:07:20 +010042 *
43 * @author margaretha
44 *
45 */
margaretha1b320452018-08-02 16:56:25 +020046@Component
margaretha35e1ca22023-11-16 22:00:01 +010047public class NamedVCLoader implements Runnable {
margaretha1b320452018-08-02 16:56:25 +020048 @Autowired
49 private FullConfiguration config;
50 @Autowired
51 private SearchKrill searchKrill;
margaretha563aabe2018-09-13 20:39:45 +020052 @Autowired
margarethab097fb02021-02-22 19:28:33 +010053 private QueryService vcService;
margaretha9e53bb22018-09-14 19:39:15 +020054
margarethadda4ef72018-12-06 14:20:51 +010055 public static Logger jlog = LogManager.getLogger(NamedVCLoader.class);
56 public static boolean DEBUG = false;
margaretha1b320452018-08-02 16:56:25 +020057
margaretha0b903912019-01-08 17:41:39 +010058 @Override
59 public void run () {
60 try {
61 loadVCToCache();
62 }
margaretha7c1f4282021-11-29 17:27:53 +010063 catch (IOException | QueryException e) {
margaretha35e1ca22023-11-16 22:00:01 +010064 // e.printStackTrace();
margaretha4b883ec2023-09-25 12:21:24 +020065 throw new RuntimeException(e.getMessage(), e.getCause());
margaretha0b903912019-01-08 17:41:39 +010066 }
67 }
margaretha35e1ca22023-11-16 22:00:01 +010068
margaretha05818b42024-04-10 13:10:02 +020069 public void loadVCToCache (String filename, String filePath)
70 throws IOException, QueryException, KustvaktException {
71 loadVCToCache(filename,filePath,null);
72 }
73
margaretha35e1ca22023-11-16 22:00:01 +010074 /**
75 * Used for testing
margaretha7c1f4282021-11-29 17:27:53 +010076 *
77 * @param filename
78 * @param filePath
79 * @throws IOException
80 * @throws QueryException
81 * @throws KustvaktException
82 */
margaretha05818b42024-04-10 13:10:02 +020083 public void loadVCToCache (String filename, String filePath, String json)
84 throws IOException, QueryException {
margaretha9e53bb22018-09-14 19:39:15 +020085
margaretha05818b42024-04-10 13:10:02 +020086 if (json==null || json.isEmpty()) {
87 InputStream is = NamedVCLoader.class.getResourceAsStream(filePath);
88 json = IOUtils.toString(is, "utf-8");
margaretha9e53bb22018-09-14 19:39:15 +020089 }
margaretha05818b42024-04-10 13:10:02 +020090 processVC(filename, json);
margaretha9e53bb22018-09-14 19:39:15 +020091 }
92
margaretha35e1ca22023-11-16 22:00:01 +010093 public void loadVCToCache () throws IOException, QueryException {
margaretha1b320452018-08-02 16:56:25 +020094
95 String dir = config.getNamedVCPath();
margaretha35e1ca22023-11-16 22:00:01 +010096 if (dir.isEmpty())
97 return;
margarethae72355a2018-11-28 16:53:09 +010098
margaretha1b320452018-08-02 16:56:25 +020099 File d = new File(dir);
100 if (!d.isDirectory()) {
101 throw new IOException("Directory " + dir + " is not valid");
102 }
103
margarethae1228ab2021-02-22 11:51:38 +0100104 jlog.info(Arrays.toString(d.list()));
margaretha35e1ca22023-11-16 22:00:01 +0100105
margaretha1b320452018-08-02 16:56:25 +0200106 for (File file : d.listFiles()) {
107 if (!file.exists()) {
108 throw new IOException("File " + file + " is not found.");
109 }
margaretha1b320452018-08-02 16:56:25 +0200110
margaretha6ad08b42018-08-22 18:33:54 +0200111 String filename = file.getName();
margaretha339fd2e2018-11-13 12:14:53 +0100112 String[] strArr = readFile(file, filename);
113 filename = strArr[0];
114 String json = strArr[1];
margaretha9e53bb22018-09-14 19:39:15 +0200115 if (json != null) {
margaretha05818b42024-04-10 13:10:02 +0200116 processVC(filename, json);
margaretha6ad08b42018-08-22 18:33:54 +0200117 }
margaretha1b320452018-08-02 16:56:25 +0200118 }
119 }
margaretha05818b42024-04-10 13:10:02 +0200120
121 /**
122 * Stores and caches VC if the given VC does not exist.
123 * Updates VC in the database and re-caches it, if the given VC exists.
124 * Updates VC if there is any change in the index.
125 *
126 * In this method, it will be checked if
127 * <ol>
128 * <li> VC exists in the database</li>
129 * <li> VC exists in the cache </li>
130 * <li> KoralQuery of the given VC differs from an existing VC with
131 * the same id. </li>
132 * <li> Index has been changed</li>
133 * </ol>
134 *
135 * Koral Query
136 *
137 * @param vcId
138 * @param json
139 * @throws IOException
140 * @throws QueryException
141 */
142 private void processVC (String vcId, String json)
143 throws IOException, QueryException {
144 boolean updateCache = false;
145 try {
146 // if VC exists in the DB
147 QueryDO existingVC = vcService.searchQueryByName("system", vcId, "system",
148 QueryType.VIRTUAL_CORPUS);
149
150 String koralQuery = existingVC.getKoralQuery();
151 // if existing VC is different from input
152 if (json.hashCode() != koralQuery.hashCode()) {
153 updateCache = true;
154 // updateVCinDB
155 storeVCinDB(vcId, json, existingVC);
156 }
157 }
158 catch (KustvaktException e) {
159 // VC doesn't exist in the DB
160 if (e.getStatusCode() == StatusCodes.NO_RESOURCE_FOUND) {
161 storeVCinDB(vcId, json, null);
162 }
163 else {
164 throw new RuntimeException(e);
165 }
166 }
167
168 cacheVC(vcId, json, updateCache);
169 }
margaretha9e53bb22018-09-14 19:39:15 +0200170
margaretha35e1ca22023-11-16 22:00:01 +0100171 private String[] readFile (File file, String filename) throws IOException {
margaretha9e53bb22018-09-14 19:39:15 +0200172 String json = null;
173 long start = System.currentTimeMillis();
174 if (filename.endsWith(".jsonld")) {
175 filename = filename.substring(0, filename.length() - 7);
176 json = FileUtils.readFileToString(file, "utf-8");
177 }
178 else if (filename.endsWith(".jsonld.gz")) {
179 filename = filename.substring(0, filename.length() - 10);
margaretha35e1ca22023-11-16 22:00:01 +0100180 GZIPInputStream gzipInputStream = new GZIPInputStream(
181 new FileInputStream(file));
margaretha9e53bb22018-09-14 19:39:15 +0200182 ByteArrayOutputStream bos = new ByteArrayOutputStream(512);
183 bos.write(gzipInputStream);
184 json = bos.toString("utf-8");
185 bos.close();
186 }
187 else {
188 System.err.println("File " + filename
189 + " is not allowed. Filename must ends with .jsonld or .jsonld.gz");
190 }
191 long end = System.currentTimeMillis();
margarethadda4ef72018-12-06 14:20:51 +0100192 if (DEBUG) {
193 jlog.debug("READ " + filename + " duration: " + (end - start));
194 }
margarethae72355a2018-11-28 16:53:09 +0100195
196 return new String[] { filename, json };
margaretha9e53bb22018-09-14 19:39:15 +0200197 }
198
margaretha7c1f4282021-11-29 17:27:53 +0100199 /**
200 * Caches the given VC if the VC is not found in cache and updates
201 * the VC if it exists and there are changes in the index.
202 *
203 * @param vcId
204 * vc-name
205 * @param koralQuery
206 * @throws IOException
207 * @throws QueryException
margaretha05818b42024-04-10 13:10:02 +0200208 * @throws KustvaktException
margaretha7c1f4282021-11-29 17:27:53 +0100209 */
margaretha05818b42024-04-10 13:10:02 +0200210 private void cacheVC (String vcId, String koralQuery, boolean updateVC)
margaretha9e53bb22018-09-14 19:39:15 +0200211 throws IOException, QueryException {
margaretha7c1f4282021-11-29 17:27:53 +0100212 config.setVcInCaching(vcId);
margaretha05818b42024-04-10 13:10:02 +0200213 if (updateVC) {
214 jlog.info("Updating {} in cache ", vcId);
215 VirtualCorpusCache.delete(vcId);
216 }
217 else if (VirtualCorpusCache.contains(vcId)) {
margaretha326520b2021-12-08 17:58:09 +0100218 jlog.info("Checking {} in cache ", vcId);
margaretha05818b42024-04-10 13:10:02 +0200219
margaretha326520b2021-12-08 17:58:09 +0100220 }
221 else {
222 jlog.info("Storing {} in cache ", vcId);
223 }
margaretha35e1ca22023-11-16 22:00:01 +0100224
margaretha563aabe2018-09-13 20:39:45 +0200225 long start, end;
226 start = System.currentTimeMillis();
margarethaae1ca622024-08-02 10:43:08 +0200227 try {
228 VirtualCorpusCache.store(vcId, searchKrill.getIndex());
229 }
230 catch (Exception e) {
231 jlog.error("Failed caching vc "+vcId, e);
232 }
margaretha563aabe2018-09-13 20:39:45 +0200233 end = System.currentTimeMillis();
margaretha326520b2021-12-08 17:58:09 +0100234 jlog.info("Duration : {}", (end - start));
margaretha52ee9e32019-12-11 16:36:14 +0100235 config.setVcInCaching("");
margaretha563aabe2018-09-13 20:39:45 +0200236 }
margaretha35e1ca22023-11-16 22:00:01 +0100237
238 /**
margaretha05818b42024-04-10 13:10:02 +0200239 * Stores new VC or updates existing VC
margaretha7c1f4282021-11-29 17:27:53 +0100240 *
241 * @param vcId
242 * @param koralQuery
243 */
margaretha05818b42024-04-10 13:10:02 +0200244 private void storeVCinDB (String vcId, String koralQuery, QueryDO existingVC) {
margaretha7c1f4282021-11-29 17:27:53 +0100245 try {
margaretha05818b42024-04-10 13:10:02 +0200246 String info = (existingVC == null) ? "Storing" : "Updating";
margarethaae1ca622024-08-02 10:43:08 +0200247 jlog.info("{} {} in the database ", info, vcId);
margaretha05818b42024-04-10 13:10:02 +0200248
249 vcService.storeQuery(existingVC, "system", vcId, ResourceType.SYSTEM,
250 QueryType.VIRTUAL_CORPUS, koralQuery, null, null, null,
251 true, "system", null, null);
margaretha7c1f4282021-11-29 17:27:53 +0100252 }
margarethaae1ca622024-08-02 10:43:08 +0200253 catch (Exception e) {
254 jlog.error("Failed storing VC: "+vcId, e);
margaretha05818b42024-04-10 13:10:02 +0200255 throw new RuntimeException(e);
margaretha7c1f4282021-11-29 17:27:53 +0100256 }
margaretha7c1f4282021-11-29 17:27:53 +0100257 }
margaretha1b320452018-08-02 16:56:25 +0200258}