blob: 7c7acfddedd423cff1a0b6371926de44eec5c135 [file] [log] [blame]
Nils Diewald6535c522015-02-26 17:45:24 +00001package de.ids_mannheim.korap.index;
2import java.util.*;
3import java.io.*;
4import org.apache.lucene.store.MMapDirectory;
Nils Diewalda14ecd62015-02-26 21:00:20 +00005import de.ids_mannheim.korap.KrillIndex;
Nils Diewald6535c522015-02-26 17:45:24 +00006import org.slf4j.Logger;
7import org.slf4j.LoggerFactory;
8
9/**
10 * This is a runnable indexer tool for
11 * Krill. Although the preferred index method
12 * is using the standalone server system,
13 * this tool may be more suitable for your needs
14 * (especially as it is way faster).
15 *
16 * Usage: java -jar Krill-X.XX.jar [propfile] [directories]*
17 */
18public class Indexer {
Nils Diewalda14ecd62015-02-26 21:00:20 +000019 KrillIndex index;
Nils Diewald6535c522015-02-26 17:45:24 +000020 String indexDir;
21 int count;
22 int commitCount;
23
24 // Init logger
25 private final static Logger log =
Nils Diewalda14ecd62015-02-26 21:00:20 +000026 LoggerFactory.getLogger(KrillIndex.class);
Nils Diewald6535c522015-02-26 17:45:24 +000027
28
29 /**
30 * Construct a new indexer object.
31 *
32 * @param prop A {@link Properties} object with
33 * at least the following information:
Nils Diewaldd37f7e42015-02-27 21:08:22 +000034 * <tt>krill.indexDir</tt>.
Nils Diewald6535c522015-02-26 17:45:24 +000035 * @throws IOException
36 */
37 public Indexer (Properties prop) throws IOException {
Nils Diewaldd37f7e42015-02-27 21:08:22 +000038 this.indexDir = prop.getProperty("krill.indexDir");
Nils Diewald6535c522015-02-26 17:45:24 +000039
40 System.out.println("Index to " + this.indexDir);
41
42 // Default to 1000 documents till the next commit
Nils Diewaldd37f7e42015-02-27 21:08:22 +000043 String commitCount = prop.getProperty("krill.index.commit.count", "1000");
Nils Diewald6535c522015-02-26 17:45:24 +000044
45 // Create a new index object based on the directory
Nils Diewalda14ecd62015-02-26 21:00:20 +000046 this.index = new KrillIndex(new MMapDirectory(new File(indexDir)));
Nils Diewald6535c522015-02-26 17:45:24 +000047 this.count = 0;
48 this.commitCount = Integer.parseInt(commitCount);
49 };
50
51 /**
52 * Parse a directory for document files.
53 *
54 * @param dir The {@link File} directory containing
55 * documents to index.
56 */
57 public void parse (File dir) {
58 for (String file : dir.list()) {
59 if (file.matches("^[^\\.].+?\\.json\\.gz$")) {
60 String found = dir.getPath() + '/' + file;
61 System.out.print(" Index " + found + " ... ");
62
63 // Add file to the index
64 if (this.index.addDocFile(found, true) == null) {
65 System.out.println("fail.");
66 continue;
67 };
68 System.out.println("done (" + count + ").");
69 this.count++;
70
71 // Commit in case the commit count is reached
72 if ((this.count % this.commitCount) == 0)
73 this.commit();
74 };
75 };
76 };
77
78 /**
79 * Commit changes to the index.
80 */
81 public void commit () {
82 System.out.println("-----");
83 System.out.print(" Commit ... ");
84 try {
85 this.index.commit();
86 }
87 catch (IOException e) {
88 System.err.println("Unable to commit to index " + this.indexDir);
89 };
90 System.out.println("done.");
91 };
92
93 /**
94 * Main method.
95 *
96 * @param argv Argument list,
97 * expecting the properties file
98 * and a list of directories
99 * @throws IOException
100 */
101 public static void main (String[] argv) throws IOException {
102 Properties prop = new Properties();
103
104 // Needed at least 2 parameters
105 if (argv.length < 2) {
106
107 String jar = new File(Indexer.class.getProtectionDomain()
108 .getCodeSource()
109 .getLocation()
110 .getPath()).getName();
111 System.out.println("Usage: java -jar " + jar +
112 " [propfile] [directories]*");
113 return;
114 };
115
116 // Load properties
117 InputStream fr = new FileInputStream(argv[0]);
118 prop.load(fr);
119
120 // Get indexer object
121 Indexer ki = new Indexer(prop);
122
123 // Empty line
124 System.out.println();
125
126 // Iterate over list of directories
127 for (String arg : Arrays.copyOfRange(argv, 1, argv.length)) {
128 File f = new File(arg);
129 if (f.isDirectory())
130 ki.parse(f);
131 };
132
133 // Final commit
134 ki.commit();
135
136 // Finish indexing
137 System.out.println("-----");
138 System.out.println(" Indexed " + ki.count + " files.");
139 System.out.println();
140 };
141};