Allow also json/json.gz zips as Krill-Indexer input files
Unpacking archives should be avoided where ever possible.
Change-Id: I139e9ebed3ecab858df5cdd9b84e70ef947111f6
diff --git a/src/main/java/de/ids_mannheim/korap/index/Indexer.java b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
index 6db23cb..05b0261 100644
--- a/src/main/java/de/ids_mannheim/korap/index/Indexer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
@@ -4,10 +4,14 @@
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
+import java.io.InputStream;
import java.nio.file.Paths;
+import java.util.Enumeration;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
@@ -34,7 +38,9 @@
* this tool may be more suitable for your needs
* (especially as it is way faster).
* <br><br>
- * Input directories should contain files in the json.gz format. Files
+ * Input can be directories containing files in the json.gz format, or
+ * zip files containing .json or .json.gz files. The indexer automatically
+ * detects whether each input path is a directory or zip file. Files
* of other formats will be skipped or not indexed. The output
* directory can be specified in the config file. See
* src/main/resources/krill.properties.info to create a config file.
@@ -42,11 +48,16 @@
* <pre>
* Usage:
*
- * java -jar Krill-Indexer.jar -c [propfile] -i [input directories] -o
+ * java -jar Krill-Indexer.jar -c [propfile] -i [input paths] -o
* [output directory]
*
- * java -jar Krill-Indexer.jar --config [propfile] --input [input
- * directories] --output [output directory]
+ * java -jar Krill-Indexer.jar --config [propfile] --input [input paths]
+ * --output [output directory]
+ *
+ * Input paths can be:
+ * - Directories containing .json.gz files
+ * - Zip files containing .json or .json.gz files
+ * - Mix of both, separated by semicolons
* </pre>
*
*
@@ -61,6 +72,7 @@
private static String path = null;
private static boolean addInsteadOfUpsert = false;
private Pattern jsonFilePattern;
+ private Pattern plainJsonFilePattern;
// Init logger
private final static Logger log = LoggerFactory.getLogger(Indexer.class);
@@ -90,6 +102,7 @@
this.commitCount = Integer.parseInt(commitCount);
jsonFilePattern = Pattern.compile(".*\\.json\\.gz$");
+ plainJsonFilePattern = Pattern.compile(".*\\.json$");
}
@@ -150,6 +163,77 @@
/**
+ * Parse a zip file for document files.
+ *
+ * @param zipFile
+ * The {@link File} zip file containing
+ * JSON documents (plain .json or gzipped .json.gz) to index.
+ */
+ private void parseZip (File zipFile) {
+ try (ZipFile zip = new ZipFile(zipFile)) {
+ Enumeration<? extends ZipEntry> entries = zip.entries();
+
+ while (entries.hasMoreElements()) {
+ ZipEntry entry = entries.nextElement();
+
+ // Skip directories
+ if (entry.isDirectory()) {
+ continue;
+ }
+
+ String entryName = entry.getName();
+ Matcher gzipMatcher = jsonFilePattern.matcher(entryName);
+ Matcher plainMatcher = plainJsonFilePattern.matcher(entryName);
+
+ boolean isGzipped = gzipMatcher.find();
+ boolean isPlainJson = plainMatcher.find();
+
+ if (isGzipped || isPlainJson) {
+ try (InputStream entryStream = zip.getInputStream(entry)) {
+ if (addInsteadOfUpsert) {
+ log.info("{} Add {} from zip {} to the index. ",
+ this.count, entryName, zipFile.getName());
+ if (this.index.addDoc(entryStream, isGzipped) == null) {
+ log.warn("fail.");
+ continue;
+ }
+ }
+ else {
+ log.info("{} Add or update {} from zip {} to the index. ",
+ this.count, entryName, zipFile.getName());
+ if (this.index.upsertDoc(entryStream, isGzipped) == null) {
+ log.warn("fail.");
+ continue;
+ }
+ }
+
+ this.count++;
+ if (DEBUG) {
+ log.debug("Finished adding files. (" + count + ").");
+ }
+
+ // Commit in case the commit count is reached
+ if ((this.count % this.commitCount) == 0) {
+ this.commit();
+ }
+ }
+ catch (IOException e) {
+ log.error("Error reading entry " + entryName + " from zip file " + zipFile.getName(), e);
+ }
+ }
+ else {
+ log.warn("Skip " + entryName + " from zip " + zipFile.getName()
+ + " since it does not have .json or .json.gz format.");
+ }
+ }
+ }
+ catch (IOException e) {
+ log.error("Error reading zip file " + zipFile.getName(), e);
+ }
+ }
+
+
+ /**
* Commit changes to the index.
*/
private void commit () {
@@ -185,10 +269,11 @@
+ KrillProperties.DEFAULT_PROPERTIES_LOCATION
+ ").")
.hasArg().argName("properties file").required().build());
- options.addOption(Option.builder("i").longOpt("inputDir")
- .desc("input directories separated by semicolons. The input files "
- + "have to be in <filename>.json.gz format. ")
- .hasArgs().argName("input directories").required()
+ options.addOption(Option.builder("i").longOpt("input")
+ .desc("input paths separated by semicolons. Can be directories containing "
+ + "<filename>.json.gz files, or zip files containing .json or .json.gz files. "
+ + "The indexer will automatically detect the type.")
+ .hasArgs().argName("input paths").required()
.valueSeparator(Character.valueOf(';')).build());
options.addOption(Option.builder("o").longOpt("outputDir")
.desc("index output directory (defaults to "
@@ -201,14 +286,15 @@
CommandLineParser parser = new DefaultParser();
String propFile = null;
- String[] inputDirectories = null;
+ String[] inputPaths = null;
try {
CommandLine cmd = parser.parse(options, argv);
log.info("Configuration file: " + cmd.getOptionValue("c"));
propFile = cmd.getOptionValue("c");
- log.info("Input directories: "
+
+ log.info("Input paths: "
+ StringUtils.join(cmd.getOptionValues("i"), ";"));
- inputDirectories = cmd.getOptionValues("i");
+ inputPaths = cmd.getOptionValues("i");
if (cmd.hasOption("o")) {
log.info("Output directory: " + cmd.getOptionValue("o"));
@@ -222,7 +308,7 @@
catch (MissingOptionException e) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(
- "Krill indexer\n java -jar -c <properties file> -i <input directories> "
+ "Krill indexer\n java -jar -c <properties file> -i <input paths> "
+ "[-o <output directory> -a]",
options);
return;
@@ -245,12 +331,21 @@
indexer.index.setMaxStringLength(KrillProperties.maxTextSize);
}
- // Iterate over list of directories
- for (String arg : inputDirectories) {
- log.info("Indexing files in " + arg);
+ // Iterate over list of input paths (auto-detect directories vs zip files)
+ for (String arg : inputPaths) {
File f = new File(arg);
- if (f.isDirectory())
+
+ if (f.isDirectory()) {
+ log.info("Indexing files in directory " + arg);
indexer.parse(f);
+ }
+ else if (f.isFile() && f.getName().toLowerCase().endsWith(".zip")) {
+ log.info("Indexing files in zip " + arg);
+ indexer.parseZip(f);
+ }
+ else {
+ log.warn("Skipping " + arg + " - not a valid directory or zip file");
+ }
}
indexer.closeIndex();
diff --git a/src/test/java/de/ids_mannheim/korap/TestIndexer.java b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
index b2a2792..920a060 100644
--- a/src/test/java/de/ids_mannheim/korap/TestIndexer.java
+++ b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
@@ -1,187 +1,252 @@
-package de.ids_mannheim.korap;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.PrintStream;
-
-import org.junit.After;
-import org.junit.AfterClass;
-import org.junit.Before;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import de.ids_mannheim.korap.index.Indexer;
-
-/**
- * @author margaretha
- *
- */
-public class TestIndexer {
- private Logger logger = LoggerFactory.getLogger(TestIndexer.class);
- private final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
- private String info = "usage: Krill indexer";
- private static File outputDirectory = new File("test-index");
- private static File outputDirectory2 = new File("test-index2");
- private static File outputDirectory3 = new File("test-output");
- private static File outputDirectory4 = new File("test-output-1");
-
- @Test
- public void testArguments () throws IOException {
- Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
- "-i", "src/test/resources/bzk"});
- assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
- }
-
- @Test
- public void testOutputArgument () throws IOException {
- Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
- "-i", "src/test/resources/bzk", "-o", "test-output"});
- assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
- }
-
- @Test
- public void testMultipleInputFiles () throws IOException {
- Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
- "-i", "src/test/resources/wiki"});
- assertTrue(outputStream.toString().startsWith("Added or updated 19 files."));
- }
-
-
- @Test
- public void testAdding () throws IOException {
- Indexer.main(new String[] {
- "-c", "src/test/resources/krill.properties",
- "-i", "src/test/resources/bzk",
- "-a"});
- logger.info(outputStream.toString());
- assertTrue(outputStream.toString().startsWith("Added 1 file."));
- }
-
-
- @Test
- public void testMultipleInputDirectories () throws IOException {
- Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
- "-i",
- "src/test/resources/bzk;src/test/resources/goe;src/test/resources/sgbr",
- "-o", "test-index"});
- assertTrue(outputStream.toString().startsWith("Added or updated 5 files."));
- }
-
- @Test
- public void testEmptyArgument () throws IOException {
- Indexer.main(new String[] {});
- logger.info(outputStream.toString());
- assertEquals(true, outputStream.toString().startsWith(info));
- }
-
-
- @Test
- public void testMissingConfig () throws IOException {
- Indexer.main(new String[] { "-i", "src/test/resources/bzk",
- "-o test-index"});
- logger.info(outputStream.toString());
- assertEquals(true, outputStream.toString().startsWith(info));
- }
-
- @Test
- public void testMissingInput () throws IOException {
- Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
- "-o", "test-index"});
- logger.info(outputStream.toString());
- assertEquals(true, outputStream.toString().startsWith(info));
- }
-
- @Test
- public void testUnicodeProblem () throws IOException {
- Indexer.main(new String[] {
- "-c", "src/test/resources/krill.properties",
- "-i", "src/test/resources/bug",
- "-o", "test-index2"
- });
- logger.info(outputStream.toString());
- assertTrue(outputStream.toString().startsWith("Added 1 file."));
- }
-
- @Test
- public void testMaxTextSize () throws IOException {
- // Create a temporary properties file with the max text size setting
- File tempPropertiesFile = File.createTempFile("krill", ".properties");
- FileWriter writer = new FileWriter(tempPropertiesFile);
- writer.write("krill.version = ${project.version}\n");
- writer.write("krill.name = ${project.name}\n");
- writer.write("krill.indexDir = test-output\n");
- writer.write("krill.index.textSize.max = 25000000\n");
- writer.close();
-
- Indexer.main(new String[] { "-c", tempPropertiesFile.getAbsolutePath(),
- "-i", "src/test/resources/bzk", "-o", "test-output-1"});
- assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
-
- tempPropertiesFile.delete();
- }
-
- @Before
- public void setOutputStream () {
- System.setOut(new PrintStream(outputStream));
- }
-
- @After
- public void cleanOutputStream () {
- System.setOut(null);
- }
-
- @AfterClass
- public static void cleanup() {
- if (outputDirectory.exists()) {
- deleteFile(outputDirectory);
- }
- if (outputDirectory2.exists()) {
- deleteFile(outputDirectory2);
- }
- if (outputDirectory3.exists()) {
- deleteFile(outputDirectory3);
- }
- if (outputDirectory4.exists()) {
- deleteFile(outputDirectory4);
- }
- }
-
-
- @Before
- public void cleanOutputDirectory () {
-
- if (outputDirectory.exists()) {
- logger.debug("Output directory exists");
- deleteFile(outputDirectory);
- }
- if (outputDirectory2.exists()) {
- logger.debug("Output directory 2 exists");
- deleteFile(outputDirectory2);
- }
- if (outputDirectory3.exists()) {
- logger.debug("Output directory 3 exists");
- deleteFile(outputDirectory3);
- }
- if (outputDirectory4.exists()) {
- logger.debug("Output directory 4 exists");
- deleteFile(outputDirectory4);
- }
- }
-
- private static void deleteFile (File path) {
- if (path.isDirectory()) {
- File file;
- for (String filename : path.list()) {
- file = new File(path + "/" + filename);
- deleteFile(file);
- }
- }
- path.delete();
- }
-}
+package de.ids_mannheim.korap;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintStream;
+
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import de.ids_mannheim.korap.index.Indexer;
+
+/**
+ * @author margaretha
+ *
+ */
+public class TestIndexer {
+ private Logger logger = LoggerFactory.getLogger(TestIndexer.class);
+ private final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+ private String info = "usage: Krill indexer";
+ private static File outputDirectory = new File("test-index");
+ private static File outputDirectory2 = new File("test-index2");
+ private static File outputDirectory3 = new File("test-output");
+ private static File outputDirectory4 = new File("test-output-1");
+ private static File zipIndexDirectory = new File("test-zip-index");
+ private static File zipIndexAddDirectory = new File("test-zip-index-add");
+ private static File mixedIndexDirectory = new File("test-mixed-index");
+ private static File multipleZipIndexDirectory = new File("test-multiple-zip-index");
+ private static File invalidZipIndexDirectory = new File("test-invalid-zip-index");
+ private static File mixedValidInvalidIndexDirectory = new File("test-mixed-valid-invalid-index");
+ private static File mixedContentZipIndexDirectory = new File("test-mixed-content-zip-index");
+
+ @Test
+ public void testArguments () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/bzk"});
+ assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
+ }
+
+ @Test
+ public void testOutputArgument () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/bzk", "-o", "test-output"});
+ assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
+ }
+
+ @Test
+ public void testMultipleInputFiles () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/wiki"});
+ assertTrue(outputStream.toString().startsWith("Added or updated 19 files."));
+ }
+
+
+ @Test
+ public void testAdding () throws IOException {
+ Indexer.main(new String[] {
+ "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/bzk",
+ "-a"});
+ logger.info(outputStream.toString());
+ assertTrue(outputStream.toString().startsWith("Added 1 file."));
+ }
+
+
+ @Test
+ public void testMultipleInputDirectories () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i",
+ "src/test/resources/bzk;src/test/resources/goe;src/test/resources/sgbr",
+ "-o", "test-index"});
+ assertTrue(outputStream.toString().startsWith("Added or updated 5 files."));
+ }
+
+ @Test
+ public void testEmptyArgument () throws IOException {
+ Indexer.main(new String[] {});
+ logger.info(outputStream.toString());
+ assertEquals(true, outputStream.toString().startsWith(info));
+ }
+
+
+ @Test
+ public void testMissingConfig () throws IOException {
+ Indexer.main(new String[] { "-i", "src/test/resources/bzk",
+ "-o test-index"});
+ logger.info(outputStream.toString());
+ assertEquals(true, outputStream.toString().startsWith(info));
+ }
+
+ @Test
+ public void testMissingInput () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-o", "test-index"});
+ logger.info(outputStream.toString());
+ assertEquals(true, outputStream.toString().startsWith(info));
+ }
+
+ @Test
+ public void testUnicodeProblem () throws IOException {
+ Indexer.main(new String[] {
+ "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/bug",
+ "-o", "test-index2"
+ });
+ logger.info(outputStream.toString());
+ assertTrue(outputStream.toString().startsWith("Added 1 file."));
+ }
+
+ @Test
+ public void testMaxTextSize () throws IOException {
+ // Create a temporary properties file with the max text size setting
+ File tempPropertiesFile = File.createTempFile("krill", ".properties");
+ FileWriter writer = new FileWriter(tempPropertiesFile);
+ writer.write("krill.version = ${project.version}\n");
+ writer.write("krill.name = ${project.name}\n");
+ writer.write("krill.indexDir = test-output\n");
+ writer.write("krill.index.textSize.max = 25000000\n");
+ writer.close();
+
+ Indexer.main(new String[] { "-c", tempPropertiesFile.getAbsolutePath(),
+ "-i", "src/test/resources/bzk", "-o", "test-output-1"});
+ assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
+
+ tempPropertiesFile.delete();
+ }
+
+ @Test
+ public void testZipFileInput () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/rei/rei_sample_krill.zip",
+ "-o", "test-zip-index"});
+ assertTrue(outputStream.toString().startsWith("Added or updated 3 files."));
+ }
+
+ @Test
+ public void testZipFileWithAdding () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/rei/rei_sample_krill.zip",
+ "-o", "test-zip-index-add",
+ "-a"});
+ assertTrue(outputStream.toString().startsWith("Added 3 files."));
+ }
+
+ @Test
+ public void testMixedDirectoryAndZipInput () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/bzk;src/test/resources/rei/rei_sample_krill.zip",
+ "-o", "test-mixed-index"});
+ assertTrue(outputStream.toString().startsWith("Added or updated 4 files."));
+ }
+
+ @Test
+ public void testMultipleZipFiles () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/rei/rei_sample_krill.zip;src/test/resources/rei/rei_sample_krill.zip",
+ "-o", "test-multiple-zip-index"});
+ // Should process 6 files total (3 from each zip)
+ assertTrue(outputStream.toString().startsWith("Added or updated 6 files."));
+ }
+
+ @Test
+ public void testInvalidZipFile () throws IOException {
+ // Test with a non-existent zip file
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/nonexistent.zip",
+ "-o", "test-invalid-zip-index"});
+ // Should handle gracefully and process 0 files
+ assertTrue(outputStream.toString().startsWith("Added or updated 0 file"));
+ }
+
+ @Test
+ public void testMixedValidAndInvalidInputs () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/bzk;src/test/resources/nonexistent.zip;src/test/resources/rei/rei_sample_krill.zip",
+ "-o", "test-mixed-valid-invalid-index"});
+ // Should process files from valid inputs only (1 from bzk + 3 from zip = 4 files)
+ assertTrue(outputStream.toString().startsWith("Added"));
+ }
+
+ @Test
+ public void testMixedContentZipFile () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/rei/mixed_test.zip",
+ "-o", "test-mixed-content-zip-index"});
+ // Should process 2 JSON files (1 plain + 1 gzipped) and skip the .txt file
+ assertTrue(outputStream.toString().startsWith("Added"));
+ }
+
+ @Before
+ public void setOutputStream () {
+ System.setOut(new PrintStream(outputStream));
+ }
+
+ @After
+ public void cleanOutputStream () {
+ System.setOut(null);
+ }
+
+ @AfterClass
+ public static void cleanup() {
+ File[] directories = {
+ outputDirectory, outputDirectory2, outputDirectory3, outputDirectory4,
+ zipIndexDirectory, zipIndexAddDirectory, mixedIndexDirectory,
+ multipleZipIndexDirectory, invalidZipIndexDirectory, mixedValidInvalidIndexDirectory,
+ mixedContentZipIndexDirectory
+ };
+
+ for (File dir : directories) {
+ if (dir.exists()) {
+ deleteFile(dir);
+ }
+ }
+ }
+
+
+ @Before
+ public void cleanOutputDirectory () {
+ File[] directories = {
+ outputDirectory, outputDirectory2, outputDirectory3, outputDirectory4,
+ zipIndexDirectory, zipIndexAddDirectory, mixedIndexDirectory,
+ multipleZipIndexDirectory, invalidZipIndexDirectory, mixedValidInvalidIndexDirectory,
+ mixedContentZipIndexDirectory
+ };
+
+ for (File dir : directories) {
+ if (dir.exists()) {
+ logger.debug("Output directory " + dir.getName() + " exists");
+ deleteFile(dir);
+ }
+ }
+ }
+
+ private static void deleteFile (File path) {
+ if (path.isDirectory()) {
+ File file;
+ for (String filename : path.list()) {
+ file = new File(path + "/" + filename);
+ deleteFile(file);
+ }
+ }
+ path.delete();
+ }
+}
diff --git a/src/test/resources/rei/mixed_test.zip b/src/test/resources/rei/mixed_test.zip
new file mode 100644
index 0000000..c27dd5e
--- /dev/null
+++ b/src/test/resources/rei/mixed_test.zip
Binary files differ
diff --git a/src/test/resources/rei/rei_sample_krill.zip b/src/test/resources/rei/rei_sample_krill.zip
new file mode 100644
index 0000000..01f26fa
--- /dev/null
+++ b/src/test/resources/rei/rei_sample_krill.zip
Binary files differ