Allow also json/json.gz zips as Krill-Indexer input files
Unpacking archives should be avoided where ever possible.
Change-Id: I139e9ebed3ecab858df5cdd9b84e70ef947111f6
diff --git a/src/test/java/de/ids_mannheim/korap/TestIndexer.java b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
index b2a2792..920a060 100644
--- a/src/test/java/de/ids_mannheim/korap/TestIndexer.java
+++ b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
@@ -1,187 +1,252 @@
-package de.ids_mannheim.korap;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.PrintStream;
-
-import org.junit.After;
-import org.junit.AfterClass;
-import org.junit.Before;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import de.ids_mannheim.korap.index.Indexer;
-
-/**
- * @author margaretha
- *
- */
-public class TestIndexer {
- private Logger logger = LoggerFactory.getLogger(TestIndexer.class);
- private final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
- private String info = "usage: Krill indexer";
- private static File outputDirectory = new File("test-index");
- private static File outputDirectory2 = new File("test-index2");
- private static File outputDirectory3 = new File("test-output");
- private static File outputDirectory4 = new File("test-output-1");
-
- @Test
- public void testArguments () throws IOException {
- Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
- "-i", "src/test/resources/bzk"});
- assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
- }
-
- @Test
- public void testOutputArgument () throws IOException {
- Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
- "-i", "src/test/resources/bzk", "-o", "test-output"});
- assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
- }
-
- @Test
- public void testMultipleInputFiles () throws IOException {
- Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
- "-i", "src/test/resources/wiki"});
- assertTrue(outputStream.toString().startsWith("Added or updated 19 files."));
- }
-
-
- @Test
- public void testAdding () throws IOException {
- Indexer.main(new String[] {
- "-c", "src/test/resources/krill.properties",
- "-i", "src/test/resources/bzk",
- "-a"});
- logger.info(outputStream.toString());
- assertTrue(outputStream.toString().startsWith("Added 1 file."));
- }
-
-
- @Test
- public void testMultipleInputDirectories () throws IOException {
- Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
- "-i",
- "src/test/resources/bzk;src/test/resources/goe;src/test/resources/sgbr",
- "-o", "test-index"});
- assertTrue(outputStream.toString().startsWith("Added or updated 5 files."));
- }
-
- @Test
- public void testEmptyArgument () throws IOException {
- Indexer.main(new String[] {});
- logger.info(outputStream.toString());
- assertEquals(true, outputStream.toString().startsWith(info));
- }
-
-
- @Test
- public void testMissingConfig () throws IOException {
- Indexer.main(new String[] { "-i", "src/test/resources/bzk",
- "-o test-index"});
- logger.info(outputStream.toString());
- assertEquals(true, outputStream.toString().startsWith(info));
- }
-
- @Test
- public void testMissingInput () throws IOException {
- Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
- "-o", "test-index"});
- logger.info(outputStream.toString());
- assertEquals(true, outputStream.toString().startsWith(info));
- }
-
- @Test
- public void testUnicodeProblem () throws IOException {
- Indexer.main(new String[] {
- "-c", "src/test/resources/krill.properties",
- "-i", "src/test/resources/bug",
- "-o", "test-index2"
- });
- logger.info(outputStream.toString());
- assertTrue(outputStream.toString().startsWith("Added 1 file."));
- }
-
- @Test
- public void testMaxTextSize () throws IOException {
- // Create a temporary properties file with the max text size setting
- File tempPropertiesFile = File.createTempFile("krill", ".properties");
- FileWriter writer = new FileWriter(tempPropertiesFile);
- writer.write("krill.version = ${project.version}\n");
- writer.write("krill.name = ${project.name}\n");
- writer.write("krill.indexDir = test-output\n");
- writer.write("krill.index.textSize.max = 25000000\n");
- writer.close();
-
- Indexer.main(new String[] { "-c", tempPropertiesFile.getAbsolutePath(),
- "-i", "src/test/resources/bzk", "-o", "test-output-1"});
- assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
-
- tempPropertiesFile.delete();
- }
-
- @Before
- public void setOutputStream () {
- System.setOut(new PrintStream(outputStream));
- }
-
- @After
- public void cleanOutputStream () {
- System.setOut(null);
- }
-
- @AfterClass
- public static void cleanup() {
- if (outputDirectory.exists()) {
- deleteFile(outputDirectory);
- }
- if (outputDirectory2.exists()) {
- deleteFile(outputDirectory2);
- }
- if (outputDirectory3.exists()) {
- deleteFile(outputDirectory3);
- }
- if (outputDirectory4.exists()) {
- deleteFile(outputDirectory4);
- }
- }
-
-
- @Before
- public void cleanOutputDirectory () {
-
- if (outputDirectory.exists()) {
- logger.debug("Output directory exists");
- deleteFile(outputDirectory);
- }
- if (outputDirectory2.exists()) {
- logger.debug("Output directory 2 exists");
- deleteFile(outputDirectory2);
- }
- if (outputDirectory3.exists()) {
- logger.debug("Output directory 3 exists");
- deleteFile(outputDirectory3);
- }
- if (outputDirectory4.exists()) {
- logger.debug("Output directory 4 exists");
- deleteFile(outputDirectory4);
- }
- }
-
- private static void deleteFile (File path) {
- if (path.isDirectory()) {
- File file;
- for (String filename : path.list()) {
- file = new File(path + "/" + filename);
- deleteFile(file);
- }
- }
- path.delete();
- }
-}
+package de.ids_mannheim.korap;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintStream;
+
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import de.ids_mannheim.korap.index.Indexer;
+
+/**
+ * @author margaretha
+ *
+ */
+public class TestIndexer {
+ private Logger logger = LoggerFactory.getLogger(TestIndexer.class);
+ private final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+ private String info = "usage: Krill indexer";
+ private static File outputDirectory = new File("test-index");
+ private static File outputDirectory2 = new File("test-index2");
+ private static File outputDirectory3 = new File("test-output");
+ private static File outputDirectory4 = new File("test-output-1");
+ private static File zipIndexDirectory = new File("test-zip-index");
+ private static File zipIndexAddDirectory = new File("test-zip-index-add");
+ private static File mixedIndexDirectory = new File("test-mixed-index");
+ private static File multipleZipIndexDirectory = new File("test-multiple-zip-index");
+ private static File invalidZipIndexDirectory = new File("test-invalid-zip-index");
+ private static File mixedValidInvalidIndexDirectory = new File("test-mixed-valid-invalid-index");
+ private static File mixedContentZipIndexDirectory = new File("test-mixed-content-zip-index");
+
+ @Test
+ public void testArguments () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/bzk"});
+ assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
+ }
+
+ @Test
+ public void testOutputArgument () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/bzk", "-o", "test-output"});
+ assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
+ }
+
+ @Test
+ public void testMultipleInputFiles () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/wiki"});
+ assertTrue(outputStream.toString().startsWith("Added or updated 19 files."));
+ }
+
+
+ @Test
+ public void testAdding () throws IOException {
+ Indexer.main(new String[] {
+ "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/bzk",
+ "-a"});
+ logger.info(outputStream.toString());
+ assertTrue(outputStream.toString().startsWith("Added 1 file."));
+ }
+
+
+ @Test
+ public void testMultipleInputDirectories () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i",
+ "src/test/resources/bzk;src/test/resources/goe;src/test/resources/sgbr",
+ "-o", "test-index"});
+ assertTrue(outputStream.toString().startsWith("Added or updated 5 files."));
+ }
+
+ @Test
+ public void testEmptyArgument () throws IOException {
+ Indexer.main(new String[] {});
+ logger.info(outputStream.toString());
+ assertEquals(true, outputStream.toString().startsWith(info));
+ }
+
+
+ @Test
+ public void testMissingConfig () throws IOException {
+ Indexer.main(new String[] { "-i", "src/test/resources/bzk",
+ "-o test-index"});
+ logger.info(outputStream.toString());
+ assertEquals(true, outputStream.toString().startsWith(info));
+ }
+
+ @Test
+ public void testMissingInput () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-o", "test-index"});
+ logger.info(outputStream.toString());
+ assertEquals(true, outputStream.toString().startsWith(info));
+ }
+
+ @Test
+ public void testUnicodeProblem () throws IOException {
+ Indexer.main(new String[] {
+ "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/bug",
+ "-o", "test-index2"
+ });
+ logger.info(outputStream.toString());
+ assertTrue(outputStream.toString().startsWith("Added 1 file."));
+ }
+
+ @Test
+ public void testMaxTextSize () throws IOException {
+ // Create a temporary properties file with the max text size setting
+ File tempPropertiesFile = File.createTempFile("krill", ".properties");
+ FileWriter writer = new FileWriter(tempPropertiesFile);
+ writer.write("krill.version = ${project.version}\n");
+ writer.write("krill.name = ${project.name}\n");
+ writer.write("krill.indexDir = test-output\n");
+ writer.write("krill.index.textSize.max = 25000000\n");
+ writer.close();
+
+ Indexer.main(new String[] { "-c", tempPropertiesFile.getAbsolutePath(),
+ "-i", "src/test/resources/bzk", "-o", "test-output-1"});
+ assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
+
+ tempPropertiesFile.delete();
+ }
+
+ @Test
+ public void testZipFileInput () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/rei/rei_sample_krill.zip",
+ "-o", "test-zip-index"});
+ assertTrue(outputStream.toString().startsWith("Added or updated 3 files."));
+ }
+
+ @Test
+ public void testZipFileWithAdding () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/rei/rei_sample_krill.zip",
+ "-o", "test-zip-index-add",
+ "-a"});
+ assertTrue(outputStream.toString().startsWith("Added 3 files."));
+ }
+
+ @Test
+ public void testMixedDirectoryAndZipInput () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/bzk;src/test/resources/rei/rei_sample_krill.zip",
+ "-o", "test-mixed-index"});
+ assertTrue(outputStream.toString().startsWith("Added or updated 4 files."));
+ }
+
+ @Test
+ public void testMultipleZipFiles () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/rei/rei_sample_krill.zip;src/test/resources/rei/rei_sample_krill.zip",
+ "-o", "test-multiple-zip-index"});
+ // Should process 6 files total (3 from each zip)
+ assertTrue(outputStream.toString().startsWith("Added or updated 6 files."));
+ }
+
+ @Test
+ public void testInvalidZipFile () throws IOException {
+ // Test with a non-existent zip file
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/nonexistent.zip",
+ "-o", "test-invalid-zip-index"});
+ // Should handle gracefully and process 0 files
+ assertTrue(outputStream.toString().startsWith("Added or updated 0 file"));
+ }
+
+ @Test
+ public void testMixedValidAndInvalidInputs () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/bzk;src/test/resources/nonexistent.zip;src/test/resources/rei/rei_sample_krill.zip",
+ "-o", "test-mixed-valid-invalid-index"});
+ // Should process files from valid inputs only (1 from bzk + 3 from zip = 4 files)
+ assertTrue(outputStream.toString().startsWith("Added"));
+ }
+
+ @Test
+ public void testMixedContentZipFile () throws IOException {
+ Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+ "-i", "src/test/resources/rei/mixed_test.zip",
+ "-o", "test-mixed-content-zip-index"});
+ // Should process 2 JSON files (1 plain + 1 gzipped) and skip the .txt file
+ assertTrue(outputStream.toString().startsWith("Added"));
+ }
+
+ @Before
+ public void setOutputStream () {
+ System.setOut(new PrintStream(outputStream));
+ }
+
+ @After
+ public void cleanOutputStream () {
+ System.setOut(null);
+ }
+
+ @AfterClass
+ public static void cleanup() {
+ File[] directories = {
+ outputDirectory, outputDirectory2, outputDirectory3, outputDirectory4,
+ zipIndexDirectory, zipIndexAddDirectory, mixedIndexDirectory,
+ multipleZipIndexDirectory, invalidZipIndexDirectory, mixedValidInvalidIndexDirectory,
+ mixedContentZipIndexDirectory
+ };
+
+ for (File dir : directories) {
+ if (dir.exists()) {
+ deleteFile(dir);
+ }
+ }
+ }
+
+
+ @Before
+ public void cleanOutputDirectory () {
+ File[] directories = {
+ outputDirectory, outputDirectory2, outputDirectory3, outputDirectory4,
+ zipIndexDirectory, zipIndexAddDirectory, mixedIndexDirectory,
+ multipleZipIndexDirectory, invalidZipIndexDirectory, mixedValidInvalidIndexDirectory,
+ mixedContentZipIndexDirectory
+ };
+
+ for (File dir : directories) {
+ if (dir.exists()) {
+ logger.debug("Output directory " + dir.getName() + " exists");
+ deleteFile(dir);
+ }
+ }
+ }
+
+ private static void deleteFile (File path) {
+ if (path.isDirectory()) {
+ File file;
+ for (String filename : path.list()) {
+ file = new File(path + "/" + filename);
+ deleteFile(file);
+ }
+ }
+ path.delete();
+ }
+}
diff --git a/src/test/resources/rei/mixed_test.zip b/src/test/resources/rei/mixed_test.zip
new file mode 100644
index 0000000..c27dd5e
--- /dev/null
+++ b/src/test/resources/rei/mixed_test.zip
Binary files differ
diff --git a/src/test/resources/rei/rei_sample_krill.zip b/src/test/resources/rei/rei_sample_krill.zip
new file mode 100644
index 0000000..01f26fa
--- /dev/null
+++ b/src/test/resources/rei/rei_sample_krill.zip
Binary files differ