Also allow zip archives of json/json.gz files as Krill-Indexer input

Unpacking archives should be avoided wherever possible.

Change-Id: I139e9ebed3ecab858df5cdd9b84e70ef947111f6
diff --git a/src/test/java/de/ids_mannheim/korap/TestIndexer.java b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
index b2a2792..920a060 100644
--- a/src/test/java/de/ids_mannheim/korap/TestIndexer.java
+++ b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
@@ -1,187 +1,252 @@
-package de.ids_mannheim.korap;

-

-import static org.junit.Assert.assertEquals;

-import static org.junit.Assert.assertTrue;

-

-import java.io.ByteArrayOutputStream;

-import java.io.File;

-import java.io.FileWriter;

-import java.io.IOException;

-import java.io.PrintStream;

-

-import org.junit.After;

-import org.junit.AfterClass;

-import org.junit.Before;

-import org.junit.Test;

-import org.slf4j.Logger;

-import org.slf4j.LoggerFactory;

-

-import de.ids_mannheim.korap.index.Indexer;

-

-/**

- * @author margaretha

- *

- */

-public class TestIndexer {

-    private Logger logger = LoggerFactory.getLogger(TestIndexer.class);

-    private final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();

-    private String info = "usage: Krill indexer";

-    private static File outputDirectory = new File("test-index");

-    private static File outputDirectory2 = new File("test-index2");

-    private static File outputDirectory3 = new File("test-output");

-    private static File outputDirectory4 = new File("test-output-1");

-

-    @Test

-    public void testArguments () throws IOException {

-        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",

-                                    "-i", "src/test/resources/bzk"});

-        assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));

-    }

-

-    @Test

-    public void testOutputArgument () throws IOException {

-        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",

-                                    "-i", "src/test/resources/bzk", "-o", "test-output"});

-        assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));

-    }

-

-    @Test

-    public void testMultipleInputFiles () throws IOException {

-        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",

-                                    "-i", "src/test/resources/wiki"});

-        assertTrue(outputStream.toString().startsWith("Added or updated 19 files."));

-    }

-

-

-    @Test

-    public void testAdding () throws IOException {

-        Indexer.main(new String[] {

-                "-c", "src/test/resources/krill.properties",

-                "-i", "src/test/resources/bzk",

-                "-a"});

-        logger.info(outputStream.toString());

-        assertTrue(outputStream.toString().startsWith("Added 1 file."));

-    }

-

-    

-    @Test

-    public void testMultipleInputDirectories () throws IOException {

-        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",

-                                    "-i",

-                                    "src/test/resources/bzk;src/test/resources/goe;src/test/resources/sgbr",

-                                    "-o", "test-index"});

-        assertTrue(outputStream.toString().startsWith("Added or updated 5 files."));

-    }

-

-    @Test

-    public void testEmptyArgument () throws IOException {

-        Indexer.main(new String[] {});

-        logger.info(outputStream.toString());

-        assertEquals(true, outputStream.toString().startsWith(info));

-    }

-

-

-    @Test

-    public void testMissingConfig () throws IOException {

-        Indexer.main(new String[] { "-i", "src/test/resources/bzk",

-                                    "-o test-index"});

-        logger.info(outputStream.toString());

-        assertEquals(true, outputStream.toString().startsWith(info));

-    }

-    

-    @Test

-    public void testMissingInput () throws IOException {

-        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",

-                                    "-o", "test-index"});

-        logger.info(outputStream.toString());

-        assertEquals(true, outputStream.toString().startsWith(info));

-    }

-

-    @Test

-    public void testUnicodeProblem () throws IOException {

-        Indexer.main(new String[] {

-                "-c", "src/test/resources/krill.properties",

-                "-i", "src/test/resources/bug",

-                "-o", "test-index2"

-            });

-        logger.info(outputStream.toString());

-        assertTrue(outputStream.toString().startsWith("Added 1 file."));

-    }

-

-    @Test

-    public void testMaxTextSize () throws IOException {

-        // Create a temporary properties file with the max text size setting

-        File tempPropertiesFile = File.createTempFile("krill", ".properties");

-        FileWriter writer = new FileWriter(tempPropertiesFile);

-        writer.write("krill.version = ${project.version}\n");

-        writer.write("krill.name = ${project.name}\n");

-        writer.write("krill.indexDir = test-output\n");

-        writer.write("krill.index.textSize.max = 25000000\n");

-        writer.close();

-        

-        Indexer.main(new String[] { "-c", tempPropertiesFile.getAbsolutePath(),

-                "-i", "src/test/resources/bzk", "-o", "test-output-1"});

-        assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));

-        

-        tempPropertiesFile.delete();

-    }

-

-    @Before

-    public void setOutputStream () {

-        System.setOut(new PrintStream(outputStream));

-    }

-

-    @After

-    public void cleanOutputStream () {

-        System.setOut(null);

-    }

-

-    @AfterClass

-    public static void cleanup() {

-        if (outputDirectory.exists()) {

-            deleteFile(outputDirectory);

-        }

-        if (outputDirectory2.exists()) {

-            deleteFile(outputDirectory2);

-        }

-        if (outputDirectory3.exists()) {

-            deleteFile(outputDirectory3);

-        }

-        if (outputDirectory4.exists()) {

-            deleteFile(outputDirectory4);

-        }

-    }

-

-    

-    @Before

-    public void cleanOutputDirectory () {

-

-        if (outputDirectory.exists()) {

-            logger.debug("Output directory exists");

-            deleteFile(outputDirectory);

-        }

-        if (outputDirectory2.exists()) {

-            logger.debug("Output directory 2 exists");

-            deleteFile(outputDirectory2);

-        }

-        if (outputDirectory3.exists()) {

-            logger.debug("Output directory 3 exists");

-            deleteFile(outputDirectory3);

-        }

-        if (outputDirectory4.exists()) {

-            logger.debug("Output directory 4 exists");

-            deleteFile(outputDirectory4);

-        }

-    }

-

-    private static void deleteFile (File path) {

-        if (path.isDirectory()) {

-            File file;

-            for (String filename : path.list()) {

-                file = new File(path + "/" + filename);

-                deleteFile(file);

-            }

-        }

-        path.delete();

-    }

-}

+package de.ids_mannheim.korap;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintStream;
+
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import de.ids_mannheim.korap.index.Indexer;
+
+/**
+ * @author margaretha
+ *
+ */
+public class TestIndexer {
+    private Logger logger = LoggerFactory.getLogger(TestIndexer.class);
+    private final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+    private String info = "usage: Krill indexer";
+    private static File outputDirectory = new File("test-index");
+    private static File outputDirectory2 = new File("test-index2");
+    private static File outputDirectory3 = new File("test-output");
+    private static File outputDirectory4 = new File("test-output-1");
+    private static File zipIndexDirectory = new File("test-zip-index");
+    private static File zipIndexAddDirectory = new File("test-zip-index-add");
+    private static File mixedIndexDirectory = new File("test-mixed-index");
+    private static File multipleZipIndexDirectory = new File("test-multiple-zip-index");
+    private static File invalidZipIndexDirectory = new File("test-invalid-zip-index");
+    private static File mixedValidInvalidIndexDirectory = new File("test-mixed-valid-invalid-index");
+    private static File mixedContentZipIndexDirectory = new File("test-mixed-content-zip-index");
+
+    @Test
+    public void testArguments () throws IOException {
+        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+                                    "-i", "src/test/resources/bzk"});
+        assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
+    }
+
+    @Test
+    public void testOutputArgument () throws IOException {
+        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+                                    "-i", "src/test/resources/bzk", "-o", "test-output"});
+        assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
+    }
+
+    @Test
+    public void testMultipleInputFiles () throws IOException {
+        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+                                    "-i", "src/test/resources/wiki"});
+        assertTrue(outputStream.toString().startsWith("Added or updated 19 files."));
+    }
+
+
+    @Test
+    public void testAdding () throws IOException {
+        Indexer.main(new String[] {
+                "-c", "src/test/resources/krill.properties",
+                "-i", "src/test/resources/bzk",
+                "-a"});
+        logger.info(outputStream.toString());
+        assertTrue(outputStream.toString().startsWith("Added 1 file."));
+    }
+
+    
+    @Test
+    public void testMultipleInputDirectories () throws IOException {
+        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+                                    "-i",
+                                    "src/test/resources/bzk;src/test/resources/goe;src/test/resources/sgbr",
+                                    "-o", "test-index"});
+        assertTrue(outputStream.toString().startsWith("Added or updated 5 files."));
+    }
+
+    @Test
+    public void testEmptyArgument () throws IOException {
+        Indexer.main(new String[] {});
+        logger.info(outputStream.toString());
+        assertEquals(true, outputStream.toString().startsWith(info));
+    }
+
+
+    @Test
+    public void testMissingConfig () throws IOException {
+        Indexer.main(new String[] { "-i", "src/test/resources/bzk",
+                                    "-o test-index"});
+        logger.info(outputStream.toString());
+        assertEquals(true, outputStream.toString().startsWith(info));
+    }
+    
+    @Test
+    public void testMissingInput () throws IOException {
+        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+                                    "-o", "test-index"});
+        logger.info(outputStream.toString());
+        assertEquals(true, outputStream.toString().startsWith(info));
+    }
+
+    @Test
+    public void testUnicodeProblem () throws IOException {
+        Indexer.main(new String[] {
+                "-c", "src/test/resources/krill.properties",
+                "-i", "src/test/resources/bug",
+                "-o", "test-index2"
+            });
+        logger.info(outputStream.toString());
+        assertTrue(outputStream.toString().startsWith("Added 1 file."));
+    }
+
+    @Test
+    public void testMaxTextSize () throws IOException {
+        File tempPropertiesFile = File.createTempFile("krill", ".properties"); // temporary config carrying the max text size setting
+        try {
+            try (FileWriter writer = new FileWriter(tempPropertiesFile)) {
+                writer.write("krill.version = ${project.version}\n");
+                writer.write("krill.name = ${project.name}\n");
+                writer.write("krill.indexDir = test-output\n");
+                writer.write("krill.index.textSize.max = 25000000\n");
+            } // writer is closed here, even if a write throws, before the indexer reads the file
+            Indexer.main(new String[] { "-c", tempPropertiesFile.getAbsolutePath(),
+                    "-i", "src/test/resources/bzk", "-o", "test-output-1"});
+            assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
+        } finally {
+            tempPropertiesFile.delete(); // remove the temp file even when indexing or the assertion fails
+        }
+    }
+
+    @Test
+    public void testZipFileInput () throws IOException {
+        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+                                    "-i", "src/test/resources/rei/rei_sample_krill.zip",
+                                    "-o", "test-zip-index"});
+        assertTrue(outputStream.toString().startsWith("Added or updated 3 files."));
+    }
+
+    @Test
+    public void testZipFileWithAdding () throws IOException {
+        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+                                    "-i", "src/test/resources/rei/rei_sample_krill.zip",
+                                    "-o", "test-zip-index-add",
+                                    "-a"});
+        assertTrue(outputStream.toString().startsWith("Added 3 files."));
+    }
+
+    @Test
+    public void testMixedDirectoryAndZipInput () throws IOException {
+        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+                                    "-i", "src/test/resources/bzk;src/test/resources/rei/rei_sample_krill.zip",
+                                    "-o", "test-mixed-index"});
+        assertTrue(outputStream.toString().startsWith("Added or updated 4 files."));
+    }
+
+    @Test
+    public void testMultipleZipFiles () throws IOException {
+        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+                                    "-i", "src/test/resources/rei/rei_sample_krill.zip;src/test/resources/rei/rei_sample_krill.zip",
+                                    "-o", "test-multiple-zip-index"});
+        // Should process 6 files total (3 from each zip)
+        assertTrue(outputStream.toString().startsWith("Added or updated 6 files."));
+    }
+
+    @Test
+    public void testInvalidZipFile () throws IOException {
+        // Test with a non-existent zip file
+        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+                                    "-i", "src/test/resources/nonexistent.zip",
+                                    "-o", "test-invalid-zip-index"});
+        // Should handle gracefully and process 0 files
+        assertTrue(outputStream.toString().startsWith("Added or updated 0 file"));
+    }
+
+    @Test
+    public void testMixedValidAndInvalidInputs () throws IOException {
+        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+                                    "-i", "src/test/resources/bzk;src/test/resources/nonexistent.zip;src/test/resources/rei/rei_sample_krill.zip",
+                                    "-o", "test-mixed-valid-invalid-index"});
+        // Only the valid inputs contribute: 1 file from bzk + 3 from the zip = 4 files
+        assertTrue(outputStream.toString().startsWith("Added or updated 4 files."));
+    }
+
+    @Test
+    public void testMixedContentZipFile () throws IOException {
+        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+                                    "-i", "src/test/resources/rei/mixed_test.zip",
+                                    "-o", "test-mixed-content-zip-index"});
+        // Should process 2 JSON files (1 plain + 1 gzipped) and skip the .txt file
+        assertTrue(outputStream.toString().startsWith("Added"));
+    }
+
+    private final PrintStream originalOut = System.out; // stream in place before this test instance redirects it
+    @Before
+    public void setOutputStream () {
+        System.setOut(new PrintStream(outputStream)); // capture the indexer's console output for the assertions
+    }
+
+    @After
+    public void cleanOutputStream () {
+        System.setOut(originalOut); // restore rather than null, so later System.out calls do not throw NPE
+    }
+    @AfterClass
+    public static void cleanup() {
+        File[] directories = {
+            outputDirectory, outputDirectory2, outputDirectory3, outputDirectory4,
+            zipIndexDirectory, zipIndexAddDirectory, mixedIndexDirectory,
+            multipleZipIndexDirectory, invalidZipIndexDirectory, mixedValidInvalidIndexDirectory,
+            mixedContentZipIndexDirectory
+        };
+        
+        for (File dir : directories) {
+            if (dir.exists()) {
+                deleteFile(dir);
+            }
+        }
+    }
+
+    
+    @Before
+    public void cleanOutputDirectory () {
+        File[] directories = {
+            outputDirectory, outputDirectory2, outputDirectory3, outputDirectory4,
+            zipIndexDirectory, zipIndexAddDirectory, mixedIndexDirectory,
+            multipleZipIndexDirectory, invalidZipIndexDirectory, mixedValidInvalidIndexDirectory,
+            mixedContentZipIndexDirectory
+        };
+        
+        for (File dir : directories) {
+            if (dir.exists()) {
+                logger.debug("Output directory " + dir.getName() + " exists");
+                deleteFile(dir);
+            }
+        }
+    }
+
+    private static void deleteFile (File path) {
+        if (path.isDirectory()) {
+            File file;
+            for (String filename : path.list()) {
+                file = new File(path + "/" + filename);
+                deleteFile(file);
+            }
+        }
+        path.delete();
+    }
+}
diff --git a/src/test/resources/rei/mixed_test.zip b/src/test/resources/rei/mixed_test.zip
new file mode 100644
index 0000000..c27dd5e
--- /dev/null
+++ b/src/test/resources/rei/mixed_test.zip
Binary files differ
diff --git a/src/test/resources/rei/rei_sample_krill.zip b/src/test/resources/rei/rei_sample_krill.zip
new file mode 100644
index 0000000..01f26fa
--- /dev/null
+++ b/src/test/resources/rei/rei_sample_krill.zip
Binary files differ