Also allow tar(.gz) archives of json/json.gz files as Krill-Indexer input

Change-Id: I80c839deb50be33e70903a5b406bac138f60056f
diff --git a/pom.xml b/pom.xml
index 1bcebe5..df64dc8 100644
--- a/pom.xml
+++ b/pom.xml
@@ -224,6 +224,13 @@
       <artifactId>commons-io</artifactId>
       <version>2.19.0</version>
     </dependency>
+    
+    <!-- Apache Commons Compress for tar archive support -->
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-compress</artifactId>
+      <version>1.27.1</version>
+    </dependency>
   </dependencies>
 
   <build>
diff --git a/src/main/java/de/ids_mannheim/korap/index/Indexer.java b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
index 05b0261..d795e3a 100644
--- a/src/main/java/de/ids_mannheim/korap/index/Indexer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
@@ -13,6 +13,10 @@
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipFile;
 
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+
 import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.CommandLineParser;
 import org.apache.commons.cli.DefaultParser;
@@ -39,8 +43,8 @@
  * (especially as it is way faster).
  * <br><br>
  * Input can be directories containing files in the json.gz format, or
- * zip files containing .json or .json.gz files. The indexer automatically
- * detects whether each input path is a directory or zip file. Files
+ * zip/tar files containing .json or .json.gz files. The indexer automatically
+ * detects whether each input path is a directory, zip file, or tar file. Files
  * of other formats will be skipped or not indexed. The output
  * directory can be specified in the config file. See
  * src/main/resources/krill.properties.info to create a config file.
@@ -57,7 +61,8 @@
  * Input paths can be:
  * - Directories containing .json.gz files
  * - Zip files containing .json or .json.gz files
- * - Mix of both, separated by semicolons
+ * - Tar files (including .tar.gz) containing .json or .json.gz files
+ * - Mix of any of the above, separated by semicolons
  * </pre>
  * 
  * 
@@ -163,6 +168,90 @@
 
 
     /**
+     * Parse a tar file, optionally gzip-compressed, for document files.
+     * 
+     * @param tarFile
+     *            The {@link File} tar file (.tar, .tar.gz or .tgz) containing
+     *            JSON documents (plain .json or gzipped .json.gz) to index.
+     */
+    private void parseTar (File tarFile) {
+        String tarName = tarFile.getName();
+        String lowerName = tarName.toLowerCase();
+        boolean gzippedTar = lowerName.endsWith(".tar.gz") || lowerName.endsWith(".tgz");
+
+        // All streams are resources, so the file is closed even if a wrapper fails to open
+        try (InputStream fileStream = new FileInputStream(tarFile);
+                InputStream archiveStream = gzippedTar ? new GzipCompressorInputStream(fileStream) : fileStream;
+                TarArchiveInputStream tarInputStream = new TarArchiveInputStream(archiveStream)) {
+            TarArchiveEntry entry;
+            while ((entry = tarInputStream.getNextEntry()) != null) {
+                // Skip directories
+                if (entry.isDirectory()) {
+                    continue;
+                }
+
+                String entryName = entry.getName();
+                boolean isGzipped = jsonFilePattern.matcher(entryName).find();
+                boolean isPlainJson = plainJsonFilePattern.matcher(entryName).find();
+                if (!isGzipped && !isPlainJson) {
+                    log.warn("Skip " + entryName + " from tar " + tarName
+                            + " since it does not have .json or .json.gz format.");
+                    continue;
+                }
+
+                // An (int) cast would silently wrap for entries too large for a byte array
+                long entrySize = entry.getSize();
+                if (entrySize < 0 || entrySize > Integer.MAX_VALUE - 8) {
+                    log.warn("Skip {} from tar {} since its size of {} bytes is not supported.",
+                            entryName, tarName, entrySize);
+                    continue;
+                }
+
+                // Read the entry content into a byte array to avoid stream closure issues
+                byte[] entryData = new byte[(int) entrySize];
+                int totalRead = 0;
+                while (totalRead < entryData.length) {
+                    int bytesRead = tarInputStream.read(entryData, totalRead, entryData.length - totalRead);
+                    if (bytesRead == -1) break;
+                    totalRead += bytesRead;
+                }
+                if (totalRead < entryData.length) {
+                    throw new IOException("Unexpected end of tar entry " + entryName);
+                }
+
+                InputStream entryStream = new java.io.ByteArrayInputStream(entryData);
+                if (addInsteadOfUpsert) {
+                    log.info("{} Add {} from tar {} to the index. ", this.count, entryName, tarName);
+                    if (this.index.addDoc(entryStream, isGzipped) == null) {
+                        log.warn("fail.");
+                        continue;
+                    }
+                }
+                else {
+                    log.info("{} Add or update {} from tar {} to the index. ", this.count, entryName, tarName);
+                    if (this.index.upsertDoc(entryStream, isGzipped) == null) {
+                        log.warn("fail.");
+                        continue;
+                    }
+                }
+
+                this.count++;
+                if (DEBUG) {
+                    log.debug("Finished adding files. (" + count + ").");
+                }
+
+                // Commit in case the commit count is reached
+                if ((this.count % this.commitCount) == 0) {
+                    this.commit();
+                }
+            }
+        }
+        catch (IOException e) {
+            log.error("Error reading tar file " + tarFile.getName(), e);
+        }
+    }
+
+    /**
      * Parse a zip file for document files.
      * 
      * @param zipFile
@@ -271,7 +360,8 @@
                 .hasArg().argName("properties file").required().build());
         options.addOption(Option.builder("i").longOpt("input")
                 .desc("input paths separated by semicolons. Can be directories containing "
-                        + "<filename>.json.gz files, or zip files containing .json or .json.gz files. "
+                        + "<filename>.json.gz files, zip files containing .json or .json.gz files, "
+                        + "or tar files (including .tar.gz) containing .json or .json.gz files. "
                         + "The indexer will automatically detect the type.")
                 .hasArgs().argName("input paths").required()
                 .valueSeparator(Character.valueOf(';')).build());
@@ -331,7 +421,7 @@
                 indexer.index.setMaxStringLength(KrillProperties.maxTextSize);
             }
 
-            // Iterate over list of input paths (auto-detect directories vs zip files)
+            // Iterate over list of input paths (auto-detect directories vs zip/tar files)
             for (String arg : inputPaths) {
                 File f = new File(arg);
                 
@@ -343,8 +433,14 @@
                     log.info("Indexing files in zip " + arg);
                     indexer.parseZip(f);
                 }
+                else if (f.isFile() && (f.getName().toLowerCase().endsWith(".tar") || 
+                                       f.getName().toLowerCase().endsWith(".tar.gz") ||
+                                       f.getName().toLowerCase().endsWith(".tgz"))) {
+                    log.info("Indexing files in tar " + arg);
+                    indexer.parseTar(f);
+                }
                 else {
-                    log.warn("Skipping " + arg + " - not a valid directory or zip file");
+                    log.warn("Skipping " + arg + " - not a valid directory, zip file, or tar file");
                 }
             }
             indexer.closeIndex();
diff --git a/src/test/java/de/ids_mannheim/korap/TestIndexer.java b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
index 920a060..ec26670 100644
--- a/src/test/java/de/ids_mannheim/korap/TestIndexer.java
+++ b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
@@ -8,6 +8,8 @@
 import java.io.FileWriter;
 import java.io.IOException;
 import java.io.PrintStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
 
 import org.junit.After;
 import org.junit.AfterClass;
@@ -26,17 +28,19 @@
     private Logger logger = LoggerFactory.getLogger(TestIndexer.class);
     private final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
     private String info = "usage: Krill indexer";
-    private static File outputDirectory = new File("test-index");
-    private static File outputDirectory2 = new File("test-index2");
-    private static File outputDirectory3 = new File("test-output");
-    private static File outputDirectory4 = new File("test-output-1");
-    private static File zipIndexDirectory = new File("test-zip-index");
-    private static File zipIndexAddDirectory = new File("test-zip-index-add");
-    private static File mixedIndexDirectory = new File("test-mixed-index");
-    private static File multipleZipIndexDirectory = new File("test-multiple-zip-index");
-    private static File invalidZipIndexDirectory = new File("test-invalid-zip-index");
-    private static File mixedValidInvalidIndexDirectory = new File("test-mixed-valid-invalid-index");
-    private static File mixedContentZipIndexDirectory = new File("test-mixed-content-zip-index");
+    private static File tempBaseDirectory;
+    
+    static {
+        try {
+            tempBaseDirectory = Files.createTempDirectory("krill-test").toFile();
+        } catch (IOException e) {
+            throw new RuntimeException("Failed to create temporary directory for tests", e);
+        }
+    }
+    
+    private static String getTestOutputPath(String subdir) {
+        return new File(tempBaseDirectory, subdir).getAbsolutePath();
+    }
 
     @Test
     public void testArguments () throws IOException {
@@ -48,7 +52,7 @@
     @Test
     public void testOutputArgument () throws IOException {
         Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
-                                    "-i", "src/test/resources/bzk", "-o", "test-output"});
+                                    "-i", "src/test/resources/bzk", "-o", getTestOutputPath("test-output")});
         assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
     }
 
@@ -76,7 +80,7 @@
         Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
                                     "-i",
                                     "src/test/resources/bzk;src/test/resources/goe;src/test/resources/sgbr",
-                                    "-o", "test-index"});
+                                    "-o", getTestOutputPath("test-index")});
         assertTrue(outputStream.toString().startsWith("Added or updated 5 files."));
     }
 
@@ -91,7 +95,7 @@
     @Test
     public void testMissingConfig () throws IOException {
         Indexer.main(new String[] { "-i", "src/test/resources/bzk",
-                                    "-o test-index"});
+                                    "-o " + getTestOutputPath("test-index")});
         logger.info(outputStream.toString());
         assertEquals(true, outputStream.toString().startsWith(info));
     }
@@ -99,7 +103,7 @@
     @Test
     public void testMissingInput () throws IOException {
         Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
-                                    "-o", "test-index"});
+                                    "-o", getTestOutputPath("test-index")});
         logger.info(outputStream.toString());
         assertEquals(true, outputStream.toString().startsWith(info));
     }
@@ -109,7 +113,7 @@
         Indexer.main(new String[] {
                 "-c", "src/test/resources/krill.properties",
                 "-i", "src/test/resources/bug",
-                "-o", "test-index2"
+                "-o", getTestOutputPath("test-index2")
             });
         logger.info(outputStream.toString());
         assertTrue(outputStream.toString().startsWith("Added 1 file."));
@@ -119,25 +123,27 @@
     public void testMaxTextSize () throws IOException {
         // Create a temporary properties file with the max text size setting
         File tempPropertiesFile = File.createTempFile("krill", ".properties");
-        FileWriter writer = new FileWriter(tempPropertiesFile);
-        writer.write("krill.version = ${project.version}\n");
-        writer.write("krill.name = ${project.name}\n");
-        writer.write("krill.indexDir = test-output\n");
-        writer.write("krill.index.textSize.max = 25000000\n");
-        writer.close();
+        try (FileWriter writer = new FileWriter(tempPropertiesFile)) {
+            writer.write("krill.version = ${project.version}\n");
+            writer.write("krill.name = ${project.name}\n");
+            writer.write("krill.indexDir = " + getTestOutputPath("test-output") + "\n");
+            writer.write("krill.index.textSize.max = 25000000\n");
+        }
         
-        Indexer.main(new String[] { "-c", tempPropertiesFile.getAbsolutePath(),
-                "-i", "src/test/resources/bzk", "-o", "test-output-1"});
-        assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
-        
-        tempPropertiesFile.delete();
+        try {
+            Indexer.main(new String[] { "-c", tempPropertiesFile.getAbsolutePath(),
+                    "-i", "src/test/resources/bzk", "-o", getTestOutputPath("test-output-1")});
+            assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
+        } finally {
+            tempPropertiesFile.delete();
+        }
     }
 
     @Test
     public void testZipFileInput () throws IOException {
         Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
                                     "-i", "src/test/resources/rei/rei_sample_krill.zip",
-                                    "-o", "test-zip-index"});
+                                    "-o", getTestOutputPath("test-zip-index")});
         assertTrue(outputStream.toString().startsWith("Added or updated 3 files."));
     }
 
@@ -145,7 +151,7 @@
     public void testZipFileWithAdding () throws IOException {
         Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
                                     "-i", "src/test/resources/rei/rei_sample_krill.zip",
-                                    "-o", "test-zip-index-add",
+                                    "-o", getTestOutputPath("test-zip-index-add"),
                                     "-a"});
         assertTrue(outputStream.toString().startsWith("Added 3 files."));
     }
@@ -154,7 +160,7 @@
     public void testMixedDirectoryAndZipInput () throws IOException {
         Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
                                     "-i", "src/test/resources/bzk;src/test/resources/rei/rei_sample_krill.zip",
-                                    "-o", "test-mixed-index"});
+                                    "-o", getTestOutputPath("test-mixed-index")});
         assertTrue(outputStream.toString().startsWith("Added or updated 4 files."));
     }
 
@@ -162,7 +168,7 @@
     public void testMultipleZipFiles () throws IOException {
         Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
                                     "-i", "src/test/resources/rei/rei_sample_krill.zip;src/test/resources/rei/rei_sample_krill.zip",
-                                    "-o", "test-multiple-zip-index"});
+                                    "-o", getTestOutputPath("test-multiple-zip-index")});
         // Should process 6 files total (3 from each zip)
         assertTrue(outputStream.toString().startsWith("Added or updated 6 files."));
     }
@@ -172,7 +178,7 @@
         // Test with a non-existent zip file
         Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
                                     "-i", "src/test/resources/nonexistent.zip",
-                                    "-o", "test-invalid-zip-index"});
+                                    "-o", getTestOutputPath("test-invalid-zip-index")});
         // Should handle gracefully and process 0 files
         assertTrue(outputStream.toString().startsWith("Added or updated 0 file"));
     }
@@ -181,7 +187,7 @@
     public void testMixedValidAndInvalidInputs () throws IOException {
         Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
                                     "-i", "src/test/resources/bzk;src/test/resources/nonexistent.zip;src/test/resources/rei/rei_sample_krill.zip",
-                                    "-o", "test-mixed-valid-invalid-index"});
+                                    "-o", getTestOutputPath("test-mixed-valid-invalid-index")});
         // Should process files from valid inputs only (1 from bzk + 3 from zip = 4 files)
         assertTrue(outputStream.toString().startsWith("Added"));
     }
@@ -190,11 +196,45 @@
     public void testMixedContentZipFile () throws IOException {
         Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
                                     "-i", "src/test/resources/rei/mixed_test.zip",
-                                    "-o", "test-mixed-content-zip-index"});
+                                    "-o", getTestOutputPath("test-mixed-content-zip-index")});
         // Should process 2 JSON files (1 plain + 1 gzipped) and skip the .txt file
         assertTrue(outputStream.toString().startsWith("Added"));
     }
 
+    @Test
+    public void testTarFileInput () throws IOException {
+        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+                                    "-i", "src/test/resources/rei/rei_sample_krill.tar",
+                                    "-o", getTestOutputPath("test-tar-index")});
+        assertTrue(outputStream.toString().contains("Added or updated 3 files"));
+    }
+
+    @Test
+    public void testTarGzFileInput () throws IOException {
+        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+                                    "-i", "src/test/resources/rei/rei_sample_krill.tar.gz",
+                                    "-o", getTestOutputPath("test-tar-gz-index")});
+        assertTrue(outputStream.toString().contains("Added or updated 3 files"));
+    }
+
+    @Test
+    public void testMultipleTarFiles () throws IOException {
+        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+                                    "-i", "src/test/resources/rei/rei_sample_krill.tar;src/test/resources/rei/rei_sample_krill.tar.gz",
+                                    "-o", getTestOutputPath("test-multiple-tar-index")});
+        // Should process 6 files total (3 from each tar)
+        assertTrue(outputStream.toString().contains("Added or updated 6 files"));
+    }
+
+    @Test
+    public void testMixedZipAndTarFiles () throws IOException {
+        Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
+                                    "-i", "src/test/resources/rei/rei_sample_krill.zip;src/test/resources/rei/rei_sample_krill.tar",
+                                    "-o", getTestOutputPath("test-mixed-zip-tar-index")});
+        // Should process 6 files total (3 from zip + 3 from tar)
+        assertTrue(outputStream.toString().contains("Added or updated 6 files"));
+    }
+
     @Before
     public void setOutputStream () {
         System.setOut(new PrintStream(outputStream));
@@ -207,37 +247,12 @@
 
     @AfterClass
     public static void cleanup() {
-        File[] directories = {
-            outputDirectory, outputDirectory2, outputDirectory3, outputDirectory4,
-            zipIndexDirectory, zipIndexAddDirectory, mixedIndexDirectory,
-            multipleZipIndexDirectory, invalidZipIndexDirectory, mixedValidInvalidIndexDirectory,
-            mixedContentZipIndexDirectory
-        };
-        
-        for (File dir : directories) {
-            if (dir.exists()) {
-                deleteFile(dir);
-            }
+        if (tempBaseDirectory != null && tempBaseDirectory.exists()) {
+            deleteFile(tempBaseDirectory);
         }
     }
 
     
-    @Before
-    public void cleanOutputDirectory () {
-        File[] directories = {
-            outputDirectory, outputDirectory2, outputDirectory3, outputDirectory4,
-            zipIndexDirectory, zipIndexAddDirectory, mixedIndexDirectory,
-            multipleZipIndexDirectory, invalidZipIndexDirectory, mixedValidInvalidIndexDirectory,
-            mixedContentZipIndexDirectory
-        };
-        
-        for (File dir : directories) {
-            if (dir.exists()) {
-                logger.debug("Output directory " + dir.getName() + " exists");
-                deleteFile(dir);
-            }
-        }
-    }
 
     private static void deleteFile (File path) {
         if (path.isDirectory()) {
diff --git a/src/test/resources/rei/rei_sample_krill.tar b/src/test/resources/rei/rei_sample_krill.tar
new file mode 100644
index 0000000..f269455
--- /dev/null
+++ b/src/test/resources/rei/rei_sample_krill.tar
Binary files differ
diff --git a/src/test/resources/rei/rei_sample_krill.tar.gz b/src/test/resources/rei/rei_sample_krill.tar.gz
new file mode 100644
index 0000000..a369e48
--- /dev/null
+++ b/src/test/resources/rei/rei_sample_krill.tar.gz
Binary files differ