Also allow tar and tar.gz archives of .json/.json.gz files as Krill-Indexer input
Change-Id: I80c839deb50be33e70903a5b406bac138f60056f
diff --git a/pom.xml b/pom.xml
index 1bcebe5..df64dc8 100644
--- a/pom.xml
+++ b/pom.xml
@@ -224,6 +224,13 @@
<artifactId>commons-io</artifactId>
<version>2.19.0</version>
</dependency>
+
+ <!-- Apache Commons Compress for tar archive support -->
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-compress</artifactId>
+ <version>1.27.1</version>
+ </dependency>
</dependencies>
<build>
diff --git a/src/main/java/de/ids_mannheim/korap/index/Indexer.java b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
index 05b0261..d795e3a 100644
--- a/src/main/java/de/ids_mannheim/korap/index/Indexer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
@@ -13,6 +13,10 @@
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
@@ -39,8 +43,8 @@
* (especially as it is way faster).
* <br><br>
* Input can be directories containing files in the json.gz format, or
- * zip files containing .json or .json.gz files. The indexer automatically
- * detects whether each input path is a directory or zip file. Files
+ * zip/tar files containing .json or .json.gz files. The indexer automatically
+ * detects whether each input path is a directory, zip file, or tar file. Files
* of other formats will be skipped or not indexed. The output
* directory can be specified in the config file. See
* src/main/resources/krill.properties.info to create a config file.
@@ -57,7 +61,8 @@
* Input paths can be:
* - Directories containing .json.gz files
* - Zip files containing .json or .json.gz files
- * - Mix of both, separated by semicolons
+ * - Tar files (including .tar.gz) containing .json or .json.gz files
+ * - Mix of any of the above, separated by semicolons
* </pre>
*
*
@@ -163,6 +168,90 @@
/**
+     * Parse a tar file for document files.
+     * <p>
+     * Archives whose name ends in <code>.tar.gz</code> or <code>.tgz</code>
+     * are read through a gzip decompressor. Every non-directory entry whose
+     * name matches <code>jsonFilePattern</code> (treated as gzipped) or
+     * <code>plainJsonFilePattern</code> is buffered in memory and then added
+     * to the index, or upserted when <code>addInsteadOfUpsert</code> is not
+     * set. All other entries are skipped with a warning. An
+     * {@link IOException} stops processing of the archive and is logged.
+     *
+     * @param tarFile
+     *            The {@link File} tar file containing
+     *            JSON documents (plain .json or gzipped .json.gz) to index.
+     */
+    private void parseTar (File tarFile) {
+        String tarName = tarFile.getName();
+        String lowerCaseName = tarName.toLowerCase();
+
+        // Check if it's a gzipped tar file
+        boolean isGzippedTar = lowerCaseName.endsWith(".tar.gz")
+                || lowerCaseName.endsWith(".tgz");
+
+        // Every stream is declared as a resource so that the file handle is
+        // released even if wrapping it fails (e.g. on an invalid gzip header).
+        try (InputStream fileInputStream = new FileInputStream(tarFile);
+                InputStream archiveStream = isGzippedTar
+                        ? new GzipCompressorInputStream(fileInputStream)
+                        : fileInputStream;
+                TarArchiveInputStream tarInputStream =
+                        new TarArchiveInputStream(archiveStream)) {
+            TarArchiveEntry entry;
+
+            // getNextEntry() replaces the deprecated getNextTarEntry()
+            while ((entry = tarInputStream.getNextEntry()) != null) {
+                // Skip directories
+                if (entry.isDirectory()) {
+                    continue;
+                }
+
+                String entryName = entry.getName();
+                boolean isGzipped = jsonFilePattern.matcher(entryName).find();
+                boolean isPlainJson = plainJsonFilePattern.matcher(entryName).find();
+
+                if (!isGzipped && !isPlainJson) {
+                    log.warn("Skip " + entryName + " from tar " + tarName
+                            + " since it does not have .json or .json.gz format.");
+                    continue;
+                }
+
+                // The entry is buffered in a single array, so its declared
+                // size has to fit into an int. A plain cast would wrap around
+                // for huge entries and abort the run with an unchecked exception.
+                long entrySize = entry.getSize();
+                if (entrySize < 0 || entrySize > Integer.MAX_VALUE - 8) {
+                    log.warn("Skip {} from tar {} since its size of {} bytes cannot be buffered.",
+                            entryName, tarName, entrySize);
+                    continue;
+                }
+
+                // Read the entry content into a byte array to avoid stream closure issues
+                byte[] entryData = new byte[(int) entrySize];
+                int totalRead = 0;
+                while (totalRead < entryData.length) {
+                    int bytesRead = tarInputStream.read(entryData, totalRead,
+                            entryData.length - totalRead);
+                    if (bytesRead == -1) {
+                        break;
+                    }
+                    totalRead += bytesRead;
+                }
+
+                // Never hand a zero-padded buffer to the index if the entry ended early
+                if (totalRead < entryData.length) {
+                    log.warn("Skip {} from tar {} since only {} of {} bytes could be read.",
+                            entryName, tarName, totalRead, entryData.length);
+                    continue;
+                }
+
+                try (InputStream entryStream = new java.io.ByteArrayInputStream(entryData)) {
+                    if (addInsteadOfUpsert) {
+                        log.info("{} Add {} from tar {} to the index. ",
+                                this.count, entryName, tarName);
+                        if (this.index.addDoc(entryStream, isGzipped) == null) {
+                            log.warn("fail.");
+                            continue;
+                        }
+                    }
+                    else {
+                        log.info("{} Add or update {} from tar {} to the index. ",
+                                this.count, entryName, tarName);
+                        if (this.index.upsertDoc(entryStream, isGzipped) == null) {
+                            log.warn("fail.");
+                            continue;
+                        }
+                    }
+
+                    this.count++;
+                    if (DEBUG) {
+                        log.debug("Finished adding files. (" + count + ").");
+                    }
+
+                    // Commit in case the commit count is reached
+                    if ((this.count % this.commitCount) == 0) {
+                        this.commit();
+                    }
+                }
+            }
+        }
+        catch (IOException e) {
+            log.error("Error reading tar file " + tarName, e);
+        }
+    }
+
+ /**
* Parse a zip file for document files.
*
* @param zipFile
@@ -271,7 +360,8 @@
.hasArg().argName("properties file").required().build());
options.addOption(Option.builder("i").longOpt("input")
.desc("input paths separated by semicolons. Can be directories containing "
- + "<filename>.json.gz files, or zip files containing .json or .json.gz files. "
+ + "<filename>.json.gz files, zip files containing .json or .json.gz files, "
+ + "or tar files (including .tar.gz) containing .json or .json.gz files. "
+ "The indexer will automatically detect the type.")
.hasArgs().argName("input paths").required()
.valueSeparator(Character.valueOf(';')).build());
@@ -331,7 +421,7 @@
indexer.index.setMaxStringLength(KrillProperties.maxTextSize);
}
- // Iterate over list of input paths (auto-detect directories vs zip files)
+ // Iterate over list of input paths (auto-detect directories vs zip/tar files)
for (String arg : inputPaths) {
File f = new File(arg);
@@ -343,8 +433,14 @@
log.info("Indexing files in zip " + arg);
indexer.parseZip(f);
}
+ else if (f.isFile() && (f.getName().toLowerCase().endsWith(".tar") ||
+ f.getName().toLowerCase().endsWith(".tar.gz") ||
+ f.getName().toLowerCase().endsWith(".tgz"))) {
+ log.info("Indexing files in tar " + arg);
+ indexer.parseTar(f);
+ }
else {
- log.warn("Skipping " + arg + " - not a valid directory or zip file");
+ log.warn("Skipping " + arg + " - not a valid directory, zip file, or tar file");
}
}
indexer.closeIndex();
diff --git a/src/test/java/de/ids_mannheim/korap/TestIndexer.java b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
index 920a060..ec26670 100644
--- a/src/test/java/de/ids_mannheim/korap/TestIndexer.java
+++ b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
@@ -8,6 +8,8 @@
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
import org.junit.After;
import org.junit.AfterClass;
@@ -26,17 +28,19 @@
private Logger logger = LoggerFactory.getLogger(TestIndexer.class);
private final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
private String info = "usage: Krill indexer";
- private static File outputDirectory = new File("test-index");
- private static File outputDirectory2 = new File("test-index2");
- private static File outputDirectory3 = new File("test-output");
- private static File outputDirectory4 = new File("test-output-1");
- private static File zipIndexDirectory = new File("test-zip-index");
- private static File zipIndexAddDirectory = new File("test-zip-index-add");
- private static File mixedIndexDirectory = new File("test-mixed-index");
- private static File multipleZipIndexDirectory = new File("test-multiple-zip-index");
- private static File invalidZipIndexDirectory = new File("test-invalid-zip-index");
- private static File mixedValidInvalidIndexDirectory = new File("test-mixed-valid-invalid-index");
- private static File mixedContentZipIndexDirectory = new File("test-mixed-content-zip-index");
+    // Root directory below which every test of this class creates its index output
+    private static File tempBaseDirectory = createTempBaseDirectory();
+
+    // Creates the shared temporary directory; a failure aborts class initialization
+    private static File createTempBaseDirectory () {
+        try {
+            return Files.createTempDirectory("krill-test").toFile();
+        }
+        catch (IOException e) {
+            throw new RuntimeException("Failed to create temporary directory for tests", e);
+        }
+    }
+
+    // Resolves the given sub directory name against the temporary base
+    // directory and returns it as an absolute path string.
+    private static String getTestOutputPath (String subdir) {
+        File outputLocation = new File(tempBaseDirectory, subdir);
+        return outputLocation.getAbsolutePath();
+    }
@Test
public void testArguments () throws IOException {
@@ -48,7 +52,7 @@
@Test
public void testOutputArgument () throws IOException {
Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
- "-i", "src/test/resources/bzk", "-o", "test-output"});
+ "-i", "src/test/resources/bzk", "-o", getTestOutputPath("test-output")});
assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
}
@@ -76,7 +80,7 @@
Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
"-i",
"src/test/resources/bzk;src/test/resources/goe;src/test/resources/sgbr",
- "-o", "test-index"});
+ "-o", getTestOutputPath("test-index")});
assertTrue(outputStream.toString().startsWith("Added or updated 5 files."));
}
@@ -91,7 +95,7 @@
@Test
public void testMissingConfig () throws IOException {
Indexer.main(new String[] { "-i", "src/test/resources/bzk",
- "-o test-index"});
+ "-o " + getTestOutputPath("test-index")});
logger.info(outputStream.toString());
assertEquals(true, outputStream.toString().startsWith(info));
}
@@ -99,7 +103,7 @@
@Test
public void testMissingInput () throws IOException {
Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
- "-o", "test-index"});
+ "-o", getTestOutputPath("test-index")});
logger.info(outputStream.toString());
assertEquals(true, outputStream.toString().startsWith(info));
}
@@ -109,7 +113,7 @@
Indexer.main(new String[] {
"-c", "src/test/resources/krill.properties",
"-i", "src/test/resources/bug",
- "-o", "test-index2"
+ "-o", getTestOutputPath("test-index2")
});
logger.info(outputStream.toString());
assertTrue(outputStream.toString().startsWith("Added 1 file."));
@@ -119,25 +123,27 @@
public void testMaxTextSize () throws IOException {
// Create a temporary properties file with the max text size setting
File tempPropertiesFile = File.createTempFile("krill", ".properties");
- FileWriter writer = new FileWriter(tempPropertiesFile);
- writer.write("krill.version = ${project.version}\n");
- writer.write("krill.name = ${project.name}\n");
- writer.write("krill.indexDir = test-output\n");
- writer.write("krill.index.textSize.max = 25000000\n");
- writer.close();
+ try (FileWriter writer = new FileWriter(tempPropertiesFile)) {
+ writer.write("krill.version = ${project.version}\n");
+ writer.write("krill.name = ${project.name}\n");
+ writer.write("krill.indexDir = " + getTestOutputPath("test-output") + "\n");
+ writer.write("krill.index.textSize.max = 25000000\n");
+ }
- Indexer.main(new String[] { "-c", tempPropertiesFile.getAbsolutePath(),
- "-i", "src/test/resources/bzk", "-o", "test-output-1"});
- assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
-
- tempPropertiesFile.delete();
+ try {
+ Indexer.main(new String[] { "-c", tempPropertiesFile.getAbsolutePath(),
+ "-i", "src/test/resources/bzk", "-o", getTestOutputPath("test-output-1")});
+ assertTrue(outputStream.toString().startsWith("Added or updated 1 file."));
+ } finally {
+ tempPropertiesFile.delete();
+ }
}
@Test
public void testZipFileInput () throws IOException {
Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
"-i", "src/test/resources/rei/rei_sample_krill.zip",
- "-o", "test-zip-index"});
+ "-o", getTestOutputPath("test-zip-index")});
assertTrue(outputStream.toString().startsWith("Added or updated 3 files."));
}
@@ -145,7 +151,7 @@
public void testZipFileWithAdding () throws IOException {
Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
"-i", "src/test/resources/rei/rei_sample_krill.zip",
- "-o", "test-zip-index-add",
+ "-o", getTestOutputPath("test-zip-index-add"),
"-a"});
assertTrue(outputStream.toString().startsWith("Added 3 files."));
}
@@ -154,7 +160,7 @@
public void testMixedDirectoryAndZipInput () throws IOException {
Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
"-i", "src/test/resources/bzk;src/test/resources/rei/rei_sample_krill.zip",
- "-o", "test-mixed-index"});
+ "-o", getTestOutputPath("test-mixed-index")});
assertTrue(outputStream.toString().startsWith("Added or updated 4 files."));
}
@@ -162,7 +168,7 @@
public void testMultipleZipFiles () throws IOException {
Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
"-i", "src/test/resources/rei/rei_sample_krill.zip;src/test/resources/rei/rei_sample_krill.zip",
- "-o", "test-multiple-zip-index"});
+ "-o", getTestOutputPath("test-multiple-zip-index")});
// Should process 6 files total (3 from each zip)
assertTrue(outputStream.toString().startsWith("Added or updated 6 files."));
}
@@ -172,7 +178,7 @@
// Test with a non-existent zip file
Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
"-i", "src/test/resources/nonexistent.zip",
- "-o", "test-invalid-zip-index"});
+ "-o", getTestOutputPath("test-invalid-zip-index")});
// Should handle gracefully and process 0 files
assertTrue(outputStream.toString().startsWith("Added or updated 0 file"));
}
@@ -181,7 +187,7 @@
public void testMixedValidAndInvalidInputs () throws IOException {
Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
"-i", "src/test/resources/bzk;src/test/resources/nonexistent.zip;src/test/resources/rei/rei_sample_krill.zip",
- "-o", "test-mixed-valid-invalid-index"});
+ "-o", getTestOutputPath("test-mixed-valid-invalid-index")});
// Should process files from valid inputs only (1 from bzk + 3 from zip = 4 files)
assertTrue(outputStream.toString().startsWith("Added"));
}
@@ -190,11 +196,45 @@
public void testMixedContentZipFile () throws IOException {
Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
"-i", "src/test/resources/rei/mixed_test.zip",
- "-o", "test-mixed-content-zip-index"});
+ "-o", getTestOutputPath("test-mixed-content-zip-index")});
// Should process 2 JSON files (1 plain + 1 gzipped) and skip the .txt file
assertTrue(outputStream.toString().startsWith("Added"));
}
+    // Indexes a plain tar archive and expects three documents in the summary output
+    @Test
+    public void testTarFileInput () throws IOException {
+        String[] arguments = {
+            "-c", "src/test/resources/krill.properties",
+            "-i", "src/test/resources/rei/rei_sample_krill.tar",
+            "-o", getTestOutputPath("test-tar-index")
+        };
+        Indexer.main(arguments);
+
+        String consoleOutput = outputStream.toString();
+        assertTrue(consoleOutput.contains("Added or updated 3 files"));
+    }
+
+    // Indexes a gzip-compressed tar archive and expects three documents in the summary output
+    @Test
+    public void testTarGzFileInput () throws IOException {
+        String[] arguments = {
+            "-c", "src/test/resources/krill.properties",
+            "-i", "src/test/resources/rei/rei_sample_krill.tar.gz",
+            "-o", getTestOutputPath("test-tar-gz-index")
+        };
+        Indexer.main(arguments);
+
+        String consoleOutput = outputStream.toString();
+        assertTrue(consoleOutput.contains("Added or updated 3 files"));
+    }
+
+    // Indexes a plain and a gzipped tar archive in one run
+    @Test
+    public void testMultipleTarFiles () throws IOException {
+        String[] arguments = {
+            "-c", "src/test/resources/krill.properties",
+            "-i", "src/test/resources/rei/rei_sample_krill.tar;src/test/resources/rei/rei_sample_krill.tar.gz",
+            "-o", getTestOutputPath("test-multiple-tar-index")
+        };
+        Indexer.main(arguments);
+
+        // Should process 6 files total (3 from each tar)
+        String consoleOutput = outputStream.toString();
+        assertTrue(consoleOutput.contains("Added or updated 6 files"));
+    }
+
+    // Indexes a zip archive and a tar archive in one run
+    @Test
+    public void testMixedZipAndTarFiles () throws IOException {
+        String[] arguments = {
+            "-c", "src/test/resources/krill.properties",
+            "-i", "src/test/resources/rei/rei_sample_krill.zip;src/test/resources/rei/rei_sample_krill.tar",
+            "-o", getTestOutputPath("test-mixed-zip-tar-index")
+        };
+        Indexer.main(arguments);
+
+        // Should process 6 files total (3 from zip + 3 from tar)
+        String consoleOutput = outputStream.toString();
+        assertTrue(consoleOutput.contains("Added or updated 6 files"));
+    }
+
@Before
public void setOutputStream () {
System.setOut(new PrintStream(outputStream));
@@ -207,37 +247,12 @@
@AfterClass
public static void cleanup() {
- File[] directories = {
- outputDirectory, outputDirectory2, outputDirectory3, outputDirectory4,
- zipIndexDirectory, zipIndexAddDirectory, mixedIndexDirectory,
- multipleZipIndexDirectory, invalidZipIndexDirectory, mixedValidInvalidIndexDirectory,
- mixedContentZipIndexDirectory
- };
-
- for (File dir : directories) {
- if (dir.exists()) {
- deleteFile(dir);
- }
+ if (tempBaseDirectory != null && tempBaseDirectory.exists()) {
+ deleteFile(tempBaseDirectory);
}
}
- @Before
- public void cleanOutputDirectory () {
- File[] directories = {
- outputDirectory, outputDirectory2, outputDirectory3, outputDirectory4,
- zipIndexDirectory, zipIndexAddDirectory, mixedIndexDirectory,
- multipleZipIndexDirectory, invalidZipIndexDirectory, mixedValidInvalidIndexDirectory,
- mixedContentZipIndexDirectory
- };
-
- for (File dir : directories) {
- if (dir.exists()) {
- logger.debug("Output directory " + dir.getName() + " exists");
- deleteFile(dir);
- }
- }
- }
private static void deleteFile (File path) {
if (path.isDirectory()) {
diff --git a/src/test/resources/rei/rei_sample_krill.tar b/src/test/resources/rei/rei_sample_krill.tar
new file mode 100644
index 0000000..f269455
--- /dev/null
+++ b/src/test/resources/rei/rei_sample_krill.tar
Binary files differ
diff --git a/src/test/resources/rei/rei_sample_krill.tar.gz b/src/test/resources/rei/rei_sample_krill.tar.gz
new file mode 100644
index 0000000..a369e48
--- /dev/null
+++ b/src/test/resources/rei/rei_sample_krill.tar.gz
Binary files differ