totalngrams: support reading .conllu and .conllu.gz files directly
diff --git a/src/main/java/org/ids_mannheim/Worker.java b/src/main/java/org/ids_mannheim/Worker.java
index 6383591..723e570 100644
--- a/src/main/java/org/ids_mannheim/Worker.java
+++ b/src/main/java/org/ids_mannheim/Worker.java
@@ -54,18 +54,29 @@
String fname = fnames.get(index);
File current_file = new File(fname);
long file_size = current_file.length();
- int poolIndex = pool.getNextFree();
- logger.info(String.format("Started %d/%d %s %s", index, fnames.size(), pool.getHost(poolIndex), current_file.getName()));
- String[] cmd = {
- "/bin/sh",
- "-c",
- pool.getExec(poolIndex) + "/usr/local/kl/bin/korapxml2conllu " + fname
- };
- Process p = Runtime.getRuntime().exec(cmd);
- BufferedReader in = new BufferedReader(new InputStreamReader(p.getInputStream()));
+ int poolIndex = 0;
+ BufferedReader in = null;
+ if (fname.matches(".*\\.conllu\\.gz$")) {
+ in = new BufferedReader(new InputStreamReader(new ParallelGZIPInputStream(new FileInputStream(fname))));
+ } else if (fname.matches(".*\\.conllu?$")) {
+ in = new BufferedReader(new InputStreamReader(new FileInputStream(fname)));
+ } else if (fname.matches(".*\\.zip$")) {
+ poolIndex = pool.getNextFree();
+ String[] cmd = {
+ "/bin/sh",
+ "-c",
+ pool.getExec(poolIndex) + "/usr/local/kl/bin/korapxml2conllu " + fname
+ };
+ Process p = Runtime.getRuntime().exec(cmd);
+ in = new BufferedReader(new InputStreamReader(p.getInputStream()));
+ } else {
+ logger.severe(new IllegalArgumentException("Cannot guess file type for "+fname).toString());
+ System.exit(-1);
+ }
+ logger.info(String.format("Processing %d/%d %s %s", index, fnames.size(), pool.getHost(poolIndex), current_file.getName()));
String line;
int fold = -1;
- int texts=0;
+ int texts = 0;
while ((line = in.readLine()) != null) {
if (line.startsWith("#")) {
Matcher matcher = new_text_pattern.matcher(line);