Add blingfire

Change-Id: I26814a0d5d9ab6a0f453e507cfc89fae399d4ebd
diff --git a/Dockerfile b/Dockerfile
index 2d92c07..4a1ea8e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -223,13 +223,24 @@
 # Install Cutter #
 ##################
 
-RUN pip3 install cutter-ng
+RUN pip3 install cutter-ng==2.5
 
 COPY cutter /euralex/cutter/
 
 RUN echo "Cutter\n" && python3 ./cutter/cutter.py nosent example.txt
 
 
+#####################
+# Install BlingFire #
+#####################
+
+RUN pip3 install -U blingfire==0.1.8
+
+COPY blingfire /euralex/blingfire/
+
+RUN echo "BlingFire\n" && python3 ./blingfire/blingfire_tok.py example.txt
+
+
 #################
 # Install Datok #
 #################
diff --git a/benchmarks/benchmark.pl b/benchmarks/benchmark.pl
index 6b74456..a273c9c 100644
--- a/benchmarks/benchmark.pl
+++ b/benchmarks/benchmark.pl
@@ -101,6 +101,12 @@
   cutter => sub {
     system 'python3 ./cutter/cutter.py nosent ./corpus/'.$FILE.' > /dev/null'
   },
+  blingfire_tok => sub {
+    system 'python3 ./blingfire/blingfire_tok.py ./corpus/'.$FILE.' > /dev/null'
+  },
+  blingfire_sent => sub {
+    system 'python3 ./blingfire/blingfire_sent.py ./corpus/'.$FILE.' > /dev/null'
+  },
   spacy_tok => sub {
     system 'python3 ./spacy/spacy_tok.py ./corpus/'.$FILE.' > /dev/null'
   },
@@ -135,36 +141,38 @@
   },
 };
 
-delete $models->{'SoMaJo'};
-delete $models->{'SoMaJo_p2'};
-delete $models->{'SoMaJo_p4'};
-delete $models->{'SoMaJo_p8'};
-delete $models->{'Datok_matok'};
-delete $models->{'Datok_datok'};
-delete $models->{'OpenNLP_Simple'};
-delete $models->{'OpenNLP_Tokenizer_de-ud-gsd'};
-delete $models->{'OpenNLP_Sentence_de-ud-gsd'};
-delete $models->{'TreeTagger'};
-delete $models->{'deep-eos_bi-lstm-de'};
-delete $models->{'deep-eos_cnn-de'};
-delete $models->{'deep-eos_lstm-de'};
-delete $models->{'JTok'};
-delete $models->{'KorAP-Tokenizer'};
-delete $models->{'Syntok_tokenizer'};
-delete $models->{'Syntok_segmenter'};
-delete $models->{'Waste'};
-delete $models->{'nnsplit'};
-delete $models->{'elephant'};
-delete $models->{'Stanford'};
-delete $models->{'Stanford_t2'};
-delete $models->{'Stanford_t4'};
-delete $models->{'Stanford_t8'};
+#delete $models->{'SoMaJo'};
+#delete $models->{'SoMaJo_p2'};
+#delete $models->{'SoMaJo_p4'};
+#delete $models->{'SoMaJo_p8'};
+#delete $models->{'Datok_matok'};
+#delete $models->{'Datok_datok'};
+#delete $models->{'OpenNLP_Simple'};
+#delete $models->{'OpenNLP_Tokenizer_de-ud-gsd'};
+#delete $models->{'OpenNLP_Sentence_de-ud-gsd'};
+#delete $models->{'TreeTagger'};
+#delete $models->{'deep-eos_bi-lstm-de'};
+#delete $models->{'deep-eos_cnn-de'};
+#delete $models->{'deep-eos_lstm-de'};
+#delete $models->{'JTok'};
+#delete $models->{'KorAP-Tokenizer'};
+#delete $models->{'Syntok_tokenizer'};
+#delete $models->{'Syntok_segmenter'};
+#delete $models->{'Waste'};
+#delete $models->{'nnsplit'};
+#delete $models->{'elephant'};
+#delete $models->{'Stanford'};
+#delete $models->{'Stanford_t2'};
+#delete $models->{'Stanford_t4'};
+#delete $models->{'Stanford_t8'};
 #delete $models->{'Stanford_tokonly'};
 #delete $models->{'cutter'};
 #delete $models->{'spacy_tok'};
 #delete $models->{'spacy_sentencizer'};
 #delete $models->{'spacy_dep'};
 #delete $models->{'spacy_stat'};
+#delete $models->{'blingfire_tok'};
+#delete $models->{'blingfire_sent'};
 
 
 
diff --git a/benchmarks/empirist.pl b/benchmarks/empirist.pl
index 86e21dd..b20af0d 100644
--- a/benchmarks/empirist.pl
+++ b/benchmarks/empirist.pl
@@ -59,6 +59,10 @@
     my $raw = $gold_path . $_[1] . '/raw/' . $_[0];
     system 'python3 ./spacy/spacy_tok.py ' . $raw . ' > ' . $empirist_path . $_[1] . '/spacy/' . $_[0];
   },
+  blingfire => sub {
+    my $raw = $gold_path . $_[1] . '/raw/' . $_[0];
+    system 'python3 ./blingfire/blingfire_tok.py ' . $raw . ' | sed "s/\s/\n/g" > ' . $empirist_path . $_[1] . '/blingfire/' . $_[0];
+  },
   cutter => sub {
     my $raw = $gold_path . $_[1] . '/raw/' . $_[0];
     system 'python3 ./cutter/cutter.py nosent ' . $raw . ' > ' . $empirist_path . $_[1] . '/cutter/' . $_[0];
@@ -72,19 +76,21 @@
   }
 );
 
+# Uncomment a line below to drop that tool from %tools before the loop below.
 # delete $tools{waste};
 # delete $tools{datok};
 # delete $tools{korap_tokenizer};
 # delete $tools{opennlp_simple};
 # delete $tools{opennlp_tokenizer};
 # delete $tools{tree_tagger};
 # delete $tools{jtok};
 # delete $tools{syntok};
 # delete $tools{somajo};
 # delete $tools{stanford};
 # delete $tools{spacy};
 # delete $tools{elephant};
 # delete $tools{cutter};
+# delete $tools{blingfire};
 
 # Create project folders
 foreach (keys %tools) {
diff --git a/benchmarks/ud_sentences.pl b/benchmarks/ud_sentences.pl
index f7973a2..ea2774b 100644
--- a/benchmarks/ud_sentences.pl
+++ b/benchmarks/ud_sentences.pl
@@ -8,6 +8,7 @@
 my $cleanup = 'perl /euralex/benchmarks/cleanup/';
 my $tokenize_eos = $cleanup . 'tokenize_eos.pl';
 my $tokenize_nn = $cleanup . 'tokenize_nn.pl';
+my $tokenize_simple = $cleanup . 'tokenize_simple.pl';
 
 # Output path
 my $ud_path = '/euralex/ud_eos';
@@ -45,7 +46,7 @@
     chdir '/euralex';
   },
   syntok => sub {
-    system 'python3 -m syntok.segmenter ' . $raw . ' | ' . $cleanup . '/tokenize_simple.pl > ' . $ud_path . '/syntok/' . $base;
+    system 'python3 -m syntok.segmenter ' . $raw . ' | ' . $tokenize_simple . ' > ' . $ud_path . '/syntok/' . $base;
   },
   somajo => sub {
     system 'somajo-tokenizer --split_sentences ' . $raw . ' 2> /dev/null | ' . $tokenize_nn . ' > ' . $ud_path . '/somajo/' . $base;
@@ -68,6 +69,9 @@
   spacy_sentencizer => sub {
     system 'python3 ./spacy/spacy_sent.py sentencizer ' . $raw . ' | ' . $tokenize_eos . ' > ' . $ud_path . '/spacy_sentencizer/' . $base
   },
+  blingfire => sub {
+    system 'python3 ./blingfire/blingfire_sent.py ' . $raw . ' | ' . $tokenize_simple . ' > ' . $ud_path . '/blingfire/' . $base;
+  },
   'deep-eos_bi-lstm-de' => sub {
     system 'python3 ./deep-eos/main.py --input-file '.$raw.' --model-filename ./deep-eos/bi-lstm-de.model --vocab-filename ./deep-eos/bi-lstm-de.vocab --eos-marker "</eos>" tag | ' . $tokenize_eos . ' > ' . $ud_path . '/deep-eos_bi-lstm-de/' . $base;
   },
@@ -80,22 +84,23 @@
 );
 
 
-#delete $tools{waste};
-#delete $tools{datok};
-#delete $tools{korap_tokenizer};
-#delete $tools{'opennlp_sentence'};
-#delete $tools{jtok};
-#delete $tools{syntok};
-#delete $tools{somajo};
-#delete $tools{stanford};
-#delete $tools{nnsplit};
-#delete $tools{'deep-eos_bi-lstm-de'};
-#delete $tools{'deep-eos_cnn-de'};
-#delete $tools{'deep-eos_lstm-de'};
-#delete $tools{'spacy_dep'};
-#delete $tools{'spacy_stat'};
-#delete $tools{'spacy_sentencizer'};
-#delete $tools{'cutter'};
+# delete $tools{waste};
+# delete $tools{datok};
+# delete $tools{korap_tokenizer};
+# delete $tools{'opennlp_sentence'};
+# delete $tools{jtok};
+# delete $tools{syntok};
+# delete $tools{somajo};
+# delete $tools{stanford};
+# delete $tools{nnsplit};
+# delete $tools{'deep-eos_bi-lstm-de'};
+# delete $tools{'deep-eos_cnn-de'};
+# delete $tools{'deep-eos_lstm-de'};
+# delete $tools{'spacy_dep'};
+# delete $tools{'spacy_stat'};
+# delete $tools{'spacy_sentencizer'};
+# delete $tools{'blingfire'};
+# delete $tools{'cutter'};
 
 
 # Create project folders
diff --git a/benchmarks/ud_tokens.pl b/benchmarks/ud_tokens.pl
index 6e30ef1..685e6da 100644
--- a/benchmarks/ud_tokens.pl
+++ b/benchmarks/ud_tokens.pl
@@ -13,11 +13,11 @@
 
 # Split files
 chdir '/euralex/corpus/';
-system 'perl /euralex/benchmarks/cleanup/split_conllu.pl /euralex/corpus/' . $base;
+system 'perl /euralex/benchmarks/cleanup/split_conllu.pl /euralex/corpus/' . $base . ' ' . $ud_path;
 chdir '/euralex';
 
-my $gold = '/euralex/corpus/' . $base . '.split';
-my $raw = '/euralex/corpus/' . $base . '.raw';
+my $gold = $ud_path . '/' . $base . '.split';
+my $raw = $ud_path . '/' . $base . '.raw';
 
 my %tools = (
   waste => sub {
@@ -52,6 +52,9 @@
   spacy => sub {
     system 'python3 ./spacy/spacy_tok.py ' . $raw . ' > ' . $ud_path . '/spacy/' . $base;
   },
+  blingfire => sub {
+    system 'python3 ./blingfire/blingfire_tok.py ' . $raw . ' | sed "s/\s/\n/g" > ' . $ud_path . '/blingfire/' . $base;
+  },
   cutter => sub {
     system 'python3 ./cutter/cutter.py nosent ' . $raw . ' > ' . $ud_path . '/cutter/' . $base;
   },
@@ -79,6 +82,7 @@
 # delete $tools{elephant};
 # delete $tools{spacy};
 # delete $tools{cutter};
+# delete $tools{blingfire};
 
 # Create project folders
 foreach (keys %tools) {
diff --git a/blingfire/blingfire_sent.py b/blingfire/blingfire_sent.py
new file mode 100644
index 0000000..1f1bfed
--- /dev/null
+++ b/blingfire/blingfire_sent.py
@@ -0,0 +1,14 @@
+"""Print the text of a file split into sentences by BlingFire.
+
+Usage: python3 blingfire_sent.py FILE
+"""
+import sys
+
+from blingfire import text_to_sentences
+
+# Decode explicitly as UTF-8 so the result does not depend on the locale
+# (assumes the corpus files are UTF-8 encoded).
+with open(sys.argv[1], 'r', encoding='utf-8') as f:
+    contents = f.read()
+
+print(text_to_sentences(contents))
diff --git a/blingfire/blingfire_tok.py b/blingfire/blingfire_tok.py
new file mode 100644
index 0000000..4929cdb
--- /dev/null
+++ b/blingfire/blingfire_tok.py
@@ -0,0 +1,14 @@
+"""Print the text of a file split into tokens by BlingFire.
+
+Usage: python3 blingfire_tok.py FILE
+"""
+import sys
+
+from blingfire import text_to_words
+
+# Decode explicitly as UTF-8 so the result does not depend on the locale
+# (assumes the corpus files are UTF-8 encoded).
+with open(sys.argv[1], 'r', encoding='utf-8') as f:
+    contents = f.read()
+
+print(text_to_words(contents))