Add BlingFire tokenizer and sentence splitter to the Docker image and benchmarks
Change-Id: I26814a0d5d9ab6a0f453e507cfc89fae399d4ebd
diff --git a/Dockerfile b/Dockerfile
index 2d92c07..4a1ea8e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -223,13 +223,24 @@
# Install Cutter #
##################
-RUN pip3 install cutter-ng
+RUN pip3 install cutter-ng==2.5
COPY cutter /euralex/cutter/
RUN echo "Cutter\n" && python3 ./cutter/cutter.py nosent example.txt
+#####################
+# Install BlingFire #
+#####################
+
+RUN pip3 install -U blingfire==0.1.8
+
+COPY blingfire /euralex/blingfire/
+
+RUN echo "BlingFire\n" && python3 ./blingfire/blingfire_tok.py example.txt
+
+
#################
# Install Datok #
#################
diff --git a/benchmarks/benchmark.pl b/benchmarks/benchmark.pl
index 6b74456..a273c9c 100644
--- a/benchmarks/benchmark.pl
+++ b/benchmarks/benchmark.pl
@@ -101,6 +101,12 @@
cutter => sub {
system 'python3 ./cutter/cutter.py nosent ./corpus/'.$FILE.' > /dev/null'
},
+ blingfire_tok => sub {
+ system 'python3 ./blingfire/blingfire_tok.py ./corpus/'.$FILE.' > /dev/null'
+ },
+ blingfire_sent => sub {
+ system 'python3 ./blingfire/blingfire_sent.py ./corpus/'.$FILE.' > /dev/null'
+ },
spacy_tok => sub {
system 'python3 ./spacy/spacy_tok.py ./corpus/'.$FILE.' > /dev/null'
},
@@ -135,36 +141,38 @@
},
};
-delete $models->{'SoMaJo'};
-delete $models->{'SoMaJo_p2'};
-delete $models->{'SoMaJo_p4'};
-delete $models->{'SoMaJo_p8'};
-delete $models->{'Datok_matok'};
-delete $models->{'Datok_datok'};
-delete $models->{'OpenNLP_Simple'};
-delete $models->{'OpenNLP_Tokenizer_de-ud-gsd'};
-delete $models->{'OpenNLP_Sentence_de-ud-gsd'};
-delete $models->{'TreeTagger'};
-delete $models->{'deep-eos_bi-lstm-de'};
-delete $models->{'deep-eos_cnn-de'};
-delete $models->{'deep-eos_lstm-de'};
-delete $models->{'JTok'};
-delete $models->{'KorAP-Tokenizer'};
-delete $models->{'Syntok_tokenizer'};
-delete $models->{'Syntok_segmenter'};
-delete $models->{'Waste'};
-delete $models->{'nnsplit'};
-delete $models->{'elephant'};
-delete $models->{'Stanford'};
-delete $models->{'Stanford_t2'};
-delete $models->{'Stanford_t4'};
-delete $models->{'Stanford_t8'};
+#delete $models->{'SoMaJo'};
+#delete $models->{'SoMaJo_p2'};
+#delete $models->{'SoMaJo_p4'};
+#delete $models->{'SoMaJo_p8'};
+#delete $models->{'Datok_matok'};
+#delete $models->{'Datok_datok'};
+#delete $models->{'OpenNLP_Simple'};
+#delete $models->{'OpenNLP_Tokenizer_de-ud-gsd'};
+#delete $models->{'OpenNLP_Sentence_de-ud-gsd'};
+#delete $models->{'TreeTagger'};
+#delete $models->{'deep-eos_bi-lstm-de'};
+#delete $models->{'deep-eos_cnn-de'};
+#delete $models->{'deep-eos_lstm-de'};
+#delete $models->{'JTok'};
+#delete $models->{'KorAP-Tokenizer'};
+#delete $models->{'Syntok_tokenizer'};
+#delete $models->{'Syntok_segmenter'};
+#delete $models->{'Waste'};
+#delete $models->{'nnsplit'};
+#delete $models->{'elephant'};
+#delete $models->{'Stanford'};
+#delete $models->{'Stanford_t2'};
+#delete $models->{'Stanford_t4'};
+#delete $models->{'Stanford_t8'};
#delete $models->{'Stanford_tokonly'};
#delete $models->{'cutter'};
#delete $models->{'spacy_tok'};
#delete $models->{'spacy_sentencizer'};
#delete $models->{'spacy_dep'};
#delete $models->{'spacy_stat'};
+#delete $models->{'blingfire_tok'};
+#delete $models->{'blingfire_sent'};
diff --git a/benchmarks/empirist.pl b/benchmarks/empirist.pl
index 86e21dd..b20af0d 100644
--- a/benchmarks/empirist.pl
+++ b/benchmarks/empirist.pl
@@ -59,6 +59,10 @@
my $raw = $gold_path . $_[1] . '/raw/' . $_[0];
system 'python3 ./spacy/spacy_tok.py ' . $raw . ' > ' . $empirist_path . $_[1] . '/spacy/' . $_[0];
},
+ blingfire => sub {
+ my $raw = $gold_path . $_[1] . '/raw/' . $_[0];
+ system 'python3 ./blingfire/blingfire_tok.py ' . $raw . ' | sed "s/\s/\n/g" > ' . $empirist_path . $_[1] . '/blingfire/' . $_[0];
+ },
cutter => sub {
my $raw = $gold_path . $_[1] . '/raw/' . $_[0];
system 'python3 ./cutter/cutter.py nosent ' . $raw . ' > ' . $empirist_path . $_[1] . '/cutter/' . $_[0];
@@ -72,19 +76,21 @@
}
);
-# delete $tools{waste};
-# delete $tools{datok};
-# delete $tools{korap_tokenizer};
-# delete $tools{opennlp_simple};
-# delete $tools{opennlp_tokenizer};
-# delete $tools{tree_tagger};
-# delete $tools{jtok};
-# delete $tools{syntok};
-# delete $tools{somajo};
-# delete $tools{stanford};
-# delete $tools{spacy};
-# delete $tools{elephant};
-# delete $tools{cutter};
+# All tools run by default; uncomment a line to skip that tool for a run.
+# delete $tools{waste};
+# delete $tools{datok};
+# delete $tools{korap_tokenizer};
+# delete $tools{opennlp_simple};
+# delete $tools{opennlp_tokenizer};
+# delete $tools{tree_tagger};
+# delete $tools{jtok};
+# delete $tools{syntok};
+# delete $tools{somajo};
+# delete $tools{stanford};
+# delete $tools{spacy};
+# delete $tools{elephant};
+# delete $tools{cutter};
+# delete $tools{blingfire};
# Create project folders
foreach (keys %tools) {
diff --git a/benchmarks/ud_sentences.pl b/benchmarks/ud_sentences.pl
index f7973a2..ea2774b 100644
--- a/benchmarks/ud_sentences.pl
+++ b/benchmarks/ud_sentences.pl
@@ -8,6 +8,7 @@
my $cleanup = 'perl /euralex/benchmarks/cleanup/';
my $tokenize_eos = $cleanup . 'tokenize_eos.pl';
my $tokenize_nn = $cleanup . 'tokenize_nn.pl';
+my $tokenize_simple = $cleanup . 'tokenize_simple.pl';
# Output path
my $ud_path = '/euralex/ud_eos';
@@ -45,7 +46,7 @@
chdir '/euralex';
},
syntok => sub {
- system 'python3 -m syntok.segmenter ' . $raw . ' | ' . $cleanup . '/tokenize_simple.pl > ' . $ud_path . '/syntok/' . $base;
+ system 'python3 -m syntok.segmenter ' . $raw . ' | ' . $tokenize_simple . ' > ' . $ud_path . '/syntok/' . $base;
},
somajo => sub {
system 'somajo-tokenizer --split_sentences ' . $raw . ' 2> /dev/null | ' . $tokenize_nn . ' > ' . $ud_path . '/somajo/' . $base;
@@ -68,6 +69,9 @@
spacy_sentencizer => sub {
system 'python3 ./spacy/spacy_sent.py sentencizer ' . $raw . ' | ' . $tokenize_eos . ' > ' . $ud_path . '/spacy_sentencizer/' . $base
},
+ blingfire => sub {
+ system 'python3 ./blingfire/blingfire_sent.py ' . $raw . ' | ' . $tokenize_simple . ' > ' . $ud_path . '/blingfire/' . $base;
+ },
'deep-eos_bi-lstm-de' => sub {
system 'python3 ./deep-eos/main.py --input-file '.$raw.' --model-filename ./deep-eos/bi-lstm-de.model --vocab-filename ./deep-eos/bi-lstm-de.vocab --eos-marker "</eos>" tag | ' . $tokenize_eos . ' > ' . $ud_path . '/deep-eos_bi-lstm-de/' . $base;
},
@@ -80,22 +84,23 @@
);
-#delete $tools{waste};
-#delete $tools{datok};
-#delete $tools{korap_tokenizer};
-#delete $tools{'opennlp_sentence'};
-#delete $tools{jtok};
-#delete $tools{syntok};
-#delete $tools{somajo};
-#delete $tools{stanford};
-#delete $tools{nnsplit};
-#delete $tools{'deep-eos_bi-lstm-de'};
-#delete $tools{'deep-eos_cnn-de'};
-#delete $tools{'deep-eos_lstm-de'};
-#delete $tools{'spacy_dep'};
-#delete $tools{'spacy_stat'};
-#delete $tools{'spacy_sentencizer'};
-#delete $tools{'cutter'};
+# delete $tools{waste};
+# delete $tools{datok};
+# delete $tools{korap_tokenizer};
+# delete $tools{'opennlp_sentence'};
+# delete $tools{jtok};
+# delete $tools{syntok};
+# delete $tools{somajo};
+# delete $tools{stanford};
+# delete $tools{nnsplit};
+# delete $tools{'deep-eos_bi-lstm-de'};
+# delete $tools{'deep-eos_cnn-de'};
+# delete $tools{'deep-eos_lstm-de'};
+# delete $tools{'spacy_dep'};
+# delete $tools{'spacy_stat'};
+# delete $tools{'spacy_sentencizer'};
+# delete $tools{'blingfire'};
+# delete $tools{'cutter'};
# Create project folders
diff --git a/benchmarks/ud_tokens.pl b/benchmarks/ud_tokens.pl
index 6e30ef1..685e6da 100644
--- a/benchmarks/ud_tokens.pl
+++ b/benchmarks/ud_tokens.pl
@@ -13,11 +13,11 @@
# Split files
chdir '/euralex/corpus/';
-system 'perl /euralex/benchmarks/cleanup/split_conllu.pl /euralex/corpus/' . $base;
+system 'perl /euralex/benchmarks/cleanup/split_conllu.pl /euralex/corpus/' . $base . ' ' . $ud_path;
chdir '/euralex';
-my $gold = '/euralex/corpus/' . $base . '.split';
-my $raw = '/euralex/corpus/' . $base . '.raw';
+my $gold = $ud_path . '/' . $base . '.split';
+my $raw = $ud_path . '/' . $base . '.raw';
my %tools = (
waste => sub {
@@ -52,6 +52,9 @@
spacy => sub {
system 'python3 ./spacy/spacy_tok.py ' . $raw . ' > ' . $ud_path . '/spacy/' . $base;
},
+ blingfire => sub {
+ system 'python3 ./blingfire/blingfire_tok.py ' . $raw . ' | sed "s/\s/\n/g" > ' . $ud_path . '/blingfire/' . $base;
+ },
cutter => sub {
system 'python3 ./cutter/cutter.py nosent ' . $raw . ' > ' . $ud_path . '/cutter/' . $base;
},
@@ -79,6 +82,7 @@
# delete $tools{elephant};
# delete $tools{spacy};
# delete $tools{cutter};
+# delete $tools{blingfire};
# Create project folders
foreach (keys %tools) {
diff --git a/blingfire/blingfire_sent.py b/blingfire/blingfire_sent.py
new file mode 100644
index 0000000..1f1bfed
--- /dev/null
+++ b/blingfire/blingfire_sent.py
@@ -0,0 +1,8 @@
+# Print the BlingFire sentence segmentation of the file named by argv[1].
+import sys
+from blingfire import text_to_sentences
+
+# Decode as UTF-8 explicitly so results do not depend on the container locale.
+with open(sys.argv[1], 'r', encoding='utf-8') as f:
+    contents = f.read()
+print(text_to_sentences(contents))
diff --git a/blingfire/blingfire_tok.py b/blingfire/blingfire_tok.py
new file mode 100644
index 0000000..4929cdb
--- /dev/null
+++ b/blingfire/blingfire_tok.py
@@ -0,0 +1,8 @@
+# Print the BlingFire tokenization of the file named by argv[1].
+import sys
+from blingfire import text_to_words
+
+# Decode as UTF-8 explicitly so results do not depend on the container locale.
+with open(sys.argv[1], 'r', encoding='utf-8') as f:
+    contents = f.read()
+print(text_to_words(contents))