blob: ea2774bf287f04522f064c9a479ae0be8342ef24 [file] [log] [blame]
#!/usr/bin/env perl
use strict;
use warnings;

# Benchmark driver: run a set of sentence-segmentation / tokenization tools
# over the German UD-GSD training corpus and compare each tool's output
# against the gold end-of-sentence segmentation.

# Comparison path
# EmpiriST gold-standard comparison script used for scoring each tool.
my $cmd = '/euralex/corpus/empirist_gold_cmc/tools/compare_tokenization.perl';

# Cleanup helpers that normalize each tool's raw output before comparison.
# NOTE: $cleanup ends with a '/' — helper names are appended directly.
my $cleanup = 'perl /euralex/benchmarks/cleanup/';
my $tokenize_eos = $cleanup . 'tokenize_eos.pl';
my $tokenize_nn = $cleanup . 'tokenize_nn.pl';
my $tokenize_simple = $cleanup . 'tokenize_simple.pl';

# Output path
# mkdir is intentionally unchecked: the directory may already exist from
# a previous run.
my $ud_path = '/euralex/ud_eos';
mkdir $ud_path;

# Corpus file all per-tool outputs are named after.
my $base = 'de_gsd-ud-train.conllu';

# Split files
# split_conllu.pl presumably derives the '.raw' plain-text and '.eos' gold
# files used below from the CoNLL-U corpus — TODO confirm against the script.
chdir '/euralex/corpus/';
system 'perl /euralex/benchmarks/cleanup/split_conllu.pl /euralex/corpus/' . $base . ' ' . $ud_path;
chdir '/euralex';

my $gold = $ud_path . '/' . $base . '.eos';
my $raw = $ud_path . '/' . $base . '.raw';
# Dispatch table: tool name => closure that reads $raw, runs the tool's
# sentence splitter, pipes the result through the matching cleanup filter,
# and writes the normalized output to $ud_path/<tool>/$base.
# All commands are fixed strings built from the constants above, so passing
# them through the shell via single-argument system() is safe here.
my %tools = (
  waste => sub {
    system 'cat ' . $raw . ' | waste -N -v0 --rcfile=./Waste/waste.rc | ' . $tokenize_nn . ' > ' . $ud_path . '/waste/' . $base;
  },
  datok => sub {
    system 'cat ' . $raw . ' | ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.matok - | ' . $tokenize_nn . ' > ' . $ud_path . '/datok/' . $base;
  },
  cutter => sub {
    system 'python3 ./cutter/cutter.py sent ' . $raw . ' | ' . $tokenize_eos. ' > ' . $ud_path . '/cutter/' . $base;
  },
  korap_tokenizer => sub {
    system 'cat ' . $raw . ' | java -jar ./KorAP-Tokenizer/KorAP-Tokenizer.jar -s -l de | ' . $tokenize_nn . ' > ' . $ud_path . '/korap_tokenizer/' . $base;
  },
  # NOTE(review): unlike the others, this output is not piped through a
  # cleanup filter — presumably the model already emits one sentence per
  # line; confirm before relying on the comparison.
  'opennlp_sentence' => sub {
    system 'cat ' . $raw . ' | ./opennlp/bin/opennlp SentenceDetector ./opennlp/models/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin 2> /dev/null > ' . $ud_path . '/opennlp_sentence/' . $base;
  },
  # JTok must be launched from its own bin directory; restore the working
  # directory afterwards so later closures keep their relative './' paths.
  jtok => sub {
    chdir '/euralex/JTok/bin';
    # NOTE(review): $cleanup already ends in '/', so this yields a double
    # slash in the path — harmless to the shell, but inconsistent.
    system 'sh tokenize ' . $raw . ' de utf8 | ' . $cleanup . '/jtok.pl > ' . $ud_path . '/jtok/' . $base;
    chdir '/euralex';
  },
  syntok => sub {
    system 'python3 -m syntok.segmenter ' . $raw . ' | ' . $tokenize_simple . ' > ' . $ud_path . '/syntok/' . $base;
  },
  somajo => sub {
    system 'somajo-tokenizer --split_sentences ' . $raw . ' 2> /dev/null | ' . $tokenize_nn . ' > ' . $ud_path . '/somajo/' . $base;
  },
  # CoreNLP writes its result to <input>.raw.out in the current directory;
  # a second pass normalizes it, then the temporary file is removed.
  stanford => sub {
    system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
      '-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -file ' . $raw . ' 2> /dev/null';
    system 'perl /euralex/benchmarks/cleanup/stanford.pl ' . $base . '.raw.out | ' . $tokenize_nn . ' > ' . $ud_path . '/stanford/' . $base;
    system 'rm ' . $base . '.raw.out';
  },
  nnsplit => sub {
    system './nnsplit/nnsplit_bench ' . $raw . ' | ' . $tokenize_eos. ' > ' . $ud_path . '/nnsplit/' . $base
  },
  spacy_dep => sub {
    system 'python3 ./spacy/spacy_sent.py dep ' . $raw . ' | ' . $tokenize_eos . ' > ' . $ud_path . '/spacy_dep/' . $base
  },
  spacy_stat => sub {
    system 'python3 ./spacy/spacy_sent.py stat ' . $raw . ' | ' . $tokenize_eos . ' > ' . $ud_path . '/spacy_stat/' . $base
  },
  spacy_sentencizer => sub {
    system 'python3 ./spacy/spacy_sent.py sentencizer ' . $raw . ' | ' . $tokenize_eos . ' > ' . $ud_path . '/spacy_sentencizer/' . $base
  },
  blingfire => sub {
    system 'python3 ./blingfire/blingfire_sent.py ' . $raw . ' | ' . $tokenize_simple . ' > ' . $ud_path . '/blingfire/' . $base;
  },
  # Three deep-eos variants sharing one entry script, differing only in the
  # model/vocab pair.
  'deep-eos_bi-lstm-de' => sub {
    system 'python3 ./deep-eos/main.py --input-file '.$raw.' --model-filename ./deep-eos/bi-lstm-de.model --vocab-filename ./deep-eos/bi-lstm-de.vocab --eos-marker "</eos>" tag | ' . $tokenize_eos . ' > ' . $ud_path . '/deep-eos_bi-lstm-de/' . $base;
  },
  'deep-eos_cnn-de' => sub {
    system 'python3 ./deep-eos/main.py --input-file '.$raw.' --model-filename ./deep-eos/cnn-de.model --vocab-filename ./deep-eos/cnn-de.vocab --eos-marker "</eos>" tag | ' . $tokenize_eos . ' > ' . $ud_path . '/deep-eos_cnn-de/' . $base;
  },
  'deep-eos_lstm-de' => sub {
    system 'python3 ./deep-eos/main.py --input-file '.$raw.' --model-filename ./deep-eos/lstm-de.model --vocab-filename ./deep-eos/lstm-de.vocab --eos-marker "</eos>" tag | ' . $tokenize_eos . ' > ' . $ud_path . '/deep-eos_lstm-de/' . $base;;
  },
);
85
86
# Uncomment any of the following lines to exclude that tool from the run:
# delete $tools{waste};
88# delete $tools{datok};
89# delete $tools{korap_tokenizer};
90# delete $tools{'opennlp_sentence'};
91# delete $tools{jtok};
92# delete $tools{syntok};
93# delete $tools{somajo};
94# delete $tools{stanford};
95# delete $tools{nnsplit};
96# delete $tools{'deep-eos_bi-lstm-de'};
97# delete $tools{'deep-eos_cnn-de'};
98# delete $tools{'deep-eos_lstm-de'};
99# delete $tools{'spacy_dep'};
100# delete $tools{'spacy_stat'};
101# delete $tools{'spacy_sentencizer'};
102# delete $tools{'blingfire'};
103# delete $tools{'cutter'};
Akron049e5262022-03-18 09:59:34 +0100104
105
# Create project folders
# One output subdirectory per registered tool under $ud_path.
# (Unchecked, like the top-level mkdir: the folders may already exist.)
mkdir "$ud_path/$_" for keys %tools;
110
# Run tokenization
# Invoke every tool closure; each one writes its normalized output file.
$tools{$_}->() for keys %tools;
115
# Score each tool's output against the gold segmentation, with a small
# banner separating the per-tool reports.
foreach my $tool (keys %tools) {
  printf "\n##########\n##### %s - UD\n##\n", $tool;
  my $tool_output = join '/', $ud_path, $tool, $base;
  system join ' ', $cmd, '-x', $gold, $tool_output, '2> /dev/null';
}