#!/usr/bin/env perl
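
# Benchmark end-of-sentence (EOS) detection of several German tokenizers
# and sentence splitters against the Universal Dependencies de_gsd corpus.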
use strict;
use warnings;

# Comparison path
my $cmd = '/euralex/corpus/empirist_gold_cmc/tools/compare_tokenization.perl';
# my $cmd = '/euralex/corpus/deep-eos/eval.py';

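# Cleanup helpers normalizing each tool's output for comparison
# (presumably tokenize_eos.pl for EOS-marker output and tokenize_nn.pl
# for newline-separated sentence output)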
my $cleanup = 'perl /euralex/benchmarks/cleanup/';
my $tokenize_eos = $cleanup . 'tokenize_eos.pl';
my $tokenize_nn = $cleanup . 'tokenize_nn.pl';

# Output path
my $ud_path = '/euralex/ud_eos';
mkdir $ud_path;

my $base = 'de_gsd-ud-train.conllu';

# Split the CoNLL-U file into a raw text file (.raw)
# and a gold standard for sentence boundaries (.eos)
chdir '/euralex/corpus/';
system 'perl /euralex/benchmarks/cleanup/split_conllu.pl /euralex/corpus/' . $base;
chdir '/euralex';

my $gold = '/euralex/corpus/' . $base . '.eos';
my $raw  = '/euralex/corpus/' . $base . '.raw';

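# Map each tool name to a sub that runs the tool on the raw text and
# writes its normalized output to $ud_path/<tool>/$base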
my %tools = (
  waste => sub {
    system 'cat ' . $raw . ' | waste -N -v0 --rcfile=./Waste/waste.rc | ' . $tokenize_nn . ' > ' . $ud_path . '/waste/' . $base;
  },
  datok => sub {
    system 'cat ' . $raw . ' | ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.matok - | ' . $tokenize_nn . ' > ' . $ud_path . '/datok/' . $base;
  },
  cutter => sub {
    system 'python3 ./cutter/cutter.py sent ' . $raw . ' | ' . $tokenize_eos . ' > ' . $ud_path . '/cutter/' . $base;
  },
  korap_tokenizer => sub {
    system 'cat ' . $raw . ' | java -jar ./KorAP-Tokenizer/KorAP-Tokenizer.jar -s -l de | ' . $tokenize_nn . ' > ' . $ud_path . '/korap_tokenizer/' . $base;
  },
  opennlp_sentence => sub {
    system 'cat ' . $raw . ' | ./opennlp/bin/opennlp SentenceDetector ./opennlp/models/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin 2> /dev/null > ' . $ud_path . '/opennlp_sentence/' . $base;
  },
  jtok => sub {
    chdir '/euralex/JTok/bin';
    system 'sh tokenize ' . $raw . ' de utf8 | ' . $cleanup . 'jtok.pl > ' . $ud_path . '/jtok/' . $base;
    chdir '/euralex';
  },
  syntok => sub {
    system 'python3 -m syntok.segmenter ' . $raw . ' | ' . $cleanup . 'tokenize_simple.pl > ' . $ud_path . '/syntok/' . $base;
  },
  somajo => sub {
    system 'somajo-tokenizer --split_sentences ' . $raw . ' 2> /dev/null | ' . $tokenize_nn . ' > ' . $ud_path . '/somajo/' . $base;
  },
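  # CoreNLP writes its output to <input>.raw.out in the working directory;
  # it is converted and removed afterwards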
  stanford => sub {
    system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
      '-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -file ' . $raw . ' 2> /dev/null';
    system 'perl /euralex/benchmarks/cleanup/stanford.pl ' . $base . '.raw.out | ' . $tokenize_nn . ' > ' . $ud_path . '/stanford/' . $base;
    system 'rm ' . $base . '.raw.out';
  },
  nnsplit => sub {
    system './nnsplit/nnsplit_bench ' . $raw . ' | ' . $tokenize_eos . ' > ' . $ud_path . '/nnsplit/' . $base;
  },
  spacy_dep => sub {
    system 'python3 ./spacy/spacy_sent.py dep ' . $raw . ' | ' . $tokenize_eos . ' > ' . $ud_path . '/spacy_dep/' . $base;
  },
  spacy_stat => sub {
    system 'python3 ./spacy/spacy_sent.py stat ' . $raw . ' | ' . $tokenize_eos . ' > ' . $ud_path . '/spacy_stat/' . $base;
  },
  spacy_sentencizer => sub {
    system 'python3 ./spacy/spacy_sent.py sentencizer ' . $raw . ' | ' . $tokenize_eos . ' > ' . $ud_path . '/spacy_sentencizer/' . $base;
  },
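  # deep-eos neural models (bi-LSTM, CNN, LSTM) tag sentence boundaries
  # with an explicit "</eos>" marker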
  'deep-eos_bi-lstm-de' => sub {
    system 'python3 ./deep-eos/main.py --input-file ' . $raw . ' --model-filename ./deep-eos/bi-lstm-de.model --vocab-filename ./deep-eos/bi-lstm-de.vocab --eos-marker "</eos>" tag | ' . $tokenize_eos . ' > ' . $ud_path . '/deep-eos_bi-lstm-de/' . $base;
  },
  'deep-eos_cnn-de' => sub {
    system 'python3 ./deep-eos/main.py --input-file ' . $raw . ' --model-filename ./deep-eos/cnn-de.model --vocab-filename ./deep-eos/cnn-de.vocab --eos-marker "</eos>" tag | ' . $tokenize_eos . ' > ' . $ud_path . '/deep-eos_cnn-de/' . $base;
  },
  'deep-eos_lstm-de' => sub {
    system 'python3 ./deep-eos/main.py --input-file ' . $raw . ' --model-filename ./deep-eos/lstm-de.model --vocab-filename ./deep-eos/lstm-de.vocab --eos-marker "</eos>" tag | ' . $tokenize_eos . ' > ' . $ud_path . '/deep-eos_lstm-de/' . $base;
  },
);
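# Uncomment any of the following lines to exclude a tool from the run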
#delete $tools{waste};
#delete $tools{datok};
#delete $tools{korap_tokenizer};
#delete $tools{opennlp_sentence};
#delete $tools{jtok};
#delete $tools{syntok};
#delete $tools{somajo};
#delete $tools{stanford};
#delete $tools{nnsplit};
#delete $tools{'deep-eos_bi-lstm-de'};
#delete $tools{'deep-eos_cnn-de'};
#delete $tools{'deep-eos_lstm-de'};
#delete $tools{spacy_dep};
#delete $tools{spacy_stat};
#delete $tools{spacy_sentencizer};
#delete $tools{cutter};


# Create output folders for each tool
foreach (keys %tools) {
  mkdir $ud_path . '/' . $_;
}

# Run tokenization
foreach (keys %tools) {
  $tools{$_}->();
}

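# Compare each tool's output against the gold standard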
foreach my $tool (keys %tools) {
  print "\n##########\n";
  print "##### $tool - UD\n";
  print "##\n";
  system $cmd . ' -x ' . $gold . ' ' . $ud_path . '/' . $tool . '/' . $base . ' 2> /dev/null';
}
117};