blob: f7973a2097b9061f21e38504f07d1fd778b859c0 [file] [log] [blame]
Akron049e5262022-03-18 09:59:34 +01001#!/usr/bin/env perl
2use strict;
3use warnings;
4
# Script used to compare a tool's output against the gold file
my $cmd = '/euralex/corpus/empirist_gold_cmc/tools/compare_tokenization.perl';

# Command prefix for the cleanup scripts. The trailing slash is significant:
# script names are appended to it directly.
my $cleanup = 'perl /euralex/benchmarks/cleanup/';
my $tokenize_eos = $cleanup . 'tokenize_eos.pl';
my $tokenize_nn = $cleanup . 'tokenize_nn.pl';

# Output path
my $ud_path = '/euralex/ud_eos';
unless (-d $ud_path) {
    # An already existing folder (e.g. from a previous run) is fine,
    # anything else would make every later redirection fail.
    mkdir $ud_path or die "Cannot create $ud_path: $!\n";
}

my $base = 'de_gsd-ud-train.conllu';

# Split files
# NOTE(review): split_conllu.pl is presumably what writes the '.eos' and
# '.raw' files referenced below into $ud_path - confirm against that script.
chdir '/euralex/corpus/'
    or die "Cannot chdir to /euralex/corpus/: $!\n";
system($cleanup . 'split_conllu.pl /euralex/corpus/' . $base . ' ' . $ud_path) == 0
    or die "Splitting /euralex/corpus/$base failed (status $?)\n";

# All tool command lines use paths relative to /euralex.
chdir '/euralex'
    or die "Cannot chdir to /euralex: $!\n";

# Gold standard and raw input handed to the tools
my $gold = $ud_path . '/' . $base . '.eos';
my $raw = $ud_path . '/' . $base . '.raw';
Akron049e5262022-03-18 09:59:34 +010025
# Run a shell command line on behalf of a tool and report failures on STDERR.
# Failures are only warned about so that the remaining tools still run.
# For a pipeline the shell reports the status of its last stage only.
my $run = sub {
    my ($tool, $command) = @_;
    my $status = system $command;
    if ($status != 0) {
        warn "[$tool] command failed (status $status): $command\n";
    }
    return $status;
};

# Shell pipelines that write the segmented text to STDOUT, keyed by tool name.
# The tool name doubles as the name of the output folder below $ud_path.
my %pipeline_for = (
    waste           => "cat $raw | waste -N -v0 --rcfile=./Waste/waste.rc | $tokenize_nn",
    datok           => "cat $raw | ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.matok - | $tokenize_nn",
    cutter          => "python3 ./cutter/cutter.py sent $raw | $tokenize_eos",
    korap_tokenizer => "cat $raw | java -jar ./KorAP-Tokenizer/KorAP-Tokenizer.jar -s -l de | $tokenize_nn",

    # The OpenNLP output is stored as is, without a cleanup script.
    opennlp_sentence => "cat $raw | ./opennlp/bin/opennlp SentenceDetector "
                      . './opennlp/models/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin 2> /dev/null',

    syntok  => "python3 -m syntok.segmenter $raw | ${cleanup}tokenize_simple.pl",
    somajo  => "somajo-tokenizer --split_sentences $raw 2> /dev/null | $tokenize_nn",
    nnsplit => "./nnsplit/nnsplit_bench $raw | $tokenize_eos",
);

# spaCy is benchmarked once per mode accepted by spacy_sent.py.
for my $mode (qw(dep stat sentencizer)) {
    $pipeline_for{"spacy_$mode"} =
        "python3 ./spacy/spacy_sent.py $mode $raw | $tokenize_eos";
}

# deep-eos is benchmarked once per model/vocabulary pair.
for my $model (qw(bi-lstm-de cnn-de lstm-de)) {
    $pipeline_for{"deep-eos_$model"} =
          "python3 ./deep-eos/main.py --input-file $raw"
        . " --model-filename ./deep-eos/${model}.model"
        . " --vocab-filename ./deep-eos/${model}.vocab"
        . ' --eos-marker "</eos>" tag | ' . $tokenize_eos;
}

# Dispatch table: tool name => code reference that runs the tool and
# stores its result in "$ud_path/<tool name>/$base".
my %tools;
for my $tool (keys %pipeline_for) {
    my $pipeline = $pipeline_for{$tool};
    $tools{$tool} = sub {
        return $run->($tool, "$pipeline > $ud_path/$tool/$base");
    };
}

# JTok is started from inside its own bin folder.
$tools{jtok} = sub {
    my $jtok_dir = '/euralex/JTok/bin';
    unless (chdir $jtok_dir) {
        # Running 'sh tokenize' from another folder would only produce
        # an empty result, so skip the tool instead.
        warn "[jtok] skipped, cannot chdir to $jtok_dir: $!\n";
        return;
    }
    my $status = $run->(
        'jtok',
        "sh tokenize $raw de utf8 | ${cleanup}jtok.pl > $ud_path/jtok/$base"
    );

    # Every other tool relies on paths relative to /euralex.
    chdir '/euralex' or die "Cannot chdir back to /euralex: $!\n";
    return $status;
};

# CoreNLP writes an intermediate '.out' file that is converted afterwards.
$tools{stanford} = sub {
    # NOTE(review): assumes CoreNLP puts its output file into the current
    # working directory - confirm.
    my $corenlp_out = "$base.raw.out";

    $run->(
        'stanford',
        'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP '
            . "-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -file $raw 2> /dev/null"
    );
    my $status = $run->(
        'stanford',
        "${cleanup}stanford.pl $corenlp_out | $tokenize_nn > $ud_path/stanford/$base"
    );

    # Remove the intermediate file without spawning a shell.
    if (-e $corenlp_out) {
        unlink $corenlp_out
            or warn "[stanford] cannot remove $corenlp_out: $!\n";
    }
    return $status;
};
81
82
83#delete $tools{waste};
84#delete $tools{datok};
85#delete $tools{korap_tokenizer};
86#delete $tools{'opennlp_sentence'};
87#delete $tools{jtok};
88#delete $tools{syntok};
89#delete $tools{somajo};
90#delete $tools{stanford};
91#delete $tools{nnsplit};
92#delete $tools{'deep-eos_bi-lstm-de'};
93#delete $tools{'deep-eos_cnn-de'};
94#delete $tools{'deep-eos_lstm-de'};
95#delete $tools{'spacy_dep'};
96#delete $tools{'spacy_stat'};
97#delete $tools{'spacy_sentencizer'};
98#delete $tools{'cutter'};
99
100
# Hash order is unspecified, so fix one (alphabetical) order for all three
# passes. This keeps the report comparable between runs.
my @tool_names = sort keys %tools;

# Create project folders
for my $tool (@tool_names) {
    my $dir = $ud_path . '/' . $tool;

    # Folders left over from a previous run are reused.
    next if -d $dir;
    mkdir $dir or warn "Cannot create $dir: $!\n";
}

# Run tokenization
for my $tool (@tool_names) {
    $tools{$tool}->();
}

# Compare each tool's output with the gold file and print one report
# section per tool.
for my $tool (@tool_names) {
    print "\n##########\n";
    print "##### $tool - UD\n";
    print "##\n";
    system $cmd . ' -x ' . $gold . ' ' . $ud_path . '/' . $tool . '/' . $base . ' 2> /dev/null';
}