blob: 2b97959cec198b7d6abb480f1e65c69a706e9ae0 [file] [log] [blame]
Marc Kupietzbb739b02020-09-22 16:49:34 +02001use strict;
2use warnings;
Marc Kupietz80c8f2a2024-04-05 17:37:26 +02003use Test::More;
Marc Kupietzbb739b02020-09-22 16:49:34 +02004use Test::Script;
Marc Kupietz79ba1e52021-02-12 17:26:54 +01005use Test::TempDir::Tiny;
6use File::Copy;
Marc Kupietzbb739b02020-09-22 16:49:34 +02007
# The -h switch is expected to print the usage text and to finish with
# exit status 1.
script_runs(
    [ 'script/korapxml2conllu', '-h' ],
    { exit => 1 },
);
script_stdout_like("Description", "Can print help message");

# For every annotation archive t/data/<name>.<layer>.zip: convert it with
# korapxml2conllu and compare stdout with t/data/<name>.morpho.conllu.
# A missing base archive or missing reference file is reported as a failure.
for my $morpho_fname (glob("t/data/*\.*\.zip")) {
    # Derive the base archive name by dropping the middle ".<layer>" part.
    (my $base_fname = $morpho_fname) =~ s/(.*)\..*\.zip/$1.zip/;
    unless (-e $base_fname) {
        fail("cannot find $base_fname");
        next;
    }

    (my $conllu_fname = $base_fname) =~ s/(.*)\.zip/$1.morpho.conllu/;
    unless (-e $conllu_fname) {
        fail("cannot find $conllu_fname");
        next;
    }

    open(my $in, '<', $conllu_fname) or do {
        fail("cannot open file $conllu_fname");
        next;
    };
    # Slurp the whole reference file in one read.
    my $expected = do { local $/; <$in> };
    # Strip whitespace that follows the value of "# text_id = ..." lines.
    # NOTE(review): \s* can also consume a line break here - confirm that
    # this is intended.
    $expected =~ s/^(# text_id\s*=\s*\S+)\s*$/$1/mg;
    close($in);

    script_runs(
        [ 'script/korapxml2conllu', $morpho_fname ],
        "Runs korapxml2conllu with pos and lemma annotated input"
    );
    script_stdout_is($expected, "Converts $morpho_fname correctly");
}

# Same set of annotation archives, this time converted with
# --s-bounds-from-morpho and compared with t/data/<name>.morpho.sbfm.conllu.
for my $morpho_fname (glob("t/data/*\.*\.zip")) {
    (my $base_fname = $morpho_fname) =~ s/(.*)\..*\.zip/$1.zip/;
    unless (-e $base_fname) {
        fail("cannot find $base_fname");
        next;
    }

    (my $conllu_fname = $base_fname) =~ s/(.*)\.zip/$1.morpho.sbfm.conllu/;
    unless (-e $conllu_fname) {
        fail("cannot find $conllu_fname");
        next;
    }

    open(my $in, '<', $conllu_fname) or do {
        fail("cannot open file $conllu_fname");
        next;
    };
    # Slurp the whole reference file in one read.
    my $expected = do { local $/; <$in> };
    close($in);

    script_runs(
        [ 'script/korapxml2conllu', '--s-bounds-from-morpho', $morpho_fname ],
        "Runs korapxml2conllu with --s-bounds-from-morpho correctly"
    );
    script_stdout_is($expected, "Converts $morpho_fname correctly");
}

# Every archive t/data/<name>.zip that has a reference file
# t/data/<name>.conllu is converted and compared with it; archives without
# such a reference file are silently skipped.
for my $base_fname (glob("t/data/*\.zip")) {
    (my $conllu_fname = $base_fname) =~ s/(.*)\.zip/$1.conllu/;
    next unless -e $conllu_fname;

    open(my $in, '<', $conllu_fname) or do {
        fail("cannot open file $conllu_fname");
        next;
    };
    # Slurp the whole reference file in one read.
    my $expected = do { local $/; <$in> };
    close($in);

    script_runs(
        [ 'script/korapxml2conllu', $base_fname ],
        "Runs korapxml2conllu with base input"
    );
    script_stdout_is($expected, "Converts $base_fname correctly to CoNLL-U");
}

# Each reference file t/data/<name>.w2v_simple is compared with the stdout of
# "korapxml2conllu --word2vec t/data/<name>.zip"; reference files without a
# matching archive are skipped.
for my $w2v_fname (glob("t/data/*\.w2v_simple")) {
    (my $base_fname = $w2v_fname) =~ s/(.*)\.w2v_simple/$1.zip/;
    next unless -e $base_fname;

    open(my $in, '<', $w2v_fname) or do {
        fail("cannot open file $w2v_fname");
        next;
    };
    # Slurp the whole reference file in one read.
    my $expected = do { local $/; <$in> };
    close($in);

    script_runs(
        [ 'script/korapxml2conllu', '--word2vec', $base_fname ],
        "Runs korapxml2conllu with base input and w2v output"
    );
    script_stdout_is($expected, "Converts $base_fname correctly to word2vec input format");
}

# Each reference file t/data/<name>.w2v is compared with the stdout of a
# --word2vec run on t/data/<name>.zip that additionally passes two -m
# patterns (textSigle and creatDate); reference files without a matching
# archive are skipped.
for my $w2v_fname (glob("t/data/*\.w2v")) {
    (my $base_fname = $w2v_fname) =~ s/(.*)\.w2v/$1.zip/;
    next unless -e $base_fname;

    open(my $in, '<', $w2v_fname) or do {
        fail("cannot open file $w2v_fname");
        next;
    };
    # Slurp the whole reference file in one read.
    my $expected = do { local $/; <$in> };
    close($in);

    script_runs(
        [
            'script/korapxml2conllu',
            '-m', '<textSigle>([^<.]+)',
            '-m', '<creatDate>([^<]{4,7})',
            '--word2vec', $base_fname
        ],
        "Runs korapxml2conllu with base input and w2v and metadata output"
    );
    script_stdout_is($expected, "Converts $base_fname correctly to word2vec input format together with some metadata");
}

# One column mode (-c 1): stdout for t/data/goe.zip must equal
# t/data/goe.1c.txt. $expected is file-scoped because later tests reuse it.
my $expected;
if (open(my $in, '<', 't/data/goe.1c.txt')) {
    # Slurp the whole reference file in one read.
    $expected = do { local $/; <$in> };
    close($in);
}
else {
    fail("cannot open file.");
}
script_runs(
    [ 'script/korapxml2conllu', '-c', '1', 't/data/goe.zip' ],
    "Runs korapxml2conllu in one column mode"
);
script_stdout_is($expected, "Converts correctly in one column mode.");

# Full round trip: CoNLL-U -> KorAP-XML zip (conllu2korapxml) -> CoNLL-U
# (korapxml2conllu). The final output must equal the file we started from.
my $test_tempdir = tempdir();
my $conllu_fname = "t/data/goe.morpho.conllu";
if (open(my $in, '<', $conllu_fname)) {
    # Slurp the whole reference file in one read.
    $expected = do { local $/; <$in> };
    close($in);
}
else {
    fail("cannot open file $conllu_fname");
}

ok(length($expected) > 100, 'Output is not empty');

# Capture the zip produced on stdout and store it in the temp directory.
my $zipfile = "$test_tempdir/goe.tree_tagger.zip";
my $zipcontent;
script_runs(
    [ 'script/conllu2korapxml', "t/data/goe.morpho.conllu" ],
    { stdout => \$zipcontent },
    "Converts t/data/goe.morpho.conllu to KorAP-XML zip"
);
open(my $fh, ">", $zipfile) or fail("cannot open file $zipfile for writing");
print {$fh} $zipcontent;
close($fh);

# Place the base archive next to the generated annotation archive.
copy("t/data/goe.zip", $test_tempdir);
script_runs(
    [ 'script/korapxml2conllu', "$test_tempdir/goe.tree_tagger.zip" ],
    "Converts $test_tempdir/goe.tree_tagger.zip to CoNLL-U"
);
script_stdout_is($expected, "Full round trip: Converts goe.morpho.conllu to KorAP-XML and back to CoNLL-U correctly");

# -e with a single attribute name on an annotation archive.
script_runs(
    [ 'script/korapxml2conllu', '-e', 'div/type', "t/data/goe.tree_tagger.zip" ],
    "Runs korapxml2conllu with morpho input and attribute extraction"
);
script_stdout_like("\n# div/type = Autobiographie\n", "Extracts attributes from morpho zips");
script_stdout_like("\n# div/type = section\n", "Extracts attributes from morpho zips");

# -e with an alternation of attribute names on a base archive.
script_runs(
    [ 'script/korapxml2conllu', '-e', '(posting/id|div/id)', "t/data/wdf19.zip" ],
    "Runs korapxml2conllu with base input and regex attribute extraction"
);
script_stdout_like("\n# posting/id = i.13075_11_45", "Extracts multiple attributes from base zips (1)");
script_stdout_like("\n# div/id = i.13075_14", "Extracts multiple attributes from base zips (2)");
script_stdout_like("\n# posting/id = i.14548_9_1\n3\tbonjour", "Extracts attributes in the right place");
script_stdout_like("\n# posting/id = i.12610_4_4", "Extracts directly adjacent postings from base zips (1)");
script_stdout_like("\n# posting/id = i.12610_4_5", "Extracts directly adjacent postings from base zips (2)");
script_stdout_like("\n# posting/id = i.14548_9_1", "Extracts last postings in base zip");

# The same alternation on the corresponding annotation archive.
script_runs(
    [ 'script/korapxml2conllu', '-e', '(posting/id|div/id)', "t/data/wdf19.tree_tagger.zip" ],
    "Runs korapxml2conllu with morpho input and regex attribute extraction"
);
script_stdout_like("\n# posting/id = i.13075_11_45", "Extracts multiple attributes from morpho zips (1)");
script_stdout_like("\n# div/id = i.13075_14", "Extracts multiple attributes from morpho zips (2)");
script_stdout_like("\n# posting/id = i.12610_4_4", "Extracts directly adjacent postings from morpho zips (1)");
script_stdout_like("\n# posting/id = i.12610_4_5", "Extracts directly adjacent postings from morpho zips (2)");
script_stdout_like("\n# posting/id = i.14548_9_1", "Extracts last postings in morpho zip");

# Convert a TSV file to a KorAP-XML zip, store it in the temp directory and
# inspect the unpacked content.
$zipfile = "$test_tempdir/without_lemma.zip";
script_runs([ 'script/conllu2korapxml', "t/data/without_lemma.tsv" ], {stdout => \$zipcontent},
    "Converts t/data/without_lemma.tsv to KorAP-XML zip");
open($fh, ">", $zipfile) or fail("cannot open file $zipfile for writing");
print $fh $zipcontent;
close($fh);

# Locate the unzip executable; every following archive inspection needs it.
my $UNZIP = `sh -c 'command -v unzip'`;
chomp $UNZIP;

if ($UNZIP eq '') {
    # "return" is not allowed at the top level of a script (it dies with
    # "Can't return outside a subroutine"), so finish the test run cleanly
    # with the tests executed so far instead.
    diag('No unzip executable found in PATH.');
    done_testing;
    exit 0;
}
$zipcontent = `$UNZIP -c $zipfile`;
# Whitespace around "=" is optional so that the pattern really matches a
# feature written as name="lemma"; a pattern that insists on a blank before
# "=" can never match such output and would make this check vacuous.
unlike($zipcontent, qr/name\s*=\s*"lemma"/, "conllu2korapxml igores _ lemmas.");
# NOTE(review): the "|" below is an unescaped alternation, so this matches
# either '<f name="pos">NN' or 'NN</f>' - confirm whether a literal "NN|NN"
# value was intended.
like($zipcontent, qr/.*<f name="pos">NN|NN<\/f>.*/, "conllu2korapxml does not ignore pos for _ lemmas.");

# Convert UDPipe output at debug log level; the log on stderr must report
# the detected foundry and the ignored one.
script_runs(
    [ 'script/conllu2korapxml', '-l', 'debug', 't/data/goe.ud.conllu' ],
    { stdout => \$zipcontent },
    "Runs conllu2korap with UDPipe and unparsable comments"
);
script_stderr_like("Foundry:\\s+ud", "Found generator based foundry");
script_stderr_like("Ignored\\s+foundry\\s+name:\\s+base", "Ignore defined foundry");

# Store the captured zip so that unzip can inspect it.
$zipfile = "$test_tempdir/goe.ud.zip";
open($fh, ">", $zipfile) or fail("cannot open file $zipfile for writing");
print {$fh} $zipcontent;
close($fh);

# Archive listing: expected member paths and their permission string.
$zipcontent = `$UNZIP -Z $zipfile`;
like($zipcontent, qr@GOE/AGA/00000/ud/morpho\.xml@, "conllu2korapxml UDPipe input conversion contains morpho layer with foundry name 'ud'");
like($zipcontent, qr@GOE/AGA/00000/ud/dependency\.xml@, "conllu2korapxml UDPipe input conversion contains dependency layer with foundry name 'ud'");
like($zipcontent, qr@rw-rw-rw-.*GOE/AGA/00000/ud/morpho\.xml@, "KorAP-XML zip contents have read and write permissions");

# Unpacked member content: which pos/upos features are present or absent.
$zipcontent = `$UNZIP -c $zipfile`;
like($zipcontent, qr/.*<f name="upos">VERB<\/f>.*/, "conllu2korapxml extracts upos tags.");
like($zipcontent, qr/.*<f name="pos">VVFIN<\/f>.*/, "conllu2korapxml extracts (x)pos tags.");
unlike($zipcontent, qr/.*<f name="pos">_<\/f>.*/, "conllu2korapxml ignores _ pos tags.");
unlike($zipcontent, qr/.*<f name="upos">_<\/f>.*/, "conllu2korapxml ignores _ upos tags.");

# Let conllu2korapxml write the archive itself via -o instead of capturing
# it from stdout.
$zipfile = "$test_tempdir/goe.ud2.zip";
script_runs([ 'script/conllu2korapxml', '-l', 'warn', '-o', $zipfile, 't/data/goe.ud.conllu' ], {}, "Runs conllu2korap with -o option");
# List the archive that was just written. Without this refresh the
# assertions below would look at whatever $zipcontent held before this run
# and would say nothing about the file produced by -o.
$zipcontent = `$UNZIP -l $zipfile`;
like($zipcontent, qr@GOE/AGA/00000/ud/morpho\.xml@, "conllu2korapxml UDPipe input conversion -o option contains morpho layer with foundry name 'ud'");
like($zipcontent, qr@GOE/AGA/00000/ud/dependency\.xml@, "conllu2korapxml UDPipe input conversion with -o option contains dependency layer with foundry name 'ud'");

# Input that lacks token offsets and text ids: stderr must carry the two
# warnings and must not show a fileparse() error.
script_runs(
    [ 'script/conllu2korapxml', 't/data/deu-deps.conllu' ],
    "Runs conllu2korap with UDPipe input"
);
script_stderr_unlike("fileparse(): need a valid pathname", "Ignore sent_id and newdoc id");
script_stderr_like(qr@WARNING: Invalid input in.*deu-deps.conllu.*token offsets missing.*in line \d+@, "Warn on missing token offsets");
script_stderr_like(qr@WARNING: Invalid input in.*deu-deps.conllu.*text.id .*missing.*in line \d+@, "Warn on missing text ids");

# NKJP sample: conversion must not emit "uninitialized value" warnings and
# must produce the expected token line.
script_runs(
    [ 'script/korapxml2conllu', "t/data/nkjp.zip" ],
    "Runs korapxml2conllu on nkjp test data"
);
script_stderr_unlike("Use of uninitialized value", "Handles lonely docid parameters (line separated from layer elements)");
script_stdout_like("\n9\twesołości\twesołość\tsubst\tsubst\tsg:gen:f", "Correctly converts nkjp annotations");

# --sigle-pattern on an archive that holds base and morpho files together.
script_runs(
    [ 'script/korapxml2conllu', "--sigle-pattern", "KOT", "t/data/nkjp.zip" ],
    "Runs korapxml2conllu with --sigle-pattern option on combined base/morpho files"
);
script_stdout_like("NKJP/NKJP/KOT/nkjp/morpho.xml", "--sigle-pattern to specify a doc sigle pattern extracts the right texts");
script_stdout_unlike("NKJP/NKJP/KolakowskiOco/nkjp/morpho.xml", "--sigle-pattern to specify a doc sigle pattern does not extract the wrong texts");

# --sigle-pattern on a separate annotation archive.
script_runs(
    [ 'script/korapxml2conllu', "--sigle-pattern", "13072", "t/data/wdf19.tree_tagger.zip" ],
    "Runs korapxml2conllu with --sigle-pattern option on seprate base/morpho files"
);
script_stdout_like("WDF19/A0000/13072/tree_tagger/morpho.xml", "--sigle-pattern to specify a text sigle pattern extracts the right texts");
script_stdout_unlike("WDF19/A0000/14247/tree_tagger/morpho.xml", "--sigle-pattern to specify a text sigle pattern does not extract the wrong texts");

# Archive for which an offset error message is expected on stderr.
script_runs(
    [ 'script/korapxml2conllu', "t/data/nkjp-fail.zip" ],
    "Runs korapxml2conllu on nkjp-fail test data"
);
script_stderr_like("could not retrieve token at 1297-1298/ 1297 - ending with: e! upadku.", "Offset error");

# Input with marmot and malt annotations: the morpho and dependency layers
# are expected below different directory names inside the archive.
script_runs(
    [ 'script/conllu2korapxml', 't/data/goe.marmot-malt.conllu' ],
    { stdout => \$zipcontent },
    "Runs conllu2korap with marmot and malt annotations"
);

# Store the captured zip, then check its member listing.
$zipfile = "$test_tempdir/goe.marmalt.zip";
open($fh, ">", $zipfile) or fail("cannot open file $zipfile for writing");
print {$fh} $zipcontent;
close($fh);

$zipcontent = `$UNZIP -l $zipfile`;
like($zipcontent, qr@GOE/AGA/00000/marmot/morpho\.xml@, "conllu2korapxml can handle different foundries for motpho and dependency layers");
like($zipcontent, qr@GOE/AGA/00000/malt/dependency\.xml@, "conllu2korapxml sets the secondary dependency foundry correctly");

# Run conllu2korapxml with -f 'upos dependency:gsd' and check that the
# archive holds upos/morpho.xml and gsd/dependency.xml. The descriptions
# name the -f option so that this test can be told apart from the
# marmot/malt test in the TAP output.
script_runs([ 'script/conllu2korapxml', '-f', 'upos dependency:gsd', 't/data/goe.ud.conllu' ], {stdout => \$zipcontent}, "Runs conllu2korap with foundry names specified via -f option");
$zipfile = "$test_tempdir/goe.marmalt.zip";
open($fh, ">", $zipfile) or fail("cannot open file $zipfile for writing");
print $fh $zipcontent;
close($fh);
$zipcontent = `$UNZIP -l $zipfile`;
like($zipcontent, qr@GOE/AGA/00000/upos/morpho\.xml@, "conllu2korapxml -f option sets the morpho foundry name");
like($zipcontent, qr@GOE/AGA/00000/gsd/dependency\.xml@, "conllu2korapxml -f option sets the dependency foundry name");

done_testing;