use strict;
use warnings;
use Test::More tests => 37;
use Test::Script;
use Test::TempDir::Tiny;
use File::Copy;

# Asking for help (-h) is expected to exit with status 1 and to print a
# text containing "Description" on stdout.
script_runs(
    [ 'script/korapxml2conllu', '-h' ],
    { exit => 1 },
);
script_stdout_like("Description", "Can print help message");

# For every zip in t/data whose name has an extra dot-separated component
# (t/data/*.*.zip), run korapxml2conllu on it and compare stdout with the
# reference file "<base>.morpho.conllu".
foreach my $morpho_fname (glob("t/data/*\.*\.zip")) {
    # Base zip name: the component between the last two dots is dropped.
    my $base_fname = $morpho_fname =~ s/(.*)\..*\.zip/$1.zip/r;
    unless (-e $base_fname) {
        fail("cannot find $base_fname");
        next;
    }

    # Reference output that sits next to the base zip.
    my $conllu_fname = $base_fname =~ s/(.*)\.zip/$1.morpho.conllu/r;
    unless (-e $conllu_fname) {
        fail("cannot find $conllu_fname");
        next;
    }

    my $reference_fh;
    unless (open($reference_fh, '<', $conllu_fname)) {
        fail("cannot open file $conllu_fname");
        next;
    }
    # Undefined $/ makes the readline return the whole file at once.
    my $expected = do { local $/; <$reference_fh> };
    close($reference_fh);

    script_runs(
        [ 'script/korapxml2conllu', $morpho_fname ],
        "Runs korapxml2conllu with pos and lemma annotated input"
    );
    script_stdout_is($expected, "Converts $morpho_fname correctly");
}

# Same walk over t/data/*.*.zip as the plain conversion test, but the
# converter is called with --s-bounds-from-morpho and stdout is compared
# with the reference file "<base>.morpho.sbfm.conllu".
foreach my $morpho_fname (glob("t/data/*\.*\.zip")) {
    my $base_fname = $morpho_fname =~ s/(.*)\..*\.zip/$1.zip/r;
    unless (-e $base_fname) {
        fail("cannot find $base_fname");
        next;
    }

    my $conllu_fname = $base_fname =~ s/(.*)\.zip/$1.morpho.sbfm.conllu/r;
    unless (-e $conllu_fname) {
        fail("cannot find $conllu_fname");
        next;
    }

    my $reference_fh;
    unless (open($reference_fh, '<', $conllu_fname)) {
        fail("cannot open file $conllu_fname");
        next;
    }
    # Read the complete reference file in one go.
    my $expected = do { local $/; <$reference_fh> };
    close($reference_fh);

    script_runs(
        [ 'script/korapxml2conllu', '--s-bounds-from-morpho', $morpho_fname ],
        "Runs korapxml2conllu with --s-bounds-from-morpho correctly"
    );
    script_stdout_is($expected, "Converts $morpho_fname correctly");
}

# Run korapxml2conllu on every zip in t/data that has a reference file
# "<name>.conllu" next to it; zips without such a file are skipped silently.
foreach my $base_fname (glob("t/data/*\.zip")) {
    my $conllu_fname = $base_fname =~ s/(.*)\.zip/$1.conllu/r;
    next unless -e $conllu_fname;

    my $reference_fh;
    unless (open($reference_fh, '<', $conllu_fname)) {
        fail("cannot open file $conllu_fname");
        next;
    }
    # Read the complete reference file in one go.
    my $expected = do { local $/; <$reference_fh> };
    close($reference_fh);

    script_runs(
        [ 'script/korapxml2conllu', $base_fname ],
        "Runs korapxml2conllu with base input"
    );
    script_stdout_is($expected, "Converts $base_fname correctly to CoNLL-U");
}

# One column mode (-c 1): stdout for t/data/goe.zip must equal the content
# of t/data/goe.1c.txt. $expected stays at file scope because later tests
# assign to it again.
my $expected;
if (open(my $reference_fh, '<', 't/data/goe.1c.txt')) {
    # Undefined $/ makes the readline return the whole file at once.
    $expected = do { local $/; <$reference_fh> };
    close($reference_fh);
}
else {
    fail("cannot open file.");
}
script_runs(
    [ 'script/korapxml2conllu', '-c', '1', 't/data/goe.zip' ],
    "Runs korapxml2conllu in one column mode"
);
script_stdout_is($expected, "Converts correctly in one column mode.");

# Full round trip: t/data/goe.morpho.conllu is converted to a KorAP-XML zip
# by conllu2korapxml, the zip is written to a temporary directory, and
# korapxml2conllu has to turn it back into the original CoNLL-U text.
my $test_tempdir = tempdir();
my $conllu_fname = "t/data/goe.morpho.conllu";
if (open(my $reference_fh, '<', $conllu_fname)) {
    local $/;    # slurp mode: read the whole reference file at once
    $expected = <$reference_fh>;
    close($reference_fh);
}
else {
    fail("cannot open file $conllu_fname");
}

ok(length($expected) > 100, 'Output is not empty');

my $zipfile = "$test_tempdir/goe.tree_tagger.zip";
my $zipcontent;
script_runs(
    [ 'script/conllu2korapxml', "t/data/goe.morpho.conllu" ],
    { stdout => \$zipcontent },
    "Converts t/data/goe.morpho.conllu to KorAP-XML zip"
);

# $fh is declared at file scope because it is reused further down the file.
my $fh;
if (open($fh, ">", $zipfile)) {
    # The captured stdout is a zip archive, so no I/O layer may touch it.
    binmode($fh);
    print {$fh} $zipcontent;
    # Buffered write errors only surface when the handle is closed.
    close($fh) or fail("cannot close file $zipfile");
}
else {
    # Report the failure instead of printing to an unopened handle.
    fail("cannot open file $zipfile for writing");
}

# NOTE(review): presumably the base zip has to sit next to the annotation
# zip so that korapxml2conllu can find it -- confirm against the script.
copy("t/data/goe.zip", $test_tempdir);
script_runs(
    [ 'script/korapxml2conllu', "$test_tempdir/goe.tree_tagger.zip" ],
    "Converts $test_tempdir/goe.tree_tagger.zip to CoNLL-U"
);
script_stdout_is($expected, "Full round trip: Converts goe.morpho.conllu to KorAP-XML and back to CoNLL-U correctly");

# Attribute extraction (-e div/type) on an annotation zip: both expected
# "# div/type = ..." lines have to show up in stdout.
script_runs(
    [ 'script/korapxml2conllu', '-e', 'div/type', "t/data/goe.tree_tagger.zip" ],
    "Runs korapxml2conllu with morpho input and attribute extraction"
);
foreach my $attribute_line ("\n# div/type = Autobiographie\n", "\n# div/type = section\n") {
    script_stdout_like($attribute_line, "Extracts attributes from morpho zips");
}

# Attribute extraction with a regular expression (-e '(posting/id|div/id)')
# on a base zip. Each entry below is [ expected stdout fragment, test name ];
# the checks run in the listed order.
script_runs(
    [ 'script/korapxml2conllu', '-e', '(posting/id|div/id)', "t/data/wdf19.zip" ],
    "Runs korapxml2conllu with base input and regex attribute extraction"
);
foreach my $check (
    [ "\n# posting/id = i.13075_11_45", "Extracts multiple attributes from base zips (1)" ],
    [ "\n# div/id = i.13075_14", "Extracts multiple attributes from base zips (2)" ],
    [ "\n# posting/id = i.14548_9_1\n3\tbonjour", "Extracts attributes in the right place" ],
    [ "\n# posting/id = i.12610_4_4", "Extracts directly adjacent postings from base zips (1)" ],
    [ "\n# posting/id = i.12610_4_5", "Extracts directly adjacent postings from base zips (2)" ],
    [ "\n# posting/id = i.14548_9_1", "Extracts last postings in base zip" ],
) {
    my ($fragment, $test_name) = @{$check};
    script_stdout_like($fragment, $test_name);
}

# Attribute extraction with a regular expression (-e '(posting/id|div/id)')
# on an annotation zip. Each entry below is [ expected stdout fragment,
# test name ]; the checks run in the listed order.
script_runs(
    [ 'script/korapxml2conllu', '-e', '(posting/id|div/id)', "t/data/wdf19.tree_tagger.zip" ],
    "Runs korapxml2conllu with morpho input and regex attribute extraction"
);
foreach my $check (
    [ "\n# posting/id = i.13075_11_45", "Extracts multiple attributes from morpho zips (1)" ],
    [ "\n# div/id = i.13075_14", "Extracts multiple attributes from morpho zips (2)" ],
    [ "\n# posting/id = i.12610_4_4", "Extracts directly adjacent postings from morpho zips (1)" ],
    [ "\n# posting/id = i.12610_4_5", "Extracts directly adjacent postings from morpho zips (2)" ],
    [ "\n# posting/id = i.14548_9_1", "Extracts last postings in morpho zip" ],
) {
    my ($fragment, $test_name) = @{$check};
    script_stdout_like($fragment, $test_name);
}

# Convert t/data/without_lemma.tsv to a KorAP-XML zip, unpack the zip with
# the system's unzip and inspect the XML that conllu2korapxml produced.
$zipfile = "$test_tempdir/without_lemma.zip";
script_runs(
    [ 'script/conllu2korapxml', "t/data/without_lemma.tsv" ],
    { stdout => \$zipcontent },
    "Converts t/data/without_lemma.tsv to KorAP-XML zip"
);
if (open($fh, ">", $zipfile)) {
    # The captured stdout is a zip archive, so no I/O layer may touch it.
    binmode($fh);
    print {$fh} $zipcontent;
    # Buffered write errors only surface when the handle is closed.
    close($fh) or fail("cannot close file $zipfile");
}
else {
    # Report the failure instead of printing to an unopened handle.
    fail("cannot open file $zipfile for writing");
}

# Locate unzip through the shell; an empty result means it is not in PATH.
my $UNZIP = `sh -c 'command -v unzip'` // '';
chomp $UNZIP;

SKIP: {
    # A bare "return" is not allowed at file scope (it dies with "Can't
    # return outside a subroutine"), so the two content checks are skipped
    # instead. Skipped tests still count towards the declared plan.
    skip('No unzip executable found in PATH.', 2) if $UNZIP eq '';

    # List-form pipe open bypasses the shell, so a temporary directory
    # whose path contains spaces or metacharacters cannot break the command.
    if (open(my $unzip_out, '-|', $UNZIP, '-c', $zipfile)) {
        local $/;    # slurp mode: read all of unzip's output at once
        $zipcontent = <$unzip_out> // '';
        close($unzip_out);
    }
    else {
        diag("cannot run $UNZIP: $!");
        # Never match against the raw zip bytes left over from above.
        $zipcontent = '';
    }

    # NOTE(review): the pattern has a space before '=' (name ="lemma"); if
    # the XML is serialized as name="lemma" this check can never fail --
    # confirm the intended pattern.
    unlike($zipcontent, qr/.*name ="lemma".*/, "conllu2korapxml igores _ lemmas.");
    # NOTE(review): the unescaped '|' makes this an alternation between
    # '<f name="pos">NN' and 'NN</f>' -- confirm whether a literal '|' in
    # the pos value was meant.
    like($zipcontent, qr/.*<f name="pos">NN|NN<\/f>.*/, "conllu2korapxml does not ignore pos for _ lemmas.");
}
done_testing();