Marc Kupietz | bb739b0 | 2020-09-22 16:49:34 +0200 | [diff] [blame] | 1 | use strict; |
| 2 | use warnings; |
Marc Kupietz | 80c8f2a | 2024-04-05 17:37:26 +0200 | [diff] [blame] | 3 | use Test::More; |
Marc Kupietz | bb739b0 | 2020-09-22 16:49:34 +0200 | [diff] [blame] | 4 | use Test::Script; |
Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 5 | use Test::TempDir::Tiny; |
| 6 | use File::Copy; |
Marc Kupietz | bb739b0 | 2020-09-22 16:49:34 +0200 | [diff] [blame] | 7 | |
# Smoke test: run the converter with -h, expect exit status 1, and expect
# its stdout to match "Description".
script_runs(
    [ 'script/korapxml2conllu', '-h' ],
    { exit => 1 },
);
script_stdout_like("Description", "Can print help message");
Marc Kupietz | bb739b0 | 2020-09-22 16:49:34 +0200 | [diff] [blame] | 10 | |
# For every annotation zip t/data/<name>.<layer>.zip, convert it and compare
# stdout with the reference file t/data/<name>.morpho.conllu. The matching
# base zip t/data/<name>.zip has to exist as well; a missing base zip or
# reference file is reported as a test failure.
foreach my $morpho_fname (glob("t/data/*\.*\.zip")) {
    (my $base_fname = $morpho_fname) =~ s/(.*)\..*\.zip/$1.zip/;
    unless (-e $base_fname) {
        fail("cannot find $base_fname");
        next;
    }

    (my $conllu_fname = $base_fname) =~ s/(.*)\.zip/$1.morpho.conllu/;
    unless (-e $conllu_fname) {
        fail("cannot find $conllu_fname");
        next;
    }

    my $in;
    unless (open($in, '<', $conllu_fname)) {
        fail("cannot open file $conllu_fname");
        next;
    }
    # Slurp the whole reference file in one read.
    my $expected = do { local $/; <$in> };
    # Drop whitespace that trails a "# text_id = ..." comment line in the
    # reference before comparing.
    $expected =~ s/^(# text_id\s*=\s*\S+)\s*$/$1/mg;
    close($in);

    script_runs(
        [ 'script/korapxml2conllu', $morpho_fname ],
        "Runs korapxml2conllu with pos and lemma annotated input"
    );
    script_stdout_is($expected, "Converts $morpho_fname correctly");
}
Marc Kupietz | d845583 | 2021-02-11 17:30:29 +0100 | [diff] [blame] | 37 | |
# Same walk over t/data/<name>.<layer>.zip as the plain morpho test, but the
# converter is run with --s-bounds-from-morpho and its stdout is compared
# with t/data/<name>.morpho.sbfm.conllu.
foreach my $morpho_fname (glob("t/data/*\.*\.zip")) {
    (my $base_fname = $morpho_fname) =~ s/(.*)\..*\.zip/$1.zip/;
    unless (-e $base_fname) {
        fail("cannot find $base_fname");
        next;
    }

    (my $conllu_fname = $base_fname) =~ s/(.*)\.zip/$1.morpho.sbfm.conllu/;
    unless (-e $conllu_fname) {
        fail("cannot find $conllu_fname");
        next;
    }

    my $in;
    unless (open($in, '<', $conllu_fname)) {
        fail("cannot open file $conllu_fname");
        next;
    }
    # Slurp the whole reference file in one read.
    my $expected = do { local $/; <$in> };
    close($in);

    script_runs(
        [ 'script/korapxml2conllu', '--s-bounds-from-morpho', $morpho_fname ],
        "Runs korapxml2conllu with --s-bounds-from-morpho correctly"
    );
    script_stdout_is($expected, "Converts $morpho_fname correctly");
}
| 63 | |
# Convert every zip in t/data that has a sibling <name>.conllu reference file
# and compare stdout with that reference. Zips without a reference file are
# skipped silently.
foreach my $base_fname (glob("t/data/*\.zip")) {
    (my $conllu_fname = $base_fname) =~ s/(.*)\.zip/$1.conllu/;
    next unless -e $conllu_fname;

    my $in;
    unless (open($in, '<', $conllu_fname)) {
        fail("cannot open file $conllu_fname");
        next;
    }
    # Slurp the whole reference file in one read.
    my $expected = do { local $/; <$in> };
    close($in);

    script_runs(
        [ 'script/korapxml2conllu', $base_fname ],
        "Runs korapxml2conllu with base input"
    );
    script_stdout_is($expected, "Converts $base_fname correctly to CoNLL-U");
}
| 80 | |
# For every reference file t/data/<name>.w2v_simple whose zip t/data/<name>.zip
# exists, run the converter with --word2vec and compare stdout with the
# reference. References without a zip are skipped silently.
foreach my $w2v_fname (glob("t/data/*\.w2v_simple")) {
    (my $base_fname = $w2v_fname) =~ s/(.*)\.w2v_simple/$1.zip/;
    next unless -e $base_fname;

    my $in;
    unless (open($in, '<', $w2v_fname)) {
        fail("cannot open file $w2v_fname");
        next;
    }
    # Slurp the whole reference file in one read.
    my $expected = do { local $/; <$in> };
    close($in);

    script_runs(
        [ 'script/korapxml2conllu', '--word2vec', $base_fname ],
        "Runs korapxml2conllu with base input and w2v output"
    );
    script_stdout_is($expected, "Converts $base_fname correctly to word2vec input format");
}
| 97 | |
# For every reference file t/data/<name>.w2v whose zip t/data/<name>.zip
# exists, run the converter with --word2vec plus two -m metadata patterns and
# compare stdout with the reference. References without a zip are skipped
# silently.
foreach my $w2v_fname (glob("t/data/*\.w2v")) {
    (my $base_fname = $w2v_fname) =~ s/(.*)\.w2v/$1.zip/;
    next unless -e $base_fname;

    my $in;
    unless (open($in, '<', $w2v_fname)) {
        fail("cannot open file $w2v_fname");
        next;
    }
    # Slurp the whole reference file in one read.
    my $expected = do { local $/; <$in> };
    close($in);

    script_runs(
        [
            'script/korapxml2conllu',
            '-m', '<textSigle>([^<.]+)',
            '-m', '<creatDate>([^<]{4,7})',
            '--word2vec',
            $base_fname,
        ],
        "Runs korapxml2conllu with base input and w2v and metadata output"
    );
    script_stdout_is($expected, "Converts $base_fname correctly to word2vec input format together with some metadata");
}
| 114 | |
# One-column mode (-c 1): stdout for t/data/goe.zip has to equal the
# reference file t/data/goe.1c.txt.
# $expected stays file-scoped because later tests assign to it again.
my $expected;
{
    if (open(my $in, '<', 't/data/goe.1c.txt')) {
        # Slurp the whole reference file in one read.
        $expected = do { local $/; <$in> };
        close($in);
    }
    else {
        fail("cannot open file.");
    }
}
script_runs(
    [ 'script/korapxml2conllu', '-c', '1', 't/data/goe.zip' ],
    "Runs korapxml2conllu in one column mode"
);
script_stdout_is($expected, "Converts correctly in one column mode.");
| 125 | |
# Scratch directory for the archives generated by the tests that follow.
my $test_tempdir = tempdir();

# Load the CoNLL-U reference used by the round-trip test into $expected.
# On failure $expected keeps whatever it held before.
my $conllu_fname = "t/data/goe.morpho.conllu";
{
    if (open(my $in, '<', $conllu_fname)) {
        # Slurp the whole reference file in one read.
        $expected = do { local $/; <$in> };
        close($in);
    }
    else {
        fail("cannot open file $conllu_fname");
    }
}

# Sanity check on the reference itself: it must be longer than 100 characters.
ok(length($expected) > 100, 'Output is not empty');
Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 137 | |
# Full round trip: CoNLL-U -> KorAP-XML zip (conllu2korapxml) -> CoNLL-U
# (korapxml2conllu). The final output has to equal $expected.
# $zipfile, $zipcontent and $fh stay file-scoped; later tests reuse them.
my $zipfile = "$test_tempdir/goe.tree_tagger.zip";
my $zipcontent;
script_runs([ 'script/conllu2korapxml', "t/data/goe.morpho.conllu" ], {stdout => \$zipcontent},
    "Converts t/data/goe.morpho.conllu to KorAP-XML zip");

# Write the captured archive to disk. The handle is opened with :raw because
# the payload is binary zip data, and nothing is printed to a handle that
# failed to open.
my $fh;
if (open($fh, '>:raw', $zipfile)) {
    print {$fh} $zipcontent;
    close($fh) or fail("cannot close file $zipfile: $!");
}
else {
    fail("cannot open file $zipfile for writing");
}

# Put the base zip next to the generated annotation zip in the temp dir.
copy("t/data/goe.zip", $test_tempdir)
    or fail("cannot copy t/data/goe.zip to $test_tempdir: $!");

script_runs([ 'script/korapxml2conllu', "$test_tempdir/goe.tree_tagger.zip" ],
    "Converts $test_tempdir/goe.tree_tagger.zip to CoNLL-U");
script_stdout_is $expected, "Full round trip: Converts goe.morpho.conllu to KorAP-XML and back to CoNLL-U correctly";
Marc Kupietz | eb7d06a | 2021-03-19 16:29:16 +0100 | [diff] [blame] | 149 | |
# Attribute extraction (-e div/type) on a morpho zip: both div/type comment
# lines have to show up in stdout.
script_runs(
    [ 'script/korapxml2conllu', '-e', 'div/type', "t/data/goe.tree_tagger.zip" ],
    "Runs korapxml2conllu with morpho input and attribute extraction"
);
for my $wanted ("\n# div/type = Autobiographie\n", "\n# div/type = section\n") {
    script_stdout_like($wanted, "Extracts attributes from morpho zips");
}
| 153 | |
# Attribute extraction with a regex selector on a base zip. Each entry below
# is [ pattern expected in stdout, test description ], checked in this order.
script_runs(
    [ 'script/korapxml2conllu', '-e', '(posting/id|div/id)', "t/data/wdf19.zip" ],
    "Runs korapxml2conllu with base input and regex attribute extraction"
);
for my $check (
    [ "\n# posting/id = i.13075_11_45", "Extracts multiple attributes from base zips (1)" ],
    [ "\n# div/id = i.13075_14", "Extracts multiple attributes from base zips (2)" ],
    [ "\n# posting/id = i.14548_9_1\n3\tbonjour", "Extracts attributes in the right place" ],
    [ "\n# posting/id = i.12610_4_4", "Extracts directly adjacent postings from base zips (1)" ],
    [ "\n# posting/id = i.12610_4_5", "Extracts directly adjacent postings from base zips (2)" ],
    [ "\n# posting/id = i.14548_9_1", "Extracts last postings in base zip" ],
) {
    my ($wanted, $description) = @{$check};
    script_stdout_like($wanted, $description);
}
| 161 | |
# Attribute extraction with a regex selector on a morpho zip. Each entry
# below is [ pattern expected in stdout, test description ], checked in this
# order.
script_runs(
    [ 'script/korapxml2conllu', '-e', '(posting/id|div/id)', "t/data/wdf19.tree_tagger.zip" ],
    "Runs korapxml2conllu with morpho input and regex attribute extraction"
);
for my $check (
    [ "\n# posting/id = i.13075_11_45", "Extracts multiple attributes from morpho zips (1)" ],
    [ "\n# div/id = i.13075_14", "Extracts multiple attributes from morpho zips (2)" ],
    [ "\n# posting/id = i.12610_4_4", "Extracts directly adjacent postings from morpho zips (1)" ],
    [ "\n# posting/id = i.12610_4_5", "Extracts directly adjacent postings from morpho zips (2)" ],
    [ "\n# posting/id = i.14548_9_1", "Extracts last postings in morpho zip" ],
) {
    my ($wanted, $description) = @{$check};
    script_stdout_like($wanted, $description);
}
| 168 | |
# Convert a TSV file whose lemma column is "_" and inspect the generated
# KorAP-XML: no lemma feature may be written, while pos features must be.
$zipfile = "$test_tempdir/without_lemma.zip";
script_runs([ 'script/conllu2korapxml', "t/data/without_lemma.tsv" ], {stdout => \$zipcontent},
    "Converts t/data/without_lemma.tsv to KorAP-XML zip");

# Write the captured archive to disk. The handle is opened with :raw because
# the payload is binary zip data, and nothing is printed to a handle that
# failed to open.
if (open($fh, '>:raw', $zipfile)) {
    print {$fh} $zipcontent;
    close($fh) or fail("cannot close file $zipfile: $!");
}
else {
    fail("cannot open file $zipfile for writing");
}

# Locate the unzip executable; later tests reuse $UNZIP.
my $UNZIP = `sh -c 'command -v unzip'`;
chomp $UNZIP;

if ($UNZIP eq '') {
    warn('No unzip executable found in PATH.');
    # 'return' is not allowed at file level outside a subroutine, so finish
    # the plan and leave the script explicitly instead.
    done_testing;
    exit 0;
}
$zipcontent = `$UNZIP -c $zipfile`;
# The attribute is written as name="lemma" (no space before '='); whitespace
# around '=' is tolerated so the check cannot be bypassed by formatting.
unlike($zipcontent, qr/name\s*=\s*"lemma"/, "conllu2korapxml igores _ lemmas.");
# NOTE(review): the unescaped '|' makes this an alternation between
# '<f name="pos">NN' and 'NN</f>'. Confirm whether a literal "NN|NN" value was
# meant; the pattern is left unchanged.
like($zipcontent, qr/.*<f name="pos">NN|NN<\/f>.*/, "conllu2korapxml does not ignore pos for _ lemmas.");
Marc Kupietz | bcb55b8 | 2022-09-15 11:42:26 +0200 | [diff] [blame] | 185 | |
# Convert UDPipe-style CoNLL-U with debug logging, check the foundry messages
# on stderr, then inspect the generated archive's listing and contents.
script_runs([ 'script/conllu2korapxml', '-l', 'debug', 't/data/goe.ud.conllu' ], {stdout => \$zipcontent}, "Runs conllu2korap with UDPipe and unparsable comments");
script_stderr_like "Foundry:\\s+ud", "Found generator based foundry";
script_stderr_like "Ignored\\s+foundry\\s+name:\\s+base", "Ignore defined foundry";

# Write the captured archive to disk. The handle is opened with :raw because
# the payload is binary zip data, and nothing is printed to a handle that
# failed to open.
$zipfile = "$test_tempdir/goe.ud.zip";
if (open($fh, '>:raw', $zipfile)) {
    print {$fh} $zipcontent;
    close($fh) or fail("cannot close file $zipfile: $!");
}
else {
    fail("cannot open file $zipfile for writing");
}

# Archive listing (unzip -Z): member paths and permission bits.
$zipcontent = `$UNZIP -Z $zipfile`;
like($zipcontent, qr@GOE/AGA/00000/ud/morpho\.xml@, "conllu2korapxml UDPipe input conversion contains morpho layer with foundry name 'ud'");
like($zipcontent, qr@GOE/AGA/00000/ud/dependency\.xml@, "conllu2korapxml UDPipe input conversion contains dependency layer with foundry name 'ud'");
like($zipcontent, qr@rw-rw-rw-.*GOE/AGA/00000/ud/morpho\.xml@, "KorAP-XML zip contents have read and write permissions");

# Archive contents (unzip -c): upos/pos features, with "_" values left out.
$zipcontent = `$UNZIP -c $zipfile`;
like($zipcontent, qr/.*<f name="upos">VERB<\/f>.*/, "conllu2korapxml extracts upos tags.");
like($zipcontent, qr/.*<f name="pos">VVFIN<\/f>.*/, "conllu2korapxml extracts (x)pos tags.");
unlike($zipcontent, qr/.*<f name="pos">_<\/f>.*/, "conllu2korapxml ignores _ pos tags.");
unlike($zipcontent, qr/.*<f name="upos">_<\/f>.*/, "conllu2korapxml ignores _ upos tags.");
| 204 | |
# -o option: conllu2korapxml writes the archive to $zipfile itself instead of
# stdout.
$zipfile = "$test_tempdir/goe.ud2.zip";
script_runs([ 'script/conllu2korapxml', '-l', 'warn', '-o', $zipfile, 't/data/goe.ud.conllu' ], {}, "Runs conllu2korap with -o option");
# List the archive that -o just produced. Without this, the assertions below
# would only re-check the stale listing left over from the previous archive.
$zipcontent = `$UNZIP -l $zipfile`;
like($zipcontent, qr@GOE/AGA/00000/ud/morpho\.xml@, "conllu2korapxml UDPipe input conversion -o option contains morpho layer with foundry name 'ud'");
like($zipcontent, qr@GOE/AGA/00000/ud/dependency\.xml@, "conllu2korapxml UDPipe input conversion with -o option contains dependency layer with foundry name 'ud'");
| 209 | |
# Input without token offsets and text ids: the converter has to warn about
# both on stderr and must not emit a fileparse() error.
script_runs(
    [ 'script/conllu2korapxml', 't/data/deu-deps.conllu' ],
    "Runs conllu2korap with UDPipe input"
);
script_stderr_unlike(
    "fileparse(): need a valid pathname",
    "Ignore sent_id and newdoc id"
);
script_stderr_like(
    qr@WARNING: Invalid input in.*deu-deps.conllu.*token offsets missing.*in line \d+@,
    "Warn on missing token offsets"
);
script_stderr_like(
    qr@WARNING: Invalid input in.*deu-deps.conllu.*text.id .*missing.*in line \d+@,
    "Warn on missing text ids"
);
Marc Kupietz | bcb55b8 | 2022-09-15 11:42:26 +0200 | [diff] [blame] | 214 | |
# NKJP test data: no "uninitialized value" warnings on stderr, and one known
# token line has to appear in stdout.
script_runs(
    [ 'script/korapxml2conllu', "t/data/nkjp.zip" ],
    "Runs korapxml2conllu on nkjp test data"
);
script_stderr_unlike(
    "Use of uninitialized value",
    "Handles lonely docid parameters (line separated from layer elements)"
);
script_stdout_like(
    "\n9\twesołości\twesołość\tsubst\tsubst\tsg:gen:f",
    "Correctly converts nkjp annotations"
);
Marc Kupietz | 534df18 | 2022-12-16 15:00:30 +0100 | [diff] [blame] | 218 | |
# --sigle-pattern on a zip that holds base and morpho data together: only the
# text matching the pattern may appear in stdout.
script_runs(
    [ 'script/korapxml2conllu', "--sigle-pattern", "KOT", "t/data/nkjp.zip" ],
    "Runs korapxml2conllu with --sigle-pattern option on combined base/morpho files"
);
script_stdout_like(
    "NKJP/NKJP/KOT/nkjp/morpho.xml",
    "--sigle-pattern to specify a doc sigle pattern extracts the right texts"
);
script_stdout_unlike(
    "NKJP/NKJP/KolakowskiOco/nkjp/morpho.xml",
    "--sigle-pattern to specify a doc sigle pattern does not extract the wrong texts"
);

# --sigle-pattern on a separate morpho zip.
script_runs(
    [ 'script/korapxml2conllu', "--sigle-pattern", "13072", "t/data/wdf19.tree_tagger.zip" ],
    "Runs korapxml2conllu with --sigle-pattern option on seprate base/morpho files"
);
script_stdout_like(
    "WDF19/A0000/13072/tree_tagger/morpho.xml",
    "--sigle-pattern to specify a text sigle pattern extracts the right texts"
);
script_stdout_unlike(
    "WDF19/A0000/14247/tree_tagger/morpho.xml",
    "--sigle-pattern to specify a text sigle pattern does not extract the wrong texts"
);
| 226 | |
# Deliberately broken NKJP data: the offset error has to be reported on stderr.
script_runs(
    [ 'script/korapxml2conllu', "t/data/nkjp-fail.zip" ],
    "Runs korapxml2conllu on nkjp-fail test data"
);
script_stderr_like(
    "could not retrieve token at 1297-1298/ 1297 - ending with: e! upadku.",
    "Offset error"
);
| 229 | |
# Input annotated by marmot (morphology) and malt (dependencies): the
# generated archive has to contain one layer file per foundry.
script_runs([ 'script/conllu2korapxml', 't/data/goe.marmot-malt.conllu' ], {stdout => \$zipcontent}, "Runs conllu2korap with marmot and malt annotations");

# Write the captured archive to disk. The handle is opened with :raw because
# the payload is binary zip data, and nothing is printed to a handle that
# failed to open.
$zipfile = "$test_tempdir/goe.marmalt.zip";
if (open($fh, '>:raw', $zipfile)) {
    print {$fh} $zipcontent;
    close($fh) or fail("cannot close file $zipfile: $!");
}
else {
    fail("cannot open file $zipfile for writing");
}

# Check the member paths in the archive listing.
$zipcontent = `$UNZIP -l $zipfile`;
like($zipcontent, qr@GOE/AGA/00000/marmot/morpho\.xml@, "conllu2korapxml can handle different foundries for motpho and dependency layers");
like($zipcontent, qr@GOE/AGA/00000/malt/dependency\.xml@, "conllu2korapxml sets the secondary dependency foundry correctly");
| 238 | |
# -f 'upos dependency:gsd': the archive is expected to hold the morpho layer
# under "upos" and the dependency layer under "gsd" (see the assertions
# below). The description used to be a copy of the marmot/malt test's and has
# been corrected to name the option that is actually exercised.
script_runs([ 'script/conllu2korapxml', '-f', 'upos dependency:gsd', 't/data/goe.ud.conllu' ], {stdout => \$zipcontent}, "Runs conllu2korap with -f 'upos dependency:gsd'");

# Write the captured archive to disk. The handle is opened with :raw because
# the payload is binary zip data, and nothing is printed to a handle that
# failed to open.
$zipfile = "$test_tempdir/goe.marmalt.zip";
if (open($fh, '>:raw', $zipfile)) {
    print {$fh} $zipcontent;
    close($fh) or fail("cannot close file $zipfile: $!");
}
else {
    fail("cannot open file $zipfile for writing");
}

# Check the member paths in the archive listing.
$zipcontent = `$UNZIP -l $zipfile`;
like($zipcontent, qr@GOE/AGA/00000/upos/morpho\.xml@, "conllu2korapxml can handle different foundries for motpho and dependency layers");
like($zipcontent, qr@GOE/AGA/00000/gsd/dependency\.xml@, "conllu2korapxml sets the secondary dependency foundry correctly");

done_testing;