| Marc Kupietz | bb739b0 | 2020-09-22 16:49:34 +0200 | [diff] [blame] | 1 | use strict; | 
|  | 2 | use warnings; | 
| Akron | f2b0bba | 2022-12-16 18:00:08 +0100 | [diff] [blame^] | 3 | use Test::More tests => 61; | 
| Marc Kupietz | bb739b0 | 2020-09-22 16:49:34 +0200 | [diff] [blame] | 4 | use Test::Script; | 
| Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 5 | use Test::TempDir::Tiny; | 
|  | 6 | use File::Copy; | 
| Marc Kupietz | bb739b0 | 2020-09-22 16:49:34 +0200 | [diff] [blame] | 7 |  | 
# Asking for help: the script is expected to exit with status 1 and to
# print a text containing "Description" on standard output.
script_runs(
    [ 'script/korapxml2conllu', '-h' ],
    { exit => 1 },
);
script_stdout_like("Description", "Can print help message");
| Marc Kupietz | bb739b0 | 2020-09-22 16:49:34 +0200 | [diff] [blame] | 10 |  | 
# For every archive matching t/data/*.*.zip: derive the base archive name
# (<base>.zip) and the expected-output fixture (<base>.morpho.conllu), run
# the converter on the archive and compare its standard output with the
# fixture.  A missing base archive, a missing fixture or an unreadable
# fixture is reported as a failed test and the archive is skipped.
MORPHO_ZIP:
foreach my $morpho_fname (glob("t/data/*\.*\.zip")) {
    (my $base_fname = $morpho_fname) =~ s/(.*)\..*\.zip/$1.zip/;
    unless (-e $base_fname) {
        fail("cannot find $base_fname");
        next MORPHO_ZIP;
    }

    (my $conllu_fname = $base_fname) =~ s/(.*)\.zip/$1.morpho.conllu/;
    unless (-e $conllu_fname) {
        fail("cannot find $conllu_fname");
        next MORPHO_ZIP;
    }

    my $in;
    unless (open($in, '<', $conllu_fname)) {
        fail("cannot open file $conllu_fname");
        next MORPHO_ZIP;
    }

    # Slurp the whole fixture in a single read.
    my $want = do { local $/; <$in> };
    close($in);

    script_runs(
        [ 'script/korapxml2conllu', $morpho_fname ],
        "Runs korapxml2conllu with pos and lemma annotated input"
    );
    script_stdout_is($want, "Converts $morpho_fname correctly");
}
| Marc Kupietz | d845583 | 2021-02-11 17:30:29 +0100 | [diff] [blame] | 36 |  | 
# Same archives as above, but converted with --s-bounds-from-morpho and
# compared against the <base>.morpho.sbfm.conllu fixture.  Missing or
# unreadable files count as a failed test and skip the archive.
SBFM_ZIP:
foreach my $morpho_fname (glob("t/data/*\.*\.zip")) {
    (my $base_fname = $morpho_fname) =~ s/(.*)\..*\.zip/$1.zip/;
    if (not -e $base_fname) {
        fail("cannot find $base_fname");
        next SBFM_ZIP;
    }

    (my $conllu_fname = $base_fname) =~ s/(.*)\.zip/$1.morpho.sbfm.conllu/;
    if (not -e $conllu_fname) {
        fail("cannot find $conllu_fname");
        next SBFM_ZIP;
    }

    my $fixture;
    if (not open($fixture, '<', $conllu_fname)) {
        fail("cannot open file $conllu_fname");
        next SBFM_ZIP;
    }

    # Read the complete fixture at once.
    my $want = do { local $/; <$fixture> };
    close($fixture);

    script_runs(
        [ 'script/korapxml2conllu', '--s-bounds-from-morpho', $morpho_fname ],
        "Runs korapxml2conllu with --s-bounds-from-morpho correctly"
    );
    script_stdout_is($want, "Converts $morpho_fname correctly");
}
|  | 62 |  | 
# Convert every t/data/*.zip archive that has a sibling <name>.conllu
# fixture and compare the converter's standard output with that fixture.
# Archives without a fixture are skipped silently; an unreadable fixture
# is reported as a failed test.
BASE_ZIP:
foreach my $base_fname (glob("t/data/*\.zip")) {
    (my $conllu_fname = $base_fname) =~ s/(.*)\.zip/$1.conllu/;
    -e $conllu_fname or next BASE_ZIP;

    my $in;
    unless (open($in, '<', $conllu_fname)) {
        fail("cannot open file $conllu_fname");
        next BASE_ZIP;
    }

    # Slurp the fixture.
    my $want = do { local $/; <$in> };
    close($in);

    script_runs(
        [ 'script/korapxml2conllu', $base_fname ],
        "Runs korapxml2conllu with base input"
    );
    script_stdout_is($want, "Converts $base_fname correctly to CoNLL-U");
}
|  | 79 |  | 
# For every t/data/*.w2v_simple fixture with a matching <name>.zip archive,
# run the converter with --word2vec and compare standard output with the
# fixture.  Fixtures without an archive are skipped silently; an unreadable
# fixture is reported as a failed test.
W2V_SIMPLE:
foreach my $w2v_fname (glob("t/data/*\.w2v_simple")) {
    (my $base_fname = $w2v_fname) =~ s/(.*)\.w2v_simple/$1.zip/;
    -e $base_fname or next W2V_SIMPLE;

    my $in;
    unless (open($in, '<', $w2v_fname)) {
        fail("cannot open file $w2v_fname");
        next W2V_SIMPLE;
    }

    # Slurp the fixture.
    my $want = do { local $/; <$in> };
    close($in);

    script_runs(
        [ 'script/korapxml2conllu', '--word2vec', $base_fname ],
        "Runs korapxml2conllu with base input and w2v output"
    );
    script_stdout_is($want, "Converts $base_fname correctly to word2vec input format");
}
|  | 96 |  | 
# For every t/data/*.w2v fixture with a matching <name>.zip archive, run the
# converter with --word2vec plus two -m patterns and compare standard output
# with the fixture.  Fixtures without an archive are skipped silently; an
# unreadable fixture is reported as a failed test.
W2V_META:
foreach my $w2v_fname (glob("t/data/*\.w2v")) {
    (my $base_fname = $w2v_fname) =~ s/(.*)\.w2v/$1.zip/;
    -e $base_fname or next W2V_META;

    my $in;
    unless (open($in, '<', $w2v_fname)) {
        fail("cannot open file $w2v_fname");
        next W2V_META;
    }

    # Slurp the fixture.
    my $want = do { local $/; <$in> };
    close($in);

    script_runs(
        [
            'script/korapxml2conllu',
            '-m', '<textSigle>([^<.]+)',
            '-m', '<creatDate>([^<]{4,7})',
            '--word2vec',
            $base_fname,
        ],
        "Runs korapxml2conllu with base input and w2v and metadata output"
    );
    script_stdout_is($want, "Converts $base_fname correctly to word2vec input format together with some metadata");
}
|  | 113 |  | 
# One-column mode (-c 1): the output for t/data/goe.zip must equal the
# t/data/goe.1c.txt fixture.  $expected is declared at file scope because
# later parts of this script assign to it again.
my $expected;
if (open(my $one_col_fh, '<', 't/data/goe.1c.txt')) {
    # Slurp the fixture in one read.
    $expected = do { local $/; <$one_col_fh> };
    close($one_col_fh);
}
else {
    fail("cannot open file.");
}
script_runs(
    [ 'script/korapxml2conllu', '-c', '1', 't/data/goe.zip' ],
    "Runs korapxml2conllu in one column mode"
);
script_stdout_is($expected, "Converts correctly in one column mode.");
|  | 124 |  | 
# Full round trip: convert t/data/goe.morpho.conllu to a KorAP-XML zip with
# conllu2korapxml, store the zip in a temporary directory, convert it back
# with korapxml2conllu and expect the original CoNLL-U text.
my $test_tempdir = tempdir();
my $conllu_fname = "t/data/goe.morpho.conllu";
if (open(my $in, '<', $conllu_fname)) {
    # Slurp the expected CoNLL-U text.
    local $/;
    $expected = <$in>;
    close($in);
} else {
    fail("cannot open file $conllu_fname");
}

# Guard against an undefined $expected so the check fails cleanly instead of
# emitting an "uninitialized value" warning.
ok(length($expected // '') > 100, 'Output is not empty');

my $zipfile = "$test_tempdir/goe.tree_tagger.zip";
my $zipcontent;
script_runs([ 'script/conllu2korapxml', "t/data/goe.morpho.conllu" ], {stdout => \$zipcontent},
    "Converts t/data/goe.morpho.conllu to KorAP-XML zip");

# $fh is declared at file scope on purpose: later parts of this script reuse
# it for further output files.
my $fh;
if (open($fh, ">", $zipfile)) {
    # The captured output is a zip archive, so write it as raw bytes.
    binmode($fh);
    print {$fh} $zipcontent;
    close($fh) or fail("cannot close file $zipfile after writing");
} else {
    # Do not print to a handle that could not be opened.
    fail("cannot open file $zipfile for writing");
}

# The base archive is placed next to the generated annotation archive;
# presumably korapxml2conllu looks for <base>.zip beside <base>.<foundry>.zip
# (TODO confirm against the script).
copy("t/data/goe.zip", $test_tempdir)
    or diag("cannot copy t/data/goe.zip to $test_tempdir: $!");
script_runs([ 'script/korapxml2conllu', "$test_tempdir/goe.tree_tagger.zip" ],
    "Converts $test_tempdir/goe.tree_tagger.zip to CoNLL-U");
script_stdout_is $expected, "Full round trip: Converts goe.morpho.conllu to KorAP-XML and back to CoNLL-U correctly";
| Marc Kupietz | eb7d06a | 2021-03-19 16:29:16 +0100 | [diff] [blame] | 148 |  | 
|  | 149 | script_runs([ 'script/korapxml2conllu', '-e',  'div/type', "t/data/goe.tree_tagger.zip" ], "Runs korapxml2conllu with morpho input and attribute extraction"); | 
|  | 150 | script_stdout_like "\n# div/type = Autobiographie\n", "Extracts attributes from morpho zips"; | 
|  | 151 | script_stdout_like "\n# div/type = section\n", "Extracts attributes from morpho zips"; | 
|  | 152 |  | 
|  | 153 | script_runs([ 'script/korapxml2conllu', '-e',  '(posting/id|div/id)', "t/data/wdf19.zip" ], "Runs korapxml2conllu with base input and regex attribute extraction"); | 
|  | 154 | script_stdout_like "\n# posting/id = i.13075_11_45", "Extracts multiple attributes from base zips (1)"; | 
|  | 155 | script_stdout_like "\n# div/id = i.13075_14", "Extracts multiple attributes from base zips (2)"; | 
|  | 156 | script_stdout_like "\n# posting/id = i.14548_9_1\n3\tbonjour", "Extracts attributes in the right place"; | 
|  | 157 | script_stdout_like "\n# posting/id = i.12610_4_4", "Extracts directly adjacent postings from base zips (1)"; | 
|  | 158 | script_stdout_like "\n# posting/id = i.12610_4_5", "Extracts directly adjacent postings from base zips (2)"; | 
| Marc Kupietz | ab15023 | 2021-07-31 23:41:47 +0200 | [diff] [blame] | 159 | script_stdout_like "\n# posting/id = i.14548_9_1", "Extracts last postings in base zip"; | 
|  | 160 |  | 
|  | 161 | script_runs([ 'script/korapxml2conllu', '-e',  '(posting/id|div/id)', "t/data/wdf19.tree_tagger.zip" ], "Runs korapxml2conllu with morpho input and regex attribute extraction"); | 
|  | 162 | script_stdout_like "\n# posting/id = i.13075_11_45", "Extracts multiple attributes from morpho zips (1)"; | 
|  | 163 | script_stdout_like "\n# div/id = i.13075_14", "Extracts multiple attributes from morpho zips (2)"; | 
|  | 164 | script_stdout_like "\n# posting/id = i.12610_4_4", "Extracts directly adjacent postings from morpho zips (1)"; | 
|  | 165 | script_stdout_like "\n# posting/id = i.12610_4_5", "Extracts directly adjacent postings from morpho zips (2)"; | 
|  | 166 | script_stdout_like "\n# posting/id = i.14548_9_1", "Extracts last postings in morpho zip"; | 
|  | 167 |  | 
# Convert a TSV file whose lemma column is empty ("_") to a KorAP-XML zip and
# inspect the zip content: no lemma feature may be written, but the pos
# feature must still be present.
$zipfile = "$test_tempdir/without_lemma.zip";
script_runs([ 'script/conllu2korapxml', "t/data/without_lemma.tsv" ], {stdout => \$zipcontent},
    "Converts t/data/without_lemma.tsv to KorAP-XML zip");
if (open($fh, ">", $zipfile)) {
    # The captured output is a zip archive, so write it as raw bytes.
    binmode($fh);
    print {$fh} $zipcontent;
    close($fh) or fail("cannot close file $zipfile after writing");
} else {
    # Do not print to a handle that could not be opened.
    fail("cannot open file $zipfile for writing");
}

# Locate an unzip executable; backticks yield undef if the shell cannot be
# started, so fall back to the empty string.
my $UNZIP = `sh -c 'command -v unzip'` // '';
chomp $UNZIP;

SKIP: {
    # A bare "return" at file scope dies with "Can't return outside a
    # subroutine"; skip the two unzip-dependent tests instead so the planned
    # test count is still met.
    if ($UNZIP eq '') {
        warn('No unzip executable found in PATH.');
        skip('No unzip executable found in PATH.', 2);
    }

    $zipcontent = `$UNZIP -c $zipfile`;

    # Whitespace around "=" is optional here: the former pattern required a
    # space before "=", which differs from the name="..." form used by the
    # pos pattern below and would make this check pass vacuously.
    unlike($zipcontent, qr/name\s*=\s*"lemma"/, "conllu2korapxml igores _ lemmas.");

    # NOTE(review): the unescaped "|" makes this an alternation between
    # '<f name="pos">NN' and 'NN</f>' -- confirm whether a literal "NN|NN"
    # was intended.  Left unchanged.
    like($zipcontent, qr/.*<f name="pos">NN|NN<\/f>.*/, "conllu2korapxml does not ignore pos for _ lemmas.");
}
| Marc Kupietz | bcb55b8 | 2022-09-15 11:42:26 +0200 | [diff] [blame] | 184 |  | 
# Convert t/data/goe.ud.conllu with debug logging, check the foundry messages
# on standard error, then store the produced zip and verify via its listing
# that the morpho and dependency layers are filed under the 'ud' foundry.
script_runs([ 'script/conllu2korapxml', '-l', 'debug', 't/data/goe.ud.conllu' ], {stdout => \$zipcontent}, "Runs conllu2korap with UDPipe and unparsable comments");
script_stderr_like "Foundry:\\s+ud", "Found generator based foundry";
script_stderr_like "Ignored\\s+foundry\\s+name:\\s+base", "Ignore defined foundry";

$zipfile = "$test_tempdir/goe.ud.zip";
if (open($fh, ">", $zipfile)) {
    # The captured output is a zip archive, so write it as raw bytes.
    binmode($fh);
    print {$fh} $zipcontent;
    close($fh) or fail("cannot close file $zipfile after writing");
} else {
    # Do not print to a handle that could not be opened.
    fail("cannot open file $zipfile for writing");
}

SKIP: {
    # Without an unzip executable the listing cannot be produced; skip the
    # two dependent tests rather than running an empty command.
    skip('No unzip executable found in PATH.', 2)
        unless defined $UNZIP && length $UNZIP;

    $zipcontent = `$UNZIP -l $zipfile`;
    like($zipcontent, qr@GOE/AGA/00000/ud/morpho\.xml@, "conllu2korapxml UDPipe input conversion contains morpho layer with foundry name 'ud'");
    like($zipcontent, qr@GOE/AGA/00000/ud/dependency\.xml@, "conllu2korapxml UDPipe input conversion contains dependency layer with foundry name 'ud'");
}
|  | 197 |  | 
# Run conllu2korapxml on t/data/deu-deps.conllu and inspect standard error:
# no fileparse() complaint may appear, and the two "No valid input document"
# warnings (token offsets, text ids) must be present.
script_runs(
    [ 'script/conllu2korapxml', 't/data/deu-deps.conllu' ],
    "Runs conllu2korap with UDPipe input"
);
script_stderr_unlike(
    "fileparse(): need a valid pathname",
    "Ignore sent_id and newdoc id"
);
script_stderr_like(
    "WARNING: No valid input document.*token offsets missing",
    "Warn on missing token offsets"
);
script_stderr_like(
    qr@WARNING: No valid input document.*text.id .*missing@,
    "Warn on missing text ids"
);
| Marc Kupietz | bcb55b8 | 2022-09-15 11:42:26 +0200 | [diff] [blame] | 202 |  | 
# Plain conversion of the nkjp test archive: no "uninitialized value"
# warning on standard error, and one known token line in the output.
script_runs(
    [ 'script/korapxml2conllu', "t/data/nkjp.zip" ],
    "Runs korapxml2conllu on nkjp test data"
);
script_stderr_unlike(
    "Use of uninitialized value",
    "Handles lonely docid parameters (line separated from layer elements)"
);
script_stdout_like(
    "\n9\twesołości\twesołość\tsubst\tsubst\tsg:gen:f",
    "Correctly converts nkjp annotations"
);

# --sigle-pattern on an archive that combines base and morpho files: only
# the matching text may show up in the output.
script_runs(
    [ 'script/korapxml2conllu', "--sigle-pattern", "KOT", "t/data/nkjp.zip" ],
    "Runs korapxml2conllu with --sigle-pattern option on combined base/morpho files"
);
script_stdout_like(
    "NKJP/NKJP/KOT/nkjp/morpho.xml",
    "--sigle-pattern to specify a doc sigle pattern extracts the right texts"
);
script_stdout_unlike(
    "NKJP/NKJP/KolakowskiOco/nkjp/morpho.xml",
    "--sigle-pattern to specify a doc sigle pattern does not extract the wrong texts"
);

# --sigle-pattern on a morpho archive that is separate from its base archive.
script_runs(
    [ 'script/korapxml2conllu', "--sigle-pattern", "13072", "t/data/wdf19.tree_tagger.zip" ],
    "Runs korapxml2conllu with --sigle-pattern option on seprate base/morpho files"
);
script_stdout_like(
    "WDF19/A0000/13072/tree_tagger/morpho.xml",
    "--sigle-pattern to specify a text sigle pattern extracts the right texts"
);
script_stdout_unlike(
    "WDF19/A0000/14247/tree_tagger/morpho.xml",
    "--sigle-pattern to specify a text sigle pattern does not extract the wrong texts"
);
|  | 214 |  | 
# The nkjp-fail archive must still run, and the offset problem has to be
# reported on standard error.
script_runs(
    [ 'script/korapxml2conllu', "t/data/nkjp-fail.zip" ],
    "Runs korapxml2conllu on nkjp-fail test data"
);
script_stderr_like(
    "could not retrieve token at 1297-1298/ 1297  - ending with:  e! upadku.",
    "Offset error"
);

# Close the test run; the count is checked against the declared plan.
done_testing();