Marc Kupietz | bb739b0 | 2020-09-22 16:49:34 +0200 | [diff] [blame] | 1 | use strict; |
| 2 | use warnings; |
Marc Kupietz | ab15023 | 2021-07-31 23:41:47 +0200 | [diff] [blame^] | 3 | use Test::More tests => 28; |
Marc Kupietz | bb739b0 | 2020-09-22 16:49:34 +0200 | [diff] [blame] | 4 | use Test::Script; |
Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 5 | use Test::TempDir::Tiny; |
| 6 | use File::Copy; |
Marc Kupietz | bb739b0 | 2020-09-22 16:49:34 +0200 | [diff] [blame] | 7 | |
# Calling the converter with -h is expected to end with exit status 1 and to
# write a text matching "Description" to standard output.
{
    my @help_cmd = ('script/korapxml2conllu', '-h');
    script_runs(\@help_cmd, { exit => 1 });
    script_stdout_like("Description", "Can print help message");
}
Marc Kupietz | bb739b0 | 2020-09-22 16:49:34 +0200 | [diff] [blame] | 10 | |
# For every archive matching t/data/*.*.zip, run the converter on it and
# compare its standard output with the reference file
# t/data/<name>.morpho.conllu. The plain archive t/data/<name>.zip must exist
# too; a missing archive or reference file is reported as a test failure.
foreach my $morpho_fname (glob("t/data/*\.*\.zip")) {
    # Strip the extra dot-separated component to obtain the base archive name.
    (my $base_fname = $morpho_fname) =~ s/(.*)\..*\.zip/$1.zip/;
    unless (-e $base_fname) {
        fail("cannot find $base_fname");
        next;
    }

    # The reference output lives next to the base archive.
    (my $conllu_fname = $base_fname) =~ s/(.*)\.zip/$1.morpho.conllu/;
    unless (-e $conllu_fname) {
        fail("cannot find $conllu_fname");
        next;
    }

    my $ref_fh;
    unless (open($ref_fh, '<', $conllu_fname)) {
        fail("cannot open file $conllu_fname");
        next;
    }
    # Undefine the record separator locally to read the whole file at once.
    my $expected = do { local $/; <$ref_fh> };
    close($ref_fh);

    script_runs(
        [ 'script/korapxml2conllu', $morpho_fname ],
        "Runs korapxml2conllu with pos and lemma annotated input"
    );
    script_stdout_is($expected, "Converts $morpho_fname correctly");
}
Marc Kupietz | d845583 | 2021-02-11 17:30:29 +0100 | [diff] [blame] | 36 | |
# For every archive t/data/*.zip that has a sibling reference file
# t/data/<name>.conllu, run the converter on the archive and compare its
# standard output with that file. Archives without a reference file are
# skipped silently.
foreach my $base_fname (glob("t/data/*\.zip")) {
    (my $conllu_fname = $base_fname) =~ s/(.*)\.zip/$1.conllu/;
    next unless -e $conllu_fname;

    my $ref_fh;
    unless (open($ref_fh, '<', $conllu_fname)) {
        fail("cannot open file $conllu_fname");
        next;
    }
    # Undefine the record separator locally to read the whole file at once.
    my $expected = do { local $/; <$ref_fh> };
    close($ref_fh);

    script_runs(
        [ 'script/korapxml2conllu', $base_fname ],
        "Runs korapxml2conllu with base input"
    );
    script_stdout_is($expected, "Converts $base_fname correctly to CoNLL-U");
}
| 53 | |
# Round-trip test: convert t/data/goe.morpho.conllu to a KorAP-XML zip with
# conllu2korapxml, convert that zip back with korapxml2conllu, and require the
# result to equal the original CoNLL-U file.
my $test_tempdir = tempdir();
my $expected;
my $conllu_fname = "t/data/goe.morpho.conllu";
if (open(my $in_fh, '<', $conllu_fname)) {
    # Undefine the record separator locally to read the whole file at once.
    local $/;
    $expected = <$in_fh>;
    close($in_fh);
} else {
    fail("cannot open file $conllu_fname");
}

# defined() keeps a failed read above from triggering an "uninitialized value"
# warning here; the check then simply fails.
ok(defined($expected) && length($expected) > 100, 'Output is not empty');

my $zipfile = "$test_tempdir/goe.tree_tagger.zip";
my $zipcontent;
script_runs([ 'script/conllu2korapxml', $conllu_fname ], { stdout => \$zipcontent },
    "Converts $conllu_fname to KorAP-XML zip");

# The captured output is a zip archive, so write it in binary mode (no newline
# translation), and only print to the handle when it was actually opened.
if (open(my $zip_fh, '>', $zipfile)) {
    binmode($zip_fh);
    print {$zip_fh} (defined $zipcontent ? $zipcontent : '');
    # Buffered write errors surface at close time.
    close($zip_fh) or fail("cannot close file $zipfile: $!");
} else {
    fail("cannot open file $zipfile for writing");
}

# NOTE(review): presumably korapxml2conllu looks for the base archive next to
# the annotation archive, hence the copy into the temp dir - confirm.
copy("t/data/goe.zip", $test_tempdir)
    or fail("cannot copy t/data/goe.zip to $test_tempdir: $!");

script_runs([ 'script/korapxml2conllu', $zipfile ],
    "Converts $zipfile to CoNLL-U");
script_stdout_is($expected, "Full round trip: Converts goe.morpho.conllu to KorAP-XML and back to CoNLL-U correctly");
Marc Kupietz | eb7d06a | 2021-03-19 16:29:16 +0100 | [diff] [blame] | 78 | |
# Attribute extraction with "-e div/type" on the goe.tree_tagger.zip fixture:
# the output must contain a "# div/type = ..." comment line for both the
# "Autobiographie" and the "section" value.
{
    my $attribute = 'div/type';
    my $archive   = "t/data/goe.tree_tagger.zip";
    script_runs(
        [ 'script/korapxml2conllu', '-e', $attribute, $archive ],
        "Runs korapxml2conllu with morpho input and attribute extraction"
    );
    script_stdout_like("\n# div/type = Autobiographie\n", "Extracts attributes from morpho zips");
    script_stdout_like("\n# div/type = section\n", "Extracts attributes from morpho zips");
}
| 82 | |
# Attribute extraction with a regular expression ("-e (posting/id|div/id)") on
# the wdf19.zip fixture. The expected comment lines are given as compiled
# regexes with the dot in each id escaped: script_stdout_like treats its first
# argument as a pattern, so a bare "." would match any character.
script_runs(
    [ 'script/korapxml2conllu', '-e', '(posting/id|div/id)', "t/data/wdf19.zip" ],
    "Runs korapxml2conllu with base input and regex attribute extraction"
);
script_stdout_like(qr{\n# posting/id = i\.13075_11_45}, "Extracts multiple attributes from base zips (1)");
script_stdout_like(qr{\n# div/id = i\.13075_14}, "Extracts multiple attributes from base zips (2)");
# The attribute comment must directly precede the token line it belongs to.
script_stdout_like(qr{\n# posting/id = i\.14548_9_1\n3\tbonjour}, "Extracts attributes in the right place");
script_stdout_like(qr{\n# posting/id = i\.12610_4_4}, "Extracts directly adjacent postings from base zips (1)");
script_stdout_like(qr{\n# posting/id = i\.12610_4_5}, "Extracts directly adjacent postings from base zips (2)");
script_stdout_like(qr{\n# posting/id = i\.14548_9_1}, "Extracts last postings in base zip");
| 90 | |
# Attribute extraction with a regular expression ("-e (posting/id|div/id)") on
# the wdf19.tree_tagger.zip fixture. The expected comment lines are given as
# compiled regexes with the dot in each id escaped: script_stdout_like treats
# its first argument as a pattern, so a bare "." would match any character.
script_runs(
    [ 'script/korapxml2conllu', '-e', '(posting/id|div/id)', "t/data/wdf19.tree_tagger.zip" ],
    "Runs korapxml2conllu with morpho input and regex attribute extraction"
);
script_stdout_like(qr{\n# posting/id = i\.13075_11_45}, "Extracts multiple attributes from morpho zips (1)");
script_stdout_like(qr{\n# div/id = i\.13075_14}, "Extracts multiple attributes from morpho zips (2)");
script_stdout_like(qr{\n# posting/id = i\.12610_4_4}, "Extracts directly adjacent postings from morpho zips (1)");
script_stdout_like(qr{\n# posting/id = i\.12610_4_5}, "Extracts directly adjacent postings from morpho zips (2)");
script_stdout_like(qr{\n# posting/id = i\.14548_9_1}, "Extracts last postings in morpho zip");

# Signal that the script ran to completion.
done_testing;