Add --word2vec option to produce one-sentence-per-line output,
optionally with leading metadata columns (via --extract-metadata-regex
<regex>)
Change-Id: Ic564c14bc08a0041798048a90e466b09ea24666e
diff --git a/t/test.t b/t/test.t
index 4fd8c6f..99f6503 100644
--- a/t/test.t
+++ b/t/test.t
@@ -1,6 +1,6 @@
use strict;
use warnings;
-use Test::More tests => 37;
+use Test::More tests => 41;
use Test::Script;
use Test::TempDir::Tiny;
use File::Copy;
@@ -77,6 +77,40 @@
script_stdout_is $expected, "Converts $base_fname correctly to CoNLL-U";
}
+for my $w2v_fname (glob("t/data/*\.w2v_simple")) {
+ my $base_fname = $w2v_fname =~ s/(.*)\.w2v_simple/$1.zip/r;
+ next if (!-e $base_fname);
+
+ my $expected;
+ if (open(my $fh, '<', $w2v_fname)) {
+ local $/;
+ $expected = <$fh>;
+ close($fh);
+ } else {
+ fail("cannot open file $w2v_fname");
+ next;
+ }
+ script_runs([ 'script/korapxml2conllu', '--word2vec', $base_fname ], "Runs korapxml2conllu with base input and w2v output");
+ script_stdout_is $expected, "Converts $base_fname correctly to word2vec input format";
+}
+
+for my $w2v_fname (glob("t/data/*\.w2v")) {
+ my $base_fname = $w2v_fname =~ s/(.*)\.w2v/$1.zip/r;
+ next if (!-e $base_fname);
+
+ my $expected;
+ if (open(my $fh, '<', $w2v_fname)) {
+ local $/;
+ $expected = <$fh>;
+ close($fh);
+ } else {
+ fail("cannot open file $w2v_fname");
+ next;
+ }
+ script_runs([ 'script/korapxml2conllu', '-m', '<textSigle>([^<.]+)', '-m', '<creatDate>([^<]{7})', '--word2vec', $base_fname ], "Runs korapxml2conllu with base input and w2v and metadata output");
+ script_stdout_is $expected, "Converts $base_fname correctly to word2vec input format together with some metadata";
+}
+
my $expected;
if (open(my $fh, '<', 't/data/goe.1c.txt')) {
local $/;