Add --word2vec option to produce one sentence per line output, optionally with leading metadata columns (via --extract-metadata-regex <regex>). Change-Id: Ic564c14bc08a0041798048a90e466b09ea24666e

commit: d0bf2773e39efa4afb80dd1682bcb94d4dd63d82 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Sun Jun 26 19:27:58 2022 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Wed Jun 29 11:41:42 2022 +0200
tree: f1d1150d782db69a78f043f9260fec0073261dd5
parent: 8eb468edb934f1c47a3dcbe269f8b1514aee5a49 [diff] [blame]
diff --git a/t/test.t b/t/test.t
index 4fd8c6f..99f6503 100644
--- a/t/test.t
+++ b/t/test.t

@@ -1,6 +1,6 @@
 use strict;
 use warnings;
-use Test::More tests => 37;
+use Test::More tests => 41;
 use Test::Script;
 use Test::TempDir::Tiny;
 use File::Copy;
@@ -77,6 +77,40 @@
     script_stdout_is $expected, "Converts $base_fname correctly to CoNLL-U";
 }
 
+for my $w2v_fname (glob("t/data/*\.w2v_simple")) {
+    my $base_fname = $w2v_fname =~ s/(.*)\.w2v_simple/$1.zip/r;
+    next if (!-e $base_fname);
+
+    my $expected;
+    if (open(my $fh, '<', $w2v_fname)) {
+        local $/;
+        $expected = <$fh>;
+        close($fh);
+    } else {
+        fail("cannot open file $w2v_fname");
+        next;
+    }
+    script_runs([ 'script/korapxml2conllu', '--word2vec', $base_fname ], "Runs korapxml2conllu with base input and w2v output");
+    script_stdout_is $expected, "Converts $base_fname correctly to word2vec input format";
+}
+
+for my $w2v_fname (glob("t/data/*\.w2v")) {
+    my $base_fname = $w2v_fname =~ s/(.*)\.w2v/$1.zip/r;
+    next if (!-e $base_fname);
+
+    my $expected;
+    if (open(my $fh, '<', $w2v_fname)) {
+        local $/;
+        $expected = <$fh>;
+        close($fh);
+    } else {
+        fail("cannot open file $w2v_fname");
+        next;
+    }
+    script_runs([ 'script/korapxml2conllu', '-m', '<textSigle>([^<.]+)', '-m', '<creatDate>([^<]{7})', '--word2vec', $base_fname ], "Runs korapxml2conllu with base input and w2v and metadata output");
+    script_stdout_is $expected, "Converts $base_fname correctly to word2vec input format together with some metadata";
+}
+
 my $expected;
 if (open(my $fh, '<', 't/data/goe.1c.txt')) {
     local $/;
commit	d0bf2773e39efa4afb80dd1682bcb94d4dd63d82	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Sun Jun 26 19:27:58 2022 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Wed Jun 29 11:41:42 2022 +0200
tree	f1d1150d782db69a78f043f9260fec0073261dd5
parent	8eb468edb934f1c47a3dcbe269f8b1514aee5a49 [diff] [blame]