Add korapxml2conllu option -e <regex> to extract element/attributes
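
Values of matching element/attribute pairs are written as
"# element/attribute = value" comment lines within the token stream.

Example:

./script/korapxml2conllu -e '(posting/id|div/id)' -p "A0000" t/data/wdf19.zip | head -12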
# foundry = base
# filename = WDF19/A0000/10894/base/tokens.xml
# text_id = WDF19_A0000.10894
# start_offsets = 0 0 5 14 23 32 40 48 51 54 60
# end_offsets = 61 4 12 22 31 39 47 50 53 59 61
1 Arts _ _ _ _ _ _ _ _
2 visuels _ _ _ _ _ _ _ _
# div/id = i.10894_1
# posting/id = i.10894_1_1
3 Pourquoi _ _ _ _ _ _ _ _
4 toujours _ _ _ _ _ _ _ _
5 vouloir _ _ _ _ _ _ _ _
Change-Id: I2cedc6580699fab0db6794d0f3225ea4da72b30f
diff --git a/t/data/wdf19.zip b/t/data/wdf19.zip
new file mode 100644
index 0000000..61a8bdf
--- /dev/null
+++ b/t/data/wdf19.zip
Binary files differ
diff --git a/t/test.t b/t/test.t
index 22d261a..48bd2b1 100644
--- a/t/test.t
+++ b/t/test.t
@@ -1,6 +1,6 @@
use strict;
use warnings;
-use Test::More tests => 10;
+use Test::More tests => 19;
use Test::Script;
use Test::TempDir::Tiny;
use File::Copy;
@@ -75,4 +75,15 @@
script_runs([ 'script/korapxml2conllu', "$test_tempdir/goe.tree_tagger.zip" ],
"Converts $test_tempdir/goe.tree_tagger.zip to CoNLL-U");
script_stdout_is $expected, "Full round trip: Converts goe.morpho.conllu to KorAP-XML and back to CoNLL-U correctly";
+# Attribute extraction (-e) with a single element/attribute path on morpho input.
+script_runs([ 'script/korapxml2conllu', '-e', 'div/type', "t/data/goe.tree_tagger.zip" ], "Runs korapxml2conllu with morpho input and attribute extraction");
+script_stdout_like qr{\n# div/type = Autobiographie\n}, "Extracts attributes from morpho zips (1)";
+script_stdout_like qr{\n# div/type = section\n}, "Extracts attributes from morpho zips (2)";
+# Attribute extraction (-e) with a regex alternation on base input; ids are matched literally (escaped dots).
+script_runs([ 'script/korapxml2conllu', '-e', '(posting/id|div/id)', "t/data/wdf19.zip" ], "Runs korapxml2conllu with base input and regex attribute extraction");
+script_stdout_like qr{\n# posting/id = i\.13075_11_45}, "Extracts multiple attributes from base zips (1)";
+script_stdout_like qr{\n# div/id = i\.13075_14}, "Extracts multiple attributes from base zips (2)";
+script_stdout_like qr{\n# posting/id = i\.14548_9_1\n3\tbonjour}, "Extracts attributes in the right place";
+script_stdout_like qr{\n# posting/id = i\.12610_4_4}, "Extracts directly adjacent postings from base zips (1)";
+script_stdout_like qr{\n# posting/id = i\.12610_4_5}, "Extracts directly adjacent postings from base zips (2)";
done_testing;