Added batch processing class for documents
Change-Id: I91ab289ffff525978fce3079a4dad71caddfe98a
diff --git a/Changes b/Changes
index cd3dbfe..ca1cec6 100644
--- a/Changes
+++ b/Changes
@@ -1,10 +1,11 @@
-0.18 2016-07-06
+0.18 2016-07-08
- Added REI test.
- Added multiple archive support to korapxml2krill.
- Added support for prefix negation in korapxml2krill.
- Added support for Malt#Dependency.
- Improved test suite for caching and REI.
- Added support for MDParser annotation.
+ - Added batch processing class for documents.
0.17 2016-03-22
- Rewrite siglen to use slashes as separators.
diff --git a/MANIFEST b/MANIFEST
index c0ca61e..9911feb 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -1,6 +1,7 @@
lib/KorAP/XML/Krill.pm
lib/KorAP/XML/Log.pm
lib/KorAP/XML/Archive.pm
+lib/KorAP/XML/Batch/File.pm
lib/KorAP/XML/Tokenizer.pm
lib/KorAP/XML/Tokenizer/Match.pm
lib/KorAP/XML/Tokenizer/Range.pm
@@ -46,6 +47,7 @@
lib/KorAP/XML/Annotation/XIP/Morpho.pm
lib/KorAP/XML/Annotation/XIP/Sentences.pm
t/archive.t
+t/batch_file.t
t/meta.t
t/meta_caching.t
t/multiple_archives.t
diff --git a/Makefile.PL b/Makefile.PL
index b4bdacb..1ab8172 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -22,6 +22,7 @@
'List::MoreUtils' => 0.33,
'Parallel::ForkManager' => 1.17,
'IO::Compress::Gzip' => 2.069,
+ 'IO::Uncompress::Gunzip' => 2.069,
'IO::Dir::Recursive' => 0.03,
'File::Temp' => 0,
'Directory::Iterator' => 0,
diff --git a/lib/KorAP/XML/Batch/File.pm b/lib/KorAP/XML/Batch/File.pm
index de34e4f..d91074f 100644
--- a/lib/KorAP/XML/Batch/File.pm
+++ b/lib/KorAP/XML/Batch/File.pm
@@ -17,19 +17,24 @@
foundry => $param{foundry} // 'Base',
layer => $param{layer} // 'Tokens',
anno => $param{anno} // [[]],
- log => $param{log} // Mojo::Log->new,
+ log => $param{log} // Mojo::Log->new(level => 'fatal'),
primary => $param{primary},
pretty => $param{pretty},
gzip => $param{gzip} // 0
}, $class;
};
-
+# Process a file
sub process {
my $self = shift;
my $input = shift;
my $output = shift;
+ if (!$self->{overwrite} && $output && -e $output) {
+ $self->{log}->debug($output . ' already exists');
+ return;
+ };
+
# Create and parse new document
$input =~ s{([^/])$}{$1/};
my $doc = KorAP::XML::Krill->new(
@@ -85,3 +90,144 @@
};
1;
+
+__END__
+
+=pod
+
+=encoding utf8
+
+=head1 NAME
+
+KorAP::XML::Batch::File - Process multiple files with identical setup
+
+
+=head1 SYNOPSIS
+
+
+ # Create Converter Object
+ my $converter = KorAP::XML::Batch::File->new(
+ overwrite => 1,
+ gzip => 1
+ );
+
+ $converter->process('/my/data' => 'my-output.gz');
+
+=head1 DESCRIPTION
+
+Set up the configuration for a corpus and process
+multiple texts with the same configuration.
+
+=head1 METHODS
+
+=head2 new
+
+ my $converter = KorAP::XML::Batch::File->new(
+ overwrite => 1,
+ gzip => 1
+ );
+
+Construct a new converter object.
+Accepts the following parameters:
+
+=over 2
+
+=item cache
+
+A L<Cache::FastMmap> compatible cache object.
+
+=item meta_type
+
+Meta data type to be parsed. Defaults to C<I5>,
+also supports all classes in the C<KorAP::XML::Meta> namespace.
+
+=item overwrite
+
+Overwrite existing files!
+Defaults to C<false>.
+
+=item foundry
+
+The foundry to use for tokenization,
+defaults to C<Base>.
+
+=item layer
+
+The layer to use for tokenization,
+defaults to C<Tokens>.
+
+=item anno
+
+ my $converter = KorAP::XML::Batch::File->new(
+ anno => [
+ ['CoreNLP', 'Morpho'],
+ ['OpenNLP', 'Morpho']
+ ]
+ );
+
+An array reference of array references,
+containing annotation layers as foundry-layer
+pairs to parse.
+The list is empty by default.
+
+=item log
+
+A L<Mojo::Log> compatible log object.
+
+=item primary
+
+Export primary text associated with the document.
+Defaults to C<true>.
+
+=item pretty
+
+Pretty print the output JSON.
+Defaults to C<false>.
+
+=item gzip
+
+Compress the output using Gzip.
+This will be ignored, if the output is undefined
+(i.e. C<STDOUT>).
+Defaults to C<false>.
+
+=back
+
+=head2 process
+
+ $converter->process('/mydoc/');
+ $converter->process('/mydoc/', '/myoutput.gzip');
+
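+Process a document and write the result to a chosen output.
+The first argument is mandatory and represents the path
+to the directory containing the KorAP-XML files of the text.
+The second argument is optional and represents the file path to write to.
+If the second argument is not given, the result is written to C<STDOUT>
+(in that case, the C<gzip> parameter is ignored).
+If the output file already exists and C<overwrite> is not set,
+nothing is written and the method returns without processing.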
+
+=head1 AVAILABILITY
+
+ https://github.com/KorAP/KorAP-XML-Krill
+
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
+Author: L<Nils Diewald|http://nils-diewald.de/>
+
+KorAP::XML::Krill is developed as part of the
+L<KorAP|http://korap.ids-mannheim.de/>
+Corpus Analysis Platform at the
+L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
+member of the
+L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>
+and supported by the L<KobRA|http://www.kobra.tu-dortmund.de> project,
+funded by the
+L<Federal Ministry of Education and Research (BMBF)|http://www.bmbf.de/en/>.
+
+KorAP::XML::Krill is free software published under the
+L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
+
+=cut
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index affb633..5740492 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -517,7 +517,7 @@
sub to_data {
my $self = shift;
my $primary = defined $_[0] ? $_[0] : 1;
- my $legacy = defined $_[1] ? $_[1] : 0;
+ my $legacy = defined $_[1] ? $_[1] : 0;
my %data = %{$self->doc->to_hash};
my @fields;
@@ -538,6 +538,7 @@
else {
my $tokens = $self->to_hash;
+
$tokens->{text} = $self->doc->primary->data if $primary;
$data{data} = $tokens;
$data{version} = '0.03';
diff --git a/script/korapxml2krill b/script/korapxml2krill
index af75538..8a56858 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -14,7 +14,6 @@
use KorAP::XML::Krill;
use KorAP::XML::Archive;
use KorAP::XML::Tokenizer;
-use KorAP::XML::ProcessFile;
use Parallel::ForkManager;
# TODO: use Parallel::Loops
# TODO: make output files
diff --git a/t/annotation/corpus/doc/0001/corenlp/tokens.xml b/t/annotation/corpus/doc/0001/corenlp/tokens.xml
index 78f562b..6ff5eae 100644
--- a/t/annotation/corpus/doc/0001/corenlp/tokens.xml
+++ b/t/annotation/corpus/doc/0001/corenlp/tokens.xml
@@ -23,204 +23,5 @@
<span id="s2_n26" from="117" to="120"/>
<span id="s2_n28" from="120" to="121"/>
<span id="s2_n31" from="122" to="127"/>
- <span id="s2_n34" from="128" to="131"/>
- <span id="s2_n36" from="132" to="139"/>
- <span id="s2_n40" from="140" to="144"/>
- <span id="s2_n42" from="145" to="154"/>
- <span id="s2_n44" from="155" to="166"/>
- <span id="s2_n46" from="167" to="171"/>
- <span id="s2_n48" from="171" to="172"/>
- <span id="s3_n4" from="173" to="175"/>
- <span id="s3_n6" from="176" to="185"/>
- <span id="s3_n9" from="186" to="193"/>
- <span id="s3_n11" from="193" to="194"/>
- <span id="s3_n13" from="195" to="196"/>
- <span id="s3_n15" from="196" to="197"/>
- <span id="s4_n3" from="198" to="203"/>
- <span id="s4_n5" from="203" to="204"/>
- <span id="s4_n7" from="205" to="211"/>
- <span id="s4_n13" from="212" to="214"/>
- <span id="s4_n15" from="215" to="220"/>
- <span id="s4_n17" from="221" to="224"/>
- <span id="s4_n20" from="225" to="228"/>
- <span id="s4_n22" from="229" to="239"/>
- <span id="s4_n25" from="240" to="243"/>
- <span id="s4_n27" from="244" to="255"/>
- <span id="s4_n31" from="256" to="259"/>
- <span id="s4_n34" from="260" to="266"/>
- <span id="s4_n36" from="267" to="274"/>
- <span id="s4_n41" from="275" to="276"/>
- <span id="s4_n43" from="276" to="282"/>
- <span id="s4_n47" from="282" to="283"/>
- <span id="s4_n49" from="284" to="287"/>
- <span id="s4_n52" from="288" to="293"/>
- <span id="s4_n54" from="294" to="298"/>
- <span id="s4_n57" from="299" to="300"/>
- <span id="s4_n59" from="300" to="313"/>
- <span id="s4_n61" from="314" to="317"/>
- <span id="s4_n63" from="318" to="331"/>
- <span id="s4_n65" from="331" to="332"/>
- <span id="s4_n68" from="333" to="335"/>
- <span id="s4_n70" from="336" to="345"/>
- <span id="s4_n72" from="346" to="357"/>
- <span id="s4_n74" from="358" to="363"/>
- <span id="s4_n76" from="363" to="364"/>
- <span id="s5_n5" from="365" to="368"/>
- <span id="s5_n7" from="369" to="372"/>
- <span id="s5_n9" from="373" to="385"/>
- <span id="s5_n11" from="386" to="395"/>
- <span id="s5_n13" from="396" to="402"/>
- <span id="s5_n17" from="403" to="409"/>
- <span id="s5_n19" from="410" to="416"/>
- <span id="s5_n21" from="417" to="420"/>
- <span id="s5_n24" from="421" to="425"/>
- <span id="s5_n26" from="426" to="432"/>
- <span id="s5_n28" from="432" to="433"/>
- <span id="s5_n32" from="434" to="445"/>
- <span id="s5_n34" from="446" to="451"/>
- <span id="s5_n36" from="452" to="459"/>
- <span id="s5_n39" from="460" to="465"/>
- <span id="s5_n41" from="466" to="476"/>
- <span id="s5_n43" from="476" to="477"/>
- <span id="s6_n4" from="478" to="481"/>
- <span id="s6_n6" from="482" to="493"/>
- <span id="s6_n8" from="494" to="500"/>
- <span id="s6_n12" from="501" to="504"/>
- <span id="s6_n14" from="505" to="507"/>
- <span id="s6_n16" from="508" to="510"/>
- <span id="s6_n18" from="510" to="511"/>
- <span id="s7_n5" from="512" to="517"/>
- <span id="s7_n7" from="518" to="521"/>
- <span id="s7_n10" from="522" to="525"/>
- <span id="s7_n14" from="526" to="528"/>
- <span id="s7_n16" from="529" to="536"/>
- <span id="s7_n19" from="537" to="540"/>
- <span id="s7_n22" from="541" to="548"/>
- <span id="s7_n25" from="549" to="552"/>
- <span id="s7_n28" from="553" to="558"/>
- <span id="s7_n30" from="559" to="562"/>
- <span id="s7_n32" from="563" to="568"/>
- <span id="s7_n34" from="569" to="572"/>
- <span id="s7_n36" from="573" to="578"/>
- <span id="s7_n39" from="579" to="585"/>
- <span id="s7_n42" from="586" to="589"/>
- <span id="s7_n44" from="590" to="597"/>
- <span id="s7_n47" from="598" to="601"/>
- <span id="s7_n50" from="602" to="607"/>
- <span id="s7_n52" from="608" to="611"/>
- <span id="s7_n54" from="612" to="617"/>
- <span id="s7_n56" from="617" to="620"/>
- <span id="s7_n58" from="621" to="632"/>
- <span id="s7_n60" from="632" to="633"/>
- <span id="s8_n3" from="634" to="642"/>
- <span id="s8_n6" from="643" to="658"/>
- <span id="s8_n8" from="659" to="663"/>
- <span id="s8_n10" from="663" to="664"/>
- <span id="s9_n4" from="665" to="668"/>
- <span id="s9_n6" from="669" to="689"/>
- <span id="s9_n8" from="690" to="695"/>
- <span id="s9_n10" from="696" to="699"/>
- <span id="s9_n12" from="699" to="700"/>
- <span id="s9_n15" from="701" to="705"/>
- <span id="s9_n18" from="706" to="713"/>
- <span id="s9_n20" from="714" to="724"/>
- <span id="s9_n24" from="725" to="729"/>
- <span id="s9_n26" from="730" to="747"/>
- <span id="s9_n29" from="748" to="751"/>
- <span id="s9_n31" from="752" to="755"/>
- <span id="s9_n33" from="756" to="774"/>
- <span id="s9_n35" from="775" to="784"/>
- <span id="s9_n37" from="785" to="788"/>
- <span id="s9_n39" from="788" to="789"/>
- <span id="s10_n4" from="790" to="793"/>
- <span id="s10_n6" from="794" to="797"/>
- <span id="s10_n9" from="798" to="802"/>
- <span id="s10_n11" from="803" to="806"/>
- <span id="s10_n13" from="807" to="816"/>
- <span id="s10_n15" from="817" to="826"/>
- <span id="s10_n17" from="827" to="831"/>
- <span id="s10_n19" from="832" to="835"/>
- <span id="s10_n22" from="836" to="839"/>
- <span id="s10_n24" from="840" to="845"/>
- <span id="s10_n26" from="846" to="850"/>
- <span id="s10_n29" from="851" to="864"/>
- <span id="s10_n31" from="864" to="865"/>
- <span id="s11_n4" from="866" to="869"/>
- <span id="s11_n6" from="870" to="875"/>
- <span id="s11_n8" from="876" to="892"/>
- <span id="s11_n10" from="893" to="898"/>
- <span id="s11_n13" from="899" to="902"/>
- <span id="s11_n15" from="903" to="909"/>
- <span id="s11_n18" from="910" to="913"/>
- <span id="s11_n20" from="914" to="927"/>
- <span id="s11_n24" from="928" to="935"/>
- <span id="s11_n26" from="936" to="947"/>
- <span id="s11_n28" from="947" to="948"/>
- <span id="s11_n32" from="949" to="957"/>
- <span id="s11_n34" from="958" to="962"/>
- <span id="s11_n36" from="962" to="963"/>
- <span id="s11_n38" from="964" to="986"/>
- <span id="s11_n40" from="986" to="987"/>
- <span id="s11_n42" from="988" to="995"/>
- <span id="s11_n44" from="995" to="996"/>
- <span id="s11_n47" from="997" to="1007"/>
- <span id="s11_n49" from="1007" to="1008"/>
- <span id="s12_n4" from="1009" to="1025"/>
- <span id="s12_n7" from="1026" to="1029"/>
- <span id="s12_n9" from="1030" to="1036"/>
- <span id="s12_n11" from="1037" to="1049"/>
- <span id="s12_n13" from="1049" to="1050"/>
- <span id="s13_n4" from="1051" to="1054"/>
- <span id="s13_n6" from="1055" to="1056"/>
- <span id="s13_n9" from="1056" to="1064"/>
- <span id="s13_n11" from="1065" to="1070"/>
- <span id="s13_n13" from="1070" to="1071"/>
- <span id="s13_n15" from="1072" to="1076"/>
- <span id="s13_n18" from="1077" to="1079"/>
- <span id="s13_n20" from="1080" to="1085"/>
- <span id="s13_n22" from="1086" to="1090"/>
- <span id="s13_n24" from="1091" to="1094"/>
- <span id="s13_n26" from="1094" to="1095"/>
- <span id="s13_n31" from="1096" to="1098"/>
- <span id="s13_n33" from="1099" to="1102"/>
- <span id="s13_n36" from="1103" to="1109"/>
- <span id="s13_n38" from="1110" to="1122"/>
- <span id="s13_n40" from="1123" to="1129"/>
- <span id="s13_n42" from="1130" to="1134"/>
- <span id="s13_n44" from="1134" to="1135"/>
- <span id="s13_n47" from="1136" to="1139"/>
- <span id="s13_n50" from="1140" to="1143"/>
- <span id="s13_n54" from="1144" to="1152"/>
- <span id="s13_n56" from="1153" to="1156"/>
- <span id="s13_n58" from="1157" to="1168"/>
- <span id="s13_n60" from="1169" to="1177"/>
- <span id="s13_n65" from="1177" to="1178"/>
- <span id="s13_n67" from="1179" to="1182"/>
- <span id="s13_n69" from="1183" to="1195"/>
- <span id="s13_n72" from="1196" to="1199"/>
- <span id="s13_n74" from="1200" to="1207"/>
- <span id="s13_n76" from="1208" to="1214"/>
- <span id="s13_n78" from="1215" to="1219"/>
- <span id="s13_n80" from="1219" to="1220"/>
- <span id="s14_n4" from="1221" to="1224"/>
- <span id="s14_n6" from="1225" to="1237"/>
- <span id="s14_n8" from="1238" to="1243"/>
- <span id="s14_n10" from="1244" to="1247"/>
- <span id="s14_n12" from="1248" to="1252"/>
- <span id="s14_n14" from="1253" to="1258"/>
- <span id="s14_n17" from="1259" to="1262"/>
- <span id="s14_n20" from="1263" to="1268"/>
- <span id="s14_n22" from="1269" to="1274"/>
- <span id="s14_n24" from="1274" to="1275"/>
- <span id="s14_n27" from="1276" to="1280"/>
- <span id="s14_n30" from="1281" to="1284"/>
- <span id="s14_n32" from="1285" to="1287"/>
- <span id="s14_n34" from="1288" to="1290"/>
- <span id="s14_n36" from="1290" to="1291"/>
- <span id="s15_n4" from="1292" to="1295"/>
- <span id="s15_n6" from="1296" to="1310"/>
- <span id="s15_n8" from="1311" to="1314"/>
- <span id="s15_n10" from="1315" to="1326"/>
- <span id="s15_n12" from="1326" to="1327"/>
</spanList>
</layer>
diff --git a/t/batch_file.t b/t/batch_file.t
index aa7993d..947e1ef 100644
--- a/t/batch_file.t
+++ b/t/batch_file.t
@@ -7,6 +7,8 @@
use File::Temp qw/ :POSIX /;
use Mojo::Util qw/slurp/;
use Mojo::JSON qw/decode_json/;
+use IO::Uncompress::Gunzip;
+use Data::Dumper;
use_ok('KorAP::XML::Batch::File');
@@ -16,8 +18,6 @@
layer => 'Tokens'
), 'Construct new batch file object');
-# gzip => 1,
-
my $path = catdir(dirname(__FILE__), 'annotation', 'corpus', 'doc', '0001');
my $output = tmpnam();
@@ -37,5 +37,91 @@
is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Data');
+# Generate gzip-compressed output and verify the decompressed JSON
+$bf->{gzip} = 1;
+
+$path = catdir(dirname(__FILE__), 'annotation', 'corpus', 'doc', '0001');
+$output = tmpnam();
+ok($bf->process($path => $output), 'Process file with gzip');
+
+my $out;
+my $gz = IO::Uncompress::Gunzip->new($output);
+ok($gz && $gz->read($out), 'Uncompress');
+
+ok($json = decode_json $out, 'decode json');
+
+is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
+is($json->{title}, 'Beispiel Text', 'Title');
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
+is($json->{data}->{foundries}, '', 'Foundries');
+like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Primary text');
+is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
+is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Data');
+
+# Generate uncompressed output with CoreNLP and OpenNLP morpho annotations
+$bf->{gzip} = 0;
+$bf->{anno} = [
+ ['CoreNLP', 'Morpho'],
+ ['OpenNLP', 'Morpho']
+];
+$output = tmpnam();
+ok($bf->process($path => $output), 'Process file with annotations');
+ok($file = slurp $output, 'Slurp data');
+ok($json = decode_json $file, 'decode json');
+
+is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
+is($json->{title}, 'Beispiel Text', 'Title');
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'TokenSource');
+is($json->{data}->{foundries}, 'corenlp corenlp/morpho opennlp opennlp/morpho', 'Foundries');
+like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Primary text start');
+is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
+
+my $token = $json->{data}->{stream}->[0];
+
+like($json->{data}->{text}, qr/Ende Schuljahr eingestellt wird\.$/, 'Primary text end');
+
+is($token->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'base/s');
+is($token->[2], '_0$<i>0<i>3', 'position');
+is($token->[3], 'corenlp/p:APPRART', 'corenlp');
+is($token->[5], 'opennlp/p:APPRART', 'opennlp');
+
+$token = $json->{data}->{stream}->[-1];
+
+is($token->[1], 'corenlp/p:VAFIN', 'corenlp');
+is($token->[3], 'opennlp/p:VAFIN', 'opennlp');
+
+# Check layer and foundry for base tokenization, without primary data.
+# Reuses the existing $output; presumably $bf was built with overwrite set.
+$bf->{anno} = [[]];
+$bf->{primary} = 0;
+$bf->{foundry} = 'CoreNLP';
+$bf->{layer} = 'Tokens';
+
+ok($bf->process($path => $output), 'Process file without primary data');
+ok(-f $output, 'File exists');
+ok($file = slurp $output, 'Slurp data');
+ok($json = decode_json $file, 'decode json');
+
+ok(!$json->{data}->{text}, 'No Primary text');
+is($json->{data}->{tokenSource}, 'corenlp#tokens', 'TokenSource');
+
+like($file, qr/^\{"/, 'No pretty printing');
+
+# Check pretty printing: output must start with a brace, newline and indent
+$bf->{pretty} = 1;
+ok($bf->process($path => $output), 'Process file with pretty printing');
+ok(-f $output, 'File exists');
+ok($file = slurp $output, 'Slurp data');
+like($file, qr/^\{\n\s+"/, 'Pretty printing');
+
+# Check overwriting: with overwrite disabled, an existing output is skipped
+$bf->{overwrite} = 0;
+
+ok(!$bf->process($path => $output), 'Skip existing file without overwrite');
+
done_testing;
__END__
+
+
+
+