blob: 76a4a46634363af905dddb27c65a8f4f135084f6 [file] [log] [blame]
Akron405f0c52016-07-07 17:56:16 +02001package KorAP::XML::Batch::File;
2use KorAP::XML::Krill;
3use Mojo::Log;
4use IO::Compress::Gzip;
5use IO::File;
6use strict;
7use warnings;
8
# Constructor
#
# Takes a list of named parameters (cache, meta_type, overwrite,
# foundry, layer, anno, log, primary, pretty, gzip) and returns
# a blessed hash reference holding the resulting configuration.
sub new {
  my ($class, %param) = @_;

  my %self;

  # Options that are stored exactly as they were passed
  my @verbatim = qw/cache overwrite primary pretty/;
  @self{@verbatim} = @param{@verbatim};

  # Options for which any false value is replaced by a default
  $self{meta_type} = $param{meta_type} || 'I5';
  $self{foundry}   = $param{foundry}   || 'Base';
  $self{layer}     = $param{layer}     || 'Tokens';
  $self{anno}      = $param{anno}      || [[]];

  # The fallback logger is only created if no logger was passed
  $self{log}       = $param{log}       || Mojo::Log->new(level => 'fatal');

  # Only an undefined value is replaced by the default here
  $self{gzip}      = $param{gzip}      // 0;

  return bless \%self, $class;
};
27
# Process a file
#
# Convert the document found at $input to JSON and write it to
# $output (gzip-compressed, if the gzip option is set) or, if no
# output is given, print it to STDOUT followed by a newline.
#
# Arguments:
#   $input  - Path to the document; a trailing slash is
#             appended if it is missing.
#   $output - Optional path of the file to write.
#
# Returns 1 on success. Returns nothing if the output file already
# exists and overwriting is not enabled, if the document or its base
# tokenization can't be parsed, or if the output file can't be
# opened, written or closed. All failures are reported to the logger.
sub process {
  my ($self, $input, $output) = @_;

  # Leave existing files alone unless overwriting is requested
  if (!$self->{overwrite} && $output && -e $output) {
    $self->{log}->debug($output . ' already exists');
    return;
  };

  # Create and parse new document
  $input =~ s{([^/])$}{$1/};
  my $doc = KorAP::XML::Krill->new(
    path      => $input,
    meta_type => $self->{meta_type},
    cache     => $self->{cache}
  );

  # Parse document
  unless ($doc->parse) {
    $self->{log}->warn(($output // $input) . " can't be processed - no document data");
    return;
  };

  # Get tokenization
  my $tokens = KorAP::XML::Tokenizer->new(
    path    => $doc->path,
    doc     => $doc,
    foundry => $self->{foundry},
    layer   => $self->{layer},
    name    => 'tokens'
  );

  # Unable to process base tokenization
  unless ($tokens->parse) {
    $self->{log}->error(($output // $input) . " can't be processed - no base tokenization");
    return;
  };

  # Add all configured annotation layers
  foreach my $anno (@{$self->{anno}}) {
    $tokens->add(@$anno);
  };

  my $print_text = $self->{pretty}
    ? $tokens->to_pretty_json($self->{primary})
    : $tokens->to_json($self->{primary});

  # Direct output to STDOUT
  unless ($output) {
    print $print_text . "\n";
    return 1;
  };

  # There is an output file given
  my $file = $self->{gzip}
    ? IO::Compress::Gzip->new($output, TextFlag => 1, Minimal => 1)
    : IO::File->new($output, 'w');

  # Both constructors return undef on failure - calling print on
  # that would die instead of reporting the problem
  unless ($file) {
    my $reason = $self->{gzip} ? $IO::Compress::Gzip::GzipError : $!;
    $self->{log}->error('Unable to open ' . $output . ' for writing: ' . $reason);
    return;
  };

  # Write to output
  my $written = $file->print($print_text);

  # Closing flushes pending data, so write errors may only surface here
  my $closed = $file->close;

  # Report the path (not the stringified handle) on failure
  unless ($written && $closed) {
    $self->{log}->error('Unable to write to ' . $output);
    return;
  };

  return 1;
};
101
1021;
Akroncdf0e002016-07-08 16:42:04 +0200103
104__END__
105
106=pod
107
108=encoding utf8
109
110=head1 NAME
111
112KorAP::XML::Batch::File - Process multiple files with identical setup
113
114
115=head1 SYNOPSIS
116
117
118 # Create Converter Object
119 my $converter = KorAP::XML::Batch::File->new(
120 overwrite => 1,
121 gzip => 1
122 );
123
124 $converter->process('/my/data' => 'my-output.gz');
125
126=head1 DESCRIPTION
127
128Set up the configuration for a corpus and process
129multiple texts with the same configuration.
130
131=head1 METHODS
132
133Construct a new converter object.
134
135 my $converter = KorAP::XML::Batch::File->new(
136 overwrite => 1,
137 gzip => 1
138 );
139
140
141=head2 new
142
143=over 2
144
145=item cache
146
147A L<Cache::FastMmap> compatible cache object.
148
149=item meta_type
150
151Meta data type to be parsed. Defaults to C<I5>,
152also supports all classes in the C<KorAP::XML::Meta> namespace.
153
154=item overwrite
155
156Overwrite existing files!
157Defaults to C<false>.
158
159=item foundry
160
161The foundry to use for tokenization,
162defaults to C<Base>.
163
164=item layer
165
166The layer to use for tokenization,
167defaults to C<Tokens>.
168
169=item anno
170
171 my $converter = KorAP::XML::Batch::File->new(
172 anno => [
173 ['CoreNLP', 'Morpho'],
174 ['OpenNLP', 'Morpho']
175 ]
176 );
177
178An array reference of array references,
179containing annotation layers as foundry-layer
180pairs to parse.
181The list is empty by default.
182
183=item log
184
185A L<Mojo::Log> compatible log object.
186
187=item primary
188
189Export primary text associated with the document.
190Defaults to C<true>.
191
192=item pretty
193
194Pretty print the output JSON.
195Defaults to C<false>.
196
197=item gzip
198
199Compress the output using Gzip.
This is ignored if no output file is given
(i.e. when writing to C<STDOUT>).
Defaults to C<false>.
203
204=back
205
206=head2 process
207
208 $converter->process('/mydoc/');
209 $converter->process('/mydoc/', '/myoutput.gzip');
210
Process a document and write the result to the chosen output.
The first argument is mandatory and
represents the path to the KorAP-XML text files.
The second argument is optional and
represents the path of the file to write.
If the second argument is not given,
the result is printed to C<STDOUT>
(in that case, the C<gzip> parameter is ignored).
219
220=head1 AVAILABILITY
221
222 https://github.com/KorAP/KorAP-XML-Krill
223
224
225=head1 COPYRIGHT AND LICENSE
226
227Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
228Author: L<Nils Diewald|http://nils-diewald.de/>
229
230KorAP::XML::Krill is developed as part of the
231L<KorAP|http://korap.ids-mannheim.de/>
232Corpus Analysis Platform at the
233L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
234member of the
235L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>
236and supported by the L<KobRA|http://www.kobra.tu-dortmund.de> project,
237funded by the
238L<Federal Ministry of Education and Research (BMBF)|http://www.bmbf.de/en/>.
239
240KorAP::XML::Krill is free software published under the
241L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
242
243=cut