blob: 297fc060d01cb96bcd96c460bcc68a603826af1b [file] [log] [blame]
Akron405f0c52016-07-07 17:56:16 +02001package KorAP::XML::Batch::File;
2use KorAP::XML::Krill;
Akronb9c33812020-10-21 16:19:35 +02003# use Mojo::Log;
4use Log::Any qw($log);
Akron405f0c52016-07-07 17:56:16 +02005use IO::Compress::Gzip;
6use IO::File;
7use strict;
8use warnings;
9
# Constructor
#
# Builds a converter object from named parameters.
#
# Stored as given (undef when absent):
#   cache, overwrite, koral, non_word_tokens, non_verbal_tokens
# Replaced by a default when the given value is false:
#   meta_type ('I5'), foundry ('Base'), layer ('Tokens'),
#   anno ([[]]), log (the module-level $log)
# Replaced by a default only when undefined:
#   gzip (0)
#
# Returns the hash reference blessed into $class.
sub new {
  my ($class, %param) = @_;

  my $self = {};

  # Plain pass-through options
  $self->{$_} = $param{$_}
    for qw(cache overwrite koral non_word_tokens non_verbal_tokens);

  # Options with a fallback for any false value
  $self->{meta_type} = $param{meta_type} || 'I5';
  $self->{foundry}   = $param{foundry}   || 'Base';
  $self->{layer}     = $param{layer}     || 'Tokens';
  $self->{anno}      = $param{anno}      || [[]];
  $self->{log}       = $param{log}       || $log;

  # Only an undefined value is replaced here
  $self->{gzip} = $param{gzip} // 0;

  return bless $self, $class;
}
29
# Process a file
#
# Parses the document found at $input, tokenizes it with the configured
# foundry and layer, adds all configured annotation layers and serializes
# the result as JSON.
#
# Arguments:
#   $input  - path of the text to process (a trailing slash is appended
#             if it is missing)
#   $output - optional path of the file to write; if it is false, the
#             JSON is printed to STDOUT followed by a newline
#
# Returns:
#   1   on success
#   -1  if $output already exists and overwriting is not enabled
#   nothing (undef in scalar context) if the document or the base
#       tokenization can't be parsed, or if the output file can't be
#       opened, written or closed
sub process {
  my ($self, $input, $output) = @_;

  my $logger = $self->{log};

  # Never touch an existing file unless overwriting was requested
  if (!$self->{overwrite} && $output && -e $output) {
    $logger->debug($output . ' already exists');
    return -1;
  }

  # Create and parse new document
  # (the path is passed on with a trailing slash)
  $input =~ s{([^/])$}{$1/};
  my $doc = KorAP::XML::Krill->new(
    path      => $input,
    meta_type => $self->{meta_type},
    cache     => $self->{cache},
    log       => $logger
  );

  # Parse document
  unless ($doc->parse) {
    $logger->warn(($output // $input) . " can't be processed - no document data");
    return;
  }

  # Get tokenization
  my $tokens = KorAP::XML::Tokenizer->new(
    path              => $doc->path,
    doc               => $doc,
    foundry           => $self->{foundry},
    layer             => $self->{layer},
    name              => 'tokens',
    non_word_tokens   => $self->{non_word_tokens},
    non_verbal_tokens => $self->{non_verbal_tokens}
  );

  # Unable to process base tokenization
  unless ($tokens->parse) {
    $logger->error(($output // $input) . " can't be processed - " . $tokens->error);
    return;
  }

  # Add every configured annotation layer
  foreach my $anno (@{$self->{anno}}) {
    $tokens->add(@$anno);
  }

  my $print_text = $tokens->to_json($self->{koral});

  # Direct output to STDOUT
  unless ($output) {
    print $print_text . "\n";
    return 1;
  }

  # There is an output file given
  my $file = $self->{gzip}
    ? IO::Compress::Gzip->new($output, TextFlag => 1, Minimal => 1)
    : IO::File->new($output, "w");

  # Output not opened
  unless (defined $file) {
    $logger->error('Unable to open ' . $output . ' for writing');
    return;
  }

  # Write to output. The handle is closed in any case, and the result
  # of close is kept, because buffered (and compressed) data may only
  # be written out at that point.
  my $written = $file->print($print_text);
  my $closed  = $file->close;

  # Report the file name (not the stringified handle) and signal the
  # failure to the caller instead of returning success
  unless ($written) {
    $logger->error('Unable to write to ' . $output);
    return;
  }

  unless ($closed) {
    $logger->error('Unable to close ' . $output);
    return;
  }

  return 1;
}
113
1141;
Akroncdf0e002016-07-08 16:42:04 +0200115
116__END__
117
118=pod
119
120=encoding utf8
121
122=head1 NAME
123
124KorAP::XML::Batch::File - Process multiple files with identical setup
125
126
127=head1 SYNOPSIS
128
129
130 # Create Converter Object
131 my $converter = KorAP::XML::Batch::File->new(
132 overwrite => 1,
133 gzip => 1
134 );
135
136 $converter->process('/my/data' => 'my-output.gz');
137
138=head1 DESCRIPTION
139
140Set up the configuration for a corpus and process
141multiple texts with the same configuration.
142
=head1 METHODS

=head2 new

Construct a new converter object.

  my $converter = KorAP::XML::Batch::File->new(
    overwrite => 1,
    gzip => 1
  );

The following parameters are supported:
154
155=over 2
156
157=item cache
158
159A L<Cache::FastMmap> compatible cache object.
160
161=item meta_type
162
163Meta data type to be parsed. Defaults to C<I5>,
164also supports all classes in the C<KorAP::XML::Meta> namespace.
165
166=item overwrite
167
168Overwrite existing files!
169Defaults to C<false>.
170
171=item foundry
172
173The foundry to use for tokenization,
174defaults to C<Base>.
175
176=item layer
177
178The layer to use for tokenization,
179defaults to C<Tokens>.
180
181=item anno
182
183 my $converter = KorAP::XML::Batch::File->new(
184 anno => [
185 ['CoreNLP', 'Morpho'],
186 ['OpenNLP', 'Morpho']
187 ]
188 );
189
190An array reference of array references,
191containing annotation layers as foundry-layer
192pairs to parse.
193The list is empty by default.
194
195=item log
196
Akronb9c33812020-10-21 16:19:35 +0200197A L<Log::Any> compatible log object.
Akroncdf0e002016-07-08 16:42:04 +0200198
Akroncdf0e002016-07-08 16:42:04 +0200199=item gzip
200
201Compress the output using Gzip.
202This will be ignored, if the output is undefined
203(i.e. C<STDOUT>).
204Defaults to C<false>.
205
206=back
207
208=head2 process
209
210 $converter->process('/mydoc/');
211 $converter->process('/mydoc/', '/myoutput.gzip');
212
Process a text and write the result to the chosen output.
The first argument is mandatory and
represents the path to the KorAP-XML text files.
The second argument is optional and
represents the path of the file to write.
If the second argument is not given,
the result is written to C<STDOUT>
(in that case, the C<gzip> parameter is ignored).
221
222=head1 AVAILABILITY
223
224 https://github.com/KorAP/KorAP-XML-Krill
225
226
227=head1 COPYRIGHT AND LICENSE
228
229Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
230Author: L<Nils Diewald|http://nils-diewald.de/>
231
232KorAP::XML::Krill is developed as part of the
233L<KorAP|http://korap.ids-mannheim.de/>
234Corpus Analysis Platform at the
235L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
236member of the
237L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>
238and supported by the L<KobRA|http://www.kobra.tu-dortmund.de> project,
239funded by the
240L<Federal Ministry of Education and Research (BMBF)|http://www.bmbf.de/en/>.
241
242KorAP::XML::Krill is free software published under the
243L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
244
245=cut