blob: 66af16f5c02b9cb2230a0db1b73341d3b8a17a5f [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron941c1a62016-02-23 17:41:41 +010019use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010020use v5.10;
21use Sys::Info;
22use Sys::Info::Constants qw( :device_cpu );
23
24# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010025# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010026# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010027
Akronc11f7982017-02-21 21:20:14 +010028# TODO: Use KorAP::XML::ForkPool!
29
Akron941c1a62016-02-23 17:41:41 +010030# CHANGES:
31# ----------------------------------------------------------
32# 2013/11/25
33# - Initial release
34#
35# 2014/10/29
36# - Merges foundry data to create indexer friendly documents
37#
Akron93d620e2016-02-05 19:40:05 +010038# 2016/02/04
39# - renamed to korapxml2krill
40# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010041#
42# 2016/02/12
43# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010044# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010045#
46# 2016/02/14
47# - Added version information
Akron941c1a62016-02-23 17:41:41 +010048# - Added support for archive files
49#
50# 2016/02/15
51# - Fixed temporary directory bug
52# - Improved skipping before unzipping
53# - Added EXPERIMENTAL concurrency support
54#
55# 2016/02/23
56# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010057#
58# 2016/02/27
59# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010060#
61# 2016/03/17
62# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010063#
64# 2016/03/18
65# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020066#
Akronf3f0c942016-06-27 13:27:14 +020067# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020068# - Added multi archive support
69# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020070# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020071#
72# 2016/07/06
73# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020074#
75# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020076# - Fixed temporary path issue in script
77#
78# 2016/10/24
79# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020080#
Akronb4bbec72016-10-26 20:21:02 +020081# 2016/10/24
82# - Added support for document extraction
83#
Akron3741f8b2016-12-21 19:55:21 +010084# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020085# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020086#
Akron3741f8b2016-12-21 19:55:21 +010087# 2016/12/21
88# - added support for base-sentences and base-tokenizations
89#
Akron4fa37c32017-01-20 14:43:10 +010090# 2017/01/20
91# - added support for DRuKoLa annotations
92#
Akron41ac10b2017-02-08 22:47:25 +010093# 2017/02/08
94# - added support for pagebreak annotations
95#
Akron941c1a62016-02-23 17:41:41 +010096# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010097
Akron41ac10b2017-02-08 22:47:25 +010098our $LAST_CHANGE = '2017/02/08';
Akron941c1a62016-02-23 17:41:41 +010099our $LOCAL = $FindBin::Bin;
100our $VERSION_MSG = <<"VERSION";
101Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
102VERSION
103
# Parse command: the first argument is taken as the command,
# unless it is missing, false, or starts like an option switch
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /^-/;
Akron93d620e2016-02-05 19:40:05 +0100110
Akron5f51d422016-08-16 16:26:43 +0200111my (@skip, @sigle, @anno, @input);
Akron35db6e32016-03-17 22:42:22 +0100112my $text;
Akrone10ad322016-02-27 10:54:26 +0100113
Akronc11f7982017-02-21 21:20:14 +0100114
Akron941c1a62016-02-23 17:41:41 +0100115# Parse options from the command line
Nils Diewald7364d1f2013-11-05 19:26:35 +0000116GetOptions(
Akron08385f62016-03-22 20:37:04 +0100117 'input|i=s' => \@input,
Akron941c1a62016-02-23 17:41:41 +0100118 'output|o=s' => \(my $output),
119 'overwrite|w' => \(my $overwrite),
Akron35db6e32016-03-17 22:42:22 +0100120 'meta|m=s' => \(my $meta),
Akronaf386982016-10-12 00:33:25 +0200121 'token|t=s' => \(my $token_base = 'OpenNLP#tokens'),
Akron3741f8b2016-12-21 19:55:21 +0100122 'base-sentences|bs=s' => \(my $base_sentences = ''),
123 'base-paragraphs|bp=s' => \(my $base_paragraphs = ''),
Akron41ac10b2017-02-08 22:47:25 +0100124 'base-pagebreaks|bpb=s' => \(my $base_pagebreaks = ''),
Akron941c1a62016-02-23 17:41:41 +0100125 'gzip|z' => \(my $gzip),
Akrone10ad322016-02-27 10:54:26 +0100126 'skip|s=s' => \@skip,
127 'sigle|sg=s' => \@sigle,
Akron11c80302016-03-18 19:44:43 +0100128 'cache|c=s' => \(my $cache_file = 'korapxml2krill.cache'),
Akron941c1a62016-02-23 17:41:41 +0100129 'log|l=s' => \(my $log_level = 'ERROR'),
Akron5f51d422016-08-16 16:26:43 +0200130 'anno|a=s' => \@anno,
Akron941c1a62016-02-23 17:41:41 +0100131 'primary|p!' => \(my $primary),
132 'pretty|y' => \(my $pretty),
133 'jobs|j=i' => \(my $jobs = 0),
Akron7d4cdd82016-08-17 21:39:45 +0200134 'cache-size|cs=s' => \(my $cache_size = '50m'),
135 'cache-delete|cd!' => \(my $cache_delete = 1),
136 'cache-init|ci!' => \(my $cache_init = 1),
Akron941c1a62016-02-23 17:41:41 +0100137 'help|h' => sub {
138 pod2usage(
139 -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
Akron7d4cdd82016-08-17 21:39:45 +0200140 -verbose => 99,
141 -msg => $VERSION_MSG,
142 -output => '-'
Akron941c1a62016-02-23 17:41:41 +0100143 );
144 },
145 'version|v' => sub {
146 pod2usage(
Akron7d4cdd82016-08-17 21:39:45 +0200147 -verbose => 0,
148 -msg => $VERSION_MSG,
149 -output => '-'
Akron941c1a62016-02-23 17:41:41 +0100150 )
151 }
Nils Diewald7364d1f2013-11-05 19:26:35 +0000152);
153
Akron3741f8b2016-12-21 19:55:21 +0100154$base_sentences = lc $base_sentences;
155$base_paragraphs = lc $base_paragraphs;
Akron636bd9c2017-02-09 17:13:00 +0100156$base_pagebreaks = lc $base_pagebreaks;
Akron3741f8b2016-12-21 19:55:21 +0100157
Akron941c1a62016-02-23 17:41:41 +0100158my %ERROR_HASH = (
159 -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
Akron7d4cdd82016-08-17 21:39:45 +0200160 -verbose => 99,
161 -msg => $VERSION_MSG,
162 -output => '-',
163 -exit => 1
Akron941c1a62016-02-23 17:41:41 +0100164);
Nils Diewald7364d1f2013-11-05 19:26:35 +0000165
Akron941c1a62016-02-23 17:41:41 +0100166# Input has to be defined
Akron08385f62016-03-22 20:37:04 +0100167pod2usage(%ERROR_HASH) unless @input;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000168
Akrone1dbc382016-07-08 22:24:52 +0200169# Gzip has no effect, if no output is given
170pod2usage(%ERROR_HASH) if $gzip && !$output;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000171
Akron941c1a62016-02-23 17:41:41 +0100172# Initialize log4perl object
Nils Diewald7364d1f2013-11-05 19:26:35 +0000173Log::Log4perl->init({
174 'log4perl.rootLogger' => uc($log_level) . ', STDERR',
175 'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
176 'log4perl.appender.STDERR.layout' => 'PatternLayout',
177 'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
178});
179
180my $log = Log::Log4perl->get_logger('main');
181
Akronc11f7982017-02-21 21:20:14 +0100182
# A job count of -1 asks for automatic sizing:
# use five times the CPU count reported by Sys::Info
if ($jobs == -1) {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil($cores * 5);
  $log->info("Run using $jobs jobs");
};
188
# Lookup table of the annotations to skip (keys are lowercased)
my %skip = map { (lc($_) => 1) } @skip;
191
# All known annotation layers as [Foundry, Layer] pairs,
# optionally followed by a configuration string as third element
my @layers;

# The Base layers are only added if no alternative base layer was requested
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

# Connexor
push @layers, map { ['Connexor', $_] } qw/Morpho Syntax Phrase Sentences/;

# CoreNLP
push @layers, map { ['CoreNLP', $_] } qw/NamedEntities Sentences Morpho Constituency/;

# DeReKo
# Collect the base structures that were requested from DeReKo#Structure
my @dereko_attr = ();
foreach my $base (
  [sentences  => $base_sentences],
  [paragraphs => $base_paragraphs],
  [pagebreaks => $base_pagebreaks]
) {
  push @dereko_attr, $base->[0] if $base->[1] eq 'dereko#structure';
};

# Requested base structures are passed as a dash separated third element
push @layers, (
  $dereko_attr[0]
    ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
    : ['DeReKo', 'Structure']
);

# Glemm, Malt, MDParser, Mate, OpenNLP, Schreibgebrauch (Sgbr),
# TreeTagger, XIP, DRuKoLa and MarMoT
push @layers, (
  ['Glemm',      'Morpho'],
  ['Malt',       'Dependency'],
  ['MDParser',   'Dependency'],
  ['Mate',       'Morpho'],
  ['Mate',       'Dependency'],
  ['OpenNLP',    'Morpho'],
  ['OpenNLP',    'Sentences'],
  ['Sgbr',       'Lemma'],
  ['Sgbr',       'Morpho'],
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],
  ['XIP',        'Morpho'],
  ['XIP',        'Constituency'],
  ['XIP',        'Sentences'],
  ['XIP',        'Dependency'],
  ['DRuKoLa',    'Morpho'],
  ['MarMoT',     'Morpho']
);
265
Akron4fa37c32017-01-20 14:43:10 +0100266
# Check filters
my @filtered_anno;

# Everything is skipped - only the explicitly requested
# annotations (given as Foundry#Layer) are used
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Keep the layer unless Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
285
# Get tokenization basis (foundry and layer are separated by '#').
# The variables are declared unconditionally, because the behaviour
# of 'my' combined with a statement modifier is undefined in Perl.
my ($token_base_foundry, $token_base_layer);
($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if $token_base;
288
289# TODO: This should not be initialized for batch
290my $cache = Cache::FastMmap->new(
291 share_file => $cache_file,
292 cache_size => $cache_size,
Akron03b24db2016-08-16 20:54:32 +0200293 init_file => $cache_init
Akrone1dbc382016-07-08 22:24:52 +0200294);
295
Akron03b24db2016-08-16 20:54:32 +0200296# Create batch object
Akrone1dbc382016-07-08 22:24:52 +0200297my $batch_file = KorAP::XML::Batch::File->new(
Akron03b24db2016-08-16 20:54:32 +0200298 cache => $cache,
Akrone1dbc382016-07-08 22:24:52 +0200299 meta_type => $meta,
300 overwrite => $overwrite,
Akron03b24db2016-08-16 20:54:32 +0200301 foundry => $token_base_foundry,
302 layer => $token_base_layer,
303 gzip => $gzip,
304 log => $log,
305 primary => $primary,
306 pretty => $pretty,
307 anno => \@filtered_anno
Akrone1dbc382016-07-08 22:24:52 +0200308);
309
Akron941c1a62016-02-23 17:41:41 +0100310
# Get file name based on path information.
#
# Expects the path of a text and returns a flat name, in which the
# path separators are replaced by dashes. The first input path is
# removed from the text path beforehand; if that input is a directory,
# only the part up to its last path component is removed.
# Reads the file-level @input array.
sub get_file_name ($) {
  my $file = shift;

  # Path to strip: the first input, without its last
  # path component in case it is a directory
  my $base = $input[0];
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the base path. The path is quoted, so regex metacharacters
  # in it (like '+', '(' or '[') are matched literally instead of being
  # interpreted as a pattern. The match is intentionally not anchored,
  # as the '^?' of the former pattern made the anchor optional.
  $file =~ s{/?\Q$base\E}{};

  # Flatten the path and remove leading dashes
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # With four or more dash separated parts, the first one is dropped
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
327
Akrone10ad322016-02-27 10:54:26 +0100328# Convert sigle to path construct
329s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;
330
# Commands write to an output directory, which has to exist.
# A missing directory is a failure and is therefore reported to
# the caller with a non-zero exit status, like the other error
# exits of this script.
if ($cmd && $output && !-d $output) {
  print "Directory '$output' does not exist.\n\n";
  exit(1);
};
337
338
Akron941c1a62016-02-23 17:41:41 +0100339# Process a single file
340unless ($cmd) {
Akron08385f62016-03-22 20:37:04 +0100341 my $input = $input[0];
Nils Diewald59094f22014-11-05 18:20:50 +0000342
Akron941c1a62016-02-23 17:41:41 +0100343 BEGIN {
344 $main::TIME = Benchmark->new;
345 $main::LAST_STOP = Benchmark->new;
346 };
347
# Log the time elapsed since the last stop (initially set in the
# BEGIN block) together with the overall runtime
sub stop_time {
  my $now     = Benchmark->new;
  my $lap     = timestr(timediff($now, $main::LAST_STOP));
  my $overall = timestr(timediff($now, $main::TIME));

  $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');

  # The next call measures from here
  $main::LAST_STOP = $now;
};
357
358 # Create and parse new document
359 $input =~ s{([^/])$}{$1/};
Akron941c1a62016-02-23 17:41:41 +0100360
Akron7d4cdd82016-08-17 21:39:45 +0200361 # Process file
Akrone1dbc382016-07-08 22:24:52 +0200362 $batch_file->process($input, $output);
Akron941c1a62016-02-23 17:41:41 +0100363
Akron11c80302016-03-18 19:44:43 +0100364 # Delete cache file
365 unlink($cache_file) if $cache_delete;
366
Akron5f51d422016-08-16 16:26:43 +0200367 stop_time;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000368}
Nils Diewald59094f22014-11-05 18:20:50 +0000369
Akrone10ad322016-02-27 10:54:26 +0100370# Extract XML files
371elsif ($cmd eq 'extract') {
372
Akron7d4cdd82016-08-17 21:39:45 +0200373 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200374 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100375
Akron7d4cdd82016-08-17 21:39:45 +0200376 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100377 unless ($archive->test_unzip) {
378 print "Unzip is not installed or incompatible.\n\n";
379 exit(1);
380 };
381
# Add further annotation archives
Akron2812ba22016-10-28 21:55:59 +0200383 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200384
Akron651cb8d2016-08-16 21:44:49 +0200385 my $prefix = 1;
386
Akron03b24db2016-08-16 20:54:32 +0200387 # No sigles given
388 unless (@sigle) {
389
390 # Get files
391 foreach ($archive->list_texts) {
392
393 # Split path information
Akron651cb8d2016-08-16 21:44:49 +0200394 ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
Akron03b24db2016-08-16 20:54:32 +0200395
396 # TODO: Make this OS independent
397 push @sigle, join '/', $corpus, $doc, $text;
398 };
Akron20807582016-10-26 17:11:34 +0200399 }
400
401 # Check sigle for doc sigles
402 else {
403 my @new_sigle;
404
405 my $prefix_check = 0;
406
407 # Iterate over all sigle
408 foreach (@sigle) {
409
410 # Sigle is a doc sigle
411 if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
Akron20807582016-10-26 17:11:34 +0200412
Akron60a8caa2017-02-17 21:51:27 +0100413 print "$_ ...";
Akron20807582016-10-26 17:11:34 +0200414 # Check if a prefix is needed
415 unless ($prefix_check) {
Akron60a8caa2017-02-17 21:51:27 +0100416
417 if ($prefix = $archive->check_prefix) {
418 print " with prefix ...";
419 };
Akron20807582016-10-26 17:11:34 +0200420 $prefix_check = 1;
421 };
422
Akron60a8caa2017-02-17 21:51:27 +0100423 print "\n";
424
Akron20807582016-10-26 17:11:34 +0200425 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200426 my $path = ($prefix ? './' : '') . $_;
427
428 print '... ' . (
Akron20807582016-10-26 17:11:34 +0200429 $archive->extract_doc(
Akron2812ba22016-10-28 21:55:59 +0200430 $path, $output, $jobs
Akron20807582016-10-26 17:11:34 +0200431 ) ? '' : 'not '
432 );
433 print "extracted.\n";
434 }
Akron60a8caa2017-02-17 21:51:27 +0100435
436 # Sigle is a text sigle
Akron20807582016-10-26 17:11:34 +0200437 else {
438 push @new_sigle, $_;
Akron60a8caa2017-02-17 21:51:27 +0100439
440 unless ($prefix_check) {
441
442 if ($prefix = $archive->check_prefix) {
443 print " with prefix ...";
444 };
445 $prefix_check = 1;
446 };
Akron20807582016-10-26 17:11:34 +0200447 };
448 };
449 @sigle = @new_sigle;
Akron03b24db2016-08-16 20:54:32 +0200450 };
451
Akrone10ad322016-02-27 10:54:26 +0100452 # Iterate over all given sigles and extract
453 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100454
Akron2812ba22016-10-28 21:55:59 +0200455 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200456
Akron03b24db2016-08-16 20:54:32 +0200457 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200458 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100459
Akron20807582016-10-26 17:11:34 +0200460 $archive->extract_text(
Akron651cb8d2016-08-16 21:44:49 +0200461 ($prefix ? './' : '') . $_, $output
462 ) ? '' : 'not '
463 );
Akrone10ad322016-02-27 10:54:26 +0100464 print "extracted.\n";
465 };
466
print "\n";

# Extraction is done - leave with a success status
exit(0);
}

# Can't create archive object
else {
  $log->error('Unable to extract from primary archive ' . $input[0]);

  # Signal the failure to the caller
  exit(1);
};
475}
476
Akron941c1a62016-02-23 17:41:41 +0100477# Process an archive
478elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000479
Akrone1dbc382016-07-08 22:24:52 +0200480 # TODO: Support sigles
Akron941c1a62016-02-23 17:41:41 +0100481
Akron7d4cdd82016-08-17 21:39:45 +0200482 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100483 my $pool = Parallel::ForkManager->new($jobs);
484
Akron7d4cdd82016-08-17 21:39:45 +0200485 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100486 my $iter = 1; # Current text in process
487
# Report on fork message
$pool->run_on_finish(
  sub {
    my ($pid, $code) = @_;

    # The data sent by the child is the last callback argument
    my $data = pop;

    # Build the progress line piece by piece
    my @parts = ('Convert [');
    push @parts, "\$$pid:" if $jobs > 0;
    push @parts, $iter++, "/$count]";
    push @parts, " $code" if $code;
    push @parts, ' ', $data->[0], "\n";
    print join('', @parts);

    # Release the second data element, if any
    # (the temporary directory in archive processing)
    $data->[1] = undef if $data->[1];
  }
);
501
502 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200503 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100504 print "Reading data ...\n";
505
Akron7d4cdd82016-08-17 21:39:45 +0200506 # unless (Cache::FastMmap->new(
507 # share_file => $cache_file,
508 # cache_size => $cache_size,
509 # init_file => $cache_init
510 # )) {
511 # print "Unable to intialize cache '$cache_file'\n\n";
512 # exit(1);
513 # };
Akron11c80302016-03-18 19:44:43 +0100514
Akron941c1a62016-02-23 17:41:41 +0100515 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100516 if (-d $input[0]) {
517 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100518 my @dirs;
519 my $dir;
520
# Collect all directories that contain a data.xml file.
# The current entry is checked before the iterator is advanced.
do {
  if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
    push @dirs, $dir;

    # Presumably skips the remaining entries of this directory -
    # see the Directory::Iterator documentation of prune
    $it->prune;
  };
} while ($it->next);
529
530 print "Start processing ...\n";
531 $t = Benchmark->new;
532 $count = scalar @dirs;
533
534 DIRECTORY_LOOP:
535 for (my $i = 0; $i < $count; $i++) {
536
Akrone1dbc382016-07-08 22:24:52 +0200537 my $filename = catfile(
Akron7d4cdd82016-08-17 21:39:45 +0200538 $output,
539 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200540 );
Akron941c1a62016-02-23 17:41:41 +0100541
542 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200543 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200544
Akron13d56622016-10-31 14:54:49 +0100545 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
546 $pool->finish(
547 0,
548 ["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
549 );
Akron3ec48972016-08-17 23:24:52 +0200550 }
551 else {
Akron4c0cf312016-10-15 16:42:09 +0200552 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200553 };
Akron941c1a62016-02-23 17:41:41 +0100554 };
555 }
556
557 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200558 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200559
Akron941c1a62016-02-23 17:41:41 +0100560 unless ($archive->test_unzip) {
561 print "Unzip is not installed or incompatible.\n\n";
562 exit(1);
563 };
564
# Add further annotation archives
Akron2812ba22016-10-28 21:55:59 +0200566 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100567
Akron941c1a62016-02-23 17:41:41 +0100568 print "Start processing ...\n";
569 $t = Benchmark->new;
570 my @dirs = $archive->list_texts;
571 $count = scalar @dirs;
572
573 ARCHIVE_LOOP:
574 for (my $i = 0; $i < $count; $i++) {
575
576 # Split path information
577 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
578
Akrone1dbc382016-07-08 22:24:52 +0200579 my $filename = catfile(
Akron7d4cdd82016-08-17 21:39:45 +0200580 $output,
581 get_file_name(
582 catfile($corpus, $doc, $text)
583 . '.json' . ($gzip ? '.gz' : '')
584 )
Akrone1dbc382016-07-08 22:24:52 +0200585 );
Akron941c1a62016-02-23 17:41:41 +0100586
587 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200588 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100589
Akron4c0cf312016-10-15 16:42:09 +0200590 # Create temporary file
591 $temp = File::Temp->newdir;
592
# TODO: Check if $filename exists at the beginning,
# because extraction can be horribly slow!
595
Akron941c1a62016-02-23 17:41:41 +0100596 # Extract from archive
Akron20807582016-10-26 17:11:34 +0200597 if ($archive->extract_text($dirs[$i], $temp)) {
Akron941c1a62016-02-23 17:41:41 +0100598
Akron7d4cdd82016-08-17 21:39:45 +0200599 # Create corpus directory
600 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100601
Akron7d4cdd82016-08-17 21:39:45 +0200602 # Temporary directory
603 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100604
Akron7d4cdd82016-08-17 21:39:45 +0200605 # Write file
Akron13d56622016-10-31 14:54:49 +0100606 if (my $return = $batch_file->process($dir => $filename)) {
Akron4c0cf312016-10-15 16:42:09 +0200607 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100608 $pool->finish(
609 0,
610 ["Processed " . $filename . ($return == -1 ? " - already existing" : ''), $temp]
611 );
612 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200613 }
614 else {
Akron4c0cf312016-10-15 16:42:09 +0200615 # Delete temporary file
616 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200617 };
Akron941c1a62016-02-23 17:41:41 +0100618 }
Akron7d4cdd82016-08-17 21:39:45 +0200619
620 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100621 else {
Akron4c0cf312016-10-15 16:42:09 +0200622 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100623 };
624 };
625 }
626
627 else {
628 print "Input is neither a directory nor an archive.\n\n";
629 };
630
631 $pool->wait_all_children;
632
Akron11c80302016-03-18 19:44:43 +0100633 # Delete cache file
634 unlink($cache_file) if $cache_delete;
635
print "Done.\n";

# The timer is only started if the input was a directory or an archive.
# Without this check, timediff() fails on the undefined start time.
print timestr(timediff(Benchmark->new, $t))."\n\n" if $t;
638}
639
640# Unknown command
641else {
642 warn "Unknown command '$cmd'.\n\n";
643 pod2usage(%ERROR_HASH);
644}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000645
646__END__
Akron941c1a62016-02-23 17:41:41 +0100647
648=pod
649
650=encoding utf8
651
652=head1 NAME
653
Akronf7ad89e2016-03-16 18:22:47 +0100654korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100655
656
657=head1 SYNOPSIS
658
Akrona76d8352016-10-27 16:27:32 +0200659 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +0100660
Akron2fd402b2016-10-27 21:26:48 +0200661
Akron941c1a62016-02-23 17:41:41 +0100662=head1 DESCRIPTION
663
664L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
665compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100666The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100667
668
669=head1 INSTALLATION
670
671The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
672
Akronaf386982016-10-12 00:33:25 +0200673 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +0100674
Akronc13a1702016-03-15 19:33:14 +0100675In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100676be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +0200677Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +0100679
680=head1 ARGUMENTS
681
Akrona76d8352016-10-27 16:27:32 +0200682 $ korapxml2krill -z --input <directory> --output <filename>
683
684Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +0200685It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +0200686
Akron941c1a62016-02-23 17:41:41 +0100687=over 2
688
689=item B<archive>
690
Akrona76d8352016-10-27 16:27:32 +0200691 $ korapxml2krill archive -z --input <directory|archive> --output <directory>
692
Akron2fd402b2016-10-27 21:26:48 +0200693Converts an archive of KorAP-XML documents. It expects a directory
694(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +0100695
696=item B<extract>
697
Akrona76d8352016-10-27 16:27:32 +0200698 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
699
700Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +0100701
702=back
703
704
705=head1 OPTIONS
706
707=over 2
708
Akrona76d8352016-10-27 16:27:32 +0200709=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +0100710
Akrona76d8352016-10-27 16:27:32 +0200711Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100712
Akron7606afa2016-10-25 16:23:49 +0200713Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +0100714document, while C<archive> expects a KorAP-XML corpus folder or a zip
715file to batch process multiple files.
716C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +0200717
Akrona76d8352016-10-27 16:27:32 +0200718C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200719that the first archive listed contains all primary data files
720and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200721
Akron7606afa2016-10-25 16:23:49 +0200722 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +0200723
Akron0c3e3752016-06-28 15:55:53 +0200724(The directory structure follows the base directory format,
725that may include a C<.> root folder.
726In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +0200727need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +0200729
Akron7606afa2016-10-25 16:23:49 +0200730To support zip files, a version of C<unzip> needs to be installed that is
731compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +0200732
Akron7606afa2016-10-25 16:23:49 +0200733B<The root folder switch using the hash sign is experimental and
734may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +0200735
Akron941c1a62016-02-23 17:41:41 +0100736=item B<--output|-o> <directory|file>
737
738Output folder for archive processing or
739document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100740writes to C<STDOUT> by default
741(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100742
743=item B<--overwrite|-w>
744
745Overwrite files that already exist.
746
Akron3741f8b2016-12-21 19:55:21 +0100747=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +0100748
749Define the default tokenization by specifying
750the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100751of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100752
Akron3741f8b2016-12-21 19:55:21 +0100753
754=item B<--base-sentences|-bs> <foundry>#<layer>
755
756Define the layer for base sentences.
757If given, this will be used instead of using C<Base#Sentences>.
758Currently C<DeReKo#Structure> is the only additional layer supported.
759
760 Defaults to unset.
761
762
763=item B<--base-paragraphs|-bp> <foundry>#<layer>
764
765Define the layer for base paragraphs.
766If given, this will be used instead of using C<Base#Paragraphs>.
767Currently C<DeReKo#Structure> is the only additional layer supported.
768
769 Defaults to unset.
770
771
Akron41ac10b2017-02-08 22:47:25 +0100772=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
773
774Define the layer for base pagebreaks.
775Currently C<DeReKo#Structure> is the only layer supported.
776
777 Defaults to unset.
778
779
Akron941c1a62016-02-23 17:41:41 +0100780=item B<--skip|-s> <foundry>[#<layer>]
781
Akronf7ad89e2016-03-16 18:22:47 +0100782Skip specific annotations by specifying the foundry
783(and optionally the layer with a C<#>-prefix),
784e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100785Can be set multiple times.
786
Akronc13a1702016-03-15 19:33:14 +0100787=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100788
Akronf7ad89e2016-03-16 18:22:47 +0100789Convert specific annotations by specifying the foundry
790(and optionally the layer with a C<#>-prefix),
791e.g. C<Mate> or C<Mate#Morpho>.
792Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100793
794=item B<--primary|-p>
795
Akronc13a1702016-03-15 19:33:14 +0100796Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100797Can be flagged using C<--no-primary> as well.
798This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100799
800=item B<--jobs|-j>
801
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100803for archive processing.
Akron11c80302016-03-18 19:44:43 +0100804Defaults to C<0> (everything runs in a single process).
Akronc11f7982017-02-21 21:20:14 +0100805Pass -1, and the value will be set automatically to 5
806times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +0100807This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100808
Akron35db6e32016-03-17 22:42:22 +0100809=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100810
Akron35db6e32016-03-17 22:42:22 +0100811Define the metadata parser to use. Defaults to C<I5>.
812Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
813This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100814
815=item B<--pretty|-y>
816
Akronc13a1702016-03-15 19:33:14 +0100817Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100818This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100819
820=item B<--gzip|-z>
821
Akronf7ad89e2016-03-16 18:22:47 +0100822Compress the output.
823Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100824
Akron11c80302016-03-18 19:44:43 +0100825=item B<--cache|-c>
826
827File to mmap a cache (using L<Cache::FastMmap>).
828Defaults to C<korapxml2krill.cache> in the calling directory.
829
830=item B<--cache-size|-cs>
831
832Size of the cache. Defaults to C<50m>.
833
834=item B<--cache-init|-ci>
835
836Initialize cache file.
837Can be flagged using C<--no-cache-init> as well.
838Defaults to C<true>.
839
840=item B<--cache-delete|-cd>
841
842Delete cache file after processing.
843Can be flagged using C<--no-cache-delete> as well.
844Defaults to C<true>.
845
Akrone10ad322016-02-27 10:54:26 +0100846=item B<--sigle|-sg>
847
Akron20807582016-10-26 17:11:34 +0200848Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +0100849Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100850I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200851Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +0200852In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +0200853On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +0100854
Akron941c1a62016-02-23 17:41:41 +0100855=item B<--log|-l>
856
The L<Log::Log4perl> log level, defaults to C<ERROR>.
858
859=item B<--help|-h>
860
861Print this document.
862
863=item B<--version|-v>
864
865Print version information.
866
867=back
868
Akronc13a1702016-03-15 19:33:14 +0100869=head1 ANNOTATION SUPPORT
870
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
875
Akronf7ad89e2016-03-16 18:22:47 +0100876=over 2
Akronc13a1702016-03-15 19:33:14 +0100877
878=item B<Base>
879
880=over 4
881
Akronf7ad89e2016-03-16 18:22:47 +0100882=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100883
Akronf7ad89e2016-03-16 18:22:47 +0100884=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100885
886=back
887
888=item B<Connexor>
889
890=over 4
891
Akronf7ad89e2016-03-16 18:22:47 +0100892=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100893
Akronf7ad89e2016-03-16 18:22:47 +0100894=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100895
Akronf7ad89e2016-03-16 18:22:47 +0100896=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100897
Akronf7ad89e2016-03-16 18:22:47 +0100898=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100899
900=back
901
902=item B<CoreNLP>
903
904=over 4
905
Akronf7ad89e2016-03-16 18:22:47 +0100906=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100907
Akronf7ad89e2016-03-16 18:22:47 +0100908=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100909
Akronf7ad89e2016-03-16 18:22:47 +0100910=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100911
Akronf7ad89e2016-03-16 18:22:47 +0100912=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100913
914=back
915
916=item B<DeReKo>
917
918=over 4
919
Akronf7ad89e2016-03-16 18:22:47 +0100920=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100921
922=back
923
924=item B<Glemm>
925
926=over 4
927
Akronf7ad89e2016-03-16 18:22:47 +0100928=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100929
930=back
931
932=item B<Mate>
933
934=over 4
935
Akronf7ad89e2016-03-16 18:22:47 +0100936=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100937
Akronf7ad89e2016-03-16 18:22:47 +0100938=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100939
940=back
941
942=item B<OpenNLP>
943
944=over 4
945
Akronf7ad89e2016-03-16 18:22:47 +0100946=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100947
Akronf7ad89e2016-03-16 18:22:47 +0100948=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100949
950=back
951
952=item B<Sgbr>
953
954=over 4
955
Akronf7ad89e2016-03-16 18:22:47 +0100956=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100957
Akronf7ad89e2016-03-16 18:22:47 +0100958=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100959
960=back
961
Akron4fa37c32017-01-20 14:43:10 +0100962=item B<DRuKoLa>
963
964=over 4
965
966=item #Morpho
967
968=back
969
Akronc13a1702016-03-15 19:33:14 +0100970=item B<TreeTagger>
971
972=over 4
973
Akronf7ad89e2016-03-16 18:22:47 +0100974=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100975
Akronf7ad89e2016-03-16 18:22:47 +0100976=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100977
978=back
979
980=item B<XIP>
981
982=over 4
983
Akronf7ad89e2016-03-16 18:22:47 +0100984=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100985
Akronf7ad89e2016-03-16 18:22:47 +0100986=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100987
Akronf7ad89e2016-03-16 18:22:47 +0100988=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100989
990=back
991
992=back
993
994More importers are in preparation.
995New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
996See the built-in annotation importers as examples.
997
Akron941c1a62016-02-23 17:41:41 +0100998=head1 AVAILABILITY
999
1000 https://github.com/KorAP/KorAP-XML-Krill
1001
1002
1003=head1 COPYRIGHT AND LICENSE
1004
Akron3ec0a1c2017-01-18 14:41:55 +01001005Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001006
Akron941c1a62016-02-23 17:41:41 +01001007Author: L<Nils Diewald|http://nils-diewald.de/>
Akrona76d8352016-10-27 16:27:32 +02001008Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001009
1010L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1011Corpus Analysis Platform at the
1012L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
1013member of the
1014L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1015
1016This program is free software published under the
1017L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1018
1019=cut