Start splitting script file for better testing
Change-Id: I072f270db2328c01634f4626062635b60dd5844d
diff --git a/lib/KorAP/XML/ForkPool.pm b/lib/KorAP/XML/ForkPool.pm
new file mode 100644
index 0000000..e60135f
--- /dev/null
+++ b/lib/KorAP/XML/ForkPool.pm
@@ -0,0 +1,174 @@
+package KorAP::XML::ForkPool;
+use strict;
+use warnings;
+use Parallel::ForkManager;
+use Directory::Iterator;
+use File::Spec::Functions qw(catfile catdir);
+use File::Temp;
+
+
+# Construct a new fork pool
+# Create a new fork pool object.
+#
+# Accepted named parameters:
+#   jobs      - number of parallel jobs (0 = run in parent process)
+#   overwrite - overwrite already existing output files
+#   output    - output directory
+#   cache     - cache object passed through to the processors
+sub new {
+  my ($class, %param) = @_;
+
+  my $self = {
+    jobs      => $param{jobs} // 0,
+    iter      => 1, # Current text in process
+    overwrite => $param{overwrite},
+    output    => $param{output},
+    cache     => $param{cache}
+  };
+
+  return bless $self, $class;
+};
+
+# Create a new Parallel::ForkManager pool and attach a
+# reporting callback that prints one line per processed text.
+sub new_pool {
+  my $self = shift;
+
+  # Zero means: everything runs in the parent process
+  my $pool = Parallel::ForkManager->new($self->{jobs});
+
+  # Report per processed text
+  $pool->run_on_finish(
+    sub {
+      my ($pid, $code) = @_;
+
+      # Last callback argument is a reference to the message string
+      my $data = pop;
+
+      print 'Convert [' . ($self->{jobs} > 0 ? "\$$pid:" : '') .
+        ($self->{iter}++) . '/' . $self->{count} . ']';
+
+      # Parenthesize the whole argument list:
+      # "print (...) . '...'" passes only the parenthesized part
+      # to print and silently discards the concatenated rest
+      print(($code ? " $code" : '') . " $$data\n");
+    }
+  );
+
+  return $pool;
+};
+
+# Process all texts below an input directory:
+# collect every text directory (identified by a data.xml file)
+# and convert the texts in parallel using the fork pool.
+sub process_directory {
+  my $self = shift;
+  my $input = shift;
+
+  my $pool = $self->new_pool;
+
+  print "Reading data ...\n";
+
+  # Collect all directories that contain a data.xml file
+  my $it = Directory::Iterator->new($input);
+  my @dirs;
+  my $dir;
+
+  while (1) {
+    if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
+      push @dirs, $dir;
+      $it->prune;
+    };
+    last unless $it->next;
+  };
+
+  $self->{count} = scalar @dirs;
+
+  DIRECTORY_LOOP:
+  # Use the object attributes throughout - the bare variables
+  # $count, $output, $gzip and $iter were never declared and
+  # do not compile under "use strict"
+  for (my $i = 0; $i < $self->{count}; $i++) {
+
+    unless ($self->{overwrite}) {
+      my $filename = catfile(
+        $self->{output},
+        get_file_name($dirs[$i]) . '.json' . ($self->{gzip} ? '.gz' : '')
+      );
+
+      if (-e $filename) {
+        $self->{iter}++;
+        print "Skip $filename\n";
+        next;
+      };
+    };
+
+    # Get the next fork
+    my $pid = $pool->start and next DIRECTORY_LOOP;
+    my $msg = write_file($dirs[$i]);
+    $pool->finish(0, \$msg);
+  };
+
+  $pool->wait_all_children;
+
+  # Delete cache file
+  # NOTE(review): gzip, cache_file and cache_delete are not set
+  # by the constructor yet - they still live in the calling
+  # script, as do get_file_name() and write_file(); confirm they
+  # are passed in before this module is used stand-alone
+  unlink($self->{cache_file}) if $self->{cache_delete};
+};
+
+
+# Process all texts stored in a (zip) archive:
+# attach additional annotation archives, list all texts,
+# and convert them in parallel using the fork pool.
+sub process_archive {
+  my $self = shift;
+  my $archive = shift;
+  my @input = @_;
+
+  # The archive handler requires a compatible unzip binary
+  unless ($archive->test_unzip) {
+    print "Unzip is not installed or incompatible.\n\n";
+    exit(1);
+  };
+
+  # Add further annotation archives
+  $archive->attach($_) foreach @input;
+
+  print "Start processing ...\n";
+
+  my @dirs = $archive->list_texts;
+  $self->{count} = scalar @dirs;
+
+  # Create new pool
+  my $pool = $self->new_pool;
+
+  ARCHIVE_LOOP:
+  # Use the object attributes throughout - the bare variables
+  # $count, $output, $gzip and $iter were never declared and
+  # do not compile under "use strict"
+  for (my $i = 0; $i < $self->{count}; $i++) {
+
+    # Split path information
+    my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
+
+    unless ($self->{overwrite}) {
+
+      # Append the '.json'/'.gz' suffix outside the
+      # get_file_name() call, consistent with process_directory
+      my $filename = catfile(
+        $self->{output},
+        get_file_name(
+          catfile($corpus, $doc, $text)
+        ) . '.json' . ($self->{gzip} ? '.gz' : '')
+      );
+
+      if (-e $filename) {
+        $self->{iter}++;
+        print "Skip $filename\n";
+        next;
+      };
+    };
+
+    # Get the next fork
+    my $pid = $pool->start and next ARCHIVE_LOOP;
+
+    # Create temporary extraction directory
+    my $temp = File::Temp->newdir;
+
+    my $msg;
+
+    # Extract from archive
+    if ($archive->extract($dirs[$i], $temp)) {
+
+      # Corpus directory inside the extraction directory
+      my $input = catdir("$temp", $corpus);
+
+      # Text directory
+      my $dir = catdir($input, $doc, $text);
+
+      # Write file
+      $msg = write_file($dir);
+      $temp = undef; # Remove the temporary directory
+      $pool->finish(0, \$msg);
+    }
+    else {
+      $temp = undef;
+      $msg = "Unable to extract " . $dirs[$i] . "\n";
+      $pool->finish(1, \$msg);
+    };
+  };
+
+  $pool->wait_all_children;
+
+  # Delete cache file
+  # NOTE(review): gzip, cache_file and cache_delete are not set
+  # by the constructor yet - they still live in the calling
+  # script, as do get_file_name() and write_file(); confirm they
+  # are passed in before this module is used stand-alone
+  unlink($self->{cache_file}) if $self->{cache_delete};
+};
+
+
+1;
diff --git a/lib/KorAP/XML/ProcessFile.pm b/lib/KorAP/XML/ProcessFile.pm
new file mode 100644
index 0000000..b3adeab
--- /dev/null
+++ b/lib/KorAP/XML/ProcessFile.pm
@@ -0,0 +1,137 @@
+package KorAP::XML::ProcessFile;
+use KorAP::XML::Krill;
+use Log::Log4perl;
+use strict;
+use warnings;
+
+# Construct a new file processor.
+#
+# Accepted named parameters:
+#   cache     - cache object passed to KorAP::XML::Krill
+#   meta      - meta data type (defaults to 'I5')
+#   output    - output directory
+#   overwrite - overwrite already existing output files
+#   foundry   - base tokenization foundry (defaults to 'Base')
+#   layer     - base tokenization layer (defaults to 'Tokens')
+#   skip      - hash ref of lower-cased "foundry" or
+#               "foundry#layer" keys to skip; '#all' skips all
+#   anno      - array ref of "Foundry#Layer" strings to use
+#               explicitly when everything else is skipped
+#   log       - Log::Log4perl logger (defaults to 'main')
+sub new {
+  my $class = shift;
+  my %param = @_;
+
+  # All supported annotation layers
+  my @layers = (
+    # Base
+    ['Base', 'Sentences'],
+    ['Base', 'Paragraphs'],
+
+    # Connexor
+    ['Connexor', 'Morpho'],
+    ['Connexor', 'Syntax'],
+    ['Connexor', 'Phrase'],
+    ['Connexor', 'Sentences'],
+
+    # CoreNLP
+    ['CoreNLP', 'NamedEntities'],
+    ['CoreNLP', 'Sentences'],
+    ['CoreNLP', 'Morpho'],
+    ['CoreNLP', 'Constituency'],
+
+    # DeReKo
+    ['DeReKo', 'Structure'],
+
+    # Glemm
+    ['Glemm', 'Morpho'],
+
+    # Malt
+    ['Malt', 'Dependency'],
+
+    # MDParser
+    ['MDParser', 'Dependency'],
+
+    # Mate
+    ['Mate', 'Morpho'],
+    ['Mate', 'Dependency'],
+
+    # OpenNLP
+    ['OpenNLP', 'Morpho'],
+    ['OpenNLP', 'Sentences'],
+
+    # Schreibgebrauch
+    ['Sgbr', 'Lemma'],
+    ['Sgbr', 'Morpho'],
+
+    # TreeTagger
+    ['TreeTagger', 'Morpho'],
+    ['TreeTagger', 'Sentences'],
+
+    # XIP
+    ['XIP', 'Morpho'],
+    ['XIP', 'Constituency'],
+    ['XIP', 'Sentences'],
+    ['XIP', 'Dependency']
+  );
+
+  my @anno;
+  my $skip = $param{skip};
+
+  # Check for complete skipping:
+  # use only the explicitly requested annotations
+  if ($skip->{'#all'}) {
+    # @{$param{anno}}: the former @$param{anno} sliced an
+    # undeclared variable $param and fails under "use strict"
+    foreach (@{$param{anno}}) {
+      push @anno, [split('#', $_)];
+    };
+  }
+
+  # Iterate over all layers
+  else {
+    # Add to index file - respect skipping
+    foreach my $info (@layers) {
+
+      # Skip if Foundry or Foundry#Layer should be skipped
+      unless ($skip->{lc($info->[0])} || $skip->{lc($info->[0]) . '#' . lc($info->[1])}) {
+        push @anno, $info;
+      };
+    };
+  };
+
+  bless {
+    cache     => $param{cache} // undef,
+    meta      => $param{meta} // 'I5',
+    # Fixed key: was misspelled as "outpu"
+    output    => $param{output},
+    overwrite => $param{overwrite},
+    foundry   => $param{foundry} // 'Base',
+    layer     => $param{layer} // 'Tokens',
+    anno      => \@anno,
+    log       => $param{log} // Log::Log4perl->get_logger('main')
+  }, $class;
+};
+
+
+# Convert a single KorAP-XML text directory to a Krill document.
+#
+# Parameters:
+#   $input  - path to the text directory
+#   $output - output file name (currently only used in messages)
+#
+# Returns nothing on failure; logs a warning or error instead.
+sub process {
+  my $self = shift;
+  my $input = shift;
+  my $output = shift;
+
+  # Ensure a trailing slash on the input path
+  $input =~ s{([^/])$}{$1/};
+
+  # Create and parse new document
+  my $doc = KorAP::XML::Krill->new(
+    path => $input,
+    meta_type => $self->{meta},
+    cache => $self->{cache}
+  );
+
+  # Parse document
+  unless ($doc->parse) {
+    # Use the logger attribute - the bare $log was never
+    # declared and does not compile under "use strict"
+    $self->{log}->warn($output . " can't be processed - no document data");
+    return;
+  };
+
+  # Get tokenization
+  # NOTE(review): KorAP::XML::Tokenizer is not loaded by this
+  # package yet - add a "use" statement to the preamble
+  my $tokens = KorAP::XML::Tokenizer->new(
+    path => $doc->path,
+    doc => $doc,
+    foundry => $self->{foundry},
+    layer => $self->{layer},
+    name => 'tokens'
+  );
+
+  # Unable to process base tokenization
+  unless ($tokens->parse) {
+    $self->{log}->error($output . " can't be processed - no base tokenization");
+    return;
+  };
+
+  # Add all configured annotation layers
+  foreach (@{$self->{anno}}) {
+    $tokens->add(@$_);
+  };
+
+# TODO: Go on here with my $file; my $print_text
+};
+
+1;
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 5e9cc38..53f0765 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -59,9 +59,12 @@
# - Added multi archive support
# - Added prefix negation support
# - Added Malt#Dependency support
+#
+# 2016/07/06
+# - Added MDParser#Dependency
# ----------------------------------------------------------
-our $LAST_CHANGE = '2016/03/17';
+our $LAST_CHANGE = '2016/07/06';
our $LOCAL = $FindBin::Bin;
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
@@ -270,6 +273,9 @@
# Malt
push(@layers, ['Malt', 'Dependency']);
+ # MDParser
+ push(@layers, ['MDParser', 'Dependency']);
+
# Mate
push(@layers, ['Mate', 'Morpho']);
push(@layers, ['Mate', 'Dependency']);