Start splitting script file for better testing
Change-Id: I072f270db2328c01634f4626062635b60dd5844d
diff --git a/lib/KorAP/XML/ForkPool.pm b/lib/KorAP/XML/ForkPool.pm
new file mode 100644
index 0000000..e60135f
--- /dev/null
+++ b/lib/KorAP/XML/ForkPool.pm
@@ -0,0 +1,174 @@
+package KorAP::XML::ForkPool;
+use strict;
+use warnings;
+use Parallel::ForkManager;
+use Directory::Iterator;
+use File::Spec::Functions qw(catfile catdir);
+use File::Temp;
+
+
+# Construct a new fork pool
+# Create a new fork pool object.
+#
+# Accepted named parameters:
+#   jobs      - number of parallel jobs (0 = run in parent process)
+#   overwrite - overwrite already existing output files
+#   output    - output directory
+#   cache     - cache object passed through to the processors
+sub new {
+  my ($class, %param) = @_;
+
+  my $self = {
+    jobs      => $param{jobs} // 0,
+    iter      => 1, # Current text in process
+    overwrite => $param{overwrite},
+    output    => $param{output},
+    cache     => $param{cache}
+  };
+
+  return bless $self, $class;
+};
+
+# Create a new Parallel::ForkManager pool and attach a
+# reporting callback that prints one line per processed text.
+sub new_pool {
+  my $self = shift;
+
+  # Zero means: everything runs in the parent process
+  my $pool = Parallel::ForkManager->new($self->{jobs});
+
+  # Report per processed text
+  $pool->run_on_finish(
+    sub {
+      my ($pid, $code) = @_;
+
+      # Last callback argument is a reference to the message string
+      my $data = pop;
+
+      print 'Convert [' . ($self->{jobs} > 0 ? "\$$pid:" : '') .
+        ($self->{iter}++) . '/' . $self->{count} . ']';
+
+      # Parenthesize the whole argument list:
+      # "print (...) . '...'" passes only the parenthesized part
+      # to print and silently discards the concatenated rest
+      print(($code ? " $code" : '') . " $$data\n");
+    }
+  );
+
+  return $pool;
+};
+
+# Process all texts below an input directory:
+# collect every text directory (identified by a data.xml file)
+# and convert the texts in parallel using the fork pool.
+sub process_directory {
+  my $self = shift;
+  my $input = shift;
+
+  my $pool = $self->new_pool;
+
+  print "Reading data ...\n";
+
+  # Collect all directories that contain a data.xml file
+  my $it = Directory::Iterator->new($input);
+  my @dirs;
+  my $dir;
+
+  while (1) {
+    if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
+      push @dirs, $dir;
+      $it->prune;
+    };
+    last unless $it->next;
+  };
+
+  $self->{count} = scalar @dirs;
+
+  DIRECTORY_LOOP:
+  # Use the object attributes throughout - the bare variables
+  # $count, $output, $gzip and $iter were never declared and
+  # do not compile under "use strict"
+  for (my $i = 0; $i < $self->{count}; $i++) {
+
+    unless ($self->{overwrite}) {
+      my $filename = catfile(
+        $self->{output},
+        get_file_name($dirs[$i]) . '.json' . ($self->{gzip} ? '.gz' : '')
+      );
+
+      if (-e $filename) {
+        $self->{iter}++;
+        print "Skip $filename\n";
+        next;
+      };
+    };
+
+    # Get the next fork
+    my $pid = $pool->start and next DIRECTORY_LOOP;
+    my $msg = write_file($dirs[$i]);
+    $pool->finish(0, \$msg);
+  };
+
+  $pool->wait_all_children;
+
+  # Delete cache file
+  # NOTE(review): gzip, cache_file and cache_delete are not set
+  # by the constructor yet - they still live in the calling
+  # script, as do get_file_name() and write_file(); confirm they
+  # are passed in before this module is used stand-alone
+  unlink($self->{cache_file}) if $self->{cache_delete};
+};
+
+
+# Process all texts stored in a (zip) archive:
+# attach additional annotation archives, list all texts,
+# and convert them in parallel using the fork pool.
+sub process_archive {
+  my $self = shift;
+  my $archive = shift;
+  my @input = @_;
+
+  # The archive handler requires a compatible unzip binary
+  unless ($archive->test_unzip) {
+    print "Unzip is not installed or incompatible.\n\n";
+    exit(1);
+  };
+
+  # Add further annotation archives
+  $archive->attach($_) foreach @input;
+
+  print "Start processing ...\n";
+
+  my @dirs = $archive->list_texts;
+  $self->{count} = scalar @dirs;
+
+  # Create new pool
+  my $pool = $self->new_pool;
+
+  ARCHIVE_LOOP:
+  # Use the object attributes throughout - the bare variables
+  # $count, $output, $gzip and $iter were never declared and
+  # do not compile under "use strict"
+  for (my $i = 0; $i < $self->{count}; $i++) {
+
+    # Split path information
+    my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
+
+    unless ($self->{overwrite}) {
+
+      # Append the '.json'/'.gz' suffix outside the
+      # get_file_name() call, consistent with process_directory
+      my $filename = catfile(
+        $self->{output},
+        get_file_name(
+          catfile($corpus, $doc, $text)
+        ) . '.json' . ($self->{gzip} ? '.gz' : '')
+      );
+
+      if (-e $filename) {
+        $self->{iter}++;
+        print "Skip $filename\n";
+        next;
+      };
+    };
+
+    # Get the next fork
+    my $pid = $pool->start and next ARCHIVE_LOOP;
+
+    # Create temporary extraction directory
+    my $temp = File::Temp->newdir;
+
+    my $msg;
+
+    # Extract from archive
+    if ($archive->extract($dirs[$i], $temp)) {
+
+      # Corpus directory inside the extraction directory
+      my $input = catdir("$temp", $corpus);
+
+      # Text directory
+      my $dir = catdir($input, $doc, $text);
+
+      # Write file
+      $msg = write_file($dir);
+      $temp = undef; # Remove the temporary directory
+      $pool->finish(0, \$msg);
+    }
+    else {
+      $temp = undef;
+      $msg = "Unable to extract " . $dirs[$i] . "\n";
+      $pool->finish(1, \$msg);
+    };
+  };
+
+  $pool->wait_all_children;
+
+  # Delete cache file
+  # NOTE(review): gzip, cache_file and cache_delete are not set
+  # by the constructor yet - they still live in the calling
+  # script, as do get_file_name() and write_file(); confirm they
+  # are passed in before this module is used stand-alone
+  unlink($self->{cache_file}) if $self->{cache_delete};
+};
+
+
+1;
diff --git a/lib/KorAP/XML/ProcessFile.pm b/lib/KorAP/XML/ProcessFile.pm
new file mode 100644
index 0000000..b3adeab
--- /dev/null
+++ b/lib/KorAP/XML/ProcessFile.pm
@@ -0,0 +1,137 @@
+package KorAP::XML::ProcessFile;
+use KorAP::XML::Krill;
+use Log::Log4perl;
+use strict;
+use warnings;
+
+# Construct a new file processor.
+#
+# Accepted named parameters:
+#   cache     - cache object passed to KorAP::XML::Krill
+#   meta      - meta data type (defaults to 'I5')
+#   output    - output directory
+#   overwrite - overwrite already existing output files
+#   foundry   - base tokenization foundry (defaults to 'Base')
+#   layer     - base tokenization layer (defaults to 'Tokens')
+#   skip      - hash ref of lower-cased "foundry" or
+#               "foundry#layer" keys to skip; '#all' skips all
+#   anno      - array ref of "Foundry#Layer" strings to use
+#               explicitly when everything else is skipped
+#   log       - Log::Log4perl logger (defaults to 'main')
+sub new {
+  my $class = shift;
+  my %param = @_;
+
+  # All supported annotation layers
+  my @layers = (
+    # Base
+    ['Base', 'Sentences'],
+    ['Base', 'Paragraphs'],
+
+    # Connexor
+    ['Connexor', 'Morpho'],
+    ['Connexor', 'Syntax'],
+    ['Connexor', 'Phrase'],
+    ['Connexor', 'Sentences'],
+
+    # CoreNLP
+    ['CoreNLP', 'NamedEntities'],
+    ['CoreNLP', 'Sentences'],
+    ['CoreNLP', 'Morpho'],
+    ['CoreNLP', 'Constituency'],
+
+    # DeReKo
+    ['DeReKo', 'Structure'],
+
+    # Glemm
+    ['Glemm', 'Morpho'],
+
+    # Malt
+    ['Malt', 'Dependency'],
+
+    # MDParser
+    ['MDParser', 'Dependency'],
+
+    # Mate
+    ['Mate', 'Morpho'],
+    ['Mate', 'Dependency'],
+
+    # OpenNLP
+    ['OpenNLP', 'Morpho'],
+    ['OpenNLP', 'Sentences'],
+
+    # Schreibgebrauch
+    ['Sgbr', 'Lemma'],
+    ['Sgbr', 'Morpho'],
+
+    # TreeTagger
+    ['TreeTagger', 'Morpho'],
+    ['TreeTagger', 'Sentences'],
+
+    # XIP
+    ['XIP', 'Morpho'],
+    ['XIP', 'Constituency'],
+    ['XIP', 'Sentences'],
+    ['XIP', 'Dependency']
+  );
+
+  my @anno;
+  my $skip = $param{skip};
+
+  # Check for complete skipping:
+  # use only the explicitly requested annotations
+  if ($skip->{'#all'}) {
+    # @{$param{anno}}: the former @$param{anno} sliced an
+    # undeclared variable $param and fails under "use strict"
+    foreach (@{$param{anno}}) {
+      push @anno, [split('#', $_)];
+    };
+  }
+
+  # Iterate over all layers
+  else {
+    # Add to index file - respect skipping
+    foreach my $info (@layers) {
+
+      # Skip if Foundry or Foundry#Layer should be skipped
+      unless ($skip->{lc($info->[0])} || $skip->{lc($info->[0]) . '#' . lc($info->[1])}) {
+        push @anno, $info;
+      };
+    };
+  };
+
+  bless {
+    cache     => $param{cache} // undef,
+    meta      => $param{meta} // 'I5',
+    # Fixed key: was misspelled as "outpu"
+    output    => $param{output},
+    overwrite => $param{overwrite},
+    foundry   => $param{foundry} // 'Base',
+    layer     => $param{layer} // 'Tokens',
+    anno      => \@anno,
+    log       => $param{log} // Log::Log4perl->get_logger('main')
+  }, $class;
+};
+
+
+# Convert a single KorAP-XML text directory to a Krill document.
+#
+# Parameters:
+#   $input  - path to the text directory
+#   $output - output file name (currently only used in messages)
+#
+# Returns nothing on failure; logs a warning or error instead.
+sub process {
+  my $self = shift;
+  my $input = shift;
+  my $output = shift;
+
+  # Ensure a trailing slash on the input path
+  $input =~ s{([^/])$}{$1/};
+
+  # Create and parse new document
+  my $doc = KorAP::XML::Krill->new(
+    path => $input,
+    meta_type => $self->{meta},
+    cache => $self->{cache}
+  );
+
+  # Parse document
+  unless ($doc->parse) {
+    # Use the logger attribute - the bare $log was never
+    # declared and does not compile under "use strict"
+    $self->{log}->warn($output . " can't be processed - no document data");
+    return;
+  };
+
+  # Get tokenization
+  # NOTE(review): KorAP::XML::Tokenizer is not loaded by this
+  # package yet - add a "use" statement to the preamble
+  my $tokens = KorAP::XML::Tokenizer->new(
+    path => $doc->path,
+    doc => $doc,
+    foundry => $self->{foundry},
+    layer => $self->{layer},
+    name => 'tokens'
+  );
+
+  # Unable to process base tokenization
+  unless ($tokens->parse) {
+    $self->{log}->error($output . " can't be processed - no base tokenization");
+    return;
+  };
+
+  # Add all configured annotation layers
+  foreach (@{$self->{anno}}) {
+    $tokens->add(@$_);
+  };
+
+# TODO: Go on here with my $file; my $print_text
+};
+
+1;
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 5e9cc38..53f0765 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -59,9 +59,12 @@
# - Added multi archive support
# - Added prefix negation support
# - Added Malt#Dependency support
+#
+# 2016/07/06
+# - Added MDParser#Dependency
# ----------------------------------------------------------
-our $LAST_CHANGE = '2016/03/17';
+our $LAST_CHANGE = '2016/07/06';
our $LOCAL = $FindBin::Bin;
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
@@ -270,6 +273,9 @@
# Malt
push(@layers, ['Malt', 'Dependency']);
+ # MDParser
+ push(@layers, ['MDParser', 'Dependency']);
+
# Mate
push(@layers, ['Mate', 'Morpho']);
push(@layers, ['Mate', 'Dependency']);