Add extract_sigle method to archive
Change-Id: Ic2eb6578c6e8fb57e2191f685a000fd712ec463d
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 3530288..d53004b 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -691,72 +691,75 @@
# Add further annotation archived
$archive->attach($_) foreach @input[1..$#input];
- my $prefix = 1;
+ # Will set @sigle
+ my $prefix = set_sigle($archive);
- # No sigles given
- unless (@sigle) {
-
- # Get files
- foreach ($archive->list_texts) {
-
- # Split path information
- ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
-
- # TODO: Make this OS independent
- push @sigle, join '/', $corpus, $doc, $text;
- };
- }
-
- # Check sigle for doc sigles
- else {
- my @new_sigle;
-
- my $prefix_check = 0;
-
- # Iterate over all sigle
- foreach (@sigle) {
-
- # Sigle is a doc sigle
- if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
-
- print "$_ ...";
- # Check if a prefix is needed
- unless ($prefix_check) {
-
- if ($prefix = $archive->check_prefix) {
- print " with prefix ...";
- };
- $prefix_check = 1;
- };
-
- print "\n";
-
- # TODO: Make this OS independent
- my $path = ($prefix ? './' : '') . $_;
-
- print '... ' . (
- $archive->extract_doc(
- $path, $output, $sequential_extraction ? 1 : $jobs
- ) ? '' : 'not '
- );
- print "extracted.\n";
- }
-
- # Sigle is a text sigle
- else {
- push @new_sigle, $_;
-
- unless ($prefix_check) {
-
- if ($prefix = $archive->check_prefix) {
- print " with prefix ...";
- };
- $prefix_check = 1;
- };
- };
- };
- @sigle = @new_sigle;
- };
+# my $prefix = 1;
+#
+# # No sigles given
+# unless (@sigle) {
+#
+# # Get files
+# foreach ($archive->list_texts) {
+#
+# # Split path information
+# ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
+#
+# # TODO: Make this OS independent
+# push @sigle, join '/', $corpus, $doc, $text;
+# };
+# }
+#
+# # Check sigle for doc sigles
+# else {
+# my @new_sigle;
+#
+# my $prefix_check = 0;
+#
+# # Iterate over all sigle
+# foreach (@sigle) {
+#
+# # Sigle is a doc sigle
+# if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
+#
+# print "$_ ...";
+# # Check if a prefix is needed
+# unless ($prefix_check) {
+#
+# if ($prefix = $archive->check_prefix) {
+# print " with prefix ...";
+# };
+# $prefix_check = 1;
+# };
+#
+# print "\n";
+#
+# # TODO: Make this OS independent
+# my $path = ($prefix ? './' : '') . $_;
+#
+# print '... ' . (
+# $archive->extract_doc(
+# $path, $output, $sequential_extraction ? 1 : $jobs
+# ) ? '' : 'not '
+# );
+# print "extracted.\n";
+# }
+#
+# # Sigle is a text sigle
+# else {
+# push @new_sigle, $_;
+#
+# unless ($prefix_check) {
+#
+# if ($prefix = $archive->check_prefix) {
+# print " with prefix ...";
+# };
+# $prefix_check = 1;
+# };
+# };
+# };
+# @sigle = @new_sigle;
+# };
# Iterate over all given sigles and extract
foreach (@sigle) {
@@ -811,7 +814,7 @@
# Add some random extra to avoid clashes with multiple archives
$extract_dir = catdir($extract_dir, random_string('cccccc'));
- # Extract to temprary directory
+ # Extract to temporary directory
if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
@input = ($extract_dir);
}
@@ -829,8 +832,6 @@
};
};
- # TODO: Support sigles
-
# Zero means: everything runs in the parent process
my $pool = Parallel::ForkManager->new($jobs);
@@ -971,6 +972,9 @@
# Add further annotation archived
$archive->attach($_) foreach @input[1..$#input];
+ # Get sigles to extract
+ my $prefix = set_sigle($archive);
+
print "Start processing ...\n";
$t = Benchmark->new;
my @dirs = $archive->list_texts;
@@ -1056,6 +1060,86 @@
};
+# Build the list of sigles to process for an archive.
+#
+# Reads and rewrites the file-level @sigle list:
+# - If @sigle is empty, it is filled with one "corpus/doc/text" entry
+#   per path returned by $archive->list_texts.
+# - Otherwise every doc sigle (two path segments, optionally led by
+#   "./" or ".\") is extracted right away with $archive->extract_doc
+#   into $output, and only the remaining text sigles stay in @sigle.
+#
+# Returns the prefix flag: the first value of $archive->split_path for
+# the last listed text, or the result of $archive->check_prefix when
+# sigles were given. It stays 1 when no sigles were given and the
+# archive lists no texts.
+sub set_sigle {
+  my $archive = shift;
+
+  my $prefix = 1;
+
+  # No sigles given
+  unless (@sigle) {
+
+    # Collect a sigle for every text in the archive
+    foreach ($archive->list_texts) {
+
+      # Split path information
+      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
+
+      # TODO: Make this OS independent
+      push @sigle, join '/', $corpus, $doc, $text;
+    };
+  }
+
+  # Check sigle for doc sigles
+  else {
+    my @new_sigle;
+
+    # check_prefix is only asked once, on the first sigle seen
+    my $prefix_check = 0;
+
+    # Iterate over all sigles
+    foreach (@sigle) {
+
+      # Sigle is a doc sigle
+      if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
+
+        print "$_ ...";
+        # Check if a prefix is needed
+        unless ($prefix_check) {
+
+          if ($prefix = $archive->check_prefix) {
+            print " with prefix ...";
+          };
+          $prefix_check = 1;
+        };
+
+        print "\n";
+
+        # TODO: Make this OS independent
+        my $path = ($prefix ? './' : '') . $_;
+
+        # Extract the whole document now; it is not kept in @sigle
+        print '... ' . (
+          $archive->extract_doc(
+            $path, $output, $sequential_extraction ? 1 : $jobs
+          ) ? '' : 'not '
+        );
+        print "extracted.\n";
+      }
+
+      # Sigle is a text sigle
+      else {
+        push @new_sigle, $_;
+
+        unless ($prefix_check) {
+
+          if ($prefix = $archive->check_prefix) {
+            print " with prefix ...";
+          };
+          $prefix_check = 1;
+        };
+      };
+    };
+
+    # Only text sigles remain to be processed by the caller
+    @sigle = @new_sigle;
+  };
+
+  return $prefix;
+};
+
+
+
# Cleanup temporary extraction directory
if ($extract_dir) {
my $objects = remove_tree($extract_dir, { safe => 1 });
@@ -1344,8 +1428,8 @@
Supported parameters are:
C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
-C<output>,
-C<temp-extract>, C<sequential-extraction>,
+C<output>, C<koral>,
+C<temporary-extract>, C<sequential-extraction>,
C<base-sentences>, C<base-paragraphs>,
C<base-pagebreaks>,
C<skip> (semicolon separated), C<sigle>