Fix memory leak by using Path::Iterator::Rule
Change-Id: I16967fb0801e2be0491e61c6dfaa28a4ebc4b742
diff --git a/Changes b/Changes
index 5914da4..c95879a 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+0.59 2024-11-14
+ - Use Path::Iterator::Rule instead of Mojo::File.
+
0.58 2024-09-11
- Remove Directory::Iterator and replace it with Mojo::File.
- Add performance hint.
diff --git a/Makefile.PL b/Makefile.PL
index 8c56aff..f4e3025 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -41,6 +41,7 @@
'Config::Simple' => 4.58,
'String::Random' => 0.32,
'File::Path' => 2.18,
+ 'Path::Iterator::Rule' => 1.015,
'Archive::Tar' => 2.40,
'Archive::Tar::Stream' => 0.02,
'Clone' => 0.45,
diff --git a/lib/KorAP/XML/ForkPool.pm b/lib/KorAP/XML/ForkPool.pm
index b3453ce..e6d407c 100644
--- a/lib/KorAP/XML/ForkPool.pm
+++ b/lib/KorAP/XML/ForkPool.pm
@@ -1,7 +1,7 @@
package KorAP::XML::ForkPool;
use strict;
use warnings;
-use Mojo::File;
+use Path::Iterator::Rule;
use Parallel::ForkManager;
use v5.10;
@@ -54,15 +54,19 @@
my @dirs;
- Mojo::File->new($input[0])
- ->list_tree({hidden => 0, dir => 0})
- ->grep(qr/\/data\.xml$/)
- ->each(
- sub {
- s/\/data\.xml$//;
- push @dirs, $_;
- }
- );
+ my $rule = Path::Iterator::Rule->new;
+ $rule->name('data.xml')->file; # match only regular files named data.xml
+
+ my $next = $rule->iter(
+ $input[0] => { # search root: first input path, as in the replaced Mojo::File code
+ sorted => 0, # skip per-directory sorting of entries
+ depthfirst => -1, # pre-order depth-first traversal
+ error_handler => undef
+ });
+ while (defined(my $file = $next->())) {
+ $file =~ s/\/data\.xml$//; # keep the directory that contains data.xml
+ push @dirs, $file;
+ };
$self->{count} = scalar @dirs;
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index d7f4d58..a213982 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -16,7 +16,7 @@
our @EXPORT_OK = qw(get_file_name get_file_name_from_glob);
-our $VERSION = '0.58';
+our $VERSION = '0.59';
has 'path';
has [qw/text_sigle doc_sigle corpus_sigle/];
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index ff60469..4c82bc5 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -46,6 +46,7 @@
# Check if token is emoji
sub is_emoji {
+ # return $_[0] =~ m{^\p{gc:S}\p{gc:M}*?(\x{200D}\p{gc:S}\p{gc:M}*?)*$}i;
return $_[0] =~ m{^(\p{Extended_Pictographic}+\p{Emoji_Modifier}*)$}i;
};
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 30f3384..a40d75a 100755
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -19,12 +19,12 @@
use KorAP::XML::Tokenizer;
use KorAP::XML::Batch::File;
use Config::Simple;
+use Path::Iterator::Rule;
use Parallel::ForkManager;
use File::Glob ':bsd_glob';
use File::Temp qw/tempdir/;
use File::Path qw(remove_tree make_path);
use File::Basename;
-use Mojo::File;
use Mojo::Collection 'c';
use String::Random qw(random_string);
use IO::File;
@@ -178,7 +178,7 @@
# - Improve core count logging.
# ----------------------------------------------------------
-our $LAST_CHANGE = '2024/06/05';
+our $LAST_CHANGE = '2024/11/14';
our $LOCAL = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG = <<"VERSION";
@@ -879,15 +879,18 @@
my @dirs;
- Mojo::File->new($input[0])
- ->list_tree({hidden => 0, dir => 0})
- ->grep(qr/\/data\.xml$/)
- ->each(
- sub {
- s/\/data\.xml$//;
- push @dirs, $_;
- }
- );
+ my $rule = Path::Iterator::Rule->new;
+ $rule->name('data.xml')->file;
+ my $next = $rule->iter(
+ $input[0] => {
+ sorted => 0,
+ depthfirst => -1,
+ error_handler => undef
+ });
+ while (defined(my $file = $next->())) {
+ $file =~ s/\/data\.xml$//;
+ push @dirs, $file;
+ };
print "Start processing ...\n" unless $q;
$t = Benchmark->new;