Add -p|--progress option to show progress and ETA
Change-Id: I14d9225bae6ca3af2ccc129399f51f6440947f49
diff --git a/Changes b/Changes
index 7added2..03ccb1d 100644
--- a/Changes
+++ b/Changes
@@ -2,6 +2,7 @@
- Upgrade KorAP-Tokenizer to v2.3.0 (resolves issues with
gendersternchen after hyphens, emoji clusters, and Wikipedia templates).
- Upgrade Java dependency to 21.
+ - Add --progress option.
2.6.1 2025-04-16
- Fix ASCII entity resolution.
diff --git a/Makefile.PL b/Makefile.PL
index c915378..471add1 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -27,14 +27,16 @@
'DateTime' => '1.51',
'File::Share' => '0.25',
'Capture::Tiny' => '0.48',
- 'Log::Any' => '1.713'
+ 'Log::Any' => '1.713',
+ 'Time::Progress' => 0
},
PREREQ_PM => {
'File::Share' => '0.25',
'XML::CompactTree::XS' => '0.03',
'XML::LibXML::Reader' => '2.0201',
'IO::Compress::Zip' => '2.091',
- 'Log::Any' => '1.713'
+ 'Log::Any' => '1.713',
+ 'Time::Progress' => 0
},
MIN_PERL_VERSION => '5.036',
EXE_FILES => ['script/tei2korapxml'],
diff --git a/Readme.pod b/Readme.pod
index 8cb2375..074db86 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -99,6 +99,11 @@
tei2korapxml -tk corpus1.i5.xml corpus2.i5.xml
+=item B<--progress|-p>
+
+Show a progress bar (including ETA) on STDERR.
+This option is ignored when the input is not read from a file (e.g. when it is read from standard input).
+
=item B<--output|-o>
The output zip file to be created. If no specific output is defined,
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 089b5da..5855406 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -69,6 +69,7 @@
'xmlid-to-textsigle|x=s'=> \(my $xmlid_to_textsigle = ''),
'log|l=s' => \(my $log_level = 'notice'),
'required-version|rv=s' => \(my $required_version),
+ 'progress|p' => \(my $progress),
'' => \(my $stdio),
'help|h' => sub {
pod2usage(
@@ -109,6 +110,16 @@
$what = qr!$what!;
};
+if ($progress) {
+ eval {
+ require Time::Progress;
+ 1;
+ } or do {
+ $log->warn('Time::Progress not installed. Progress bar disabled.');
+ $progress = 0;
+ }
+};
+
# tag (without attributes), which contains the primary text
my $_TEXT_BODY = 'text';
# optional
@@ -242,10 +253,25 @@
);
do {
- $log->notice("Reading input document $input_fname") if ($input_fname ne '');
+ my $p;
+ if ($progress && $input_fname ne '') {
+ my $file_size = -s $input_fname;
+ if ($file_size) {
+ $p = Time::Progress->new(min => 0, max => $file_size);
+ $log->notice("Reading input document $input_fname (Size: $file_size bytes)");
+ }
+ } elsif ($input_fname ne '') {
+ $log->notice("Reading input document $input_fname");
+ };
+
+ my $i = 0;
MAIN:
while (<$input_fh>) {
+ if ($p && ($i++ % 500 == 0)) {
+ print STDERR $p->report("\r%20b %p ETA: %E", tell($input_fh));
+ };
+
# remove HTML (multi-line) comments (<!--...-->)
$_ = remove_xml_comments($input_fh, $_);
@@ -499,6 +525,10 @@
};
};
$text_id_esc = $auto_textsigle if ($auto_textsigle);
+
+ if ($p) {
+ print STDERR $p->report("\r%20b %p ETA: %E\n", tell($input_fh));
+ };
} while (($input_fname = shift(@ARGV)) && open($input_fh, '<', $input_fname));
$zipper->close;