Add -p|--progress option to show progress and ETA

Change-Id: I14d9225bae6ca3af2ccc129399f51f6440947f49
diff --git a/Changes b/Changes
index 7added2..03ccb1d 100644
--- a/Changes
+++ b/Changes
@@ -2,6 +2,7 @@
         - Upgrade KorAP-Tokenizer to v2.3.0 (resolves issues with
           gendersternchen after hyphens, emoji clusters, and Wikipedia templates).
         - Upgrade Java dependency to 21.
+        - Add --progress (-p) option to show a progress bar with ETA.
 
 2.6.1 2025-04-16
         - Fix ASCII entity resolution.
diff --git a/Makefile.PL b/Makefile.PL
index c915378..471add1 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -27,14 +27,16 @@
     'DateTime' => '1.51',
     'File::Share' => '0.25',
     'Capture::Tiny' => '0.48',
-    'Log::Any' => '1.713'
+    'Log::Any' => '1.713',
+    'Time::Progress' => 0
   },
   PREREQ_PM => {
     'File::Share' => '0.25',
     'XML::CompactTree::XS'     => '0.03',
     'XML::LibXML::Reader' => '2.0201',
     'IO::Compress::Zip' => '2.091',
-    'Log::Any' => '1.713'
+    'Log::Any' => '1.713',
+    'Time::Progress' => 0
   },
   MIN_PERL_VERSION => '5.036',
   EXE_FILES => ['script/tei2korapxml'],
diff --git a/Readme.pod b/Readme.pod
index 8cb2375..074db86 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -99,6 +99,11 @@
 
   tei2korapxml -tk corpus1.i5.xml corpus2.i5.xml
 
+=item B<--progress|-p>
+
+Show a progress bar (including ETA).
+This option is ignored unless the input is read from a named, non-empty file.
+
 =item B<--output|-o>
 
 The output zip file to be created. If no specific output is defined,
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 089b5da..5855406 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -69,6 +69,7 @@
   'xmlid-to-textsigle|x=s'=> \(my $xmlid_to_textsigle = ''),
   'log|l=s'               => \(my $log_level   = 'notice'),
   'required-version|rv=s' => \(my $required_version),
+  'progress|p'            => \(my $progress),
   ''                      => \(my $stdio),
   'help|h' => sub {
     pod2usage(
@@ -109,6 +110,16 @@
   $what = qr!$what!;
 };
 
+if ($progress) {
+  eval {
+    require Time::Progress;
+    1;
+  } or do {
+     $log->warn('Time::Progress not installed. Progress bar disabled.');
+     $progress = 0;
+  }
+};
+
 # tag (without attributes), which contains the primary text
 my $_TEXT_BODY = 'text';
 # optional
@@ -242,10 +253,25 @@
 );
 
 do {
-  $log->notice("Reading input document $input_fname") if ($input_fname ne '');
+  my $p;
+  if ($progress && $input_fname ne '') {
+    my $file_size = -s $input_fname;
+    if ($file_size) {
+        $p = Time::Progress->new(min => 0, max => $file_size);
+        $log->notice("Reading input document $input_fname (Size: $file_size bytes)");
+    }
+  } elsif ($input_fname ne '') {
+      $log->notice("Reading input document $input_fname");
+  };
+
+  my $i = 0;
   MAIN:
   while (<$input_fh>) {
 
+    if ($p && ($i++ % 500 == 0)) {
+        print STDERR $p->report("\r%20b %p  ETA: %E", tell($input_fh));
+    };
+
     # remove HTML (multi-line) comments (<!--...-->)
     $_ = remove_xml_comments($input_fh, $_);
 
@@ -499,6 +525,10 @@
     };
   };
   $text_id_esc = $auto_textsigle if ($auto_textsigle);
+
+  if ($p) {
+      print STDERR $p->report("\r%20b %p  ETA: %E\n", tell($input_fh));
+  };
 } while (($input_fname = shift(@ARGV)) && open($input_fh, '<', $input_fname));
 $zipper->close;