Use Test::XML::Loy instead of Test::XML::Simple for performance reasons
Change-Id: I712e1e575808ca80930bd70b8c5cfd2eea0af684
diff --git a/LICENSE b/LICENSE
new file mode 100755
index 0000000..f1083af
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,24 @@
+Copyright (c) 2015, IDS Mannheim
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGE.
\ No newline at end of file
diff --git a/Makefile.PL b/Makefile.PL
index d4714f9..1467c1e 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -8,13 +8,14 @@
NAME => 'tei2korapxml',
AUTHOR => 'Peter Harders',
ABSTRACT => 'Conversion of TEI P5 based formats to KorAP-XML',
- VERSION => '0.1',
+ VERSION_FROM => 'script/tei2korapxml',
LICENSE => 'freebsd',
BUILD_REQUIRES => {
'Test::More' => 0,
'Test::Output' => 0,
'XML::Loy' => 0.49,
'IO::Uncompress::Unzip' => '2.091',
+ 'Pod::Usage' => 0,
},
PREREQ_PM => {
'XML::CompactTree::XS' => '0.03',
@@ -22,5 +23,5 @@
'IO::Compress::Zip' => '2.091',
},
MIN_PERL_VERSION => '5.016',
- EXE_FILES => ['./script/tei2korapxml']
+ EXE_FILES => ['script/tei2korapxml']
);
diff --git a/script/tei2korapxml b/script/tei2korapxml
index d2c5fb9..3f6d717 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -23,6 +23,7 @@
use open qw(:std :utf8); # assume utf-8 encoding
use Encode qw(encode_utf8 decode_utf8);
+use Pod::Usage;
use Getopt::Long;
use XML::CompactTree::XS;
use XML::LibXML::Reader;
@@ -32,6 +33,9 @@
my $zip; my $first_write=1;
my $outh = *STDOUT;
+our $VERSION = '0.01';
+our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
+
my $_CORR_POS_FOR_EMPTY_S_TAGS = 1; # this should only be deactivated for test purposes (empty s-tags produce an additional blank (look for ' ' in data.xml)
my $_CORR_BYTE_POS_FOR_P_TAGS = 1;
@@ -125,15 +129,25 @@
my $inside_annot_tag = -1;
##
-my $help;
-
GetOptions(
- "base=s" => \$_root_dir,
- "help" => \$help
+ 'base|b=s' => \$_root_dir,
+ 'help|h' => sub {
+ pod2usage(
+ -verbose => 99,
+ -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
+ -msg => $VERSION_MSG,
+ -output => '-'
+ )
+ },
+ 'version|v' => sub {
+ pod2usage(
+ -verbose => 0,
+ -msg => $VERSION_MSG,
+ -output => '-'
+ )
+ }
);
-printhelp() if $help;
-
######
###### MAIN
######
@@ -975,15 +989,80 @@
} # end: sub write_annot
-sub printhelp {
- print STDERR <<EOHELP;
-This program is called from inside another script.
-EOHELP
- exit();
-}
sub startTokenizer {
$pid = open2($chld_out, $chld_in, 'java -cp '. join(":", ".", glob(&dirname(__FILE__)."/../target/*.jar"))." de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl");
$select = IO::Select->new();
$select->add(*$chld_out);
}
+
+__END__
+
+=pod
+
+=encoding utf8
+
+=head1 NAME
+
+tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
+
+=head1 SYNOPSIS
+
+ cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip
+
+=head1 DESCRIPTION
+
+C<tei2korapxml> is a script to convert TEI P5 and I5 based documents
+to the KorAP-XML format. If no specific input is defined, data is
+read from C<STDIN>. If no specific output is defined, data is written
+to C<STDOUT>.
+This program is usually called from inside another script.
+
+=head1 INSTALLATION
+
+C<tei2korapxml> requires C<libxml2-dev> bindings to build. When
+these bindings are available, the preferred way to install the script is
+to use L<cpanm|App::cpanminus>.
+
+ $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
+
+In case everything went well, the C<tei2korapxml> tool will
+be available on your command line immediately.
+Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
+
+=head1 OPTIONS
+
+=over 2
+
+=item B<--base|-b>
+
+The base directory for output. Defaults to C<.>.
+
+=item B<--help|-h>
+
+Print help information.
+
+=item B<--version|-v>
+
+Print version information.
+
+=back
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (C) 2020, L<IDS Mannheim|https://www.ids-mannheim.de/>
+
+Author: Peter Harders
+
+Contributors: Marc Kupietz, Carsten Schnober, Nils Diewald
+
+L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
+Corpus Analysis Platform at the
+L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
+member of the
+L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
+
+This program is free software published under the
+L<BSD 2-Clause License|https://raw.githubusercontent.com/KorAP/KorAP-XML-TEI/master/LICENSE>.
+
+=cut
diff --git a/t/script.t b/t/script.t
index 6bdbc9e..101db08 100644
--- a/t/script.t
+++ b/t/script.t
@@ -14,12 +14,19 @@
my $script = catfile($f, '..', 'script', 'tei2korapxml');
ok(-f $script, 'Script found');
-stderr_is(
+stdout_like(
sub { system('perl', $script, '--help') },
- "This program is called from inside another script.\n",
+ qr!This\s*program\s*is\s*usually\s*called\s*from\s*inside\s*another\s*script\.!,
'Help'
);
+stdout_like(
+ sub { system('perl', $script, '--version') },
+ qr!tei2korapxml - v\d+?\.\d+?!,
+ 'Version'
+);
+
+
# Load example file
my $file = catfile($f, 'data', 'goe_sample.i5.xml');
my $outzip = tmpnam();
@@ -48,6 +55,7 @@
->text_is('h\.author', 'Goethe, Johann Wolfgang von', 'h.author')
->text_is('pubDate[type=year]', '1982', 'pubDate');
+
# Uncompress GOE/AGA/header.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/header.xml');