Minor style and documentation improvements
Change-Id: Ifcb6f64267826fffe58b6f96045f93e04388342a
diff --git a/Changes b/Changes
new file mode 100644
index 0000000..49ccd01
--- /dev/null
+++ b/Changes
@@ -0,0 +1,2 @@
+0.01 2020-09-28
+ - Initial release to GitHub.
diff --git a/Readme.pod b/Readme.pod
new file mode 100644
index 0000000..631c898
--- /dev/null
+++ b/Readme.pod
@@ -0,0 +1,142 @@
+=pod
+
+=encoding utf8
+
+=head1 NAME
+
+tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
+
+=head1 SYNOPSIS
+
+ cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip
+
+=head1 DESCRIPTION
+
+C<tei2korapxml> is a script to convert TEI P5 and
+L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
+based documents to the
+L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
+If no specific input is defined, data is
+read from C<STDIN>. If no specific output is defined, data is written
+to C<STDOUT>.
+
+This program is usually called from inside another script.
+
+=head1 FORMATS
+
+=head2 Input restrictions
+
+=over 2
+
+=item
+
+utf8 encoded
+
+=item
+
+TEI P5 formatted input with certain restrictions:
+
+=over 4
+
+=item
+
+B<mandatory>: text-header with integrated textsigle, text-body
+
+=item
+
+B<optional>: corp-header with integrated corpsigle,
+doc-header with integrated docsigle
+
+=back
+
+=item
+
+All tokens inside the primary text may not be
+newline separated, because newlines are removed
+(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
+into blanks between 2 tokens could lead to additional blanks,
+where there should be none (e.g.: punctuation characters like C<,> or
+C<.> should not be separated from their predecessor token).
+(see also code section C<~ whitespace handling ~>).
+
+=back
+
+=head2 Notes on the output
+
+=over 2
+
+=item
+
+zip file output (default on C<stdout>) with utf8 encoded entries
+(which together form the KorAP-XML format)
+
+=back
+
+=head1 INSTALLATION
+
+C<tei2korapxml> requires C<libxml2-dev> bindings to build. When
+these bindings are available, the preferred way to install the script is
+to use L<cpanm|App::cpanminus>.
+
+ $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
+
+In case everything went well, the C<tei2korapxml> tool will
+be available on your command line immediately.
+
+Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
+
+=head1 OPTIONS
+
+=over 2
+
+=item B<--root|-r>
+
+The root directory for output. Defaults to C<.>.
+
+=item B<--help|-h>
+
+Print help information.
+
+=item B<--version|-v>
+
+Print version information.
+
+=item B<--tokenizer-call|-tc>
+
+Call an external tokenizer process that will tokenize
+a single line from STDIN and output one token per line.
+
+=item B<--tokenizer-korap|-tk>
+
+Use the standard KorAP/DeReKo tokenizer.
+
+=item B<--use-intern-tokenization|-ti>
+
+Tokenize the data using two embedded tokenizers
+that will take an I<aggressive> and a I<conservative>
+approach.
+
+=item B<--log|-l>
+
+Loglevel for I<Log::Any>. Defaults to C<notice>.
+
+=back
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (C) 2020, L<IDS Mannheim|https://www.ids-mannheim.de/>
+
+Author: Peter Harders
+
+Contributors: Marc Kupietz, Carsten Schnober, Nils Diewald
+
+L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
+Corpus Analysis Platform at the
+L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
+member of the
+L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
+
+This program is free software published under the
+L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-TEI/master/LICENSE>.
+
+=cut
diff --git a/lib/KorAP/XML/TEI.pm b/lib/KorAP/XML/TEI.pm
index 8f1678d..27ef6f9 100644
--- a/lib/KorAP/XML/TEI.pm
+++ b/lib/KorAP/XML/TEI.pm
@@ -22,8 +22,10 @@
sub remove_xml_comments {
my ($fh, $html) = @_;
- # the source code part where $tc is used, leads to the situation, that comments can produce an additional blank, which
- # sometimes is not desirable (e.g.: '...<!-- comment -->\n<w>token</w>...' would lead to '... <w>token</w>...' in $buf_in).
+ # the source code part where $tc is used, leads to the situation,
+ # that comments can produce an additional blank, which
+ # sometimes is not desirable (e.g.: '...<!-- comment -->\n<w>token</w>...'
+ # would lead to '... <w>token</w>...' in $buf_in).
# removing comments before processing the line, prevents this situation.
my $pfx = '';
@@ -58,8 +60,9 @@
}
}
- if ( $html =~ /^\s*$/ ){ # get next line and feed it also to this sub, if actual line is empty or only contains whitespace
-
+ if ($html =~ /^\s*$/) {
+ # get next line and feed it also to this sub,
+ # if the current line is empty or only contains whitespace
$html = <$fh>;
goto CHECK;
};
diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index 9a09ec7..b7d4c87 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm
@@ -52,7 +52,7 @@
my ($self, $txt) = @_;
return unless $self->{pid};
my $out = $self->{chld_in};
- print $out encode( "UTF-8", $txt ) . $self->{sep};
+ print $out encode('UTF-8', $txt) . $self->{sep};
return $self;
};
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 9ce2c8e..7803335 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -70,42 +70,42 @@
# ~~~ parameter (mandatory) ~~~
#
my $_TEXT_BODY = "text"; # tag (without attributes), which contains the primary text
- # optional
+# optional
my $_CORP_HEADER_BEG = "idsHeader type=\"corpus\""; # just keep the correct order of the attributes and evtl. add an '.*' between them
- # optional
+# optional
my $_DOC_HEADER_BEG = "idsHeader type=\"document\""; # analog
- # mandatory
+# mandatory
my $_TEXT_HEADER_BEG = "idsHeader type=\"text\""; # analog
#
# ~~~ constants ~~~
#
+
## extern tokenization
my $_GEN_TOK_EXT = $tokenizer_call || $tokenizer_korap ? 1 : 0;
- # TODO:
- # Read tokenizer call from configuration file.
- # was 'java -cp '. join(":", ".", glob(&dirname(__FILE__)."/../target/*.jar")). " de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl";
- my $ext_tok;
- if ($tokenizer_call) {
- $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
- }
+my $ext_tok;
+if ($tokenizer_call) {
+ $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
+}
- elsif ($tokenizer_korap) {
- $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new;
- };
- my $_tok_file_ext = "tokens.xml";
+elsif ($tokenizer_korap) {
+ $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new;
+};
+my $_tok_file_ext = "tokens.xml";
##
+
## intern tokenization
my $_GEN_TOK_INT = $tokenizer_intern; # simple tokenization (recommended for testing)
- my $_tok_file_con = "tokens_conservative.xml";
- my $_tok_file_agg = "tokens_aggressive.xml";
- my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
- my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
+my $_tok_file_con = "tokens_conservative.xml";
+my $_tok_file_agg = "tokens_aggressive.xml";
+my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
+my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
##
+
my $_tok_dir = "base"; # name of directory for storing tokenization files
my $_DEBUG = 0; # set to 1 for minimal more debug output (no need to be parametrized)
@@ -160,7 +160,8 @@
my $dir; # text directory (below $_root_dir)
-my ( $text_id, $text_id_esc ); # '$text_id_esc' = escaped version of $text_id
+my ( $text_id,
+ $text_id_esc ); # '$text_id_esc' = escaped version of $text_id
my ( $reader, # instance of 'XML::LibXML::Reader->new' (on input '$buf_in')
$tree_data ); # instance of 'XML::CompactTree::XS::readSubtreeToPerl' (on input '$reader')
@@ -220,12 +221,9 @@
$dir = "";
if ( $input_fname ne '' ){
-
open ( $input_fh, "<", "$input_fname") || die "File \'$input_fname\' could not be opened.\n";
-
}
-
# prevents segfaulting of 'XML::LibXML::Reader' inside 'main()' - see notes on 'PerlIO layers' in 'man XML::LibXML')
# removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
# see in perluniintro: You can switch encodings on an already opened stream by using "binmode()
@@ -246,7 +244,8 @@
# ~ start of text body ~
- $pfx = $1; $sfx = $2;
+ $pfx = $1;
+ $sfx = $2;
die "ERROR ($0): main(): input line number $.: line with opening text-body tag '${_TEXT_BODY}'"
." contains additional information ... => Aborting\n\tline=$_"
@@ -294,9 +293,7 @@
$structures->reset;
- if ( $_TOKENS_PROC ){
- $tokens->reset;
- }
+ $tokens->reset if $_TOKENS_PROC;
# ~ whitespace related issue ~
$add_one = 0;
@@ -317,8 +314,7 @@
);
# ~ tokenization ~
-
- if ( $_GEN_TOK_EXT ){
+ if ($_GEN_TOK_EXT) {
# Tokenize and output
$ext_tok->tokenize($data->data)->to_zip(
@@ -327,7 +323,7 @@
);
};
- if ( $_GEN_TOK_INT ){
+ if ($_GEN_TOK_INT) {
# Tokenize and output
$cons_tok->tokenize($data->data)->to_zip(
@@ -401,7 +397,7 @@
### NOTE: this is only relevant, if a text consists of more than one line
### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
### do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)
- if ( m/<[^>]+>[^<]/ ){ # line contains at least one tag with at least one character contents
+ if (m/<[^>]+>[^<]/){ # line contains at least one tag with at least one character contents
# NOTE: not stringent ('...' stands for text):
#
@@ -424,7 +420,7 @@
$buf_in .= $_;
};
- } elsif ( m#^(.*)(<(?:${_TEXT_HEADER_BEG}|${_DOC_HEADER_BEG}|${_CORP_HEADER_BEG}).*)$# ){
+ } elsif (m#^(.*)(<(?:${_TEXT_HEADER_BEG}|${_DOC_HEADER_BEG}|${_CORP_HEADER_BEG}).*)$#) {
# ~ start of header ~
$pfx = $1;
@@ -456,10 +452,10 @@
$text_id_esc = $header->id_esc;
# log output for seeing progression
- $log->notice("$0: main(): text_id=".decode('UTF-8', $text_id ));
+ $log->notice("$0: main(): text_id=".decode('UTF-8', $text_id));
$tl = 0; # reset (needed for ~ whitespace handling ~)
- };
+ }
}
}
} #end: while
@@ -561,23 +557,20 @@
# ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
# $data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
# $data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
- #
+ #
#
# retr_info() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
# Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
# ${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
+ foreach $e (@{${$_[0]}}) { # iteration through all array elements ($_[0] is a reference to an array reference)
- foreach $e ( @{${$_[0]}} ){ # iteration through all array elements ($_[0] is a reference to an array reference)
-
- if ( $e->[0] == XML_READER_TYPE_ELEMENT ){ # element-node (see 'NODE TYPES' in manpage of XML::LibXML::Reader)
-
+ if ($e->[0] == XML_READER_TYPE_ELEMENT) { # element-node (see 'NODE TYPES' in manpage of XML::LibXML::Reader)
#~~~~
# from here: tag-node (opening)
#~~~~
-
# ~ handle structures ~
# $e->[1] represents the tag name
@@ -592,9 +585,9 @@
# ~ handle attributes ~
- if ( defined $e->[3] ){ # only if attributes exist
+ if (defined $e->[3]) { # only if attributes exist
- for ( $c = 0; $c < @{$e->[3]}; $c += 2 ){ # with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
+ for ($c = 0; $c < @{$e->[3]}; $c += 2) { # with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
# [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
# note: arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
@@ -618,7 +611,7 @@
# ~~ RECURSION ~~
- if ( defined $e->[$_IDX] ){ # do no recursion, if $e->[$_IDX] is not defined (because we have no array of child-nodes, e.g.: <back/>)
+ if (defined $e->[$_IDX]) { # do no recursion, if $e->[$_IDX] is not defined (because we have no array of child-nodes, e.g.: <back/>)
retr_info($rl+1, \$e->[$_IDX]); # recursion with array of child-nodes
}
@@ -635,7 +628,7 @@
{
$fval = $anno->from;
- if ( $fval > 0 && not exists $ws{ $fval - 1 } ){ # ~ whitespace related issue ~
+ if ($fval > 0 && not exists $ws{$fval - 1}) { # ~ whitespace related issue ~
# ~ previous node was a text-node ~
@@ -649,6 +642,7 @@
# TODO: find example for which this case applies
# maybe this is not necessary anymore, because the above recorrection of the from-value suffices
+ #
# TODO: check, if it's better to remove this line and change above check to 'if ( $fval - 1) >= $pos;
# do testing with bigger corpus excerpt (wikipedia?)
$anno->set_from($pos) if $fval == $pos + 1;
@@ -660,7 +654,7 @@
# ~ whitespace related issue ~
# clean up
- delete $ws{ $fval - 1 } if $fval > 0 && exists $ws{ $fval - 1 };
+ delete $ws{$fval - 1} if $fval > 0 && exists $ws{$fval - 1};
#~~~~
@@ -668,20 +662,20 @@
#~~~~
- #~~~~~
- # from here: text- and whitespace-nodes
- #~~~~~
+ #~~~~~
+ # from here: text- and whitespace-nodes
+ #~~~~~
- # The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
- # 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
- #
- # When modifiying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
- # '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and it's type is '14'
- # (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
- #
- # echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
+ # The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
+ # 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
+ #
+ # When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
+ # '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
+ # (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
+ #
+ # echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
- } elsif ( $e->[0] == XML_READER_TYPE_TEXT || $e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE ){
+ } elsif ($e->[0] == XML_READER_TYPE_TEXT || $e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE){
# Notes on ~ whitespace related issue ~ (referred to the code fragment below)
#
@@ -689,7 +683,7 @@
#
# Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
# 'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
- #
+ #
# The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
# it's from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
#
@@ -719,7 +713,7 @@
# Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' are the from-
# and to-indizes for the tags 'a' and 'b' both 12, which is the start-index of the token 'tok3'.
- if( $e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE ){
+ if ($e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {
# ~ whitespace-node ~
@@ -731,12 +725,12 @@
# ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
$ws{$data->position}++;
- }else{
+ } else {
# ~ text-node ~
$add_one = 1;
- }
+ };
# ~ update $data ~
@@ -748,11 +742,11 @@
#~~~~~
- #elsif ( $e->[0] == XML_READER_TYPE_ATTRIBUTE ) # attribute-node
- # note: attributes cannot be processed like this ( => use 'XCT_ATTRIBUTE_ARRAY' - see above )
+ # elsif ( $e->[0] == XML_READER_TYPE_ATTRIBUTE ) # attribute-node
+ # note: attributes cannot be processed like this ( => use 'XCT_ATTRIBUTE_ARRAY' - see above )
- }else{ # not yet handled type
+ } else { # not yet handled type
die "ERROR ($0): Not yet handled type (\$e->[0]=".$e->[0].") ... => Aborting\n";
}
@@ -816,9 +810,9 @@
=item
-all tokens inside the primary text (inside $data) may not be
+All tokens inside the primary text may not be
newline seperated, because newlines are removed
-(see code section C<~ inside text body ~>) and a conversion of newlines
+(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
into blanks between 2 tokens could lead to additional blanks,
where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be seperated from their predecessor token).