Cleanup: Improve variable naming
Change-Id: I9deafae98caa7a78fa85e0708e619b6460d846dc
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 53c4ea8..c8ad29f 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -42,19 +42,19 @@
# Parse options from the command line
GetOptions(
- "root|r=s" => \(my $_root_dir = '.'),
+ "root|r=s" => \(my $root_dir = '.'),
"input|i=s" => \(my $input_fname = ''),
'tokenizer-call|tc=s' => \(my $tokenizer_call),
'tokenizer-korap|tk' => \(my $tokenizer_korap),
- 'tokenizer-internal|ti' => \(my $_GEN_TOK_INT),
+ 'tokenizer-internal|ti' => \(my $tokenizer_intern),
'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
'skip-inline-tokens' => \(my $skip_inline_tokens = 0),
- 'base-foundry=s' => \(my $_tok_dir = 'base'),
- 'data-file=s' => \(my $_data_file = 'data'),
- 'header-file=s' => \(my $_header_file = 'header'),
- 'tokens-file=s' => \(my $_tok_file_ext = 'tokens'),
+ 'base-foundry=s' => \(my $base_dir = 'base'),
+ 'data-file=s' => \(my $data_file = 'data'),
+ 'header-file=s' => \(my $header_file = 'header'),
+ 'tokens-file=s' => \(my $tokens_file = 'tokens'),
'log|l=s' => \(my $log_level = 'notice'),
'help|h' => sub {
pod2usage(
@@ -86,6 +86,10 @@
# TODO: IDS-specific (and redundant)
my $_HEADER_TAG = 'idsHeader';
+# name of the tag containing all information stored in $_tokens_file
+my $_TOKENS_TAG = 'w';
+
+
if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it");
};
@@ -112,16 +116,13 @@
##
# Name of the directory and the file containing all inline structure informations
-# except for $_TOKEN_TAG information
+# except for $_TOKENS_TAG information
my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
# Name of the directory and the file containing all inline token informations
# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
-# name of the tag containing all information stored in $_tokens_file
-my $_TOKENS_TAG = "w";
-
# Handling inline annotations (inside $_TOKENS_TAG)
my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;
@@ -133,77 +134,81 @@
my $data = KorAP::XML::TEI::Data->new;
# Initialize zipper
-my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);
+my $zipper = KorAP::XML::TEI::Zipper->new($root_dir);
#
# ~~~ variables ~~~
#
+my $dir = ''; # text directory (below $root_dir)
-my $input_fh; # input file handle (default: stdin)
-
-my $dir; # text directory (below $_root_dir)
-
-my ( $text_id,
- $text_id_esc ); # '$text_id_esc' = escaped version of $text_id
+# '$text_id_esc' = escaped version of $text_id
+my ($text_id, $text_id_esc);
# these are only used inside recursive function 'retr_info'
-my ( $_IDX, # value is set dependent on DEBUG - for extracting array of child elements from element in $tree_data
- $e, # element from $tree_data
- ## variables for handling ~ whitespace related issue ~ (it is sometimes necessary, to correct the from-values for some tags)
- $add_one, # ...
- $fval, # ...
- %ws); # hash for indices of whitespace-nodes (needed to recorrect from-values)
- # idea: when closing element, check if it's from-index minus 1 refers to a whitespace-node
- # (means: 'from-index - 1' is a key in %ws).
- # if this is _not_ the case, then the from-value is one to high => correct it by substracting 1
+# Index of the array of child elements within an element of
+# $tree_data (the value depends on DEBUG, see below)
+my $child_idx;
+
+# element from $tree_data
+my $e;
+
+# Keeping track of the current positions in the text
+my $pos;
+
+# Default encoding of the text
+my $input_enc = 'UTF-8';
+
+# variables for handling ~ whitespace related issue ~
+# (it is sometimes necessary, to correct the from-values for some tags)
+my $add_one;
+my $from = 0;
+
+# text line (needed for whitespace handling)
+my $text_line = 0;
+
+# hash for indices of whitespace-nodes
+# (needed to recorrect from-values)
+# IDEA:
+# when closing an element, check if its from-index minus 1 refers to a whitespace-node
+# (means: 'from-index - 1' is a key in %ws).
+# if this is _not_ the case, then the from-value is one
+# too high => correct it by subtracting 1
+my %ws;
#
# ~~~ main ~~~
#
-# ~ initializations ~
-
# Include line numbers in elements of $tree_data for debugging
-DEBUG ? ($_IDX = 5) : ($_IDX = 4);
+DEBUG ? ($child_idx = 5) : ($child_idx = 4);
-$fval = 0;
# ~ read input and write output (text by text) ~
-my $tl = 0; # text line (needed for whitespace handling)
+# Input file handle (default: stdin)
+my $input_fh = *STDIN;
-$input_fh = *STDIN; # input file handle (default: stdin)
-
-# Maybe not necessary
-$data->reset;
-
-$dir = '';
-
-if ( $input_fname ne '' ){
+if ($input_fname ne '') {
unless (open($input_fh, '<', $input_fname)) {
die $log->fatal("File '$input_fname' could not be opened.");
};
-}
+};
# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;
-my $sfx;
-my $pos;
-my $input_enc = 'UTF-8';
-my $l = length('</' . $_TEXT_BODY) + 1;
-# ~ loop (reading input document) ~
-
+# Reading input document
MAIN: while ( <$input_fh> ){
- $_ = remove_xml_comments( $input_fh, $_ ); # remove HTML (multi-line) comments (<!--...-->)
+ # remove (multi-line) XML comments (<!--...-->)
+ $_ = remove_xml_comments( $input_fh, $_ );
# Set input encoding
- if ( index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
+ if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
$input_enc = $2;
next;
};
@@ -211,20 +216,20 @@
$_ = decode($input_enc, $_);
$_ = replace_entities($_);
- if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
+ # Start of Text body
+ if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#){
- # ~ start of text body ~
+ my $suffix = $2;
- $sfx = $2;
-
- if ($1 !~ /^\s*$/ || $sfx !~ /^\s*$/) {
+ if ($1 !~ /^\s*$/ || $suffix !~ /^\s*$/) {
die $log->fatal("input line number $.: " .
"line with opening text-body tag '${_TEXT_BODY}' " .
"contains additional information ... => Aborting (line=$_)");
};
- # text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
- my $buf_in = '';
+ # Text body data extracted from input document ($input_fh),
+ # further processed by XML::LibXML::Reader
+ my $text_buffer = '';
# Iterate over all lines in the text body
while (<$input_fh>) {
@@ -233,24 +238,27 @@
$_ = decode($input_enc, $_);
$_ = replace_entities($_);
- # ~ end of text body ~
+ # End of text body
if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
# write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files
- if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
+ if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
die $log->fatal("input line number $.: " .
"line with closing text-body tag '${_TEXT_BODY}'".
" contains additional information ... => Aborting (line=$_)");
};
if ($dir eq '') {
- $log->warn("Maybe empty textSigle => skipping this text ...\ndata=" . substr($data->data, 0, 200));
+ $log->warn(
+ "Maybe empty textSigle => skipping this text ...\n" .
+ 'data=' . substr($data->data, 0, 200)
+ );
next MAIN;
};
my $reader = XML::LibXML::Reader->new(
- string => "<text>$buf_in</text>",
+ string => "<text>$text_buffer</text>",
huge => 1
);
@@ -271,38 +279,39 @@
%ws = ();
# ~ recursion ~
- retr_info(1, \$tree_data->[2] ); # parse input data
+ retr_info(1, \$tree_data->[2]); # parse input data
if (DEBUG) {
- $log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml");
+ $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
};
- # ~ write data.xml ~
+ # Write data.xml
$data->to_zip(
- $zipper->new_stream("$dir/${_data_file}.xml"),
+ $zipper->new_stream("$dir/${data_file}.xml"),
$text_id_esc
);
- # ~ tokenization ~
+ # Tokenize with external tokenizer
if ($ext_tok) {
# Tokenize and output
$ext_tok->tokenize($data->data)->to_zip(
- $zipper->new_stream("$dir/$_tok_dir/${_tok_file_ext}.xml"),
+ $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
$text_id_esc
);
};
- if ($_GEN_TOK_INT) {
+ # Tokenize with internal tokenizer
+ if ($tokenizer_intern) {
# Tokenize and output
$cons_tok->tokenize($data->data)->to_zip(
- $zipper->new_stream("$dir/$_tok_dir/" . $cons_tok->name . '.xml'),
+ $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
$text_id_esc
);
$aggr_tok->tokenize($data->data)->to_zip(
- $zipper->new_stream("$dir/$_tok_dir/" . $aggr_tok->name . '.xml'),
+ $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
$text_id_esc
);
@@ -341,31 +350,44 @@
next MAIN;
};
- # ~ inside text body ~
# ~ whitespace handling ~
# Fix whitespaces (see notes on whitespace fixing)
- # TODO: Maybe it's best, to keep the stripping of whitespace and to just remove the if-clause and to insert a blank by default (with possibly
- # an option on how newlines in primary text should be handled (stripped or replaced by a whitespace)).
+ # TODO:
+ # Maybe it's best, to keep the stripping of whitespace and
+ # to just remove the if-clause and to insert a blank by default
+ # (with possibly an option on how newlines in primary text should
+ # be handled (stripped or replaced by a whitespace)).
# Remove consecutive whitespace at beginning and end (mostly one newline)
s/^\s+//; s/\s+$//;
- ### NOTE: this is only relevant, if a text consists of more than one line
- ### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
- ### do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)
- if (m/<[^>]+>[^<]/) { # line contains at least one tag with at least one character contents
+ # NOTE:
+ # this is only relevant, if a text consists of more than one line
- $tl++; # counter for text lines
+ # TODO:
+ # find a better solution, or create a warning, if a text has more
+ # than one line ($text_line > 1)
- s/^(.)/ $1/ if $tl > 1; # insert blank before 1st character (for 2nd line and consecutive lines)
+ # TODO:
+ # do testing with 2 different corpora
+ # (one with only one-line texts, the other with several lines per text)
+
+ # line contains at least one tag with at least one character contents
+ if (m/<[^>]+>[^<]/) {
+
+ # Increment counter for text lines
+ $text_line++;
+
+ # insert blank before 1st character
+ # (for 2nd line and subsequent lines)
+ s/^(.)/ $1/ if $text_line > 1;
}
- ###
# add line to buffer
- $buf_in .= $_;
+ $text_buffer .= $_;
};
} elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {
@@ -374,9 +396,10 @@
my $content = "$2\n";
if ($1 !~ /^\s*$/) {
- die $log->fatal("input line number $.: " .
- "line with opening header tag" .
- " is not in expected format ... => Aborting (line=$_)");
+ die $log->fatal(
+ "input line number $.: " .
+ 'line with opening header tag is not in expected format ... ' .
+ "=> Aborting (line=$_)");
};
# Parse header
@@ -386,7 +409,7 @@
if ($header) {
# Write header to zip
- my $file = $header->dir . '/' . $_header_file . '.xml';
+ my $file = $header->dir . '/' . $header_file . '.xml';
$log->debug("Writing file $file") if DEBUG;
@@ -403,24 +426,29 @@
# log output for seeing progression
$log->notice("$0: text_id=$text_id");
- $tl = 0; # reset (needed for ~ whitespace handling ~)
- }
- }
- }
-} #end: while
+ # Reset counter for text lines
+ # (needed for whitespace handling)
+ $text_line = 0;
+ };
+ };
+ };
+};
$zipper->close;
$ext_tok->close if $ext_tok;
+close $input_fh;
+
exit(0);
# Recursively called function to handle XML tree data
sub retr_info {
+
# recursion level
# (1 = topmost level inside retr_info() = should always be level of tag $_TEXT_BODY)
- my $rl = shift;
+ my $depth = shift;
# Iteration through all array elements
# ($_[0] is a reference to an array reference)
@@ -438,8 +466,8 @@
# $e->[1] represents the tag name
# Skip sentences
if ($use_tokenizer_sentence_splits && $e->[1] eq "s") {
- if (defined $e->[$_IDX]) {
- retr_info($rl+1, \$e->[$_IDX]);
+ if (defined $e->[$child_idx]) {
+ retr_info($depth+1, \$e->[$child_idx]);
}
next;
}
@@ -476,12 +504,12 @@
# Call function recursively
- # do no recursion, if $e->[$_IDX] is not defined
+ # do no recursion, if $e->[$child_idx] is not defined
# (because we have no array of child-nodes, e.g.: <back/>)
- if (defined $e->[$_IDX]) {
+ if (defined $e->[$child_idx]) {
# Recursion with array of child-nodes
- retr_info($rl+1, \$e->[$_IDX]);
+ retr_info($depth+1, \$e->[$child_idx]);
}
@@ -495,34 +523,40 @@
# Handle structures and tokens
- $fval = $anno->from;
+ $from = $anno->from;
# ~ whitespace related issue ~
- if ($fval > 0 && not exists $ws{$fval - 1}) {
+ if ($from > 0 && not exists $ws{$from - 1}) {
# ~ previous node was a text-node ~
- $anno->set_from($fval - 1);
- }
-
- # in case this fails, check input
- if (($fval - 1) > $pos) {
- die $log->fatal("text_id='$text_id', " .
- "processing of structures: " .
- "from-value ($fval) is 2 or more greater " .
- "than to-value ($pos) => please check. Aborting");
+ $anno->set_from($from - 1);
};
- # TODO: find example for which this case applies
- # maybe this is not necessary anymore, because the above recorrection of the from-value suffices
+ # in case this fails, check input
+ if (($from - 1) > $pos) {
+ die $log->fatal(
+ "text_id='$text_id', " .
+ 'processing of structures: ' .
+ "from-value ($from) is 2 or more greater " .
+ "than to-value ($pos) => please check. Aborting"
+ );
+ };
+
+ # TODO:
+ # find example for which this case applies
+ # maybe this is not necessary anymore, because the
+ # above recorrection of the from-value suffices
#
- # TODO: check, if it's better to remove this line and change above check to 'if ($fval - 1) >= $pos;
+ # TODO:
+ # check whether it's better to remove this line and
+ # change the above check to 'if ($from - 1) >= $pos;'
# do testing with bigger corpus excerpt (wikipedia?)
- $anno->set_from($pos) if $fval == $pos + 1;
+ $anno->set_from($pos) if $from == $pos + 1;
$anno->set_to($pos);
- $anno->set_level($rl);
+ $anno->set_level($depth);
# Clean up whitespace
- delete $ws{$fval - 1} if $fval > 0 && exists $ws{$fval - 1};
+ delete $ws{$from - 1} if $from > 0 && exists $ws{$from - 1};
#~~~~
@@ -829,7 +863,7 @@
ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
$data->[2]->[0]->[4] == 1 (line number)
ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
- # child-nodes of actual node (see $_IDX)
+ # child-nodes of actual node (see $child_idx)
ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
@@ -905,7 +939,7 @@
[1]
Now, what happens, when 2 text-nodes are _not_ seperated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
- (see above code fragment '... not exists $ws{ $fval - 1 } ...').
+ (see above code fragment '... not exists $ws{ $from - 1 } ...').
[2]
Comparing the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', is ' ' in both cases handled as a