Atomize and test comment stripping
Change-Id: Id798c8bac96214f29659b2764f8861539d6f5210
diff --git a/script/tei2korapxml b/script/tei2korapxml
index fe8c37d..d50dc24 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -44,6 +44,12 @@
use IO::Compress::Zip qw(zip $ZipError :constants);
use IPC::Open2 qw(open2);
+use FindBin;
+BEGIN {
+ unshift @INC, "$FindBin::Bin/../lib";
+};
+
+use KorAP::XML::TEI;
our $VERSION = '0.01';
@@ -308,7 +314,7 @@
# TODO: yet not tested fo big amounts of data
# must-have, otherwise comments in input could be fatal (e.g.: ...<!--\n<idsHeader...\n-->...)
- delHTMLcom ( $_ ); # remove HTML comments (<!--...-->)
+ KorAP::XML::TEI::delHTMLcom ( $input_fh, $_ ); # remove HTML comments (<!--...-->)
if ( $data_fl && m#^(.*)</${_TEXT_BODY}>(.*)$# ){
@@ -1473,39 +1479,6 @@
} # end: sub write_tokens
-sub delHTMLcom { # remove HTML comments
-
- # the source code part where $tc is used, leads to the situation, that comments can produce an additional blank, which
- # sometimes is not desirable (e.g.: '...<!-- comment -->\n<w>token</w>...' would lead to '... <w>token</w>...' in $buf_in).
- # removing comments before processing the line, prevents this situation.
-
- my ( $pfx, $sfx );
-
- while ( $_[0] =~ s/<!--.*?-->//g ){}; # remove all comments in actual line
-
- if ( $_[0] =~ /^(.*)<!--/ && $_[0] !~ /-->/ ){ # remove comment spanning over several lines
-
- $pfx = $1;
-
- while ( $_[0] = <> ){
-
- if ( $_[0] =~ /-->(.*)$/ ){
- $sfx = $1; last
- }
-
- }
-
- $_[0] = "$pfx$sfx";
-
- }
-
- if ( $_[0] =~ s/^\s*$// ){ # get next line and feed it also to this sub, if actual line is empty or only contains whitespace
-
- $_[0] = <>; delHTMLcom ( $_[0] );
- }
-}
-
-
## DEPRECATED ($_GEN_TOK_BAS: only IDS-intern)
sub startTokenizer {
$pid = open2($chld_out, $chld_in, 'java -cp '. join(":", ".", glob(&dirname(__FILE__)."/../target/*.jar"))." de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl");