Atomize and test comment stripping Change-Id: Id798c8bac96214f29659b2764f8861539d6f5210

commit: 4f67cd4981c4b3c960104f2d7bd04e7ce8c084be [log] [tgz]
author: Akron <nils@diewald-online.de> Thu Jul 02 12:27:58 2020 +0200
committer: Akron <nils@diewald-online.de> Fri Jul 03 15:29:31 2020 +0200
tree: d4c442cac73184b60a0086922ff4cbafa82cc7d8
parent: 9015734b7bd4b0db912e97f4ea8df8139f802c77 [diff] [blame]
diff --git a/script/tei2korapxml b/script/tei2korapxml
index fe8c37d..d50dc24 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -44,6 +44,12 @@
 use IO::Compress::Zip qw(zip $ZipError :constants);
 use IPC::Open2 qw(open2);
 
+use FindBin;
+BEGIN {
+  unshift @INC, "$FindBin::Bin/../lib";
+};
+
+use KorAP::XML::TEI;
 
 our $VERSION = '0.01';
 
@@ -308,7 +314,7 @@
 
     # TODO: not yet tested for big amounts of data
     # must-have, otherwise comments in input could be fatal (e.g.: ...<!--\n<idsHeader...\n-->...)
-    delHTMLcom ( $_ ); # remove HTML comments (<!--...-->)
+    KorAP::XML::TEI::delHTMLcom ( $input_fh, $_ ); # remove HTML comments (<!--...-->)
 
     if ( $data_fl && m#^(.*)</${_TEXT_BODY}>(.*)$# ){
 
@@ -1473,39 +1479,6 @@
 } # end: sub write_tokens
 
 
-sub delHTMLcom { # remove HTML comments
-
-  # the source code part where $tc is used, leads to the situation, that comments can produce an additional blank, which
-  # sometimes is not desirable (e.g.: '...<!-- comment -->\n<w>token</w>...' would lead to '... <w>token</w>...' in $buf_in).
-  # removing comments before processing the line, prevents this situation.
-
-  my ( $pfx, $sfx );
-
-  while ( $_[0] =~ s/<!--.*?-->//g ){}; # remove all comments in actual line
-
-  if ( $_[0] =~ /^(.*)<!--/ && $_[0] !~ /-->/ ){ # remove comment spanning over several lines
-
-    $pfx = $1;
-
-    while ( $_[0] = <> ){
-
-      if ( $_[0] =~ /-->(.*)$/ ){
-        $sfx = $1; last
-      }
-
-    }
-
-    $_[0] = "$pfx$sfx";
-
-  }
-
-  if ( $_[0] =~ s/^\s*$// ){ # get next line and feed it also to this sub, if actual line is empty or only contains whitespace
-
-    $_[0] = <>; delHTMLcom ( $_[0] );
-  }
-}
-
-
 ## DEPRECATED ($_GEN_TOK_BAS: only IDS-intern)
 sub startTokenizer {
   $pid = open2($chld_out, $chld_in, 'java  -cp '. join(":", ".", glob(&dirname(__FILE__)."/../target/*.jar"))." de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl");
commit	4f67cd4981c4b3c960104f2d7bd04e7ce8c084be	[log] [tgz]
author	Akron <nils@diewald-online.de>	Thu Jul 02 12:27:58 2020 +0200
committer	Akron <nils@diewald-online.de>	Fri Jul 03 15:29:31 2020 +0200
tree	d4c442cac73184b60a0086922ff4cbafa82cc7d8
parent	9015734b7bd4b0db912e97f4ea8df8139f802c77 [diff] [blame]