Clean up output of the Stanford tokenizer and sentence splitter Change-Id: I4d620d319b0546aef21a0f7070c4ab5c5356d646

commit: b040897b226b57cc84a628e884807f2cfb437f51 [log] [tgz]
author: Akron <nils@diewald-online.de> Mon Mar 07 11:36:17 2022 +0100
committer: Akron <nils@diewald-online.de> Mon Mar 07 11:36:17 2022 +0100
tree: 9be858c68006e3ec029bebbc4085d972997c8d36
parent: c261642121b09f09e62bf913951371597b244638 [diff]
diff --git a/benchmarks/cleanup/stanford.pl b/benchmarks/cleanup/stanford.pl
new file mode 100644
index 0000000..ced1274
--- /dev/null
+++ b/benchmarks/cleanup/stanford.pl

@@ -0,0 +1,25 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# Rewrites the tokenize,ssplit,mwt pipeline output of the stanford
+# parser: one token per line, an empty line at each sentence header.
+
+my $path = $ARGV[0];
+die "Usage: $0 <file>\n" unless defined $path;
+
+# Three-argument open keeps the path from being parsed as a mode.
+open(my $file, '<', $path) or die "Unable to open file '$path': $!\n";
+
+# Read line by line instead of loading the whole file into a list.
+while (defined(my $line = readline($file))) {
+  if ($line =~ s/^\[Text\=(.+?)\s+CharacterOffsetBegin\=\d+\s+CharacterOffsetEnd=\d+\]$/$1/) {
+    print $line;
+  }
+  elsif ($line =~ m/^Sentence\s+\#\d+\s+\(/) {
+    print "\n";
+  }
+}
+
+print "Done."; # NOTE(review): written to STDOUT after the tokens - confirm intended
+close($file);
commit	b040897b226b57cc84a628e884807f2cfb437f51	[log] [tgz]
author	Akron <nils@diewald-online.de>	Mon Mar 07 11:36:17 2022 +0100
committer	Akron <nils@diewald-online.de>	Mon Mar 07 11:36:17 2022 +0100
tree	9be858c68006e3ec029bebbc4085d972997c8d36
parent	c261642121b09f09e62bf913951371597b244638 [diff]