Fix TokenWriter handling of sentence boundaries and remove the simple TokenWriter constructor
Change-Id: I38a276f87f1f457a765462dabb14a0511f9447a2
diff --git a/cmd/datok.go b/cmd/datok.go
index bd797bf..66bc7e9 100644
--- a/cmd/datok.go
+++ b/cmd/datok.go
@@ -95,7 +95,7 @@
}
// Create token writer based on the options defined
- tw := datok.NewTokenWriterFromOptions(os.Stdout, flags)
+ tw := datok.NewTokenWriter(os.Stdout, flags)
// Program is running in a pipe
fileInfo, _ := os.Stdin.Stat()
diff --git a/datok.go b/datok.go
index 7e5a36d..5bd54e9 100644
--- a/datok.go
+++ b/datok.go
@@ -746,7 +746,7 @@
// Transduce input to ouutput
func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
- return dat.TransduceTokenWriter(r, NewTokenWriter(w))
+ return dat.TransduceTokenWriter(r, NewTokenWriter(w, SIMPLE))
}
// TransduceTokenWriter transduces an input string against
diff --git a/matrix.go b/matrix.go
index 3a06951..021c191 100644
--- a/matrix.go
+++ b/matrix.go
@@ -315,7 +315,7 @@
// Transduce input to ouutput
func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
- return mat.TransduceTokenWriter(r, NewTokenWriter(w))
+ return mat.TransduceTokenWriter(r, NewTokenWriter(w, SIMPLE))
}
// TransduceTokenWriter transduces an input string against
diff --git a/token_writer.go b/token_writer.go
index aa9c32d..da4ae4d 100644
--- a/token_writer.go
+++ b/token_writer.go
@@ -14,6 +14,8 @@
TOKEN_POS
SENTENCE_POS
NEWLINE_AFTER_EOT
+
+ SIMPLE = TOKENS | SENTENCES
)
type TokenWriter struct {
@@ -23,29 +25,8 @@
Token func(int, []rune)
}
-func NewTokenWriter(w io.Writer) *TokenWriter {
- writer := bufio.NewWriter(w)
-
- return &TokenWriter{
- SentenceEnd: func(_ int) {
- writer.WriteRune('\n')
- },
- TextEnd: func(_ int) {
- writer.WriteRune('\n')
- writer.Flush()
- },
- Token: func(offset int, buf []rune) {
- writer.WriteString(string(buf[offset:]))
- writer.WriteRune('\n')
- },
- Flush: func() error {
- return writer.Flush()
- },
- }
-}
-
// Create a new token writer based on the options
-func NewTokenWriterFromOptions(w io.Writer, flags Bits) *TokenWriter {
+func NewTokenWriter(w io.Writer, flags Bits) *TokenWriter {
writer := bufio.NewWriter(w)
posC := 0
pos := make([]int, 0, 1024)
@@ -84,15 +65,17 @@
}
// Only print one token per line
- } else {
+ } else if flags&TOKENS != 0 {
tw.Token = func(offset int, buf []rune) {
writer.WriteString(string(buf[offset:]))
writer.WriteRune('\n')
}
+ } else {
+ tw.Token = func(_ int, _ []rune) {}
}
// Print sentence boundaries
- if flags&(SENTENCES|SENTENCE_POS) != 0 {
+ if flags&SENTENCE_POS != 0 {
tw.SentenceEnd = func(offset int) {
// Add end position of last token to sentence boundary
@@ -148,7 +131,6 @@
writer.WriteRune('\n')
writer.Flush()
}
-
}
tw.Flush = func() error {
diff --git a/token_writer_test.go b/token_writer_test.go
index e724212..82baa69 100644
--- a/token_writer_test.go
+++ b/token_writer_test.go
@@ -14,7 +14,7 @@
b := make([]byte, 0, 2048)
w := bytes.NewBuffer(b)
- tws := NewTokenWriter(w)
+ tws := NewTokenWriter(w, SIMPLE)
assert.NotNil(tws)
@@ -40,7 +40,7 @@
b := make([]byte, 0, 2048)
w := bytes.NewBuffer(b)
- tws := NewTokenWriterFromOptions(w, TOKENS|SENTENCES|TOKEN_POS)
+ tws := NewTokenWriter(w, TOKENS|SENTENCES|TOKEN_POS)
assert.True(mat.TransduceTokenWriter(
strings.NewReader("This.\x0a\x04And.\n\x04\n"), tws),
@@ -57,7 +57,7 @@
//
// Accept newline after EOT
- tws = NewTokenWriterFromOptions(w, TOKENS|SENTENCES|TOKEN_POS|NEWLINE_AFTER_EOT)
+ tws = NewTokenWriter(w, TOKENS|SENTENCES|TOKEN_POS|NEWLINE_AFTER_EOT)
w.Reset()
mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)
@@ -67,7 +67,7 @@
//
// Write no tokens
- tws = NewTokenWriterFromOptions(w, SENTENCES|TOKEN_POS|NEWLINE_AFTER_EOT)
+ tws = NewTokenWriter(w, SENTENCES|TOKEN_POS|NEWLINE_AFTER_EOT)
w.Reset()
mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)
@@ -77,7 +77,7 @@
//
// Write sentence offsets
- tws = NewTokenWriterFromOptions(w, TOKEN_POS|SENTENCE_POS|NEWLINE_AFTER_EOT)
+ tws = NewTokenWriter(w, TOKEN_POS|SENTENCE_POS|NEWLINE_AFTER_EOT)
w.Reset()
mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)