Support token offsets in token writer

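Replace the TokenWriterI interface with a TokenWriter struct of
callback functions and add NewTokenWriterFromOptions, which can
additionally collect the start and end offset of every token and
print them as a space-separated line after each text end. The
tokenize command exposes this via a new --positions/-p flag.

A minimal usage sketch, mirroring cmd/datok.go and assuming an
already loaded tokenizer in dat:

    tw := datok.NewTokenWriterFromOptions(os.Stdout, true)
    dat.TransduceTokenWriter(os.Stdin, tw)
    tw.Flush()
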
Change-Id: I7db20d8f26fd87a2f43c3a6599bfeba050fc340a
diff --git a/cmd/datok.go b/cmd/datok.go
index adff996..9314a93 100644
--- a/cmd/datok.go
+++ b/cmd/datok.go
@@ -18,6 +18,8 @@
 	} `kong:"cmd, help='Convert a foma file to a Matrix or Double Array tokenizer'"`
 	Tokenize struct {
 		Tokenizer string `kong:"required,short='t',help='The Matrix or Double Array Tokenizer file'"`
+		Positions bool   `kong:"optional,negatable,default=false,short='p',help='Print token offsets'"`
+		Tokens    bool   `kong:"optional,negatable,default=true,help='Print token surfaces'"`
 	} `kong:"cmd, help='Tokenize a text'"`
 }
 
@@ -67,10 +69,13 @@
 		os.Exit(1)
 	}
 
+	// Create a token writer based on the given options
+	tw := datok.NewTokenWriterFromOptions(os.Stdout, cli.Tokenize.Positions)
+
 	// Program is running in a pipe
 	fileInfo, _ := os.Stdin.Stat()
 	if fileInfo.Mode()&os.ModeCharDevice == 0 {
-		// Transduce from STDIN and write to STDOUT
-		dat.Transduce(os.Stdin, os.Stdout)
+		dat.TransduceTokenWriter(os.Stdin, tw)
+		tw.Flush()
 	}
 }
diff --git a/datok.go b/datok.go
index f8fccaa..f804bf0 100644
--- a/datok.go
+++ b/datok.go
@@ -746,7 +746,7 @@
 
 // Transduce input to output
 func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
-	return dat.TransduceTokenWriter(r, NewTokenWriterSimple(w))
+	return dat.TransduceTokenWriter(r, NewTokenWriter(w))
 }
 
 // TransduceTokenWriter transduces an input string against
@@ -757,7 +757,7 @@
 // Based on Mizobuchi et al (2000), p. 129,
 // with additional support for IDENTITY, UNKNOWN
 // and EPSILON transitions and NONTOKEN and TOKENEND handling.
-func (dat *DaTokenizer) TransduceTokenWriter(r io.Reader, w TokenWriterI) bool {
+func (dat *DaTokenizer) TransduceTokenWriter(r io.Reader, w *TokenWriter) bool {
 	var a int
 	var t0 uint32
 	t := uint32(1) // Initial state
diff --git a/fomafile.go b/fomafile.go
index 2b10736..33128a2 100644
--- a/fomafile.go
+++ b/fomafile.go
@@ -29,6 +29,7 @@
 
 type Tokenizer interface {
 	Transduce(r io.Reader, w io.Writer) bool
+	TransduceTokenWriter(r io.Reader, w *TokenWriter) bool
 	Type() string
 }
 
diff --git a/matrix.go b/matrix.go
index 545cd07..2ac31c3 100644
--- a/matrix.go
+++ b/matrix.go
@@ -315,14 +315,14 @@
 
 // Transduce input to output
 func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
-	return mat.TransduceTokenWriter(r, NewTokenWriterSimple(w))
+	return mat.TransduceTokenWriter(r, NewTokenWriter(w))
 }
 
 // TransduceTokenWriter transduces an input string against
 // the matrix FSA. The rules are always greedy. If the
 // automaton fails, it takes the last possible token ending
 // branch.
-func (mat *MatrixTokenizer) TransduceTokenWriter(r io.Reader, w TokenWriterI) bool {
+func (mat *MatrixTokenizer) TransduceTokenWriter(r io.Reader, w *TokenWriter) bool {
 	var a int
 	var t0 uint32
 	t := uint32(1) // Initial state
@@ -499,7 +499,7 @@
 				textEnd = false
 			} else {
 				sentenceEnd = true
-				w.SentenceEnd(0)
+				w.SentenceEnd(buffc)
 			}
 		}
 
@@ -531,7 +531,7 @@
 		if eot {
 			eot = false
 			textEnd = true
-			w.TextEnd(0)
+			w.TextEnd(buffc)
 			if DEBUG {
 				fmt.Println("END OF TEXT")
 			}
@@ -580,14 +580,14 @@
 	// sentence split was reached. This may be controversial and therefore
 	// optional via parameter.
 	if !sentenceEnd {
-		w.SentenceEnd(0)
+		w.SentenceEnd(buffc)
 		if DEBUG {
 			fmt.Println("Sentence end")
 		}
 	}
 
 	if !textEnd {
-		w.TextEnd(0)
+		w.TextEnd(buffc)
 
 		if DEBUG {
 			fmt.Println("Text end")
diff --git a/token_writer.go b/token_writer.go
index e1fabdf..d75b261 100644
--- a/token_writer.go
+++ b/token_writer.go
@@ -3,39 +3,96 @@
 import (
 	"bufio"
 	"io"
+	"strconv"
 )
 
-type TokenWriterI interface {
-	SentenceEnd(int)
-	TextEnd(int)
-	Token(int, []rune)
-	Flush() error
+type TokenWriter struct {
+	SentenceEnd func(int)
+	TextEnd     func(int)
+	Flush       func() error
+	Token       func(int, []rune)
 }
 
-var _ TokenWriterI = &TokenWriterSimple{}
+func NewTokenWriter(w io.Writer) *TokenWriter {
+	writer := bufio.NewWriter(w)
 
-type TokenWriterSimple struct {
-	writer *bufio.Writer
+	return &TokenWriter{
+		SentenceEnd: func(_ int) {
+			writer.WriteRune('\n')
+		},
+		TextEnd: func(_ int) {
+			writer.WriteRune('\n')
+			writer.Flush()
+		},
+		Token: func(offset int, buf []rune) {
+			writer.WriteString(string(buf[offset:]))
+			writer.WriteRune('\n')
+		},
+		Flush: func() error {
+			return writer.Flush()
+		},
+	}
 }
 
-func NewTokenWriterSimple(w io.Writer) *TokenWriterSimple {
-	return &TokenWriterSimple{bufio.NewWriter(w)}
-}
+// NewTokenWriterFromOptions creates a new TokenWriter based on the given options
+func NewTokenWriterFromOptions(w io.Writer, positionFlag bool) *TokenWriter {
+	writer := bufio.NewWriter(w)
+	posC := 0
+	pos := make([]int, 0, 200)
 
-func (tw *TokenWriterSimple) SentenceEnd(_ int) {
-	tw.writer.WriteRune('\n')
-}
+	tw := &TokenWriter{}
 
-func (tw *TokenWriterSimple) TextEnd(_ int) {
-	tw.writer.WriteRune('\n')
-	tw.writer.Flush()
-}
+	if positionFlag {
+		tw.Token = func(offset int, buf []rune) {
 
-func (tw *TokenWriterSimple) Token(offset int, buf []rune) {
-	tw.writer.WriteString(string(buf[offset:]))
-	tw.writer.WriteRune('\n')
-}
+			// TODO:
+			//   Store in []uint16
+			//   and write to string
+			posC += offset
+			pos = append(pos, posC)
+			posC += len(buf) - offset
+			pos = append(pos, posC)
+			//		pos = append(pos, offset, len(buf)-offset)
 
-func (tw *TokenWriterSimple) Flush() error {
-	return tw.writer.Flush()
+			writer.WriteString(string(buf[offset:]))
+			writer.WriteRune('\n')
+		}
+	} else {
+		tw.Token = func(offset int, buf []rune) {
+			writer.WriteString(string(buf[offset:]))
+			writer.WriteRune('\n')
+		}
+	}
+
+	tw.SentenceEnd = func(_ int) {
+		writer.WriteRune('\n')
+	}
+
+	if positionFlag {
+		tw.TextEnd = func(offset int) {
+			writer.Flush()
+
+			writer.WriteString(strconv.Itoa(pos[0]))
+			for _, x := range pos[1:] {
+				writer.WriteByte(' ')
+				writer.WriteString(strconv.Itoa(x))
+			}
+			writer.WriteRune('\n')
+
+			posC = 0 - offset
+			pos = pos[:0]
+		}
+	} else {
+		tw.TextEnd = func(_ int) {
+			writer.WriteRune('\n')
+			writer.Flush()
+		}
+
+	}
+
+	tw.Flush = func() error {
+		return writer.Flush()
+	}
+
+	return tw
 }
diff --git a/token_writer_test.go b/token_writer_test.go
index 8ab6ed0..71524fb 100644
--- a/token_writer_test.go
+++ b/token_writer_test.go
@@ -2,6 +2,7 @@
 
 import (
 	"bytes"
+	"strings"
 	"testing"
 
 	"github.com/stretchr/testify/assert"
@@ -13,7 +14,7 @@
 	b := make([]byte, 0, 2048)
 	w := bytes.NewBuffer(b)
 
-	tws := NewTokenWriterSimple(w)
+	tws := NewTokenWriter(w)
 
 	assert.NotNil(tws)
 
@@ -29,3 +30,23 @@
 
 	assert.Equal("abc\nef\n\n\n", w.String())
 }
+
+func TestTokenWriterFromOptions(t *testing.T) {
+	assert := assert.New(t)
+
+	b := make([]byte, 0, 2048)
+	w := bytes.NewBuffer(b)
+
+	tws := NewTokenWriterFromOptions(w, true)
+
+	mat := LoadMatrixFile("testdata/tokenizer.matok")
+
+	assert.NotNil(mat)
+
+	assert.True(mat.TransduceTokenWriter(
+		strings.NewReader("This.\x0a\x04And.\n\x04\n"), tws),
+	)
+
+	matStr := w.String()
+	assert.Equal("This\n.\n\n0 4 4 5\nAnd\n.\n\n0 3 3 4\n", matStr)
+}