Add sentence flags (for printing and offsets)

Change-Id: Ia4aaf75cae509ba1a43d04e369e7d5b21867bc15
diff --git a/cmd/datok.go b/cmd/datok.go
index 95919e3..00c79eb 100644
--- a/cmd/datok.go
+++ b/cmd/datok.go
@@ -17,10 +17,12 @@
 		DoubleArray bool   `kong:"optional,short='d',help='Convert to Double Array instead of Matrix representation'"`
 	} `kong:"cmd, help='Convert a foma file to a Matrix or Double Array tokenizer'"`
 	Tokenize struct {
-		Tokenizer       string `kong:"required,short='t',help='The Matrix or Double Array Tokenizer file'"`
-		Positions       bool   `kong:"optional,negatable,default=false,short='p',help='Print token offsets'"`
-		Tokens          bool   `kong:"optional,negatable,default=true,help='Print token surfaces'"`
-		NewlineAfterEOT bool   `kong:"optional,negatable,help='Ignore newline after EOT'"`
+		Tokenizer         string `kong:"required,short='t',help='The Matrix or Double Array Tokenizer file'"`
+		Tokens            bool   `kong:"optional,negatable,default=true,help='Print token surfaces'"`
+		Sentences         bool   `kong:"optional,negatable,default=true,help='Print sentence boundaries'"`
+		TokenPositions    bool   `kong:"optional,negatable,default=false,short='p',help='Print token offsets'"`
+		SentencePositions bool   `kong:"optional,negatable,default=false,help='Print sentence offsets'"`
+		NewlineAfterEOT   bool   `kong:"optional,negatable,help='Ignore newline after EOT'"`
 	} `kong:"cmd, help='Tokenize a text'"`
 }
 
@@ -73,8 +75,10 @@
 	// Create token writer based on the options defined
 	tw := datok.NewTokenWriterFromOptions(
 		os.Stdout,
-		cli.Tokenize.Positions,
+		cli.Tokenize.TokenPositions,
 		cli.Tokenize.Tokens,
+		cli.Tokenize.Sentences,
+		cli.Tokenize.SentencePositions,
 		cli.Tokenize.NewlineAfterEOT,
 	)
 
diff --git a/token_writer.go b/token_writer.go
index 5e4b72d..dd528b5 100644
--- a/token_writer.go
+++ b/token_writer.go
@@ -35,10 +35,12 @@
 }
 
 // Create a new token writer based on the options
-func NewTokenWriterFromOptions(w io.Writer, positionFlag bool, tokenFlag bool, newlineAfterEot bool) *TokenWriter {
+func NewTokenWriterFromOptions(w io.Writer, positionFlag bool, tokenFlag bool, sentenceFlag bool, sentencePositionFlag bool, newlineAfterEot bool) *TokenWriter {
 	writer := bufio.NewWriter(w)
 	posC := 0
-	pos := make([]int, 0, 200)
+	pos := make([]int, 0, 1024)
+	sentB := true
+	sent := make([]int, 0, 1024)
 
 	tw := &TokenWriter{}
 
@@ -56,6 +58,12 @@
 
 			posC += offset
 			pos = append(pos, posC)
+
+			// Token is the start of a sentence
+			if sentB {
+				sentB = false
+				sent = append(sent, posC)
+			}
 			posC += len(buf) - offset
 			pos = append(pos, posC)
 
@@ -64,6 +72,8 @@
 				writer.WriteRune('\n')
 			}
 		}
+
+		// Only print one token per line
 	} else {
 		tw.Token = func(offset int, buf []rune) {
 			writer.WriteString(string(buf[offset:]))
@@ -71,20 +81,53 @@
 		}
 	}
 
-	tw.SentenceEnd = func(_ int) {
-		writer.WriteRune('\n')
+	// Collect sentence offsets; also print boundary newlines when requested
+	if sentencePositionFlag {
+		tw.SentenceEnd = func(offset int) {
+
+			// Add end position of last token to sentence boundary
+			sent = append(sent, pos[len(pos)-1])
+			sentB = true
+
+			if sentenceFlag {
+				writer.WriteRune('\n')
+			}
+		}
+
+		// Print sentence boundaries as newlines
+	} else if sentenceFlag {
+		tw.SentenceEnd = func(_ int) {
+			writer.WriteRune('\n')
+		}
+
+		// Ignore sentence boundaries
+	} else {
+		tw.SentenceEnd = func(_ int) {}
 	}
 
-	if positionFlag {
+	if positionFlag || sentencePositionFlag {
 		tw.TextEnd = func(_ int) {
 			writer.Flush()
 
-			writer.WriteString(strconv.Itoa(pos[0]))
-			for _, x := range pos[1:] {
-				writer.WriteByte(' ')
-				writer.WriteString(strconv.Itoa(x))
+			if positionFlag {
+				writer.WriteString(strconv.Itoa(pos[0]))
+				for _, x := range pos[1:] {
+					writer.WriteByte(' ')
+					writer.WriteString(strconv.Itoa(x))
+				}
+				writer.WriteRune('\n')
 			}
-			writer.WriteRune('\n')
+
+			if sentencePositionFlag {
+				writer.WriteString(strconv.Itoa(sent[0]))
+				for _, x := range sent[1:] {
+					writer.WriteByte(' ')
+					writer.WriteString(strconv.Itoa(x))
+				}
+				writer.WriteRune('\n')
+				sent = sent[:0]
+				sentB = true
+			}
 
 			posC = 0
 			pos = pos[:0]
diff --git a/token_writer_test.go b/token_writer_test.go
index a75b3d9..601da0e 100644
--- a/token_writer_test.go
+++ b/token_writer_test.go
@@ -37,7 +37,7 @@
 	b := make([]byte, 0, 2048)
 	w := bytes.NewBuffer(b)
 
-	tws := NewTokenWriterFromOptions(w, true, true, false)
+	tws := NewTokenWriterFromOptions(w, true, true, true, false, false)
 
 	mat := LoadMatrixFile("testdata/tokenizer.matok")
 
@@ -58,7 +58,7 @@
 
 	//
 	// Accept newline after EOT
-	tws = NewTokenWriterFromOptions(w, true, true, true)
+	tws = NewTokenWriterFromOptions(w, true, true, true, false, true)
 
 	w.Reset()
 	mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)
@@ -68,11 +68,21 @@
 
 	//
 	// Write no tokens
-	tws = NewTokenWriterFromOptions(w, true, false, true)
+	tws = NewTokenWriterFromOptions(w, true, false, true, false, true)
 
 	w.Reset()
 	mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)
 
 	matStr = w.String()
 	assert.Equal("\n1 5 5 6\n\n0 3 3 4\n", matStr)
+
+	//
+	// Write sentences
+	tws = NewTokenWriterFromOptions(w, true, false, false, true, true)
+
+	w.Reset()
+	mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)
+
+	matStr = w.String()
+	assert.Equal("1 5 5 6\n1 6\n0 3 3 4\n0 4\n", matStr)
 }