Support token offsets in token writer
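
The tokenize command accepts a new --positions / -p flag. With the
flag set, the token writer additionally collects the start and end
character offsets of all tokens and prints them, space separated, on
a single line after each text end. The former TokenWriterI interface
and TokenWriterSimple type are replaced by a single TokenWriter
struct with configurable callback functions.

For the input "This.\n\x04And.\n\x04\n" the output is
(cf. the new test in token_writer_test.go):

  This
  .

  0 4 4 5
  And
  .

  0 3 3 4

Rough usage sketch of the new writer API (within the datok package;
imports of os and strings assumed; the matrix file is the test
fixture):

  tw := NewTokenWriterFromOptions(os.Stdout, true)
  mat := LoadMatrixFile("testdata/tokenizer.matok")
  mat.TransduceTokenWriter(strings.NewReader("This.\n\x04And.\n\x04\n"), tw)
  tw.Flush()
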
Change-Id: I7db20d8f26fd87a2f43c3a6599bfeba050fc340a
diff --git a/cmd/datok.go b/cmd/datok.go
index adff996..9314a93 100644
--- a/cmd/datok.go
+++ b/cmd/datok.go
@@ -18,6 +18,8 @@
} `kong:"cmd, help='Convert a foma file to a Matrix or Double Array tokenizer'"`
Tokenize struct {
Tokenizer string `kong:"required,short='t',help='The Matrix or Double Array Tokenizer file'"`
+ Positions bool `kong:"optional,negatable,default=false,short='p',help='Print token offsets'"`
+ Tokens bool `kong:"optional,negatable,default=true,help='Print token surfaces'"`
} `kong:"cmd, help='Tokenize a text'"`
}
@@ -67,10 +69,13 @@
os.Exit(1)
}
+ // Create a token writer based on the given options
+ tw := datok.NewTokenWriterFromOptions(os.Stdout, cli.Tokenize.Positions)
+
// Program is running in a pipe
fileInfo, _ := os.Stdin.Stat()
if fileInfo.Mode()&os.ModeCharDevice == 0 {
- // Transduce from STDIN and write to STDOUT
- dat.Transduce(os.Stdin, os.Stdout)
+ dat.TransduceTokenWriter(os.Stdin, tw)
+ tw.Flush()
}
}
diff --git a/datok.go b/datok.go
index f8fccaa..f804bf0 100644
--- a/datok.go
+++ b/datok.go
@@ -746,7 +746,7 @@
// Transduce input to output
func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
- return dat.TransduceTokenWriter(r, NewTokenWriterSimple(w))
+ return dat.TransduceTokenWriter(r, NewTokenWriter(w))
}
// TransduceTokenWriter transduces an input string against
@@ -757,7 +757,7 @@
// Based on Mizobuchi et al (2000), p. 129,
// with additional support for IDENTITY, UNKNOWN
// and EPSILON transitions and NONTOKEN and TOKENEND handling.
-func (dat *DaTokenizer) TransduceTokenWriter(r io.Reader, w TokenWriterI) bool {
+func (dat *DaTokenizer) TransduceTokenWriter(r io.Reader, w *TokenWriter) bool {
var a int
var t0 uint32
t := uint32(1) // Initial state
diff --git a/fomafile.go b/fomafile.go
index 2b10736..33128a2 100644
--- a/fomafile.go
+++ b/fomafile.go
@@ -29,6 +29,7 @@
type Tokenizer interface {
Transduce(r io.Reader, w io.Writer) bool
+ TransduceTokenWriter(r io.Reader, w *TokenWriter) bool
Type() string
}
diff --git a/matrix.go b/matrix.go
index 545cd07..2ac31c3 100644
--- a/matrix.go
+++ b/matrix.go
@@ -315,14 +315,14 @@
// Transduce input to output
func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
- return mat.TransduceTokenWriter(r, NewTokenWriterSimple(w))
+ return mat.TransduceTokenWriter(r, NewTokenWriter(w))
}
// TransduceTokenWriter transduces an input string against
// the matrix FSA. The rules are always greedy. If the
// automaton fails, it takes the last possible token ending
// branch.
-func (mat *MatrixTokenizer) TransduceTokenWriter(r io.Reader, w TokenWriterI) bool {
+func (mat *MatrixTokenizer) TransduceTokenWriter(r io.Reader, w *TokenWriter) bool {
var a int
var t0 uint32
t := uint32(1) // Initial state
@@ -499,7 +499,7 @@
textEnd = false
} else {
sentenceEnd = true
- w.SentenceEnd(0)
+ w.SentenceEnd(buffc)
}
}
@@ -531,7 +531,7 @@
if eot {
eot = false
textEnd = true
- w.TextEnd(0)
+ w.TextEnd(buffc)
if DEBUG {
fmt.Println("END OF TEXT")
}
@@ -580,14 +580,14 @@
// sentence split was reached. This may be controversial and therefore
// optional via parameter.
if !sentenceEnd {
- w.SentenceEnd(0)
+ w.SentenceEnd(buffc)
if DEBUG {
fmt.Println("Sentence end")
}
}
if !textEnd {
- w.TextEnd(0)
+ w.TextEnd(buffc)
if DEBUG {
fmt.Println("Text end")
diff --git a/token_writer.go b/token_writer.go
index e1fabdf..d75b261 100644
--- a/token_writer.go
+++ b/token_writer.go
@@ -3,39 +3,96 @@
import (
"bufio"
"io"
+ "strconv"
)
-type TokenWriterI interface {
- SentenceEnd(int)
- TextEnd(int)
- Token(int, []rune)
- Flush() error
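+// TokenWriter defines a set of callbacks the tokenizer invokes
+// for tokens, sentence ends, text ends, and flushing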
+type TokenWriter struct {
+ SentenceEnd func(int)
+ TextEnd func(int)
+ Flush func() error
+ Token func(int, []rune)
}
-var _ TokenWriterI = &TokenWriterSimple{}
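+// NewTokenWriter creates a TokenWriter that writes each token
+// surface on a separate line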
+func NewTokenWriter(w io.Writer) *TokenWriter {
+ writer := bufio.NewWriter(w)
-type TokenWriterSimple struct {
- writer *bufio.Writer
+ return &TokenWriter{
+ SentenceEnd: func(_ int) {
+ writer.WriteRune('\n')
+ },
+ TextEnd: func(_ int) {
+ writer.WriteRune('\n')
+ writer.Flush()
+ },
+ Token: func(offset int, buf []rune) {
+ writer.WriteString(string(buf[offset:]))
+ writer.WriteRune('\n')
+ },
+ Flush: func() error {
+ return writer.Flush()
+ },
+ }
}
-func NewTokenWriterSimple(w io.Writer) *TokenWriterSimple {
- return &TokenWriterSimple{bufio.NewWriter(w)}
-}
+// NewTokenWriterFromOptions creates a new TokenWriter based on the given options
+func NewTokenWriterFromOptions(w io.Writer, positionFlag bool) *TokenWriter {
+ writer := bufio.NewWriter(w)
+ posC := 0
+ pos := make([]int, 0, 200)
-func (tw *TokenWriterSimple) SentenceEnd(_ int) {
- tw.writer.WriteRune('\n')
-}
+ tw := &TokenWriter{}
-func (tw *TokenWriterSimple) TextEnd(_ int) {
- tw.writer.WriteRune('\n')
- tw.writer.Flush()
-}
+ if positionFlag {
+ tw.Token = func(offset int, buf []rune) {
-func (tw *TokenWriterSimple) Token(offset int, buf []rune) {
- tw.writer.WriteString(string(buf[offset:]))
- tw.writer.WriteRune('\n')
-}
+ // TODO:
+ // Store in []uint16
+ // and write to string
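+
+ // Collect the start and end position of the token, counted
+ // from the beginning of the current text; offset marks the
+ // token start within the current buffer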
+ posC += offset
+ pos = append(pos, posC)
+ posC += len(buf) - offset
+ pos = append(pos, posC)
+ // pos = append(pos, offset, len(buf)-offset)
-func (tw *TokenWriterSimple) Flush() error {
- return tw.writer.Flush()
+ writer.WriteString(string(buf[offset:]))
+ writer.WriteRune('\n')
+ }
+ } else {
+ tw.Token = func(offset int, buf []rune) {
+ writer.WriteString(string(buf[offset:]))
+ writer.WriteRune('\n')
+ }
+ }
+
+ tw.SentenceEnd = func(_ int) {
+ writer.WriteRune('\n')
+ }
+
+ if positionFlag {
+ tw.TextEnd = func(offset int) {
+ writer.Flush()
+
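+ // Print the collected offsets of the preceding text,
+ // space separated, on a single line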
+ writer.WriteString(strconv.Itoa(pos[0]))
+ for _, x := range pos[1:] {
+ writer.WriteByte(' ')
+ writer.WriteString(strconv.Itoa(x))
+ }
+ writer.WriteRune('\n')
+
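+ // Reset the collected offsets and restart the position counter
+ // for the next text; subtracting the current buffer position
+ // makes the next text start at offset 0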
+ posC = 0 - offset
+ pos = pos[:0]
+ }
+ } else {
+ tw.TextEnd = func(_ int) {
+ writer.WriteRune('\n')
+ writer.Flush()
+ }
+
+ }
+
+ tw.Flush = func() error {
+ return writer.Flush()
+ }
+
+ return tw
}
diff --git a/token_writer_test.go b/token_writer_test.go
index 8ab6ed0..71524fb 100644
--- a/token_writer_test.go
+++ b/token_writer_test.go
@@ -2,6 +2,7 @@
import (
"bytes"
+ "strings"
"testing"
"github.com/stretchr/testify/assert"
@@ -13,7 +14,7 @@
b := make([]byte, 0, 2048)
w := bytes.NewBuffer(b)
- tws := NewTokenWriterSimple(w)
+ tws := NewTokenWriter(w)
assert.NotNil(tws)
@@ -29,3 +30,23 @@
assert.Equal("abc\nef\n\n\n", w.String())
}
+
+func TestTokenWriterFromOptions(t *testing.T) {
+ assert := assert.New(t)
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+
+ tws := NewTokenWriterFromOptions(w, true)
+
+ mat := LoadMatrixFile("testdata/tokenizer.matok")
+
+ assert.NotNil(mat)
+
+ assert.True(mat.TransduceTokenWriter(
+ strings.NewReader("This.\x0a\x04And.\n\x04\n"), tws),
+ )
+
+ matStr := w.String()
+ assert.Equal("This\n.\n\n0 4 4 5\nAnd\n.\n\n0 3 3 4\n", matStr)
+}