blob: 71524fbbd283d015241ccbbde8766c6f2f949be7 [file] [log] [blame]
package datok
import (
"bytes"
"strings"
"testing"
"github.com/stretchr/testify/assert"
)
// TestTokenWriterSimple drives a writer created by NewTokenWriter by hand:
// two tokens, one sentence end and one text end are pushed through it, the
// writer is flushed, and the bytes collected in the buffer are compared with
// the expected text.
func TestTokenWriterSimple(t *testing.T) {
	a := assert.New(t)

	// The buffer starts empty; 2048 is only its pre-allocated capacity.
	out := bytes.NewBuffer(make([]byte, 0, 2048))

	writer := NewTokenWriter(out)
	a.NotNil(writer)

	writer.Token(0, []rune("abc"))
	// The second token is passed with a first argument of 1; the expectation
	// below contains "ef" only, without the leading 'd'.
	writer.Token(1, []rune("def"))
	writer.SentenceEnd(0)
	writer.TextEnd(0)
	writer.Flush()

	a.Equal("abc\nef\n\n\n", out.String())
}
// TestTokenWriterFromOptions loads the matrix tokenizer stored in
// testdata/tokenizer.matok, lets it transduce a short input into a writer
// created by NewTokenWriterFromOptions(w, true), and compares the bytes
// collected in the buffer with the expected text.
//
// NOTE(review): the expected output contains a line of numbers after each
// text ("0 4 4 5", "0 3 3 4"); presumably these are token positions switched
// on by the `true` argument — confirm against NewTokenWriterFromOptions.
func TestTokenWriterFromOptions(t *testing.T) {
	assert := assert.New(t)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	tws := NewTokenWriterFromOptions(w, true)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	// assert.NotNil only records a failure and lets the test continue. Stop
	// here when the fixture could not be loaded, so the method call below is
	// never made on a nil tokenizer (which could panic instead of producing a
	// clean test failure).
	if !assert.NotNil(mat) {
		return
	}

	// The input holds two \x04 bytes; the expected output below has two
	// groups of tokens, each followed by a line of numbers.
	assert.True(mat.TransduceTokenWriter(
		strings.NewReader("This.\x0a\x04And.\n\x04\n"), tws),
	)

	matStr := w.String()
	assert.Equal("This\n.\n\n0 4 4 5\nAnd\n.\n\n0 3 3 4\n", matStr)
}