blob: 37a61b291a01abadf5758c15f410e9bcae524508 [file] [log] [blame]
package datok
import (
"bytes"
"strings"
"testing"
"github.com/stretchr/testify/assert"
)
func TestFullTokenizerMatrix(t *testing.T) {
assert := assert.New(t)
foma := LoadFomaFile("testdata/simpletok.fst")
assert.NotNil(foma)
mat := foma.ToMatrix()
r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
b := make([]byte, 0, 2048)
w := bytes.NewBuffer(b)
var tokens []string
mat.Transduce(r, w)
tokens = strings.Split(w.String(), "\n")
assert.Equal(len(tokens), 10)
assert.Equal("wald", tokens[0])
assert.Equal("gehen", tokens[1])
assert.Equal("Da", tokens[2])
assert.Equal("kann", tokens[3])
assert.Equal("man", tokens[4])
assert.Equal("was", tokens[5])
assert.Equal("\"erleben\"", tokens[6])
assert.Equal("!", tokens[7])
r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
w.Reset()
mat.Transduce(r, w)
tokens = strings.Split(w.String(), "\n")
assert.Equal("In", tokens[0])
assert.Equal("den", tokens[1])
assert.Equal("Wald", tokens[2])
assert.Equal("gehen", tokens[3])
assert.Equal("?", tokens[4])
assert.Equal("--", tokens[5])
r = strings.NewReader(" g? -- D")
w.Reset()
mat.Transduce(r, w)
tokens = strings.Split(w.String(), "\n")
assert.Equal("g", tokens[0])
assert.Equal("?", tokens[1])
assert.Equal("--", tokens[2])
assert.Equal("D", tokens[3])
assert.Equal("", tokens[4])
assert.Equal("", tokens[5])
assert.Equal(6, len(tokens))
}
func TestReadWriteMatrixTokenizer(t *testing.T) {
assert := assert.New(t)
foma := LoadFomaFile("testdata/simpletok.fst")
assert.NotNil(foma)
mat := foma.ToMatrix()
assert.NotNil(foma)
assert.True(tmatch(mat, "bau"))
assert.True(tmatch(mat, "bad"))
assert.True(tmatch(mat, "wald gehen"))
b := make([]byte, 0, 1024)
buf := bytes.NewBuffer(b)
n, err := mat.WriteTo(buf)
assert.Nil(err)
assert.Equal(int64(248), n)
mat2 := ParseMatrix(buf)
assert.NotNil(mat2)
assert.Equal(mat.sigma, mat2.sigma)
assert.Equal(mat.epsilon, mat2.epsilon)
assert.Equal(mat.unknown, mat2.unknown)
assert.Equal(mat.identity, mat2.identity)
assert.Equal(mat.stateCount, mat2.stateCount)
assert.Equal(len(mat.array), len(mat2.array))
assert.Equal(mat.array, mat2.array)
assert.True(tmatch(mat2, "bau"))
assert.True(tmatch(mat2, "bad"))
assert.True(tmatch(mat2, "wald gehen"))
}
func TestFullTokenizerMatrixSentenceSplitter(t *testing.T) {
assert := assert.New(t)
foma := LoadFomaFile("testdata/tokenizer.fst")
assert.NotNil(foma)
mat := foma.ToMatrix()
b := make([]byte, 0, 2048)
w := bytes.NewBuffer(b)
var sentences []string
// testSentSplitterSimple
assert.True(mat.Transduce(strings.NewReader("Der alte Mann."), w))
sentences = strings.Split(w.String(), "\n\n")
assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
assert.Equal("Der\nalte\nMann\n.", sentences[0])
assert.Equal("", sentences[1])
assert.Equal(len(sentences), 2)
w.Reset()
assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
sentences = strings.Split(w.String(), "\n\n")
assert.Equal(len(sentences), 2)
assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
assert.Equal("", sentences[1])
/*
w.Reset()
assert.True(mat.Transduce(strings.NewReader(""), w))
sentences = strings.Split(w.String(), "\n\n")
assert.Equal(len(sentences), 1)
assert.Equal("\n", sentences[0])
w.Reset()
assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
sentences = strings.Split(w.String(), "\n\n")
assert.Equal(len(sentences), 2)
w.Reset()
assert.True(mat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
sentences = strings.Split(w.String(), "\n\n")
assert.Equal(len(sentences), 2)
w.Reset()
assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
sentences = strings.Split(w.String(), "\n\n")
assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
assert.Equal("", sentences[1])
assert.Equal(len(sentences), 2)
w.Reset()
assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
sentences = strings.Split(w.String(), "\n\n")
assert.Equal("", sentences[1])
assert.Equal(len(sentences), 2)
w.Reset()
assert.True(mat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
sentences = strings.Split(w.String(), "\n\n")
assert.Equal(len(sentences), 2)
w.Reset()
assert.True(mat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
sentences = strings.Split(w.String(), "\n\n")
assert.Equal(len(sentences), 2)
w.Reset()
assert.True(mat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
sentences = strings.Split(w.String(), "\n\n")
assert.Equal(len(sentences), 2)
assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
assert.Equal("", sentences[1])
w.Reset()
assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
sentences = strings.Split(w.String(), "\n\n")
assert.Equal(len(sentences), 3)
assert.Equal("Ausschalten\n!!!", sentences[0])
assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
assert.Equal("", sentences[2])
w.Reset()
assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
sentences = strings.Split(w.String(), "\n\n")
assert.Equal(len(sentences), 2)
*/
/*
Test:
"\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
*/
}