blob: 0c30dbb00a732edd26c009664376290fe00e5a4b [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3import (
Akron6247a5d2021-08-03 19:18:28 +02004 "bytes"
Akron8ef408b2021-08-02 22:11:04 +02005 "testing"
6
7 "github.com/stretchr/testify/assert"
8)
9
10func TestSimpleString(t *testing.T) {
11 assert := assert.New(t)
12
13 // bau | bauamt
Akron64ffd9a2021-08-03 19:55:21 +020014 tok := LoadFomaFile("testdata/bauamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020015 dat := tok.ToDoubleArray()
16 assert.True(dat.Match("bau"))
17 assert.True(dat.Match("bauamt"))
18 assert.False(dat.Match("baum"))
Akron8ef408b2021-08-02 22:11:04 +020019}
Akron75ebe7f2021-08-03 10:34:10 +020020
21func TestSimpleBranches(t *testing.T) {
22 assert := assert.New(t)
23
24 // (bau | wahl) (amt | en)
Akron64ffd9a2021-08-03 19:55:21 +020025 tok := LoadFomaFile("testdata/wahlamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020026 dat := tok.ToDoubleArray()
27 assert.False(dat.Match("bau"))
28 assert.True(dat.Match("bauamt"))
29 assert.True(dat.Match("wahlamt"))
30 assert.True(dat.Match("bauen"))
31 assert.True(dat.Match("wahlen"))
32 assert.False(dat.Match("baum"))
Akron75ebe7f2021-08-03 10:34:10 +020033}
Akron730a79c2021-08-03 11:05:29 +020034
35func TestSimpleTokenizer(t *testing.T) {
36 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020037 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020038 dat := tok.ToDoubleArray()
39 assert.True(dat.Match("bau"))
40 assert.True(dat.Match("bad"))
41 assert.True(dat.Match("wald gehen"))
Akron730a79c2021-08-03 11:05:29 +020042}
Akron740f3d72021-08-03 12:12:34 +020043
Akron6247a5d2021-08-03 19:18:28 +020044func TestWriteTokenizer(t *testing.T) {
Akron740f3d72021-08-03 12:12:34 +020045 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020046 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020047 dat := tok.ToDoubleArray()
Akronf2120ca2021-08-03 16:26:41 +020048 assert.True(dat.Match("bau"))
49 assert.True(dat.Match("bad"))
50 assert.True(dat.Match("wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +020051
Akron03a3c612021-08-04 11:51:27 +020052 assert.True(dat.LoadFactor() >= 70)
Akron6247a5d2021-08-03 19:18:28 +020053
54 b := make([]byte, 1024)
55 buf := bytes.NewBuffer(b)
56 n, err := dat.WriteTo(buf)
57 assert.Nil(err)
58 assert.Equal(n, int64(186))
59}
60
61func TestFullTokenizer(t *testing.T) {
Akron3fdfec62021-08-04 11:40:10 +020062 assert := assert.New(t)
63 tok := LoadFomaFile("testdata/tokenizer.fst")
64 dat := tok.ToDoubleArray()
Akron03a3c612021-08-04 11:51:27 +020065 assert.True(dat.LoadFactor() >= 70)
Akron3fdfec62021-08-04 11:40:10 +020066 assert.True(dat.Match("bau"))
67 assert.True(dat.Match("bad"))
68 assert.True(dat.Match("wald gehen"))
Akron740f3d72021-08-03 12:12:34 +020069}