package datokenizer

import (
	"bytes"
	"testing"

	"github.com/stretchr/testify/assert"
)

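// TestSimpleString matches single words against the automaton
// compiled from testdata/bauamt.fst (bau | bauamt) after conversion
// to a double array.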
func TestSimpleString(t *testing.T) {
	assert := assert.New(t)

	// bau | bauamt
	tok := LoadFomaFile("testdata/bauamt.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bauamt"))
	assert.False(dat.Match("baum"))
}

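// TestSimpleBranches matches words against the branching automaton
// compiled from testdata/wahlamt.fst: (bau | wahl) (amt | en).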
func TestSimpleBranches(t *testing.T) {
	assert := assert.New(t)

	// (bau | wahl) (amt | en)
	tok := LoadFomaFile("testdata/wahlamt.fst")
	dat := tok.ToDoubleArray()
	assert.False(dat.Match("bau"))
	assert.True(dat.Match("bauamt"))
	assert.True(dat.Match("wahlamt"))
	assert.True(dat.Match("bauen"))
	assert.True(dat.Match("wahlen"))
	assert.False(dat.Match("baum"))
}

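// TestSimpleTokenizer matches words, including input containing a
// space, against the simple tokenizer automaton.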
func TestSimpleTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))
}

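// TestSimpleTokenizerTransduce checks the Transduce method of the
// double array representation of the simple tokenizer automaton.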
func TestSimpleTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Transduce("wald gehen"))
}

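// TestWriteTokenizer checks the load factor of the double array
// representation and serializes it to a buffer via WriteTo.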
func TestWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))

	assert.True(dat.LoadFactor() >= 70)

	// Start with an empty buffer (capacity 1024) so the serialized
	// data is not appended after 1024 zero bytes.
	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := dat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(186), n)
}

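// TestFullTokenizer will exercise the full tokenizer automaton
// (testdata/tokenizer.fst); its body is commented out for now.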
func TestFullTokenizer(t *testing.T) {
	/*
		assert := assert.New(t)
		tok := LoadFomaFile("testdata/tokenizer.fst")
		dat := tok.ToDoubleArray()
		assert.True(dat.LoadFactor() >= 70)
		assert.True(dat.Match("bau"))
		assert.True(dat.Match("bad"))
		assert.True(dat.Match("wald gehen"))
	*/
}
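
// BenchmarkDoubleArrayMatch is a minimal sketch, not part of the
// original test suite, of how the double array matcher could be
// benchmarked using only the calls exercised above (LoadFomaFile,
// ToDoubleArray, Match); the FST file and probe string are taken
// from the tests above, and the benchmark name is illustrative.
func BenchmarkDoubleArrayMatch(b *testing.B) {
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		dat.Match("wald gehen")
	}
}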