blob: 4ab205d369db56c8f3f6b3ac398334f5d7ab679d [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3import (
Akron6247a5d2021-08-03 19:18:28 +02004 "bytes"
Akron8ef408b2021-08-02 22:11:04 +02005 "testing"
6
7 "github.com/stretchr/testify/assert"
8)
9
10func TestSimpleString(t *testing.T) {
11 assert := assert.New(t)
12
13 // bau | bauamt
Akron64ffd9a2021-08-03 19:55:21 +020014 tok := LoadFomaFile("testdata/bauamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020015 dat := tok.ToDoubleArray()
16 assert.True(dat.Match("bau"))
17 assert.True(dat.Match("bauamt"))
18 assert.False(dat.Match("baum"))
Akron8ef408b2021-08-02 22:11:04 +020019}
Akron75ebe7f2021-08-03 10:34:10 +020020
21func TestSimpleBranches(t *testing.T) {
22 assert := assert.New(t)
23
24 // (bau | wahl) (amt | en)
Akron64ffd9a2021-08-03 19:55:21 +020025 tok := LoadFomaFile("testdata/wahlamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020026 dat := tok.ToDoubleArray()
27 assert.False(dat.Match("bau"))
28 assert.True(dat.Match("bauamt"))
29 assert.True(dat.Match("wahlamt"))
30 assert.True(dat.Match("bauen"))
31 assert.True(dat.Match("wahlen"))
32 assert.False(dat.Match("baum"))
Akron75ebe7f2021-08-03 10:34:10 +020033}
Akron730a79c2021-08-03 11:05:29 +020034
35func TestSimpleTokenizer(t *testing.T) {
36 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020037 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020038 dat := tok.ToDoubleArray()
39 assert.True(dat.Match("bau"))
40 assert.True(dat.Match("bad"))
41 assert.True(dat.Match("wald gehen"))
Akron730a79c2021-08-03 11:05:29 +020042}
Akron740f3d72021-08-03 12:12:34 +020043
// TestSimpleTokenizerTransduce is currently a no-op: every statement in its
// body is commented out, so nothing is exercised when it runs.
func TestSimpleTokenizerTransduce(t *testing.T) {
	// assert := assert.New(t)
	// tok := LoadFomaFile("testdata/simpletok.fst")
	// dat := tok.ToDoubleArray()
	// // assert.True(dat.Transduce("bau"))
	// // assert.True(dat.Match("bad"))
	// assert.True(dat.Transduce("wald gehen"))
	// assert.Fail("!")
}
54
Akron6247a5d2021-08-03 19:18:28 +020055func TestWriteTokenizer(t *testing.T) {
Akron740f3d72021-08-03 12:12:34 +020056 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020057 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020058 dat := tok.ToDoubleArray()
Akronf2120ca2021-08-03 16:26:41 +020059 assert.True(dat.Match("bau"))
60 assert.True(dat.Match("bad"))
61 assert.True(dat.Match("wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +020062
Akron03a3c612021-08-04 11:51:27 +020063 assert.True(dat.LoadFactor() >= 70)
Akron6247a5d2021-08-03 19:18:28 +020064
65 b := make([]byte, 1024)
66 buf := bytes.NewBuffer(b)
67 n, err := dat.WriteTo(buf)
68 assert.Nil(err)
69 assert.Equal(n, int64(186))
70}
71
// TestFullTokenizer is currently a no-op: every statement in its body is
// commented out, so nothing is exercised when it runs.
func TestFullTokenizer(t *testing.T) {
	// assert := assert.New(t)
	// tok := LoadFomaFile("testdata/tokenizer.fst")
	// dat := tok.ToDoubleArray()
	// assert.True(dat.LoadFactor() >= 70)
	// assert.True(dat.Match("bau"))
	// assert.True(dat.Match("bad"))
	// assert.True(dat.Match("wald gehen"))
}