blob: 192574321367810bcd9f9a6c3ebf136c458316d1 [file] [log] [blame]
package datokenizer

import (
	"bytes"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)
10
11func TestSimpleString(t *testing.T) {
12 assert := assert.New(t)
13
14 // bau | bauamt
Akron64ffd9a2021-08-03 19:55:21 +020015 tok := LoadFomaFile("testdata/bauamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020016 dat := tok.ToDoubleArray()
17 assert.True(dat.Match("bau"))
18 assert.True(dat.Match("bauamt"))
19 assert.False(dat.Match("baum"))
Akron8ef408b2021-08-02 22:11:04 +020020}
Akron75ebe7f2021-08-03 10:34:10 +020021
22func TestSimpleBranches(t *testing.T) {
23 assert := assert.New(t)
24
25 // (bau | wahl) (amt | en)
Akron64ffd9a2021-08-03 19:55:21 +020026 tok := LoadFomaFile("testdata/wahlamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020027 dat := tok.ToDoubleArray()
28 assert.False(dat.Match("bau"))
29 assert.True(dat.Match("bauamt"))
30 assert.True(dat.Match("wahlamt"))
31 assert.True(dat.Match("bauen"))
32 assert.True(dat.Match("wahlen"))
33 assert.False(dat.Match("baum"))
Akron75ebe7f2021-08-03 10:34:10 +020034}
Akron730a79c2021-08-03 11:05:29 +020035
36func TestSimpleTokenizer(t *testing.T) {
37 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020038 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020039 dat := tok.ToDoubleArray()
40 assert.True(dat.Match("bau"))
41 assert.True(dat.Match("bad"))
42 assert.True(dat.Match("wald gehen"))
Akron730a79c2021-08-03 11:05:29 +020043}
Akron740f3d72021-08-03 12:12:34 +020044
Akron068874c2021-08-04 15:19:56 +020045func TestSimpleTokenizerTransduce(t *testing.T) {
Akron84d68e62021-08-04 17:06:52 +020046 assert := assert.New(t)
47 tok := LoadFomaFile("testdata/simpletok.fst")
Akron84d68e62021-08-04 17:06:52 +020048 dat := tok.ToDoubleArray()
Akron3f8571a2021-08-05 11:18:10 +020049
50 r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
51 b := make([]byte, 0, 2048)
52 w := bytes.NewBuffer(b)
Akron524c5432021-08-05 14:14:27 +020053 var tokens []string
Akron524c5432021-08-05 14:14:27 +020054 dat.Transduce(r, w)
55 tokens = strings.Split(w.String(), "\n")
Akron3f8571a2021-08-05 11:18:10 +020056 assert.Equal("wald", tokens[0])
57 assert.Equal("gehen", tokens[1])
58 assert.Equal("Da", tokens[2])
59 assert.Equal("kann", tokens[3])
60 assert.Equal("man", tokens[4])
61 assert.Equal("was", tokens[5])
62 assert.Equal("\"erleben\"", tokens[6])
63
Akron524c5432021-08-05 14:14:27 +020064 r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
65 w.Reset()
66 dat.Transduce(r, w)
67 tokens = strings.Split(w.String(), "\n")
68 assert.Equal("In", tokens[0])
69 assert.Equal("den", tokens[1])
70 assert.Equal("Wald", tokens[2])
71 assert.Equal("gehen", tokens[3])
72 assert.Equal("?", tokens[4])
73 assert.Equal("--", tokens[5])
Akron3f8571a2021-08-05 11:18:10 +020074
Akron524c5432021-08-05 14:14:27 +020075 r = strings.NewReader(" g? -- D")
76 w.Reset()
77 dat.Transduce(r, w)
78 tokens = strings.Split(w.String(), "\n")
79 assert.Equal("g", tokens[0])
80 assert.Equal("?", tokens[1])
81 assert.Equal("--", tokens[2])
82 assert.Equal("D", tokens[3])
83 assert.Equal("", tokens[4])
84 assert.Equal("", tokens[5])
85 assert.Equal(6, len(tokens))
Akron068874c2021-08-04 15:19:56 +020086}
87
Akron3f8571a2021-08-05 11:18:10 +020088func TestReadWriteTokenizer(t *testing.T) {
Akron740f3d72021-08-03 12:12:34 +020089 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020090 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020091 dat := tok.ToDoubleArray()
Akronf2120ca2021-08-03 16:26:41 +020092 assert.True(dat.Match("bau"))
93 assert.True(dat.Match("bad"))
94 assert.True(dat.Match("wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +020095
Akron03a3c612021-08-04 11:51:27 +020096 assert.True(dat.LoadFactor() >= 70)
Akron6247a5d2021-08-03 19:18:28 +020097
Akron3f8571a2021-08-05 11:18:10 +020098 b := make([]byte, 0, 1024)
Akron6247a5d2021-08-03 19:18:28 +020099 buf := bytes.NewBuffer(b)
100 n, err := dat.WriteTo(buf)
101 assert.Nil(err)
Akron3a063ef2021-08-05 19:36:35 +0200102 assert.Equal(int64(208), n)
Akron3f8571a2021-08-05 11:18:10 +0200103
104 dat2 := ParseDatok(buf)
105 assert.NotNil(dat2)
106 assert.Equal(dat.array, dat2.array)
107 assert.Equal(dat.sigma, dat2.sigma)
108 assert.Equal(dat.epsilon, dat2.epsilon)
109 assert.Equal(dat.unknown, dat2.unknown)
110 assert.Equal(dat.identity, dat2.identity)
111 assert.Equal(dat.final, dat2.final)
112 assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
113 assert.True(dat2.Match("bau"))
114 assert.True(dat2.Match("bad"))
115 assert.True(dat2.Match("wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +0200116}
117
118func TestFullTokenizer(t *testing.T) {
Akron3a063ef2021-08-05 19:36:35 +0200119 assert := assert.New(t)
Akron2a4b9292021-08-04 15:35:22 +0200120 /*
Akron2a4b9292021-08-04 15:35:22 +0200121 tok := LoadFomaFile("testdata/tokenizer.fst")
122 dat := tok.ToDoubleArray()
Akron3a063ef2021-08-05 19:36:35 +0200123 dat.Save("testdata/tokenizer.datok")
Akron2a4b9292021-08-04 15:35:22 +0200124 */
Akron3a063ef2021-08-05 19:36:35 +0200125 dat := LoadDatokFile("testdata/tokenizer.datok")
126 assert.NotNil(dat)
127 assert.True(dat.LoadFactor() >= 70)
128 assert.Equal(dat.epsilon, 1)
129 assert.Equal(dat.unknown, 2)
130 assert.Equal(dat.identity, 3)
131 assert.Equal(dat.final, 135)
132 assert.Equal(len(dat.sigma), 131)
133 assert.Equal(len(dat.array), 3771904)
134 assert.Equal(dat.maxSize, 3771903)
135
136 assert.True(dat.Match("bau"))
137 assert.True(dat.Match("bad"))
138 assert.True(dat.Match("wald gehen"))
Akron740f3d72021-08-03 12:12:34 +0200139}
Akron3f8571a2021-08-05 11:18:10 +0200140
141func TestFullTokenizerTransduce(t *testing.T) {
Akron3610f102021-08-08 14:13:25 +0200142 assert := assert.New(t)
143
Akron3f8571a2021-08-05 11:18:10 +0200144 /*
Akron3f8571a2021-08-05 11:18:10 +0200145
Akron3610f102021-08-08 14:13:25 +0200146 tok := LoadFomaFile("testdata/tokenizer.fst")
147 dat := tok.ToDoubleArray()
148 //dat.Save("testdata/tokenizer.datok")
Akron3f8571a2021-08-05 11:18:10 +0200149 */
Akron3610f102021-08-08 14:13:25 +0200150 dat := LoadDatokFile("testdata/tokenizer.datok")
151
152 assert.NotNil(dat)
153
154 r := strings.NewReader("tra. und Du?")
155
156 b := make([]byte, 0, 2048)
157 w := bytes.NewBuffer(b)
158 var tokens []string
159
160 assert.True(dat.Transduce(r, w))
161
162 tokens = strings.Split(w.String(), "\n")
163 assert.Equal("tra", tokens[0])
164 assert.Equal(".", tokens[1])
165 assert.Equal("und", tokens[2])
166 assert.Equal("Du", tokens[3])
167 assert.Equal("?", tokens[4])
168 assert.Equal("", tokens[5])
169 assert.Equal("", tokens[6])
170 assert.Equal(7, len(tokens))
Akron3f8571a2021-08-05 11:18:10 +0200171}