blob: 05847b2c71e22fe4cd0af22ca034c3c34ff21d21 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3import (
Akron6247a5d2021-08-03 19:18:28 +02004 "bytes"
Akron3f8571a2021-08-05 11:18:10 +02005 "strings"
Akron8ef408b2021-08-02 22:11:04 +02006 "testing"
7
8 "github.com/stretchr/testify/assert"
9)
10
11func TestSimpleString(t *testing.T) {
12 assert := assert.New(t)
13
14 // bau | bauamt
Akron64ffd9a2021-08-03 19:55:21 +020015 tok := LoadFomaFile("testdata/bauamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020016 dat := tok.ToDoubleArray()
17 assert.True(dat.Match("bau"))
18 assert.True(dat.Match("bauamt"))
19 assert.False(dat.Match("baum"))
Akron8ef408b2021-08-02 22:11:04 +020020}
Akron75ebe7f2021-08-03 10:34:10 +020021
22func TestSimpleBranches(t *testing.T) {
23 assert := assert.New(t)
24
25 // (bau | wahl) (amt | en)
Akron64ffd9a2021-08-03 19:55:21 +020026 tok := LoadFomaFile("testdata/wahlamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020027 dat := tok.ToDoubleArray()
28 assert.False(dat.Match("bau"))
29 assert.True(dat.Match("bauamt"))
30 assert.True(dat.Match("wahlamt"))
31 assert.True(dat.Match("bauen"))
32 assert.True(dat.Match("wahlen"))
33 assert.False(dat.Match("baum"))
Akron75ebe7f2021-08-03 10:34:10 +020034}
Akron730a79c2021-08-03 11:05:29 +020035
36func TestSimpleTokenizer(t *testing.T) {
37 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020038 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020039 dat := tok.ToDoubleArray()
40 assert.True(dat.Match("bau"))
41 assert.True(dat.Match("bad"))
42 assert.True(dat.Match("wald gehen"))
Akron730a79c2021-08-03 11:05:29 +020043}
Akron740f3d72021-08-03 12:12:34 +020044
Akron068874c2021-08-04 15:19:56 +020045func TestSimpleTokenizerTransduce(t *testing.T) {
Akron84d68e62021-08-04 17:06:52 +020046 assert := assert.New(t)
47 tok := LoadFomaFile("testdata/simpletok.fst")
Akron84d68e62021-08-04 17:06:52 +020048 dat := tok.ToDoubleArray()
Akron3f8571a2021-08-05 11:18:10 +020049
50 r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
51 b := make([]byte, 0, 2048)
52 w := bytes.NewBuffer(b)
Akron524c5432021-08-05 14:14:27 +020053 var tokens []string
Akron524c5432021-08-05 14:14:27 +020054 dat.Transduce(r, w)
55 tokens = strings.Split(w.String(), "\n")
Akron3f8571a2021-08-05 11:18:10 +020056 assert.Equal("wald", tokens[0])
57 assert.Equal("gehen", tokens[1])
58 assert.Equal("Da", tokens[2])
59 assert.Equal("kann", tokens[3])
60 assert.Equal("man", tokens[4])
61 assert.Equal("was", tokens[5])
62 assert.Equal("\"erleben\"", tokens[6])
63
Akron524c5432021-08-05 14:14:27 +020064 r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
65 w.Reset()
66 dat.Transduce(r, w)
67 tokens = strings.Split(w.String(), "\n")
68 assert.Equal("In", tokens[0])
69 assert.Equal("den", tokens[1])
70 assert.Equal("Wald", tokens[2])
71 assert.Equal("gehen", tokens[3])
72 assert.Equal("?", tokens[4])
73 assert.Equal("--", tokens[5])
Akron3f8571a2021-08-05 11:18:10 +020074
Akron524c5432021-08-05 14:14:27 +020075 r = strings.NewReader(" g? -- D")
76 w.Reset()
77 dat.Transduce(r, w)
78 tokens = strings.Split(w.String(), "\n")
79 assert.Equal("g", tokens[0])
80 assert.Equal("?", tokens[1])
81 assert.Equal("--", tokens[2])
82 assert.Equal("D", tokens[3])
83 assert.Equal("", tokens[4])
Akrondf0a3ef2021-08-09 15:53:45 +020084 assert.Equal(5, len(tokens))
Akron068874c2021-08-04 15:19:56 +020085}
86
Akron3f8571a2021-08-05 11:18:10 +020087func TestReadWriteTokenizer(t *testing.T) {
Akron740f3d72021-08-03 12:12:34 +020088 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020089 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020090 dat := tok.ToDoubleArray()
Akronf2120ca2021-08-03 16:26:41 +020091 assert.True(dat.Match("bau"))
92 assert.True(dat.Match("bad"))
93 assert.True(dat.Match("wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +020094
Akron03a3c612021-08-04 11:51:27 +020095 assert.True(dat.LoadFactor() >= 70)
Akron6247a5d2021-08-03 19:18:28 +020096
Akron3f8571a2021-08-05 11:18:10 +020097 b := make([]byte, 0, 1024)
Akron6247a5d2021-08-03 19:18:28 +020098 buf := bytes.NewBuffer(b)
99 n, err := dat.WriteTo(buf)
100 assert.Nil(err)
Akron03c92fe2021-08-09 14:07:57 +0200101 assert.Equal(int64(224), n)
Akron3f8571a2021-08-05 11:18:10 +0200102
103 dat2 := ParseDatok(buf)
104 assert.NotNil(dat2)
105 assert.Equal(dat.array, dat2.array)
106 assert.Equal(dat.sigma, dat2.sigma)
107 assert.Equal(dat.epsilon, dat2.epsilon)
108 assert.Equal(dat.unknown, dat2.unknown)
109 assert.Equal(dat.identity, dat2.identity)
110 assert.Equal(dat.final, dat2.final)
111 assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
112 assert.True(dat2.Match("bau"))
113 assert.True(dat2.Match("bad"))
114 assert.True(dat2.Match("wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +0200115}
116
117func TestFullTokenizer(t *testing.T) {
Akron3a063ef2021-08-05 19:36:35 +0200118 assert := assert.New(t)
Akron2a4b9292021-08-04 15:35:22 +0200119 /*
Akron2a4b9292021-08-04 15:35:22 +0200120 tok := LoadFomaFile("testdata/tokenizer.fst")
121 dat := tok.ToDoubleArray()
Akron3a063ef2021-08-05 19:36:35 +0200122 dat.Save("testdata/tokenizer.datok")
Akron2a4b9292021-08-04 15:35:22 +0200123 */
Akron3a063ef2021-08-05 19:36:35 +0200124 dat := LoadDatokFile("testdata/tokenizer.datok")
125 assert.NotNil(dat)
126 assert.True(dat.LoadFactor() >= 70)
127 assert.Equal(dat.epsilon, 1)
128 assert.Equal(dat.unknown, 2)
129 assert.Equal(dat.identity, 3)
Akron03c92fe2021-08-09 14:07:57 +0200130 assert.Equal(dat.final, 136)
Akron3a063ef2021-08-05 19:36:35 +0200131 assert.Equal(len(dat.sigma), 131)
Akron03c92fe2021-08-09 14:07:57 +0200132 assert.Equal(len(dat.array), 3806280)
133 assert.Equal(dat.maxSize, 3806279)
Akron3a063ef2021-08-05 19:36:35 +0200134
135 assert.True(dat.Match("bau"))
136 assert.True(dat.Match("bad"))
137 assert.True(dat.Match("wald gehen"))
Akron740f3d72021-08-03 12:12:34 +0200138}
Akron3f8571a2021-08-05 11:18:10 +0200139
140func TestFullTokenizerTransduce(t *testing.T) {
Akron3610f102021-08-08 14:13:25 +0200141 assert := assert.New(t)
142
Akron03c92fe2021-08-09 14:07:57 +0200143 var dat *DaTokenizer
Akron3610f102021-08-08 14:13:25 +0200144
Akron03c92fe2021-08-09 14:07:57 +0200145 if false {
146 tok := LoadFomaFile("testdata/tokenizer.fst")
147 dat = tok.ToDoubleArray()
Akron439f4ec2021-08-09 15:45:38 +0200148 // dat.Save("testdata/tokenizer.datok")
Akron03c92fe2021-08-09 14:07:57 +0200149 } else {
150 dat = LoadDatokFile("testdata/tokenizer.datok")
151 }
Akron3610f102021-08-08 14:13:25 +0200152 assert.NotNil(dat)
153
Akronc5d8d432021-08-10 16:48:44 +0200154 r := strings.NewReader("tra. u Du?")
Akron3610f102021-08-08 14:13:25 +0200155
156 b := make([]byte, 0, 2048)
157 w := bytes.NewBuffer(b)
158 var tokens []string
159
160 assert.True(dat.Transduce(r, w))
161
162 tokens = strings.Split(w.String(), "\n")
Akron1594cb82021-08-11 11:14:56 +0200163 assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
Akron3610f102021-08-08 14:13:25 +0200164 assert.Equal("tra", tokens[0])
165 assert.Equal(".", tokens[1])
Akronb4bbb472021-08-09 11:49:38 +0200166 assert.Equal("", tokens[2])
Akronc5d8d432021-08-10 16:48:44 +0200167 assert.Equal("u", tokens[3])
Akronb4bbb472021-08-09 11:49:38 +0200168 assert.Equal("Du", tokens[4])
169 assert.Equal("?", tokens[5])
Akron3610f102021-08-08 14:13:25 +0200170 assert.Equal("", tokens[6])
Akron1594cb82021-08-11 11:14:56 +0200171 assert.Equal("", tokens[7])
172 assert.Equal(8, len(tokens))
Akron3f8571a2021-08-05 11:18:10 +0200173}
Akronb7e1f132021-08-10 11:52:31 +0200174
175func TestFullTokenizerSentenceSplitter(t *testing.T) {
176 assert := assert.New(t)
177 dat := LoadDatokFile("testdata/tokenizer.datok")
178 assert.NotNil(dat)
179
180 b := make([]byte, 0, 2048)
181 w := bytes.NewBuffer(b)
182 var sentences []string
183
184 // testSentSplitterSimple
Akron1594cb82021-08-11 11:14:56 +0200185 assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
186 sentences = strings.Split(w.String(), "\n\n")
187
188 assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
189 assert.Equal("Der\nalte\nMann\n.", sentences[0])
190 assert.Equal("", sentences[1])
191 assert.Equal(len(sentences), 2)
192
193 w.Reset()
194 assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
195 sentences = strings.Split(w.String(), "\n\n")
196 assert.Equal(len(sentences), 2)
197 assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
198 assert.Equal("", sentences[1])
199
200 w.Reset()
201 assert.True(dat.Transduce(strings.NewReader(""), w))
Akronb7e1f132021-08-10 11:52:31 +0200202 sentences = strings.Split(w.String(), "\n\n")
203 assert.Equal(len(sentences), 1)
Akron1594cb82021-08-11 11:14:56 +0200204 assert.Equal("", sentences[0])
205
206 w.Reset()
207 assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
208 sentences = strings.Split(w.String(), "\n\n")
209 assert.Equal(len(sentences), 2)
210
211 w.Reset()
212 assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
213 sentences = strings.Split(w.String(), "\n\n")
214 assert.Equal(len(sentences), 2)
215
216 /*
217 w.Reset()
218 assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
219 sentences = strings.Split(w.String(), "\n\n")
220 assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum\n", sentences[0])
221 assert.Equal(len(sentences), 1)
222
223 w.Reset()
224 assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
225 sentences = strings.Split(w.String(), "\n\n")
226 assert.Equal(len(sentences), 1)
227
228 w.Reset()
229 assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
230 sentences = strings.Split(w.String(), "\n\n")
231 assert.Equal(len(sentences), 1)
232
233 w.Reset()
234 assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
235 sentences = strings.Split(w.String(), "\n\n")
236 assert.Equal(len(sentences), 1)
237
238 w.Reset()
239 assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
240 sentences = strings.Split(w.String(), "\n\n")
241 assert.Equal(len(sentences), 1)
242 assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen\n", sentences[0])
243
244 w.Reset()
245 assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
246 sentences = strings.Split(w.String(), "\n\n")
247 assert.Equal(len(sentences), 2)
248 assert.Equal("Ausschalten\n!!!", sentences[0])
249 assert.Equal("Hast\nDu\nnicht\ngehört\n???\n", sentences[1])
250 */
251
252 /*
253 w.Reset()
254 assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
255 sentences = strings.Split(w.String(), "\n\n")
256 assert.Equal(len(sentences), 1)
257 */
258
259 /*
260 Test:
261 "\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
262 */
Akronb7e1f132021-08-10 11:52:31 +0200263}