blob: 720199f840c957f785101a2cfc14679f64a399cf [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3import (
Akron6247a5d2021-08-03 19:18:28 +02004 "bytes"
Akron3f8571a2021-08-05 11:18:10 +02005 "strings"
Akron8ef408b2021-08-02 22:11:04 +02006 "testing"
7
8 "github.com/stretchr/testify/assert"
9)
10
11func TestSimpleString(t *testing.T) {
12 assert := assert.New(t)
13
14 // bau | bauamt
Akron64ffd9a2021-08-03 19:55:21 +020015 tok := LoadFomaFile("testdata/bauamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020016 dat := tok.ToDoubleArray()
17 assert.True(dat.Match("bau"))
18 assert.True(dat.Match("bauamt"))
19 assert.False(dat.Match("baum"))
Akron8ef408b2021-08-02 22:11:04 +020020}
Akron75ebe7f2021-08-03 10:34:10 +020021
22func TestSimpleBranches(t *testing.T) {
23 assert := assert.New(t)
24
25 // (bau | wahl) (amt | en)
Akron64ffd9a2021-08-03 19:55:21 +020026 tok := LoadFomaFile("testdata/wahlamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020027 dat := tok.ToDoubleArray()
28 assert.False(dat.Match("bau"))
29 assert.True(dat.Match("bauamt"))
30 assert.True(dat.Match("wahlamt"))
31 assert.True(dat.Match("bauen"))
32 assert.True(dat.Match("wahlen"))
33 assert.False(dat.Match("baum"))
Akron75ebe7f2021-08-03 10:34:10 +020034}
Akron730a79c2021-08-03 11:05:29 +020035
36func TestSimpleTokenizer(t *testing.T) {
37 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020038 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020039 dat := tok.ToDoubleArray()
40 assert.True(dat.Match("bau"))
41 assert.True(dat.Match("bad"))
42 assert.True(dat.Match("wald gehen"))
Akron730a79c2021-08-03 11:05:29 +020043}
Akron740f3d72021-08-03 12:12:34 +020044
Akron068874c2021-08-04 15:19:56 +020045func TestSimpleTokenizerTransduce(t *testing.T) {
Akron84d68e62021-08-04 17:06:52 +020046 assert := assert.New(t)
47 tok := LoadFomaFile("testdata/simpletok.fst")
Akron84d68e62021-08-04 17:06:52 +020048 dat := tok.ToDoubleArray()
Akron3f8571a2021-08-05 11:18:10 +020049
50 r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
51 b := make([]byte, 0, 2048)
52 w := bytes.NewBuffer(b)
Akron524c5432021-08-05 14:14:27 +020053 var tokens []string
Akron524c5432021-08-05 14:14:27 +020054 dat.Transduce(r, w)
55 tokens = strings.Split(w.String(), "\n")
Akron3f8571a2021-08-05 11:18:10 +020056 assert.Equal("wald", tokens[0])
57 assert.Equal("gehen", tokens[1])
58 assert.Equal("Da", tokens[2])
59 assert.Equal("kann", tokens[3])
60 assert.Equal("man", tokens[4])
61 assert.Equal("was", tokens[5])
62 assert.Equal("\"erleben\"", tokens[6])
63
Akron524c5432021-08-05 14:14:27 +020064 r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
65 w.Reset()
66 dat.Transduce(r, w)
67 tokens = strings.Split(w.String(), "\n")
68 assert.Equal("In", tokens[0])
69 assert.Equal("den", tokens[1])
70 assert.Equal("Wald", tokens[2])
71 assert.Equal("gehen", tokens[3])
72 assert.Equal("?", tokens[4])
73 assert.Equal("--", tokens[5])
Akron3f8571a2021-08-05 11:18:10 +020074
Akron524c5432021-08-05 14:14:27 +020075 r = strings.NewReader(" g? -- D")
76 w.Reset()
77 dat.Transduce(r, w)
78 tokens = strings.Split(w.String(), "\n")
79 assert.Equal("g", tokens[0])
80 assert.Equal("?", tokens[1])
81 assert.Equal("--", tokens[2])
82 assert.Equal("D", tokens[3])
83 assert.Equal("", tokens[4])
Akron6e70dc82021-08-11 11:33:18 +020084 assert.Equal("", tokens[5])
85 assert.Equal(6, len(tokens))
Akron068874c2021-08-04 15:19:56 +020086}
87
Akron3f8571a2021-08-05 11:18:10 +020088func TestReadWriteTokenizer(t *testing.T) {
Akron740f3d72021-08-03 12:12:34 +020089 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020090 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020091 dat := tok.ToDoubleArray()
Akronf2120ca2021-08-03 16:26:41 +020092 assert.True(dat.Match("bau"))
93 assert.True(dat.Match("bad"))
94 assert.True(dat.Match("wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +020095
Akron03a3c612021-08-04 11:51:27 +020096 assert.True(dat.LoadFactor() >= 70)
Akron6247a5d2021-08-03 19:18:28 +020097
Akron3f8571a2021-08-05 11:18:10 +020098 b := make([]byte, 0, 1024)
Akron6247a5d2021-08-03 19:18:28 +020099 buf := bytes.NewBuffer(b)
100 n, err := dat.WriteTo(buf)
101 assert.Nil(err)
Akron03c92fe2021-08-09 14:07:57 +0200102 assert.Equal(int64(224), n)
Akron3f8571a2021-08-05 11:18:10 +0200103
104 dat2 := ParseDatok(buf)
105 assert.NotNil(dat2)
106 assert.Equal(dat.array, dat2.array)
107 assert.Equal(dat.sigma, dat2.sigma)
108 assert.Equal(dat.epsilon, dat2.epsilon)
109 assert.Equal(dat.unknown, dat2.unknown)
110 assert.Equal(dat.identity, dat2.identity)
111 assert.Equal(dat.final, dat2.final)
112 assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
113 assert.True(dat2.Match("bau"))
114 assert.True(dat2.Match("bad"))
115 assert.True(dat2.Match("wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +0200116}
117
118func TestFullTokenizer(t *testing.T) {
Akron3a063ef2021-08-05 19:36:35 +0200119 assert := assert.New(t)
Akron2a4b9292021-08-04 15:35:22 +0200120 /*
Akron2a4b9292021-08-04 15:35:22 +0200121 tok := LoadFomaFile("testdata/tokenizer.fst")
122 dat := tok.ToDoubleArray()
Akron3a063ef2021-08-05 19:36:35 +0200123 dat.Save("testdata/tokenizer.datok")
Akron2a4b9292021-08-04 15:35:22 +0200124 */
Akron3a063ef2021-08-05 19:36:35 +0200125 dat := LoadDatokFile("testdata/tokenizer.datok")
126 assert.NotNil(dat)
127 assert.True(dat.LoadFactor() >= 70)
128 assert.Equal(dat.epsilon, 1)
129 assert.Equal(dat.unknown, 2)
130 assert.Equal(dat.identity, 3)
Akron03c92fe2021-08-09 14:07:57 +0200131 assert.Equal(dat.final, 136)
Akron3a063ef2021-08-05 19:36:35 +0200132 assert.Equal(len(dat.sigma), 131)
Akron03c92fe2021-08-09 14:07:57 +0200133 assert.Equal(len(dat.array), 3806280)
134 assert.Equal(dat.maxSize, 3806279)
Akron3a063ef2021-08-05 19:36:35 +0200135
136 assert.True(dat.Match("bau"))
137 assert.True(dat.Match("bad"))
138 assert.True(dat.Match("wald gehen"))
Akron740f3d72021-08-03 12:12:34 +0200139}
Akron3f8571a2021-08-05 11:18:10 +0200140
141func TestFullTokenizerTransduce(t *testing.T) {
Akron3610f102021-08-08 14:13:25 +0200142 assert := assert.New(t)
143
Akron03c92fe2021-08-09 14:07:57 +0200144 var dat *DaTokenizer
Akron3610f102021-08-08 14:13:25 +0200145
Akron03c92fe2021-08-09 14:07:57 +0200146 if false {
147 tok := LoadFomaFile("testdata/tokenizer.fst")
148 dat = tok.ToDoubleArray()
Akron439f4ec2021-08-09 15:45:38 +0200149 // dat.Save("testdata/tokenizer.datok")
Akron03c92fe2021-08-09 14:07:57 +0200150 } else {
151 dat = LoadDatokFile("testdata/tokenizer.datok")
152 }
Akron3610f102021-08-08 14:13:25 +0200153 assert.NotNil(dat)
154
Akronc5d8d432021-08-10 16:48:44 +0200155 r := strings.NewReader("tra. u Du?")
Akron3610f102021-08-08 14:13:25 +0200156
157 b := make([]byte, 0, 2048)
158 w := bytes.NewBuffer(b)
159 var tokens []string
160
161 assert.True(dat.Transduce(r, w))
162
163 tokens = strings.Split(w.String(), "\n")
Akron1594cb82021-08-11 11:14:56 +0200164 assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
Akron3610f102021-08-08 14:13:25 +0200165 assert.Equal("tra", tokens[0])
166 assert.Equal(".", tokens[1])
Akronb4bbb472021-08-09 11:49:38 +0200167 assert.Equal("", tokens[2])
Akronc5d8d432021-08-10 16:48:44 +0200168 assert.Equal("u", tokens[3])
Akronb4bbb472021-08-09 11:49:38 +0200169 assert.Equal("Du", tokens[4])
170 assert.Equal("?", tokens[5])
Akron3610f102021-08-08 14:13:25 +0200171 assert.Equal("", tokens[6])
Akron1594cb82021-08-11 11:14:56 +0200172 assert.Equal("", tokens[7])
173 assert.Equal(8, len(tokens))
Akron3f8571a2021-08-05 11:18:10 +0200174}
Akronb7e1f132021-08-10 11:52:31 +0200175
176func TestFullTokenizerSentenceSplitter(t *testing.T) {
177 assert := assert.New(t)
178 dat := LoadDatokFile("testdata/tokenizer.datok")
179 assert.NotNil(dat)
180
181 b := make([]byte, 0, 2048)
182 w := bytes.NewBuffer(b)
183 var sentences []string
184
185 // testSentSplitterSimple
Akron1594cb82021-08-11 11:14:56 +0200186 assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
187 sentences = strings.Split(w.String(), "\n\n")
188
189 assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
190 assert.Equal("Der\nalte\nMann\n.", sentences[0])
191 assert.Equal("", sentences[1])
192 assert.Equal(len(sentences), 2)
193
194 w.Reset()
195 assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
196 sentences = strings.Split(w.String(), "\n\n")
197 assert.Equal(len(sentences), 2)
198 assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
199 assert.Equal("", sentences[1])
200
201 w.Reset()
202 assert.True(dat.Transduce(strings.NewReader(""), w))
Akronb7e1f132021-08-10 11:52:31 +0200203 sentences = strings.Split(w.String(), "\n\n")
204 assert.Equal(len(sentences), 1)
Akron6e70dc82021-08-11 11:33:18 +0200205 assert.Equal("\n", sentences[0])
Akron1594cb82021-08-11 11:14:56 +0200206
207 w.Reset()
208 assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
209 sentences = strings.Split(w.String(), "\n\n")
210 assert.Equal(len(sentences), 2)
211
212 w.Reset()
213 assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
214 sentences = strings.Split(w.String(), "\n\n")
215 assert.Equal(len(sentences), 2)
216
Akron6e70dc82021-08-11 11:33:18 +0200217 w.Reset()
218 assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
219 sentences = strings.Split(w.String(), "\n\n")
220 assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
221 assert.Equal("", sentences[1])
222 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200223
Akron6e70dc82021-08-11 11:33:18 +0200224 w.Reset()
225 assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
226 sentences = strings.Split(w.String(), "\n\n")
227 assert.Equal("", sentences[1])
228 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200229
Akron6e70dc82021-08-11 11:33:18 +0200230 w.Reset()
231 assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
232 sentences = strings.Split(w.String(), "\n\n")
233 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200234
Akron6e70dc82021-08-11 11:33:18 +0200235 w.Reset()
236 assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
237 sentences = strings.Split(w.String(), "\n\n")
238 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200239
Akron6e70dc82021-08-11 11:33:18 +0200240 w.Reset()
241 assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
242 sentences = strings.Split(w.String(), "\n\n")
243 assert.Equal(len(sentences), 2)
244 assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
245 assert.Equal("", sentences[1])
Akron1594cb82021-08-11 11:14:56 +0200246
Akron6e70dc82021-08-11 11:33:18 +0200247 w.Reset()
248 assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
249 sentences = strings.Split(w.String(), "\n\n")
250 assert.Equal(len(sentences), 3)
251 assert.Equal("Ausschalten\n!!!", sentences[0])
252 assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
253 assert.Equal("", sentences[2])
Akron1594cb82021-08-11 11:14:56 +0200254
255 /*
256 w.Reset()
257 assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
258 sentences = strings.Split(w.String(), "\n\n")
259 assert.Equal(len(sentences), 1)
260 */
261
262 /*
263 Test:
264 "\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
265 */
Akronb7e1f132021-08-10 11:52:31 +0200266}