package datokenizer

import (
	"bytes"
	"fmt"
	"os"
	"regexp"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

// tmatch reports whether the tokenizer accepts the given string,
// discarding the transducer output.
func tmatch(dat *DaTokenizer, s string) bool {
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	return dat.Transduce(strings.NewReader(s), w)
}

// ttokenize transduces a string and returns the resulting tokens,
// splitting the output on newline runs (so sentence boundaries are
// skipped) and dropping the final empty field. On failure it returns
// an empty slice.
func ttokenize(dat *DaTokenizer, w *bytes.Buffer, str string) []string {
	w.Reset()
	ok := dat.Transduce(strings.NewReader(str), w)
	if !ok {
		return []string{}
	}
	obj := regexp.MustCompile("\n+")

	tokens := obj.Split(w.String(), -1)
	return tokens[:len(tokens)-1]
}

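// ExampleDaTokenizer_Transduce sketches the pipeline shared by the
// tests below: load a precompiled double-array tokenizer and transduce
// a string into newline-separated tokens, with an empty line marking a
// sentence boundary. This is an illustrative sketch only; lacking an
// Output comment, it is compiled but never run.
func ExampleDaTokenizer_Transduce() {
	dat := LoadDatokFile("testdata/tokenizer.datok")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	dat.Transduce(strings.NewReader("Der alte Mann."), w)
	fmt.Println(w.String())
}
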
func TestSimpleString(t *testing.T) {
	assert := assert.New(t)

	// bau | bauamt
	tok := LoadFomaFile("testdata/bauamt.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bauamt"))
	assert.False(tmatch(dat, "baum"))
}

func TestSimpleBranches(t *testing.T) {
	assert := assert.New(t)

	// (bau | wahl) (amt | en)
	tok := LoadFomaFile("testdata/wahlamt.fst")
	dat := tok.ToDoubleArray()
	assert.False(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bauamt"))
	assert.True(tmatch(dat, "wahlamt"))
	assert.True(tmatch(dat, "bauen"))
	assert.True(tmatch(dat, "wahlen"))
	assert.False(tmatch(dat, "baum"))
}

func TestSimpleTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bad"))
	assert.True(tmatch(dat, "wald gehen"))
}

func TestSimpleTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal(6, len(tokens))
}

func TestReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bad"))
	assert.True(tmatch(dat, "wald gehen"))

	assert.True(dat.LoadFactor() >= 70)

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := dat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(224), n)

	dat2 := ParseDatok(buf)
	assert.NotNil(dat2)
	assert.Equal(dat.array, dat2.array)
	assert.Equal(dat.sigma, dat2.sigma)
	assert.Equal(dat.epsilon, dat2.epsilon)
	assert.Equal(dat.unknown, dat2.unknown)
	assert.Equal(dat.identity, dat2.identity)
	assert.Equal(dat.final, dat2.final)
	assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
	assert.True(tmatch(dat2, "bau"))
	assert.True(tmatch(dat2, "bad"))
	assert.True(tmatch(dat2, "wald gehen"))
}

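// ExampleDaTokenizer_Save sketches on-disk persistence, complementing
// the in-memory round-trip above; the target path is illustrative.
// Lacking an Output comment, this example is compiled but never run.
func ExampleDaTokenizer_Save() {
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	if _, err := dat.Save("testdata/simpletok.datok"); err != nil {
		fmt.Println("Unable to save:", err)
	}
}
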
func TestFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)
	assert.True(dat.LoadFactor() >= 70)
	assert.Equal(dat.epsilon, 1)
	assert.Equal(dat.unknown, 2)
	assert.Equal(dat.identity, 3)
	assert.Equal(dat.final, 137)
	assert.Equal(len(dat.sigma), 132)
	assert.True(len(dat.array) > 3600000)
	assert.True(dat.maxSize > 3600000)
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bad"))
	assert.True(tmatch(dat, "wald gehen"))
}

// The leading X excludes this build test from regular runs,
// as it recompiles the full tokenizer from the foma file.
func XTestFullTokenizerBuild(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/tokenizer.fst")
	dat := tok.ToDoubleArray()
	n, err := dat.Save("testdata/tokenizer.datok")
	assert.Nil(err)
	assert.True(n > 500)
}

func TestFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(dat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(8, len(tokens))

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
}

func TestFullTokenizerSentenceSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 1)
	assert.Equal("\n", sentences[0])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("", sentences[2])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	/*
		Test:
		"\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	*/
}

func TestFullTokenizerTokenSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = ttokenize(dat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = ttokenize(dat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = ttokenize(dat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = ttokenize(dat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = ttokenize(dat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = ttokenize(dat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = ttokenize(dat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = ttokenize(dat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = ttokenize(dat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = ttokenize(dat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = ttokenize(dat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = ttokenize(dat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = ttokenize(dat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = ttokenize(dat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = ttokenize(dat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = ttokenize(dat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = ttokenize(dat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = ttokenize(dat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = ttokenize(dat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = ttokenize(dat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = ttokenize(dat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = ttokenize(dat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = ttokenize(dat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = ttokenize(dat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = ttokenize(dat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = ttokenize(dat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = ttokenize(dat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = ttokenize(dat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1
	tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = ttokenize(dat, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = ttokenize(dat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	tokens = ttokenize(dat, w, "&quot;Das ist von C&A!&quot;")
	assert.Equal(tokens[0], "&quot;")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "&quot;")
	assert.Equal(len(tokens), 7)

	// testTokenizerLongEnd
	tokens = ttokenize(dat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = ttokenize(dat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = ttokenize(dat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = ttokenize(dat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = ttokenize(dat, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOuputArchive
	tokens = ttokenize(dat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = ttokenize(dat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = ttokenize(dat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = ttokenize(dat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOuputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*

		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9);
		}
	*/
}

func BenchmarkTransduce(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	s := `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
	Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
	Der Termin ist am 5.9.2018.
	Ich habe die readme.txt heruntergeladen.
	Ausschalten!!! Hast Du nicht gehört???
	Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
	Er sagte: \"Es geht mir gut!\", daraufhin ging er. &quot;Das ist von C&A!&quot; Früher bzw. später ... Sie erreichte den 1. Platz!
	Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
	Mach's macht's was'n ist's haste willste kannste biste kriegste.`
	r := strings.NewReader(s)

	dat := LoadDatokFile("testdata/tokenizer.datok")

	for i := 0; i < b.N; i++ {
		w.Reset()
		r.Reset(s)
		ok := dat.Transduce(r, w)
		if !ok {
			fmt.Println("Fail!")
			fmt.Println(w.String())
			os.Exit(1)
		}
	}
}

// This benchmark is deprecated, as the datok file changes over time
func XBenchmarkLoadDatokFile(b *testing.B) {
	for i := 0; i < b.N; i++ {
		dat := LoadDatokFile("testdata/tokenizer.datok")
		if dat == nil {
			fmt.Println("Fail!")
			os.Exit(1)
		}
	}
}

func BenchmarkToDoubleArray(b *testing.B) {
	tok := LoadFomaFile("testdata/simple_bench.fst")
	for i := 0; i < b.N; i++ {
		dat := tok.ToDoubleArray()
		if dat == nil {
			fmt.Println("Fail!")
			os.Exit(1)
		}
	}
}

// 2021-08-11 (go 1.16)
// go test -bench=. -test.benchmem
//   BenchmarkTransduce-4        19069  60609 ns/op     11048 B/op  137 allocs/op
// 2021-08-12 (go 1.16)
//   BenchmarkTransduce-4        20833  55241 ns/op      9676 B/op    3 allocs/op
//   BenchmarkLoadDatokFile-4        4  258418169 ns/op  29916470 B/op  5697 allocs/op
//   BenchmarkTransduce-4        19430  58133 ns/op     18696 B/op    3 allocs/op
//   BenchmarkLoadDatokFile-4        8  139071939 ns/op  203158377 B/op  5742 allocs/op
// 2021-08-16
//   BenchmarkTransduce-4        22251  49989 ns/op     17370 B/op    3 allocs/op
//   BenchmarkLoadDatokFile-4        8  138937532 ns/op  203158327 B/op  5742 allocs/op
//   BenchmarkTransduce-4        22005  48665 ns/op     17472 B/op    3 allocs/op
//   BenchmarkLoadDatokFile-4        7  143143934 ns/op  203158450 B/op  5743 allocs/op
//   BenchmarkTransduce-4        34939  34363 ns/op     14056 B/op    3 allocs/op
//   BenchmarkLoadDatokFile-4        7  149511609 ns/op  203217193 B/op  5915 allocs/op
// 2021-08-17
//   BenchmarkTransduce-4        31204  32678 ns/op     14752 B/op    3 allocs/op
//   BenchmarkToDoubleArray-4    44138  26850 ns/op     10704 B/op   29 allocs/op