blob: e7e37d5de29ce86a71758df0019b141a40a7ec94 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3import (
Akron6247a5d2021-08-03 19:18:28 +02004 "bytes"
Akronbd406802021-08-11 18:39:13 +02005 "fmt"
6 "os"
Akron03ca4252021-08-11 13:32:53 +02007 "regexp"
Akron3f8571a2021-08-05 11:18:10 +02008 "strings"
Akron8ef408b2021-08-02 22:11:04 +02009 "testing"
10
11 "github.com/stretchr/testify/assert"
12)
13
Akronec835ad2021-08-11 18:23:22 +020014func tmatch(dat *DaTokenizer, s string) bool {
15 b := make([]byte, 0, 2048)
16 w := bytes.NewBuffer(b)
17 return dat.Transduce(strings.NewReader(s), w)
18}
19
20func ttokenize(dat *DaTokenizer, w *bytes.Buffer, str string) []string {
21 w.Reset()
22 ok := dat.Transduce(strings.NewReader(str), w)
23 if !ok {
24 return []string{}
25 }
26 obj := regexp.MustCompile("\n+")
27
28 tokens := obj.Split(w.String(), -1)
29 return tokens[:len(tokens)-1]
30}
31
Akron8ef408b2021-08-02 22:11:04 +020032func TestSimpleString(t *testing.T) {
33 assert := assert.New(t)
34
35 // bau | bauamt
Akron64ffd9a2021-08-03 19:55:21 +020036 tok := LoadFomaFile("testdata/bauamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020037 dat := tok.ToDoubleArray()
Akronec835ad2021-08-11 18:23:22 +020038 assert.True(tmatch(dat, "bau"))
39 assert.True(tmatch(dat, "bauamt"))
40 assert.False(tmatch(dat, "baum"))
Akron8ef408b2021-08-02 22:11:04 +020041}
Akron75ebe7f2021-08-03 10:34:10 +020042
43func TestSimpleBranches(t *testing.T) {
44 assert := assert.New(t)
45
46 // (bau | wahl) (amt | en)
Akron64ffd9a2021-08-03 19:55:21 +020047 tok := LoadFomaFile("testdata/wahlamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020048 dat := tok.ToDoubleArray()
Akronec835ad2021-08-11 18:23:22 +020049 assert.False(tmatch(dat, "bau"))
50 assert.True(tmatch(dat, "bauamt"))
51 assert.True(tmatch(dat, "wahlamt"))
52 assert.True(tmatch(dat, "bauen"))
53 assert.True(tmatch(dat, "wahlen"))
54 assert.False(tmatch(dat, "baum"))
Akron75ebe7f2021-08-03 10:34:10 +020055}
Akron730a79c2021-08-03 11:05:29 +020056
57func TestSimpleTokenizer(t *testing.T) {
58 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020059 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020060 dat := tok.ToDoubleArray()
Akronec835ad2021-08-11 18:23:22 +020061 assert.True(tmatch(dat, "bau"))
62 assert.True(tmatch(dat, "bad"))
63 assert.True(tmatch(dat, "wald gehen"))
Akron730a79c2021-08-03 11:05:29 +020064}
Akron740f3d72021-08-03 12:12:34 +020065
Akron068874c2021-08-04 15:19:56 +020066func TestSimpleTokenizerTransduce(t *testing.T) {
Akron84d68e62021-08-04 17:06:52 +020067 assert := assert.New(t)
68 tok := LoadFomaFile("testdata/simpletok.fst")
Akron84d68e62021-08-04 17:06:52 +020069 dat := tok.ToDoubleArray()
Akron3f8571a2021-08-05 11:18:10 +020070
71 r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
72 b := make([]byte, 0, 2048)
73 w := bytes.NewBuffer(b)
Akron524c5432021-08-05 14:14:27 +020074 var tokens []string
Akron524c5432021-08-05 14:14:27 +020075 dat.Transduce(r, w)
76 tokens = strings.Split(w.String(), "\n")
Akron3f8571a2021-08-05 11:18:10 +020077 assert.Equal("wald", tokens[0])
78 assert.Equal("gehen", tokens[1])
79 assert.Equal("Da", tokens[2])
80 assert.Equal("kann", tokens[3])
81 assert.Equal("man", tokens[4])
82 assert.Equal("was", tokens[5])
83 assert.Equal("\"erleben\"", tokens[6])
84
Akron524c5432021-08-05 14:14:27 +020085 r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
86 w.Reset()
87 dat.Transduce(r, w)
88 tokens = strings.Split(w.String(), "\n")
89 assert.Equal("In", tokens[0])
90 assert.Equal("den", tokens[1])
91 assert.Equal("Wald", tokens[2])
92 assert.Equal("gehen", tokens[3])
93 assert.Equal("?", tokens[4])
94 assert.Equal("--", tokens[5])
Akron3f8571a2021-08-05 11:18:10 +020095
Akron524c5432021-08-05 14:14:27 +020096 r = strings.NewReader(" g? -- D")
97 w.Reset()
98 dat.Transduce(r, w)
99 tokens = strings.Split(w.String(), "\n")
100 assert.Equal("g", tokens[0])
101 assert.Equal("?", tokens[1])
102 assert.Equal("--", tokens[2])
103 assert.Equal("D", tokens[3])
104 assert.Equal("", tokens[4])
Akron6e70dc82021-08-11 11:33:18 +0200105 assert.Equal("", tokens[5])
106 assert.Equal(6, len(tokens))
Akron068874c2021-08-04 15:19:56 +0200107}
108
Akron3f8571a2021-08-05 11:18:10 +0200109func TestReadWriteTokenizer(t *testing.T) {
Akron740f3d72021-08-03 12:12:34 +0200110 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +0200111 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +0200112 dat := tok.ToDoubleArray()
Akronec835ad2021-08-11 18:23:22 +0200113 assert.True(tmatch(dat, "bau"))
114 assert.True(tmatch(dat, "bad"))
115 assert.True(tmatch(dat, "wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +0200116
Akron03a3c612021-08-04 11:51:27 +0200117 assert.True(dat.LoadFactor() >= 70)
Akron6247a5d2021-08-03 19:18:28 +0200118
Akron3f8571a2021-08-05 11:18:10 +0200119 b := make([]byte, 0, 1024)
Akron6247a5d2021-08-03 19:18:28 +0200120 buf := bytes.NewBuffer(b)
121 n, err := dat.WriteTo(buf)
122 assert.Nil(err)
Akron03c92fe2021-08-09 14:07:57 +0200123 assert.Equal(int64(224), n)
Akron3f8571a2021-08-05 11:18:10 +0200124
125 dat2 := ParseDatok(buf)
126 assert.NotNil(dat2)
127 assert.Equal(dat.array, dat2.array)
128 assert.Equal(dat.sigma, dat2.sigma)
129 assert.Equal(dat.epsilon, dat2.epsilon)
130 assert.Equal(dat.unknown, dat2.unknown)
131 assert.Equal(dat.identity, dat2.identity)
132 assert.Equal(dat.final, dat2.final)
133 assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
Akronec835ad2021-08-11 18:23:22 +0200134 assert.True(tmatch(dat2, "bau"))
135 assert.True(tmatch(dat2, "bad"))
136 assert.True(tmatch(dat2, "wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +0200137}
138
139func TestFullTokenizer(t *testing.T) {
Akron3a063ef2021-08-05 19:36:35 +0200140 assert := assert.New(t)
Akron3a063ef2021-08-05 19:36:35 +0200141 dat := LoadDatokFile("testdata/tokenizer.datok")
142 assert.NotNil(dat)
143 assert.True(dat.LoadFactor() >= 70)
144 assert.Equal(dat.epsilon, 1)
145 assert.Equal(dat.unknown, 2)
146 assert.Equal(dat.identity, 3)
Akronec835ad2021-08-11 18:23:22 +0200147 assert.Equal(dat.final, 137)
148 assert.Equal(len(dat.sigma), 132)
Akron4af79f12021-08-11 14:48:17 +0200149 assert.True(len(dat.array) > 3800000)
150 assert.True(dat.maxSize > 3800000)
Akron3a063ef2021-08-05 19:36:35 +0200151
Akronec835ad2021-08-11 18:23:22 +0200152 assert.True(tmatch(dat, "bau"))
153 assert.True(tmatch(dat, "bad"))
154 assert.True(tmatch(dat, "wald gehen"))
Akron740f3d72021-08-03 12:12:34 +0200155}
Akron3f8571a2021-08-05 11:18:10 +0200156
Akrona0bded52021-08-11 15:48:02 +0200157func XTestFullTokenizerBuild(t *testing.T) {
158 assert := assert.New(t)
159 tok := LoadFomaFile("testdata/tokenizer.fst")
160 dat := tok.ToDoubleArray()
161 n, err := dat.Save("testdata/tokenizer.datok")
162 assert.Nil(err)
163 assert.True(n > 500)
164}
165
Akron3f8571a2021-08-05 11:18:10 +0200166func TestFullTokenizerTransduce(t *testing.T) {
Akron3610f102021-08-08 14:13:25 +0200167 assert := assert.New(t)
168
Akrona0bded52021-08-11 15:48:02 +0200169 dat := LoadDatokFile("testdata/tokenizer.datok")
Akron3610f102021-08-08 14:13:25 +0200170 assert.NotNil(dat)
171
Akron3610f102021-08-08 14:13:25 +0200172 b := make([]byte, 0, 2048)
173 w := bytes.NewBuffer(b)
174 var tokens []string
175
Akron03ca4252021-08-11 13:32:53 +0200176 assert.True(dat.Transduce(strings.NewReader("tra. u Du?"), w))
Akron3610f102021-08-08 14:13:25 +0200177
178 tokens = strings.Split(w.String(), "\n")
Akron1594cb82021-08-11 11:14:56 +0200179 assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
Akron3610f102021-08-08 14:13:25 +0200180 assert.Equal("tra", tokens[0])
181 assert.Equal(".", tokens[1])
Akronb4bbb472021-08-09 11:49:38 +0200182 assert.Equal("", tokens[2])
Akronc5d8d432021-08-10 16:48:44 +0200183 assert.Equal("u", tokens[3])
Akronb4bbb472021-08-09 11:49:38 +0200184 assert.Equal("Du", tokens[4])
185 assert.Equal("?", tokens[5])
Akron3610f102021-08-08 14:13:25 +0200186 assert.Equal("", tokens[6])
Akron1594cb82021-08-11 11:14:56 +0200187 assert.Equal("", tokens[7])
188 assert.Equal(8, len(tokens))
Akron03ca4252021-08-11 13:32:53 +0200189
190 w.Reset()
191 assert.True(dat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
192 assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
Akron3f8571a2021-08-05 11:18:10 +0200193}
Akronb7e1f132021-08-10 11:52:31 +0200194
195func TestFullTokenizerSentenceSplitter(t *testing.T) {
196 assert := assert.New(t)
197 dat := LoadDatokFile("testdata/tokenizer.datok")
198 assert.NotNil(dat)
199
200 b := make([]byte, 0, 2048)
201 w := bytes.NewBuffer(b)
202 var sentences []string
203
204 // testSentSplitterSimple
Akron1594cb82021-08-11 11:14:56 +0200205 assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
206 sentences = strings.Split(w.String(), "\n\n")
207
208 assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
209 assert.Equal("Der\nalte\nMann\n.", sentences[0])
210 assert.Equal("", sentences[1])
211 assert.Equal(len(sentences), 2)
212
213 w.Reset()
214 assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
215 sentences = strings.Split(w.String(), "\n\n")
216 assert.Equal(len(sentences), 2)
217 assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
218 assert.Equal("", sentences[1])
219
220 w.Reset()
221 assert.True(dat.Transduce(strings.NewReader(""), w))
Akronb7e1f132021-08-10 11:52:31 +0200222 sentences = strings.Split(w.String(), "\n\n")
223 assert.Equal(len(sentences), 1)
Akron6e70dc82021-08-11 11:33:18 +0200224 assert.Equal("\n", sentences[0])
Akron1594cb82021-08-11 11:14:56 +0200225
226 w.Reset()
227 assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
228 sentences = strings.Split(w.String(), "\n\n")
229 assert.Equal(len(sentences), 2)
230
231 w.Reset()
232 assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
233 sentences = strings.Split(w.String(), "\n\n")
234 assert.Equal(len(sentences), 2)
235
Akron6e70dc82021-08-11 11:33:18 +0200236 w.Reset()
237 assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
238 sentences = strings.Split(w.String(), "\n\n")
239 assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
240 assert.Equal("", sentences[1])
241 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200242
Akron6e70dc82021-08-11 11:33:18 +0200243 w.Reset()
244 assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
245 sentences = strings.Split(w.String(), "\n\n")
246 assert.Equal("", sentences[1])
247 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200248
Akron6e70dc82021-08-11 11:33:18 +0200249 w.Reset()
250 assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
251 sentences = strings.Split(w.String(), "\n\n")
252 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200253
Akron6e70dc82021-08-11 11:33:18 +0200254 w.Reset()
255 assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
256 sentences = strings.Split(w.String(), "\n\n")
257 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200258
Akron6e70dc82021-08-11 11:33:18 +0200259 w.Reset()
260 assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
261 sentences = strings.Split(w.String(), "\n\n")
262 assert.Equal(len(sentences), 2)
263 assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
264 assert.Equal("", sentences[1])
Akron1594cb82021-08-11 11:14:56 +0200265
Akron6e70dc82021-08-11 11:33:18 +0200266 w.Reset()
267 assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
268 sentences = strings.Split(w.String(), "\n\n")
269 assert.Equal(len(sentences), 3)
270 assert.Equal("Ausschalten\n!!!", sentences[0])
271 assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
272 assert.Equal("", sentences[2])
Akron1594cb82021-08-11 11:14:56 +0200273
Akron4af79f12021-08-11 14:48:17 +0200274 w.Reset()
275 assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
276 sentences = strings.Split(w.String(), "\n\n")
277 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200278
279 /*
280 Test:
281 "\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
282 */
Akronb7e1f132021-08-10 11:52:31 +0200283}
Akron03ca4252021-08-11 13:32:53 +0200284
Akron03ca4252021-08-11 13:32:53 +0200285func TestFullTokenizerTokenSplitter(t *testing.T) {
286 assert := assert.New(t)
287 dat := LoadDatokFile("testdata/tokenizer.datok")
288 assert.NotNil(dat)
289
290 b := make([]byte, 0, 2048)
291 w := bytes.NewBuffer(b)
292 var tokens []string
293
294 // testTokenizerSimple
Akronec835ad2021-08-11 18:23:22 +0200295 tokens = ttokenize(dat, w, "Der alte Mann")
Akron03ca4252021-08-11 13:32:53 +0200296 assert.Equal(tokens[0], "Der")
297 assert.Equal(tokens[1], "alte")
298 assert.Equal(tokens[2], "Mann")
299 assert.Equal(len(tokens), 3)
300
Akronec835ad2021-08-11 18:23:22 +0200301 tokens = ttokenize(dat, w, "Der alte Mann.")
Akron03ca4252021-08-11 13:32:53 +0200302 assert.Equal(tokens[0], "Der")
303 assert.Equal(tokens[1], "alte")
304 assert.Equal(tokens[2], "Mann")
305 assert.Equal(tokens[3], ".")
306 assert.Equal(len(tokens), 4)
307
308 // testTokenizerAbbr
Akronec835ad2021-08-11 18:23:22 +0200309 tokens = ttokenize(dat, w, "Der Vorsitzende der F.D.P. hat gewählt")
Akron03ca4252021-08-11 13:32:53 +0200310 assert.Equal(tokens[0], "Der")
311 assert.Equal(tokens[1], "Vorsitzende")
312 assert.Equal(tokens[2], "der")
313 assert.Equal(tokens[3], "F.D.P.")
314 assert.Equal(tokens[4], "hat")
315 assert.Equal(tokens[5], "gewählt")
316 assert.Equal(len(tokens), 6)
317 // Ignored in KorAP-Tokenizer
318
319 // testTokenizerHost1
Akronec835ad2021-08-11 18:23:22 +0200320 tokens = ttokenize(dat, w, "Gefunden auf wikipedia.org")
Akron03ca4252021-08-11 13:32:53 +0200321 assert.Equal(tokens[0], "Gefunden")
322 assert.Equal(tokens[1], "auf")
323 assert.Equal(tokens[2], "wikipedia.org")
324 assert.Equal(len(tokens), 3)
325
326 // testTokenizerWwwHost
Akronec835ad2021-08-11 18:23:22 +0200327 tokens = ttokenize(dat, w, "Gefunden auf www.wikipedia.org")
Akron03ca4252021-08-11 13:32:53 +0200328 assert.Equal("Gefunden", tokens[0])
329 assert.Equal("auf", tokens[1])
330 assert.Equal("www.wikipedia.org", tokens[2])
331 assert.Equal(3, len(tokens))
332
333 // testTokenizerWwwUrl
Akronec835ad2021-08-11 18:23:22 +0200334 tokens = ttokenize(dat, w, "Weitere Infos unter www.info.biz/info")
Akron03ca4252021-08-11 13:32:53 +0200335 assert.Equal("www.info.biz/info", tokens[3])
336
337 // testTokenizerFtpHost
338 /*
339 tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
340 assert.Equal("Kann", tokens[0])
341 assert.Equal("von", tokens[1])
342 assert.Equal("ftp.download.org", tokens[2])
343 assert.Equal(5, len(tokens))
344 // Ignored in KorAP-Tokenizer
345 */
346
347 // testTokenizerDash
Akronec835ad2021-08-11 18:23:22 +0200348 tokens = ttokenize(dat, w, "Das war -- spitze")
Akron03ca4252021-08-11 13:32:53 +0200349 assert.Equal(tokens[0], "Das")
350 assert.Equal(tokens[1], "war")
351 assert.Equal(tokens[2], "--")
352 assert.Equal(tokens[3], "spitze")
353 assert.Equal(len(tokens), 4)
354
355 // testTokenizerEmail1
Akronec835ad2021-08-11 18:23:22 +0200356 tokens = ttokenize(dat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
Akron03ca4252021-08-11 13:32:53 +0200357 assert.Equal(tokens[0], "Ich")
358 assert.Equal(tokens[1], "bin")
359 assert.Equal(tokens[2], "unter")
360 assert.Equal(tokens[3], "korap@ids-mannheim.de")
361 assert.Equal(tokens[4], "erreichbar")
362 assert.Equal(tokens[5], ".")
363 assert.Equal(len(tokens), 6)
364
365 // testTokenizerEmail2
Akronec835ad2021-08-11 18:23:22 +0200366 tokens = ttokenize(dat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
Akron03ca4252021-08-11 13:32:53 +0200367 assert.Equal(tokens[0], "Oder")
368 assert.Equal(tokens[1], "unter")
369 assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
370 assert.Equal(tokens[3], ".")
371 assert.Equal(len(tokens), 4)
372
373 // testTokenizerEmail3
Akronec835ad2021-08-11 18:23:22 +0200374 tokens = ttokenize(dat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
Akron03ca4252021-08-11 13:32:53 +0200375 assert.Equal(tokens[0], "Oder")
376 assert.Equal(tokens[1], "unter")
377 assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
378 assert.Equal(tokens[3], ".")
379 assert.Equal(len(tokens), 4)
380 // Ignored in KorAP-Tokenizer
381
382 // testTokenizerDoNotAcceptQuotedEmailNames
Akronec835ad2021-08-11 18:23:22 +0200383 tokens = ttokenize(dat, w, "\"John Doe\"@xx.com")
Akron03ca4252021-08-11 13:32:53 +0200384 assert.Equal("\"", tokens[0])
385 assert.Equal("John", tokens[1])
386 assert.Equal("Doe", tokens[2])
387 assert.Equal("\"", tokens[3])
388 assert.Equal("@xx", tokens[4])
389 assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
390 assert.Equal("com", tokens[6])
391 assert.Equal(7, len(tokens))
392
393 // testTokenizerTwitter
Akronec835ad2021-08-11 18:23:22 +0200394 tokens = ttokenize(dat, w, "Folgt @korap und #korap")
Akron03ca4252021-08-11 13:32:53 +0200395 assert.Equal(tokens[0], "Folgt")
396 assert.Equal(tokens[1], "@korap")
397 assert.Equal(tokens[2], "und")
398 assert.Equal(tokens[3], "#korap")
399 assert.Equal(len(tokens), 4)
400
401 // testTokenizerWeb1
Akronec835ad2021-08-11 18:23:22 +0200402 tokens = ttokenize(dat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
Akron03ca4252021-08-11 13:32:53 +0200403 assert.Equal(tokens[0], "Unsere")
404 assert.Equal(tokens[1], "Website")
405 assert.Equal(tokens[2], "ist")
406 assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
407 assert.Equal(len(tokens), 4)
408
409 // testTokenizerWeb2
Akronec835ad2021-08-11 18:23:22 +0200410 tokens = ttokenize(dat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
Akron03ca4252021-08-11 13:32:53 +0200411 assert.Equal(tokens[0], "Wir")
412 assert.Equal(tokens[1], "sind")
413 assert.Equal(tokens[2], "auch")
414 assert.Equal(tokens[3], "im")
415 assert.Equal(tokens[4], "Internet")
416 assert.Equal(tokens[5], "(")
417 assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
418 assert.Equal(tokens[7], ")")
419 assert.Equal(len(tokens), 8)
420 // Ignored in KorAP-Tokenizer
421
422 // testTokenizerWeb3
Akronec835ad2021-08-11 18:23:22 +0200423 tokens = ttokenize(dat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
Akron03ca4252021-08-11 13:32:53 +0200424 assert.Equal(tokens[0], "Die")
425 assert.Equal(tokens[1], "Adresse")
426 assert.Equal(tokens[2], "ist")
427 assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
428 assert.Equal(tokens[4], ".")
429 assert.Equal(len(tokens), 5)
430 // Ignored in KorAP-Tokenizer
431
432 // testTokenizerServer
Akronec835ad2021-08-11 18:23:22 +0200433 tokens = ttokenize(dat, w, "Unser Server ist 10.0.10.51.")
Akron03ca4252021-08-11 13:32:53 +0200434 assert.Equal(tokens[0], "Unser")
435 assert.Equal(tokens[1], "Server")
436 assert.Equal(tokens[2], "ist")
437 assert.Equal(tokens[3], "10.0.10.51")
438 assert.Equal(tokens[4], ".")
439 assert.Equal(len(tokens), 5)
440
441 // testTokenizerNum
Akronec835ad2021-08-11 18:23:22 +0200442 tokens = ttokenize(dat, w, "Zu 50,4% ist es sicher")
Akron03ca4252021-08-11 13:32:53 +0200443 assert.Equal(tokens[0], "Zu")
444 assert.Equal(tokens[1], "50,4%")
445 assert.Equal(tokens[2], "ist")
446 assert.Equal(tokens[3], "es")
447 assert.Equal(tokens[4], "sicher")
448 assert.Equal(len(tokens), 5)
449 // Differs from KorAP-Tokenizer
450
451 // testTokenizerDate
Akronec835ad2021-08-11 18:23:22 +0200452 tokens = ttokenize(dat, w, "Der Termin ist am 5.9.2018")
Akron03ca4252021-08-11 13:32:53 +0200453 assert.Equal(tokens[0], "Der")
454 assert.Equal(tokens[1], "Termin")
455 assert.Equal(tokens[2], "ist")
456 assert.Equal(tokens[3], "am")
457 assert.Equal(tokens[4], "5.9.2018")
458 assert.Equal(len(tokens), 5)
459
Akronec835ad2021-08-11 18:23:22 +0200460 tokens = ttokenize(dat, w, "Der Termin ist am 5/9/2018")
Akron03ca4252021-08-11 13:32:53 +0200461 assert.Equal(tokens[0], "Der")
462 assert.Equal(tokens[1], "Termin")
463 assert.Equal(tokens[2], "ist")
464 assert.Equal(tokens[3], "am")
465 assert.Equal(tokens[4], "5/9/2018")
466 assert.Equal(len(tokens), 5)
467
468 // testTokenizerDateRange
469 /*
470 tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
471 assert.Equal(tokens[0], "Der")
472 assert.Equal(tokens[1], "Termin")
473 assert.Equal(tokens[2], "war")
474 assert.Equal(tokens[3], "vom")
475 assert.Equal(tokens[4], "4.")
476 assert.Equal(tokens[5], "-")
477 assert.Equal(tokens[6], "5.9.2018")
478 assert.Equal(len(tokens), 7)
479 // Ignored in KorAP-Tokenizer
480 */
481
482 // testTokenizerEmoji1
Akronec835ad2021-08-11 18:23:22 +0200483 tokens = ttokenize(dat, w, "Das ist toll! ;)")
Akron03ca4252021-08-11 13:32:53 +0200484 assert.Equal(tokens[0], "Das")
485 assert.Equal(tokens[1], "ist")
486 assert.Equal(tokens[2], "toll")
487 assert.Equal(tokens[3], "!")
488 assert.Equal(tokens[4], ";)")
489 assert.Equal(len(tokens), 5)
490
491 // testTokenizerRef1
Akronec835ad2021-08-11 18:23:22 +0200492 tokens = ttokenize(dat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
Akron03ca4252021-08-11 13:32:53 +0200493 assert.Equal(tokens[0], "Kupietz")
494 assert.Equal(tokens[1], "und")
495 assert.Equal(tokens[2], "Schmidt")
496 assert.Equal(tokens[3], "(2018)")
497 assert.Equal(tokens[4], ":")
498 assert.Equal(tokens[5], "Korpuslinguistik")
499 assert.Equal(len(tokens), 6)
500 // Differs from KorAP-Tokenizer!
501
502 // testTokenizerRef2 () {
Akronec835ad2021-08-11 18:23:22 +0200503 tokens = ttokenize(dat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
Akron03ca4252021-08-11 13:32:53 +0200504 assert.Equal(tokens[0], "Kupietz")
505 assert.Equal(tokens[1], "und")
506 assert.Equal(tokens[2], "Schmidt")
507 assert.Equal(tokens[3], "[2018]")
508 assert.Equal(tokens[4], ":")
509 assert.Equal(tokens[5], "Korpuslinguistik")
510 assert.Equal(len(tokens), 6)
511 // Differs from KorAP-Tokenizer!
512
513 // testTokenizerOmission1 () {
Akronec835ad2021-08-11 18:23:22 +0200514 tokens = ttokenize(dat, w, "Er ist ein A****loch!")
Akron03ca4252021-08-11 13:32:53 +0200515 assert.Equal(tokens[0], "Er")
516 assert.Equal(tokens[1], "ist")
517 assert.Equal(tokens[2], "ein")
518 assert.Equal(tokens[3], "A****loch")
519 assert.Equal(tokens[4], "!")
520 assert.Equal(len(tokens), 5)
521
522 // testTokenizerOmission2
Akronec835ad2021-08-11 18:23:22 +0200523 tokens = ttokenize(dat, w, "F*ck!")
Akron03ca4252021-08-11 13:32:53 +0200524 assert.Equal(tokens[0], "F*ck")
525 assert.Equal(tokens[1], "!")
526 assert.Equal(len(tokens), 2)
527
528 // testTokenizerOmission3 () {
Akronec835ad2021-08-11 18:23:22 +0200529 tokens = ttokenize(dat, w, "Dieses verf***** Kleid!")
Akron03ca4252021-08-11 13:32:53 +0200530 assert.Equal(tokens[0], "Dieses")
531 assert.Equal(tokens[1], "verf*****")
532 assert.Equal(tokens[2], "Kleid")
533 assert.Equal(tokens[3], "!")
534 assert.Equal(len(tokens), 4)
535
536 // Probably interpreted as HOST
537 // testTokenizerFileExtension1
Akronec835ad2021-08-11 18:23:22 +0200538 tokens = ttokenize(dat, w, "Ich habe die readme.txt heruntergeladen")
Akron03ca4252021-08-11 13:32:53 +0200539 assert.Equal(tokens[0], "Ich")
540 assert.Equal(tokens[1], "habe")
541 assert.Equal(tokens[2], "die")
542 assert.Equal(tokens[3], "readme.txt")
543 assert.Equal(tokens[4], "heruntergeladen")
544 assert.Equal(len(tokens), 5)
545
546 // Probably interpreted as HOST
547 // testTokenizerFileExtension2
Akronec835ad2021-08-11 18:23:22 +0200548 tokens = ttokenize(dat, w, "Nimm die README.TXT!")
Akron03ca4252021-08-11 13:32:53 +0200549 assert.Equal(tokens[0], "Nimm")
550 assert.Equal(tokens[1], "die")
551 assert.Equal(tokens[2], "README.TXT")
552 assert.Equal(tokens[3], "!")
553 assert.Equal(len(tokens), 4)
554
555 // Probably interpreted as HOST
556 // testTokenizerFileExtension3
Akronec835ad2021-08-11 18:23:22 +0200557 tokens = ttokenize(dat, w, "Zeig mir profile.jpeg")
Akron03ca4252021-08-11 13:32:53 +0200558 assert.Equal(tokens[0], "Zeig")
559 assert.Equal(tokens[1], "mir")
560 assert.Equal(tokens[2], "profile.jpeg")
561 assert.Equal(len(tokens), 3)
562
563 // testTokenizerFile1
Akron03ca4252021-08-11 13:32:53 +0200564
Akronec835ad2021-08-11 18:23:22 +0200565 tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.docx")
Akrone8837b52021-08-11 17:29:58 +0200566 assert.Equal(tokens[0], "Zeig")
567 assert.Equal(tokens[1], "mir")
568 assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
569 assert.Equal(len(tokens), 3)
Akron03ca4252021-08-11 13:32:53 +0200570
Akrone8837b52021-08-11 17:29:58 +0200571 // testTokenizerFile2
Akronec835ad2021-08-11 18:23:22 +0200572 tokens = ttokenize(dat, w, "Gehe zu /Dokumente/profile.docx")
Akrone8837b52021-08-11 17:29:58 +0200573 assert.Equal(tokens[0], "Gehe")
574 assert.Equal(tokens[1], "zu")
575 assert.Equal(tokens[2], "/Dokumente/profile.docx")
576 assert.Equal(len(tokens), 3)
Akron03ca4252021-08-11 13:32:53 +0200577
Akrone8837b52021-08-11 17:29:58 +0200578 // testTokenizerFile3
Akronec835ad2021-08-11 18:23:22 +0200579 tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
Akrone8837b52021-08-11 17:29:58 +0200580 assert.Equal(tokens[0], "Zeig")
581 assert.Equal(tokens[1], "mir")
582 assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
583 assert.Equal(len(tokens), 3)
584 // Ignored in KorAP-Tokenizer
Akron03ca4252021-08-11 13:32:53 +0200585
Akronfd92d7e2021-08-11 16:31:43 +0200586 // testTokenizerPunct
Akronec835ad2021-08-11 18:23:22 +0200587 tokens = ttokenize(dat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
Akronfd92d7e2021-08-11 16:31:43 +0200588 assert.Equal(tokens[0], "Er")
589 assert.Equal(tokens[1], "sagte")
590 assert.Equal(tokens[2], ":")
591 assert.Equal(tokens[3], "\"")
592 assert.Equal(tokens[4], "Es")
593 assert.Equal(tokens[5], "geht")
594 assert.Equal(tokens[6], "mir")
595 assert.Equal(tokens[7], "gut")
596 assert.Equal(tokens[8], "!")
597 assert.Equal(tokens[9], "\"")
598 assert.Equal(tokens[10], ",")
599 assert.Equal(tokens[11], "daraufhin")
600 assert.Equal(tokens[12], "ging")
601 assert.Equal(tokens[13], "er")
602 assert.Equal(tokens[14], ".")
603 assert.Equal(len(tokens), 15)
Akron03ca4252021-08-11 13:32:53 +0200604
605 // testTokenizerPlusAmpersand
Akronec835ad2021-08-11 18:23:22 +0200606 tokens = ttokenize(dat, w, ""Das ist von C&A!"")
607 assert.Equal(tokens[0], """)
608 assert.Equal(tokens[1], "Das")
609 assert.Equal(tokens[2], "ist")
610 assert.Equal(tokens[3], "von")
611 assert.Equal(tokens[4], "C&A")
612 assert.Equal(tokens[5], "!")
613 assert.Equal(tokens[6], """)
614 assert.Equal(len(tokens), 7)
Akron03ca4252021-08-11 13:32:53 +0200615
616 // testTokenizerLongEnd
Akronec835ad2021-08-11 18:23:22 +0200617 tokens = ttokenize(dat, w, "Siehst Du?!!?")
Akron03ca4252021-08-11 13:32:53 +0200618 assert.Equal(tokens[0], "Siehst")
619 assert.Equal(tokens[1], "Du")
620 assert.Equal(tokens[2], "?!!?")
621 assert.Equal(len(tokens), 3)
622
623 // testTokenizerIrishO
Akronec835ad2021-08-11 18:23:22 +0200624 tokens = ttokenize(dat, w, "Peter O'Toole")
Akron03ca4252021-08-11 13:32:53 +0200625 assert.Equal(tokens[0], "Peter")
626 assert.Equal(tokens[1], "O'Toole")
627 assert.Equal(len(tokens), 2)
628
629 // testTokenizerAbr
Akronec835ad2021-08-11 18:23:22 +0200630 tokens = ttokenize(dat, w, "Früher bzw. später ...")
Akron03ca4252021-08-11 13:32:53 +0200631 assert.Equal(tokens[0], "Früher")
632 assert.Equal(tokens[1], "bzw.")
633 assert.Equal(tokens[2], "später")
634 assert.Equal(tokens[3], "...")
635 assert.Equal(len(tokens), 4)
636
637 // testTokenizerUppercaseRule
Akronec835ad2021-08-11 18:23:22 +0200638 tokens = ttokenize(dat, w, "Es war spät.Morgen ist es früh.")
Akron03ca4252021-08-11 13:32:53 +0200639 assert.Equal(tokens[0], "Es")
640 assert.Equal(tokens[1], "war")
641 assert.Equal(tokens[2], "spät")
642 assert.Equal(tokens[3], ".")
643 assert.Equal(tokens[4], "Morgen")
644 assert.Equal(tokens[5], "ist")
645 assert.Equal(tokens[6], "es")
646 assert.Equal(tokens[7], "früh")
647 assert.Equal(tokens[8], ".")
648 assert.Equal(len(tokens), 9)
649 // Ignored in KorAP-Tokenizer
650
651 // testTokenizerOrd
Akronec835ad2021-08-11 18:23:22 +0200652 tokens = ttokenize(dat, w, "Sie erreichte den 1. Platz!")
Akrona0bded52021-08-11 15:48:02 +0200653 assert.Equal(tokens[0], "Sie")
654 assert.Equal(tokens[1], "erreichte")
655 assert.Equal(tokens[2], "den")
656 assert.Equal(tokens[3], "1.")
657 assert.Equal(tokens[4], "Platz")
658 assert.Equal(tokens[5], "!")
659 assert.Equal(len(tokens), 6)
Akron03ca4252021-08-11 13:32:53 +0200660
661 // testNoZipOuputArchive
Akronec835ad2021-08-11 18:23:22 +0200662 tokens = ttokenize(dat, w, "Archive: Ich bin kein zip\n")
Akron03ca4252021-08-11 13:32:53 +0200663 assert.Equal(tokens[0], "Archive")
664 assert.Equal(tokens[1], ":")
665 assert.Equal(tokens[2], "Ich")
666 assert.Equal(tokens[3], "bin")
667 assert.Equal(tokens[4], "kein")
668 assert.Equal(tokens[5], "zip")
669 assert.Equal(6, len(tokens))
670
671 // testTokenizerStrasse
Akronec835ad2021-08-11 18:23:22 +0200672 tokens = ttokenize(dat, w, "Ich wohne in der Weststr. und Du?")
Akron4af79f12021-08-11 14:48:17 +0200673 assert.Equal(tokens[4], "Weststr.")
674 assert.Equal(8, len(tokens))
Akron03ca4252021-08-11 13:32:53 +0200675
676 // germanTokenizerKnowsGermanOmissionWords
Akronec835ad2021-08-11 18:23:22 +0200677 tokens = ttokenize(dat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
Akron03ca4252021-08-11 13:32:53 +0200678 assert.Equal("D'dorf", tokens[0])
679 assert.Equal("Ku'damm", tokens[1])
680 assert.Equal("Lu'hafen", tokens[2])
681 assert.Equal("M'gladbach", tokens[3])
682 assert.Equal("W'schaft", tokens[4])
683 assert.Equal(5, len(tokens))
684
685 // germanTokenizerDoesNOTSeparateGermanContractions
Akronec835ad2021-08-11 18:23:22 +0200686 tokens = ttokenize(dat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
Akron03ca4252021-08-11 13:32:53 +0200687 assert.Equal("mach's", tokens[0])
688 assert.Equal("macht's", tokens[1])
689 assert.Equal("was'n", tokens[2])
690 assert.Equal("ist's", tokens[3])
691 assert.Equal("haste", tokens[4])
692 assert.Equal("willste", tokens[5])
693 assert.Equal("kannste", tokens[6])
694 assert.Equal("biste", tokens[7])
695 assert.Equal("kriegste", tokens[8])
696 assert.Equal(9, len(tokens))
697
698 /*
699 @Test
700 public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
701 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
702 tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
703 assert.Equal("'ve", tokens[1]);
704 assert.Equal("'ll", tokens[3]);
705 assert.Equal("'d", tokens[5]);
706 assert.Equal("'m", tokens[7]);
707 assert.Equal("'re", tokens[9]);
708 assert.Equal("'s", tokens[11]);
709 assert.Equal("is", tokens[12]);
710 assert.Equal("n't", tokens[13]);
711 assert.Equal(14, len(tokens));
712 }
713
714 @Test
715 public void frenchTokenizerKnowsFrenchAbbreviations () {
716 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
717 tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
718 assert.Equal("Approx.", tokens[0]);
719 assert.Equal("juill.", tokens[2]);
720 assert.Equal("prof.", tokens[5]);
721 assert.Equal("exerc.", tokens[15]);
722 assert.Equal("no.", tokens[16]);
723 assert.Equal("pp.", tokens[21]);
724 }
725
726 @Test
727 public void frenchTokenizerKnowsFrenchContractions () {
728 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
729 tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
730 assert.Equal("J'", tokens[0]);
731 assert.Equal("j'", tokens[2]);
732 assert.Equal("qu'", tokens[4]);
733 assert.Equal("d'", tokens[6]);
734 assert.Equal("jusqu'", tokens[8]);
735 assert.Equal("Aujourd'hui", tokens[10]);
736 assert.Equal("D'", tokens[11]); // ’
737 assert.Equal("Quelqu'un", tokens[13]); // ’
738 assert.Equal("Presqu'île", tokens[14]); // ’
739 }
740
741 @Test
742 public void frenchTokenizerKnowsFrenchClitics () {
743 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
744 tokens = tokenize(dat, w, "suis-je sont-elles ")
745 assert.Equal("suis", tokens[0]);
746 assert.Equal("-je", tokens[1]);
747 assert.Equal("sont", tokens[2]);
748 assert.Equal("-elles", tokens[3]);
749 }
750
751 @Test
752 public void testEnglishTokenizerScienceAbbreviations () {
753 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
754 tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
755 assert.Equal("Approx.", tokens[0]);
756 assert.Equal("in", tokens[1]);
757 assert.Equal("Sept.", tokens[2]);
758 assert.Equal("1954", tokens[3]);
759 assert.Equal(",", tokens[4]);
760 assert.Equal("Assoc.", tokens[5]);
761 assert.Equal("Prof.", tokens[6]);
762 assert.Equal("Dr.", tokens[7]);
763 assert.Equal("R.", tokens[8]);
764 assert.Equal("J.", tokens[9]);
765 assert.Equal("Ewing", tokens[10]);
766 assert.Equal("reviewed", tokens[11]);
767 assert.Equal("articles", tokens[12]);
768 assert.Equal("on", tokens[13]);
769 assert.Equal("Enzymol.", tokens[14]);
770 assert.Equal("Bacteriol.", tokens[15]);
771 assert.Equal("effects", tokens[16]);
772 assert.Equal("later", tokens[17]);
773 assert.Equal("published", tokens[18]);
774 assert.Equal("in", tokens[19]);
775 assert.Equal("Nutr.", tokens[20]);
776 assert.Equal("Rheumatol.", tokens[21]);
777 assert.Equal("No.", tokens[22]);
778 assert.Equal("12", tokens[23]);
779 assert.Equal("and", tokens[24]);
780 assert.Equal("Nº.", tokens[25]);
781 assert.Equal("13.", tokens[26]);
782 assert.Equal(",", tokens[27]);
783 assert.Equal("pp.", tokens[28]);
784 assert.Equal("17-18", tokens[29]);
785 assert.Equal(".", tokens[30]);
786 }
787
788 @Test
789 public void englishTokenizerCanGuessWhetherIIsAbbrev () {
790 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
791 tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
792 assert.Equal("I.", tokens[1]);
793 assert.Equal("I", tokens[8]);
794 assert.Equal(".", tokens[9]);
795 assert.Equal("I", tokens[12]);
796 assert.Equal(".", tokens[13]);
797 }
798
799 @Test
800 public void testZipOuputArchive () {
801
802 final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
803 System.setOut(new PrintStream(clearOut));
804 tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
805 assert.Equal(0, len(tokens));
806 }
807 */
808 /*
809
810 @Test
811 public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
812 DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
813 .tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
814 .printOffsets(true)
815 .build();
816 Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
817 assert.Equal("Text1", tokens[0].getType());
818 assert.Equal(len(tokens), 9 );
819 }
820 */
821}
Akronbd406802021-08-11 18:39:13 +0200822
823func BenchmarkTransduce(b *testing.B) {
824 bu := make([]byte, 0, 2048)
825 w := bytes.NewBuffer(bu)
826
827 s := `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
828 Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
829 Der Termin ist am 5.9.2018.
830 Ich habe die readme.txt heruntergeladen.
831 Ausschalten!!! Hast Du nicht gehört???
832 Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
833 Er sagte: \"Es geht mir gut!\", daraufhin ging er. "Das ist von C&A!" Früher bzw. später ... Sie erreichte den 1. Platz!
834 Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
835 Mach's macht's was'n ist's haste willste kannste biste kriegste.`
836 r := strings.NewReader(s)
837
838 dat := LoadDatokFile("testdata/tokenizer.datok")
839
840 for i := 0; i < b.N; i++ {
841 w.Reset()
842 r.Reset(s)
843 ok := dat.Transduce(r, w)
844 if !ok {
845 fmt.Println("Fail!")
846 fmt.Println(w.String())
847 os.Exit(1)
848 }
849 }
Akron01912fc2021-08-12 11:41:58 +0200850 // 2021-08-11 (go 1.16)
851 // go test -bench=. -test.benchmem
852 // BenchmarkTransduce-4 19069 60609 ns/op 11048 B/op 137 allocs/op
853 // 2021-08-112 (go 1.16)
854 // BenchmarkTransduce-4 20833 55241 ns/op 9676 B/op 3 allocs/op
Akronbd406802021-08-11 18:39:13 +0200855}