package datokenizer

import (
	"bytes"
	"fmt"
	"os"
	"regexp"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

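// tmatch reports whether the tokenizer accepts the given string,
// discarding the transducer output.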
func tmatch(dat *DaTokenizer, s string) bool {
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	return dat.Transduce(strings.NewReader(s), w)
}

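// ttokenize transduces a string and splits the output at newline runs;
// the trailing empty element produced by the final newline(s) is dropped.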
func ttokenize(dat *DaTokenizer, w *bytes.Buffer, str string) []string {
	w.Reset()
	ok := dat.Transduce(strings.NewReader(str), w)
	if !ok {
		return []string{}
	}
	obj := regexp.MustCompile("\n+")

	tokens := obj.Split(w.String(), -1)
	return tokens[:len(tokens)-1]
}

func TestSimpleString(t *testing.T) {
	assert := assert.New(t)

	// bau | bauamt
	tok := LoadFomaFile("testdata/bauamt.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bauamt"))
	assert.False(tmatch(dat, "baum"))
}

func TestSimpleBranches(t *testing.T) {
	assert := assert.New(t)

	// (bau | wahl) (amt | en)
	tok := LoadFomaFile("testdata/wahlamt.fst")
	dat := tok.ToDoubleArray()
	assert.False(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bauamt"))
	assert.True(tmatch(dat, "wahlamt"))
	assert.True(tmatch(dat, "bauen"))
	assert.True(tmatch(dat, "wahlen"))
	assert.False(tmatch(dat, "baum"))
}

func TestSimpleTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bad"))
	assert.True(tmatch(dat, "wald gehen"))
}

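// TestSimpleTokenizerTransduce checks the output format of Transduce:
// one token per line, with an empty line marking a sentence or text
// boundary (see the assertions below).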
func TestSimpleTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal(6, len(tokens))
}

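// TestReadWriteTokenizer serializes the double array with WriteTo and
// reads it back with ParseDatok, comparing all internal fields of the
// two automata.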
func TestReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bad"))
	assert.True(tmatch(dat, "wald gehen"))

	assert.True(dat.LoadFactor() >= 70)

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := dat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(224), n)

	dat2 := ParseDatok(buf)
	assert.NotNil(dat2)
	assert.Equal(dat.array, dat2.array)
	assert.Equal(dat.sigma, dat2.sigma)
	assert.Equal(dat.epsilon, dat2.epsilon)
	assert.Equal(dat.unknown, dat2.unknown)
	assert.Equal(dat.identity, dat2.identity)
	assert.Equal(dat.final, dat2.final)
	assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
	assert.True(tmatch(dat2, "bau"))
	assert.True(tmatch(dat2, "bad"))
	assert.True(tmatch(dat2, "wald gehen"))
}

func TestIgnorableMCS(t *testing.T) {
	assert := assert.New(t)
	// File has MCS in sigma but not in net
	tok := LoadFomaFile("testdata/ignorable_mcs.fst")
	assert.NotNil(tok)
	dat := tok.ToDoubleArray()
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// Is only unambiguous when transducing strictly greedily!
	assert.True(dat.Transduce(strings.NewReader("ab<ab>"), w))
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("a\nb\n<ab>\n", w.String())
	assert.Equal("a", tokens[0])
	assert.Equal("b", tokens[1])
	assert.Equal("<ab>", tokens[2])
	assert.Equal(4, len(tokens))
}

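// The tests below require the precompiled full tokenizer automaton
// testdata/tokenizer.datok, which can be regenerated from
// testdata/tokenizer.fst (see XTestFullTokenizerBuild).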
func TestFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)
	assert.True(dat.LoadFactor() >= 70)
	assert.Equal(dat.epsilon, 1)
	assert.Equal(dat.unknown, 2)
	assert.Equal(dat.identity, 3)
	assert.Equal(dat.final, 137)
	assert.Equal(len(dat.sigma), 132)
	assert.True(len(dat.array) > 3600000)
	assert.True(dat.maxSize > 3600000)
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bad"))
	assert.True(tmatch(dat, "wald gehen"))
}

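// Prefixed with X so the test runner skips it: converting the full
// tokenizer FST to a double array is comparatively expensive, and this
// function was used to regenerate testdata/tokenizer.datok (see the
// commented-out Save call).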
func XTestFullTokenizerBuild(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/tokenizer.fst")
	dat := tok.ToDoubleArray()
	assert.NotNil(dat)
	// n, err := dat.Save("testdata/tokenizer.datok")
	// assert.Nil(err)
	// assert.True(n > 500)
}

func TestFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(dat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(8, len(tokens))

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
}

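// TestFullTokenizerSentenceSplitter relies on the convention that
// sentence boundaries appear as empty lines in the output, so splitting
// on "\n\n" yields one element per sentence plus a trailing empty
// element.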
func TestFullTokenizerSentenceSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 1)
	assert.Equal("\n", sentences[0])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("", sentences[2])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	/*
		Test:
		"\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	*/
}

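// TestFullTokenizerTokenSplitter mirrors the test suite of the Java
// KorAP-Tokenizer; deviations from its behavior are marked below with
// "Ignored in KorAP-Tokenizer" or "Differs from KorAP-Tokenizer".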
func TestFullTokenizerTokenSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = ttokenize(dat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = ttokenize(dat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = ttokenize(dat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = ttokenize(dat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = ttokenize(dat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = ttokenize(dat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = ttokenize(dat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = ttokenize(dat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = ttokenize(dat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = ttokenize(dat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = ttokenize(dat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = ttokenize(dat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = ttokenize(dat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = ttokenize(dat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = ttokenize(dat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = ttokenize(dat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = ttokenize(dat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = ttokenize(dat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = ttokenize(dat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = ttokenize(dat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = ttokenize(dat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = ttokenize(dat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = ttokenize(dat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = ttokenize(dat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = ttokenize(dat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = ttokenize(dat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = ttokenize(dat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = ttokenize(dat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1
	tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = ttokenize(dat, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = ttokenize(dat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	tokens = ttokenize(dat, w, "&quot;Das ist von C&A!&quot;")
	assert.Equal(tokens[0], "&quot;")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "&quot;")
	assert.Equal(len(tokens), 7)

	// testTokenizerLongEnd
	tokens = ttokenize(dat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = ttokenize(dat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = ttokenize(dat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = ttokenize(dat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = ttokenize(dat, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOuputArchive
	tokens = ttokenize(dat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = ttokenize(dat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = ttokenize(dat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = ttokenize(dat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOuputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*
		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9);
		}
	*/
}

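// BenchmarkTransduce pushes a mixed sample of the sentences tested
// above through the full tokenizer, reusing the reader and the output
// buffer across iterations to keep per-op allocations low (cf. the
// benchmark history at the end of this file).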
func BenchmarkTransduce(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	s := `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
Der Termin ist am 5.9.2018.
Ich habe die readme.txt heruntergeladen.
Ausschalten!!! Hast Du nicht gehört???
Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
Er sagte: \"Es geht mir gut!\", daraufhin ging er. &quot;Das ist von C&A!&quot; Früher bzw. später ... Sie erreichte den 1. Platz!
Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
Mach's macht's was'n ist's haste willste kannste biste kriegste.`
	r := strings.NewReader(s)

	dat := LoadDatokFile("testdata/tokenizer.datok")

	for i := 0; i < b.N; i++ {
		w.Reset()
		r.Reset(s)
		ok := dat.Transduce(r, w)
		if !ok {
			fmt.Println("Fail!")
			fmt.Println(w.String())
			os.Exit(1)
		}
	}
}

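// The X prefix keeps `go test -bench` from matching this benchmark.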
// This benchmark is deprecated, as the datok file changes over time
func XBenchmarkLoadDatokFile(b *testing.B) {
	for i := 0; i < b.N; i++ {
		dat := LoadDatokFile("testdata/tokenizer.datok")
		if dat == nil {
			fmt.Println("Fail!")
			os.Exit(1)
		}
	}
}

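// BenchmarkToDoubleArray measures the FST-to-double-array conversion
// on the small testdata/simple_bench.fst fixture, which, unlike the
// full tokenizer automaton, should stay stable over time.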
func BenchmarkToDoubleArray(b *testing.B) {
	tok := LoadFomaFile("testdata/simple_bench.fst")
	for i := 0; i < b.N; i++ {
		dat := tok.ToDoubleArray()
		if dat == nil {
			fmt.Println("Fail!")
			os.Exit(1)
		}
	}
}

// 2021-08-11 (go 1.16)
// go test -bench=. -test.benchmem
// BenchmarkTransduce-4 19069 60609 ns/op 11048 B/op 137 allocs/op
// 2021-08-12 (go 1.16)
// BenchmarkTransduce-4 20833 55241 ns/op 9676 B/op 3 allocs/op
// BenchmarkLoadDatokFile-4 4 258418169 ns/op 29916470 B/op 5697 allocs/op
// BenchmarkTransduce-4 19430 58133 ns/op 18696 B/op 3 allocs/op
// BenchmarkLoadDatokFile-4 8 139071939 ns/op 203158377 B/op 5742 allocs/op
// 2021-08-16
// BenchmarkTransduce-4 22251 49989 ns/op 17370 B/op 3 allocs/op
// BenchmarkLoadDatokFile-4 8 138937532 ns/op 203158327 B/op 5742 allocs/op
// BenchmarkTransduce-4 22005 48665 ns/op 17472 B/op 3 allocs/op
// BenchmarkLoadDatokFile-4 7 143143934 ns/op 203158450 B/op 5743 allocs/op
// BenchmarkTransduce-4 34939 34363 ns/op 14056 B/op 3 allocs/op
// BenchmarkLoadDatokFile-4 7 149511609 ns/op 203217193 B/op 5915 allocs/op
// 2021-08-17
// BenchmarkTransduce-4 31204 32678 ns/op 14752 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 44138 26850 ns/op 10704 B/op 29 allocs/op
// BenchmarkTransduce-4 29376 34562 ns/op 15157 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 54441 21355 ns/op 10704 B/op 29 allocs/op