blob: 6c642603f93ace260e6f1ad5deec9a584035e295 [file] [log] [blame]
Akron7f1097f2021-09-21 16:00:29 +02001package datok
Akron8ef408b2021-08-02 22:11:04 +02002
3import (
Akron6247a5d2021-08-03 19:18:28 +02004 "bytes"
Akronbd406802021-08-11 18:39:13 +02005 "fmt"
6 "os"
Akron03ca4252021-08-11 13:32:53 +02007 "regexp"
Akron3f8571a2021-08-05 11:18:10 +02008 "strings"
Akron8ef408b2021-08-02 22:11:04 +02009 "testing"
10
11 "github.com/stretchr/testify/assert"
12)
13
// dat caches the full German tokenizer between tests. It is lazily
// initialized (via LoadDatokFile) by the first test that needs it,
// so the expensive load from testdata happens at most once per run.
var dat *DaTokenizer
Akrondf275812022-03-27 12:54:46 +020016func ttokenizeStr(tok Tokenizer, str string) string {
Akronec835ad2021-08-11 18:23:22 +020017 b := make([]byte, 0, 2048)
18 w := bytes.NewBuffer(b)
Akrondf275812022-03-27 12:54:46 +020019 return strings.Join(ttokenize(tok, w, str), "\n")
Akronec835ad2021-08-11 18:23:22 +020020}
21
Akron1c34ce62021-09-23 23:27:39 +020022func ttokenize(tok Tokenizer, w *bytes.Buffer, str string) []string {
Akronec835ad2021-08-11 18:23:22 +020023 w.Reset()
Akron1c34ce62021-09-23 23:27:39 +020024 ok := tok.Transduce(strings.NewReader(str), w)
Akronec835ad2021-08-11 18:23:22 +020025 if !ok {
26 return []string{}
27 }
28 obj := regexp.MustCompile("\n+")
29
30 tokens := obj.Split(w.String(), -1)
31 return tokens[:len(tokens)-1]
32}
33
Akronc9c0eae2021-10-22 19:49:43 +020034func TestDoubleArraySimpleString(t *testing.T) {
Akron8ef408b2021-08-02 22:11:04 +020035 assert := assert.New(t)
Akron8ef408b2021-08-02 22:11:04 +020036 // bau | bauamt
Akron64ffd9a2021-08-03 19:55:21 +020037 tok := LoadFomaFile("testdata/bauamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020038 dat := tok.ToDoubleArray()
Akrondf275812022-03-27 12:54:46 +020039
40 b := make([]byte, 0, 2048)
41 w := bytes.NewBuffer(b)
42 var tokens []string
43
44 tokens = ttokenize(dat, w, "ibauamt")
45 assert.Equal("i", tokens[0])
46 assert.Equal("bauamt", tokens[1])
47
48 tokens = ttokenize(dat, w, "ibbauamt")
49 assert.Equal("i", tokens[0])
50
51 assert.Equal("b", tokens[1])
52 assert.Equal("bauamt", tokens[2])
53
54 tokens = ttokenize(dat, w, "bau")
55 assert.Equal("bau", tokens[0])
56
57 tokens = ttokenize(dat, w, "baum")
58 assert.Equal("bau", tokens[0])
59 assert.Equal("m", tokens[1])
60
61 tokens = ttokenize(dat, w, "baudibauamt")
62 assert.Equal("bau", tokens[0])
63 assert.Equal("d", tokens[1])
64 assert.Equal("i", tokens[2])
65 assert.Equal("bauamt", tokens[3])
Akron8ef408b2021-08-02 22:11:04 +020066}
Akron75ebe7f2021-08-03 10:34:10 +020067
Akronc9c0eae2021-10-22 19:49:43 +020068func TestDoubleArraySimpleBranches(t *testing.T) {
Akron75ebe7f2021-08-03 10:34:10 +020069 assert := assert.New(t)
Akron75ebe7f2021-08-03 10:34:10 +020070 // (bau | wahl) (amt | en)
Akron64ffd9a2021-08-03 19:55:21 +020071 tok := LoadFomaFile("testdata/wahlamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020072 dat := tok.ToDoubleArray()
Akrondf275812022-03-27 12:54:46 +020073
74 b := make([]byte, 0, 2048)
75 w := bytes.NewBuffer(b)
76 var tokens []string
77
78 tokens = ttokenize(dat, w, "bau")
79 assert.Equal("bau", tokens[0])
80
81 tokens = ttokenize(dat, w, "bauamt")
82 assert.Equal("bauamt", tokens[0])
83
84 tokens = ttokenize(dat, w, "wahlamt")
85 assert.Equal("wahlamt", tokens[0])
86
87 tokens = ttokenize(dat, w, "bauen")
88 assert.Equal("bauen", tokens[0])
89
90 tokens = ttokenize(dat, w, "wahlen")
91 assert.Equal("wahlen", tokens[0])
92
93 tokens = ttokenize(dat, w, "baum")
94 assert.Equal("bau", tokens[0])
95 assert.Equal("m", tokens[1])
Akron75ebe7f2021-08-03 10:34:10 +020096}
Akron730a79c2021-08-03 11:05:29 +020097
Akrondf275812022-03-27 12:54:46 +020098func TestDoubleArraySimpleTokenizer(t *testing.T) {
Akron730a79c2021-08-03 11:05:29 +020099 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +0200100 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +0200101 dat := tok.ToDoubleArray()
Akrondf275812022-03-27 12:54:46 +0200102 assert.Equal(ttokenizeStr(dat, "bau"), "bau")
103 assert.Equal(ttokenizeStr(dat, "bad"), "bad")
104 assert.Equal(ttokenizeStr(dat, "wald gehen"), "wald\ngehen")
Akron730a79c2021-08-03 11:05:29 +0200105}
Akron740f3d72021-08-03 12:12:34 +0200106
Akronc9c0eae2021-10-22 19:49:43 +0200107func TestDoubleArraySimpleTokenizerTransduce(t *testing.T) {
Akron84d68e62021-08-04 17:06:52 +0200108 assert := assert.New(t)
109 tok := LoadFomaFile("testdata/simpletok.fst")
Akron84d68e62021-08-04 17:06:52 +0200110 dat := tok.ToDoubleArray()
Akron3f8571a2021-08-05 11:18:10 +0200111
112 r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
113 b := make([]byte, 0, 2048)
114 w := bytes.NewBuffer(b)
Akron524c5432021-08-05 14:14:27 +0200115 var tokens []string
Akron524c5432021-08-05 14:14:27 +0200116 dat.Transduce(r, w)
117 tokens = strings.Split(w.String(), "\n")
Akrona854faa2021-10-22 19:31:08 +0200118 assert.Equal(len(tokens), 11)
Akron3f8571a2021-08-05 11:18:10 +0200119 assert.Equal("wald", tokens[0])
120 assert.Equal("gehen", tokens[1])
121 assert.Equal("Da", tokens[2])
122 assert.Equal("kann", tokens[3])
123 assert.Equal("man", tokens[4])
124 assert.Equal("was", tokens[5])
125 assert.Equal("\"erleben\"", tokens[6])
126
Akron524c5432021-08-05 14:14:27 +0200127 r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
128 w.Reset()
129 dat.Transduce(r, w)
130 tokens = strings.Split(w.String(), "\n")
131 assert.Equal("In", tokens[0])
132 assert.Equal("den", tokens[1])
133 assert.Equal("Wald", tokens[2])
134 assert.Equal("gehen", tokens[3])
135 assert.Equal("?", tokens[4])
136 assert.Equal("--", tokens[5])
Akron3f8571a2021-08-05 11:18:10 +0200137
Akron524c5432021-08-05 14:14:27 +0200138 r = strings.NewReader(" g? -- D")
139 w.Reset()
140 dat.Transduce(r, w)
141 tokens = strings.Split(w.String(), "\n")
142 assert.Equal("g", tokens[0])
143 assert.Equal("?", tokens[1])
144 assert.Equal("--", tokens[2])
145 assert.Equal("D", tokens[3])
146 assert.Equal("", tokens[4])
Akron6e70dc82021-08-11 11:33:18 +0200147 assert.Equal("", tokens[5])
Akrona854faa2021-10-22 19:31:08 +0200148 assert.Equal(7, len(tokens))
Akron068874c2021-08-04 15:19:56 +0200149}
150
Akronc9c0eae2021-10-22 19:49:43 +0200151func TestDoubleArrayReadWriteTokenizer(t *testing.T) {
Akron740f3d72021-08-03 12:12:34 +0200152 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +0200153 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +0200154 dat := tok.ToDoubleArray()
Akrondf275812022-03-27 12:54:46 +0200155 assert.Equal(ttokenizeStr(dat, "bau"), "bau")
156 assert.Equal(ttokenizeStr(dat, "bad"), "bad")
157 assert.Equal(ttokenizeStr(dat, "wald gehen"), "wald\ngehen")
Akron6247a5d2021-08-03 19:18:28 +0200158
Akron3f8571a2021-08-05 11:18:10 +0200159 b := make([]byte, 0, 1024)
Akron6247a5d2021-08-03 19:18:28 +0200160 buf := bytes.NewBuffer(b)
161 n, err := dat.WriteTo(buf)
162 assert.Nil(err)
Akron29e306f2021-09-02 18:29:56 +0200163 assert.Equal(int64(296), n)
Akron3f8571a2021-08-05 11:18:10 +0200164
165 dat2 := ParseDatok(buf)
166 assert.NotNil(dat2)
167 assert.Equal(dat.array, dat2.array)
168 assert.Equal(dat.sigma, dat2.sigma)
169 assert.Equal(dat.epsilon, dat2.epsilon)
170 assert.Equal(dat.unknown, dat2.unknown)
171 assert.Equal(dat.identity, dat2.identity)
172 assert.Equal(dat.final, dat2.final)
173 assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
Akrondf275812022-03-27 12:54:46 +0200174 assert.Equal(ttokenizeStr(dat2, "bau"), "bau")
175 assert.Equal(ttokenizeStr(dat2, "bad"), "bad")
176 assert.Equal(ttokenizeStr(dat2, "wald gehen"), "wald\ngehen")
Akron4fa28b32021-08-27 10:55:41 +0200177
Akron92704eb2021-08-27 10:59:46 +0200178 assert.Equal(dat.TransCount(), 17)
179 assert.Equal(dat2.TransCount(), 17)
Akron6247a5d2021-08-03 19:18:28 +0200180}
181
Akronc9c0eae2021-10-22 19:49:43 +0200182func TestDoubleArrayIgnorableMCS(t *testing.T) {
Akrone396a932021-10-19 01:06:13 +0200183
184 // This test relies on final states. That's why it is
185 // not working correctly anymore.
186
Akron31f3c062021-08-27 10:15:13 +0200187 assert := assert.New(t)
188 // File has MCS in sigma but not in net
189 tok := LoadFomaFile("testdata/ignorable_mcs.fst")
190 assert.NotNil(tok)
191 dat := tok.ToDoubleArray()
192 assert.NotNil(dat)
193
194 b := make([]byte, 0, 2048)
195 w := bytes.NewBuffer(b)
196 var tokens []string
197
198 // Is only unambigous when transducing strictly greedy!
Akrone396a932021-10-19 01:06:13 +0200199 assert.True(dat.Transduce(strings.NewReader("ab<ab>a"), w))
Akron31f3c062021-08-27 10:15:13 +0200200 tokens = strings.Split(w.String(), "\n")
Akrona854faa2021-10-22 19:31:08 +0200201 assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
Akron31f3c062021-08-27 10:15:13 +0200202 assert.Equal("a", tokens[0])
203 assert.Equal("b", tokens[1])
Akrone396a932021-10-19 01:06:13 +0200204 assert.Equal("<ab>a", tokens[2])
Akrona854faa2021-10-22 19:31:08 +0200205 assert.Equal(6, len(tokens))
Akron92704eb2021-08-27 10:59:46 +0200206 assert.Equal(dat.TransCount(), 15)
Akron31f3c062021-08-27 10:15:13 +0200207}
208
Akronc9c0eae2021-10-22 19:49:43 +0200209func TestDoubleArrayFullTokenizer(t *testing.T) {
Akron3a063ef2021-08-05 19:36:35 +0200210 assert := assert.New(t)
Akron9fb63af2021-10-28 01:15:53 +0200211
212 if dat == nil {
Akron0139bc52023-08-31 16:35:58 +0200213 dat = LoadDatokFile("testdata/tokenizer_de.datok")
Akron9fb63af2021-10-28 01:15:53 +0200214 }
Akron3a063ef2021-08-05 19:36:35 +0200215 assert.NotNil(dat)
Akrond47c67e2022-04-10 11:02:59 +0200216 assert.True(dat.LoadFactor() >= 60)
Akron3a063ef2021-08-05 19:36:35 +0200217 assert.Equal(dat.epsilon, 1)
218 assert.Equal(dat.unknown, 2)
219 assert.Equal(dat.identity, 3)
Akrond0c6e102021-12-09 10:30:29 +0100220 // assert.Equal(dat.final, 142)
221 // assert.Equal(len(dat.sigma), 137)
Akronfac8abc2021-11-10 07:19:59 +0100222 // assert.True(len(dat.array) > 3000000)
223 // assert.True(dat.maxSize > 3000000)
Akrondf275812022-03-27 12:54:46 +0200224 assert.Equal(ttokenizeStr(dat, "bau"), "bau")
225 assert.Equal(ttokenizeStr(dat, "bad"), "bad")
226 assert.Equal(ttokenizeStr(dat, "wald gehen"), "wald\ngehen")
Akron740f3d72021-08-03 12:12:34 +0200227}
Akron3f8571a2021-08-05 11:18:10 +0200228
Akronc9c0eae2021-10-22 19:49:43 +0200229func TestDoubleArrayTokenizerBranch(t *testing.T) {
Akron941f2152021-09-26 15:14:25 +0200230 assert := assert.New(t)
231 tok := LoadTokenizerFile("testdata/simpletok.datok")
232 assert.NotNil(tok)
233 assert.Equal(tok.Type(), "DATOK")
234
235 tok = LoadTokenizerFile("testdata/simpletok.matok")
236 assert.NotNil(tok)
237 assert.Equal(tok.Type(), "MATOK")
238}
239
// XTestDoubleArrayFullTokenizerBuild rebuilds the full German
// double-array tokenizer from its foma source. The X prefix keeps it
// out of regular `go test` runs, since the build is expensive; the
// save step below is kept for manual regeneration of the testdata.
func XTestDoubleArrayFullTokenizerBuild(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/tokenizer_de.fst")
	dat := tok.ToDoubleArray()
	assert.NotNil(dat)
	// n, err := dat.Save("testdata/tokenizer_de.datok")
	// assert.Nil(err)
	// assert.True(n > 500)
}
249
Akronc9c0eae2021-10-22 19:49:43 +0200250func TestDoubleArrayFullTokenizerTransduce(t *testing.T) {
Akron3610f102021-08-08 14:13:25 +0200251 assert := assert.New(t)
252
Akron9fb63af2021-10-28 01:15:53 +0200253 if dat == nil {
Akron0139bc52023-08-31 16:35:58 +0200254 dat = LoadDatokFile("testdata/tokenizer_de.datok")
Akron9fb63af2021-10-28 01:15:53 +0200255 }
256
Akron3610f102021-08-08 14:13:25 +0200257 assert.NotNil(dat)
258
Akron3610f102021-08-08 14:13:25 +0200259 b := make([]byte, 0, 2048)
260 w := bytes.NewBuffer(b)
261 var tokens []string
262
Akron03ca4252021-08-11 13:32:53 +0200263 assert.True(dat.Transduce(strings.NewReader("tra. u Du?"), w))
Akron3610f102021-08-08 14:13:25 +0200264
265 tokens = strings.Split(w.String(), "\n")
Akrona854faa2021-10-22 19:31:08 +0200266 assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
Akron3610f102021-08-08 14:13:25 +0200267 assert.Equal("tra", tokens[0])
268 assert.Equal(".", tokens[1])
Akronb4bbb472021-08-09 11:49:38 +0200269 assert.Equal("", tokens[2])
Akronc5d8d432021-08-10 16:48:44 +0200270 assert.Equal("u", tokens[3])
Akronb4bbb472021-08-09 11:49:38 +0200271 assert.Equal("Du", tokens[4])
272 assert.Equal("?", tokens[5])
Akron3610f102021-08-08 14:13:25 +0200273 assert.Equal("", tokens[6])
Akron1594cb82021-08-11 11:14:56 +0200274 assert.Equal("", tokens[7])
Akrona854faa2021-10-22 19:31:08 +0200275 assert.Equal("", tokens[8])
276 assert.Equal(9, len(tokens))
Akron03ca4252021-08-11 13:32:53 +0200277
278 w.Reset()
279 assert.True(dat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
Akrona854faa2021-10-22 19:31:08 +0200280 assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
Akron3f8571a2021-08-05 11:18:10 +0200281}
Akronb7e1f132021-08-10 11:52:31 +0200282
// TestDoubleArrayFullTokenizerSentenceSplitter checks sentence
// splitting in the full German tokenizer. Sentences are separated by
// blank lines ("\n\n") in the transducer output, so each input is
// transduced and the output split on "\n\n"; a well-formed single
// sentence yields exactly two parts, the sentence and a trailing "\n".
func TestDoubleArrayFullTokenizerSentenceSplitter(t *testing.T) {
	assert := assert.New(t)

	if dat == nil {
		dat = LoadDatokFile("testdata/tokenizer_de.datok")
	}

	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(2, len(sentences))

	// An abbreviation ("Abk.") must not end the sentence.
	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(2, len(sentences))
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	// Empty input still yields two (empty) parts.
	w.Reset()
	assert.True(dat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(2, len(sentences))
	assert.Equal("", sentences[0])

	// Host names, e-mail addresses and URLs must not split sentences.
	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(2, len(sentences))

	// Dotted numbers: IP addresses, percentages and dates.
	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("\n", sentences[1])
	assert.Equal(2, len(sentences))

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	// File names with extensions stay inside the sentence.
	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(2, len(sentences))
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("\n", sentences[1])

	// Repeated terminal punctuation ends a sentence once per run.
	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(3, len(sentences))
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("\n", sentences[2])

	// Street-name abbreviation ("Weststr.") must not split.
	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	/*
		Test:
		"\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	*/
}
Akron03ca4252021-08-11 13:32:53 +0200376
Akronc9c0eae2021-10-22 19:49:43 +0200377func TestDoubleArrayFullTokenizerTokenSplitter(t *testing.T) {
Akron03ca4252021-08-11 13:32:53 +0200378 assert := assert.New(t)
Akron9fb63af2021-10-28 01:15:53 +0200379
380 if dat == nil {
Akron0139bc52023-08-31 16:35:58 +0200381 dat = LoadDatokFile("testdata/tokenizer_de.datok")
Akron9fb63af2021-10-28 01:15:53 +0200382 }
383
Akron03ca4252021-08-11 13:32:53 +0200384 assert.NotNil(dat)
385
386 b := make([]byte, 0, 2048)
387 w := bytes.NewBuffer(b)
388 var tokens []string
389
390 // testTokenizerSimple
Akronec835ad2021-08-11 18:23:22 +0200391 tokens = ttokenize(dat, w, "Der alte Mann")
Akron03ca4252021-08-11 13:32:53 +0200392 assert.Equal(tokens[0], "Der")
393 assert.Equal(tokens[1], "alte")
394 assert.Equal(tokens[2], "Mann")
395 assert.Equal(len(tokens), 3)
396
Akronec835ad2021-08-11 18:23:22 +0200397 tokens = ttokenize(dat, w, "Der alte Mann.")
Akron03ca4252021-08-11 13:32:53 +0200398 assert.Equal(tokens[0], "Der")
399 assert.Equal(tokens[1], "alte")
400 assert.Equal(tokens[2], "Mann")
401 assert.Equal(tokens[3], ".")
402 assert.Equal(len(tokens), 4)
403
404 // testTokenizerAbbr
Akronec835ad2021-08-11 18:23:22 +0200405 tokens = ttokenize(dat, w, "Der Vorsitzende der F.D.P. hat gewählt")
Akron03ca4252021-08-11 13:32:53 +0200406 assert.Equal(tokens[0], "Der")
407 assert.Equal(tokens[1], "Vorsitzende")
408 assert.Equal(tokens[2], "der")
409 assert.Equal(tokens[3], "F.D.P.")
410 assert.Equal(tokens[4], "hat")
411 assert.Equal(tokens[5], "gewählt")
412 assert.Equal(len(tokens), 6)
413 // Ignored in KorAP-Tokenizer
414
415 // testTokenizerHost1
Akronec835ad2021-08-11 18:23:22 +0200416 tokens = ttokenize(dat, w, "Gefunden auf wikipedia.org")
Akron03ca4252021-08-11 13:32:53 +0200417 assert.Equal(tokens[0], "Gefunden")
418 assert.Equal(tokens[1], "auf")
419 assert.Equal(tokens[2], "wikipedia.org")
420 assert.Equal(len(tokens), 3)
421
422 // testTokenizerWwwHost
Akronec835ad2021-08-11 18:23:22 +0200423 tokens = ttokenize(dat, w, "Gefunden auf www.wikipedia.org")
Akron03ca4252021-08-11 13:32:53 +0200424 assert.Equal("Gefunden", tokens[0])
425 assert.Equal("auf", tokens[1])
426 assert.Equal("www.wikipedia.org", tokens[2])
427 assert.Equal(3, len(tokens))
428
429 // testTokenizerWwwUrl
Akronec835ad2021-08-11 18:23:22 +0200430 tokens = ttokenize(dat, w, "Weitere Infos unter www.info.biz/info")
Akron03ca4252021-08-11 13:32:53 +0200431 assert.Equal("www.info.biz/info", tokens[3])
432
433 // testTokenizerFtpHost
434 /*
435 tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
436 assert.Equal("Kann", tokens[0])
437 assert.Equal("von", tokens[1])
438 assert.Equal("ftp.download.org", tokens[2])
439 assert.Equal(5, len(tokens))
440 // Ignored in KorAP-Tokenizer
441 */
442
443 // testTokenizerDash
Akronec835ad2021-08-11 18:23:22 +0200444 tokens = ttokenize(dat, w, "Das war -- spitze")
Akron03ca4252021-08-11 13:32:53 +0200445 assert.Equal(tokens[0], "Das")
446 assert.Equal(tokens[1], "war")
447 assert.Equal(tokens[2], "--")
448 assert.Equal(tokens[3], "spitze")
449 assert.Equal(len(tokens), 4)
450
451 // testTokenizerEmail1
Akronec835ad2021-08-11 18:23:22 +0200452 tokens = ttokenize(dat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
Akron03ca4252021-08-11 13:32:53 +0200453 assert.Equal(tokens[0], "Ich")
454 assert.Equal(tokens[1], "bin")
455 assert.Equal(tokens[2], "unter")
456 assert.Equal(tokens[3], "korap@ids-mannheim.de")
457 assert.Equal(tokens[4], "erreichbar")
458 assert.Equal(tokens[5], ".")
459 assert.Equal(len(tokens), 6)
460
461 // testTokenizerEmail2
Akronec835ad2021-08-11 18:23:22 +0200462 tokens = ttokenize(dat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
Akron03ca4252021-08-11 13:32:53 +0200463 assert.Equal(tokens[0], "Oder")
464 assert.Equal(tokens[1], "unter")
465 assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
466 assert.Equal(tokens[3], ".")
467 assert.Equal(len(tokens), 4)
468
469 // testTokenizerEmail3
Akronec835ad2021-08-11 18:23:22 +0200470 tokens = ttokenize(dat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
Akron03ca4252021-08-11 13:32:53 +0200471 assert.Equal(tokens[0], "Oder")
472 assert.Equal(tokens[1], "unter")
473 assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
474 assert.Equal(tokens[3], ".")
475 assert.Equal(len(tokens), 4)
476 // Ignored in KorAP-Tokenizer
477
478 // testTokenizerDoNotAcceptQuotedEmailNames
Akronec835ad2021-08-11 18:23:22 +0200479 tokens = ttokenize(dat, w, "\"John Doe\"@xx.com")
Akron03ca4252021-08-11 13:32:53 +0200480 assert.Equal("\"", tokens[0])
481 assert.Equal("John", tokens[1])
482 assert.Equal("Doe", tokens[2])
483 assert.Equal("\"", tokens[3])
484 assert.Equal("@xx", tokens[4])
485 assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
486 assert.Equal("com", tokens[6])
487 assert.Equal(7, len(tokens))
488
489 // testTokenizerTwitter
Akronec835ad2021-08-11 18:23:22 +0200490 tokens = ttokenize(dat, w, "Folgt @korap und #korap")
Akron03ca4252021-08-11 13:32:53 +0200491 assert.Equal(tokens[0], "Folgt")
492 assert.Equal(tokens[1], "@korap")
493 assert.Equal(tokens[2], "und")
494 assert.Equal(tokens[3], "#korap")
495 assert.Equal(len(tokens), 4)
496
497 // testTokenizerWeb1
Akronec835ad2021-08-11 18:23:22 +0200498 tokens = ttokenize(dat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
Akron03ca4252021-08-11 13:32:53 +0200499 assert.Equal(tokens[0], "Unsere")
500 assert.Equal(tokens[1], "Website")
501 assert.Equal(tokens[2], "ist")
502 assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
503 assert.Equal(len(tokens), 4)
504
505 // testTokenizerWeb2
Akronec835ad2021-08-11 18:23:22 +0200506 tokens = ttokenize(dat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
Akron03ca4252021-08-11 13:32:53 +0200507 assert.Equal(tokens[0], "Wir")
508 assert.Equal(tokens[1], "sind")
509 assert.Equal(tokens[2], "auch")
510 assert.Equal(tokens[3], "im")
511 assert.Equal(tokens[4], "Internet")
512 assert.Equal(tokens[5], "(")
513 assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
514 assert.Equal(tokens[7], ")")
515 assert.Equal(len(tokens), 8)
516 // Ignored in KorAP-Tokenizer
517
518 // testTokenizerWeb3
Akronec835ad2021-08-11 18:23:22 +0200519 tokens = ttokenize(dat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
Akron03ca4252021-08-11 13:32:53 +0200520 assert.Equal(tokens[0], "Die")
521 assert.Equal(tokens[1], "Adresse")
522 assert.Equal(tokens[2], "ist")
523 assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
524 assert.Equal(tokens[4], ".")
525 assert.Equal(len(tokens), 5)
526 // Ignored in KorAP-Tokenizer
527
528 // testTokenizerServer
Akronec835ad2021-08-11 18:23:22 +0200529 tokens = ttokenize(dat, w, "Unser Server ist 10.0.10.51.")
Akron03ca4252021-08-11 13:32:53 +0200530 assert.Equal(tokens[0], "Unser")
531 assert.Equal(tokens[1], "Server")
532 assert.Equal(tokens[2], "ist")
533 assert.Equal(tokens[3], "10.0.10.51")
534 assert.Equal(tokens[4], ".")
535 assert.Equal(len(tokens), 5)
536
537 // testTokenizerNum
Akronec835ad2021-08-11 18:23:22 +0200538 tokens = ttokenize(dat, w, "Zu 50,4% ist es sicher")
Akron03ca4252021-08-11 13:32:53 +0200539 assert.Equal(tokens[0], "Zu")
540 assert.Equal(tokens[1], "50,4%")
541 assert.Equal(tokens[2], "ist")
542 assert.Equal(tokens[3], "es")
543 assert.Equal(tokens[4], "sicher")
544 assert.Equal(len(tokens), 5)
545 // Differs from KorAP-Tokenizer
546
547 // testTokenizerDate
Akronec835ad2021-08-11 18:23:22 +0200548 tokens = ttokenize(dat, w, "Der Termin ist am 5.9.2018")
Akron03ca4252021-08-11 13:32:53 +0200549 assert.Equal(tokens[0], "Der")
550 assert.Equal(tokens[1], "Termin")
551 assert.Equal(tokens[2], "ist")
552 assert.Equal(tokens[3], "am")
553 assert.Equal(tokens[4], "5.9.2018")
554 assert.Equal(len(tokens), 5)
555
Akronec835ad2021-08-11 18:23:22 +0200556 tokens = ttokenize(dat, w, "Der Termin ist am 5/9/2018")
Akron03ca4252021-08-11 13:32:53 +0200557 assert.Equal(tokens[0], "Der")
558 assert.Equal(tokens[1], "Termin")
559 assert.Equal(tokens[2], "ist")
560 assert.Equal(tokens[3], "am")
561 assert.Equal(tokens[4], "5/9/2018")
562 assert.Equal(len(tokens), 5)
563
564 // testTokenizerDateRange
565 /*
566 tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
567 assert.Equal(tokens[0], "Der")
568 assert.Equal(tokens[1], "Termin")
569 assert.Equal(tokens[2], "war")
570 assert.Equal(tokens[3], "vom")
571 assert.Equal(tokens[4], "4.")
572 assert.Equal(tokens[5], "-")
573 assert.Equal(tokens[6], "5.9.2018")
574 assert.Equal(len(tokens), 7)
575 // Ignored in KorAP-Tokenizer
576 */
577
578 // testTokenizerEmoji1
Akronec835ad2021-08-11 18:23:22 +0200579 tokens = ttokenize(dat, w, "Das ist toll! ;)")
Akron03ca4252021-08-11 13:32:53 +0200580 assert.Equal(tokens[0], "Das")
581 assert.Equal(tokens[1], "ist")
582 assert.Equal(tokens[2], "toll")
583 assert.Equal(tokens[3], "!")
584 assert.Equal(tokens[4], ";)")
585 assert.Equal(len(tokens), 5)
586
587 // testTokenizerRef1
Akronec835ad2021-08-11 18:23:22 +0200588 tokens = ttokenize(dat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
Akron03ca4252021-08-11 13:32:53 +0200589 assert.Equal(tokens[0], "Kupietz")
590 assert.Equal(tokens[1], "und")
591 assert.Equal(tokens[2], "Schmidt")
592 assert.Equal(tokens[3], "(2018)")
593 assert.Equal(tokens[4], ":")
594 assert.Equal(tokens[5], "Korpuslinguistik")
595 assert.Equal(len(tokens), 6)
596 // Differs from KorAP-Tokenizer!
597
598 // testTokenizerRef2 () {
Akronec835ad2021-08-11 18:23:22 +0200599 tokens = ttokenize(dat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
Akron03ca4252021-08-11 13:32:53 +0200600 assert.Equal(tokens[0], "Kupietz")
601 assert.Equal(tokens[1], "und")
602 assert.Equal(tokens[2], "Schmidt")
603 assert.Equal(tokens[3], "[2018]")
604 assert.Equal(tokens[4], ":")
605 assert.Equal(tokens[5], "Korpuslinguistik")
606 assert.Equal(len(tokens), 6)
607 // Differs from KorAP-Tokenizer!
608
609 // testTokenizerOmission1 () {
Akronec835ad2021-08-11 18:23:22 +0200610 tokens = ttokenize(dat, w, "Er ist ein A****loch!")
Akron03ca4252021-08-11 13:32:53 +0200611 assert.Equal(tokens[0], "Er")
612 assert.Equal(tokens[1], "ist")
613 assert.Equal(tokens[2], "ein")
614 assert.Equal(tokens[3], "A****loch")
615 assert.Equal(tokens[4], "!")
616 assert.Equal(len(tokens), 5)
617
618 // testTokenizerOmission2
Akronec835ad2021-08-11 18:23:22 +0200619 tokens = ttokenize(dat, w, "F*ck!")
Akron03ca4252021-08-11 13:32:53 +0200620 assert.Equal(tokens[0], "F*ck")
621 assert.Equal(tokens[1], "!")
622 assert.Equal(len(tokens), 2)
623
624 // testTokenizerOmission3 () {
Akronec835ad2021-08-11 18:23:22 +0200625 tokens = ttokenize(dat, w, "Dieses verf***** Kleid!")
Akron03ca4252021-08-11 13:32:53 +0200626 assert.Equal(tokens[0], "Dieses")
627 assert.Equal(tokens[1], "verf*****")
628 assert.Equal(tokens[2], "Kleid")
629 assert.Equal(tokens[3], "!")
630 assert.Equal(len(tokens), 4)
631
632 // Probably interpreted as HOST
633 // testTokenizerFileExtension1
Akronec835ad2021-08-11 18:23:22 +0200634 tokens = ttokenize(dat, w, "Ich habe die readme.txt heruntergeladen")
Akron03ca4252021-08-11 13:32:53 +0200635 assert.Equal(tokens[0], "Ich")
636 assert.Equal(tokens[1], "habe")
637 assert.Equal(tokens[2], "die")
638 assert.Equal(tokens[3], "readme.txt")
639 assert.Equal(tokens[4], "heruntergeladen")
640 assert.Equal(len(tokens), 5)
641
642 // Probably interpreted as HOST
643 // testTokenizerFileExtension2
Akronec835ad2021-08-11 18:23:22 +0200644 tokens = ttokenize(dat, w, "Nimm die README.TXT!")
Akron03ca4252021-08-11 13:32:53 +0200645 assert.Equal(tokens[0], "Nimm")
646 assert.Equal(tokens[1], "die")
647 assert.Equal(tokens[2], "README.TXT")
648 assert.Equal(tokens[3], "!")
649 assert.Equal(len(tokens), 4)
650
651 // Probably interpreted as HOST
652 // testTokenizerFileExtension3
Akronec835ad2021-08-11 18:23:22 +0200653 tokens = ttokenize(dat, w, "Zeig mir profile.jpeg")
Akron03ca4252021-08-11 13:32:53 +0200654 assert.Equal(tokens[0], "Zeig")
655 assert.Equal(tokens[1], "mir")
656 assert.Equal(tokens[2], "profile.jpeg")
657 assert.Equal(len(tokens), 3)
658
659 // testTokenizerFile1
Akron03ca4252021-08-11 13:32:53 +0200660
Akronec835ad2021-08-11 18:23:22 +0200661 tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.docx")
Akrone8837b52021-08-11 17:29:58 +0200662 assert.Equal(tokens[0], "Zeig")
663 assert.Equal(tokens[1], "mir")
664 assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
665 assert.Equal(len(tokens), 3)
Akron03ca4252021-08-11 13:32:53 +0200666
Akrone8837b52021-08-11 17:29:58 +0200667 // testTokenizerFile2
Akronec835ad2021-08-11 18:23:22 +0200668 tokens = ttokenize(dat, w, "Gehe zu /Dokumente/profile.docx")
Akrone8837b52021-08-11 17:29:58 +0200669 assert.Equal(tokens[0], "Gehe")
670 assert.Equal(tokens[1], "zu")
671 assert.Equal(tokens[2], "/Dokumente/profile.docx")
672 assert.Equal(len(tokens), 3)
Akron03ca4252021-08-11 13:32:53 +0200673
Akrone8837b52021-08-11 17:29:58 +0200674 // testTokenizerFile3
Akronec835ad2021-08-11 18:23:22 +0200675 tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
Akrone8837b52021-08-11 17:29:58 +0200676 assert.Equal(tokens[0], "Zeig")
677 assert.Equal(tokens[1], "mir")
678 assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
679 assert.Equal(len(tokens), 3)
680 // Ignored in KorAP-Tokenizer
Akron03ca4252021-08-11 13:32:53 +0200681
Akronfd92d7e2021-08-11 16:31:43 +0200682 // testTokenizerPunct
Akronec835ad2021-08-11 18:23:22 +0200683 tokens = ttokenize(dat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
Akronfd92d7e2021-08-11 16:31:43 +0200684 assert.Equal(tokens[0], "Er")
685 assert.Equal(tokens[1], "sagte")
686 assert.Equal(tokens[2], ":")
687 assert.Equal(tokens[3], "\"")
688 assert.Equal(tokens[4], "Es")
689 assert.Equal(tokens[5], "geht")
690 assert.Equal(tokens[6], "mir")
691 assert.Equal(tokens[7], "gut")
692 assert.Equal(tokens[8], "!")
693 assert.Equal(tokens[9], "\"")
694 assert.Equal(tokens[10], ",")
695 assert.Equal(tokens[11], "daraufhin")
696 assert.Equal(tokens[12], "ging")
697 assert.Equal(tokens[13], "er")
698 assert.Equal(tokens[14], ".")
699 assert.Equal(len(tokens), 15)
Akron03ca4252021-08-11 13:32:53 +0200700
701 // testTokenizerPlusAmpersand
Akronec835ad2021-08-11 18:23:22 +0200702 tokens = ttokenize(dat, w, "&quot;Das ist von C&A!&quot;")
703 assert.Equal(tokens[0], "&quot;")
704 assert.Equal(tokens[1], "Das")
705 assert.Equal(tokens[2], "ist")
706 assert.Equal(tokens[3], "von")
707 assert.Equal(tokens[4], "C&A")
708 assert.Equal(tokens[5], "!")
709 assert.Equal(tokens[6], "&quot;")
710 assert.Equal(len(tokens), 7)
Akron03ca4252021-08-11 13:32:53 +0200711
712 // testTokenizerLongEnd
Akronec835ad2021-08-11 18:23:22 +0200713 tokens = ttokenize(dat, w, "Siehst Du?!!?")
Akron03ca4252021-08-11 13:32:53 +0200714 assert.Equal(tokens[0], "Siehst")
715 assert.Equal(tokens[1], "Du")
716 assert.Equal(tokens[2], "?!!?")
717 assert.Equal(len(tokens), 3)
718
719 // testTokenizerIrishO
Akronec835ad2021-08-11 18:23:22 +0200720 tokens = ttokenize(dat, w, "Peter O'Toole")
Akron03ca4252021-08-11 13:32:53 +0200721 assert.Equal(tokens[0], "Peter")
722 assert.Equal(tokens[1], "O'Toole")
723 assert.Equal(len(tokens), 2)
724
725 // testTokenizerAbr
Akronec835ad2021-08-11 18:23:22 +0200726 tokens = ttokenize(dat, w, "Früher bzw. später ...")
Akron03ca4252021-08-11 13:32:53 +0200727 assert.Equal(tokens[0], "Früher")
728 assert.Equal(tokens[1], "bzw.")
729 assert.Equal(tokens[2], "später")
730 assert.Equal(tokens[3], "...")
731 assert.Equal(len(tokens), 4)
732
733 // testTokenizerUppercaseRule
Akronec835ad2021-08-11 18:23:22 +0200734 tokens = ttokenize(dat, w, "Es war spät.Morgen ist es früh.")
Akron03ca4252021-08-11 13:32:53 +0200735 assert.Equal(tokens[0], "Es")
736 assert.Equal(tokens[1], "war")
737 assert.Equal(tokens[2], "spät")
738 assert.Equal(tokens[3], ".")
739 assert.Equal(tokens[4], "Morgen")
740 assert.Equal(tokens[5], "ist")
741 assert.Equal(tokens[6], "es")
742 assert.Equal(tokens[7], "früh")
743 assert.Equal(tokens[8], ".")
744 assert.Equal(len(tokens), 9)
745 // Ignored in KorAP-Tokenizer
746
747 // testTokenizerOrd
Akronec835ad2021-08-11 18:23:22 +0200748 tokens = ttokenize(dat, w, "Sie erreichte den 1. Platz!")
Akrona0bded52021-08-11 15:48:02 +0200749 assert.Equal(tokens[0], "Sie")
750 assert.Equal(tokens[1], "erreichte")
751 assert.Equal(tokens[2], "den")
752 assert.Equal(tokens[3], "1.")
753 assert.Equal(tokens[4], "Platz")
754 assert.Equal(tokens[5], "!")
755 assert.Equal(len(tokens), 6)
Akron03ca4252021-08-11 13:32:53 +0200756
757 // testNoZipOuputArchive
Akronec835ad2021-08-11 18:23:22 +0200758 tokens = ttokenize(dat, w, "Archive: Ich bin kein zip\n")
Akron03ca4252021-08-11 13:32:53 +0200759 assert.Equal(tokens[0], "Archive")
760 assert.Equal(tokens[1], ":")
761 assert.Equal(tokens[2], "Ich")
762 assert.Equal(tokens[3], "bin")
763 assert.Equal(tokens[4], "kein")
764 assert.Equal(tokens[5], "zip")
765 assert.Equal(6, len(tokens))
766
767 // testTokenizerStrasse
Akronec835ad2021-08-11 18:23:22 +0200768 tokens = ttokenize(dat, w, "Ich wohne in der Weststr. und Du?")
Akron4af79f12021-08-11 14:48:17 +0200769 assert.Equal(tokens[4], "Weststr.")
770 assert.Equal(8, len(tokens))
Akron03ca4252021-08-11 13:32:53 +0200771
772 // germanTokenizerKnowsGermanOmissionWords
Akronec835ad2021-08-11 18:23:22 +0200773 tokens = ttokenize(dat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
Akron03ca4252021-08-11 13:32:53 +0200774 assert.Equal("D'dorf", tokens[0])
775 assert.Equal("Ku'damm", tokens[1])
776 assert.Equal("Lu'hafen", tokens[2])
777 assert.Equal("M'gladbach", tokens[3])
778 assert.Equal("W'schaft", tokens[4])
779 assert.Equal(5, len(tokens))
780
781 // germanTokenizerDoesNOTSeparateGermanContractions
Akronec835ad2021-08-11 18:23:22 +0200782 tokens = ttokenize(dat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
Akron03ca4252021-08-11 13:32:53 +0200783 assert.Equal("mach's", tokens[0])
784 assert.Equal("macht's", tokens[1])
785 assert.Equal("was'n", tokens[2])
786 assert.Equal("ist's", tokens[3])
787 assert.Equal("haste", tokens[4])
788 assert.Equal("willste", tokens[5])
789 assert.Equal("kannste", tokens[6])
790 assert.Equal("biste", tokens[7])
791 assert.Equal("kriegste", tokens[8])
792 assert.Equal(9, len(tokens))
793
Akrond8d88952026-02-04 09:02:09 +0100794 // Regression test for hyphenated abbreviations from Wiktionary (2024-12)
795 tokens = ttokenize(dat, w, "Ich wohne in Ba.-Wü. und bin Dipl.-Ing. bei Reg.-Bez. Karlsruhe.")
796 assert.Equal("Ich", tokens[0])
797 assert.Equal("wohne", tokens[1])
798 assert.Equal("in", tokens[2])
799 assert.Equal("Ba.-Wü.", tokens[3])
800 assert.Equal("und", tokens[4])
801 assert.Equal("bin", tokens[5])
802 assert.Equal("Dipl.-Ing.", tokens[6])
803 assert.Equal("bei", tokens[7])
804 assert.Equal("Reg.-Bez.", tokens[8])
805 assert.Equal("Karlsruhe", tokens[9])
806 assert.Equal(".", tokens[10])
807 assert.Equal(11, len(tokens));
808
Akrona2f952f2026-02-04 09:51:51 +0100809 // Regression test for https://github.com/KorAP/KorAP-Tokenizer/issues/131
810 tokens = ttokenize(dat, w, "Donau\u00ADdampf\u00ADschiff")
811 assert.Equal("Donau\u00ADdampf\u00ADschiff", tokens[0])
812 assert.Equal(1, len(tokens));
813
814 // Regression test for https://github.com/KorAP/KorAP-Tokenizer/issues/115
815 tokens = ttokenize(dat, w, "Die Serb*innen wie die Kosovo-Albaner*innen")
816 assert.Equal("Die", tokens[0]);
817 assert.Equal("Serb*innen", tokens[1]);
818 assert.Equal("wie", tokens[2]);
819 assert.Equal("die", tokens[3]);
820 assert.Equal("Kosovo-Albaner*innen", tokens[4]);
821 assert.Equal(5, len(tokens));
822
823 // Test Wikipedia emoji template from the issue
824 tokens = ttokenize(dat, w, "Ein Smiley [_EMOJI:{{S|;)}}_] hier")
825 assert.Equal("Ein", tokens[0]);
826 assert.Equal("Smiley", tokens[1]);
827 assert.Equal("[_EMOJI:{{S|;)}}_]", tokens[2]); // Should be one token
828 assert.Equal("hier", tokens[3]);
829 assert.Equal(4, len(tokens));
830
831 // Test simple pragma still works
832 tokens = ttokenize(dat, w, "Name: [_ANONYMIZED_] Ende")
833 assert.Equal("Name", tokens[0]);
834 assert.Equal(":", tokens[1]);
835 assert.Equal("[_ANONYMIZED_]", tokens[2]); // Should be one token
836 assert.Equal("Ende", tokens[3]);
837 assert.Equal(4, len(tokens));
838
839 /*
840 DeReKo-Behaviour
841 tokens = ttokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
842 assert.Equal("'ve", tokens[1]);
843 assert.Equal("'ll", tokens[3]);
844 assert.Equal("'d", tokens[5]);
845 assert.Equal("'m", tokens[7]);
846 assert.Equal("'re", tokens[9]);
847 assert.Equal("'s", tokens[11]);
848 assert.Equal("is", tokens[12]);
849 assert.Equal("n't", tokens[13]);
850 assert.Equal(14, len(tokens));
851
852
853 assert.Equal(tokens[0], "Der")
854 assert.Equal(tokens[1], "alte")
855 assert.Equal(tokens[2], "Mann")
856 assert.Equal(len(tokens), 3)
Akrond8d88952026-02-04 09:02:09 +0100857
Akron03ca4252021-08-11 13:32:53 +0200858 /*
859 @Test
860 public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
861 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
Akron03ca4252021-08-11 13:32:53 +0200862 }
863
864 @Test
865 public void frenchTokenizerKnowsFrenchAbbreviations () {
866 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
867 tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
868 assert.Equal("Approx.", tokens[0]);
869 assert.Equal("juill.", tokens[2]);
870 assert.Equal("prof.", tokens[5]);
871 assert.Equal("exerc.", tokens[15]);
872 assert.Equal("no.", tokens[16]);
873 assert.Equal("pp.", tokens[21]);
874 }
875
876 @Test
877 public void frenchTokenizerKnowsFrenchContractions () {
878 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
879 tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
880 assert.Equal("J'", tokens[0]);
881 assert.Equal("j'", tokens[2]);
882 assert.Equal("qu'", tokens[4]);
883 assert.Equal("d'", tokens[6]);
884 assert.Equal("jusqu'", tokens[8]);
885 assert.Equal("Aujourd'hui", tokens[10]);
886 assert.Equal("D'", tokens[11]); // ’
887 assert.Equal("Quelqu'un", tokens[13]); // ’
888 assert.Equal("Presqu'île", tokens[14]); // ’
889 }
890
891 @Test
892 public void frenchTokenizerKnowsFrenchClitics () {
893 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
894 tokens = tokenize(dat, w, "suis-je sont-elles ")
895 assert.Equal("suis", tokens[0]);
896 assert.Equal("-je", tokens[1]);
897 assert.Equal("sont", tokens[2]);
898 assert.Equal("-elles", tokens[3]);
899 }
900
901 @Test
902 public void testEnglishTokenizerScienceAbbreviations () {
903 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
904 tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
905 assert.Equal("Approx.", tokens[0]);
906 assert.Equal("in", tokens[1]);
907 assert.Equal("Sept.", tokens[2]);
908 assert.Equal("1954", tokens[3]);
909 assert.Equal(",", tokens[4]);
910 assert.Equal("Assoc.", tokens[5]);
911 assert.Equal("Prof.", tokens[6]);
912 assert.Equal("Dr.", tokens[7]);
913 assert.Equal("R.", tokens[8]);
914 assert.Equal("J.", tokens[9]);
915 assert.Equal("Ewing", tokens[10]);
916 assert.Equal("reviewed", tokens[11]);
917 assert.Equal("articles", tokens[12]);
918 assert.Equal("on", tokens[13]);
919 assert.Equal("Enzymol.", tokens[14]);
920 assert.Equal("Bacteriol.", tokens[15]);
921 assert.Equal("effects", tokens[16]);
922 assert.Equal("later", tokens[17]);
923 assert.Equal("published", tokens[18]);
924 assert.Equal("in", tokens[19]);
925 assert.Equal("Nutr.", tokens[20]);
926 assert.Equal("Rheumatol.", tokens[21]);
927 assert.Equal("No.", tokens[22]);
928 assert.Equal("12", tokens[23]);
929 assert.Equal("and", tokens[24]);
930 assert.Equal("Nº.", tokens[25]);
931 assert.Equal("13.", tokens[26]);
932 assert.Equal(",", tokens[27]);
933 assert.Equal("pp.", tokens[28]);
934 assert.Equal("17-18", tokens[29]);
935 assert.Equal(".", tokens[30]);
936 }
937
938 @Test
939 public void englishTokenizerCanGuessWhetherIIsAbbrev () {
940 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
941 tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
942 assert.Equal("I.", tokens[1]);
943 assert.Equal("I", tokens[8]);
944 assert.Equal(".", tokens[9]);
945 assert.Equal("I", tokens[12]);
946 assert.Equal(".", tokens[13]);
947 }
948
949 @Test
950 public void testZipOuputArchive () {
951
952 final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
953 System.setOut(new PrintStream(clearOut));
954 tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
955 assert.Equal(0, len(tokens));
956 }
957 */
958 /*
959
960 @Test
961 public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
962 DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
963 .tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
964 .printOffsets(true)
965 .build();
966 Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
967 assert.Equal("Text1", tokens[0].getType());
968 assert.Equal(len(tokens), 9 );
969 }
970 */
971}
Akronbd406802021-08-11 18:39:13 +0200972
Akrondf275812022-03-27 12:54:46 +0200973func TestDoubleArrayFullTokenizerSentenceSplitterBug1(t *testing.T) {
974 assert := assert.New(t)
975
976 if dat == nil {
Akron0139bc52023-08-31 16:35:58 +0200977 dat = LoadDatokFile("testdata/tokenizer_de.datok")
Akrondf275812022-03-27 12:54:46 +0200978 }
979
980 b := make([]byte, 0, 2048)
981 w := bytes.NewBuffer(b)
982 var sentences []string
983
984 text := `Wüllersdorf war aufgestanden. »Ich finde es furchtbar, daß Sie recht haben, aber Sie haben recht. Ich quäle Sie nicht länger mit meinem 'Muß es sein?'. Die Welt ist einmal, wie sie ist, und die Dinge verlaufen nicht, wie wir wollen, sondern wie die andern wollen. Das mit dem 'Gottesgericht', wie manche hochtrabend versichern, ist freilich ein Unsinn, nichts davon, umgekehrt, unser Ehrenkultus ist ein Götzendienst, aber wir müssen uns ihm unterwerfen, solange der Götze gilt.«`
985
986 w.Reset()
987 assert.True(dat.Transduce(strings.NewReader(text), w))
988 sentences = strings.Split(w.String(), "\n\n")
Akronb4287552022-03-27 14:11:24 +0200989 assert.Equal(len(sentences), 6)
Akrondf275812022-03-27 12:54:46 +0200990 assert.Equal("Wüllersdorf\nwar\naufgestanden\n.", sentences[0])
991 assert.Equal("»\nIch\nfinde\nes\nfurchtbar\n,\ndaß\nSie\nrecht\nhaben\n,\naber\nSie\nhaben\nrecht\n.", sentences[1])
Akronb4287552022-03-27 14:11:24 +0200992 assert.Equal("Ich\nquäle\nSie\nnicht\nlänger\nmit\nmeinem\n'\nMuß\nes\nsein\n?\n'\n.", sentences[2])
993 assert.Equal("Die\nWelt\nist\neinmal\n,\nwie\nsie\nist\n,\nund\ndie\nDinge\nverlaufen\nnicht\n,\nwie\nwir\nwollen\n,\nsondern\nwie\ndie\nandern\nwollen\n.", sentences[3])
994 assert.Equal("Das\nmit\ndem\n'\nGottesgericht\n'\n,\nwie\nmanche\nhochtrabend\nversichern\n,\nist\nfreilich\nein\nUnsinn\n,\nnichts\ndavon\n,\numgekehrt\n,\nunser\nEhrenkultus\nist\nein\nGötzendienst\n,\naber\nwir\nmüssen\nuns\nihm\nunterwerfen\n,\nsolange\nder\nGötze\ngilt\n.\n«", sentences[4])
Akrondf275812022-03-27 12:54:46 +0200995}
996
Akronc9c0eae2021-10-22 19:49:43 +0200997func TestDoubleArrayLoadFactor1(t *testing.T) {
Akron29e306f2021-09-02 18:29:56 +0200998 assert := assert.New(t)
999 tok := LoadFomaFile("testdata/abbr_bench.fst")
1000 dat := tok.ToDoubleArray()
1001 assert.True(dat.LoadFactor() > 88)
1002}
1003
Akronc9c0eae2021-10-22 19:49:43 +02001004func TestDoubleArrayFullTokenizerXML(t *testing.T) {
Akron4c2a1ad2021-08-31 00:35:53 +02001005 assert := assert.New(t)
1006
Akron9fb63af2021-10-28 01:15:53 +02001007 if dat == nil {
Akron0139bc52023-08-31 16:35:58 +02001008 dat = LoadDatokFile("testdata/tokenizer_de.datok")
Akron9fb63af2021-10-28 01:15:53 +02001009 }
1010
Akron4c2a1ad2021-08-31 00:35:53 +02001011 assert.NotNil(dat)
1012
1013 b := make([]byte, 0, 2048)
1014 w := bytes.NewBuffer(b)
1015 var tokens []string
1016
1017 tokens = ttokenize(dat, w, "Das <b>beste</b> Fußballspiel")
1018 assert.Equal("Das", tokens[0])
1019 assert.Equal("<b>", tokens[1])
1020 assert.Equal("beste", tokens[2])
1021 assert.Equal("</b>", tokens[3])
1022 assert.Equal("Fußballspiel", tokens[4])
1023 assert.Equal(5, len(tokens))
1024
1025 tokens = ttokenize(dat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
1026 assert.Equal("Das", tokens[0])
1027 assert.Equal("<b class=\"c\">", tokens[1])
1028 assert.Equal("beste", tokens[2])
1029 assert.Equal("</b>", tokens[3])
1030 assert.Equal("Fußballspiel", tokens[4])
1031 assert.Equal(5, len(tokens))
1032
1033 tokens = ttokenize(dat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
1034 assert.Equal("der", tokens[0])
1035 assert.Equal("<x y=\"alte \">", tokens[1])
1036 assert.Equal("<x x>", tokens[2])
1037 assert.Equal("alte", tokens[3])
1038 assert.Equal("</x>", tokens[4])
1039 assert.Equal("etc.", tokens[5])
1040 assert.Equal("et", tokens[6])
1041 assert.Equal(".", tokens[7])
1042 assert.Equal("Mann", tokens[8])
1043 assert.Equal(".", tokens[9])
1044 assert.Equal(10, len(tokens))
1045}
1046
Akronc9c0eae2021-10-22 19:49:43 +02001047func BenchmarkDoubleArrayTransduce(b *testing.B) {
Akronbd406802021-08-11 18:39:13 +02001048 bu := make([]byte, 0, 2048)
1049 w := bytes.NewBuffer(bu)
1050
1051 s := `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
1052 Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
1053 Der Termin ist am 5.9.2018.
1054 Ich habe die readme.txt heruntergeladen.
1055 Ausschalten!!! Hast Du nicht gehört???
1056 Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
1057 Er sagte: \"Es geht mir gut!\", daraufhin ging er. &quot;Das ist von C&A!&quot; Früher bzw. später ... Sie erreichte den 1. Platz!
1058 Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
1059 Mach's macht's was'n ist's haste willste kannste biste kriegste.`
1060 r := strings.NewReader(s)
1061
Akron0139bc52023-08-31 16:35:58 +02001062 dat := LoadDatokFile("testdata/tokenizer_de.datok")
Akronbd406802021-08-11 18:39:13 +02001063
Akrondf37a552021-09-02 12:16:08 +02001064 b.ResetTimer()
1065
Akronbd406802021-08-11 18:39:13 +02001066 for i := 0; i < b.N; i++ {
1067 w.Reset()
1068 r.Reset(s)
1069 ok := dat.Transduce(r, w)
1070 if !ok {
1071 fmt.Println("Fail!")
1072 fmt.Println(w.String())
1073 os.Exit(1)
1074 }
1075 }
Akronbd406802021-08-11 18:39:13 +02001076}
Akronbb4aac52021-08-13 00:52:27 +02001077
Akron6f1c16c2021-08-17 10:45:42 +02001078// This test is deprecated as the datok file changes over time
1079func XBenchmarkLoadDatokFile(b *testing.B) {
Akronbb4aac52021-08-13 00:52:27 +02001080 for i := 0; i < b.N; i++ {
Akron0139bc52023-08-31 16:35:58 +02001081 dat := LoadDatokFile("testdata/tokenizer_de.datok")
Akronbb4aac52021-08-13 00:52:27 +02001082 if dat == nil {
1083 fmt.Println("Fail!")
1084 os.Exit(1)
1085 }
1086 }
1087}
1088
Akronc9c0eae2021-10-22 19:49:43 +02001089func BenchmarkDoubleArrayConstruction(b *testing.B) {
Akron6f1c16c2021-08-17 10:45:42 +02001090 tok := LoadFomaFile("testdata/simple_bench.fst")
Akrondf37a552021-09-02 12:16:08 +02001091 b.ResetTimer()
Akron6f1c16c2021-08-17 10:45:42 +02001092 for i := 0; i < b.N; i++ {
1093 dat := tok.ToDoubleArray()
1094 if dat == nil {
1095 fmt.Println("Fail!")
1096 os.Exit(1)
1097 }
1098 }
1099}
1100
Akronc9c0eae2021-10-22 19:49:43 +02001101func BenchmarkDoubleArrayLarger(b *testing.B) {
Akron7b1faa62021-09-02 16:10:21 +02001102 tok := LoadFomaFile("testdata/abbr_bench.fst")
1103 b.ResetTimer()
1104 for i := 0; i < b.N; i++ {
1105 dat := tok.ToDoubleArray()
1106 if dat == nil {
1107 fmt.Println("Fail!")
1108 os.Exit(1)
1109 }
1110 }
1111}
1112
Akronbb4aac52021-08-13 00:52:27 +02001113// 2021-08-11 (go 1.16)
1114// go test -bench=. -test.benchmem
1115// BenchmarkTransduce-4 19069 60609 ns/op 11048 B/op 137 allocs/op
Akronf1a16502021-08-16 15:24:38 +02001116// 2021-08-12 (go 1.16)
Akronbb4aac52021-08-13 00:52:27 +02001117// BenchmarkTransduce-4 20833 55241 ns/op 9676 B/op 3 allocs/op
1118// BenchmarkLoadDatokFile-4 4 258418169 ns/op 29916470 B/op 5697 allocs/op
1119// BenchmarkTransduce-4 19430 58133 ns/op 18696 B/op 3 allocs/op
1120// BenchmarkLoadDatokFile-4 8 139071939 ns/op 203158377 B/op 5742 allocs/op
Akronf1a16502021-08-16 15:24:38 +02001121// 2021-08-16
1122// BenchmarkTransduce-4 22251 49989 ns/op 17370 B/op 3 allocs/op
1123// BenchmarkLoadDatokFile-4 8 138937532 ns/op 203158327 B/op 5742 allocs/op
1124// BenchmarkTransduce-4 22005 48665 ns/op 17472 B/op 3 allocs/op
1125// BenchmarkLoadDatokFile-4 7 143143934 ns/op 203158450 B/op 5743 allocs/op
Akronea46e8a2021-08-17 00:36:31 +02001126// BenchmarkTransduce-4 34939 34363 ns/op 14056 B/op 3 allocs/op
1127// BenchmarkLoadDatokFile-4 7 149511609 ns/op 203217193 B/op 5915 allocs/op
Akron6f1c16c2021-08-17 10:45:42 +02001128// 2021-08-17
1129// BenchmarkTransduce-4 31204 32678 ns/op 14752 B/op 3 allocs/op
1130// BenchmarkToDoubleArray-4 44138 26850 ns/op 10704 B/op 29 allocs/op
Akronde18e902021-08-27 09:34:12 +02001131// BenchmarkTransduce-4 29376 34562 ns/op 15157 B/op 3 allocs/op
1132// BenchmarkToDoubleArray-4 54441 21355 ns/op 10704 B/op 29 allocs/op
Akrondf37a552021-09-02 12:16:08 +02001133// 2021-09-02 - New tokenizer - fixed loading
Akron7b1faa62021-09-02 16:10:21 +02001134// BenchmarkTransduce-4 40149 31515 ns/op 8240 B/op 3 allocs/op
1135// BenchmarkToDoubleArray-4 51043 22586 ns/op 10702 B/op 29 allocs/op
1136// BenchmarkToDoubleArrayLarger-4 3 396009639 ns/op 6352293 B/op 2575 allocs/op
1137// BenchmarkTransduce-4 38698 31900 ns/op 8240 B/op 3 allocs/op
1138// BenchmarkToDoubleArray-4 50644 21569 ns/op 11151 B/op 14 allocs/op
1139// BenchmarkToDoubleArrayLarger-4 3 441260766 ns/op 6942336 B/op 30 allocs/op
1140// BenchmarkTransduce-4 39966 30835 ns/op 8240 B/op 3 allocs/op
1141// BenchmarkToDoubleArray-4 50720 24863 ns/op 11091 B/op 46 allocs/op
1142// BenchmarkToDoubleArrayLarger-4 3 432523828 ns/op 6413381 B/op 5122 allocs/op
Akron679b4862021-09-02 16:59:26 +02001143// 2021-09-02 - xCheckSkip() with .9
1144// BenchmarkTransduce-4 36325 38501 ns/op 8240 B/op 3 allocs/op
1145// BenchmarkToDoubleArray-4 66858 19286 ns/op 10607 B/op 29 allocs/op
1146// BenchmarkToDoubleArrayLarger-4 18 67428011 ns/op 6360604 B/op 2578 allocs/op
Akron29e306f2021-09-02 18:29:56 +02001147// 2021-09-02 - xCheckSkipNiu() with .9 and >= 3
1148// BenchmarkTransduce-4 37105 27714 ns/op 8240 B/op 3 allocs/op
1149// BenchmarkToDoubleArray-4 76600 15973 ns/op 10703 B/op 29 allocs/op
1150// BenchmarkToDoubleArrayLarger-4 21 55161934 ns/op 6357889 B/op 2578 allocs/op
Akron28031b72021-10-02 13:07:25 +02001151// 2021-09-30 - Go 1.17.1
1152// BenchmarkTransduce-4 47222 25962 ns/op 8240 B/op 3 allocs/op
1153// BenchmarkToDoubleArray-4 69192 17355 ns/op 10704 B/op 29 allocs/op
1154// BenchmarkToDoubleArrayLarger-4 16 65042885 ns/op 6357794 B/op 2576 allocs/op
1155// BenchmarkTransduceMatrix-4 45404 25156 ns/op 8240 B/op 3 allocs/op
Akron094a4e82021-10-02 18:37:00 +02001156// 2021-10-02
1157// BenchmarkTransduce-4 47676 25398 ns/op 8240 B/op 3 allocs/op
1158// BenchmarkToDoubleArray-4 71919 16083 ns/op 10702 B/op 29 allocs/op
1159// BenchmarkToDoubleArrayLarger-4 16 68012819 ns/op 6357920 B/op 2578 allocs/op
1160// BenchmarkTransduceMatrix-4 51529 23678 ns/op 8240 B/op 3 allocs/op
Akrone396a932021-10-19 01:06:13 +02001161// 2021-10-12 - Introduction of Callbacks in Matrix
1162// BenchmarkTransduce-4 46947 26043 ns/op 8240 B/op 3 allocs/op
1163// BenchmarkToDoubleArray-4 65192 16501 ns/op 10703 B/op 29 allocs/op
1164// BenchmarkToDoubleArrayLarger-4 15 69263576 ns/op 6357859 B/op 2577 allocs/op
1165// BenchmarkTransduceMatrix-4 49928 26313 ns/op 12408 B/op 6 allocs/op
1166// 2021-10-18 - Introduction of Callbacks in DA
1167// BenchmarkTransduce-4 41055 30058 ns/op 12408 B/op 6 allocs/op
1168// BenchmarkToDoubleArray-4 64672 17659 ns/op 10703 B/op 29 allocs/op
1169// BenchmarkToDoubleArrayLarger-4 15 71640553 ns/op 6357865 B/op 2577 allocs/op
1170// BenchmarkTransduceMatrix-4 47036 26009 ns/op 12408 B/op 6 allocs/op
Akrona854faa2021-10-22 19:31:08 +02001171// 2021-10-21 - Simplify DA code to ignore final states
1172// BenchmarkTransduce-4 41365 33766 ns/op 12408 B/op 6 allocs/op
1173// BenchmarkToDoubleArray-4 63663 17675 ns/op 10703 B/op 29 allocs/op
1174// BenchmarkToDoubleArrayLarger-4 16 83535733 ns/op 6357874 B/op 2577 allocs/op
1175// BenchmarkTransduceMatrix-4 45362 25258 ns/op 12408 B/op 6 allocs/op
Akron98fbfef2021-10-23 17:02:11 +02001176// 2021-10-22 - Introduce EOT
Akronc9c0eae2021-10-22 19:49:43 +02001177// BenchmarkDoubleArrayTransduce-4 43820 27661 ns/op 12408 B/op 6 allocs/op
1178// BenchmarkDoubleArrayConstruction-4 68259 16608 ns/op 10703 B/op 29 allocs/op
1179// BenchmarkDoubleArrayLarger-4 16 69889532 ns/op 6357901 B/op 2578 allocs/op
1180// BenchmarkMatrixTransduce-4 49426 25105 ns/op 12408 B/op 6 allocs/op
Akron98fbfef2021-10-23 17:02:11 +02001181// 2021-10-23 - Improve offset handling
1182// BenchmarkDoubleArrayTransduce-4 41890 29729 ns/op 12408 B/op 6 allocs/op
1183// BenchmarkDoubleArrayConstruction-4 74510 15879 ns/op 10703 B/op 29 allocs/op
1184// BenchmarkDoubleArrayLarger-4 18 73752383 ns/op 6357956 B/op 2579 allocs/op
1185// BenchmarkMatrixTransduce-4 46870 27140 ns/op 12408 B/op 6 allocs/op
Akron04335c62021-10-28 11:56:00 +02001186// 2021-10-28 - Finalize feature compatibility with KorAP-Tokenizer
1187// BenchmarkDoubleArrayTransduce-4 39130 31612 ns/op 28944 B/op 16 allocs/op
1188// BenchmarkDoubleArrayConstruction-4 79302 14994 ns/op 10703 B/op 29 allocs/op
1189// BenchmarkDoubleArrayLarger-4 18 67942077 ns/op 6357870 B/op 2577 allocs/op
1190// BenchmarkMatrixTransduce-4 39536 30510 ns/op 28944 B/op 16 allocs/op
Akron289414f2021-11-09 19:56:42 +01001191// 2021-11-09 - go 1.17.3
1192// BenchmarkDoubleArrayTransduce-4 35067 34192 ns/op 28944 B/op 17 allocs/op
1193// BenchmarkDoubleArrayConstruction-4 72446 15614 ns/op 10703 B/op 29 allocs/op
1194// BenchmarkDoubleArrayLarger-4 16 71058822 ns/op 6357860 B/op 2577 allocs/op
1195// BenchmarkMatrixTransduce-4 36703 31891 ns/op 28944 B/op 17 allocs/op
Akronfac8abc2021-11-10 07:19:59 +01001196// 2021-11-10 - rearranged longest match operator
Akron4880fb62021-12-05 12:03:05 +01001197// BenchmarkDoubleArrayTransduce-4 34522 33210 ns/op 28944 B/op 17 allocs/op
1198// BenchmarkDoubleArrayConstruction-4 66990 16012 ns/op 10703 B/op 29 allocs/op
1199// BenchmarkDoubleArrayLarger-4 16 62829878 ns/op 6357823 B/op 2576 allocs/op
1200// BenchmarkMatrixTransduce-4 36154 32702 ns/op 28944 B/op 17 allocs/op
1201// 2021-12-04 - optimize identity branch
1202// BenchmarkDoubleArrayTransduce-4 34903 32255 ns/op 28944 B/op 17 allocs/op
1203// BenchmarkDoubleArrayConstruction-4 79394 14561 ns/op 10703 B/op 29 allocs/op
1204// BenchmarkDoubleArrayLarger-4 19 60257675 ns/op 6357911 B/op 2577 allocs/op
1205// BenchmarkMatrixTransduce-4 35076 30581 ns/op 28944 B/op 17 allocs/op
Akron00cecd12021-12-05 13:14:03 +01001206// 2021-12-05 - init identity for sigma < 256
1207// BenchmarkDoubleArrayTransduce-4 35284 31918 ns/op 28944 B/op 17 allocs/op
1208// BenchmarkDoubleArrayConstruction-4 80342 14504 ns/op 10703 B/op 29 allocs/op
1209// BenchmarkDoubleArrayLarger-4 19 60343253 ns/op 6357789 B/op 2575 allocs/op
1210// BenchmarkMatrixTransduce-4 34029 30238 ns/op 28944 B/op 17 allocs/op