blob: 86dd428d5d31f25142751d8a0972c7a0dd016ea8 [file] [log] [blame]
Akron7f1097f2021-09-21 16:00:29 +02001package datok
Akron8ef408b2021-08-02 22:11:04 +02002
3import (
Akron6247a5d2021-08-03 19:18:28 +02004 "bytes"
Akronbd406802021-08-11 18:39:13 +02005 "fmt"
6 "os"
Akron03ca4252021-08-11 13:32:53 +02007 "regexp"
Akron3f8571a2021-08-05 11:18:10 +02008 "strings"
Akron8ef408b2021-08-02 22:11:04 +02009 "testing"
10
11 "github.com/stretchr/testify/assert"
12)
13
// dat caches the full double-array tokenizer loaded from
// "testdata/tokenizer.datok". It is initialized lazily by the tests
// that need it (each guards with `if dat == nil`) and then shared,
// so the expensive load happens at most once per test run.
var dat *DaTokenizer
15
Akrondf275812022-03-27 12:54:46 +020016func ttokenizeStr(tok Tokenizer, str string) string {
Akronec835ad2021-08-11 18:23:22 +020017 b := make([]byte, 0, 2048)
18 w := bytes.NewBuffer(b)
Akrondf275812022-03-27 12:54:46 +020019 return strings.Join(ttokenize(tok, w, str), "\n")
Akronec835ad2021-08-11 18:23:22 +020020}
21
Akron1c34ce62021-09-23 23:27:39 +020022func ttokenize(tok Tokenizer, w *bytes.Buffer, str string) []string {
Akronec835ad2021-08-11 18:23:22 +020023 w.Reset()
Akron1c34ce62021-09-23 23:27:39 +020024 ok := tok.Transduce(strings.NewReader(str), w)
Akronec835ad2021-08-11 18:23:22 +020025 if !ok {
26 return []string{}
27 }
28 obj := regexp.MustCompile("\n+")
29
30 tokens := obj.Split(w.String(), -1)
31 return tokens[:len(tokens)-1]
32}
33
Akronc9c0eae2021-10-22 19:49:43 +020034func TestDoubleArraySimpleString(t *testing.T) {
Akron8ef408b2021-08-02 22:11:04 +020035 assert := assert.New(t)
Akron8ef408b2021-08-02 22:11:04 +020036 // bau | bauamt
Akron64ffd9a2021-08-03 19:55:21 +020037 tok := LoadFomaFile("testdata/bauamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020038 dat := tok.ToDoubleArray()
Akrondf275812022-03-27 12:54:46 +020039
40 b := make([]byte, 0, 2048)
41 w := bytes.NewBuffer(b)
42 var tokens []string
43
44 tokens = ttokenize(dat, w, "ibauamt")
45 assert.Equal("i", tokens[0])
46 assert.Equal("bauamt", tokens[1])
47
48 tokens = ttokenize(dat, w, "ibbauamt")
49 assert.Equal("i", tokens[0])
50
51 assert.Equal("b", tokens[1])
52 assert.Equal("bauamt", tokens[2])
53
54 tokens = ttokenize(dat, w, "bau")
55 assert.Equal("bau", tokens[0])
56
57 tokens = ttokenize(dat, w, "baum")
58 assert.Equal("bau", tokens[0])
59 assert.Equal("m", tokens[1])
60
61 tokens = ttokenize(dat, w, "baudibauamt")
62 assert.Equal("bau", tokens[0])
63 assert.Equal("d", tokens[1])
64 assert.Equal("i", tokens[2])
65 assert.Equal("bauamt", tokens[3])
Akron8ef408b2021-08-02 22:11:04 +020066}
Akron75ebe7f2021-08-03 10:34:10 +020067
Akronc9c0eae2021-10-22 19:49:43 +020068func TestDoubleArraySimpleBranches(t *testing.T) {
Akron75ebe7f2021-08-03 10:34:10 +020069 assert := assert.New(t)
Akron75ebe7f2021-08-03 10:34:10 +020070 // (bau | wahl) (amt | en)
Akron64ffd9a2021-08-03 19:55:21 +020071 tok := LoadFomaFile("testdata/wahlamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020072 dat := tok.ToDoubleArray()
Akrondf275812022-03-27 12:54:46 +020073
74 b := make([]byte, 0, 2048)
75 w := bytes.NewBuffer(b)
76 var tokens []string
77
78 tokens = ttokenize(dat, w, "bau")
79 assert.Equal("bau", tokens[0])
80
81 tokens = ttokenize(dat, w, "bauamt")
82 assert.Equal("bauamt", tokens[0])
83
84 tokens = ttokenize(dat, w, "wahlamt")
85 assert.Equal("wahlamt", tokens[0])
86
87 tokens = ttokenize(dat, w, "bauen")
88 assert.Equal("bauen", tokens[0])
89
90 tokens = ttokenize(dat, w, "wahlen")
91 assert.Equal("wahlen", tokens[0])
92
93 tokens = ttokenize(dat, w, "baum")
94 assert.Equal("bau", tokens[0])
95 assert.Equal("m", tokens[1])
Akron75ebe7f2021-08-03 10:34:10 +020096}
Akron730a79c2021-08-03 11:05:29 +020097
Akrondf275812022-03-27 12:54:46 +020098func TestDoubleArraySimpleTokenizer(t *testing.T) {
Akron730a79c2021-08-03 11:05:29 +020099 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +0200100 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +0200101 dat := tok.ToDoubleArray()
Akrondf275812022-03-27 12:54:46 +0200102 assert.Equal(ttokenizeStr(dat, "bau"), "bau")
103 assert.Equal(ttokenizeStr(dat, "bad"), "bad")
104 assert.Equal(ttokenizeStr(dat, "wald gehen"), "wald\ngehen")
Akron730a79c2021-08-03 11:05:29 +0200105}
Akron740f3d72021-08-03 12:12:34 +0200106
Akronc9c0eae2021-10-22 19:49:43 +0200107func TestDoubleArraySimpleTokenizerTransduce(t *testing.T) {
Akron84d68e62021-08-04 17:06:52 +0200108 assert := assert.New(t)
109 tok := LoadFomaFile("testdata/simpletok.fst")
Akron84d68e62021-08-04 17:06:52 +0200110 dat := tok.ToDoubleArray()
Akron3f8571a2021-08-05 11:18:10 +0200111
112 r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
113 b := make([]byte, 0, 2048)
114 w := bytes.NewBuffer(b)
Akron524c5432021-08-05 14:14:27 +0200115 var tokens []string
Akron524c5432021-08-05 14:14:27 +0200116 dat.Transduce(r, w)
117 tokens = strings.Split(w.String(), "\n")
Akrona854faa2021-10-22 19:31:08 +0200118 assert.Equal(len(tokens), 11)
Akron3f8571a2021-08-05 11:18:10 +0200119 assert.Equal("wald", tokens[0])
120 assert.Equal("gehen", tokens[1])
121 assert.Equal("Da", tokens[2])
122 assert.Equal("kann", tokens[3])
123 assert.Equal("man", tokens[4])
124 assert.Equal("was", tokens[5])
125 assert.Equal("\"erleben\"", tokens[6])
126
Akron524c5432021-08-05 14:14:27 +0200127 r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
128 w.Reset()
129 dat.Transduce(r, w)
130 tokens = strings.Split(w.String(), "\n")
131 assert.Equal("In", tokens[0])
132 assert.Equal("den", tokens[1])
133 assert.Equal("Wald", tokens[2])
134 assert.Equal("gehen", tokens[3])
135 assert.Equal("?", tokens[4])
136 assert.Equal("--", tokens[5])
Akron3f8571a2021-08-05 11:18:10 +0200137
Akron524c5432021-08-05 14:14:27 +0200138 r = strings.NewReader(" g? -- D")
139 w.Reset()
140 dat.Transduce(r, w)
141 tokens = strings.Split(w.String(), "\n")
142 assert.Equal("g", tokens[0])
143 assert.Equal("?", tokens[1])
144 assert.Equal("--", tokens[2])
145 assert.Equal("D", tokens[3])
146 assert.Equal("", tokens[4])
Akron6e70dc82021-08-11 11:33:18 +0200147 assert.Equal("", tokens[5])
Akrona854faa2021-10-22 19:31:08 +0200148 assert.Equal(7, len(tokens))
Akron068874c2021-08-04 15:19:56 +0200149}
150
Akronc9c0eae2021-10-22 19:49:43 +0200151func TestDoubleArrayReadWriteTokenizer(t *testing.T) {
Akron740f3d72021-08-03 12:12:34 +0200152 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +0200153 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +0200154 dat := tok.ToDoubleArray()
Akrondf275812022-03-27 12:54:46 +0200155 assert.Equal(ttokenizeStr(dat, "bau"), "bau")
156 assert.Equal(ttokenizeStr(dat, "bad"), "bad")
157 assert.Equal(ttokenizeStr(dat, "wald gehen"), "wald\ngehen")
Akron6247a5d2021-08-03 19:18:28 +0200158
Akron3f8571a2021-08-05 11:18:10 +0200159 b := make([]byte, 0, 1024)
Akron6247a5d2021-08-03 19:18:28 +0200160 buf := bytes.NewBuffer(b)
161 n, err := dat.WriteTo(buf)
162 assert.Nil(err)
Akron29e306f2021-09-02 18:29:56 +0200163 assert.Equal(int64(296), n)
Akron3f8571a2021-08-05 11:18:10 +0200164
165 dat2 := ParseDatok(buf)
166 assert.NotNil(dat2)
167 assert.Equal(dat.array, dat2.array)
168 assert.Equal(dat.sigma, dat2.sigma)
169 assert.Equal(dat.epsilon, dat2.epsilon)
170 assert.Equal(dat.unknown, dat2.unknown)
171 assert.Equal(dat.identity, dat2.identity)
172 assert.Equal(dat.final, dat2.final)
173 assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
Akrondf275812022-03-27 12:54:46 +0200174 assert.Equal(ttokenizeStr(dat2, "bau"), "bau")
175 assert.Equal(ttokenizeStr(dat2, "bad"), "bad")
176 assert.Equal(ttokenizeStr(dat2, "wald gehen"), "wald\ngehen")
Akron4fa28b32021-08-27 10:55:41 +0200177
Akron92704eb2021-08-27 10:59:46 +0200178 assert.Equal(dat.TransCount(), 17)
179 assert.Equal(dat2.TransCount(), 17)
Akron6247a5d2021-08-03 19:18:28 +0200180}
181
Akronc9c0eae2021-10-22 19:49:43 +0200182func TestDoubleArrayIgnorableMCS(t *testing.T) {
Akrone396a932021-10-19 01:06:13 +0200183
184 // This test relies on final states. That's why it is
185 // not working correctly anymore.
186
Akron31f3c062021-08-27 10:15:13 +0200187 assert := assert.New(t)
188 // File has MCS in sigma but not in net
189 tok := LoadFomaFile("testdata/ignorable_mcs.fst")
190 assert.NotNil(tok)
191 dat := tok.ToDoubleArray()
192 assert.NotNil(dat)
193
194 b := make([]byte, 0, 2048)
195 w := bytes.NewBuffer(b)
196 var tokens []string
197
198 // Is only unambigous when transducing strictly greedy!
Akrone396a932021-10-19 01:06:13 +0200199 assert.True(dat.Transduce(strings.NewReader("ab<ab>a"), w))
Akron31f3c062021-08-27 10:15:13 +0200200 tokens = strings.Split(w.String(), "\n")
Akrona854faa2021-10-22 19:31:08 +0200201 assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
Akron31f3c062021-08-27 10:15:13 +0200202 assert.Equal("a", tokens[0])
203 assert.Equal("b", tokens[1])
Akrone396a932021-10-19 01:06:13 +0200204 assert.Equal("<ab>a", tokens[2])
Akrona854faa2021-10-22 19:31:08 +0200205 assert.Equal(6, len(tokens))
Akron92704eb2021-08-27 10:59:46 +0200206 assert.Equal(dat.TransCount(), 15)
Akron31f3c062021-08-27 10:15:13 +0200207}
208
Akronc9c0eae2021-10-22 19:49:43 +0200209func TestDoubleArrayFullTokenizer(t *testing.T) {
Akron3a063ef2021-08-05 19:36:35 +0200210 assert := assert.New(t)
Akron9fb63af2021-10-28 01:15:53 +0200211
212 if dat == nil {
213 dat = LoadDatokFile("testdata/tokenizer.datok")
214 }
Akron3a063ef2021-08-05 19:36:35 +0200215 assert.NotNil(dat)
216 assert.True(dat.LoadFactor() >= 70)
217 assert.Equal(dat.epsilon, 1)
218 assert.Equal(dat.unknown, 2)
219 assert.Equal(dat.identity, 3)
Akrond0c6e102021-12-09 10:30:29 +0100220 // assert.Equal(dat.final, 142)
221 // assert.Equal(len(dat.sigma), 137)
Akronfac8abc2021-11-10 07:19:59 +0100222 // assert.True(len(dat.array) > 3000000)
223 // assert.True(dat.maxSize > 3000000)
Akrondf275812022-03-27 12:54:46 +0200224 assert.Equal(ttokenizeStr(dat, "bau"), "bau")
225 assert.Equal(ttokenizeStr(dat, "bad"), "bad")
226 assert.Equal(ttokenizeStr(dat, "wald gehen"), "wald\ngehen")
Akron740f3d72021-08-03 12:12:34 +0200227}
Akron3f8571a2021-08-05 11:18:10 +0200228
Akronc9c0eae2021-10-22 19:49:43 +0200229func TestDoubleArrayTokenizerBranch(t *testing.T) {
Akron941f2152021-09-26 15:14:25 +0200230 assert := assert.New(t)
231 tok := LoadTokenizerFile("testdata/simpletok.datok")
232 assert.NotNil(tok)
233 assert.Equal(tok.Type(), "DATOK")
234
235 tok = LoadTokenizerFile("testdata/simpletok.matok")
236 assert.NotNil(tok)
237 assert.Equal(tok.Type(), "MATOK")
238}
239
Akronc9c0eae2021-10-22 19:49:43 +0200240func XTestDoubleArrayFullTokenizerBuild(t *testing.T) {
Akrona0bded52021-08-11 15:48:02 +0200241 assert := assert.New(t)
242 tok := LoadFomaFile("testdata/tokenizer.fst")
243 dat := tok.ToDoubleArray()
Akronde18e902021-08-27 09:34:12 +0200244 assert.NotNil(dat)
245 // n, err := dat.Save("testdata/tokenizer.datok")
246 // assert.Nil(err)
247 // assert.True(n > 500)
Akrona0bded52021-08-11 15:48:02 +0200248}
249
Akronc9c0eae2021-10-22 19:49:43 +0200250func TestDoubleArrayFullTokenizerTransduce(t *testing.T) {
Akron3610f102021-08-08 14:13:25 +0200251 assert := assert.New(t)
252
Akron9fb63af2021-10-28 01:15:53 +0200253 if dat == nil {
254 dat = LoadDatokFile("testdata/tokenizer.datok")
255 }
256
Akron3610f102021-08-08 14:13:25 +0200257 assert.NotNil(dat)
258
Akron3610f102021-08-08 14:13:25 +0200259 b := make([]byte, 0, 2048)
260 w := bytes.NewBuffer(b)
261 var tokens []string
262
Akron03ca4252021-08-11 13:32:53 +0200263 assert.True(dat.Transduce(strings.NewReader("tra. u Du?"), w))
Akron3610f102021-08-08 14:13:25 +0200264
265 tokens = strings.Split(w.String(), "\n")
Akrona854faa2021-10-22 19:31:08 +0200266 assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
Akron3610f102021-08-08 14:13:25 +0200267 assert.Equal("tra", tokens[0])
268 assert.Equal(".", tokens[1])
Akronb4bbb472021-08-09 11:49:38 +0200269 assert.Equal("", tokens[2])
Akronc5d8d432021-08-10 16:48:44 +0200270 assert.Equal("u", tokens[3])
Akronb4bbb472021-08-09 11:49:38 +0200271 assert.Equal("Du", tokens[4])
272 assert.Equal("?", tokens[5])
Akron3610f102021-08-08 14:13:25 +0200273 assert.Equal("", tokens[6])
Akron1594cb82021-08-11 11:14:56 +0200274 assert.Equal("", tokens[7])
Akrona854faa2021-10-22 19:31:08 +0200275 assert.Equal("", tokens[8])
276 assert.Equal(9, len(tokens))
Akron03ca4252021-08-11 13:32:53 +0200277
278 w.Reset()
279 assert.True(dat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
Akrona854faa2021-10-22 19:31:08 +0200280 assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
Akron3f8571a2021-08-05 11:18:10 +0200281}
Akronb7e1f132021-08-10 11:52:31 +0200282
Akronc9c0eae2021-10-22 19:49:43 +0200283func TestDoubleArrayFullTokenizerSentenceSplitter(t *testing.T) {
Akronb7e1f132021-08-10 11:52:31 +0200284 assert := assert.New(t)
Akron9fb63af2021-10-28 01:15:53 +0200285
286 if dat == nil {
287 dat = LoadDatokFile("testdata/tokenizer.datok")
288 }
289
Akronb7e1f132021-08-10 11:52:31 +0200290 assert.NotNil(dat)
291
292 b := make([]byte, 0, 2048)
293 w := bytes.NewBuffer(b)
294 var sentences []string
295
296 // testSentSplitterSimple
Akron1594cb82021-08-11 11:14:56 +0200297 assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
298 sentences = strings.Split(w.String(), "\n\n")
299
Akrona854faa2021-10-22 19:31:08 +0200300 assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
Akron1594cb82021-08-11 11:14:56 +0200301 assert.Equal("Der\nalte\nMann\n.", sentences[0])
Akrona854faa2021-10-22 19:31:08 +0200302 assert.Equal("\n", sentences[1])
303 assert.Equal(2, len(sentences))
Akron1594cb82021-08-11 11:14:56 +0200304
305 w.Reset()
306 assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
307 sentences = strings.Split(w.String(), "\n\n")
Akrona854faa2021-10-22 19:31:08 +0200308 assert.Equal(2, len(sentences))
Akron1594cb82021-08-11 11:14:56 +0200309 assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
Akrona854faa2021-10-22 19:31:08 +0200310 assert.Equal("\n", sentences[1])
Akron1594cb82021-08-11 11:14:56 +0200311
312 w.Reset()
313 assert.True(dat.Transduce(strings.NewReader(""), w))
Akronb7e1f132021-08-10 11:52:31 +0200314 sentences = strings.Split(w.String(), "\n\n")
Akrona854faa2021-10-22 19:31:08 +0200315 assert.Equal(2, len(sentences))
316 assert.Equal("", sentences[0])
Akron1594cb82021-08-11 11:14:56 +0200317
318 w.Reset()
319 assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
320 sentences = strings.Split(w.String(), "\n\n")
321 assert.Equal(len(sentences), 2)
322
323 w.Reset()
324 assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
325 sentences = strings.Split(w.String(), "\n\n")
326 assert.Equal(len(sentences), 2)
327
Akron6e70dc82021-08-11 11:33:18 +0200328 w.Reset()
329 assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
330 sentences = strings.Split(w.String(), "\n\n")
331 assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
Akrona854faa2021-10-22 19:31:08 +0200332 assert.Equal("\n", sentences[1])
333 assert.Equal(2, len(sentences))
Akron1594cb82021-08-11 11:14:56 +0200334
Akron6e70dc82021-08-11 11:33:18 +0200335 w.Reset()
336 assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
337 sentences = strings.Split(w.String(), "\n\n")
Akrona854faa2021-10-22 19:31:08 +0200338 assert.Equal("\n", sentences[1])
339 assert.Equal(2, len(sentences))
Akron1594cb82021-08-11 11:14:56 +0200340
Akron6e70dc82021-08-11 11:33:18 +0200341 w.Reset()
342 assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
343 sentences = strings.Split(w.String(), "\n\n")
344 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200345
Akron6e70dc82021-08-11 11:33:18 +0200346 w.Reset()
347 assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
348 sentences = strings.Split(w.String(), "\n\n")
349 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200350
Akron6e70dc82021-08-11 11:33:18 +0200351 w.Reset()
352 assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
353 sentences = strings.Split(w.String(), "\n\n")
Akrona854faa2021-10-22 19:31:08 +0200354 assert.Equal(2, len(sentences))
Akron6e70dc82021-08-11 11:33:18 +0200355 assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
Akrona854faa2021-10-22 19:31:08 +0200356 assert.Equal("\n", sentences[1])
Akron1594cb82021-08-11 11:14:56 +0200357
Akron6e70dc82021-08-11 11:33:18 +0200358 w.Reset()
359 assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
360 sentences = strings.Split(w.String(), "\n\n")
Akrona854faa2021-10-22 19:31:08 +0200361 assert.Equal(3, len(sentences))
Akron6e70dc82021-08-11 11:33:18 +0200362 assert.Equal("Ausschalten\n!!!", sentences[0])
363 assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
Akrona854faa2021-10-22 19:31:08 +0200364 assert.Equal("\n", sentences[2])
Akron1594cb82021-08-11 11:14:56 +0200365
Akron4af79f12021-08-11 14:48:17 +0200366 w.Reset()
367 assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
368 sentences = strings.Split(w.String(), "\n\n")
369 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200370
371 /*
372 Test:
373 "\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
374 */
Akronb7e1f132021-08-10 11:52:31 +0200375}
Akron03ca4252021-08-11 13:32:53 +0200376
Akronc9c0eae2021-10-22 19:49:43 +0200377func TestDoubleArrayFullTokenizerTokenSplitter(t *testing.T) {
Akron03ca4252021-08-11 13:32:53 +0200378 assert := assert.New(t)
Akron9fb63af2021-10-28 01:15:53 +0200379
380 if dat == nil {
381 dat = LoadDatokFile("testdata/tokenizer.datok")
382 }
383
Akron03ca4252021-08-11 13:32:53 +0200384 assert.NotNil(dat)
385
386 b := make([]byte, 0, 2048)
387 w := bytes.NewBuffer(b)
388 var tokens []string
389
390 // testTokenizerSimple
Akronec835ad2021-08-11 18:23:22 +0200391 tokens = ttokenize(dat, w, "Der alte Mann")
Akron03ca4252021-08-11 13:32:53 +0200392 assert.Equal(tokens[0], "Der")
393 assert.Equal(tokens[1], "alte")
394 assert.Equal(tokens[2], "Mann")
395 assert.Equal(len(tokens), 3)
396
Akronec835ad2021-08-11 18:23:22 +0200397 tokens = ttokenize(dat, w, "Der alte Mann.")
Akron03ca4252021-08-11 13:32:53 +0200398 assert.Equal(tokens[0], "Der")
399 assert.Equal(tokens[1], "alte")
400 assert.Equal(tokens[2], "Mann")
401 assert.Equal(tokens[3], ".")
402 assert.Equal(len(tokens), 4)
403
404 // testTokenizerAbbr
Akronec835ad2021-08-11 18:23:22 +0200405 tokens = ttokenize(dat, w, "Der Vorsitzende der F.D.P. hat gewählt")
Akron03ca4252021-08-11 13:32:53 +0200406 assert.Equal(tokens[0], "Der")
407 assert.Equal(tokens[1], "Vorsitzende")
408 assert.Equal(tokens[2], "der")
409 assert.Equal(tokens[3], "F.D.P.")
410 assert.Equal(tokens[4], "hat")
411 assert.Equal(tokens[5], "gewählt")
412 assert.Equal(len(tokens), 6)
413 // Ignored in KorAP-Tokenizer
414
415 // testTokenizerHost1
Akronec835ad2021-08-11 18:23:22 +0200416 tokens = ttokenize(dat, w, "Gefunden auf wikipedia.org")
Akron03ca4252021-08-11 13:32:53 +0200417 assert.Equal(tokens[0], "Gefunden")
418 assert.Equal(tokens[1], "auf")
419 assert.Equal(tokens[2], "wikipedia.org")
420 assert.Equal(len(tokens), 3)
421
422 // testTokenizerWwwHost
Akronec835ad2021-08-11 18:23:22 +0200423 tokens = ttokenize(dat, w, "Gefunden auf www.wikipedia.org")
Akron03ca4252021-08-11 13:32:53 +0200424 assert.Equal("Gefunden", tokens[0])
425 assert.Equal("auf", tokens[1])
426 assert.Equal("www.wikipedia.org", tokens[2])
427 assert.Equal(3, len(tokens))
428
429 // testTokenizerWwwUrl
Akronec835ad2021-08-11 18:23:22 +0200430 tokens = ttokenize(dat, w, "Weitere Infos unter www.info.biz/info")
Akron03ca4252021-08-11 13:32:53 +0200431 assert.Equal("www.info.biz/info", tokens[3])
432
433 // testTokenizerFtpHost
434 /*
435 tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
436 assert.Equal("Kann", tokens[0])
437 assert.Equal("von", tokens[1])
438 assert.Equal("ftp.download.org", tokens[2])
439 assert.Equal(5, len(tokens))
440 // Ignored in KorAP-Tokenizer
441 */
442
443 // testTokenizerDash
Akronec835ad2021-08-11 18:23:22 +0200444 tokens = ttokenize(dat, w, "Das war -- spitze")
Akron03ca4252021-08-11 13:32:53 +0200445 assert.Equal(tokens[0], "Das")
446 assert.Equal(tokens[1], "war")
447 assert.Equal(tokens[2], "--")
448 assert.Equal(tokens[3], "spitze")
449 assert.Equal(len(tokens), 4)
450
451 // testTokenizerEmail1
Akronec835ad2021-08-11 18:23:22 +0200452 tokens = ttokenize(dat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
Akron03ca4252021-08-11 13:32:53 +0200453 assert.Equal(tokens[0], "Ich")
454 assert.Equal(tokens[1], "bin")
455 assert.Equal(tokens[2], "unter")
456 assert.Equal(tokens[3], "korap@ids-mannheim.de")
457 assert.Equal(tokens[4], "erreichbar")
458 assert.Equal(tokens[5], ".")
459 assert.Equal(len(tokens), 6)
460
461 // testTokenizerEmail2
Akronec835ad2021-08-11 18:23:22 +0200462 tokens = ttokenize(dat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
Akron03ca4252021-08-11 13:32:53 +0200463 assert.Equal(tokens[0], "Oder")
464 assert.Equal(tokens[1], "unter")
465 assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
466 assert.Equal(tokens[3], ".")
467 assert.Equal(len(tokens), 4)
468
469 // testTokenizerEmail3
Akronec835ad2021-08-11 18:23:22 +0200470 tokens = ttokenize(dat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
Akron03ca4252021-08-11 13:32:53 +0200471 assert.Equal(tokens[0], "Oder")
472 assert.Equal(tokens[1], "unter")
473 assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
474 assert.Equal(tokens[3], ".")
475 assert.Equal(len(tokens), 4)
476 // Ignored in KorAP-Tokenizer
477
478 // testTokenizerDoNotAcceptQuotedEmailNames
Akronec835ad2021-08-11 18:23:22 +0200479 tokens = ttokenize(dat, w, "\"John Doe\"@xx.com")
Akron03ca4252021-08-11 13:32:53 +0200480 assert.Equal("\"", tokens[0])
481 assert.Equal("John", tokens[1])
482 assert.Equal("Doe", tokens[2])
483 assert.Equal("\"", tokens[3])
484 assert.Equal("@xx", tokens[4])
485 assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
486 assert.Equal("com", tokens[6])
487 assert.Equal(7, len(tokens))
488
489 // testTokenizerTwitter
Akronec835ad2021-08-11 18:23:22 +0200490 tokens = ttokenize(dat, w, "Folgt @korap und #korap")
Akron03ca4252021-08-11 13:32:53 +0200491 assert.Equal(tokens[0], "Folgt")
492 assert.Equal(tokens[1], "@korap")
493 assert.Equal(tokens[2], "und")
494 assert.Equal(tokens[3], "#korap")
495 assert.Equal(len(tokens), 4)
496
497 // testTokenizerWeb1
Akronec835ad2021-08-11 18:23:22 +0200498 tokens = ttokenize(dat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
Akron03ca4252021-08-11 13:32:53 +0200499 assert.Equal(tokens[0], "Unsere")
500 assert.Equal(tokens[1], "Website")
501 assert.Equal(tokens[2], "ist")
502 assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
503 assert.Equal(len(tokens), 4)
504
505 // testTokenizerWeb2
Akronec835ad2021-08-11 18:23:22 +0200506 tokens = ttokenize(dat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
Akron03ca4252021-08-11 13:32:53 +0200507 assert.Equal(tokens[0], "Wir")
508 assert.Equal(tokens[1], "sind")
509 assert.Equal(tokens[2], "auch")
510 assert.Equal(tokens[3], "im")
511 assert.Equal(tokens[4], "Internet")
512 assert.Equal(tokens[5], "(")
513 assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
514 assert.Equal(tokens[7], ")")
515 assert.Equal(len(tokens), 8)
516 // Ignored in KorAP-Tokenizer
517
518 // testTokenizerWeb3
Akronec835ad2021-08-11 18:23:22 +0200519 tokens = ttokenize(dat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
Akron03ca4252021-08-11 13:32:53 +0200520 assert.Equal(tokens[0], "Die")
521 assert.Equal(tokens[1], "Adresse")
522 assert.Equal(tokens[2], "ist")
523 assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
524 assert.Equal(tokens[4], ".")
525 assert.Equal(len(tokens), 5)
526 // Ignored in KorAP-Tokenizer
527
528 // testTokenizerServer
Akronec835ad2021-08-11 18:23:22 +0200529 tokens = ttokenize(dat, w, "Unser Server ist 10.0.10.51.")
Akron03ca4252021-08-11 13:32:53 +0200530 assert.Equal(tokens[0], "Unser")
531 assert.Equal(tokens[1], "Server")
532 assert.Equal(tokens[2], "ist")
533 assert.Equal(tokens[3], "10.0.10.51")
534 assert.Equal(tokens[4], ".")
535 assert.Equal(len(tokens), 5)
536
537 // testTokenizerNum
Akronec835ad2021-08-11 18:23:22 +0200538 tokens = ttokenize(dat, w, "Zu 50,4% ist es sicher")
Akron03ca4252021-08-11 13:32:53 +0200539 assert.Equal(tokens[0], "Zu")
540 assert.Equal(tokens[1], "50,4%")
541 assert.Equal(tokens[2], "ist")
542 assert.Equal(tokens[3], "es")
543 assert.Equal(tokens[4], "sicher")
544 assert.Equal(len(tokens), 5)
545 // Differs from KorAP-Tokenizer
546
547 // testTokenizerDate
Akronec835ad2021-08-11 18:23:22 +0200548 tokens = ttokenize(dat, w, "Der Termin ist am 5.9.2018")
Akron03ca4252021-08-11 13:32:53 +0200549 assert.Equal(tokens[0], "Der")
550 assert.Equal(tokens[1], "Termin")
551 assert.Equal(tokens[2], "ist")
552 assert.Equal(tokens[3], "am")
553 assert.Equal(tokens[4], "5.9.2018")
554 assert.Equal(len(tokens), 5)
555
Akronec835ad2021-08-11 18:23:22 +0200556 tokens = ttokenize(dat, w, "Der Termin ist am 5/9/2018")
Akron03ca4252021-08-11 13:32:53 +0200557 assert.Equal(tokens[0], "Der")
558 assert.Equal(tokens[1], "Termin")
559 assert.Equal(tokens[2], "ist")
560 assert.Equal(tokens[3], "am")
561 assert.Equal(tokens[4], "5/9/2018")
562 assert.Equal(len(tokens), 5)
563
564 // testTokenizerDateRange
565 /*
566 tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
567 assert.Equal(tokens[0], "Der")
568 assert.Equal(tokens[1], "Termin")
569 assert.Equal(tokens[2], "war")
570 assert.Equal(tokens[3], "vom")
571 assert.Equal(tokens[4], "4.")
572 assert.Equal(tokens[5], "-")
573 assert.Equal(tokens[6], "5.9.2018")
574 assert.Equal(len(tokens), 7)
575 // Ignored in KorAP-Tokenizer
576 */
577
578 // testTokenizerEmoji1
Akronec835ad2021-08-11 18:23:22 +0200579 tokens = ttokenize(dat, w, "Das ist toll! ;)")
Akron03ca4252021-08-11 13:32:53 +0200580 assert.Equal(tokens[0], "Das")
581 assert.Equal(tokens[1], "ist")
582 assert.Equal(tokens[2], "toll")
583 assert.Equal(tokens[3], "!")
584 assert.Equal(tokens[4], ";)")
585 assert.Equal(len(tokens), 5)
586
587 // testTokenizerRef1
Akronec835ad2021-08-11 18:23:22 +0200588 tokens = ttokenize(dat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
Akron03ca4252021-08-11 13:32:53 +0200589 assert.Equal(tokens[0], "Kupietz")
590 assert.Equal(tokens[1], "und")
591 assert.Equal(tokens[2], "Schmidt")
592 assert.Equal(tokens[3], "(2018)")
593 assert.Equal(tokens[4], ":")
594 assert.Equal(tokens[5], "Korpuslinguistik")
595 assert.Equal(len(tokens), 6)
596 // Differs from KorAP-Tokenizer!
597
598 // testTokenizerRef2 () {
Akronec835ad2021-08-11 18:23:22 +0200599 tokens = ttokenize(dat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
Akron03ca4252021-08-11 13:32:53 +0200600 assert.Equal(tokens[0], "Kupietz")
601 assert.Equal(tokens[1], "und")
602 assert.Equal(tokens[2], "Schmidt")
603 assert.Equal(tokens[3], "[2018]")
604 assert.Equal(tokens[4], ":")
605 assert.Equal(tokens[5], "Korpuslinguistik")
606 assert.Equal(len(tokens), 6)
607 // Differs from KorAP-Tokenizer!
608
609 // testTokenizerOmission1 () {
Akronec835ad2021-08-11 18:23:22 +0200610 tokens = ttokenize(dat, w, "Er ist ein A****loch!")
Akron03ca4252021-08-11 13:32:53 +0200611 assert.Equal(tokens[0], "Er")
612 assert.Equal(tokens[1], "ist")
613 assert.Equal(tokens[2], "ein")
614 assert.Equal(tokens[3], "A****loch")
615 assert.Equal(tokens[4], "!")
616 assert.Equal(len(tokens), 5)
617
618 // testTokenizerOmission2
Akronec835ad2021-08-11 18:23:22 +0200619 tokens = ttokenize(dat, w, "F*ck!")
Akron03ca4252021-08-11 13:32:53 +0200620 assert.Equal(tokens[0], "F*ck")
621 assert.Equal(tokens[1], "!")
622 assert.Equal(len(tokens), 2)
623
624 // testTokenizerOmission3 () {
Akronec835ad2021-08-11 18:23:22 +0200625 tokens = ttokenize(dat, w, "Dieses verf***** Kleid!")
Akron03ca4252021-08-11 13:32:53 +0200626 assert.Equal(tokens[0], "Dieses")
627 assert.Equal(tokens[1], "verf*****")
628 assert.Equal(tokens[2], "Kleid")
629 assert.Equal(tokens[3], "!")
630 assert.Equal(len(tokens), 4)
631
632 // Probably interpreted as HOST
633 // testTokenizerFileExtension1
Akronec835ad2021-08-11 18:23:22 +0200634 tokens = ttokenize(dat, w, "Ich habe die readme.txt heruntergeladen")
Akron03ca4252021-08-11 13:32:53 +0200635 assert.Equal(tokens[0], "Ich")
636 assert.Equal(tokens[1], "habe")
637 assert.Equal(tokens[2], "die")
638 assert.Equal(tokens[3], "readme.txt")
639 assert.Equal(tokens[4], "heruntergeladen")
640 assert.Equal(len(tokens), 5)
641
642 // Probably interpreted as HOST
643 // testTokenizerFileExtension2
Akronec835ad2021-08-11 18:23:22 +0200644 tokens = ttokenize(dat, w, "Nimm die README.TXT!")
Akron03ca4252021-08-11 13:32:53 +0200645 assert.Equal(tokens[0], "Nimm")
646 assert.Equal(tokens[1], "die")
647 assert.Equal(tokens[2], "README.TXT")
648 assert.Equal(tokens[3], "!")
649 assert.Equal(len(tokens), 4)
650
651 // Probably interpreted as HOST
652 // testTokenizerFileExtension3
Akronec835ad2021-08-11 18:23:22 +0200653 tokens = ttokenize(dat, w, "Zeig mir profile.jpeg")
Akron03ca4252021-08-11 13:32:53 +0200654 assert.Equal(tokens[0], "Zeig")
655 assert.Equal(tokens[1], "mir")
656 assert.Equal(tokens[2], "profile.jpeg")
657 assert.Equal(len(tokens), 3)
658
659 // testTokenizerFile1
Akron03ca4252021-08-11 13:32:53 +0200660
Akronec835ad2021-08-11 18:23:22 +0200661 tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.docx")
Akrone8837b52021-08-11 17:29:58 +0200662 assert.Equal(tokens[0], "Zeig")
663 assert.Equal(tokens[1], "mir")
664 assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
665 assert.Equal(len(tokens), 3)
Akron03ca4252021-08-11 13:32:53 +0200666
Akrone8837b52021-08-11 17:29:58 +0200667 // testTokenizerFile2
Akronec835ad2021-08-11 18:23:22 +0200668 tokens = ttokenize(dat, w, "Gehe zu /Dokumente/profile.docx")
Akrone8837b52021-08-11 17:29:58 +0200669 assert.Equal(tokens[0], "Gehe")
670 assert.Equal(tokens[1], "zu")
671 assert.Equal(tokens[2], "/Dokumente/profile.docx")
672 assert.Equal(len(tokens), 3)
Akron03ca4252021-08-11 13:32:53 +0200673
Akrone8837b52021-08-11 17:29:58 +0200674 // testTokenizerFile3
Akronec835ad2021-08-11 18:23:22 +0200675 tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
Akrone8837b52021-08-11 17:29:58 +0200676 assert.Equal(tokens[0], "Zeig")
677 assert.Equal(tokens[1], "mir")
678 assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
679 assert.Equal(len(tokens), 3)
680 // Ignored in KorAP-Tokenizer
Akron03ca4252021-08-11 13:32:53 +0200681
Akronfd92d7e2021-08-11 16:31:43 +0200682 // testTokenizerPunct
Akronec835ad2021-08-11 18:23:22 +0200683 tokens = ttokenize(dat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
Akronfd92d7e2021-08-11 16:31:43 +0200684 assert.Equal(tokens[0], "Er")
685 assert.Equal(tokens[1], "sagte")
686 assert.Equal(tokens[2], ":")
687 assert.Equal(tokens[3], "\"")
688 assert.Equal(tokens[4], "Es")
689 assert.Equal(tokens[5], "geht")
690 assert.Equal(tokens[6], "mir")
691 assert.Equal(tokens[7], "gut")
692 assert.Equal(tokens[8], "!")
693 assert.Equal(tokens[9], "\"")
694 assert.Equal(tokens[10], ",")
695 assert.Equal(tokens[11], "daraufhin")
696 assert.Equal(tokens[12], "ging")
697 assert.Equal(tokens[13], "er")
698 assert.Equal(tokens[14], ".")
699 assert.Equal(len(tokens), 15)
Akron03ca4252021-08-11 13:32:53 +0200700
701 // testTokenizerPlusAmpersand
Akronec835ad2021-08-11 18:23:22 +0200702 tokens = ttokenize(dat, w, "&quot;Das ist von C&A!&quot;")
703 assert.Equal(tokens[0], "&quot;")
704 assert.Equal(tokens[1], "Das")
705 assert.Equal(tokens[2], "ist")
706 assert.Equal(tokens[3], "von")
707 assert.Equal(tokens[4], "C&A")
708 assert.Equal(tokens[5], "!")
709 assert.Equal(tokens[6], "&quot;")
710 assert.Equal(len(tokens), 7)
Akron03ca4252021-08-11 13:32:53 +0200711
712 // testTokenizerLongEnd
Akronec835ad2021-08-11 18:23:22 +0200713 tokens = ttokenize(dat, w, "Siehst Du?!!?")
Akron03ca4252021-08-11 13:32:53 +0200714 assert.Equal(tokens[0], "Siehst")
715 assert.Equal(tokens[1], "Du")
716 assert.Equal(tokens[2], "?!!?")
717 assert.Equal(len(tokens), 3)
718
719 // testTokenizerIrishO
Akronec835ad2021-08-11 18:23:22 +0200720 tokens = ttokenize(dat, w, "Peter O'Toole")
Akron03ca4252021-08-11 13:32:53 +0200721 assert.Equal(tokens[0], "Peter")
722 assert.Equal(tokens[1], "O'Toole")
723 assert.Equal(len(tokens), 2)
724
725 // testTokenizerAbr
Akronec835ad2021-08-11 18:23:22 +0200726 tokens = ttokenize(dat, w, "Früher bzw. später ...")
Akron03ca4252021-08-11 13:32:53 +0200727 assert.Equal(tokens[0], "Früher")
728 assert.Equal(tokens[1], "bzw.")
729 assert.Equal(tokens[2], "später")
730 assert.Equal(tokens[3], "...")
731 assert.Equal(len(tokens), 4)
732
733 // testTokenizerUppercaseRule
Akronec835ad2021-08-11 18:23:22 +0200734 tokens = ttokenize(dat, w, "Es war spät.Morgen ist es früh.")
Akron03ca4252021-08-11 13:32:53 +0200735 assert.Equal(tokens[0], "Es")
736 assert.Equal(tokens[1], "war")
737 assert.Equal(tokens[2], "spät")
738 assert.Equal(tokens[3], ".")
739 assert.Equal(tokens[4], "Morgen")
740 assert.Equal(tokens[5], "ist")
741 assert.Equal(tokens[6], "es")
742 assert.Equal(tokens[7], "früh")
743 assert.Equal(tokens[8], ".")
744 assert.Equal(len(tokens), 9)
745 // Ignored in KorAP-Tokenizer
746
747 // testTokenizerOrd
Akronec835ad2021-08-11 18:23:22 +0200748 tokens = ttokenize(dat, w, "Sie erreichte den 1. Platz!")
Akrona0bded52021-08-11 15:48:02 +0200749 assert.Equal(tokens[0], "Sie")
750 assert.Equal(tokens[1], "erreichte")
751 assert.Equal(tokens[2], "den")
752 assert.Equal(tokens[3], "1.")
753 assert.Equal(tokens[4], "Platz")
754 assert.Equal(tokens[5], "!")
755 assert.Equal(len(tokens), 6)
Akron03ca4252021-08-11 13:32:53 +0200756
757 // testNoZipOuputArchive
Akronec835ad2021-08-11 18:23:22 +0200758 tokens = ttokenize(dat, w, "Archive: Ich bin kein zip\n")
Akron03ca4252021-08-11 13:32:53 +0200759 assert.Equal(tokens[0], "Archive")
760 assert.Equal(tokens[1], ":")
761 assert.Equal(tokens[2], "Ich")
762 assert.Equal(tokens[3], "bin")
763 assert.Equal(tokens[4], "kein")
764 assert.Equal(tokens[5], "zip")
765 assert.Equal(6, len(tokens))
766
767 // testTokenizerStrasse
Akronec835ad2021-08-11 18:23:22 +0200768 tokens = ttokenize(dat, w, "Ich wohne in der Weststr. und Du?")
Akron4af79f12021-08-11 14:48:17 +0200769 assert.Equal(tokens[4], "Weststr.")
770 assert.Equal(8, len(tokens))
Akron03ca4252021-08-11 13:32:53 +0200771
772 // germanTokenizerKnowsGermanOmissionWords
Akronec835ad2021-08-11 18:23:22 +0200773 tokens = ttokenize(dat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
Akron03ca4252021-08-11 13:32:53 +0200774 assert.Equal("D'dorf", tokens[0])
775 assert.Equal("Ku'damm", tokens[1])
776 assert.Equal("Lu'hafen", tokens[2])
777 assert.Equal("M'gladbach", tokens[3])
778 assert.Equal("W'schaft", tokens[4])
779 assert.Equal(5, len(tokens))
780
781 // germanTokenizerDoesNOTSeparateGermanContractions
Akronec835ad2021-08-11 18:23:22 +0200782 tokens = ttokenize(dat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
Akron03ca4252021-08-11 13:32:53 +0200783 assert.Equal("mach's", tokens[0])
784 assert.Equal("macht's", tokens[1])
785 assert.Equal("was'n", tokens[2])
786 assert.Equal("ist's", tokens[3])
787 assert.Equal("haste", tokens[4])
788 assert.Equal("willste", tokens[5])
789 assert.Equal("kannste", tokens[6])
790 assert.Equal("biste", tokens[7])
791 assert.Equal("kriegste", tokens[8])
792 assert.Equal(9, len(tokens))
793
794 /*
795 @Test
796 public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
797 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
798 tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
799 assert.Equal("'ve", tokens[1]);
800 assert.Equal("'ll", tokens[3]);
801 assert.Equal("'d", tokens[5]);
802 assert.Equal("'m", tokens[7]);
803 assert.Equal("'re", tokens[9]);
804 assert.Equal("'s", tokens[11]);
805 assert.Equal("is", tokens[12]);
806 assert.Equal("n't", tokens[13]);
807 assert.Equal(14, len(tokens));
808 }
809
810 @Test
811 public void frenchTokenizerKnowsFrenchAbbreviations () {
812 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
813 tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
814 assert.Equal("Approx.", tokens[0]);
815 assert.Equal("juill.", tokens[2]);
816 assert.Equal("prof.", tokens[5]);
817 assert.Equal("exerc.", tokens[15]);
818 assert.Equal("no.", tokens[16]);
819 assert.Equal("pp.", tokens[21]);
820 }
821
822 @Test
823 public void frenchTokenizerKnowsFrenchContractions () {
824 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
825 tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
826 assert.Equal("J'", tokens[0]);
827 assert.Equal("j'", tokens[2]);
828 assert.Equal("qu'", tokens[4]);
829 assert.Equal("d'", tokens[6]);
830 assert.Equal("jusqu'", tokens[8]);
831 assert.Equal("Aujourd'hui", tokens[10]);
832 assert.Equal("D'", tokens[11]); // ’
833 assert.Equal("Quelqu'un", tokens[13]); // ’
834 assert.Equal("Presqu'île", tokens[14]); // ’
835 }
836
837 @Test
838 public void frenchTokenizerKnowsFrenchClitics () {
839 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
840 tokens = tokenize(dat, w, "suis-je sont-elles ")
841 assert.Equal("suis", tokens[0]);
842 assert.Equal("-je", tokens[1]);
843 assert.Equal("sont", tokens[2]);
844 assert.Equal("-elles", tokens[3]);
845 }
846
847 @Test
848 public void testEnglishTokenizerScienceAbbreviations () {
849 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
850 tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
851 assert.Equal("Approx.", tokens[0]);
852 assert.Equal("in", tokens[1]);
853 assert.Equal("Sept.", tokens[2]);
854 assert.Equal("1954", tokens[3]);
855 assert.Equal(",", tokens[4]);
856 assert.Equal("Assoc.", tokens[5]);
857 assert.Equal("Prof.", tokens[6]);
858 assert.Equal("Dr.", tokens[7]);
859 assert.Equal("R.", tokens[8]);
860 assert.Equal("J.", tokens[9]);
861 assert.Equal("Ewing", tokens[10]);
862 assert.Equal("reviewed", tokens[11]);
863 assert.Equal("articles", tokens[12]);
864 assert.Equal("on", tokens[13]);
865 assert.Equal("Enzymol.", tokens[14]);
866 assert.Equal("Bacteriol.", tokens[15]);
867 assert.Equal("effects", tokens[16]);
868 assert.Equal("later", tokens[17]);
869 assert.Equal("published", tokens[18]);
870 assert.Equal("in", tokens[19]);
871 assert.Equal("Nutr.", tokens[20]);
872 assert.Equal("Rheumatol.", tokens[21]);
873 assert.Equal("No.", tokens[22]);
874 assert.Equal("12", tokens[23]);
875 assert.Equal("and", tokens[24]);
876 assert.Equal("Nº.", tokens[25]);
877 assert.Equal("13.", tokens[26]);
878 assert.Equal(",", tokens[27]);
879 assert.Equal("pp.", tokens[28]);
880 assert.Equal("17-18", tokens[29]);
881 assert.Equal(".", tokens[30]);
882 }
883
884 @Test
885 public void englishTokenizerCanGuessWhetherIIsAbbrev () {
886 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
887 tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
888 assert.Equal("I.", tokens[1]);
889 assert.Equal("I", tokens[8]);
890 assert.Equal(".", tokens[9]);
891 assert.Equal("I", tokens[12]);
892 assert.Equal(".", tokens[13]);
893 }
894
895 @Test
896 public void testZipOuputArchive () {
897
898 final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
899 System.setOut(new PrintStream(clearOut));
900 tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
901 assert.Equal(0, len(tokens));
902 }
903 */
904 /*
905
906 @Test
907 public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
908 DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
909 .tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
910 .printOffsets(true)
911 .build();
912 Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
913 assert.Equal("Text1", tokens[0].getType());
914 assert.Equal(len(tokens), 9 );
915 }
916 */
917}
Akronbd406802021-08-11 18:39:13 +0200918
Akrondf275812022-03-27 12:54:46 +0200919func TestDoubleArrayFullTokenizerSentenceSplitterBug1(t *testing.T) {
920 assert := assert.New(t)
921
922 if dat == nil {
923 dat = LoadDatokFile("testdata/tokenizer.datok")
924 }
925
926 b := make([]byte, 0, 2048)
927 w := bytes.NewBuffer(b)
928 var sentences []string
929
930 text := `Wüllersdorf war aufgestanden. »Ich finde es furchtbar, daß Sie recht haben, aber Sie haben recht. Ich quäle Sie nicht länger mit meinem 'Muß es sein?'. Die Welt ist einmal, wie sie ist, und die Dinge verlaufen nicht, wie wir wollen, sondern wie die andern wollen. Das mit dem 'Gottesgericht', wie manche hochtrabend versichern, ist freilich ein Unsinn, nichts davon, umgekehrt, unser Ehrenkultus ist ein Götzendienst, aber wir müssen uns ihm unterwerfen, solange der Götze gilt.«`
931
932 w.Reset()
933 assert.True(dat.Transduce(strings.NewReader(text), w))
934 sentences = strings.Split(w.String(), "\n\n")
935 assert.Equal(len(sentences), 5)
936 assert.Equal("Wüllersdorf\nwar\naufgestanden\n.", sentences[0])
937 assert.Equal("»\nIch\nfinde\nes\nfurchtbar\n,\ndaß\nSie\nrecht\nhaben\n,\naber\nSie\nhaben\nrecht\n.", sentences[1])
938 assert.Equal("Ich\nquäle\nSie\nnicht\nlänger\nmit\nmeinem\n'\nMuß\nes\nsein\n?\n'\n.\n \nDie\nWelt\nist\neinmal\n,\nwie\nsie\nist\n,\nund\ndie\nDinge\nverlaufen\nnicht\n,\nwie\nwir\nwollen\n,\nsondern\nwie\ndie\nandern\nwollen\n.", sentences[2])
939 assert.Equal("Das\nmit\ndem\n'\nGottesgericht\n'\n,\nwie\nmanche\nhochtrabend\nversichern\n,\nist\nfreilich\nein\nUnsinn\n,\nnichts\ndavon\n,\numgekehrt\n,\nunser\nEhrenkultus\nist\nein\nGötzendienst\n,\naber\nwir\nmüssen\nuns\nihm\nunterwerfen\n,\nsolange\nder\nGötze\ngilt\n.\n«", sentences[3])
940}
941
Akronc9c0eae2021-10-22 19:49:43 +0200942func TestDoubleArrayLoadFactor1(t *testing.T) {
Akron29e306f2021-09-02 18:29:56 +0200943 assert := assert.New(t)
944 tok := LoadFomaFile("testdata/abbr_bench.fst")
945 dat := tok.ToDoubleArray()
946 assert.True(dat.LoadFactor() > 88)
947}
948
Akronc9c0eae2021-10-22 19:49:43 +0200949func TestDoubleArrayFullTokenizerXML(t *testing.T) {
Akron4c2a1ad2021-08-31 00:35:53 +0200950 assert := assert.New(t)
951
Akron9fb63af2021-10-28 01:15:53 +0200952 if dat == nil {
953 dat = LoadDatokFile("testdata/tokenizer.datok")
954 }
955
Akron4c2a1ad2021-08-31 00:35:53 +0200956 assert.NotNil(dat)
957
958 b := make([]byte, 0, 2048)
959 w := bytes.NewBuffer(b)
960 var tokens []string
961
962 tokens = ttokenize(dat, w, "Das <b>beste</b> Fußballspiel")
963 assert.Equal("Das", tokens[0])
964 assert.Equal("<b>", tokens[1])
965 assert.Equal("beste", tokens[2])
966 assert.Equal("</b>", tokens[3])
967 assert.Equal("Fußballspiel", tokens[4])
968 assert.Equal(5, len(tokens))
969
970 tokens = ttokenize(dat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
971 assert.Equal("Das", tokens[0])
972 assert.Equal("<b class=\"c\">", tokens[1])
973 assert.Equal("beste", tokens[2])
974 assert.Equal("</b>", tokens[3])
975 assert.Equal("Fußballspiel", tokens[4])
976 assert.Equal(5, len(tokens))
977
978 tokens = ttokenize(dat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
979 assert.Equal("der", tokens[0])
980 assert.Equal("<x y=\"alte \">", tokens[1])
981 assert.Equal("<x x>", tokens[2])
982 assert.Equal("alte", tokens[3])
983 assert.Equal("</x>", tokens[4])
984 assert.Equal("etc.", tokens[5])
985 assert.Equal("et", tokens[6])
986 assert.Equal(".", tokens[7])
987 assert.Equal("Mann", tokens[8])
988 assert.Equal(".", tokens[9])
989 assert.Equal(10, len(tokens))
990}
991
Akronc9c0eae2021-10-22 19:49:43 +0200992func BenchmarkDoubleArrayTransduce(b *testing.B) {
Akronbd406802021-08-11 18:39:13 +0200993 bu := make([]byte, 0, 2048)
994 w := bytes.NewBuffer(bu)
995
996 s := `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
997 Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
998 Der Termin ist am 5.9.2018.
999 Ich habe die readme.txt heruntergeladen.
1000 Ausschalten!!! Hast Du nicht gehört???
1001 Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
1002 Er sagte: \"Es geht mir gut!\", daraufhin ging er. &quot;Das ist von C&A!&quot; Früher bzw. später ... Sie erreichte den 1. Platz!
1003 Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
1004 Mach's macht's was'n ist's haste willste kannste biste kriegste.`
1005 r := strings.NewReader(s)
1006
1007 dat := LoadDatokFile("testdata/tokenizer.datok")
1008
Akrondf37a552021-09-02 12:16:08 +02001009 b.ResetTimer()
1010
Akronbd406802021-08-11 18:39:13 +02001011 for i := 0; i < b.N; i++ {
1012 w.Reset()
1013 r.Reset(s)
1014 ok := dat.Transduce(r, w)
1015 if !ok {
1016 fmt.Println("Fail!")
1017 fmt.Println(w.String())
1018 os.Exit(1)
1019 }
1020 }
Akronbd406802021-08-11 18:39:13 +02001021}
Akronbb4aac52021-08-13 00:52:27 +02001022
Akron6f1c16c2021-08-17 10:45:42 +02001023// This test is deprecated as the datok file changes over time
1024func XBenchmarkLoadDatokFile(b *testing.B) {
Akronbb4aac52021-08-13 00:52:27 +02001025 for i := 0; i < b.N; i++ {
1026 dat := LoadDatokFile("testdata/tokenizer.datok")
1027 if dat == nil {
1028 fmt.Println("Fail!")
1029 os.Exit(1)
1030 }
1031 }
1032}
1033
Akronc9c0eae2021-10-22 19:49:43 +02001034func BenchmarkDoubleArrayConstruction(b *testing.B) {
Akron6f1c16c2021-08-17 10:45:42 +02001035 tok := LoadFomaFile("testdata/simple_bench.fst")
Akrondf37a552021-09-02 12:16:08 +02001036 b.ResetTimer()
Akron6f1c16c2021-08-17 10:45:42 +02001037 for i := 0; i < b.N; i++ {
1038 dat := tok.ToDoubleArray()
1039 if dat == nil {
1040 fmt.Println("Fail!")
1041 os.Exit(1)
1042 }
1043 }
1044}
1045
Akronc9c0eae2021-10-22 19:49:43 +02001046func BenchmarkDoubleArrayLarger(b *testing.B) {
Akron7b1faa62021-09-02 16:10:21 +02001047 tok := LoadFomaFile("testdata/abbr_bench.fst")
1048 b.ResetTimer()
1049 for i := 0; i < b.N; i++ {
1050 dat := tok.ToDoubleArray()
1051 if dat == nil {
1052 fmt.Println("Fail!")
1053 os.Exit(1)
1054 }
1055 }
1056}
1057
Akronbb4aac52021-08-13 00:52:27 +02001058// 2021-08-11 (go 1.16)
1059// go test -bench=. -test.benchmem
1060// BenchmarkTransduce-4 19069 60609 ns/op 11048 B/op 137 allocs/op
Akronf1a16502021-08-16 15:24:38 +02001061// 2021-08-12 (go 1.16)
Akronbb4aac52021-08-13 00:52:27 +02001062// BenchmarkTransduce-4 20833 55241 ns/op 9676 B/op 3 allocs/op
1063// BenchmarkLoadDatokFile-4 4 258418169 ns/op 29916470 B/op 5697 allocs/op
1064// BenchmarkTransduce-4 19430 58133 ns/op 18696 B/op 3 allocs/op
1065// BenchmarkLoadDatokFile-4 8 139071939 ns/op 203158377 B/op 5742 allocs/op
Akronf1a16502021-08-16 15:24:38 +02001066// 2021-08-16
1067// BenchmarkTransduce-4 22251 49989 ns/op 17370 B/op 3 allocs/op
1068// BenchmarkLoadDatokFile-4 8 138937532 ns/op 203158327 B/op 5742 allocs/op
1069// BenchmarkTransduce-4 22005 48665 ns/op 17472 B/op 3 allocs/op
1070// BenchmarkLoadDatokFile-4 7 143143934 ns/op 203158450 B/op 5743 allocs/op
Akronea46e8a2021-08-17 00:36:31 +02001071// BenchmarkTransduce-4 34939 34363 ns/op 14056 B/op 3 allocs/op
1072// BenchmarkLoadDatokFile-4 7 149511609 ns/op 203217193 B/op 5915 allocs/op
Akron6f1c16c2021-08-17 10:45:42 +02001073// 2021-08-17
1074// BenchmarkTransduce-4 31204 32678 ns/op 14752 B/op 3 allocs/op
1075// BenchmarkToDoubleArray-4 44138 26850 ns/op 10704 B/op 29 allocs/op
Akronde18e902021-08-27 09:34:12 +02001076// BenchmarkTransduce-4 29376 34562 ns/op 15157 B/op 3 allocs/op
1077// BenchmarkToDoubleArray-4 54441 21355 ns/op 10704 B/op 29 allocs/op
Akrondf37a552021-09-02 12:16:08 +02001078// 2021-09-02 - New tokenizer - fixed loading
Akron7b1faa62021-09-02 16:10:21 +02001079// BenchmarkTransduce-4 40149 31515 ns/op 8240 B/op 3 allocs/op
1080// BenchmarkToDoubleArray-4 51043 22586 ns/op 10702 B/op 29 allocs/op
1081// BenchmarkToDoubleArrayLarger-4 3 396009639 ns/op 6352293 B/op 2575 allocs/op
1082// BenchmarkTransduce-4 38698 31900 ns/op 8240 B/op 3 allocs/op
1083// BenchmarkToDoubleArray-4 50644 21569 ns/op 11151 B/op 14 allocs/op
1084// BenchmarkToDoubleArrayLarger-4 3 441260766 ns/op 6942336 B/op 30 allocs/op
1085// BenchmarkTransduce-4 39966 30835 ns/op 8240 B/op 3 allocs/op
1086// BenchmarkToDoubleArray-4 50720 24863 ns/op 11091 B/op 46 allocs/op
1087// BenchmarkToDoubleArrayLarger-4 3 432523828 ns/op 6413381 B/op 5122 allocs/op
Akron679b4862021-09-02 16:59:26 +02001088// 2021-09-02 - xCheckSkip() with .9
1089// BenchmarkTransduce-4 36325 38501 ns/op 8240 B/op 3 allocs/op
1090// BenchmarkToDoubleArray-4 66858 19286 ns/op 10607 B/op 29 allocs/op
1091// BenchmarkToDoubleArrayLarger-4 18 67428011 ns/op 6360604 B/op 2578 allocs/op
Akron29e306f2021-09-02 18:29:56 +02001092// 2021-09-02 - xCheckSkipNiu() with .9 and >= 3
1093// BenchmarkTransduce-4 37105 27714 ns/op 8240 B/op 3 allocs/op
1094// BenchmarkToDoubleArray-4 76600 15973 ns/op 10703 B/op 29 allocs/op
1095// BenchmarkToDoubleArrayLarger-4 21 55161934 ns/op 6357889 B/op 2578 allocs/op
Akron28031b72021-10-02 13:07:25 +02001096// 2021-09-30 - Go 1.17.1
1097// BenchmarkTransduce-4 47222 25962 ns/op 8240 B/op 3 allocs/op
1098// BenchmarkToDoubleArray-4 69192 17355 ns/op 10704 B/op 29 allocs/op
1099// BenchmarkToDoubleArrayLarger-4 16 65042885 ns/op 6357794 B/op 2576 allocs/op
1100// BenchmarkTransduceMatrix-4 45404 25156 ns/op 8240 B/op 3 allocs/op
Akron094a4e82021-10-02 18:37:00 +02001101// 2021-10-02
1102// BenchmarkTransduce-4 47676 25398 ns/op 8240 B/op 3 allocs/op
1103// BenchmarkToDoubleArray-4 71919 16083 ns/op 10702 B/op 29 allocs/op
1104// BenchmarkToDoubleArrayLarger-4 16 68012819 ns/op 6357920 B/op 2578 allocs/op
1105// BenchmarkTransduceMatrix-4 51529 23678 ns/op 8240 B/op 3 allocs/op
Akrone396a932021-10-19 01:06:13 +02001106// 2021-10-12 - Introduction of Callbacks in Matrix
1107// BenchmarkTransduce-4 46947 26043 ns/op 8240 B/op 3 allocs/op
1108// BenchmarkToDoubleArray-4 65192 16501 ns/op 10703 B/op 29 allocs/op
1109// BenchmarkToDoubleArrayLarger-4 15 69263576 ns/op 6357859 B/op 2577 allocs/op
1110// BenchmarkTransduceMatrix-4 49928 26313 ns/op 12408 B/op 6 allocs/op
1111// 2021-10-18 - Introduction of Callbacks in DA
1112// BenchmarkTransduce-4 41055 30058 ns/op 12408 B/op 6 allocs/op
1113// BenchmarkToDoubleArray-4 64672 17659 ns/op 10703 B/op 29 allocs/op
1114// BenchmarkToDoubleArrayLarger-4 15 71640553 ns/op 6357865 B/op 2577 allocs/op
1115// BenchmarkTransduceMatrix-4 47036 26009 ns/op 12408 B/op 6 allocs/op
Akrona854faa2021-10-22 19:31:08 +02001116// 2021-10-21 - Simplify DA code to ignore final states
1117// BenchmarkTransduce-4 41365 33766 ns/op 12408 B/op 6 allocs/op
1118// BenchmarkToDoubleArray-4 63663 17675 ns/op 10703 B/op 29 allocs/op
1119// BenchmarkToDoubleArrayLarger-4 16 83535733 ns/op 6357874 B/op 2577 allocs/op
1120// BenchmarkTransduceMatrix-4 45362 25258 ns/op 12408 B/op 6 allocs/op
// 2021-10-22 - Introduce EOT
Akronc9c0eae2021-10-22 19:49:43 +02001122// BenchmarkDoubleArrayTransduce-4 43820 27661 ns/op 12408 B/op 6 allocs/op
1123// BenchmarkDoubleArrayConstruction-4 68259 16608 ns/op 10703 B/op 29 allocs/op
1124// BenchmarkDoubleArrayLarger-4 16 69889532 ns/op 6357901 B/op 2578 allocs/op
1125// BenchmarkMatrixTransduce-4 49426 25105 ns/op 12408 B/op 6 allocs/op
Akron98fbfef2021-10-23 17:02:11 +02001126// 2021-10-23 - Improve offset handling
1127// BenchmarkDoubleArrayTransduce-4 41890 29729 ns/op 12408 B/op 6 allocs/op
1128// BenchmarkDoubleArrayConstruction-4 74510 15879 ns/op 10703 B/op 29 allocs/op
1129// BenchmarkDoubleArrayLarger-4 18 73752383 ns/op 6357956 B/op 2579 allocs/op
1130// BenchmarkMatrixTransduce-4 46870 27140 ns/op 12408 B/op 6 allocs/op
Akron04335c62021-10-28 11:56:00 +02001131// 2021-10-28 - Finalize feature compatibility with KorAP-Tokenizer
1132// BenchmarkDoubleArrayTransduce-4 39130 31612 ns/op 28944 B/op 16 allocs/op
1133// BenchmarkDoubleArrayConstruction-4 79302 14994 ns/op 10703 B/op 29 allocs/op
1134// BenchmarkDoubleArrayLarger-4 18 67942077 ns/op 6357870 B/op 2577 allocs/op
1135// BenchmarkMatrixTransduce-4 39536 30510 ns/op 28944 B/op 16 allocs/op
Akron289414f2021-11-09 19:56:42 +01001136// 2021-11-09 - go 1.17.3
1137// BenchmarkDoubleArrayTransduce-4 35067 34192 ns/op 28944 B/op 17 allocs/op
1138// BenchmarkDoubleArrayConstruction-4 72446 15614 ns/op 10703 B/op 29 allocs/op
1139// BenchmarkDoubleArrayLarger-4 16 71058822 ns/op 6357860 B/op 2577 allocs/op
1140// BenchmarkMatrixTransduce-4 36703 31891 ns/op 28944 B/op 17 allocs/op
Akronfac8abc2021-11-10 07:19:59 +01001141// 2021-11-10 - rearranged longest match operator
Akron4880fb62021-12-05 12:03:05 +01001142// BenchmarkDoubleArrayTransduce-4 34522 33210 ns/op 28944 B/op 17 allocs/op
1143// BenchmarkDoubleArrayConstruction-4 66990 16012 ns/op 10703 B/op 29 allocs/op
1144// BenchmarkDoubleArrayLarger-4 16 62829878 ns/op 6357823 B/op 2576 allocs/op
1145// BenchmarkMatrixTransduce-4 36154 32702 ns/op 28944 B/op 17 allocs/op
1146// 2021-12-04 - optimize identity branch
1147// BenchmarkDoubleArrayTransduce-4 34903 32255 ns/op 28944 B/op 17 allocs/op
1148// BenchmarkDoubleArrayConstruction-4 79394 14561 ns/op 10703 B/op 29 allocs/op
1149// BenchmarkDoubleArrayLarger-4 19 60257675 ns/op 6357911 B/op 2577 allocs/op
1150// BenchmarkMatrixTransduce-4 35076 30581 ns/op 28944 B/op 17 allocs/op
Akron00cecd12021-12-05 13:14:03 +01001151// 2021-12-05 - init identity for sigma < 256
1152// BenchmarkDoubleArrayTransduce-4 35284 31918 ns/op 28944 B/op 17 allocs/op
1153// BenchmarkDoubleArrayConstruction-4 80342 14504 ns/op 10703 B/op 29 allocs/op
1154// BenchmarkDoubleArrayLarger-4 19 60343253 ns/op 6357789 B/op 2575 allocs/op
1155// BenchmarkMatrixTransduce-4 34029 30238 ns/op 28944 B/op 17 allocs/op