blob: 08c7637a7059b638ddb786910578b16a50ecda25 [file] [log] [blame]
Akron7f1097f2021-09-21 16:00:29 +02001package datok
Akron8ef408b2021-08-02 22:11:04 +02002
3import (
Akron2f7f6f32026-02-11 15:12:48 +01004 "bufio"
Akron6247a5d2021-08-03 19:18:28 +02005 "bytes"
Akronbd406802021-08-11 18:39:13 +02006 "fmt"
7 "os"
Akron03ca4252021-08-11 13:32:53 +02008 "regexp"
Akron3f8571a2021-08-05 11:18:10 +02009 "strings"
Akron8ef408b2021-08-02 22:11:04 +020010 "testing"
11
12 "github.com/stretchr/testify/assert"
13)
14
// dat caches the full German tokenizer double array; it is loaded
// lazily by the tests that need it (guarded by `if dat == nil`).
var dat *DaTokenizer
16
Akrondf275812022-03-27 12:54:46 +020017func ttokenizeStr(tok Tokenizer, str string) string {
Akronec835ad2021-08-11 18:23:22 +020018 b := make([]byte, 0, 2048)
19 w := bytes.NewBuffer(b)
Akrondf275812022-03-27 12:54:46 +020020 return strings.Join(ttokenize(tok, w, str), "\n")
Akronec835ad2021-08-11 18:23:22 +020021}
22
Akron1c34ce62021-09-23 23:27:39 +020023func ttokenize(tok Tokenizer, w *bytes.Buffer, str string) []string {
Akronec835ad2021-08-11 18:23:22 +020024 w.Reset()
Akron1c34ce62021-09-23 23:27:39 +020025 ok := tok.Transduce(strings.NewReader(str), w)
Akronec835ad2021-08-11 18:23:22 +020026 if !ok {
27 return []string{}
28 }
29 obj := regexp.MustCompile("\n+")
30
31 tokens := obj.Split(w.String(), -1)
32 return tokens[:len(tokens)-1]
33}
34
Akron2f7f6f32026-02-11 15:12:48 +010035func ttokenLines(t *testing.T, path string) []string {
36 f, err := os.Open(path)
37 if err != nil {
38 t.Fatalf("failed to open %s: %v", path, err)
39 }
40 defer f.Close()
41
42 lines := []string{}
43 scanner := bufio.NewScanner(f)
44 for scanner.Scan() {
45 line := strings.TrimSpace(scanner.Text())
46 if line == "" || strings.HasPrefix(line, "#") {
47 continue
48 }
49 lines = append(lines, line)
50 }
51 if err := scanner.Err(); err != nil {
52 t.Fatalf("failed to read %s: %v", path, err)
53 }
54 return lines
55}
56
Akronc9c0eae2021-10-22 19:49:43 +020057func TestDoubleArraySimpleString(t *testing.T) {
Akron8ef408b2021-08-02 22:11:04 +020058 assert := assert.New(t)
Akron8ef408b2021-08-02 22:11:04 +020059 // bau | bauamt
Akron64ffd9a2021-08-03 19:55:21 +020060 tok := LoadFomaFile("testdata/bauamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020061 dat := tok.ToDoubleArray()
Akrondf275812022-03-27 12:54:46 +020062
63 b := make([]byte, 0, 2048)
64 w := bytes.NewBuffer(b)
65 var tokens []string
66
67 tokens = ttokenize(dat, w, "ibauamt")
68 assert.Equal("i", tokens[0])
69 assert.Equal("bauamt", tokens[1])
70
71 tokens = ttokenize(dat, w, "ibbauamt")
72 assert.Equal("i", tokens[0])
73
74 assert.Equal("b", tokens[1])
75 assert.Equal("bauamt", tokens[2])
76
77 tokens = ttokenize(dat, w, "bau")
78 assert.Equal("bau", tokens[0])
79
80 tokens = ttokenize(dat, w, "baum")
81 assert.Equal("bau", tokens[0])
82 assert.Equal("m", tokens[1])
83
84 tokens = ttokenize(dat, w, "baudibauamt")
85 assert.Equal("bau", tokens[0])
86 assert.Equal("d", tokens[1])
87 assert.Equal("i", tokens[2])
88 assert.Equal("bauamt", tokens[3])
Akron8ef408b2021-08-02 22:11:04 +020089}
Akron75ebe7f2021-08-03 10:34:10 +020090
Akronc9c0eae2021-10-22 19:49:43 +020091func TestDoubleArraySimpleBranches(t *testing.T) {
Akron75ebe7f2021-08-03 10:34:10 +020092 assert := assert.New(t)
Akron75ebe7f2021-08-03 10:34:10 +020093 // (bau | wahl) (amt | en)
Akron64ffd9a2021-08-03 19:55:21 +020094 tok := LoadFomaFile("testdata/wahlamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020095 dat := tok.ToDoubleArray()
Akrondf275812022-03-27 12:54:46 +020096
97 b := make([]byte, 0, 2048)
98 w := bytes.NewBuffer(b)
99 var tokens []string
100
101 tokens = ttokenize(dat, w, "bau")
102 assert.Equal("bau", tokens[0])
103
104 tokens = ttokenize(dat, w, "bauamt")
105 assert.Equal("bauamt", tokens[0])
106
107 tokens = ttokenize(dat, w, "wahlamt")
108 assert.Equal("wahlamt", tokens[0])
109
110 tokens = ttokenize(dat, w, "bauen")
111 assert.Equal("bauen", tokens[0])
112
113 tokens = ttokenize(dat, w, "wahlen")
114 assert.Equal("wahlen", tokens[0])
115
116 tokens = ttokenize(dat, w, "baum")
117 assert.Equal("bau", tokens[0])
118 assert.Equal("m", tokens[1])
Akron75ebe7f2021-08-03 10:34:10 +0200119}
Akron730a79c2021-08-03 11:05:29 +0200120
Akrondf275812022-03-27 12:54:46 +0200121func TestDoubleArraySimpleTokenizer(t *testing.T) {
Akron730a79c2021-08-03 11:05:29 +0200122 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +0200123 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +0200124 dat := tok.ToDoubleArray()
Akrondf275812022-03-27 12:54:46 +0200125 assert.Equal(ttokenizeStr(dat, "bau"), "bau")
126 assert.Equal(ttokenizeStr(dat, "bad"), "bad")
127 assert.Equal(ttokenizeStr(dat, "wald gehen"), "wald\ngehen")
Akron730a79c2021-08-03 11:05:29 +0200128}
Akron740f3d72021-08-03 12:12:34 +0200129
// TestDoubleArraySimpleTokenizerTransduce tests the Transduce method
// of the simple tokenizer: tokens are written newline-separated to w,
// and the output ends with additional empty lines used as boundary
// markers (hence the extra empty strings in the split result).
func TestDoubleArraySimpleTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	// 7 real tokens plus trailing boundary markers
	assert.Equal(len(tokens), 11)
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])

	// Punctuation and dashes become separate tokens
	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	// Short input ending without sentence-final punctuation
	r = strings.NewReader(" g? -- D")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal(7, len(tokens))
}
173
// TestDoubleArrayReadWriteTokenizer serializes a double array
// tokenizer with WriteTo and checks that ParseDatok restores an
// equivalent automaton: identical array, alphabet, special symbols,
// load factor, transition count, and tokenization behavior.
func TestDoubleArrayReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.Equal(ttokenizeStr(dat, "bau"), "bau")
	assert.Equal(ttokenizeStr(dat, "bad"), "bad")
	assert.Equal(ttokenizeStr(dat, "wald gehen"), "wald\ngehen")

	// Serialize the automaton into a buffer
	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := dat.WriteTo(buf)
	assert.Nil(err)
	// Expected serialized size in bytes for this small automaton
	assert.Equal(int64(296), n)

	// Deserialize and compare internals against the original
	dat2 := ParseDatok(buf)
	assert.NotNil(dat2)
	assert.Equal(dat.array, dat2.array)
	assert.Equal(dat.sigma, dat2.sigma)
	assert.Equal(dat.epsilon, dat2.epsilon)
	assert.Equal(dat.unknown, dat2.unknown)
	assert.Equal(dat.identity, dat2.identity)
	assert.Equal(dat.final, dat2.final)
	assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
	assert.Equal(ttokenizeStr(dat2, "bau"), "bau")
	assert.Equal(ttokenizeStr(dat2, "bad"), "bad")
	assert.Equal(ttokenizeStr(dat2, "wald gehen"), "wald\ngehen")

	assert.Equal(dat.TransCount(), 17)
	assert.Equal(dat2.TransCount(), 17)
}
204
// TestDoubleArrayIgnorableMCS checks that a multi-character symbol
// present in sigma but absent from the transition net does not break
// conversion or transduction.
func TestDoubleArrayIgnorableMCS(t *testing.T) {

	// This test relies on final states. That's why it is
	// not working correctly anymore.

	assert := assert.New(t)
	// File has MCS in sigma but not in net
	tok := LoadFomaFile("testdata/ignorable_mcs.fst")
	assert.NotNil(tok)
	dat := tok.ToDoubleArray()
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// Is only unambigous when transducing strictly greedy!
	assert.True(dat.Transduce(strings.NewReader("ab<ab>a"), w))
	tokens = strings.Split(w.String(), "\n")
	// Full output includes trailing boundary markers
	assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
	assert.Equal("a", tokens[0])
	assert.Equal("b", tokens[1])
	assert.Equal("<ab>a", tokens[2])
	assert.Equal(6, len(tokens))
	assert.Equal(dat.TransCount(), 15)
}
231
// TestDoubleArrayFullTokenizer loads (and caches in the package-level
// dat) the full German tokenizer and sanity-checks its load factor,
// special symbol indices, and basic tokenization.
func TestDoubleArrayFullTokenizer(t *testing.T) {
	assert := assert.New(t)

	// Lazily load the shared full tokenizer
	if dat == nil {
		dat = LoadDatokFile("testdata/tokenizer_de.datok")
	}
	assert.NotNil(dat)
	assert.True(dat.LoadFactor() >= 60)
	// Special symbols occupy the first sigma slots
	assert.Equal(dat.epsilon, 1)
	assert.Equal(dat.unknown, 2)
	assert.Equal(dat.identity, 3)
	// assert.Equal(dat.final, 142)
	// assert.Equal(len(dat.sigma), 137)
	// assert.True(len(dat.array) > 3000000)
	// assert.True(dat.maxSize > 3000000)
	assert.Equal(ttokenizeStr(dat, "bau"), "bau")
	assert.Equal(ttokenizeStr(dat, "bad"), "bad")
	assert.Equal(ttokenizeStr(dat, "wald gehen"), "wald\ngehen")
}
Akron3f8571a2021-08-05 11:18:10 +0200251
Akronc9c0eae2021-10-22 19:49:43 +0200252func TestDoubleArrayTokenizerBranch(t *testing.T) {
Akron941f2152021-09-26 15:14:25 +0200253 assert := assert.New(t)
254 tok := LoadTokenizerFile("testdata/simpletok.datok")
255 assert.NotNil(tok)
256 assert.Equal(tok.Type(), "DATOK")
257
258 tok = LoadTokenizerFile("testdata/simpletok.matok")
259 assert.NotNil(tok)
260 assert.Equal(tok.Type(), "MATOK")
261}
262
Akronc9c0eae2021-10-22 19:49:43 +0200263func XTestDoubleArrayFullTokenizerBuild(t *testing.T) {
Akrona0bded52021-08-11 15:48:02 +0200264 assert := assert.New(t)
Akron0139bc52023-08-31 16:35:58 +0200265 tok := LoadFomaFile("testdata/tokenizer_de.fst")
Akrona0bded52021-08-11 15:48:02 +0200266 dat := tok.ToDoubleArray()
Akronde18e902021-08-27 09:34:12 +0200267 assert.NotNil(dat)
Akron0139bc52023-08-31 16:35:58 +0200268 // n, err := dat.Save("testdata/tokenizer_de.datok")
Akronde18e902021-08-27 09:34:12 +0200269 // assert.Nil(err)
270 // assert.True(n > 500)
Akrona0bded52021-08-11 15:48:02 +0200271}
272
// TestDoubleArrayFullTokenizerTransduce tests transduction with the
// full German tokenizer: tokens are newline-separated, an empty line
// marks a sentence boundary, and the output ends with an end-of-text
// marker (trailing empty lines).
func TestDoubleArrayFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	// Lazily load the shared full tokenizer
	if dat == nil {
		dat = LoadDatokFile("testdata/tokenizer_de.datok")
	}

	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(dat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	// Empty token: sentence boundary after the period
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal("", tokens[8])
	assert.Equal(9, len(tokens))

	// Quoted email local parts are not accepted as one token
	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
}
Akronb7e1f132021-08-10 11:52:31 +0200305
// TestDoubleArrayFullTokenizerSentenceSplitter checks sentence
// splitting by the full German tokenizer. Sentences are separated by
// empty lines, so splitting the output on "\n\n" yields one element
// per sentence plus a final boundary element.
func TestDoubleArrayFullTokenizerSentenceSplitter(t *testing.T) {
	assert := assert.New(t)

	// Lazily load the shared full tokenizer
	if dat == nil {
		dat = LoadDatokFile("testdata/tokenizer_de.datok")
	}

	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(2, len(sentences))

	// An abbreviation must not end the sentence
	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(2, len(sentences))
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	// Empty input still produces the boundary structure
	w.Reset()
	assert.True(dat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(2, len(sentences))
	assert.Equal("", sentences[0])

	// A hostname-final period ends the sentence exactly once
	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	// Email addresses do not split sentences
	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	// URLs stay intact and do not split sentences
	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(2, len(sentences))

	// IP addresses do not split sentences
	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("\n", sentences[1])
	assert.Equal(2, len(sentences))

	// Decimal numbers do not split sentences
	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	// Dates do not split sentences
	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	// Filenames do not split sentences
	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(2, len(sentences))
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("\n", sentences[1])

	// Repeated sentence-final punctuation still splits
	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(3, len(sentences))
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("\n", sentences[2])

	// Street-name abbreviation does not split the sentence
	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	/*
		Test:
		"\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	*/
}
Akron03ca4252021-08-11 13:32:53 +0200399
Akronc9c0eae2021-10-22 19:49:43 +0200400func TestDoubleArrayFullTokenizerTokenSplitter(t *testing.T) {
Akron03ca4252021-08-11 13:32:53 +0200401 assert := assert.New(t)
Akron9fb63af2021-10-28 01:15:53 +0200402
403 if dat == nil {
Akron0139bc52023-08-31 16:35:58 +0200404 dat = LoadDatokFile("testdata/tokenizer_de.datok")
Akron9fb63af2021-10-28 01:15:53 +0200405 }
406
Akron03ca4252021-08-11 13:32:53 +0200407 assert.NotNil(dat)
408
409 b := make([]byte, 0, 2048)
410 w := bytes.NewBuffer(b)
411 var tokens []string
412
413 // testTokenizerSimple
Akronec835ad2021-08-11 18:23:22 +0200414 tokens = ttokenize(dat, w, "Der alte Mann")
Akron03ca4252021-08-11 13:32:53 +0200415 assert.Equal(tokens[0], "Der")
416 assert.Equal(tokens[1], "alte")
417 assert.Equal(tokens[2], "Mann")
418 assert.Equal(len(tokens), 3)
419
Akronec835ad2021-08-11 18:23:22 +0200420 tokens = ttokenize(dat, w, "Der alte Mann.")
Akron03ca4252021-08-11 13:32:53 +0200421 assert.Equal(tokens[0], "Der")
422 assert.Equal(tokens[1], "alte")
423 assert.Equal(tokens[2], "Mann")
424 assert.Equal(tokens[3], ".")
425 assert.Equal(len(tokens), 4)
426
427 // testTokenizerAbbr
Akronec835ad2021-08-11 18:23:22 +0200428 tokens = ttokenize(dat, w, "Der Vorsitzende der F.D.P. hat gewählt")
Akron03ca4252021-08-11 13:32:53 +0200429 assert.Equal(tokens[0], "Der")
430 assert.Equal(tokens[1], "Vorsitzende")
431 assert.Equal(tokens[2], "der")
432 assert.Equal(tokens[3], "F.D.P.")
433 assert.Equal(tokens[4], "hat")
434 assert.Equal(tokens[5], "gewählt")
435 assert.Equal(len(tokens), 6)
436 // Ignored in KorAP-Tokenizer
437
438 // testTokenizerHost1
Akronec835ad2021-08-11 18:23:22 +0200439 tokens = ttokenize(dat, w, "Gefunden auf wikipedia.org")
Akron03ca4252021-08-11 13:32:53 +0200440 assert.Equal(tokens[0], "Gefunden")
441 assert.Equal(tokens[1], "auf")
442 assert.Equal(tokens[2], "wikipedia.org")
443 assert.Equal(len(tokens), 3)
444
445 // testTokenizerWwwHost
Akronec835ad2021-08-11 18:23:22 +0200446 tokens = ttokenize(dat, w, "Gefunden auf www.wikipedia.org")
Akron03ca4252021-08-11 13:32:53 +0200447 assert.Equal("Gefunden", tokens[0])
448 assert.Equal("auf", tokens[1])
449 assert.Equal("www.wikipedia.org", tokens[2])
450 assert.Equal(3, len(tokens))
451
452 // testTokenizerWwwUrl
Akronec835ad2021-08-11 18:23:22 +0200453 tokens = ttokenize(dat, w, "Weitere Infos unter www.info.biz/info")
Akron03ca4252021-08-11 13:32:53 +0200454 assert.Equal("www.info.biz/info", tokens[3])
455
456 // testTokenizerFtpHost
457 /*
458 tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
459 assert.Equal("Kann", tokens[0])
460 assert.Equal("von", tokens[1])
461 assert.Equal("ftp.download.org", tokens[2])
462 assert.Equal(5, len(tokens))
463 // Ignored in KorAP-Tokenizer
464 */
465
466 // testTokenizerDash
Akronec835ad2021-08-11 18:23:22 +0200467 tokens = ttokenize(dat, w, "Das war -- spitze")
Akron03ca4252021-08-11 13:32:53 +0200468 assert.Equal(tokens[0], "Das")
469 assert.Equal(tokens[1], "war")
470 assert.Equal(tokens[2], "--")
471 assert.Equal(tokens[3], "spitze")
472 assert.Equal(len(tokens), 4)
473
474 // testTokenizerEmail1
Akronec835ad2021-08-11 18:23:22 +0200475 tokens = ttokenize(dat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
Akron03ca4252021-08-11 13:32:53 +0200476 assert.Equal(tokens[0], "Ich")
477 assert.Equal(tokens[1], "bin")
478 assert.Equal(tokens[2], "unter")
479 assert.Equal(tokens[3], "korap@ids-mannheim.de")
480 assert.Equal(tokens[4], "erreichbar")
481 assert.Equal(tokens[5], ".")
482 assert.Equal(len(tokens), 6)
483
484 // testTokenizerEmail2
Akronec835ad2021-08-11 18:23:22 +0200485 tokens = ttokenize(dat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
Akron03ca4252021-08-11 13:32:53 +0200486 assert.Equal(tokens[0], "Oder")
487 assert.Equal(tokens[1], "unter")
488 assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
489 assert.Equal(tokens[3], ".")
490 assert.Equal(len(tokens), 4)
491
492 // testTokenizerEmail3
Akronec835ad2021-08-11 18:23:22 +0200493 tokens = ttokenize(dat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
Akron03ca4252021-08-11 13:32:53 +0200494 assert.Equal(tokens[0], "Oder")
495 assert.Equal(tokens[1], "unter")
496 assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
497 assert.Equal(tokens[3], ".")
498 assert.Equal(len(tokens), 4)
499 // Ignored in KorAP-Tokenizer
500
501 // testTokenizerDoNotAcceptQuotedEmailNames
Akronec835ad2021-08-11 18:23:22 +0200502 tokens = ttokenize(dat, w, "\"John Doe\"@xx.com")
Akron03ca4252021-08-11 13:32:53 +0200503 assert.Equal("\"", tokens[0])
504 assert.Equal("John", tokens[1])
505 assert.Equal("Doe", tokens[2])
506 assert.Equal("\"", tokens[3])
507 assert.Equal("@xx", tokens[4])
508 assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
509 assert.Equal("com", tokens[6])
510 assert.Equal(7, len(tokens))
511
512 // testTokenizerTwitter
Akronec835ad2021-08-11 18:23:22 +0200513 tokens = ttokenize(dat, w, "Folgt @korap und #korap")
Akron03ca4252021-08-11 13:32:53 +0200514 assert.Equal(tokens[0], "Folgt")
515 assert.Equal(tokens[1], "@korap")
516 assert.Equal(tokens[2], "und")
517 assert.Equal(tokens[3], "#korap")
518 assert.Equal(len(tokens), 4)
519
520 // testTokenizerWeb1
Akronec835ad2021-08-11 18:23:22 +0200521 tokens = ttokenize(dat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
Akron03ca4252021-08-11 13:32:53 +0200522 assert.Equal(tokens[0], "Unsere")
523 assert.Equal(tokens[1], "Website")
524 assert.Equal(tokens[2], "ist")
525 assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
526 assert.Equal(len(tokens), 4)
527
528 // testTokenizerWeb2
Akronec835ad2021-08-11 18:23:22 +0200529 tokens = ttokenize(dat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
Akron03ca4252021-08-11 13:32:53 +0200530 assert.Equal(tokens[0], "Wir")
531 assert.Equal(tokens[1], "sind")
532 assert.Equal(tokens[2], "auch")
533 assert.Equal(tokens[3], "im")
534 assert.Equal(tokens[4], "Internet")
535 assert.Equal(tokens[5], "(")
536 assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
537 assert.Equal(tokens[7], ")")
538 assert.Equal(len(tokens), 8)
539 // Ignored in KorAP-Tokenizer
540
541 // testTokenizerWeb3
Akronec835ad2021-08-11 18:23:22 +0200542 tokens = ttokenize(dat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
Akron03ca4252021-08-11 13:32:53 +0200543 assert.Equal(tokens[0], "Die")
544 assert.Equal(tokens[1], "Adresse")
545 assert.Equal(tokens[2], "ist")
546 assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
547 assert.Equal(tokens[4], ".")
548 assert.Equal(len(tokens), 5)
549 // Ignored in KorAP-Tokenizer
550
551 // testTokenizerServer
Akronec835ad2021-08-11 18:23:22 +0200552 tokens = ttokenize(dat, w, "Unser Server ist 10.0.10.51.")
Akron03ca4252021-08-11 13:32:53 +0200553 assert.Equal(tokens[0], "Unser")
554 assert.Equal(tokens[1], "Server")
555 assert.Equal(tokens[2], "ist")
556 assert.Equal(tokens[3], "10.0.10.51")
557 assert.Equal(tokens[4], ".")
558 assert.Equal(len(tokens), 5)
559
560 // testTokenizerNum
Akronec835ad2021-08-11 18:23:22 +0200561 tokens = ttokenize(dat, w, "Zu 50,4% ist es sicher")
Akron03ca4252021-08-11 13:32:53 +0200562 assert.Equal(tokens[0], "Zu")
563 assert.Equal(tokens[1], "50,4%")
564 assert.Equal(tokens[2], "ist")
565 assert.Equal(tokens[3], "es")
566 assert.Equal(tokens[4], "sicher")
567 assert.Equal(len(tokens), 5)
568 // Differs from KorAP-Tokenizer
569
570 // testTokenizerDate
Akronec835ad2021-08-11 18:23:22 +0200571 tokens = ttokenize(dat, w, "Der Termin ist am 5.9.2018")
Akron03ca4252021-08-11 13:32:53 +0200572 assert.Equal(tokens[0], "Der")
573 assert.Equal(tokens[1], "Termin")
574 assert.Equal(tokens[2], "ist")
575 assert.Equal(tokens[3], "am")
576 assert.Equal(tokens[4], "5.9.2018")
577 assert.Equal(len(tokens), 5)
578
Akronec835ad2021-08-11 18:23:22 +0200579 tokens = ttokenize(dat, w, "Der Termin ist am 5/9/2018")
Akron03ca4252021-08-11 13:32:53 +0200580 assert.Equal(tokens[0], "Der")
581 assert.Equal(tokens[1], "Termin")
582 assert.Equal(tokens[2], "ist")
583 assert.Equal(tokens[3], "am")
584 assert.Equal(tokens[4], "5/9/2018")
585 assert.Equal(len(tokens), 5)
586
Akron83d859a2026-02-11 15:44:57 +0100587 tokens = ttokenize(dat, w, "Der Termin ist am 5/9/2018")
588 assert.Equal(tokens[0], "Der")
589 assert.Equal(tokens[1], "Termin")
590 assert.Equal(tokens[2], "ist")
591 assert.Equal(tokens[3], "am")
592 assert.Equal(tokens[4], "5/9/2018")
593 assert.Equal(len(tokens), 5)
594
595 tokens = ttokenize(dat, w, "Die Gewerkschaft ver.di fordert mehr Lohn.")
596 assert.Equal("Die", tokens[0])
597 assert.Equal("Gewerkschaft", tokens[1])
598 assert.Equal("ver.di", tokens[2])
599 assert.Equal("fordert", tokens[3])
600 assert.Equal("mehr", tokens[4])
601 assert.Equal("Lohn", tokens[5])
602 assert.Equal(".", tokens[6])
603 assert.Equal(len(tokens), 7)
604
Akron03ca4252021-08-11 13:32:53 +0200605 // testTokenizerDateRange
606 /*
607 tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
608 assert.Equal(tokens[0], "Der")
609 assert.Equal(tokens[1], "Termin")
610 assert.Equal(tokens[2], "war")
611 assert.Equal(tokens[3], "vom")
612 assert.Equal(tokens[4], "4.")
613 assert.Equal(tokens[5], "-")
614 assert.Equal(tokens[6], "5.9.2018")
615 assert.Equal(len(tokens), 7)
616 // Ignored in KorAP-Tokenizer
617 */
618
619 // testTokenizerEmoji1
Akronec835ad2021-08-11 18:23:22 +0200620 tokens = ttokenize(dat, w, "Das ist toll! ;)")
Akron03ca4252021-08-11 13:32:53 +0200621 assert.Equal(tokens[0], "Das")
622 assert.Equal(tokens[1], "ist")
623 assert.Equal(tokens[2], "toll")
624 assert.Equal(tokens[3], "!")
625 assert.Equal(tokens[4], ";)")
626 assert.Equal(len(tokens), 5)
627
628 // testTokenizerRef1
Akronec835ad2021-08-11 18:23:22 +0200629 tokens = ttokenize(dat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
Akron03ca4252021-08-11 13:32:53 +0200630 assert.Equal(tokens[0], "Kupietz")
631 assert.Equal(tokens[1], "und")
632 assert.Equal(tokens[2], "Schmidt")
633 assert.Equal(tokens[3], "(2018)")
634 assert.Equal(tokens[4], ":")
635 assert.Equal(tokens[5], "Korpuslinguistik")
636 assert.Equal(len(tokens), 6)
637 // Differs from KorAP-Tokenizer!
638
639 // testTokenizerRef2 () {
Akronec835ad2021-08-11 18:23:22 +0200640 tokens = ttokenize(dat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
Akron03ca4252021-08-11 13:32:53 +0200641 assert.Equal(tokens[0], "Kupietz")
642 assert.Equal(tokens[1], "und")
643 assert.Equal(tokens[2], "Schmidt")
644 assert.Equal(tokens[3], "[2018]")
645 assert.Equal(tokens[4], ":")
646 assert.Equal(tokens[5], "Korpuslinguistik")
647 assert.Equal(len(tokens), 6)
648 // Differs from KorAP-Tokenizer!
649
650 // testTokenizerOmission1 () {
Akronec835ad2021-08-11 18:23:22 +0200651 tokens = ttokenize(dat, w, "Er ist ein A****loch!")
Akron03ca4252021-08-11 13:32:53 +0200652 assert.Equal(tokens[0], "Er")
653 assert.Equal(tokens[1], "ist")
654 assert.Equal(tokens[2], "ein")
655 assert.Equal(tokens[3], "A****loch")
656 assert.Equal(tokens[4], "!")
657 assert.Equal(len(tokens), 5)
658
659 // testTokenizerOmission2
Akronec835ad2021-08-11 18:23:22 +0200660 tokens = ttokenize(dat, w, "F*ck!")
Akron03ca4252021-08-11 13:32:53 +0200661 assert.Equal(tokens[0], "F*ck")
662 assert.Equal(tokens[1], "!")
663 assert.Equal(len(tokens), 2)
664
665 // testTokenizerOmission3 () {
Akronec835ad2021-08-11 18:23:22 +0200666 tokens = ttokenize(dat, w, "Dieses verf***** Kleid!")
Akron03ca4252021-08-11 13:32:53 +0200667 assert.Equal(tokens[0], "Dieses")
668 assert.Equal(tokens[1], "verf*****")
669 assert.Equal(tokens[2], "Kleid")
670 assert.Equal(tokens[3], "!")
671 assert.Equal(len(tokens), 4)
672
673 // Probably interpreted as HOST
674 // testTokenizerFileExtension1
Akronec835ad2021-08-11 18:23:22 +0200675 tokens = ttokenize(dat, w, "Ich habe die readme.txt heruntergeladen")
Akron03ca4252021-08-11 13:32:53 +0200676 assert.Equal(tokens[0], "Ich")
677 assert.Equal(tokens[1], "habe")
678 assert.Equal(tokens[2], "die")
679 assert.Equal(tokens[3], "readme.txt")
680 assert.Equal(tokens[4], "heruntergeladen")
681 assert.Equal(len(tokens), 5)
682
683 // Probably interpreted as HOST
684 // testTokenizerFileExtension2
Akronec835ad2021-08-11 18:23:22 +0200685 tokens = ttokenize(dat, w, "Nimm die README.TXT!")
Akron03ca4252021-08-11 13:32:53 +0200686 assert.Equal(tokens[0], "Nimm")
687 assert.Equal(tokens[1], "die")
688 assert.Equal(tokens[2], "README.TXT")
689 assert.Equal(tokens[3], "!")
690 assert.Equal(len(tokens), 4)
691
692 // Probably interpreted as HOST
693 // testTokenizerFileExtension3
Akronec835ad2021-08-11 18:23:22 +0200694 tokens = ttokenize(dat, w, "Zeig mir profile.jpeg")
Akron03ca4252021-08-11 13:32:53 +0200695 assert.Equal(tokens[0], "Zeig")
696 assert.Equal(tokens[1], "mir")
697 assert.Equal(tokens[2], "profile.jpeg")
698 assert.Equal(len(tokens), 3)
699
700 // testTokenizerFile1
Akron03ca4252021-08-11 13:32:53 +0200701
Akronec835ad2021-08-11 18:23:22 +0200702 tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.docx")
Akrone8837b52021-08-11 17:29:58 +0200703 assert.Equal(tokens[0], "Zeig")
704 assert.Equal(tokens[1], "mir")
705 assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
706 assert.Equal(len(tokens), 3)
Akron03ca4252021-08-11 13:32:53 +0200707
Akrone8837b52021-08-11 17:29:58 +0200708 // testTokenizerFile2
Akronec835ad2021-08-11 18:23:22 +0200709 tokens = ttokenize(dat, w, "Gehe zu /Dokumente/profile.docx")
Akrone8837b52021-08-11 17:29:58 +0200710 assert.Equal(tokens[0], "Gehe")
711 assert.Equal(tokens[1], "zu")
712 assert.Equal(tokens[2], "/Dokumente/profile.docx")
713 assert.Equal(len(tokens), 3)
Akron03ca4252021-08-11 13:32:53 +0200714
Akrone8837b52021-08-11 17:29:58 +0200715 // testTokenizerFile3
Akronec835ad2021-08-11 18:23:22 +0200716 tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
Akrone8837b52021-08-11 17:29:58 +0200717 assert.Equal(tokens[0], "Zeig")
718 assert.Equal(tokens[1], "mir")
719 assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
720 assert.Equal(len(tokens), 3)
721 // Ignored in KorAP-Tokenizer
Akron03ca4252021-08-11 13:32:53 +0200722
Akronfd92d7e2021-08-11 16:31:43 +0200723 // testTokenizerPunct
Akronec835ad2021-08-11 18:23:22 +0200724 tokens = ttokenize(dat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
Akronfd92d7e2021-08-11 16:31:43 +0200725 assert.Equal(tokens[0], "Er")
726 assert.Equal(tokens[1], "sagte")
727 assert.Equal(tokens[2], ":")
728 assert.Equal(tokens[3], "\"")
729 assert.Equal(tokens[4], "Es")
730 assert.Equal(tokens[5], "geht")
731 assert.Equal(tokens[6], "mir")
732 assert.Equal(tokens[7], "gut")
733 assert.Equal(tokens[8], "!")
734 assert.Equal(tokens[9], "\"")
735 assert.Equal(tokens[10], ",")
736 assert.Equal(tokens[11], "daraufhin")
737 assert.Equal(tokens[12], "ging")
738 assert.Equal(tokens[13], "er")
739 assert.Equal(tokens[14], ".")
740 assert.Equal(len(tokens), 15)
Akron03ca4252021-08-11 13:32:53 +0200741
742 // testTokenizerPlusAmpersand
Akronec835ad2021-08-11 18:23:22 +0200743 tokens = ttokenize(dat, w, "&quot;Das ist von C&A!&quot;")
744 assert.Equal(tokens[0], "&quot;")
745 assert.Equal(tokens[1], "Das")
746 assert.Equal(tokens[2], "ist")
747 assert.Equal(tokens[3], "von")
748 assert.Equal(tokens[4], "C&A")
749 assert.Equal(tokens[5], "!")
750 assert.Equal(tokens[6], "&quot;")
751 assert.Equal(len(tokens), 7)
Akron03ca4252021-08-11 13:32:53 +0200752
753 // testTokenizerLongEnd
Akronec835ad2021-08-11 18:23:22 +0200754 tokens = ttokenize(dat, w, "Siehst Du?!!?")
Akron03ca4252021-08-11 13:32:53 +0200755 assert.Equal(tokens[0], "Siehst")
756 assert.Equal(tokens[1], "Du")
757 assert.Equal(tokens[2], "?!!?")
758 assert.Equal(len(tokens), 3)
759
760 // testTokenizerIrishO
Akronec835ad2021-08-11 18:23:22 +0200761 tokens = ttokenize(dat, w, "Peter O'Toole")
Akron03ca4252021-08-11 13:32:53 +0200762 assert.Equal(tokens[0], "Peter")
763 assert.Equal(tokens[1], "O'Toole")
764 assert.Equal(len(tokens), 2)
765
766 // testTokenizerAbr
Akronec835ad2021-08-11 18:23:22 +0200767 tokens = ttokenize(dat, w, "Früher bzw. später ...")
Akron03ca4252021-08-11 13:32:53 +0200768 assert.Equal(tokens[0], "Früher")
769 assert.Equal(tokens[1], "bzw.")
770 assert.Equal(tokens[2], "später")
771 assert.Equal(tokens[3], "...")
772 assert.Equal(len(tokens), 4)
773
774 // testTokenizerUppercaseRule
Akronec835ad2021-08-11 18:23:22 +0200775 tokens = ttokenize(dat, w, "Es war spät.Morgen ist es früh.")
Akron03ca4252021-08-11 13:32:53 +0200776 assert.Equal(tokens[0], "Es")
777 assert.Equal(tokens[1], "war")
778 assert.Equal(tokens[2], "spät")
779 assert.Equal(tokens[3], ".")
780 assert.Equal(tokens[4], "Morgen")
781 assert.Equal(tokens[5], "ist")
782 assert.Equal(tokens[6], "es")
783 assert.Equal(tokens[7], "früh")
784 assert.Equal(tokens[8], ".")
785 assert.Equal(len(tokens), 9)
786 // Ignored in KorAP-Tokenizer
787
788 // testTokenizerOrd
Akronec835ad2021-08-11 18:23:22 +0200789 tokens = ttokenize(dat, w, "Sie erreichte den 1. Platz!")
Akrona0bded52021-08-11 15:48:02 +0200790 assert.Equal(tokens[0], "Sie")
791 assert.Equal(tokens[1], "erreichte")
792 assert.Equal(tokens[2], "den")
793 assert.Equal(tokens[3], "1.")
794 assert.Equal(tokens[4], "Platz")
795 assert.Equal(tokens[5], "!")
796 assert.Equal(len(tokens), 6)
Akron03ca4252021-08-11 13:32:53 +0200797
798 // testNoZipOuputArchive
Akronec835ad2021-08-11 18:23:22 +0200799 tokens = ttokenize(dat, w, "Archive: Ich bin kein zip\n")
Akron03ca4252021-08-11 13:32:53 +0200800 assert.Equal(tokens[0], "Archive")
801 assert.Equal(tokens[1], ":")
802 assert.Equal(tokens[2], "Ich")
803 assert.Equal(tokens[3], "bin")
804 assert.Equal(tokens[4], "kein")
805 assert.Equal(tokens[5], "zip")
806 assert.Equal(6, len(tokens))
807
808 // testTokenizerStrasse
Akronec835ad2021-08-11 18:23:22 +0200809 tokens = ttokenize(dat, w, "Ich wohne in der Weststr. und Du?")
Akron4af79f12021-08-11 14:48:17 +0200810 assert.Equal(tokens[4], "Weststr.")
811 assert.Equal(8, len(tokens))
Akron03ca4252021-08-11 13:32:53 +0200812
813 // germanTokenizerKnowsGermanOmissionWords
Akronec835ad2021-08-11 18:23:22 +0200814 tokens = ttokenize(dat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
Akron03ca4252021-08-11 13:32:53 +0200815 assert.Equal("D'dorf", tokens[0])
816 assert.Equal("Ku'damm", tokens[1])
817 assert.Equal("Lu'hafen", tokens[2])
818 assert.Equal("M'gladbach", tokens[3])
819 assert.Equal("W'schaft", tokens[4])
820 assert.Equal(5, len(tokens))
821
822 // germanTokenizerDoesNOTSeparateGermanContractions
Akronec835ad2021-08-11 18:23:22 +0200823 tokens = ttokenize(dat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
Akron03ca4252021-08-11 13:32:53 +0200824 assert.Equal("mach's", tokens[0])
825 assert.Equal("macht's", tokens[1])
826 assert.Equal("was'n", tokens[2])
827 assert.Equal("ist's", tokens[3])
828 assert.Equal("haste", tokens[4])
829 assert.Equal("willste", tokens[5])
830 assert.Equal("kannste", tokens[6])
831 assert.Equal("biste", tokens[7])
832 assert.Equal("kriegste", tokens[8])
833 assert.Equal(9, len(tokens))
834
Akrond8d88952026-02-04 09:02:09 +0100835 // Regression test for hyphenated abbreviations from Wiktionary (2024-12)
836 tokens = ttokenize(dat, w, "Ich wohne in Ba.-Wü. und bin Dipl.-Ing. bei Reg.-Bez. Karlsruhe.")
Akron3dd560e2026-02-04 11:23:08 +0100837 assert.Equal("Ich", tokens[0])
838 assert.Equal("wohne", tokens[1])
839 assert.Equal("in", tokens[2])
840 assert.Equal("Ba.-Wü.", tokens[3])
841 assert.Equal("und", tokens[4])
842 assert.Equal("bin", tokens[5])
843 assert.Equal("Dipl.-Ing.", tokens[6])
844 assert.Equal("bei", tokens[7])
845 assert.Equal("Reg.-Bez.", tokens[8])
846 assert.Equal("Karlsruhe", tokens[9])
847 assert.Equal(".", tokens[10])
848 assert.Equal(11, len(tokens))
Akrond8d88952026-02-04 09:02:09 +0100849
Akrona2f952f2026-02-04 09:51:51 +0100850 // Regression test for https://github.com/KorAP/KorAP-Tokenizer/issues/131
851 tokens = ttokenize(dat, w, "Donau\u00ADdampf\u00ADschiff")
Akron3dd560e2026-02-04 11:23:08 +0100852 assert.Equal("Donau\u00ADdampf\u00ADschiff", tokens[0])
853 assert.Equal(1, len(tokens))
Akrona2f952f2026-02-04 09:51:51 +0100854
855 // Regression test for https://github.com/KorAP/KorAP-Tokenizer/issues/115
856 tokens = ttokenize(dat, w, "Die Serb*innen wie die Kosovo-Albaner*innen")
Akron3dd560e2026-02-04 11:23:08 +0100857 assert.Equal("Die", tokens[0])
858 assert.Equal("Serb*innen", tokens[1])
859 assert.Equal("wie", tokens[2])
860 assert.Equal("die", tokens[3])
861 assert.Equal("Kosovo-Albaner*innen", tokens[4])
862 assert.Equal(5, len(tokens))
863
864 // Test Wikipedia emoji template from the issue
Akrona2f952f2026-02-04 09:51:51 +0100865 tokens = ttokenize(dat, w, "Ein Smiley [_EMOJI:{{S|;)}}_] hier")
Akron3dd560e2026-02-04 11:23:08 +0100866 assert.Equal("Ein", tokens[0])
867 assert.Equal("Smiley", tokens[1])
868 assert.Equal("[_EMOJI:{{S|;)}}_]", tokens[2]) // Should be one token
869 assert.Equal("hier", tokens[3])
870 assert.Equal(4, len(tokens))
871
872 // Test simple pragma still works
Akrona2f952f2026-02-04 09:51:51 +0100873 tokens = ttokenize(dat, w, "Name: [_ANONYMIZED_] Ende")
Akron3dd560e2026-02-04 11:23:08 +0100874 assert.Equal("Name", tokens[0])
875 assert.Equal(":", tokens[1])
876 assert.Equal("[_ANONYMIZED_]", tokens[2]) // Should be one token
877 assert.Equal("Ende", tokens[3])
878 assert.Equal(4, len(tokens))
879
880 // Gender forms
881 // Basic colon forms with -in/-innen
882 tokens = ttokenize(dat, w, "Die Schüler:innen und Lehrer:in kamen.")
883 assert.Equal("Die", tokens[0])
884 assert.Equal("Schüler:innen", tokens[1])
885 assert.Equal("und", tokens[2])
886 assert.Equal("Lehrer:in", tokens[3])
887 assert.Equal("kamen", tokens[4])
888 assert.Equal(".", tokens[5])
889 assert.Equal(6, len(tokens))
890
891 // More colon examples
892 tokens = ttokenize(dat, w, "Künstler:innen Mitarbeiter:innen Bürger:innen")
893 assert.Equal("Künstler:innen", tokens[0])
894 assert.Equal("Mitarbeiter:innen", tokens[1])
895 assert.Equal("Bürger:innen", tokens[2])
896 assert.Equal(3, len(tokens))
897
898 // Basic slash forms
899 tokens = ttokenize(dat, w, "Autor/in Autor/innen Teilnehmer/innen")
900 assert.Equal("Autor/in", tokens[0])
901 assert.Equal("Autor/innen", tokens[1])
902 assert.Equal("Teilnehmer/innen", tokens[2])
903 assert.Equal(3, len(tokens))
904
905 // Slash forms with hyphen: /-in, /-innen, /-frau
906 tokens = ttokenize(dat, w, "Kaufmann/-frau und Fachmann/-frau")
907 assert.Equal("Kaufmann/-frau", tokens[0])
908 assert.Equal("und", tokens[1])
909 assert.Equal("Fachmann/-frau", tokens[2])
910 assert.Equal(3, len(tokens))
911
912 // Slash forms without hyphen for frau (lowercase only)
913 tokens = ttokenize(dat, w, "Kaufmann/frau ist auch korrekt.")
914 assert.Equal("Kaufmann/frau", tokens[0])
915 assert.Equal("ist", tokens[1])
916 assert.Equal("auch", tokens[2])
917 assert.Equal("korrekt", tokens[3])
918 assert.Equal(".", tokens[4])
919 assert.Equal(5, len(tokens))
920
921 // Basic parenthetical forms
922 tokens = ttokenize(dat, w, "Schüler(innen) und Lehrer(in) kamen.")
923 assert.Equal("Schüler(innen)", tokens[0])
924 assert.Equal("und", tokens[1])
925 assert.Equal("Lehrer(in)", tokens[2])
926 assert.Equal("kamen", tokens[3])
927 assert.Equal(".", tokens[4])
928 assert.Equal(5, len(tokens))
929
930 // Compound words with hyphen + gender ending
931 tokens = ttokenize(dat, w, "Die Kosovo-Albaner/innen und Kosovo-Albaner:innen trafen sich.")
932 assert.Equal("Die", tokens[0])
933 assert.Equal("Kosovo-Albaner/innen", tokens[1])
934 assert.Equal("und", tokens[2])
935 assert.Equal("Kosovo-Albaner:innen", tokens[3])
936 assert.Equal("trafen", tokens[4])
937 assert.Equal("sich", tokens[5])
938 assert.Equal(".", tokens[6])
939 assert.Equal(7, len(tokens))
940
941 // With hyphen: Kosovo-Albaner/-innen
942 tokens = ttokenize(dat, w, "Kosovo-Albaner/-innen kamen.")
943 assert.Equal("Kosovo-Albaner/-innen", tokens[0])
944 assert.Equal("kamen", tokens[1])
945 assert.Equal(".", tokens[2])
946 assert.Equal(3, len(tokens))
947
948 // Mann/Frau should be separated (capital F = standalone word, not suffix)
949 tokens = ttokenize(dat, w, "Ob Mann/Frau das will?")
950 assert.Equal("Ob", tokens[0])
951 assert.Equal("Mann", tokens[1])
952 assert.Equal("/", tokens[2])
953 assert.Equal("Frau", tokens[3])
954 assert.Equal("das", tokens[4])
955 assert.Equal("will", tokens[5])
956 assert.Equal("?", tokens[6])
957 assert.Equal(7, len(tokens))
958
959 // Also Männer/Frauen
960 tokens = ttokenize(dat, w, "Männer/Frauen sind willkommen.")
961 assert.Equal("Männer", tokens[0])
962 assert.Equal("/", tokens[1])
963 assert.Equal("Frauen", tokens[2])
964 assert.Equal("sind", tokens[3])
965 assert.Equal("willkommen", tokens[4])
966 assert.Equal(".", tokens[5])
967 assert.Equal(6, len(tokens))
968
969 // /frau should only be joined when word ends in "mann"
970 // "xxx/frau" where xxx doesn't end in "mann" should be SEPARATED
971 tokens = ttokenize(dat, w, "xxx/frau sollte getrennt sein.")
972 assert.Equal("xxx", tokens[0])
973 assert.Equal("/", tokens[1])
974 assert.Equal("frau", tokens[2])
975 assert.Equal("sollte", tokens[3])
976 assert.Equal("getrennt", tokens[4])
977 assert.Equal("sein", tokens[5])
978 assert.Equal(".", tokens[6])
979 assert.Equal(7, len(tokens))
980
981 // But Kaufmann/frau should be one token (word ends in "mann")
982 tokens = ttokenize(dat, w, "Kaufmann/frau ist ein Beruf.")
983 assert.Equal("Kaufmann/frau", tokens[0])
984 assert.Equal("ist", tokens[1])
985 assert.Equal("ein", tokens[2])
986 assert.Equal("Beruf", tokens[3])
987 assert.Equal(".", tokens[4])
988 assert.Equal(5, len(tokens))
989
990 // And Fachmann/-frau should be one token
991 tokens = ttokenize(dat, w, "Fachmann/-frau gesucht")
992 assert.Equal("Fachmann/-frau", tokens[0])
993 assert.Equal("gesucht", tokens[1])
994 assert.Equal(2, len(tokens))
995
996 // Geschäftsmann/frau should also be one token
997 tokens = ttokenize(dat, w, "Ein Geschäftsmann/frau wird gesucht.")
998 assert.Equal("Ein", tokens[0])
999 assert.Equal("Geschäftsmann/frau", tokens[1])
1000 assert.Equal("wird", tokens[2])
1001 assert.Equal("gesucht", tokens[3])
1002 assert.Equal(".", tokens[4])
1003 assert.Equal(5, len(tokens))
1004
1005 // Genderstern forms (these should already work via existing rules)
1006 tokens = ttokenize(dat, w, "Schüler*innen und Lehrer*innen")
1007 assert.Equal("Schüler*innen", tokens[0])
1008 assert.Equal("und", tokens[1])
1009 assert.Equal("Lehrer*innen", tokens[2])
1010 assert.Equal(3, len(tokens))
1011
1012 // Mixed sentence with various gender forms
1013 tokens = ttokenize(dat, w, "Die Schüler:innen, Lehrer/innen und Mitarbeiter(innen) sowie Kaufmann/-frau trafen sich.")
1014 assert.Equal("Die", tokens[0])
1015 assert.Equal("Schüler:innen", tokens[1])
1016 assert.Equal(",", tokens[2])
1017 assert.Equal("Lehrer/innen", tokens[3])
1018 assert.Equal("und", tokens[4])
1019 assert.Equal("Mitarbeiter(innen)", tokens[5])
1020 assert.Equal("sowie", tokens[6])
1021 assert.Equal("Kaufmann/-frau", tokens[7])
1022 assert.Equal("trafen", tokens[8])
1023 assert.Equal("sich", tokens[9])
1024 assert.Equal(".", tokens[10])
1025 assert.Equal(11, len(tokens))
1026
1027 tokens = ttokenize(dat, w, "Nutzer/Innenarchitekt")
1028 assert.Equal("Nutzer", tokens[0])
1029 assert.Equal("/", tokens[1])
1030 assert.Equal("Innenarchitekt", tokens[2])
1031 assert.Equal(3, len(tokens))
1032
1033 tokens = ttokenize(dat, w, "Innenminister/in")
1034 assert.Equal("Innenminister/in", tokens[0])
1035 assert.Equal(1, len(tokens))
1036
1037 tokens = ttokenize(dat, w, "Innenminister/Innenministerinnen")
1038 assert.Equal("Innenminister", tokens[0])
1039 assert.Equal("/", tokens[1])
1040 assert.Equal("Innenministerinnen", tokens[2])
1041 assert.Equal(3, len(tokens))
Akrona2f952f2026-02-04 09:51:51 +01001042
1043 /*
Akron3dd560e2026-02-04 11:23:08 +01001044 DeReKo-Behaviour
1045 tokens = ttokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
1046 assert.Equal("'ve", tokens[1]);
1047 assert.Equal("'ll", tokens[3]);
1048 assert.Equal("'d", tokens[5]);
1049 assert.Equal("'m", tokens[7]);
1050 assert.Equal("'re", tokens[9]);
1051 assert.Equal("'s", tokens[11]);
1052 assert.Equal("is", tokens[12]);
1053 assert.Equal("n't", tokens[13]);
1054 assert.Equal(14, len(tokens));
Akrona2f952f2026-02-04 09:51:51 +01001055
Akrond8d88952026-02-04 09:02:09 +01001056
Akron3dd560e2026-02-04 11:23:08 +01001057 assert.Equal(tokens[0], "Der")
1058 assert.Equal(tokens[1], "alte")
1059 assert.Equal(tokens[2], "Mann")
1060 assert.Equal(len(tokens), 3)
Akron03ca4252021-08-11 13:32:53 +02001061
Akron3dd560e2026-02-04 11:23:08 +01001062 /*
1063 @Test
1064 public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
1065 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
1066 }
Akron03ca4252021-08-11 13:32:53 +02001067
Akron3dd560e2026-02-04 11:23:08 +01001068 @Test
1069 public void frenchTokenizerKnowsFrenchAbbreviations () {
1070 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
1071 tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
1072 assert.Equal("Approx.", tokens[0]);
1073 assert.Equal("juill.", tokens[2]);
1074 assert.Equal("prof.", tokens[5]);
1075 assert.Equal("exerc.", tokens[15]);
1076 assert.Equal("no.", tokens[16]);
1077 assert.Equal("pp.", tokens[21]);
1078 }
Akron03ca4252021-08-11 13:32:53 +02001079
Akron3dd560e2026-02-04 11:23:08 +01001080 @Test
1081 public void frenchTokenizerKnowsFrenchContractions () {
1082 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
1083 tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
1084 assert.Equal("J'", tokens[0]);
1085 assert.Equal("j'", tokens[2]);
1086 assert.Equal("qu'", tokens[4]);
1087 assert.Equal("d'", tokens[6]);
1088 assert.Equal("jusqu'", tokens[8]);
1089 assert.Equal("Aujourd'hui", tokens[10]);
1090 assert.Equal("D'", tokens[11]); // ’
1091 assert.Equal("Quelqu'un", tokens[13]); // ’
1092 assert.Equal("Presqu'île", tokens[14]); // ’
1093 }
Akron03ca4252021-08-11 13:32:53 +02001094
Akron3dd560e2026-02-04 11:23:08 +01001095 @Test
1096 public void frenchTokenizerKnowsFrenchClitics () {
1097 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
1098 tokens = tokenize(dat, w, "suis-je sont-elles ")
1099 assert.Equal("suis", tokens[0]);
1100 assert.Equal("-je", tokens[1]);
1101 assert.Equal("sont", tokens[2]);
1102 assert.Equal("-elles", tokens[3]);
1103 }
Akron03ca4252021-08-11 13:32:53 +02001104
Akron3dd560e2026-02-04 11:23:08 +01001105 @Test
1106 public void testEnglishTokenizerScienceAbbreviations () {
1107 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
1108 tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
1109 assert.Equal("Approx.", tokens[0]);
1110 assert.Equal("in", tokens[1]);
1111 assert.Equal("Sept.", tokens[2]);
1112 assert.Equal("1954", tokens[3]);
1113 assert.Equal(",", tokens[4]);
1114 assert.Equal("Assoc.", tokens[5]);
1115 assert.Equal("Prof.", tokens[6]);
1116 assert.Equal("Dr.", tokens[7]);
1117 assert.Equal("R.", tokens[8]);
1118 assert.Equal("J.", tokens[9]);
1119 assert.Equal("Ewing", tokens[10]);
1120 assert.Equal("reviewed", tokens[11]);
1121 assert.Equal("articles", tokens[12]);
1122 assert.Equal("on", tokens[13]);
1123 assert.Equal("Enzymol.", tokens[14]);
1124 assert.Equal("Bacteriol.", tokens[15]);
1125 assert.Equal("effects", tokens[16]);
1126 assert.Equal("later", tokens[17]);
1127 assert.Equal("published", tokens[18]);
1128 assert.Equal("in", tokens[19]);
1129 assert.Equal("Nutr.", tokens[20]);
1130 assert.Equal("Rheumatol.", tokens[21]);
1131 assert.Equal("No.", tokens[22]);
1132 assert.Equal("12", tokens[23]);
1133 assert.Equal("and", tokens[24]);
1134 assert.Equal("Nº.", tokens[25]);
1135 assert.Equal("13.", tokens[26]);
1136 assert.Equal(",", tokens[27]);
1137 assert.Equal("pp.", tokens[28]);
1138 assert.Equal("17-18", tokens[29]);
1139 assert.Equal(".", tokens[30]);
1140 }
Akron03ca4252021-08-11 13:32:53 +02001141
Akron3dd560e2026-02-04 11:23:08 +01001142 @Test
1143 public void englishTokenizerCanGuessWhetherIIsAbbrev () {
1144 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
1145 tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
1146 assert.Equal("I.", tokens[1]);
1147 assert.Equal("I", tokens[8]);
1148 assert.Equal(".", tokens[9]);
1149 assert.Equal("I", tokens[12]);
1150 assert.Equal(".", tokens[13]);
1151 }
Akron03ca4252021-08-11 13:32:53 +02001152
Akron3dd560e2026-02-04 11:23:08 +01001153 @Test
1154 public void testZipOuputArchive () {
1155
1156 final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
1157 System.setOut(new PrintStream(clearOut));
1158 tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
1159 assert.Equal(0, len(tokens));
1160 }
Akron03ca4252021-08-11 13:32:53 +02001161 */
1162 /*
1163
1164 @Test
1165 public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
1166 DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
1167 .tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
1168 .printOffsets(true)
1169 .build();
1170 Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
1171 assert.Equal("Text1", tokens[0].getType());
1172 assert.Equal(len(tokens), 9 );
1173 }
1174 */
1175}
Akronbd406802021-08-11 18:39:13 +02001176
Akrondf275812022-03-27 12:54:46 +02001177func TestDoubleArrayFullTokenizerSentenceSplitterBug1(t *testing.T) {
1178 assert := assert.New(t)
1179
1180 if dat == nil {
Akron0139bc52023-08-31 16:35:58 +02001181 dat = LoadDatokFile("testdata/tokenizer_de.datok")
Akrondf275812022-03-27 12:54:46 +02001182 }
1183
1184 b := make([]byte, 0, 2048)
1185 w := bytes.NewBuffer(b)
1186 var sentences []string
1187
1188 text := `Wüllersdorf war aufgestanden. »Ich finde es furchtbar, daß Sie recht haben, aber Sie haben recht. Ich quäle Sie nicht länger mit meinem 'Muß es sein?'. Die Welt ist einmal, wie sie ist, und die Dinge verlaufen nicht, wie wir wollen, sondern wie die andern wollen. Das mit dem 'Gottesgericht', wie manche hochtrabend versichern, ist freilich ein Unsinn, nichts davon, umgekehrt, unser Ehrenkultus ist ein Götzendienst, aber wir müssen uns ihm unterwerfen, solange der Götze gilt.«`
1189
1190 w.Reset()
1191 assert.True(dat.Transduce(strings.NewReader(text), w))
1192 sentences = strings.Split(w.String(), "\n\n")
Akronb4287552022-03-27 14:11:24 +02001193 assert.Equal(len(sentences), 6)
Akrondf275812022-03-27 12:54:46 +02001194 assert.Equal("Wüllersdorf\nwar\naufgestanden\n.", sentences[0])
1195 assert.Equal("»\nIch\nfinde\nes\nfurchtbar\n,\ndaß\nSie\nrecht\nhaben\n,\naber\nSie\nhaben\nrecht\n.", sentences[1])
Akronb4287552022-03-27 14:11:24 +02001196 assert.Equal("Ich\nquäle\nSie\nnicht\nlänger\nmit\nmeinem\n'\nMuß\nes\nsein\n?\n'\n.", sentences[2])
1197 assert.Equal("Die\nWelt\nist\neinmal\n,\nwie\nsie\nist\n,\nund\ndie\nDinge\nverlaufen\nnicht\n,\nwie\nwir\nwollen\n,\nsondern\nwie\ndie\nandern\nwollen\n.", sentences[3])
1198 assert.Equal("Das\nmit\ndem\n'\nGottesgericht\n'\n,\nwie\nmanche\nhochtrabend\nversichern\n,\nist\nfreilich\nein\nUnsinn\n,\nnichts\ndavon\n,\numgekehrt\n,\nunser\nEhrenkultus\nist\nein\nGötzendienst\n,\naber\nwir\nmüssen\nuns\nihm\nunterwerfen\n,\nsolange\nder\nGötze\ngilt\n.\n«", sentences[4])
Akrondf275812022-03-27 12:54:46 +02001199}
1200
Akron2f7f6f32026-02-11 15:12:48 +01001201func TestDoubleArrayFullTokenizerGenderDontSplitFromFile(t *testing.T) {
1202 assert := assert.New(t)
1203
1204 if dat == nil {
1205 dat = LoadDatokFile("testdata/tokenizer_de.datok")
1206 }
1207 assert.NotNil(dat)
1208
1209 b := make([]byte, 0, 2048)
1210 w := bytes.NewBuffer(b)
1211
1212 for _, token := range ttokenLines(t, "testdata/de/dontsplit.txt") {
1213 tokens := ttokenize(dat, w, token)
1214 assert.Equalf(1, len(tokens), "should not split %q", token)
1215 if len(tokens) == 1 {
1216 assert.Equalf(token, tokens[0], "token surface should match for %q", token)
1217 }
1218 }
1219}
1220
1221func TestDoubleArrayFullTokenizerGenderSplitFromFile(t *testing.T) {
1222 assert := assert.New(t)
1223
1224 if dat == nil {
1225 dat = LoadDatokFile("testdata/tokenizer_de.datok")
1226 }
1227 assert.NotNil(dat)
1228
1229 b := make([]byte, 0, 2048)
1230 w := bytes.NewBuffer(b)
1231
1232 for _, token := range ttokenLines(t, "testdata/de/split.txt") {
1233 tokens := ttokenize(dat, w, token)
1234 assert.Greaterf(len(tokens), 1, "should split %q", token)
1235 }
1236}
1237
Akronc9c0eae2021-10-22 19:49:43 +02001238func TestDoubleArrayLoadFactor1(t *testing.T) {
Akron29e306f2021-09-02 18:29:56 +02001239 assert := assert.New(t)
1240 tok := LoadFomaFile("testdata/abbr_bench.fst")
1241 dat := tok.ToDoubleArray()
1242 assert.True(dat.LoadFactor() > 88)
1243}
1244
Akronc9c0eae2021-10-22 19:49:43 +02001245func TestDoubleArrayFullTokenizerXML(t *testing.T) {
Akron4c2a1ad2021-08-31 00:35:53 +02001246 assert := assert.New(t)
1247
Akron9fb63af2021-10-28 01:15:53 +02001248 if dat == nil {
Akron0139bc52023-08-31 16:35:58 +02001249 dat = LoadDatokFile("testdata/tokenizer_de.datok")
Akron9fb63af2021-10-28 01:15:53 +02001250 }
1251
Akron4c2a1ad2021-08-31 00:35:53 +02001252 assert.NotNil(dat)
1253
1254 b := make([]byte, 0, 2048)
1255 w := bytes.NewBuffer(b)
1256 var tokens []string
1257
1258 tokens = ttokenize(dat, w, "Das <b>beste</b> Fußballspiel")
1259 assert.Equal("Das", tokens[0])
1260 assert.Equal("<b>", tokens[1])
1261 assert.Equal("beste", tokens[2])
1262 assert.Equal("</b>", tokens[3])
1263 assert.Equal("Fußballspiel", tokens[4])
1264 assert.Equal(5, len(tokens))
1265
1266 tokens = ttokenize(dat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
1267 assert.Equal("Das", tokens[0])
1268 assert.Equal("<b class=\"c\">", tokens[1])
1269 assert.Equal("beste", tokens[2])
1270 assert.Equal("</b>", tokens[3])
1271 assert.Equal("Fußballspiel", tokens[4])
1272 assert.Equal(5, len(tokens))
1273
1274 tokens = ttokenize(dat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
1275 assert.Equal("der", tokens[0])
1276 assert.Equal("<x y=\"alte \">", tokens[1])
1277 assert.Equal("<x x>", tokens[2])
1278 assert.Equal("alte", tokens[3])
1279 assert.Equal("</x>", tokens[4])
1280 assert.Equal("etc.", tokens[5])
1281 assert.Equal("et", tokens[6])
1282 assert.Equal(".", tokens[7])
1283 assert.Equal("Mann", tokens[8])
1284 assert.Equal(".", tokens[9])
1285 assert.Equal(10, len(tokens))
1286}
1287
Akronc9c0eae2021-10-22 19:49:43 +02001288func BenchmarkDoubleArrayTransduce(b *testing.B) {
Akronbd406802021-08-11 18:39:13 +02001289 bu := make([]byte, 0, 2048)
1290 w := bytes.NewBuffer(bu)
1291
1292 s := `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
1293 Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
1294 Der Termin ist am 5.9.2018.
1295 Ich habe die readme.txt heruntergeladen.
1296 Ausschalten!!! Hast Du nicht gehört???
1297 Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
1298 Er sagte: \"Es geht mir gut!\", daraufhin ging er. &quot;Das ist von C&A!&quot; Früher bzw. später ... Sie erreichte den 1. Platz!
1299 Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
1300 Mach's macht's was'n ist's haste willste kannste biste kriegste.`
1301 r := strings.NewReader(s)
1302
Akron0139bc52023-08-31 16:35:58 +02001303 dat := LoadDatokFile("testdata/tokenizer_de.datok")
Akronbd406802021-08-11 18:39:13 +02001304
Akrondf37a552021-09-02 12:16:08 +02001305 b.ResetTimer()
1306
Akronbd406802021-08-11 18:39:13 +02001307 for i := 0; i < b.N; i++ {
1308 w.Reset()
1309 r.Reset(s)
1310 ok := dat.Transduce(r, w)
1311 if !ok {
1312 fmt.Println("Fail!")
1313 fmt.Println(w.String())
1314 os.Exit(1)
1315 }
1316 }
Akronbd406802021-08-11 18:39:13 +02001317}
Akronbb4aac52021-08-13 00:52:27 +02001318
Akron6f1c16c2021-08-17 10:45:42 +02001319// This test is deprecated as the datok file changes over time
1320func XBenchmarkLoadDatokFile(b *testing.B) {
Akronbb4aac52021-08-13 00:52:27 +02001321 for i := 0; i < b.N; i++ {
Akron0139bc52023-08-31 16:35:58 +02001322 dat := LoadDatokFile("testdata/tokenizer_de.datok")
Akronbb4aac52021-08-13 00:52:27 +02001323 if dat == nil {
1324 fmt.Println("Fail!")
1325 os.Exit(1)
1326 }
1327 }
1328}
1329
Akronc9c0eae2021-10-22 19:49:43 +02001330func BenchmarkDoubleArrayConstruction(b *testing.B) {
Akron6f1c16c2021-08-17 10:45:42 +02001331 tok := LoadFomaFile("testdata/simple_bench.fst")
Akrondf37a552021-09-02 12:16:08 +02001332 b.ResetTimer()
Akron6f1c16c2021-08-17 10:45:42 +02001333 for i := 0; i < b.N; i++ {
1334 dat := tok.ToDoubleArray()
1335 if dat == nil {
1336 fmt.Println("Fail!")
1337 os.Exit(1)
1338 }
1339 }
1340}
1341
Akronc9c0eae2021-10-22 19:49:43 +02001342func BenchmarkDoubleArrayLarger(b *testing.B) {
Akron7b1faa62021-09-02 16:10:21 +02001343 tok := LoadFomaFile("testdata/abbr_bench.fst")
1344 b.ResetTimer()
1345 for i := 0; i < b.N; i++ {
1346 dat := tok.ToDoubleArray()
1347 if dat == nil {
1348 fmt.Println("Fail!")
1349 os.Exit(1)
1350 }
1351 }
1352}
1353
Akronbb4aac52021-08-13 00:52:27 +02001354// 2021-08-11 (go 1.16)
1355// go test -bench=. -test.benchmem
1356// BenchmarkTransduce-4 19069 60609 ns/op 11048 B/op 137 allocs/op
Akronf1a16502021-08-16 15:24:38 +02001357// 2021-08-12 (go 1.16)
Akronbb4aac52021-08-13 00:52:27 +02001358// BenchmarkTransduce-4 20833 55241 ns/op 9676 B/op 3 allocs/op
1359// BenchmarkLoadDatokFile-4 4 258418169 ns/op 29916470 B/op 5697 allocs/op
1360// BenchmarkTransduce-4 19430 58133 ns/op 18696 B/op 3 allocs/op
1361// BenchmarkLoadDatokFile-4 8 139071939 ns/op 203158377 B/op 5742 allocs/op
Akronf1a16502021-08-16 15:24:38 +02001362// 2021-08-16
1363// BenchmarkTransduce-4 22251 49989 ns/op 17370 B/op 3 allocs/op
1364// BenchmarkLoadDatokFile-4 8 138937532 ns/op 203158327 B/op 5742 allocs/op
1365// BenchmarkTransduce-4 22005 48665 ns/op 17472 B/op 3 allocs/op
1366// BenchmarkLoadDatokFile-4 7 143143934 ns/op 203158450 B/op 5743 allocs/op
Akronea46e8a2021-08-17 00:36:31 +02001367// BenchmarkTransduce-4 34939 34363 ns/op 14056 B/op 3 allocs/op
1368// BenchmarkLoadDatokFile-4 7 149511609 ns/op 203217193 B/op 5915 allocs/op
Akron6f1c16c2021-08-17 10:45:42 +02001369// 2021-08-17
1370// BenchmarkTransduce-4 31204 32678 ns/op 14752 B/op 3 allocs/op
1371// BenchmarkToDoubleArray-4 44138 26850 ns/op 10704 B/op 29 allocs/op
Akronde18e902021-08-27 09:34:12 +02001372// BenchmarkTransduce-4 29376 34562 ns/op 15157 B/op 3 allocs/op
1373// BenchmarkToDoubleArray-4 54441 21355 ns/op 10704 B/op 29 allocs/op
Akrondf37a552021-09-02 12:16:08 +02001374// 2021-09-02 - New tokenizer - fixed loading
Akron7b1faa62021-09-02 16:10:21 +02001375// BenchmarkTransduce-4 40149 31515 ns/op 8240 B/op 3 allocs/op
1376// BenchmarkToDoubleArray-4 51043 22586 ns/op 10702 B/op 29 allocs/op
1377// BenchmarkToDoubleArrayLarger-4 3 396009639 ns/op 6352293 B/op 2575 allocs/op
1378// BenchmarkTransduce-4 38698 31900 ns/op 8240 B/op 3 allocs/op
1379// BenchmarkToDoubleArray-4 50644 21569 ns/op 11151 B/op 14 allocs/op
1380// BenchmarkToDoubleArrayLarger-4 3 441260766 ns/op 6942336 B/op 30 allocs/op
1381// BenchmarkTransduce-4 39966 30835 ns/op 8240 B/op 3 allocs/op
1382// BenchmarkToDoubleArray-4 50720 24863 ns/op 11091 B/op 46 allocs/op
1383// BenchmarkToDoubleArrayLarger-4 3 432523828 ns/op 6413381 B/op 5122 allocs/op
Akron679b4862021-09-02 16:59:26 +02001384// 2021-09-02 - xCheckSkip() with .9
1385// BenchmarkTransduce-4 36325 38501 ns/op 8240 B/op 3 allocs/op
1386// BenchmarkToDoubleArray-4 66858 19286 ns/op 10607 B/op 29 allocs/op
1387// BenchmarkToDoubleArrayLarger-4 18 67428011 ns/op 6360604 B/op 2578 allocs/op
Akron29e306f2021-09-02 18:29:56 +02001388// 2021-09-02 - xCheckSkipNiu() with .9 and >= 3
1389// BenchmarkTransduce-4 37105 27714 ns/op 8240 B/op 3 allocs/op
1390// BenchmarkToDoubleArray-4 76600 15973 ns/op 10703 B/op 29 allocs/op
1391// BenchmarkToDoubleArrayLarger-4 21 55161934 ns/op 6357889 B/op 2578 allocs/op
Akron28031b72021-10-02 13:07:25 +02001392// 2021-09-30 - Go 1.17.1
1393// BenchmarkTransduce-4 47222 25962 ns/op 8240 B/op 3 allocs/op
1394// BenchmarkToDoubleArray-4 69192 17355 ns/op 10704 B/op 29 allocs/op
1395// BenchmarkToDoubleArrayLarger-4 16 65042885 ns/op 6357794 B/op 2576 allocs/op
1396// BenchmarkTransduceMatrix-4 45404 25156 ns/op 8240 B/op 3 allocs/op
Akron094a4e82021-10-02 18:37:00 +02001397// 2021-10-02
1398// BenchmarkTransduce-4 47676 25398 ns/op 8240 B/op 3 allocs/op
1399// BenchmarkToDoubleArray-4 71919 16083 ns/op 10702 B/op 29 allocs/op
1400// BenchmarkToDoubleArrayLarger-4 16 68012819 ns/op 6357920 B/op 2578 allocs/op
1401// BenchmarkTransduceMatrix-4 51529 23678 ns/op 8240 B/op 3 allocs/op
Akrone396a932021-10-19 01:06:13 +02001402// 2021-10-12 - Introduction of Callbacks in Matrix
1403// BenchmarkTransduce-4 46947 26043 ns/op 8240 B/op 3 allocs/op
1404// BenchmarkToDoubleArray-4 65192 16501 ns/op 10703 B/op 29 allocs/op
1405// BenchmarkToDoubleArrayLarger-4 15 69263576 ns/op 6357859 B/op 2577 allocs/op
1406// BenchmarkTransduceMatrix-4 49928 26313 ns/op 12408 B/op 6 allocs/op
1407// 2021-10-18 - Introduction of Callbacks in DA
1408// BenchmarkTransduce-4 41055 30058 ns/op 12408 B/op 6 allocs/op
1409// BenchmarkToDoubleArray-4 64672 17659 ns/op 10703 B/op 29 allocs/op
1410// BenchmarkToDoubleArrayLarger-4 15 71640553 ns/op 6357865 B/op 2577 allocs/op
1411// BenchmarkTransduceMatrix-4 47036 26009 ns/op 12408 B/op 6 allocs/op
Akrona854faa2021-10-22 19:31:08 +02001412// 2021-10-21 - Simplify DA code to ignore final states
1413// BenchmarkTransduce-4 41365 33766 ns/op 12408 B/op 6 allocs/op
1414// BenchmarkToDoubleArray-4 63663 17675 ns/op 10703 B/op 29 allocs/op
1415// BenchmarkToDoubleArrayLarger-4 16 83535733 ns/op 6357874 B/op 2577 allocs/op
1416// BenchmarkTransduceMatrix-4 45362 25258 ns/op 12408 B/op 6 allocs/op
Akron98fbfef2021-10-23 17:02:11 +02001417// 2021-10-22 - Introduce EOT
Akronc9c0eae2021-10-22 19:49:43 +02001418// BenchmarkDoubleArrayTransduce-4 43820 27661 ns/op 12408 B/op 6 allocs/op
1419// BenchmarkDoubleArrayConstruction-4 68259 16608 ns/op 10703 B/op 29 allocs/op
1420// BenchmarkDoubleArrayLarger-4 16 69889532 ns/op 6357901 B/op 2578 allocs/op
1421// BenchmarkMatrixTransduce-4 49426 25105 ns/op 12408 B/op 6 allocs/op
Akron98fbfef2021-10-23 17:02:11 +02001422// 2021-10-23 - Improve offset handling
1423// BenchmarkDoubleArrayTransduce-4 41890 29729 ns/op 12408 B/op 6 allocs/op
1424// BenchmarkDoubleArrayConstruction-4 74510 15879 ns/op 10703 B/op 29 allocs/op
1425// BenchmarkDoubleArrayLarger-4 18 73752383 ns/op 6357956 B/op 2579 allocs/op
1426// BenchmarkMatrixTransduce-4 46870 27140 ns/op 12408 B/op 6 allocs/op
Akron04335c62021-10-28 11:56:00 +02001427// 2021-10-28 - Finalize feature compatibility with KorAP-Tokenizer
1428// BenchmarkDoubleArrayTransduce-4 39130 31612 ns/op 28944 B/op 16 allocs/op
1429// BenchmarkDoubleArrayConstruction-4 79302 14994 ns/op 10703 B/op 29 allocs/op
1430// BenchmarkDoubleArrayLarger-4 18 67942077 ns/op 6357870 B/op 2577 allocs/op
1431// BenchmarkMatrixTransduce-4 39536 30510 ns/op 28944 B/op 16 allocs/op
Akron289414f2021-11-09 19:56:42 +01001432// 2021-11-09 - go 1.17.3
1433// BenchmarkDoubleArrayTransduce-4 35067 34192 ns/op 28944 B/op 17 allocs/op
1434// BenchmarkDoubleArrayConstruction-4 72446 15614 ns/op 10703 B/op 29 allocs/op
1435// BenchmarkDoubleArrayLarger-4 16 71058822 ns/op 6357860 B/op 2577 allocs/op
1436// BenchmarkMatrixTransduce-4 36703 31891 ns/op 28944 B/op 17 allocs/op
Akronfac8abc2021-11-10 07:19:59 +01001437// 2021-11-10 - rearranged longest match operator
Akron4880fb62021-12-05 12:03:05 +01001438// BenchmarkDoubleArrayTransduce-4 34522 33210 ns/op 28944 B/op 17 allocs/op
1439// BenchmarkDoubleArrayConstruction-4 66990 16012 ns/op 10703 B/op 29 allocs/op
1440// BenchmarkDoubleArrayLarger-4 16 62829878 ns/op 6357823 B/op 2576 allocs/op
1441// BenchmarkMatrixTransduce-4 36154 32702 ns/op 28944 B/op 17 allocs/op
1442// 2021-12-04 - optimize identity branch
1443// BenchmarkDoubleArrayTransduce-4 34903 32255 ns/op 28944 B/op 17 allocs/op
1444// BenchmarkDoubleArrayConstruction-4 79394 14561 ns/op 10703 B/op 29 allocs/op
1445// BenchmarkDoubleArrayLarger-4 19 60257675 ns/op 6357911 B/op 2577 allocs/op
1446// BenchmarkMatrixTransduce-4 35076 30581 ns/op 28944 B/op 17 allocs/op
Akron00cecd12021-12-05 13:14:03 +01001447// 2021-12-05 - init identity for sigma < 256
1448// BenchmarkDoubleArrayTransduce-4 35284 31918 ns/op 28944 B/op 17 allocs/op
1449// BenchmarkDoubleArrayConstruction-4 80342 14504 ns/op 10703 B/op 29 allocs/op
1450// BenchmarkDoubleArrayLarger-4 19 60343253 ns/op 6357789 B/op 2575 allocs/op
1451// BenchmarkMatrixTransduce-4 34029 30238 ns/op 28944 B/op 17 allocs/op