blob: 38eb474edfe519b55c645ec1b59650249cee8406 [file] [log] [blame]
Akron7f1097f2021-09-21 16:00:29 +02001package datok
Akron8ef408b2021-08-02 22:11:04 +02002
3import (
Akron6247a5d2021-08-03 19:18:28 +02004 "bytes"
Akronbd406802021-08-11 18:39:13 +02005 "fmt"
6 "os"
Akron03ca4252021-08-11 13:32:53 +02007 "regexp"
Akron3f8571a2021-08-05 11:18:10 +02008 "strings"
Akron8ef408b2021-08-02 22:11:04 +02009 "testing"
10
11 "github.com/stretchr/testify/assert"
12)
13
Akron1c34ce62021-09-23 23:27:39 +020014func tmatch(tok Tokenizer, s string) bool {
Akronec835ad2021-08-11 18:23:22 +020015 b := make([]byte, 0, 2048)
16 w := bytes.NewBuffer(b)
Akron1c34ce62021-09-23 23:27:39 +020017 return tok.Transduce(strings.NewReader(s), w)
Akronec835ad2021-08-11 18:23:22 +020018}
19
Akron1c34ce62021-09-23 23:27:39 +020020func ttokenize(tok Tokenizer, w *bytes.Buffer, str string) []string {
Akronec835ad2021-08-11 18:23:22 +020021 w.Reset()
Akron1c34ce62021-09-23 23:27:39 +020022 ok := tok.Transduce(strings.NewReader(str), w)
Akronec835ad2021-08-11 18:23:22 +020023 if !ok {
24 return []string{}
25 }
26 obj := regexp.MustCompile("\n+")
27
28 tokens := obj.Split(w.String(), -1)
29 return tokens[:len(tokens)-1]
30}
31
Akronc9c0eae2021-10-22 19:49:43 +020032func TestDoubleArraySimpleString(t *testing.T) {
Akron8ef408b2021-08-02 22:11:04 +020033 assert := assert.New(t)
34
35 // bau | bauamt
Akron64ffd9a2021-08-03 19:55:21 +020036 tok := LoadFomaFile("testdata/bauamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020037 dat := tok.ToDoubleArray()
Akronec835ad2021-08-11 18:23:22 +020038 assert.True(tmatch(dat, "bau"))
39 assert.True(tmatch(dat, "bauamt"))
40 assert.False(tmatch(dat, "baum"))
Akrone396a932021-10-19 01:06:13 +020041 assert.True(tmatch(dat, "baua"))
Akron8ef408b2021-08-02 22:11:04 +020042}
Akron75ebe7f2021-08-03 10:34:10 +020043
Akronc9c0eae2021-10-22 19:49:43 +020044func TestDoubleArraySimpleBranches(t *testing.T) {
Akron75ebe7f2021-08-03 10:34:10 +020045 assert := assert.New(t)
46
47 // (bau | wahl) (amt | en)
Akron64ffd9a2021-08-03 19:55:21 +020048 tok := LoadFomaFile("testdata/wahlamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020049 dat := tok.ToDoubleArray()
Akrone396a932021-10-19 01:06:13 +020050 assert.True(tmatch(dat, "bau"))
Akronec835ad2021-08-11 18:23:22 +020051 assert.True(tmatch(dat, "bauamt"))
52 assert.True(tmatch(dat, "wahlamt"))
53 assert.True(tmatch(dat, "bauen"))
54 assert.True(tmatch(dat, "wahlen"))
55 assert.False(tmatch(dat, "baum"))
Akron75ebe7f2021-08-03 10:34:10 +020056}
Akron730a79c2021-08-03 11:05:29 +020057
58func TestSimpleTokenizer(t *testing.T) {
59 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020060 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020061 dat := tok.ToDoubleArray()
Akronec835ad2021-08-11 18:23:22 +020062 assert.True(tmatch(dat, "bau"))
63 assert.True(tmatch(dat, "bad"))
64 assert.True(tmatch(dat, "wald gehen"))
Akron730a79c2021-08-03 11:05:29 +020065}
Akron740f3d72021-08-03 12:12:34 +020066
Akronc9c0eae2021-10-22 19:49:43 +020067func TestDoubleArraySimpleTokenizerTransduce(t *testing.T) {
Akron84d68e62021-08-04 17:06:52 +020068 assert := assert.New(t)
69 tok := LoadFomaFile("testdata/simpletok.fst")
Akron84d68e62021-08-04 17:06:52 +020070 dat := tok.ToDoubleArray()
Akron3f8571a2021-08-05 11:18:10 +020071
72 r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
73 b := make([]byte, 0, 2048)
74 w := bytes.NewBuffer(b)
Akron524c5432021-08-05 14:14:27 +020075 var tokens []string
Akron524c5432021-08-05 14:14:27 +020076 dat.Transduce(r, w)
77 tokens = strings.Split(w.String(), "\n")
Akrona854faa2021-10-22 19:31:08 +020078 assert.Equal(len(tokens), 11)
Akron3f8571a2021-08-05 11:18:10 +020079 assert.Equal("wald", tokens[0])
80 assert.Equal("gehen", tokens[1])
81 assert.Equal("Da", tokens[2])
82 assert.Equal("kann", tokens[3])
83 assert.Equal("man", tokens[4])
84 assert.Equal("was", tokens[5])
85 assert.Equal("\"erleben\"", tokens[6])
86
Akron524c5432021-08-05 14:14:27 +020087 r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
88 w.Reset()
89 dat.Transduce(r, w)
90 tokens = strings.Split(w.String(), "\n")
91 assert.Equal("In", tokens[0])
92 assert.Equal("den", tokens[1])
93 assert.Equal("Wald", tokens[2])
94 assert.Equal("gehen", tokens[3])
95 assert.Equal("?", tokens[4])
96 assert.Equal("--", tokens[5])
Akron3f8571a2021-08-05 11:18:10 +020097
Akron524c5432021-08-05 14:14:27 +020098 r = strings.NewReader(" g? -- D")
99 w.Reset()
100 dat.Transduce(r, w)
101 tokens = strings.Split(w.String(), "\n")
102 assert.Equal("g", tokens[0])
103 assert.Equal("?", tokens[1])
104 assert.Equal("--", tokens[2])
105 assert.Equal("D", tokens[3])
106 assert.Equal("", tokens[4])
Akron6e70dc82021-08-11 11:33:18 +0200107 assert.Equal("", tokens[5])
Akrona854faa2021-10-22 19:31:08 +0200108 assert.Equal(7, len(tokens))
Akron068874c2021-08-04 15:19:56 +0200109}
110
Akronc9c0eae2021-10-22 19:49:43 +0200111func TestDoubleArrayReadWriteTokenizer(t *testing.T) {
Akron740f3d72021-08-03 12:12:34 +0200112 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +0200113 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +0200114 dat := tok.ToDoubleArray()
Akronec835ad2021-08-11 18:23:22 +0200115 assert.True(tmatch(dat, "bau"))
116 assert.True(tmatch(dat, "bad"))
117 assert.True(tmatch(dat, "wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +0200118
Akron3f8571a2021-08-05 11:18:10 +0200119 b := make([]byte, 0, 1024)
Akron6247a5d2021-08-03 19:18:28 +0200120 buf := bytes.NewBuffer(b)
121 n, err := dat.WriteTo(buf)
122 assert.Nil(err)
Akron29e306f2021-09-02 18:29:56 +0200123 assert.Equal(int64(296), n)
Akron3f8571a2021-08-05 11:18:10 +0200124
125 dat2 := ParseDatok(buf)
126 assert.NotNil(dat2)
127 assert.Equal(dat.array, dat2.array)
128 assert.Equal(dat.sigma, dat2.sigma)
129 assert.Equal(dat.epsilon, dat2.epsilon)
130 assert.Equal(dat.unknown, dat2.unknown)
131 assert.Equal(dat.identity, dat2.identity)
132 assert.Equal(dat.final, dat2.final)
133 assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
Akronec835ad2021-08-11 18:23:22 +0200134 assert.True(tmatch(dat2, "bau"))
135 assert.True(tmatch(dat2, "bad"))
136 assert.True(tmatch(dat2, "wald gehen"))
Akron4fa28b32021-08-27 10:55:41 +0200137
Akron92704eb2021-08-27 10:59:46 +0200138 assert.Equal(dat.TransCount(), 17)
139 assert.Equal(dat2.TransCount(), 17)
Akron6247a5d2021-08-03 19:18:28 +0200140}
141
Akronc9c0eae2021-10-22 19:49:43 +0200142func TestDoubleArrayIgnorableMCS(t *testing.T) {
Akrone396a932021-10-19 01:06:13 +0200143
144 // This test relies on final states. That's why it is
145 // not working correctly anymore.
146
Akron31f3c062021-08-27 10:15:13 +0200147 assert := assert.New(t)
148 // File has MCS in sigma but not in net
149 tok := LoadFomaFile("testdata/ignorable_mcs.fst")
150 assert.NotNil(tok)
151 dat := tok.ToDoubleArray()
152 assert.NotNil(dat)
153
154 b := make([]byte, 0, 2048)
155 w := bytes.NewBuffer(b)
156 var tokens []string
157
158 // Is only unambigous when transducing strictly greedy!
Akrone396a932021-10-19 01:06:13 +0200159 assert.True(dat.Transduce(strings.NewReader("ab<ab>a"), w))
Akron31f3c062021-08-27 10:15:13 +0200160 tokens = strings.Split(w.String(), "\n")
Akrona854faa2021-10-22 19:31:08 +0200161 assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
Akron31f3c062021-08-27 10:15:13 +0200162 assert.Equal("a", tokens[0])
163 assert.Equal("b", tokens[1])
Akrone396a932021-10-19 01:06:13 +0200164 assert.Equal("<ab>a", tokens[2])
Akrona854faa2021-10-22 19:31:08 +0200165 assert.Equal(6, len(tokens))
Akron92704eb2021-08-27 10:59:46 +0200166 assert.Equal(dat.TransCount(), 15)
Akron31f3c062021-08-27 10:15:13 +0200167}
168
Akronc9c0eae2021-10-22 19:49:43 +0200169func TestDoubleArrayFullTokenizer(t *testing.T) {
Akron3a063ef2021-08-05 19:36:35 +0200170 assert := assert.New(t)
Akron3a063ef2021-08-05 19:36:35 +0200171 dat := LoadDatokFile("testdata/tokenizer.datok")
172 assert.NotNil(dat)
173 assert.True(dat.LoadFactor() >= 70)
174 assert.Equal(dat.epsilon, 1)
175 assert.Equal(dat.unknown, 2)
176 assert.Equal(dat.identity, 3)
Akrona854faa2021-10-22 19:31:08 +0200177 assert.Equal(dat.final, 146)
178 assert.Equal(len(dat.sigma), 141)
Akronf1a16502021-08-16 15:24:38 +0200179 assert.True(len(dat.array) > 3600000)
180 assert.True(dat.maxSize > 3600000)
Akronec835ad2021-08-11 18:23:22 +0200181 assert.True(tmatch(dat, "bau"))
182 assert.True(tmatch(dat, "bad"))
183 assert.True(tmatch(dat, "wald gehen"))
Akron740f3d72021-08-03 12:12:34 +0200184}
Akron3f8571a2021-08-05 11:18:10 +0200185
Akronc9c0eae2021-10-22 19:49:43 +0200186func TestDoubleArrayTokenizerBranch(t *testing.T) {
Akron941f2152021-09-26 15:14:25 +0200187 assert := assert.New(t)
188 tok := LoadTokenizerFile("testdata/simpletok.datok")
189 assert.NotNil(tok)
190 assert.Equal(tok.Type(), "DATOK")
191
192 tok = LoadTokenizerFile("testdata/simpletok.matok")
193 assert.NotNil(tok)
194 assert.Equal(tok.Type(), "MATOK")
195}
196
Akronc9c0eae2021-10-22 19:49:43 +0200197func XTestDoubleArrayFullTokenizerBuild(t *testing.T) {
Akrona0bded52021-08-11 15:48:02 +0200198 assert := assert.New(t)
199 tok := LoadFomaFile("testdata/tokenizer.fst")
200 dat := tok.ToDoubleArray()
Akronde18e902021-08-27 09:34:12 +0200201 assert.NotNil(dat)
202 // n, err := dat.Save("testdata/tokenizer.datok")
203 // assert.Nil(err)
204 // assert.True(n > 500)
Akrona0bded52021-08-11 15:48:02 +0200205}
206
Akronc9c0eae2021-10-22 19:49:43 +0200207func TestDoubleArrayFullTokenizerTransduce(t *testing.T) {
Akron3610f102021-08-08 14:13:25 +0200208 assert := assert.New(t)
209
Akrona0bded52021-08-11 15:48:02 +0200210 dat := LoadDatokFile("testdata/tokenizer.datok")
Akron3610f102021-08-08 14:13:25 +0200211 assert.NotNil(dat)
212
Akron3610f102021-08-08 14:13:25 +0200213 b := make([]byte, 0, 2048)
214 w := bytes.NewBuffer(b)
215 var tokens []string
216
Akron03ca4252021-08-11 13:32:53 +0200217 assert.True(dat.Transduce(strings.NewReader("tra. u Du?"), w))
Akron3610f102021-08-08 14:13:25 +0200218
219 tokens = strings.Split(w.String(), "\n")
Akrona854faa2021-10-22 19:31:08 +0200220 assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
Akron3610f102021-08-08 14:13:25 +0200221 assert.Equal("tra", tokens[0])
222 assert.Equal(".", tokens[1])
Akronb4bbb472021-08-09 11:49:38 +0200223 assert.Equal("", tokens[2])
Akronc5d8d432021-08-10 16:48:44 +0200224 assert.Equal("u", tokens[3])
Akronb4bbb472021-08-09 11:49:38 +0200225 assert.Equal("Du", tokens[4])
226 assert.Equal("?", tokens[5])
Akron3610f102021-08-08 14:13:25 +0200227 assert.Equal("", tokens[6])
Akron1594cb82021-08-11 11:14:56 +0200228 assert.Equal("", tokens[7])
Akrona854faa2021-10-22 19:31:08 +0200229 assert.Equal("", tokens[8])
230 assert.Equal(9, len(tokens))
Akron03ca4252021-08-11 13:32:53 +0200231
232 w.Reset()
233 assert.True(dat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
Akrona854faa2021-10-22 19:31:08 +0200234 assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
Akron3f8571a2021-08-05 11:18:10 +0200235}
Akronb7e1f132021-08-10 11:52:31 +0200236
Akronc9c0eae2021-10-22 19:49:43 +0200237func TestDoubleArrayFullTokenizerSentenceSplitter(t *testing.T) {
Akronb7e1f132021-08-10 11:52:31 +0200238 assert := assert.New(t)
239 dat := LoadDatokFile("testdata/tokenizer.datok")
240 assert.NotNil(dat)
241
242 b := make([]byte, 0, 2048)
243 w := bytes.NewBuffer(b)
244 var sentences []string
245
246 // testSentSplitterSimple
Akron1594cb82021-08-11 11:14:56 +0200247 assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
248 sentences = strings.Split(w.String(), "\n\n")
249
Akrona854faa2021-10-22 19:31:08 +0200250 assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
Akron1594cb82021-08-11 11:14:56 +0200251 assert.Equal("Der\nalte\nMann\n.", sentences[0])
Akrona854faa2021-10-22 19:31:08 +0200252 assert.Equal("\n", sentences[1])
253 assert.Equal(2, len(sentences))
Akron1594cb82021-08-11 11:14:56 +0200254
255 w.Reset()
256 assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
257 sentences = strings.Split(w.String(), "\n\n")
Akrona854faa2021-10-22 19:31:08 +0200258 assert.Equal(2, len(sentences))
Akron1594cb82021-08-11 11:14:56 +0200259 assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
Akrona854faa2021-10-22 19:31:08 +0200260 assert.Equal("\n", sentences[1])
Akron1594cb82021-08-11 11:14:56 +0200261
262 w.Reset()
263 assert.True(dat.Transduce(strings.NewReader(""), w))
Akronb7e1f132021-08-10 11:52:31 +0200264 sentences = strings.Split(w.String(), "\n\n")
Akrona854faa2021-10-22 19:31:08 +0200265 assert.Equal(2, len(sentences))
266 assert.Equal("", sentences[0])
Akron1594cb82021-08-11 11:14:56 +0200267
268 w.Reset()
269 assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
270 sentences = strings.Split(w.String(), "\n\n")
271 assert.Equal(len(sentences), 2)
272
273 w.Reset()
274 assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
275 sentences = strings.Split(w.String(), "\n\n")
276 assert.Equal(len(sentences), 2)
277
Akron6e70dc82021-08-11 11:33:18 +0200278 w.Reset()
279 assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
280 sentences = strings.Split(w.String(), "\n\n")
281 assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
Akrona854faa2021-10-22 19:31:08 +0200282 assert.Equal("\n", sentences[1])
283 assert.Equal(2, len(sentences))
Akron1594cb82021-08-11 11:14:56 +0200284
Akron6e70dc82021-08-11 11:33:18 +0200285 w.Reset()
286 assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
287 sentences = strings.Split(w.String(), "\n\n")
Akrona854faa2021-10-22 19:31:08 +0200288 assert.Equal("\n", sentences[1])
289 assert.Equal(2, len(sentences))
Akron1594cb82021-08-11 11:14:56 +0200290
Akron6e70dc82021-08-11 11:33:18 +0200291 w.Reset()
292 assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
293 sentences = strings.Split(w.String(), "\n\n")
294 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200295
Akron6e70dc82021-08-11 11:33:18 +0200296 w.Reset()
297 assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
298 sentences = strings.Split(w.String(), "\n\n")
299 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200300
Akron6e70dc82021-08-11 11:33:18 +0200301 w.Reset()
302 assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
303 sentences = strings.Split(w.String(), "\n\n")
Akrona854faa2021-10-22 19:31:08 +0200304 assert.Equal(2, len(sentences))
Akron6e70dc82021-08-11 11:33:18 +0200305 assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
Akrona854faa2021-10-22 19:31:08 +0200306 assert.Equal("\n", sentences[1])
Akron1594cb82021-08-11 11:14:56 +0200307
Akron6e70dc82021-08-11 11:33:18 +0200308 w.Reset()
309 assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
310 sentences = strings.Split(w.String(), "\n\n")
Akrona854faa2021-10-22 19:31:08 +0200311 assert.Equal(3, len(sentences))
Akron6e70dc82021-08-11 11:33:18 +0200312 assert.Equal("Ausschalten\n!!!", sentences[0])
313 assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
Akrona854faa2021-10-22 19:31:08 +0200314 assert.Equal("\n", sentences[2])
Akron1594cb82021-08-11 11:14:56 +0200315
Akron4af79f12021-08-11 14:48:17 +0200316 w.Reset()
317 assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
318 sentences = strings.Split(w.String(), "\n\n")
319 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200320
321 /*
322 Test:
323 "\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
324 */
Akronb7e1f132021-08-10 11:52:31 +0200325}
Akron03ca4252021-08-11 13:32:53 +0200326
Akronc9c0eae2021-10-22 19:49:43 +0200327func TestDoubleArrayFullTokenizerTokenSplitter(t *testing.T) {
Akron03ca4252021-08-11 13:32:53 +0200328 assert := assert.New(t)
329 dat := LoadDatokFile("testdata/tokenizer.datok")
330 assert.NotNil(dat)
331
332 b := make([]byte, 0, 2048)
333 w := bytes.NewBuffer(b)
334 var tokens []string
335
336 // testTokenizerSimple
Akronec835ad2021-08-11 18:23:22 +0200337 tokens = ttokenize(dat, w, "Der alte Mann")
Akron03ca4252021-08-11 13:32:53 +0200338 assert.Equal(tokens[0], "Der")
339 assert.Equal(tokens[1], "alte")
340 assert.Equal(tokens[2], "Mann")
341 assert.Equal(len(tokens), 3)
342
Akronec835ad2021-08-11 18:23:22 +0200343 tokens = ttokenize(dat, w, "Der alte Mann.")
Akron03ca4252021-08-11 13:32:53 +0200344 assert.Equal(tokens[0], "Der")
345 assert.Equal(tokens[1], "alte")
346 assert.Equal(tokens[2], "Mann")
347 assert.Equal(tokens[3], ".")
348 assert.Equal(len(tokens), 4)
349
350 // testTokenizerAbbr
Akronec835ad2021-08-11 18:23:22 +0200351 tokens = ttokenize(dat, w, "Der Vorsitzende der F.D.P. hat gewählt")
Akron03ca4252021-08-11 13:32:53 +0200352 assert.Equal(tokens[0], "Der")
353 assert.Equal(tokens[1], "Vorsitzende")
354 assert.Equal(tokens[2], "der")
355 assert.Equal(tokens[3], "F.D.P.")
356 assert.Equal(tokens[4], "hat")
357 assert.Equal(tokens[5], "gewählt")
358 assert.Equal(len(tokens), 6)
359 // Ignored in KorAP-Tokenizer
360
361 // testTokenizerHost1
Akronec835ad2021-08-11 18:23:22 +0200362 tokens = ttokenize(dat, w, "Gefunden auf wikipedia.org")
Akron03ca4252021-08-11 13:32:53 +0200363 assert.Equal(tokens[0], "Gefunden")
364 assert.Equal(tokens[1], "auf")
365 assert.Equal(tokens[2], "wikipedia.org")
366 assert.Equal(len(tokens), 3)
367
368 // testTokenizerWwwHost
Akronec835ad2021-08-11 18:23:22 +0200369 tokens = ttokenize(dat, w, "Gefunden auf www.wikipedia.org")
Akron03ca4252021-08-11 13:32:53 +0200370 assert.Equal("Gefunden", tokens[0])
371 assert.Equal("auf", tokens[1])
372 assert.Equal("www.wikipedia.org", tokens[2])
373 assert.Equal(3, len(tokens))
374
375 // testTokenizerWwwUrl
Akronec835ad2021-08-11 18:23:22 +0200376 tokens = ttokenize(dat, w, "Weitere Infos unter www.info.biz/info")
Akron03ca4252021-08-11 13:32:53 +0200377 assert.Equal("www.info.biz/info", tokens[3])
378
379 // testTokenizerFtpHost
380 /*
381 tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
382 assert.Equal("Kann", tokens[0])
383 assert.Equal("von", tokens[1])
384 assert.Equal("ftp.download.org", tokens[2])
385 assert.Equal(5, len(tokens))
386 // Ignored in KorAP-Tokenizer
387 */
388
389 // testTokenizerDash
Akronec835ad2021-08-11 18:23:22 +0200390 tokens = ttokenize(dat, w, "Das war -- spitze")
Akron03ca4252021-08-11 13:32:53 +0200391 assert.Equal(tokens[0], "Das")
392 assert.Equal(tokens[1], "war")
393 assert.Equal(tokens[2], "--")
394 assert.Equal(tokens[3], "spitze")
395 assert.Equal(len(tokens), 4)
396
397 // testTokenizerEmail1
Akronec835ad2021-08-11 18:23:22 +0200398 tokens = ttokenize(dat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
Akron03ca4252021-08-11 13:32:53 +0200399 assert.Equal(tokens[0], "Ich")
400 assert.Equal(tokens[1], "bin")
401 assert.Equal(tokens[2], "unter")
402 assert.Equal(tokens[3], "korap@ids-mannheim.de")
403 assert.Equal(tokens[4], "erreichbar")
404 assert.Equal(tokens[5], ".")
405 assert.Equal(len(tokens), 6)
406
407 // testTokenizerEmail2
Akronec835ad2021-08-11 18:23:22 +0200408 tokens = ttokenize(dat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
Akron03ca4252021-08-11 13:32:53 +0200409 assert.Equal(tokens[0], "Oder")
410 assert.Equal(tokens[1], "unter")
411 assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
412 assert.Equal(tokens[3], ".")
413 assert.Equal(len(tokens), 4)
414
415 // testTokenizerEmail3
Akronec835ad2021-08-11 18:23:22 +0200416 tokens = ttokenize(dat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
Akron03ca4252021-08-11 13:32:53 +0200417 assert.Equal(tokens[0], "Oder")
418 assert.Equal(tokens[1], "unter")
419 assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
420 assert.Equal(tokens[3], ".")
421 assert.Equal(len(tokens), 4)
422 // Ignored in KorAP-Tokenizer
423
424 // testTokenizerDoNotAcceptQuotedEmailNames
Akronec835ad2021-08-11 18:23:22 +0200425 tokens = ttokenize(dat, w, "\"John Doe\"@xx.com")
Akron03ca4252021-08-11 13:32:53 +0200426 assert.Equal("\"", tokens[0])
427 assert.Equal("John", tokens[1])
428 assert.Equal("Doe", tokens[2])
429 assert.Equal("\"", tokens[3])
430 assert.Equal("@xx", tokens[4])
431 assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
432 assert.Equal("com", tokens[6])
433 assert.Equal(7, len(tokens))
434
435 // testTokenizerTwitter
Akronec835ad2021-08-11 18:23:22 +0200436 tokens = ttokenize(dat, w, "Folgt @korap und #korap")
Akron03ca4252021-08-11 13:32:53 +0200437 assert.Equal(tokens[0], "Folgt")
438 assert.Equal(tokens[1], "@korap")
439 assert.Equal(tokens[2], "und")
440 assert.Equal(tokens[3], "#korap")
441 assert.Equal(len(tokens), 4)
442
443 // testTokenizerWeb1
Akronec835ad2021-08-11 18:23:22 +0200444 tokens = ttokenize(dat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
Akron03ca4252021-08-11 13:32:53 +0200445 assert.Equal(tokens[0], "Unsere")
446 assert.Equal(tokens[1], "Website")
447 assert.Equal(tokens[2], "ist")
448 assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
449 assert.Equal(len(tokens), 4)
450
451 // testTokenizerWeb2
Akronec835ad2021-08-11 18:23:22 +0200452 tokens = ttokenize(dat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
Akron03ca4252021-08-11 13:32:53 +0200453 assert.Equal(tokens[0], "Wir")
454 assert.Equal(tokens[1], "sind")
455 assert.Equal(tokens[2], "auch")
456 assert.Equal(tokens[3], "im")
457 assert.Equal(tokens[4], "Internet")
458 assert.Equal(tokens[5], "(")
459 assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
460 assert.Equal(tokens[7], ")")
461 assert.Equal(len(tokens), 8)
462 // Ignored in KorAP-Tokenizer
463
464 // testTokenizerWeb3
Akronec835ad2021-08-11 18:23:22 +0200465 tokens = ttokenize(dat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
Akron03ca4252021-08-11 13:32:53 +0200466 assert.Equal(tokens[0], "Die")
467 assert.Equal(tokens[1], "Adresse")
468 assert.Equal(tokens[2], "ist")
469 assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
470 assert.Equal(tokens[4], ".")
471 assert.Equal(len(tokens), 5)
472 // Ignored in KorAP-Tokenizer
473
474 // testTokenizerServer
Akronec835ad2021-08-11 18:23:22 +0200475 tokens = ttokenize(dat, w, "Unser Server ist 10.0.10.51.")
Akron03ca4252021-08-11 13:32:53 +0200476 assert.Equal(tokens[0], "Unser")
477 assert.Equal(tokens[1], "Server")
478 assert.Equal(tokens[2], "ist")
479 assert.Equal(tokens[3], "10.0.10.51")
480 assert.Equal(tokens[4], ".")
481 assert.Equal(len(tokens), 5)
482
483 // testTokenizerNum
Akronec835ad2021-08-11 18:23:22 +0200484 tokens = ttokenize(dat, w, "Zu 50,4% ist es sicher")
Akron03ca4252021-08-11 13:32:53 +0200485 assert.Equal(tokens[0], "Zu")
486 assert.Equal(tokens[1], "50,4%")
487 assert.Equal(tokens[2], "ist")
488 assert.Equal(tokens[3], "es")
489 assert.Equal(tokens[4], "sicher")
490 assert.Equal(len(tokens), 5)
491 // Differs from KorAP-Tokenizer
492
493 // testTokenizerDate
Akronec835ad2021-08-11 18:23:22 +0200494 tokens = ttokenize(dat, w, "Der Termin ist am 5.9.2018")
Akron03ca4252021-08-11 13:32:53 +0200495 assert.Equal(tokens[0], "Der")
496 assert.Equal(tokens[1], "Termin")
497 assert.Equal(tokens[2], "ist")
498 assert.Equal(tokens[3], "am")
499 assert.Equal(tokens[4], "5.9.2018")
500 assert.Equal(len(tokens), 5)
501
Akronec835ad2021-08-11 18:23:22 +0200502 tokens = ttokenize(dat, w, "Der Termin ist am 5/9/2018")
Akron03ca4252021-08-11 13:32:53 +0200503 assert.Equal(tokens[0], "Der")
504 assert.Equal(tokens[1], "Termin")
505 assert.Equal(tokens[2], "ist")
506 assert.Equal(tokens[3], "am")
507 assert.Equal(tokens[4], "5/9/2018")
508 assert.Equal(len(tokens), 5)
509
510 // testTokenizerDateRange
511 /*
512 tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
513 assert.Equal(tokens[0], "Der")
514 assert.Equal(tokens[1], "Termin")
515 assert.Equal(tokens[2], "war")
516 assert.Equal(tokens[3], "vom")
517 assert.Equal(tokens[4], "4.")
518 assert.Equal(tokens[5], "-")
519 assert.Equal(tokens[6], "5.9.2018")
520 assert.Equal(len(tokens), 7)
521 // Ignored in KorAP-Tokenizer
522 */
523
524 // testTokenizerEmoji1
Akronec835ad2021-08-11 18:23:22 +0200525 tokens = ttokenize(dat, w, "Das ist toll! ;)")
Akron03ca4252021-08-11 13:32:53 +0200526 assert.Equal(tokens[0], "Das")
527 assert.Equal(tokens[1], "ist")
528 assert.Equal(tokens[2], "toll")
529 assert.Equal(tokens[3], "!")
530 assert.Equal(tokens[4], ";)")
531 assert.Equal(len(tokens), 5)
532
533 // testTokenizerRef1
Akronec835ad2021-08-11 18:23:22 +0200534 tokens = ttokenize(dat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
Akron03ca4252021-08-11 13:32:53 +0200535 assert.Equal(tokens[0], "Kupietz")
536 assert.Equal(tokens[1], "und")
537 assert.Equal(tokens[2], "Schmidt")
538 assert.Equal(tokens[3], "(2018)")
539 assert.Equal(tokens[4], ":")
540 assert.Equal(tokens[5], "Korpuslinguistik")
541 assert.Equal(len(tokens), 6)
542 // Differs from KorAP-Tokenizer!
543
544 // testTokenizerRef2 () {
Akronec835ad2021-08-11 18:23:22 +0200545 tokens = ttokenize(dat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
Akron03ca4252021-08-11 13:32:53 +0200546 assert.Equal(tokens[0], "Kupietz")
547 assert.Equal(tokens[1], "und")
548 assert.Equal(tokens[2], "Schmidt")
549 assert.Equal(tokens[3], "[2018]")
550 assert.Equal(tokens[4], ":")
551 assert.Equal(tokens[5], "Korpuslinguistik")
552 assert.Equal(len(tokens), 6)
553 // Differs from KorAP-Tokenizer!
554
555 // testTokenizerOmission1 () {
Akronec835ad2021-08-11 18:23:22 +0200556 tokens = ttokenize(dat, w, "Er ist ein A****loch!")
Akron03ca4252021-08-11 13:32:53 +0200557 assert.Equal(tokens[0], "Er")
558 assert.Equal(tokens[1], "ist")
559 assert.Equal(tokens[2], "ein")
560 assert.Equal(tokens[3], "A****loch")
561 assert.Equal(tokens[4], "!")
562 assert.Equal(len(tokens), 5)
563
564 // testTokenizerOmission2
Akronec835ad2021-08-11 18:23:22 +0200565 tokens = ttokenize(dat, w, "F*ck!")
Akron03ca4252021-08-11 13:32:53 +0200566 assert.Equal(tokens[0], "F*ck")
567 assert.Equal(tokens[1], "!")
568 assert.Equal(len(tokens), 2)
569
570 // testTokenizerOmission3 () {
Akronec835ad2021-08-11 18:23:22 +0200571 tokens = ttokenize(dat, w, "Dieses verf***** Kleid!")
Akron03ca4252021-08-11 13:32:53 +0200572 assert.Equal(tokens[0], "Dieses")
573 assert.Equal(tokens[1], "verf*****")
574 assert.Equal(tokens[2], "Kleid")
575 assert.Equal(tokens[3], "!")
576 assert.Equal(len(tokens), 4)
577
578 // Probably interpreted as HOST
579 // testTokenizerFileExtension1
Akronec835ad2021-08-11 18:23:22 +0200580 tokens = ttokenize(dat, w, "Ich habe die readme.txt heruntergeladen")
Akron03ca4252021-08-11 13:32:53 +0200581 assert.Equal(tokens[0], "Ich")
582 assert.Equal(tokens[1], "habe")
583 assert.Equal(tokens[2], "die")
584 assert.Equal(tokens[3], "readme.txt")
585 assert.Equal(tokens[4], "heruntergeladen")
586 assert.Equal(len(tokens), 5)
587
588 // Probably interpreted as HOST
589 // testTokenizerFileExtension2
Akronec835ad2021-08-11 18:23:22 +0200590 tokens = ttokenize(dat, w, "Nimm die README.TXT!")
Akron03ca4252021-08-11 13:32:53 +0200591 assert.Equal(tokens[0], "Nimm")
592 assert.Equal(tokens[1], "die")
593 assert.Equal(tokens[2], "README.TXT")
594 assert.Equal(tokens[3], "!")
595 assert.Equal(len(tokens), 4)
596
597 // Probably interpreted as HOST
598 // testTokenizerFileExtension3
Akronec835ad2021-08-11 18:23:22 +0200599 tokens = ttokenize(dat, w, "Zeig mir profile.jpeg")
Akron03ca4252021-08-11 13:32:53 +0200600 assert.Equal(tokens[0], "Zeig")
601 assert.Equal(tokens[1], "mir")
602 assert.Equal(tokens[2], "profile.jpeg")
603 assert.Equal(len(tokens), 3)
604
605 // testTokenizerFile1
Akron03ca4252021-08-11 13:32:53 +0200606
Akronec835ad2021-08-11 18:23:22 +0200607 tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.docx")
Akrone8837b52021-08-11 17:29:58 +0200608 assert.Equal(tokens[0], "Zeig")
609 assert.Equal(tokens[1], "mir")
610 assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
611 assert.Equal(len(tokens), 3)
Akron03ca4252021-08-11 13:32:53 +0200612
Akrone8837b52021-08-11 17:29:58 +0200613 // testTokenizerFile2
Akronec835ad2021-08-11 18:23:22 +0200614 tokens = ttokenize(dat, w, "Gehe zu /Dokumente/profile.docx")
Akrone8837b52021-08-11 17:29:58 +0200615 assert.Equal(tokens[0], "Gehe")
616 assert.Equal(tokens[1], "zu")
617 assert.Equal(tokens[2], "/Dokumente/profile.docx")
618 assert.Equal(len(tokens), 3)
Akron03ca4252021-08-11 13:32:53 +0200619
Akrone8837b52021-08-11 17:29:58 +0200620 // testTokenizerFile3
Akronec835ad2021-08-11 18:23:22 +0200621 tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
Akrone8837b52021-08-11 17:29:58 +0200622 assert.Equal(tokens[0], "Zeig")
623 assert.Equal(tokens[1], "mir")
624 assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
625 assert.Equal(len(tokens), 3)
626 // Ignored in KorAP-Tokenizer
Akron03ca4252021-08-11 13:32:53 +0200627
Akronfd92d7e2021-08-11 16:31:43 +0200628 // testTokenizerPunct
Akronec835ad2021-08-11 18:23:22 +0200629 tokens = ttokenize(dat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
Akronfd92d7e2021-08-11 16:31:43 +0200630 assert.Equal(tokens[0], "Er")
631 assert.Equal(tokens[1], "sagte")
632 assert.Equal(tokens[2], ":")
633 assert.Equal(tokens[3], "\"")
634 assert.Equal(tokens[4], "Es")
635 assert.Equal(tokens[5], "geht")
636 assert.Equal(tokens[6], "mir")
637 assert.Equal(tokens[7], "gut")
638 assert.Equal(tokens[8], "!")
639 assert.Equal(tokens[9], "\"")
640 assert.Equal(tokens[10], ",")
641 assert.Equal(tokens[11], "daraufhin")
642 assert.Equal(tokens[12], "ging")
643 assert.Equal(tokens[13], "er")
644 assert.Equal(tokens[14], ".")
645 assert.Equal(len(tokens), 15)
Akron03ca4252021-08-11 13:32:53 +0200646
647 // testTokenizerPlusAmpersand
Akronec835ad2021-08-11 18:23:22 +0200648 tokens = ttokenize(dat, w, "&quot;Das ist von C&A!&quot;")
649 assert.Equal(tokens[0], "&quot;")
650 assert.Equal(tokens[1], "Das")
651 assert.Equal(tokens[2], "ist")
652 assert.Equal(tokens[3], "von")
653 assert.Equal(tokens[4], "C&A")
654 assert.Equal(tokens[5], "!")
655 assert.Equal(tokens[6], "&quot;")
656 assert.Equal(len(tokens), 7)
Akron03ca4252021-08-11 13:32:53 +0200657
658 // testTokenizerLongEnd
Akronec835ad2021-08-11 18:23:22 +0200659 tokens = ttokenize(dat, w, "Siehst Du?!!?")
Akron03ca4252021-08-11 13:32:53 +0200660 assert.Equal(tokens[0], "Siehst")
661 assert.Equal(tokens[1], "Du")
662 assert.Equal(tokens[2], "?!!?")
663 assert.Equal(len(tokens), 3)
664
665 // testTokenizerIrishO
Akronec835ad2021-08-11 18:23:22 +0200666 tokens = ttokenize(dat, w, "Peter O'Toole")
Akron03ca4252021-08-11 13:32:53 +0200667 assert.Equal(tokens[0], "Peter")
668 assert.Equal(tokens[1], "O'Toole")
669 assert.Equal(len(tokens), 2)
670
671 // testTokenizerAbr
Akronec835ad2021-08-11 18:23:22 +0200672 tokens = ttokenize(dat, w, "Früher bzw. später ...")
Akron03ca4252021-08-11 13:32:53 +0200673 assert.Equal(tokens[0], "Früher")
674 assert.Equal(tokens[1], "bzw.")
675 assert.Equal(tokens[2], "später")
676 assert.Equal(tokens[3], "...")
677 assert.Equal(len(tokens), 4)
678
679 // testTokenizerUppercaseRule
Akronec835ad2021-08-11 18:23:22 +0200680 tokens = ttokenize(dat, w, "Es war spät.Morgen ist es früh.")
Akron03ca4252021-08-11 13:32:53 +0200681 assert.Equal(tokens[0], "Es")
682 assert.Equal(tokens[1], "war")
683 assert.Equal(tokens[2], "spät")
684 assert.Equal(tokens[3], ".")
685 assert.Equal(tokens[4], "Morgen")
686 assert.Equal(tokens[5], "ist")
687 assert.Equal(tokens[6], "es")
688 assert.Equal(tokens[7], "früh")
689 assert.Equal(tokens[8], ".")
690 assert.Equal(len(tokens), 9)
691 // Ignored in KorAP-Tokenizer
692
693 // testTokenizerOrd
Akronec835ad2021-08-11 18:23:22 +0200694 tokens = ttokenize(dat, w, "Sie erreichte den 1. Platz!")
Akrona0bded52021-08-11 15:48:02 +0200695 assert.Equal(tokens[0], "Sie")
696 assert.Equal(tokens[1], "erreichte")
697 assert.Equal(tokens[2], "den")
698 assert.Equal(tokens[3], "1.")
699 assert.Equal(tokens[4], "Platz")
700 assert.Equal(tokens[5], "!")
701 assert.Equal(len(tokens), 6)
Akron03ca4252021-08-11 13:32:53 +0200702
703 // testNoZipOuputArchive
Akronec835ad2021-08-11 18:23:22 +0200704 tokens = ttokenize(dat, w, "Archive: Ich bin kein zip\n")
Akron03ca4252021-08-11 13:32:53 +0200705 assert.Equal(tokens[0], "Archive")
706 assert.Equal(tokens[1], ":")
707 assert.Equal(tokens[2], "Ich")
708 assert.Equal(tokens[3], "bin")
709 assert.Equal(tokens[4], "kein")
710 assert.Equal(tokens[5], "zip")
711 assert.Equal(6, len(tokens))
712
713 // testTokenizerStrasse
Akronec835ad2021-08-11 18:23:22 +0200714 tokens = ttokenize(dat, w, "Ich wohne in der Weststr. und Du?")
Akron4af79f12021-08-11 14:48:17 +0200715 assert.Equal(tokens[4], "Weststr.")
716 assert.Equal(8, len(tokens))
Akron03ca4252021-08-11 13:32:53 +0200717
718 // germanTokenizerKnowsGermanOmissionWords
Akronec835ad2021-08-11 18:23:22 +0200719 tokens = ttokenize(dat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
Akron03ca4252021-08-11 13:32:53 +0200720 assert.Equal("D'dorf", tokens[0])
721 assert.Equal("Ku'damm", tokens[1])
722 assert.Equal("Lu'hafen", tokens[2])
723 assert.Equal("M'gladbach", tokens[3])
724 assert.Equal("W'schaft", tokens[4])
725 assert.Equal(5, len(tokens))
726
727 // germanTokenizerDoesNOTSeparateGermanContractions
Akronec835ad2021-08-11 18:23:22 +0200728 tokens = ttokenize(dat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
Akron03ca4252021-08-11 13:32:53 +0200729 assert.Equal("mach's", tokens[0])
730 assert.Equal("macht's", tokens[1])
731 assert.Equal("was'n", tokens[2])
732 assert.Equal("ist's", tokens[3])
733 assert.Equal("haste", tokens[4])
734 assert.Equal("willste", tokens[5])
735 assert.Equal("kannste", tokens[6])
736 assert.Equal("biste", tokens[7])
737 assert.Equal("kriegste", tokens[8])
738 assert.Equal(9, len(tokens))
739
740 /*
741 @Test
742 public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
743 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
744 tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
745 assert.Equal("'ve", tokens[1]);
746 assert.Equal("'ll", tokens[3]);
747 assert.Equal("'d", tokens[5]);
748 assert.Equal("'m", tokens[7]);
749 assert.Equal("'re", tokens[9]);
750 assert.Equal("'s", tokens[11]);
751 assert.Equal("is", tokens[12]);
752 assert.Equal("n't", tokens[13]);
753 assert.Equal(14, len(tokens));
754 }
755
756 @Test
757 public void frenchTokenizerKnowsFrenchAbbreviations () {
758 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
759 tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
760 assert.Equal("Approx.", tokens[0]);
761 assert.Equal("juill.", tokens[2]);
762 assert.Equal("prof.", tokens[5]);
763 assert.Equal("exerc.", tokens[15]);
764 assert.Equal("no.", tokens[16]);
765 assert.Equal("pp.", tokens[21]);
766 }
767
768 @Test
769 public void frenchTokenizerKnowsFrenchContractions () {
770 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
771 tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
772 assert.Equal("J'", tokens[0]);
773 assert.Equal("j'", tokens[2]);
774 assert.Equal("qu'", tokens[4]);
775 assert.Equal("d'", tokens[6]);
776 assert.Equal("jusqu'", tokens[8]);
777 assert.Equal("Aujourd'hui", tokens[10]);
778 assert.Equal("D'", tokens[11]); // ’
779 assert.Equal("Quelqu'un", tokens[13]); // ’
780 assert.Equal("Presqu'île", tokens[14]); // ’
781 }
782
783 @Test
784 public void frenchTokenizerKnowsFrenchClitics () {
785 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
786 tokens = tokenize(dat, w, "suis-je sont-elles ")
787 assert.Equal("suis", tokens[0]);
788 assert.Equal("-je", tokens[1]);
789 assert.Equal("sont", tokens[2]);
790 assert.Equal("-elles", tokens[3]);
791 }
792
793 @Test
794 public void testEnglishTokenizerScienceAbbreviations () {
795 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
796 tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
797 assert.Equal("Approx.", tokens[0]);
798 assert.Equal("in", tokens[1]);
799 assert.Equal("Sept.", tokens[2]);
800 assert.Equal("1954", tokens[3]);
801 assert.Equal(",", tokens[4]);
802 assert.Equal("Assoc.", tokens[5]);
803 assert.Equal("Prof.", tokens[6]);
804 assert.Equal("Dr.", tokens[7]);
805 assert.Equal("R.", tokens[8]);
806 assert.Equal("J.", tokens[9]);
807 assert.Equal("Ewing", tokens[10]);
808 assert.Equal("reviewed", tokens[11]);
809 assert.Equal("articles", tokens[12]);
810 assert.Equal("on", tokens[13]);
811 assert.Equal("Enzymol.", tokens[14]);
812 assert.Equal("Bacteriol.", tokens[15]);
813 assert.Equal("effects", tokens[16]);
814 assert.Equal("later", tokens[17]);
815 assert.Equal("published", tokens[18]);
816 assert.Equal("in", tokens[19]);
817 assert.Equal("Nutr.", tokens[20]);
818 assert.Equal("Rheumatol.", tokens[21]);
819 assert.Equal("No.", tokens[22]);
820 assert.Equal("12", tokens[23]);
821 assert.Equal("and", tokens[24]);
822 assert.Equal("Nº.", tokens[25]);
823 assert.Equal("13.", tokens[26]);
824 assert.Equal(",", tokens[27]);
825 assert.Equal("pp.", tokens[28]);
826 assert.Equal("17-18", tokens[29]);
827 assert.Equal(".", tokens[30]);
828 }
829
830 @Test
831 public void englishTokenizerCanGuessWhetherIIsAbbrev () {
832 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
833 tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
834 assert.Equal("I.", tokens[1]);
835 assert.Equal("I", tokens[8]);
836 assert.Equal(".", tokens[9]);
837 assert.Equal("I", tokens[12]);
838 assert.Equal(".", tokens[13]);
839 }
840
841 @Test
842 public void testZipOuputArchive () {
843
844 final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
845 System.setOut(new PrintStream(clearOut));
846 tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
847 assert.Equal(0, len(tokens));
848 }
849 */
850 /*
851
852 @Test
853 public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
854 DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
855 .tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
856 .printOffsets(true)
857 .build();
858 Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
859 assert.Equal("Text1", tokens[0].getType());
860 assert.Equal(len(tokens), 9 );
861 }
862 */
863}
Akronbd406802021-08-11 18:39:13 +0200864
Akronc9c0eae2021-10-22 19:49:43 +0200865func TestDoubleArrayLoadFactor1(t *testing.T) {
Akron29e306f2021-09-02 18:29:56 +0200866 assert := assert.New(t)
867 tok := LoadFomaFile("testdata/abbr_bench.fst")
868 dat := tok.ToDoubleArray()
869 assert.True(dat.LoadFactor() > 88)
870}
871
Akronc9c0eae2021-10-22 19:49:43 +0200872func TestDoubleArrayFullTokenizerXML(t *testing.T) {
Akron4c2a1ad2021-08-31 00:35:53 +0200873 assert := assert.New(t)
874
875 dat := LoadDatokFile("testdata/tokenizer.datok")
876 assert.NotNil(dat)
877
878 b := make([]byte, 0, 2048)
879 w := bytes.NewBuffer(b)
880 var tokens []string
881
882 tokens = ttokenize(dat, w, "Das <b>beste</b> Fußballspiel")
883 assert.Equal("Das", tokens[0])
884 assert.Equal("<b>", tokens[1])
885 assert.Equal("beste", tokens[2])
886 assert.Equal("</b>", tokens[3])
887 assert.Equal("Fußballspiel", tokens[4])
888 assert.Equal(5, len(tokens))
889
890 tokens = ttokenize(dat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
891 assert.Equal("Das", tokens[0])
892 assert.Equal("<b class=\"c\">", tokens[1])
893 assert.Equal("beste", tokens[2])
894 assert.Equal("</b>", tokens[3])
895 assert.Equal("Fußballspiel", tokens[4])
896 assert.Equal(5, len(tokens))
897
898 tokens = ttokenize(dat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
899 assert.Equal("der", tokens[0])
900 assert.Equal("<x y=\"alte \">", tokens[1])
901 assert.Equal("<x x>", tokens[2])
902 assert.Equal("alte", tokens[3])
903 assert.Equal("</x>", tokens[4])
904 assert.Equal("etc.", tokens[5])
905 assert.Equal("et", tokens[6])
906 assert.Equal(".", tokens[7])
907 assert.Equal("Mann", tokens[8])
908 assert.Equal(".", tokens[9])
909 assert.Equal(10, len(tokens))
910}
911
Akronc9c0eae2021-10-22 19:49:43 +0200912func BenchmarkDoubleArrayTransduce(b *testing.B) {
Akronbd406802021-08-11 18:39:13 +0200913 bu := make([]byte, 0, 2048)
914 w := bytes.NewBuffer(bu)
915
916 s := `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
917 Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
918 Der Termin ist am 5.9.2018.
919 Ich habe die readme.txt heruntergeladen.
920 Ausschalten!!! Hast Du nicht gehört???
921 Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
922 Er sagte: \"Es geht mir gut!\", daraufhin ging er. &quot;Das ist von C&A!&quot; Früher bzw. später ... Sie erreichte den 1. Platz!
923 Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
924 Mach's macht's was'n ist's haste willste kannste biste kriegste.`
925 r := strings.NewReader(s)
926
927 dat := LoadDatokFile("testdata/tokenizer.datok")
928
Akrondf37a552021-09-02 12:16:08 +0200929 b.ResetTimer()
930
Akronbd406802021-08-11 18:39:13 +0200931 for i := 0; i < b.N; i++ {
932 w.Reset()
933 r.Reset(s)
934 ok := dat.Transduce(r, w)
935 if !ok {
936 fmt.Println("Fail!")
937 fmt.Println(w.String())
938 os.Exit(1)
939 }
940 }
Akronbd406802021-08-11 18:39:13 +0200941}
Akronbb4aac52021-08-13 00:52:27 +0200942
Akron6f1c16c2021-08-17 10:45:42 +0200943// This test is deprecated as the datok file changes over time
944func XBenchmarkLoadDatokFile(b *testing.B) {
Akronbb4aac52021-08-13 00:52:27 +0200945 for i := 0; i < b.N; i++ {
946 dat := LoadDatokFile("testdata/tokenizer.datok")
947 if dat == nil {
948 fmt.Println("Fail!")
949 os.Exit(1)
950 }
951 }
952}
953
Akronc9c0eae2021-10-22 19:49:43 +0200954func BenchmarkDoubleArrayConstruction(b *testing.B) {
Akron6f1c16c2021-08-17 10:45:42 +0200955 tok := LoadFomaFile("testdata/simple_bench.fst")
Akrondf37a552021-09-02 12:16:08 +0200956 b.ResetTimer()
Akron6f1c16c2021-08-17 10:45:42 +0200957 for i := 0; i < b.N; i++ {
958 dat := tok.ToDoubleArray()
959 if dat == nil {
960 fmt.Println("Fail!")
961 os.Exit(1)
962 }
963 }
964}
965
Akronc9c0eae2021-10-22 19:49:43 +0200966func BenchmarkDoubleArrayLarger(b *testing.B) {
Akron7b1faa62021-09-02 16:10:21 +0200967 tok := LoadFomaFile("testdata/abbr_bench.fst")
968 b.ResetTimer()
969 for i := 0; i < b.N; i++ {
970 dat := tok.ToDoubleArray()
971 if dat == nil {
972 fmt.Println("Fail!")
973 os.Exit(1)
974 }
975 }
976}
977
Akronbb4aac52021-08-13 00:52:27 +0200978// 2021-08-11 (go 1.16)
979// go test -bench=. -test.benchmem
980// BenchmarkTransduce-4 19069 60609 ns/op 11048 B/op 137 allocs/op
Akronf1a16502021-08-16 15:24:38 +0200981// 2021-08-12 (go 1.16)
Akronbb4aac52021-08-13 00:52:27 +0200982// BenchmarkTransduce-4 20833 55241 ns/op 9676 B/op 3 allocs/op
983// BenchmarkLoadDatokFile-4 4 258418169 ns/op 29916470 B/op 5697 allocs/op
984// BenchmarkTransduce-4 19430 58133 ns/op 18696 B/op 3 allocs/op
985// BenchmarkLoadDatokFile-4 8 139071939 ns/op 203158377 B/op 5742 allocs/op
Akronf1a16502021-08-16 15:24:38 +0200986// 2021-08-16
987// BenchmarkTransduce-4 22251 49989 ns/op 17370 B/op 3 allocs/op
988// BenchmarkLoadDatokFile-4 8 138937532 ns/op 203158327 B/op 5742 allocs/op
989// BenchmarkTransduce-4 22005 48665 ns/op 17472 B/op 3 allocs/op
990// BenchmarkLoadDatokFile-4 7 143143934 ns/op 203158450 B/op 5743 allocs/op
Akronea46e8a2021-08-17 00:36:31 +0200991// BenchmarkTransduce-4 34939 34363 ns/op 14056 B/op 3 allocs/op
992// BenchmarkLoadDatokFile-4 7 149511609 ns/op 203217193 B/op 5915 allocs/op
Akron6f1c16c2021-08-17 10:45:42 +0200993// 2021-08-17
994// BenchmarkTransduce-4 31204 32678 ns/op 14752 B/op 3 allocs/op
995// BenchmarkToDoubleArray-4 44138 26850 ns/op 10704 B/op 29 allocs/op
Akronde18e902021-08-27 09:34:12 +0200996// BenchmarkTransduce-4 29376 34562 ns/op 15157 B/op 3 allocs/op
997// BenchmarkToDoubleArray-4 54441 21355 ns/op 10704 B/op 29 allocs/op
Akrondf37a552021-09-02 12:16:08 +0200998// 2021-09-02 - New tokenizer - fixed loading
Akron7b1faa62021-09-02 16:10:21 +0200999// BenchmarkTransduce-4 40149 31515 ns/op 8240 B/op 3 allocs/op
1000// BenchmarkToDoubleArray-4 51043 22586 ns/op 10702 B/op 29 allocs/op
1001// BenchmarkToDoubleArrayLarger-4 3 396009639 ns/op 6352293 B/op 2575 allocs/op
1002// BenchmarkTransduce-4 38698 31900 ns/op 8240 B/op 3 allocs/op
1003// BenchmarkToDoubleArray-4 50644 21569 ns/op 11151 B/op 14 allocs/op
1004// BenchmarkToDoubleArrayLarger-4 3 441260766 ns/op 6942336 B/op 30 allocs/op
1005// BenchmarkTransduce-4 39966 30835 ns/op 8240 B/op 3 allocs/op
1006// BenchmarkToDoubleArray-4 50720 24863 ns/op 11091 B/op 46 allocs/op
1007// BenchmarkToDoubleArrayLarger-4 3 432523828 ns/op 6413381 B/op 5122 allocs/op
Akron679b4862021-09-02 16:59:26 +02001008// 2021-09-02 - xCheckSkip() with .9
1009// BenchmarkTransduce-4 36325 38501 ns/op 8240 B/op 3 allocs/op
1010// BenchmarkToDoubleArray-4 66858 19286 ns/op 10607 B/op 29 allocs/op
1011// BenchmarkToDoubleArrayLarger-4 18 67428011 ns/op 6360604 B/op 2578 allocs/op
Akron29e306f2021-09-02 18:29:56 +02001012// 2021-09-02 - xCheckSkipNiu() with .9 and >= 3
1013// BenchmarkTransduce-4 37105 27714 ns/op 8240 B/op 3 allocs/op
1014// BenchmarkToDoubleArray-4 76600 15973 ns/op 10703 B/op 29 allocs/op
1015// BenchmarkToDoubleArrayLarger-4 21 55161934 ns/op 6357889 B/op 2578 allocs/op
Akron28031b72021-10-02 13:07:25 +02001016// 2021-09-30 - Go 1.17.1
1017// BenchmarkTransduce-4 47222 25962 ns/op 8240 B/op 3 allocs/op
1018// BenchmarkToDoubleArray-4 69192 17355 ns/op 10704 B/op 29 allocs/op
1019// BenchmarkToDoubleArrayLarger-4 16 65042885 ns/op 6357794 B/op 2576 allocs/op
1020// BenchmarkTransduceMatrix-4 45404 25156 ns/op 8240 B/op 3 allocs/op
Akron094a4e82021-10-02 18:37:00 +02001021// 2021-10-02
1022// BenchmarkTransduce-4 47676 25398 ns/op 8240 B/op 3 allocs/op
1023// BenchmarkToDoubleArray-4 71919 16083 ns/op 10702 B/op 29 allocs/op
1024// BenchmarkToDoubleArrayLarger-4 16 68012819 ns/op 6357920 B/op 2578 allocs/op
1025// BenchmarkTransduceMatrix-4 51529 23678 ns/op 8240 B/op 3 allocs/op
Akrone396a932021-10-19 01:06:13 +02001026// 2021-10-12 - Introduction of Callbacks in Matrix
1027// BenchmarkTransduce-4 46947 26043 ns/op 8240 B/op 3 allocs/op
1028// BenchmarkToDoubleArray-4 65192 16501 ns/op 10703 B/op 29 allocs/op
1029// BenchmarkToDoubleArrayLarger-4 15 69263576 ns/op 6357859 B/op 2577 allocs/op
1030// BenchmarkTransduceMatrix-4 49928 26313 ns/op 12408 B/op 6 allocs/op
1031// 2021-10-18 - Introduction of Callbacks in DA
1032// BenchmarkTransduce-4 41055 30058 ns/op 12408 B/op 6 allocs/op
1033// BenchmarkToDoubleArray-4 64672 17659 ns/op 10703 B/op 29 allocs/op
1034// BenchmarkToDoubleArrayLarger-4 15 71640553 ns/op 6357865 B/op 2577 allocs/op
1035// BenchmarkTransduceMatrix-4 47036 26009 ns/op 12408 B/op 6 allocs/op
Akrona854faa2021-10-22 19:31:08 +02001036// 2021-10-21 - Simplify DA code to ignore final states
1037// BenchmarkTransduce-4 41365 33766 ns/op 12408 B/op 6 allocs/op
1038// BenchmarkToDoubleArray-4 63663 17675 ns/op 10703 B/op 29 allocs/op
1039// BenchmarkToDoubleArrayLarger-4 16 83535733 ns/op 6357874 B/op 2577 allocs/op
1040// BenchmarkTransduceMatrix-4 45362 25258 ns/op 12408 B/op 6 allocs/op
// 2021-10-21 - Introduce EOT
1042// BenchmarkDoubleArrayTransduce-4 43820 27661 ns/op 12408 B/op 6 allocs/op
1043// BenchmarkDoubleArrayConstruction-4 68259 16608 ns/op 10703 B/op 29 allocs/op
1044// BenchmarkDoubleArrayLarger-4 16 69889532 ns/op 6357901 B/op 2578 allocs/op
1045// BenchmarkMatrixTransduce-4 49426 25105 ns/op 12408 B/op 6 allocs/op