package datok

import (
	"bytes"
	"fmt"
	"os"
	"regexp"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

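// dat caches the full tokenizer shared by the tests below: whichever test
// needs it first loads testdata/tokenizer.datok via its "if dat == nil" guard.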
var dat *DaTokenizer

func tmatch(tok Tokenizer, s string) bool {
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	return tok.Transduce(strings.NewReader(s), w)
}

func ttokenize(tok Tokenizer, w *bytes.Buffer, str string) []string {
	w.Reset()
	ok := tok.Transduce(strings.NewReader(str), w)
	if !ok {
		return []string{}
	}
	obj := regexp.MustCompile("\n+")

	tokens := obj.Split(w.String(), -1)
	return tokens[:len(tokens)-1]
}
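
// Illustrative use of the helpers above (a sketch, not a test; it assumes
// the simple tokenizer from testdata/simpletok.fst used in the tests below):
//
//	tok := LoadFomaFile("testdata/simpletok.fst")
//	dat := tok.ToDoubleArray()
//	tmatch(dat, "wald gehen")                          // true
//	ttokenize(dat, bytes.NewBuffer(nil), "wald gehen") // should yield []string{"wald", "gehen"}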

func TestDoubleArraySimpleString(t *testing.T) {
	assert := assert.New(t)

	// bau | bauamt
	tok := LoadFomaFile("testdata/bauamt.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bauamt"))
	assert.False(tmatch(dat, "baum"))
	assert.True(tmatch(dat, "baua"))
}

func TestDoubleArraySimpleBranches(t *testing.T) {
	assert := assert.New(t)

	// (bau | wahl) (amt | en)
	tok := LoadFomaFile("testdata/wahlamt.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bauamt"))
	assert.True(tmatch(dat, "wahlamt"))
	assert.True(tmatch(dat, "bauen"))
	assert.True(tmatch(dat, "wahlen"))
	assert.False(tmatch(dat, "baum"))
}

func TestSimpleTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bad"))
	assert.True(tmatch(dat, "wald gehen"))
}

func TestDoubleArraySimpleTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal(len(tokens), 11)
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal(7, len(tokens))
}

func TestDoubleArrayReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bad"))
	assert.True(tmatch(dat, "wald gehen"))

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := dat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(296), n)

	dat2 := ParseDatok(buf)
	assert.NotNil(dat2)
	assert.Equal(dat.array, dat2.array)
	assert.Equal(dat.sigma, dat2.sigma)
	assert.Equal(dat.epsilon, dat2.epsilon)
	assert.Equal(dat.unknown, dat2.unknown)
	assert.Equal(dat.identity, dat2.identity)
	assert.Equal(dat.final, dat2.final)
	assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
	assert.True(tmatch(dat2, "bau"))
	assert.True(tmatch(dat2, "bad"))
	assert.True(tmatch(dat2, "wald gehen"))

	assert.Equal(dat.TransCount(), 17)
	assert.Equal(dat2.TransCount(), 17)
}

func TestDoubleArrayIgnorableMCS(t *testing.T) {

	// This test relies on final states, which is why it
	// no longer works correctly.

	assert := assert.New(t)
	// File has MCS in sigma but not in net
	tok := LoadFomaFile("testdata/ignorable_mcs.fst")
	assert.NotNil(tok)
	dat := tok.ToDoubleArray()
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// The result is only unambiguous when transducing strictly greedily!
	assert.True(dat.Transduce(strings.NewReader("ab<ab>a"), w))
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
	assert.Equal("a", tokens[0])
	assert.Equal("b", tokens[1])
	assert.Equal("<ab>a", tokens[2])
	assert.Equal(6, len(tokens))
	assert.Equal(dat.TransCount(), 15)
}

func TestDoubleArrayFullTokenizer(t *testing.T) {
	assert := assert.New(t)

	if dat == nil {
		dat = LoadDatokFile("testdata/tokenizer.datok")
	}
	assert.NotNil(dat)
	assert.True(dat.LoadFactor() >= 70)
	assert.Equal(dat.epsilon, 1)
	assert.Equal(dat.unknown, 2)
	assert.Equal(dat.identity, 3)
	assert.Equal(dat.final, 142)
	assert.Equal(len(dat.sigma), 137)
	// assert.True(len(dat.array) > 3000000)
	// assert.True(dat.maxSize > 3000000)
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bad"))
	assert.True(tmatch(dat, "wald gehen"))
}

func TestDoubleArrayTokenizerBranch(t *testing.T) {
	assert := assert.New(t)
	tok := LoadTokenizerFile("testdata/simpletok.datok")
	assert.NotNil(tok)
	assert.Equal(tok.Type(), "DATOK")

	tok = LoadTokenizerFile("testdata/simpletok.matok")
	assert.NotNil(tok)
	assert.Equal(tok.Type(), "MATOK")
}

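// The X prefix keeps the following function from being picked up by `go test`;
// when renamed to TestDoubleArrayFullTokenizerBuild, it rebuilds the double
// array from testdata/tokenizer.fst and (via the commented-out Save call)
// could regenerate testdata/tokenizer.datok.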
func XTestDoubleArrayFullTokenizerBuild(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/tokenizer.fst")
	dat := tok.ToDoubleArray()
	assert.NotNil(dat)
	// n, err := dat.Save("testdata/tokenizer.datok")
	// assert.Nil(err)
	// assert.True(n > 500)
}

func TestDoubleArrayFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	if dat == nil {
		dat = LoadDatokFile("testdata/tokenizer.datok")
	}

	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

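	// Output protocol, as asserted below: one token per line, an empty
	// line after each sentence, and a final empty line at the end of
	// the text (EOT).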
	assert.True(dat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal("", tokens[8])
	assert.Equal(9, len(tokens))

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
}

func TestDoubleArrayFullTokenizerSentenceSplitter(t *testing.T) {
	assert := assert.New(t)

	if dat == nil {
		dat = LoadDatokFile("testdata/tokenizer.datok")
	}

	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(2, len(sentences))

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(2, len(sentences))
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(2, len(sentences))
	assert.Equal("", sentences[0])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(2, len(sentences))

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("\n", sentences[1])
	assert.Equal(2, len(sentences))

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(2, len(sentences))
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(3, len(sentences))
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("\n", sentences[2])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	/*
		Test:
		"\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	*/
}

func TestDoubleArrayFullTokenizerTokenSplitter(t *testing.T) {
	assert := assert.New(t)

	if dat == nil {
		dat = LoadDatokFile("testdata/tokenizer.datok")
	}

	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = ttokenize(dat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = ttokenize(dat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = ttokenize(dat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = ttokenize(dat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = ttokenize(dat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = ttokenize(dat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = ttokenize(dat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = ttokenize(dat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = ttokenize(dat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = ttokenize(dat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = ttokenize(dat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = ttokenize(dat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = ttokenize(dat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = ttokenize(dat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = ttokenize(dat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = ttokenize(dat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = ttokenize(dat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = ttokenize(dat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = ttokenize(dat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = ttokenize(dat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = ttokenize(dat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = ttokenize(dat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = ttokenize(dat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = ttokenize(dat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = ttokenize(dat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = ttokenize(dat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = ttokenize(dat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = ttokenize(dat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1
	tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = ttokenize(dat, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = ttokenize(dat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	tokens = ttokenize(dat, w, "&quot;Das ist von C&A!&quot;")
	assert.Equal(tokens[0], "&quot;")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "&quot;")
	assert.Equal(len(tokens), 7)

	// testTokenizerLongEnd
	tokens = ttokenize(dat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = ttokenize(dat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = ttokenize(dat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = ttokenize(dat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = ttokenize(dat, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOuputArchive
	tokens = ttokenize(dat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = ttokenize(dat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = ttokenize(dat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = ttokenize(dat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOuputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*

		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9 );
		}
	*/
}

func TestDoubleArrayLoadFactor1(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/abbr_bench.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.LoadFactor() > 88)
}

func TestDoubleArrayFullTokenizerXML(t *testing.T) {
	assert := assert.New(t)

	if dat == nil {
		dat = LoadDatokFile("testdata/tokenizer.datok")
	}

	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(dat, w, "Das <b>beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b>", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(dat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b class=\"c\">", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(dat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
	assert.Equal("der", tokens[0])
	assert.Equal("<x y=\"alte \">", tokens[1])
	assert.Equal("<x x>", tokens[2])
	assert.Equal("alte", tokens[3])
	assert.Equal("</x>", tokens[4])
	assert.Equal("etc.", tokens[5])
	assert.Equal("et", tokens[6])
	assert.Equal(".", tokens[7])
	assert.Equal("Mann", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))
}

func BenchmarkDoubleArrayTransduce(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	s := `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
	Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
	Der Termin ist am 5.9.2018.
	Ich habe die readme.txt heruntergeladen.
	Ausschalten!!! Hast Du nicht gehört???
	Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
	Er sagte: \"Es geht mir gut!\", daraufhin ging er. &quot;Das ist von C&A!&quot; Früher bzw. später ... Sie erreichte den 1. Platz!
	Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
	Mach's macht's was'n ist's haste willste kannste biste kriegste.`
	r := strings.NewReader(s)

	dat := LoadDatokFile("testdata/tokenizer.datok")

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		w.Reset()
		r.Reset(s)
		ok := dat.Transduce(r, w)
		if !ok {
			fmt.Println("Fail!")
			fmt.Println(w.String())
			os.Exit(1)
		}
	}
}

// This benchmark is deprecated, as the datok file changes over time
func XBenchmarkLoadDatokFile(b *testing.B) {
	for i := 0; i < b.N; i++ {
		dat := LoadDatokFile("testdata/tokenizer.datok")
		if dat == nil {
			fmt.Println("Fail!")
			os.Exit(1)
		}
	}
}

func BenchmarkDoubleArrayConstruction(b *testing.B) {
	tok := LoadFomaFile("testdata/simple_bench.fst")
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		dat := tok.ToDoubleArray()
		if dat == nil {
			fmt.Println("Fail!")
			os.Exit(1)
		}
	}
}

func BenchmarkDoubleArrayLarger(b *testing.B) {
	tok := LoadFomaFile("testdata/abbr_bench.fst")
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		dat := tok.ToDoubleArray()
		if dat == nil {
			fmt.Println("Fail!")
			os.Exit(1)
		}
	}
}

// 2021-08-11 (go 1.16)
// go test -bench=. -test.benchmem
// BenchmarkTransduce-4 19069 60609 ns/op 11048 B/op 137 allocs/op
// 2021-08-12 (go 1.16)
// BenchmarkTransduce-4 20833 55241 ns/op 9676 B/op 3 allocs/op
// BenchmarkLoadDatokFile-4 4 258418169 ns/op 29916470 B/op 5697 allocs/op
// BenchmarkTransduce-4 19430 58133 ns/op 18696 B/op 3 allocs/op
// BenchmarkLoadDatokFile-4 8 139071939 ns/op 203158377 B/op 5742 allocs/op
// 2021-08-16
// BenchmarkTransduce-4 22251 49989 ns/op 17370 B/op 3 allocs/op
// BenchmarkLoadDatokFile-4 8 138937532 ns/op 203158327 B/op 5742 allocs/op
// BenchmarkTransduce-4 22005 48665 ns/op 17472 B/op 3 allocs/op
// BenchmarkLoadDatokFile-4 7 143143934 ns/op 203158450 B/op 5743 allocs/op
// BenchmarkTransduce-4 34939 34363 ns/op 14056 B/op 3 allocs/op
// BenchmarkLoadDatokFile-4 7 149511609 ns/op 203217193 B/op 5915 allocs/op
// 2021-08-17
// BenchmarkTransduce-4 31204 32678 ns/op 14752 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 44138 26850 ns/op 10704 B/op 29 allocs/op
// BenchmarkTransduce-4 29376 34562 ns/op 15157 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 54441 21355 ns/op 10704 B/op 29 allocs/op
// 2021-09-02 - New tokenizer - fixed loading
// BenchmarkTransduce-4 40149 31515 ns/op 8240 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 51043 22586 ns/op 10702 B/op 29 allocs/op
// BenchmarkToDoubleArrayLarger-4 3 396009639 ns/op 6352293 B/op 2575 allocs/op
// BenchmarkTransduce-4 38698 31900 ns/op 8240 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 50644 21569 ns/op 11151 B/op 14 allocs/op
// BenchmarkToDoubleArrayLarger-4 3 441260766 ns/op 6942336 B/op 30 allocs/op
// BenchmarkTransduce-4 39966 30835 ns/op 8240 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 50720 24863 ns/op 11091 B/op 46 allocs/op
// BenchmarkToDoubleArrayLarger-4 3 432523828 ns/op 6413381 B/op 5122 allocs/op
// 2021-09-02 - xCheckSkip() with .9
// BenchmarkTransduce-4 36325 38501 ns/op 8240 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 66858 19286 ns/op 10607 B/op 29 allocs/op
// BenchmarkToDoubleArrayLarger-4 18 67428011 ns/op 6360604 B/op 2578 allocs/op
// 2021-09-02 - xCheckSkipNiu() with .9 and >= 3
// BenchmarkTransduce-4 37105 27714 ns/op 8240 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 76600 15973 ns/op 10703 B/op 29 allocs/op
// BenchmarkToDoubleArrayLarger-4 21 55161934 ns/op 6357889 B/op 2578 allocs/op
// 2021-09-30 - Go 1.17.1
// BenchmarkTransduce-4 47222 25962 ns/op 8240 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 69192 17355 ns/op 10704 B/op 29 allocs/op
// BenchmarkToDoubleArrayLarger-4 16 65042885 ns/op 6357794 B/op 2576 allocs/op
// BenchmarkTransduceMatrix-4 45404 25156 ns/op 8240 B/op 3 allocs/op
// 2021-10-02
// BenchmarkTransduce-4 47676 25398 ns/op 8240 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 71919 16083 ns/op 10702 B/op 29 allocs/op
// BenchmarkToDoubleArrayLarger-4 16 68012819 ns/op 6357920 B/op 2578 allocs/op
// BenchmarkTransduceMatrix-4 51529 23678 ns/op 8240 B/op 3 allocs/op
// 2021-10-12 - Introduction of Callbacks in Matrix
// BenchmarkTransduce-4 46947 26043 ns/op 8240 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 65192 16501 ns/op 10703 B/op 29 allocs/op
// BenchmarkToDoubleArrayLarger-4 15 69263576 ns/op 6357859 B/op 2577 allocs/op
// BenchmarkTransduceMatrix-4 49928 26313 ns/op 12408 B/op 6 allocs/op
// 2021-10-18 - Introduction of Callbacks in DA
// BenchmarkTransduce-4 41055 30058 ns/op 12408 B/op 6 allocs/op
// BenchmarkToDoubleArray-4 64672 17659 ns/op 10703 B/op 29 allocs/op
// BenchmarkToDoubleArrayLarger-4 15 71640553 ns/op 6357865 B/op 2577 allocs/op
// BenchmarkTransduceMatrix-4 47036 26009 ns/op 12408 B/op 6 allocs/op
// 2021-10-21 - Simplify DA code to ignore final states
// BenchmarkTransduce-4 41365 33766 ns/op 12408 B/op 6 allocs/op
// BenchmarkToDoubleArray-4 63663 17675 ns/op 10703 B/op 29 allocs/op
// BenchmarkToDoubleArrayLarger-4 16 83535733 ns/op 6357874 B/op 2577 allocs/op
// BenchmarkTransduceMatrix-4 45362 25258 ns/op 12408 B/op 6 allocs/op
// 2021-10-22 - Introduce EOT
// BenchmarkDoubleArrayTransduce-4 43820 27661 ns/op 12408 B/op 6 allocs/op
// BenchmarkDoubleArrayConstruction-4 68259 16608 ns/op 10703 B/op 29 allocs/op
// BenchmarkDoubleArrayLarger-4 16 69889532 ns/op 6357901 B/op 2578 allocs/op
// BenchmarkMatrixTransduce-4 49426 25105 ns/op 12408 B/op 6 allocs/op
// 2021-10-23 - Improve offset handling
// BenchmarkDoubleArrayTransduce-4 41890 29729 ns/op 12408 B/op 6 allocs/op
// BenchmarkDoubleArrayConstruction-4 74510 15879 ns/op 10703 B/op 29 allocs/op
// BenchmarkDoubleArrayLarger-4 18 73752383 ns/op 6357956 B/op 2579 allocs/op
// BenchmarkMatrixTransduce-4 46870 27140 ns/op 12408 B/op 6 allocs/op
// 2021-10-28 - Finalize feature compatibility with KorAP-Tokenizer
// BenchmarkDoubleArrayTransduce-4 39130 31612 ns/op 28944 B/op 16 allocs/op
// BenchmarkDoubleArrayConstruction-4 79302 14994 ns/op 10703 B/op 29 allocs/op
// BenchmarkDoubleArrayLarger-4 18 67942077 ns/op 6357870 B/op 2577 allocs/op
// BenchmarkMatrixTransduce-4 39536 30510 ns/op 28944 B/op 16 allocs/op
// 2021-11-09 - go 1.17.3
// BenchmarkDoubleArrayTransduce-4 35067 34192 ns/op 28944 B/op 17 allocs/op
// BenchmarkDoubleArrayConstruction-4 72446 15614 ns/op 10703 B/op 29 allocs/op
// BenchmarkDoubleArrayLarger-4 16 71058822 ns/op 6357860 B/op 2577 allocs/op
// BenchmarkMatrixTransduce-4 36703 31891 ns/op 28944 B/op 17 allocs/op
// 2021-11-10 - rearranged longest match operator
// BenchmarkDoubleArrayTransduce-4 34522 33210 ns/op 28944 B/op 17 allocs/op
// BenchmarkDoubleArrayConstruction-4 66990 16012 ns/op 10703 B/op 29 allocs/op
// BenchmarkDoubleArrayLarger-4 16 62829878 ns/op 6357823 B/op 2576 allocs/op
// BenchmarkMatrixTransduce-4 36154 32702 ns/op 28944 B/op 17 allocs/op
// 2021-12-04 - optimize identity branch
// BenchmarkDoubleArrayTransduce-4 34903 32255 ns/op 28944 B/op 17 allocs/op
// BenchmarkDoubleArrayConstruction-4 79394 14561 ns/op 10703 B/op 29 allocs/op
// BenchmarkDoubleArrayLarger-4 19 60257675 ns/op 6357911 B/op 2577 allocs/op
// BenchmarkMatrixTransduce-4 35076 30581 ns/op 28944 B/op 17 allocs/op
// 2021-12-05 - init identity for sigma < 256
// BenchmarkDoubleArrayTransduce-4 35284 31918 ns/op 28944 B/op 17 allocs/op
// BenchmarkDoubleArrayConstruction-4 80342 14504 ns/op 10703 B/op 29 allocs/op
// BenchmarkDoubleArrayLarger-4 19 60343253 ns/op 6357789 B/op 2575 allocs/op
// BenchmarkMatrixTransduce-4 34029 30238 ns/op 28944 B/op 17 allocs/op