blob: e58005fc42318906893d230118ae02d3a40c5dd0 [file] [log] [blame]
Akron1c34ce62021-09-23 23:27:39 +02001package datok
2
3import (
4 "bytes"
Akron28031b72021-10-02 13:07:25 +02005 "fmt"
6 "os"
Akron1c34ce62021-09-23 23:27:39 +02007 "strings"
8 "testing"
9
10 "github.com/stretchr/testify/assert"
11)
12
// s is a multi-line German sample text mixing abbreviations, host names,
// e-mail addresses, URLs, IP addresses, dates, file names, repeated
// punctuation, quotations and clitic forms.
var s = `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
Der Termin ist am 5.9.2018.
Ich habe die readme.txt heruntergeladen.
Ausschalten!!! Hast Du nicht gehört???
Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
Er sagte: \"Es geht mir gut!\", daraufhin ging er. "Das ist von C&A!" Früher bzw. später ... Sie erreichte den 1. Platz!
Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
Mach's macht's was'n ist's haste willste kannste biste kriegste.`
22
Akronbe3d3662023-04-26 13:22:38 +020023var mat_de, mat_en *MatrixTokenizer
Akron9fb63af2021-10-28 01:15:53 +020024
Akronc9c0eae2021-10-22 19:49:43 +020025func TestMatrixFullTokenizer(t *testing.T) {
Akron1c34ce62021-09-23 23:27:39 +020026 assert := assert.New(t)
27 foma := LoadFomaFile("testdata/simpletok.fst")
28 assert.NotNil(foma)
29
30 mat := foma.ToMatrix()
31
32 r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
33 b := make([]byte, 0, 2048)
34 w := bytes.NewBuffer(b)
35 var tokens []string
36 mat.Transduce(r, w)
37 tokens = strings.Split(w.String(), "\n")
Akrona854faa2021-10-22 19:31:08 +020038 assert.Equal(len(tokens), 11)
Akron1c34ce62021-09-23 23:27:39 +020039 assert.Equal("wald", tokens[0])
40 assert.Equal("gehen", tokens[1])
41 assert.Equal("Da", tokens[2])
42 assert.Equal("kann", tokens[3])
43 assert.Equal("man", tokens[4])
44 assert.Equal("was", tokens[5])
45 assert.Equal("\"erleben\"", tokens[6])
46 assert.Equal("!", tokens[7])
Akron5c82a922021-09-24 19:11:29 +020047
48 r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
49 w.Reset()
50 mat.Transduce(r, w)
51 tokens = strings.Split(w.String(), "\n")
52 assert.Equal("In", tokens[0])
53 assert.Equal("den", tokens[1])
54 assert.Equal("Wald", tokens[2])
55 assert.Equal("gehen", tokens[3])
56 assert.Equal("?", tokens[4])
57 assert.Equal("--", tokens[5])
58
59 r = strings.NewReader(" g? -- D")
60 w.Reset()
61 mat.Transduce(r, w)
62 tokens = strings.Split(w.String(), "\n")
63 assert.Equal("g", tokens[0])
64 assert.Equal("?", tokens[1])
65 assert.Equal("--", tokens[2])
66 assert.Equal("D", tokens[3])
67 assert.Equal("", tokens[4])
68 assert.Equal("", tokens[5])
Akrona854faa2021-10-22 19:31:08 +020069 assert.Equal("", tokens[6])
70 assert.Equal(7, len(tokens))
Akron5c82a922021-09-24 19:11:29 +020071}
72
Akrondf275812022-03-27 12:54:46 +020073func TestMatrixSimpleString(t *testing.T) {
74 assert := assert.New(t)
75 // bau | bauamt
76 tok := LoadFomaFile("testdata/bauamt.fst")
77 mat := tok.ToMatrix()
78
79 b := make([]byte, 0, 2048)
80 w := bytes.NewBuffer(b)
81 var tokens []string
82
83 tokens = ttokenize(mat, w, "ibauamt")
84 assert.Equal("i", tokens[0])
85 assert.Equal("bauamt", tokens[1])
86
87 tokens = ttokenize(mat, w, "ibbauamt")
88 assert.Equal("i", tokens[0])
89
90 assert.Equal("b", tokens[1])
91 assert.Equal("bauamt", tokens[2])
92
93 tokens = ttokenize(mat, w, "bau")
94 assert.Equal("bau", tokens[0])
95
96 tokens = ttokenize(mat, w, "baum")
97 assert.Equal("bau", tokens[0])
98 assert.Equal("m", tokens[1])
99
100 tokens = ttokenize(mat, w, "baudibauamt")
101 assert.Equal("bau", tokens[0])
102 assert.Equal("d", tokens[1])
103 assert.Equal("i", tokens[2])
104 assert.Equal("bauamt", tokens[3])
105}
106
Akroncae39112023-04-26 19:43:16 +0200107func TestMatrixCliticRule(t *testing.T) {
108 assert := assert.New(t)
109 mat := LoadMatrixFile("testdata/clitic_test.matok")
110
111 b := make([]byte, 0, 2048)
112 w := bytes.NewBuffer(b)
113 var tokens []string
114
115 tokens = ttokenize(mat, w, "ibauamt")
116 assert.Equal("ibauamt", tokens[0])
117
118 exstring := "dead. "
119
120 tokens = ttokenize(mat, w, exstring)
121 assert.Equal("dead", tokens[0])
122 assert.Equal(".", tokens[1])
123
124 w.Reset()
125 tws := NewTokenWriter(w, TOKENS|SENTENCES)
126
127 assert.True(mat.TransduceTokenWriter(
Akron72a64222023-04-26 17:00:45 +0200128
Akroncae39112023-04-26 19:43:16 +0200129 strings.NewReader(exstring), tws),
130 )
131 tws.Flush()
132
133 matStr := w.String()
Akron72a64222023-04-26 17:00:45 +0200134 assert.Equal("dead\n.\n\n\n", matStr)
135
136 tokens = ttokenize(mat, w, "they're")
137 assert.Equal("they", tokens[0])
138 assert.Equal("'re", tokens[1])
139
140 tokens = ttokenize(mat, w, "they're They're their don't wouldn't")
141 assert.Equal("they", tokens[0])
142 assert.Equal("'re", tokens[1])
143 assert.Equal("They", tokens[2])
144 assert.Equal("'re", tokens[3])
145 assert.Equal("their", tokens[4])
146 assert.Equal("do", tokens[5])
147 assert.Equal("n't", tokens[6])
148 assert.Equal("would", tokens[7])
149 assert.Equal("n't", tokens[8])
Akroncae39112023-04-26 19:43:16 +0200150}
151
Akronc9c0eae2021-10-22 19:49:43 +0200152func TestMatrixReadWriteTokenizer(t *testing.T) {
Akron16c312e2021-09-26 13:11:12 +0200153 assert := assert.New(t)
154 foma := LoadFomaFile("testdata/simpletok.fst")
155 assert.NotNil(foma)
156
157 mat := foma.ToMatrix()
Akron28031b72021-10-02 13:07:25 +0200158 assert.NotNil(mat)
Akron16c312e2021-09-26 13:11:12 +0200159
Akrondf275812022-03-27 12:54:46 +0200160 assert.Equal(ttokenizeStr(mat, "bau"), "bau")
161 assert.Equal(ttokenizeStr(mat, "bad"), "bad")
162 assert.Equal(ttokenizeStr(mat, "wald gehen"), "wald\ngehen")
Akron16c312e2021-09-26 13:11:12 +0200163 b := make([]byte, 0, 1024)
164 buf := bytes.NewBuffer(b)
165 n, err := mat.WriteTo(buf)
166 assert.Nil(err)
Akron28031b72021-10-02 13:07:25 +0200167 assert.Equal(int64(230), n)
Akron16c312e2021-09-26 13:11:12 +0200168 mat2 := ParseMatrix(buf)
169 assert.NotNil(mat2)
170 assert.Equal(mat.sigma, mat2.sigma)
171 assert.Equal(mat.epsilon, mat2.epsilon)
172 assert.Equal(mat.unknown, mat2.unknown)
173 assert.Equal(mat.identity, mat2.identity)
174 assert.Equal(mat.stateCount, mat2.stateCount)
175 assert.Equal(len(mat.array), len(mat2.array))
176 assert.Equal(mat.array, mat2.array)
Akrondf275812022-03-27 12:54:46 +0200177 assert.Equal(ttokenizeStr(mat2, "bau"), "bau")
178 assert.Equal(ttokenizeStr(mat2, "bad"), "bad")
179 assert.Equal(ttokenizeStr(mat2, "wald gehen"), "wald\ngehen")
Akron16c312e2021-09-26 13:11:12 +0200180}
181
Akrone396a932021-10-19 01:06:13 +0200182func TestMatrixIgnorableMCS(t *testing.T) {
183 assert := assert.New(t)
184
185 // This test relies on final states. That's why it is
186 // not working correctly anymore.
187
188 // File has MCS in sigma but not in net
189 tok := LoadFomaFile("testdata/ignorable_mcs.fst")
190 assert.NotNil(tok)
191 mat := tok.ToMatrix()
192 assert.NotNil(mat)
193
194 b := make([]byte, 0, 2048)
195 w := bytes.NewBuffer(b)
196 var tokens []string
197
198 // Is only unambigous when transducing strictly greedy!
199 assert.True(mat.Transduce(strings.NewReader("ab<ab>a"), w))
200 tokens = strings.Split(w.String(), "\n")
Akrona854faa2021-10-22 19:31:08 +0200201 assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
Akrone396a932021-10-19 01:06:13 +0200202 assert.Equal("a", tokens[0])
203 assert.Equal("b", tokens[1])
204 assert.Equal("<ab>a", tokens[2])
Akrona854faa2021-10-22 19:31:08 +0200205 assert.Equal(6, len(tokens))
Akrone396a932021-10-19 01:06:13 +0200206}
207
Akronc9c0eae2021-10-22 19:49:43 +0200208func xTestMatrixReadWriteFullTokenizer(t *testing.T) {
Akron28031b72021-10-02 13:07:25 +0200209 assert := assert.New(t)
Akron0139bc52023-08-31 16:35:58 +0200210 foma := LoadFomaFile("testdata/tokenizer_de.fst")
Akron28031b72021-10-02 13:07:25 +0200211 assert.NotNil(foma)
212
213 mat := foma.ToMatrix()
214 assert.NotNil(foma)
215
216 tb := make([]byte, 0, 2048)
217 w := bytes.NewBuffer(tb)
218
219 assert.True(mat.Transduce(strings.NewReader("der alte baum"), w))
Akrona854faa2021-10-22 19:31:08 +0200220 assert.Equal("der\nalte\nbaum\n\n\n", w.String())
Akron28031b72021-10-02 13:07:25 +0200221
222 b := make([]byte, 0, 1024)
223 buf := bytes.NewBuffer(b)
224 _, err := mat.WriteTo(buf)
225 assert.Nil(err)
226 w.Reset()
227 // assert.Equal(int64(248), n)
228
229 mat2 := ParseMatrix(buf)
230 assert.NotNil(mat2)
231 assert.Equal(mat.sigma, mat2.sigma)
232 assert.Equal(mat.epsilon, mat2.epsilon)
233 assert.Equal(mat.unknown, mat2.unknown)
234 assert.Equal(mat.identity, mat2.identity)
235 assert.Equal(mat.stateCount, mat2.stateCount)
236 assert.Equal(len(mat.array), len(mat2.array))
237 // assert.Equal(mat.array, mat2.array)
238
239 assert.True(mat2.Transduce(strings.NewReader("der alte baum"), w))
Akrona854faa2021-10-22 19:31:08 +0200240 assert.Equal("der\nalte\nbaum\n\n\n", w.String())
Akron28031b72021-10-02 13:07:25 +0200241}
242
Akronc9c0eae2021-10-22 19:49:43 +0200243func TestMatrixFullTokenizerTransduce(t *testing.T) {
Akron28031b72021-10-02 13:07:25 +0200244 assert := assert.New(t)
245
Akronbe3d3662023-04-26 13:22:38 +0200246 if mat_de == nil {
Akron0139bc52023-08-31 16:35:58 +0200247 mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
Akron9fb63af2021-10-28 01:15:53 +0200248 }
Akron28031b72021-10-02 13:07:25 +0200249
Akronbe3d3662023-04-26 13:22:38 +0200250 assert.NotNil(mat_de)
Akron28031b72021-10-02 13:07:25 +0200251
252 b := make([]byte, 0, 2048)
253 w := bytes.NewBuffer(b)
254 var tokens []string
255
Akronbe3d3662023-04-26 13:22:38 +0200256 assert.True(mat_de.Transduce(strings.NewReader("tra. u Du?"), w))
Akron28031b72021-10-02 13:07:25 +0200257
258 tokens = strings.Split(w.String(), "\n")
Akrona854faa2021-10-22 19:31:08 +0200259 assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
Akron28031b72021-10-02 13:07:25 +0200260 assert.Equal("tra", tokens[0])
261 assert.Equal(".", tokens[1])
262 assert.Equal("", tokens[2])
263 assert.Equal("u", tokens[3])
264 assert.Equal("Du", tokens[4])
265 assert.Equal("?", tokens[5])
266 assert.Equal("", tokens[6])
267 assert.Equal("", tokens[7])
Akrona854faa2021-10-22 19:31:08 +0200268 assert.Equal(9, len(tokens))
Akron28031b72021-10-02 13:07:25 +0200269
270 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200271 assert.True(mat_de.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
Akrona854faa2021-10-22 19:31:08 +0200272 assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
Akron28031b72021-10-02 13:07:25 +0200273}
274
Akronc9c0eae2021-10-22 19:49:43 +0200275func TestMatrixFullTokenizerMatrixSentenceSplitter(t *testing.T) {
Akron5c82a922021-09-24 19:11:29 +0200276 assert := assert.New(t)
Akron9fb63af2021-10-28 01:15:53 +0200277
Akronbe3d3662023-04-26 13:22:38 +0200278 if mat_de == nil {
Akron0139bc52023-08-31 16:35:58 +0200279 mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
Akron9fb63af2021-10-28 01:15:53 +0200280 }
Akron5c82a922021-09-24 19:11:29 +0200281
282 b := make([]byte, 0, 2048)
283 w := bytes.NewBuffer(b)
284 var sentences []string
285
286 // testSentSplitterSimple
Akronbe3d3662023-04-26 13:22:38 +0200287 assert.True(mat_de.Transduce(strings.NewReader("Der alte Mann."), w))
Akron5c82a922021-09-24 19:11:29 +0200288 sentences = strings.Split(w.String(), "\n\n")
289
Akrona854faa2021-10-22 19:31:08 +0200290 assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
Akron5c82a922021-09-24 19:11:29 +0200291 assert.Equal("Der\nalte\nMann\n.", sentences[0])
Akrona854faa2021-10-22 19:31:08 +0200292 assert.Equal("\n", sentences[1])
Akron5c82a922021-09-24 19:11:29 +0200293 assert.Equal(len(sentences), 2)
294
295 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200296 assert.True(mat_de.Transduce(strings.NewReader("Der Vorsitzende der F.D.P. hat gewählt."), w))
Akrona854faa2021-10-22 19:31:08 +0200297 sentences = strings.Split(w.String(), "\n\n")
298 assert.Equal(len(sentences), 2)
299 assert.Equal("Der\nVorsitzende\nder\nF.D.P.\nhat\ngewählt\n.", sentences[0])
300 assert.Equal("\n", sentences[1])
301
302 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200303 assert.True(mat_de.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
Akron5c82a922021-09-24 19:11:29 +0200304 sentences = strings.Split(w.String(), "\n\n")
305 assert.Equal(len(sentences), 2)
306 assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
Akrona854faa2021-10-22 19:31:08 +0200307 assert.Equal("\n", sentences[1])
Akron5c82a922021-09-24 19:11:29 +0200308
Akron28031b72021-10-02 13:07:25 +0200309 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200310 assert.True(mat_de.Transduce(strings.NewReader(""), w))
Akron28031b72021-10-02 13:07:25 +0200311 sentences = strings.Split(w.String(), "\n\n")
Akrona854faa2021-10-22 19:31:08 +0200312 assert.Equal(len(sentences), 2)
313 assert.Equal("", sentences[0])
314 assert.Equal("", sentences[1])
Akron5c82a922021-09-24 19:11:29 +0200315
Akron28031b72021-10-02 13:07:25 +0200316 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200317 assert.True(mat_de.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
Akron28031b72021-10-02 13:07:25 +0200318 sentences = strings.Split(w.String(), "\n\n")
319 assert.Equal(len(sentences), 2)
Akron5c82a922021-09-24 19:11:29 +0200320
Akron28031b72021-10-02 13:07:25 +0200321 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200322 assert.True(mat_de.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
Akron28031b72021-10-02 13:07:25 +0200323 sentences = strings.Split(w.String(), "\n\n")
324 assert.Equal(len(sentences), 2)
Akron5c82a922021-09-24 19:11:29 +0200325
Akron28031b72021-10-02 13:07:25 +0200326 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200327 assert.True(mat_de.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
Akron28031b72021-10-02 13:07:25 +0200328 sentences = strings.Split(w.String(), "\n\n")
329 assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
Akrona854faa2021-10-22 19:31:08 +0200330 assert.Equal("\n", sentences[1])
Akron28031b72021-10-02 13:07:25 +0200331 assert.Equal(len(sentences), 2)
Akron5c82a922021-09-24 19:11:29 +0200332
Akron28031b72021-10-02 13:07:25 +0200333 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200334 assert.True(mat_de.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
Akron28031b72021-10-02 13:07:25 +0200335 sentences = strings.Split(w.String(), "\n\n")
Akrona854faa2021-10-22 19:31:08 +0200336 assert.Equal("\n", sentences[1])
Akron28031b72021-10-02 13:07:25 +0200337 assert.Equal(len(sentences), 2)
Akron5c82a922021-09-24 19:11:29 +0200338
Akron28031b72021-10-02 13:07:25 +0200339 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200340 assert.True(mat_de.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
Akron28031b72021-10-02 13:07:25 +0200341 sentences = strings.Split(w.String(), "\n\n")
342 assert.Equal(len(sentences), 2)
Akron5c82a922021-09-24 19:11:29 +0200343
Akron28031b72021-10-02 13:07:25 +0200344 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200345 assert.True(mat_de.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
Akron28031b72021-10-02 13:07:25 +0200346 sentences = strings.Split(w.String(), "\n\n")
347 assert.Equal(len(sentences), 2)
Akron5c82a922021-09-24 19:11:29 +0200348
Akron28031b72021-10-02 13:07:25 +0200349 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200350 assert.True(mat_de.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
Akron28031b72021-10-02 13:07:25 +0200351 sentences = strings.Split(w.String(), "\n\n")
352 assert.Equal(len(sentences), 2)
353 assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
Akrona854faa2021-10-22 19:31:08 +0200354 assert.Equal("\n", sentences[1])
Akron5c82a922021-09-24 19:11:29 +0200355
Akron28031b72021-10-02 13:07:25 +0200356 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200357 assert.True(mat_de.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
Akron28031b72021-10-02 13:07:25 +0200358 sentences = strings.Split(w.String(), "\n\n")
359 assert.Equal(len(sentences), 3)
360 assert.Equal("Ausschalten\n!!!", sentences[0])
361 assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
Akrona854faa2021-10-22 19:31:08 +0200362 assert.Equal("\n", sentences[2])
Akron5c82a922021-09-24 19:11:29 +0200363
Akron28031b72021-10-02 13:07:25 +0200364 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200365 assert.True(mat_de.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
Akron28031b72021-10-02 13:07:25 +0200366 sentences = strings.Split(w.String(), "\n\n")
367 assert.Equal(len(sentences), 2)
Akrone96895f2022-03-08 19:58:37 +0100368
369 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200370 assert.True(mat_de.Transduce(strings.NewReader("\"Alter!\", sagte er: \"Komm nicht wieder!\" Geh!!! \"Lass!\" Dann ging er."), w))
Akrone96895f2022-03-08 19:58:37 +0100371 sentences = strings.Split(w.String(), "\n\n")
372 assert.Equal(len(sentences), 5)
373 assert.Equal("\"\nAlter\n!\n\"\n,\nsagte\ner\n:\n\"\nKomm\nnicht\nwieder\n!\n\"", sentences[0])
374 assert.Equal("Geh\n!!!", sentences[1])
375 assert.Equal("\"\nLass\n!\n\"", sentences[2])
376 assert.Equal("Dann\nging\ner\n.", sentences[3])
377
378 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200379 assert.True(mat_de.Transduce(strings.NewReader("\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
Akrone96895f2022-03-08 19:58:37 +0100380 sentences = strings.Split(w.String(), "\n\n")
381 assert.Equal(len(sentences), 3)
382 assert.Equal("\"\nAusschalten\n!!!\n\"\n,\nsagte\ner\n.", sentences[0])
383 assert.Equal("\"\nHast\nDu\nnicht\ngehört\n???\n\"", sentences[1])
Akronece3f012022-03-09 19:12:15 +0100384
385 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200386 assert.True(mat_de.Transduce(strings.NewReader("“Ausschalten!!!”, sagte er. «Hast Du nicht gehört???»"), w))
Akronece3f012022-03-09 19:12:15 +0100387 sentences = strings.Split(w.String(), "\n\n")
388 assert.Equal(len(sentences), 3)
389 assert.Equal("“\nAusschalten\n!!!\n”\n,\nsagte\ner\n.", sentences[0])
390 assert.Equal("«\nHast\nDu\nnicht\ngehört\n???\n»", sentences[1])
391
392 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200393 assert.True(mat_de.Transduce(strings.NewReader("“Ausschalten!!!”, sagte er. «Hast Du nicht gehört???»"), w))
Akronece3f012022-03-09 19:12:15 +0100394 sentences = strings.Split(w.String(), "\n\n")
395 assert.Equal(len(sentences), 3)
396 assert.Equal("“\nAusschalten\n!!!\n”\n,\nsagte\ner\n.", sentences[0])
397 assert.Equal("«\nHast\nDu\nnicht\ngehört\n???\n»", sentences[1])
398
399 text := `»Meinetwegen. Denkst du, daß ich darauf warte? Das fehlte noch.
400Übrigens, ich kriege schon einen und vielleicht bald. Da ist mir nicht
401bange. Neulich erst hat mir der kleine Ventivegni von drüben gesagt:
402'Fräulein Effi, was gilt die Wette, wir sind hier noch in diesem Jahre
403zu Polterabend und Hochzeit.'«
404
405»Und was sagtest du da?«`
406
407 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200408 assert.True(mat_de.Transduce(strings.NewReader(text), w))
Akronece3f012022-03-09 19:12:15 +0100409 sentences = strings.Split(w.String(), "\n\n")
410 assert.Equal(len(sentences), 8)
411 assert.Equal("Neulich\nerst\nhat\nmir\nder\nkleine\nVentivegni\nvon\ndrüben\ngesagt\n:\n'\nFräulein\nEffi\n,\nwas\ngilt\ndie\nWette\n,\nwir\nsind\nhier\nnoch\nin\ndiesem\nJahre\nzu\nPolterabend\nund\nHochzeit\n.\n'\n«", sentences[5])
412 assert.Equal("»\nUnd\nwas\nsagtest\ndu\nda\n?\n«", sentences[6])
Akron4222ac82022-03-11 01:06:21 +0100413
414 text = `»Nun, gib dich zufrieden, ich fange schon an ... Also Baron
415Innstetten!`
416
417 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200418 assert.True(mat_de.Transduce(strings.NewReader(text), w))
Akron4222ac82022-03-11 01:06:21 +0100419 sentences = strings.Split(w.String(), "\n\n")
420 assert.Equal(len(sentences), 3)
421 assert.Equal("»\nNun\n,\ngib\ndich\nzufrieden\n,\nich\nfange\nschon\nan\n...", sentences[0])
422 assert.Equal("Also\nBaron\nInnstetten\n!", sentences[1])
Akrondf275812022-03-27 12:54:46 +0200423
Akronb98e4cf2022-03-27 23:56:49 +0200424 // Check parantheses at the end of the sentence
Akronf94b9ce2022-03-27 18:18:09 +0200425 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200426 assert.True(mat_de.Transduce(strings.NewReader("(Er ging.) Und kam (später)."), w))
Akronf94b9ce2022-03-27 18:18:09 +0200427 sentences = strings.Split(w.String(), "\n\n")
428 assert.Equal(len(sentences), 3)
429 assert.Equal("(\nEr\nging\n.\n)", sentences[0])
430 assert.Equal("Und\nkam\n(\nspäter\n)\n.", sentences[1])
Akron7aa1cbe2022-03-30 12:44:04 +0200431
432 // Check parantheses and quotes at the end of the sentence
433 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200434 assert.True(mat_de.Transduce(strings.NewReader("(Er sagte: \"Hallo!\") Dann ging er."), w))
Akron7aa1cbe2022-03-30 12:44:04 +0200435 sentences = strings.Split(w.String(), "\n\n")
436 assert.Equal(len(sentences), 3)
437 assert.Equal("(\nEr\nsagte\n:\n\"\nHallo\n!\n\"\n)", sentences[0])
438 assert.Equal("Dann\nging\ner\n.", sentences[1])
439
Akrondf275812022-03-27 12:54:46 +0200440}
441
442func TestMatrixFullTokenizerMatrixSentenceSplitterBug1(t *testing.T) {
443 assert := assert.New(t)
444
Akronbe3d3662023-04-26 13:22:38 +0200445 if mat_de == nil {
Akron0139bc52023-08-31 16:35:58 +0200446 mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
Akrondf275812022-03-27 12:54:46 +0200447 }
448
449 b := make([]byte, 0, 2048)
450 w := bytes.NewBuffer(b)
451 var sentences []string
452
453 text := `Wüllersdorf war aufgestanden. »Ich finde es furchtbar, daß Sie recht haben, aber Sie haben recht. Ich quäle Sie nicht länger mit meinem 'Muß es sein?'. Die Welt ist einmal, wie sie ist, und die Dinge verlaufen nicht, wie wir wollen, sondern wie die andern wollen. Das mit dem 'Gottesgericht', wie manche hochtrabend versichern, ist freilich ein Unsinn, nichts davon, umgekehrt, unser Ehrenkultus ist ein Götzendienst, aber wir müssen uns ihm unterwerfen, solange der Götze gilt.«`
454
455 w.Reset()
Akronbe3d3662023-04-26 13:22:38 +0200456 assert.True(mat_de.Transduce(strings.NewReader(text), w))
Akrondf275812022-03-27 12:54:46 +0200457 sentences = strings.Split(w.String(), "\n\n")
Akronb4287552022-03-27 14:11:24 +0200458 assert.Equal(len(sentences), 6)
Akrondf275812022-03-27 12:54:46 +0200459 assert.Equal("Wüllersdorf\nwar\naufgestanden\n.", sentences[0])
460 assert.Equal("»\nIch\nfinde\nes\nfurchtbar\n,\ndaß\nSie\nrecht\nhaben\n,\naber\nSie\nhaben\nrecht\n.", sentences[1])
Akronb4287552022-03-27 14:11:24 +0200461 assert.Equal("Ich\nquäle\nSie\nnicht\nlänger\nmit\nmeinem\n'\nMuß\nes\nsein\n?\n'\n.", sentences[2])
462 assert.Equal("Die\nWelt\nist\neinmal\n,\nwie\nsie\nist\n,\nund\ndie\nDinge\nverlaufen\nnicht\n,\nwie\nwir\nwollen\n,\nsondern\nwie\ndie\nandern\nwollen\n.", sentences[3])
463 assert.Equal("Das\nmit\ndem\n'\nGottesgericht\n'\n,\nwie\nmanche\nhochtrabend\nversichern\n,\nist\nfreilich\nein\nUnsinn\n,\nnichts\ndavon\n,\numgekehrt\n,\nunser\nEhrenkultus\nist\nein\nGötzendienst\n,\naber\nwir\nmüssen\nuns\nihm\nunterwerfen\n,\nsolange\nder\nGötze\ngilt\n.\n«", sentences[4])
Akron1c34ce62021-09-23 23:27:39 +0200464}
Akron28031b72021-10-02 13:07:25 +0200465
Akronc9c0eae2021-10-22 19:49:43 +0200466func TestMatrixFullTokenizerTokenSplitter(t *testing.T) {
Akron28031b72021-10-02 13:07:25 +0200467 assert := assert.New(t)
468
Akronbe3d3662023-04-26 13:22:38 +0200469 if mat_de == nil {
Akron0139bc52023-08-31 16:35:58 +0200470 mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
Akron9fb63af2021-10-28 01:15:53 +0200471 }
Akron28031b72021-10-02 13:07:25 +0200472
473 b := make([]byte, 0, 2048)
474 w := bytes.NewBuffer(b)
475 var tokens []string
476
477 // testTokenizerSimple
Akronbe3d3662023-04-26 13:22:38 +0200478 tokens = ttokenize(mat_de, w, "Der alte Mann")
Akron28031b72021-10-02 13:07:25 +0200479 assert.Equal(tokens[0], "Der")
480 assert.Equal(tokens[1], "alte")
481 assert.Equal(tokens[2], "Mann")
482 assert.Equal(len(tokens), 3)
483
Akronbe3d3662023-04-26 13:22:38 +0200484 tokens = ttokenize(mat_de, w, "Der alte Mann.")
Akron28031b72021-10-02 13:07:25 +0200485 assert.Equal(tokens[0], "Der")
486 assert.Equal(tokens[1], "alte")
487 assert.Equal(tokens[2], "Mann")
488 assert.Equal(tokens[3], ".")
489 assert.Equal(len(tokens), 4)
490
491 // testTokenizerAbbr
Akronbe3d3662023-04-26 13:22:38 +0200492 tokens = ttokenize(mat_de, w, "Der Vorsitzende der F.D.P. hat gewählt")
Akron28031b72021-10-02 13:07:25 +0200493 assert.Equal(tokens[0], "Der")
494 assert.Equal(tokens[1], "Vorsitzende")
495 assert.Equal(tokens[2], "der")
496 assert.Equal(tokens[3], "F.D.P.")
497 assert.Equal(tokens[4], "hat")
498 assert.Equal(tokens[5], "gewählt")
499 assert.Equal(len(tokens), 6)
500 // Ignored in KorAP-Tokenizer
501
502 // testTokenizerHost1
Akronbe3d3662023-04-26 13:22:38 +0200503 tokens = ttokenize(mat_de, w, "Gefunden auf wikipedia.org")
Akron28031b72021-10-02 13:07:25 +0200504 assert.Equal(tokens[0], "Gefunden")
505 assert.Equal(tokens[1], "auf")
506 assert.Equal(tokens[2], "wikipedia.org")
507 assert.Equal(len(tokens), 3)
508
509 // testTokenizerWwwHost
Akronbe3d3662023-04-26 13:22:38 +0200510 tokens = ttokenize(mat_de, w, "Gefunden auf www.wikipedia.org")
Akron28031b72021-10-02 13:07:25 +0200511 assert.Equal("Gefunden", tokens[0])
512 assert.Equal("auf", tokens[1])
513 assert.Equal("www.wikipedia.org", tokens[2])
514 assert.Equal(3, len(tokens))
515
516 // testTokenizerWwwUrl
Akronbe3d3662023-04-26 13:22:38 +0200517 tokens = ttokenize(mat_de, w, "Weitere Infos unter www.info.biz/info")
Akron28031b72021-10-02 13:07:25 +0200518 assert.Equal("www.info.biz/info", tokens[3])
519
520 // testTokenizerFtpHost
521 /*
522 tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
523 assert.Equal("Kann", tokens[0])
524 assert.Equal("von", tokens[1])
525 assert.Equal("ftp.download.org", tokens[2])
526 assert.Equal(5, len(tokens))
527 // Ignored in KorAP-Tokenizer
528 */
529
530 // testTokenizerDash
Akronbe3d3662023-04-26 13:22:38 +0200531 tokens = ttokenize(mat_de, w, "Das war -- spitze")
Akron28031b72021-10-02 13:07:25 +0200532 assert.Equal(tokens[0], "Das")
533 assert.Equal(tokens[1], "war")
534 assert.Equal(tokens[2], "--")
535 assert.Equal(tokens[3], "spitze")
536 assert.Equal(len(tokens), 4)
537
538 // testTokenizerEmail1
Akronbe3d3662023-04-26 13:22:38 +0200539 tokens = ttokenize(mat_de, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
Akron28031b72021-10-02 13:07:25 +0200540 assert.Equal(tokens[0], "Ich")
541 assert.Equal(tokens[1], "bin")
542 assert.Equal(tokens[2], "unter")
543 assert.Equal(tokens[3], "korap@ids-mannheim.de")
544 assert.Equal(tokens[4], "erreichbar")
545 assert.Equal(tokens[5], ".")
546 assert.Equal(len(tokens), 6)
547
548 // testTokenizerEmail2
Akronbe3d3662023-04-26 13:22:38 +0200549 tokens = ttokenize(mat_de, w, "Oder unter korap[at]ids-mannheim[dot]de.")
Akron28031b72021-10-02 13:07:25 +0200550 assert.Equal(tokens[0], "Oder")
551 assert.Equal(tokens[1], "unter")
552 assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
553 assert.Equal(tokens[3], ".")
554 assert.Equal(len(tokens), 4)
555
556 // testTokenizerEmail3
Akronbe3d3662023-04-26 13:22:38 +0200557 tokens = ttokenize(mat_de, w, "Oder unter korap(at)ids-mannheim(dot)de.")
Akron28031b72021-10-02 13:07:25 +0200558 assert.Equal(tokens[0], "Oder")
559 assert.Equal(tokens[1], "unter")
560 assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
561 assert.Equal(tokens[3], ".")
562 assert.Equal(len(tokens), 4)
563 // Ignored in KorAP-Tokenizer
564
565 // testTokenizerDoNotAcceptQuotedEmailNames
Akronbe3d3662023-04-26 13:22:38 +0200566 tokens = ttokenize(mat_de, w, "\"John Doe\"@xx.com")
Akron28031b72021-10-02 13:07:25 +0200567 assert.Equal("\"", tokens[0])
568 assert.Equal("John", tokens[1])
569 assert.Equal("Doe", tokens[2])
570 assert.Equal("\"", tokens[3])
571 assert.Equal("@xx", tokens[4])
572 assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
573 assert.Equal("com", tokens[6])
574 assert.Equal(7, len(tokens))
575
576 // testTokenizerTwitter
Akronbe3d3662023-04-26 13:22:38 +0200577 tokens = ttokenize(mat_de, w, "Folgt @korap und #korap")
Akron28031b72021-10-02 13:07:25 +0200578 assert.Equal(tokens[0], "Folgt")
579 assert.Equal(tokens[1], "@korap")
580 assert.Equal(tokens[2], "und")
581 assert.Equal(tokens[3], "#korap")
582 assert.Equal(len(tokens), 4)
583
584 // testTokenizerWeb1
Akronbe3d3662023-04-26 13:22:38 +0200585 tokens = ttokenize(mat_de, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
Akron28031b72021-10-02 13:07:25 +0200586 assert.Equal(tokens[0], "Unsere")
587 assert.Equal(tokens[1], "Website")
588 assert.Equal(tokens[2], "ist")
589 assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
590 assert.Equal(len(tokens), 4)
591
592 // testTokenizerWeb2
Akronbe3d3662023-04-26 13:22:38 +0200593 tokens = ttokenize(mat_de, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
Akron28031b72021-10-02 13:07:25 +0200594 assert.Equal(tokens[0], "Wir")
595 assert.Equal(tokens[1], "sind")
596 assert.Equal(tokens[2], "auch")
597 assert.Equal(tokens[3], "im")
598 assert.Equal(tokens[4], "Internet")
599 assert.Equal(tokens[5], "(")
600 assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
601 assert.Equal(tokens[7], ")")
602 assert.Equal(len(tokens), 8)
603 // Ignored in KorAP-Tokenizer
604
605 // testTokenizerWeb3
Akronbe3d3662023-04-26 13:22:38 +0200606 tokens = ttokenize(mat_de, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
Akron28031b72021-10-02 13:07:25 +0200607 assert.Equal(tokens[0], "Die")
608 assert.Equal(tokens[1], "Adresse")
609 assert.Equal(tokens[2], "ist")
610 assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
611 assert.Equal(tokens[4], ".")
612 assert.Equal(len(tokens), 5)
613 // Ignored in KorAP-Tokenizer
614
615 // testTokenizerServer
Akronbe3d3662023-04-26 13:22:38 +0200616 tokens = ttokenize(mat_de, w, "Unser Server ist 10.0.10.51.")
Akron28031b72021-10-02 13:07:25 +0200617 assert.Equal(tokens[0], "Unser")
618 assert.Equal(tokens[1], "Server")
619 assert.Equal(tokens[2], "ist")
620 assert.Equal(tokens[3], "10.0.10.51")
621 assert.Equal(tokens[4], ".")
622 assert.Equal(len(tokens), 5)
623
624 // testTokenizerNum
Akronbe3d3662023-04-26 13:22:38 +0200625 tokens = ttokenize(mat_de, w, "Zu 50,4% ist es sicher")
Akron28031b72021-10-02 13:07:25 +0200626 assert.Equal(tokens[0], "Zu")
627 assert.Equal(tokens[1], "50,4%")
628 assert.Equal(tokens[2], "ist")
629 assert.Equal(tokens[3], "es")
630 assert.Equal(tokens[4], "sicher")
631 assert.Equal(len(tokens), 5)
632 // Differs from KorAP-Tokenizer
633
634 // testTokenizerDate
Akronbe3d3662023-04-26 13:22:38 +0200635 tokens = ttokenize(mat_de, w, "Der Termin ist am 5.9.2018")
Akron28031b72021-10-02 13:07:25 +0200636 assert.Equal(tokens[0], "Der")
637 assert.Equal(tokens[1], "Termin")
638 assert.Equal(tokens[2], "ist")
639 assert.Equal(tokens[3], "am")
640 assert.Equal(tokens[4], "5.9.2018")
641 assert.Equal(len(tokens), 5)
642
Akronbe3d3662023-04-26 13:22:38 +0200643 tokens = ttokenize(mat_de, w, "Der Termin ist am 5/9/2018")
Akron28031b72021-10-02 13:07:25 +0200644 assert.Equal(tokens[0], "Der")
645 assert.Equal(tokens[1], "Termin")
646 assert.Equal(tokens[2], "ist")
647 assert.Equal(tokens[3], "am")
648 assert.Equal(tokens[4], "5/9/2018")
649 assert.Equal(len(tokens), 5)
650
651 // testTokenizerDateRange
652 /*
653 tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
654 assert.Equal(tokens[0], "Der")
655 assert.Equal(tokens[1], "Termin")
656 assert.Equal(tokens[2], "war")
657 assert.Equal(tokens[3], "vom")
658 assert.Equal(tokens[4], "4.")
659 assert.Equal(tokens[5], "-")
660 assert.Equal(tokens[6], "5.9.2018")
661 assert.Equal(len(tokens), 7)
662 // Ignored in KorAP-Tokenizer
663 */
664
665 // testTokenizerEmoji1
Akronbe3d3662023-04-26 13:22:38 +0200666 tokens = ttokenize(mat_de, w, "Das ist toll! ;)")
Akron28031b72021-10-02 13:07:25 +0200667 assert.Equal(tokens[0], "Das")
668 assert.Equal(tokens[1], "ist")
669 assert.Equal(tokens[2], "toll")
670 assert.Equal(tokens[3], "!")
671 assert.Equal(tokens[4], ";)")
672 assert.Equal(len(tokens), 5)
673
674 // testTokenizerRef1
Akronbe3d3662023-04-26 13:22:38 +0200675 tokens = ttokenize(mat_de, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
Akron28031b72021-10-02 13:07:25 +0200676 assert.Equal(tokens[0], "Kupietz")
677 assert.Equal(tokens[1], "und")
678 assert.Equal(tokens[2], "Schmidt")
679 assert.Equal(tokens[3], "(2018)")
680 assert.Equal(tokens[4], ":")
681 assert.Equal(tokens[5], "Korpuslinguistik")
682 assert.Equal(len(tokens), 6)
683 // Differs from KorAP-Tokenizer!
684
685 // testTokenizerRef2 () {
Akronbe3d3662023-04-26 13:22:38 +0200686 tokens = ttokenize(mat_de, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
Akron28031b72021-10-02 13:07:25 +0200687 assert.Equal(tokens[0], "Kupietz")
688 assert.Equal(tokens[1], "und")
689 assert.Equal(tokens[2], "Schmidt")
690 assert.Equal(tokens[3], "[2018]")
691 assert.Equal(tokens[4], ":")
692 assert.Equal(tokens[5], "Korpuslinguistik")
693 assert.Equal(len(tokens), 6)
694 // Differs from KorAP-Tokenizer!
695
696 // testTokenizerOmission1 () {
Akronbe3d3662023-04-26 13:22:38 +0200697 tokens = ttokenize(mat_de, w, "Er ist ein A****loch!")
Akron28031b72021-10-02 13:07:25 +0200698 assert.Equal(tokens[0], "Er")
699 assert.Equal(tokens[1], "ist")
700 assert.Equal(tokens[2], "ein")
701 assert.Equal(tokens[3], "A****loch")
702 assert.Equal(tokens[4], "!")
703 assert.Equal(len(tokens), 5)
704
705 // testTokenizerOmission2
Akronbe3d3662023-04-26 13:22:38 +0200706 tokens = ttokenize(mat_de, w, "F*ck!")
Akron28031b72021-10-02 13:07:25 +0200707 assert.Equal(tokens[0], "F*ck")
708 assert.Equal(tokens[1], "!")
709 assert.Equal(len(tokens), 2)
710
711 // testTokenizerOmission3 () {
Akronbe3d3662023-04-26 13:22:38 +0200712 tokens = ttokenize(mat_de, w, "Dieses verf***** Kleid!")
Akron28031b72021-10-02 13:07:25 +0200713 assert.Equal(tokens[0], "Dieses")
714 assert.Equal(tokens[1], "verf*****")
715 assert.Equal(tokens[2], "Kleid")
716 assert.Equal(tokens[3], "!")
717 assert.Equal(len(tokens), 4)
718
719 // Probably interpreted as HOST
720 // testTokenizerFileExtension1
Akronbe3d3662023-04-26 13:22:38 +0200721 tokens = ttokenize(mat_de, w, "Ich habe die readme.txt heruntergeladen")
Akron28031b72021-10-02 13:07:25 +0200722 assert.Equal(tokens[0], "Ich")
723 assert.Equal(tokens[1], "habe")
724 assert.Equal(tokens[2], "die")
725 assert.Equal(tokens[3], "readme.txt")
726 assert.Equal(tokens[4], "heruntergeladen")
727 assert.Equal(len(tokens), 5)
728
729 // Probably interpreted as HOST
730 // testTokenizerFileExtension2
Akronbe3d3662023-04-26 13:22:38 +0200731 tokens = ttokenize(mat_de, w, "Nimm die README.TXT!")
Akron28031b72021-10-02 13:07:25 +0200732 assert.Equal(tokens[0], "Nimm")
733 assert.Equal(tokens[1], "die")
734 assert.Equal(tokens[2], "README.TXT")
735 assert.Equal(tokens[3], "!")
736 assert.Equal(len(tokens), 4)
737
738 // Probably interpreted as HOST
739 // testTokenizerFileExtension3
Akronbe3d3662023-04-26 13:22:38 +0200740 tokens = ttokenize(mat_de, w, "Zeig mir profile.jpeg")
Akron28031b72021-10-02 13:07:25 +0200741 assert.Equal(tokens[0], "Zeig")
742 assert.Equal(tokens[1], "mir")
743 assert.Equal(tokens[2], "profile.jpeg")
744 assert.Equal(len(tokens), 3)
745
746 // testTokenizerFile1
747
Akronbe3d3662023-04-26 13:22:38 +0200748 tokens = ttokenize(mat_de, w, "Zeig mir c:\\Dokumente\\profile.docx")
Akron28031b72021-10-02 13:07:25 +0200749 assert.Equal(tokens[0], "Zeig")
750 assert.Equal(tokens[1], "mir")
751 assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
752 assert.Equal(len(tokens), 3)
753
754 // testTokenizerFile2
Akronbe3d3662023-04-26 13:22:38 +0200755 tokens = ttokenize(mat_de, w, "Gehe zu /Dokumente/profile.docx")
Akron28031b72021-10-02 13:07:25 +0200756 assert.Equal(tokens[0], "Gehe")
757 assert.Equal(tokens[1], "zu")
758 assert.Equal(tokens[2], "/Dokumente/profile.docx")
759 assert.Equal(len(tokens), 3)
760
761 // testTokenizerFile3
Akronbe3d3662023-04-26 13:22:38 +0200762 tokens = ttokenize(mat_de, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
Akron28031b72021-10-02 13:07:25 +0200763 assert.Equal(tokens[0], "Zeig")
764 assert.Equal(tokens[1], "mir")
765 assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
766 assert.Equal(len(tokens), 3)
767 // Ignored in KorAP-Tokenizer
768
769 // testTokenizerPunct
Akronbe3d3662023-04-26 13:22:38 +0200770 tokens = ttokenize(mat_de, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
Akron28031b72021-10-02 13:07:25 +0200771 assert.Equal(tokens[0], "Er")
772 assert.Equal(tokens[1], "sagte")
773 assert.Equal(tokens[2], ":")
774 assert.Equal(tokens[3], "\"")
775 assert.Equal(tokens[4], "Es")
776 assert.Equal(tokens[5], "geht")
777 assert.Equal(tokens[6], "mir")
778 assert.Equal(tokens[7], "gut")
779 assert.Equal(tokens[8], "!")
780 assert.Equal(tokens[9], "\"")
781 assert.Equal(tokens[10], ",")
782 assert.Equal(tokens[11], "daraufhin")
783 assert.Equal(tokens[12], "ging")
784 assert.Equal(tokens[13], "er")
785 assert.Equal(tokens[14], ".")
786 assert.Equal(len(tokens), 15)
787
788 // testTokenizerPlusAmpersand
Akronbe3d3662023-04-26 13:22:38 +0200789 tokens = ttokenize(mat_de, w, "&quot;Das ist von C&A!&quot;")
Akron28031b72021-10-02 13:07:25 +0200790 assert.Equal(tokens[0], "&quot;")
791 assert.Equal(tokens[1], "Das")
792 assert.Equal(tokens[2], "ist")
793 assert.Equal(tokens[3], "von")
794 assert.Equal(tokens[4], "C&A")
795 assert.Equal(tokens[5], "!")
796 assert.Equal(tokens[6], "&quot;")
797 assert.Equal(len(tokens), 7)
798
799 // testTokenizerLongEnd
Akronbe3d3662023-04-26 13:22:38 +0200800 tokens = ttokenize(mat_de, w, "Siehst Du?!!?")
Akron28031b72021-10-02 13:07:25 +0200801 assert.Equal(tokens[0], "Siehst")
802 assert.Equal(tokens[1], "Du")
803 assert.Equal(tokens[2], "?!!?")
804 assert.Equal(len(tokens), 3)
805
806 // testTokenizerIrishO
Akronbe3d3662023-04-26 13:22:38 +0200807 tokens = ttokenize(mat_de, w, "Peter O'Toole")
Akron28031b72021-10-02 13:07:25 +0200808 assert.Equal(tokens[0], "Peter")
809 assert.Equal(tokens[1], "O'Toole")
810 assert.Equal(len(tokens), 2)
811
812 // testTokenizerAbr
Akronbe3d3662023-04-26 13:22:38 +0200813 tokens = ttokenize(mat_de, w, "Früher bzw. später ...")
Akron28031b72021-10-02 13:07:25 +0200814 assert.Equal(tokens[0], "Früher")
815 assert.Equal(tokens[1], "bzw.")
816 assert.Equal(tokens[2], "später")
817 assert.Equal(tokens[3], "...")
818 assert.Equal(len(tokens), 4)
819
820 // testTokenizerUppercaseRule
Akronbe3d3662023-04-26 13:22:38 +0200821 tokens = ttokenize(mat_de, w, "Es war spät.Morgen ist es früh.")
Akron28031b72021-10-02 13:07:25 +0200822 assert.Equal(tokens[0], "Es")
823 assert.Equal(tokens[1], "war")
824 assert.Equal(tokens[2], "spät")
825 assert.Equal(tokens[3], ".")
826 assert.Equal(tokens[4], "Morgen")
827 assert.Equal(tokens[5], "ist")
828 assert.Equal(tokens[6], "es")
829 assert.Equal(tokens[7], "früh")
830 assert.Equal(tokens[8], ".")
831 assert.Equal(len(tokens), 9)
832 // Ignored in KorAP-Tokenizer
833
834 // testTokenizerOrd
Akronbe3d3662023-04-26 13:22:38 +0200835 tokens = ttokenize(mat_de, w, "Sie erreichte den 1. Platz!")
Akron28031b72021-10-02 13:07:25 +0200836 assert.Equal(tokens[0], "Sie")
837 assert.Equal(tokens[1], "erreichte")
838 assert.Equal(tokens[2], "den")
839 assert.Equal(tokens[3], "1.")
840 assert.Equal(tokens[4], "Platz")
841 assert.Equal(tokens[5], "!")
842 assert.Equal(len(tokens), 6)
843
844 // testNoZipOuputArchive
Akronbe3d3662023-04-26 13:22:38 +0200845 tokens = ttokenize(mat_de, w, "Archive: Ich bin kein zip\n")
Akron28031b72021-10-02 13:07:25 +0200846 assert.Equal(tokens[0], "Archive")
847 assert.Equal(tokens[1], ":")
848 assert.Equal(tokens[2], "Ich")
849 assert.Equal(tokens[3], "bin")
850 assert.Equal(tokens[4], "kein")
851 assert.Equal(tokens[5], "zip")
852 assert.Equal(6, len(tokens))
853
854 // testTokenizerStrasse
Akronbe3d3662023-04-26 13:22:38 +0200855 tokens = ttokenize(mat_de, w, "Ich wohne in der Weststr. und Du?")
Akron28031b72021-10-02 13:07:25 +0200856 assert.Equal(tokens[4], "Weststr.")
857 assert.Equal(8, len(tokens))
858
859 // germanTokenizerKnowsGermanOmissionWords
Akronbe3d3662023-04-26 13:22:38 +0200860 tokens = ttokenize(mat_de, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
Akron28031b72021-10-02 13:07:25 +0200861 assert.Equal("D'dorf", tokens[0])
862 assert.Equal("Ku'damm", tokens[1])
863 assert.Equal("Lu'hafen", tokens[2])
864 assert.Equal("M'gladbach", tokens[3])
865 assert.Equal("W'schaft", tokens[4])
866 assert.Equal(5, len(tokens))
867
868 // germanTokenizerDoesNOTSeparateGermanContractions
Akronbe3d3662023-04-26 13:22:38 +0200869 tokens = ttokenize(mat_de, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
Akron28031b72021-10-02 13:07:25 +0200870 assert.Equal("mach's", tokens[0])
871 assert.Equal("macht's", tokens[1])
872 assert.Equal("was'n", tokens[2])
873 assert.Equal("ist's", tokens[3])
874 assert.Equal("haste", tokens[4])
875 assert.Equal("willste", tokens[5])
876 assert.Equal("kannste", tokens[6])
877 assert.Equal("biste", tokens[7])
878 assert.Equal("kriegste", tokens[8])
879 assert.Equal(9, len(tokens))
880
Akronbe3d3662023-04-26 13:22:38 +0200881 tokens = ttokenize(mat_de, w, "Es ist gleich 2:30 Uhr.")
Akron78dba062021-10-28 19:30:46 +0200882 assert.Equal("Es", tokens[0])
883 assert.Equal("ist", tokens[1])
884 assert.Equal("gleich", tokens[2])
885 assert.Equal("2:30", tokens[3])
886 assert.Equal("Uhr", tokens[4])
887 assert.Equal(".", tokens[5])
888 assert.Equal(6, len(tokens))
889
Akronbe3d3662023-04-26 13:22:38 +0200890 tokens = ttokenize(mat_de, w, "Sie schwamm die Strecke in 00:00:57,34 00:57,341 0:57 Stunden.")
Akron17984c82021-10-30 11:44:37 +0200891 assert.Equal("Sie", tokens[0])
892 assert.Equal("schwamm", tokens[1])
893 assert.Equal("die", tokens[2])
894 assert.Equal("Strecke", tokens[3])
895 assert.Equal("in", tokens[4])
896 assert.Equal("00:00:57,34", tokens[5])
897 assert.Equal("00:57,341", tokens[6])
898 assert.Equal("0:57", tokens[7])
899 assert.Equal("Stunden", tokens[8])
900 assert.Equal(".", tokens[9])
901 assert.Equal(10, len(tokens))
902
Akronf1106ec2021-11-05 13:04:44 +0100903 // waste example
Akronbe3d3662023-04-26 13:22:38 +0200904 tokens = ttokenize(mat_de, w, "Am 24.1.1806 feierte E. T. A. Hoffmann seinen 30. Geburtstag.")
Akronf1106ec2021-11-05 13:04:44 +0100905 assert.Equal(tokens[0], "Am")
906 assert.Equal(tokens[1], "24.1.1806")
907 assert.Equal(tokens[2], "feierte")
908 assert.Equal(tokens[3], "E.")
909 assert.Equal(tokens[4], "T.")
910 assert.Equal(tokens[5], "A.")
911 assert.Equal(tokens[6], "Hoffmann")
912 assert.Equal(tokens[7], "seinen")
913 assert.Equal(tokens[8], "30.")
914 assert.Equal(tokens[9], "Geburtstag")
915 assert.Equal(tokens[10], ".")
916 assert.Equal(11, len(tokens))
917
Akron9135b202021-11-06 13:16:07 +0100918 // IPtest
Akronbe3d3662023-04-26 13:22:38 +0200919 tokens = ttokenize(mat_de, w, "Meine IP ist 192.178.168.55.")
Akron9135b202021-11-06 13:16:07 +0100920 assert.Equal(tokens[0], "Meine")
921 assert.Equal(tokens[1], "IP")
922 assert.Equal(tokens[2], "ist")
923 assert.Equal(tokens[3], "192.178.168.55")
924 assert.Equal(tokens[4], ".")
925 assert.Equal(5, len(tokens))
926
Akron6742b962021-11-09 01:17:20 +0100927 // XML entities
Akronbe3d3662023-04-26 13:22:38 +0200928 tokens = ttokenize(mat_de, w, "Das ist&nbsp;1:30 Stunden&20 Minuten zu spät &GT;.")
Akron6742b962021-11-09 01:17:20 +0100929 assert.Equal(tokens[0], "Das")
930 assert.Equal(tokens[1], "ist")
931 assert.Equal(tokens[2], "&nbsp;")
932 assert.Equal(tokens[3], "1:30")
933 assert.Equal(tokens[4], "Stunden")
934 assert.Equal(tokens[5], "&")
935 assert.Equal(tokens[6], "20")
936 assert.Equal(tokens[7], "Minuten")
937 assert.Equal(tokens[8], "zu")
938 assert.Equal(tokens[9], "spät")
939 assert.Equal(tokens[10], "&GT;")
940 assert.Equal(tokens[11], ".")
941 assert.Equal(12, len(tokens))
942
Akron936c0f52021-12-07 11:30:53 +0100943 // Plusampersand compounds (1)
Akronbe3d3662023-04-26 13:22:38 +0200944 tokens = ttokenize(mat_de, w, "Die 2G+-Regel soll weitere Covid-19-Erkrankungen reduzieren.")
Akrone62e8eb2021-12-03 11:59:53 +0100945 assert.Equal(tokens[0], "Die")
946 assert.Equal(tokens[1], "2G+-Regel")
947 assert.Equal(tokens[2], "soll")
948 assert.Equal(tokens[3], "weitere")
949 assert.Equal(tokens[4], "Covid-19-Erkrankungen")
950 assert.Equal(tokens[5], "reduzieren")
951 assert.Equal(tokens[6], ".")
952 assert.Equal(7, len(tokens))
953
Akron936c0f52021-12-07 11:30:53 +0100954 // Plusampersand compounds (2)
Akronbe3d3662023-04-26 13:22:38 +0200955 tokens = ttokenize(mat_de, w, "Der Neu-C++-Programmierer.")
Akron936c0f52021-12-07 11:30:53 +0100956 assert.Equal(tokens[0], "Der")
957 assert.Equal(tokens[1], "Neu-C++-Programmierer")
958 assert.Equal(tokens[2], ".")
959 assert.Equal(3, len(tokens))
960
Akron54ed7e72022-01-04 12:05:00 +0100961 // z.B.
Akronbe3d3662023-04-26 13:22:38 +0200962 tokens = ttokenize(mat_de, w, "Dies sind z.B. zwei Wörter - z. B. auch.")
Akron54ed7e72022-01-04 12:05:00 +0100963 assert.Equal(tokens[0], "Dies")
964 assert.Equal(tokens[1], "sind")
965 assert.Equal(tokens[2], "z.")
966 assert.Equal(tokens[3], "B.")
967 assert.Equal(tokens[4], "zwei")
968 assert.Equal(tokens[5], "Wörter")
969 assert.Equal(tokens[6], "-")
970 assert.Equal(tokens[7], "z.")
971 assert.Equal(tokens[8], "B.")
972 assert.Equal(tokens[9], "auch")
973 assert.Equal(tokens[10], ".")
974 assert.Equal(11, len(tokens))
975
Akron9a594712022-01-14 11:12:21 +0100976 // z.B.
Akronbe3d3662023-04-26 13:22:38 +0200977 tokens = ttokenize(mat_de, w, "Dies sind z.B. zwei Wörter - z. B. auch.")
Akron9a594712022-01-14 11:12:21 +0100978 assert.Equal(tokens[0], "Dies")
979 assert.Equal(tokens[1], "sind")
980 assert.Equal(tokens[2], "z.")
981 assert.Equal(tokens[3], "B.")
982 assert.Equal(tokens[4], "zwei")
983 assert.Equal(tokens[5], "Wörter")
984 assert.Equal(tokens[6], "-")
985 assert.Equal(tokens[7], "z.")
986 assert.Equal(tokens[8], "B.")
987 assert.Equal(tokens[9], "auch")
988 assert.Equal(tokens[10], ".")
989 assert.Equal(11, len(tokens))
990
991 // Single quote handling
Akronbe3d3662023-04-26 13:22:38 +0200992 tokens = ttokenize(mat_de, w, "Es heißt 'Leitungssportteams' und nicht anders.")
Akron9a594712022-01-14 11:12:21 +0100993 assert.Equal(tokens[0], "Es")
994 assert.Equal(tokens[1], "heißt")
995 assert.Equal(tokens[2], "'")
996 assert.Equal(tokens[3], "Leitungssportteams")
997 assert.Equal(tokens[4], "'")
998 assert.Equal(tokens[5], "und")
999 assert.Equal(tokens[6], "nicht")
1000 assert.Equal(tokens[7], "anders")
1001 assert.Equal(tokens[8], ".")
1002 assert.Equal(9, len(tokens))
1003
Akronb02ad072022-01-19 12:41:44 +01001004 // Apostrophe handling
Akronbe3d3662023-04-26 13:22:38 +02001005 tokens = ttokenize(mat_de, w, "Das ist Nils’ Einkaufskorb bei McDonald's.")
Akronb02ad072022-01-19 12:41:44 +01001006 assert.Equal(tokens[0], "Das")
1007 assert.Equal(tokens[1], "ist")
1008 assert.Equal(tokens[2], "Nils’")
1009 assert.Equal(tokens[3], "Einkaufskorb")
1010 assert.Equal(tokens[4], "bei")
1011 assert.Equal(tokens[5], "McDonald's")
1012 assert.Equal(tokens[6], ".")
1013 assert.Equal(7, len(tokens))
1014
Akronbe3d3662023-04-26 13:22:38 +02001015}
1016
1017func TestMatrixFullTokenizerTokenSplitterEN(t *testing.T) {
1018 assert := assert.New(t)
1019
1020 if mat_en == nil {
1021 mat_en = LoadMatrixFile("testdata/tokenizer_en.matok")
1022 }
1023
1024 b := make([]byte, 0, 2048)
1025 w := bytes.NewBuffer(b)
1026 var tokens []string
1027
1028 // testEnglishTokenizerScienceAbbreviations
1029 tokens = ttokenize(mat_en, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
1030 assert.Equal("Approx.", tokens[0])
1031 assert.Equal("in", tokens[1])
1032 assert.Equal("Sept.", tokens[2])
1033 assert.Equal("1954", tokens[3])
1034 assert.Equal(",", tokens[4])
1035 assert.Equal("Assoc.", tokens[5])
1036 assert.Equal("Prof.", tokens[6])
1037 assert.Equal("Dr.", tokens[7])
1038 assert.Equal("R.", tokens[8])
1039 assert.Equal("J.", tokens[9])
1040 assert.Equal("Ewing", tokens[10])
1041 assert.Equal("reviewed", tokens[11])
1042 assert.Equal("articles", tokens[12])
1043 assert.Equal("on", tokens[13])
1044 assert.Equal("Enzymol.", tokens[14])
1045 assert.Equal("Bacteriol.", tokens[15])
1046 assert.Equal("effects", tokens[16])
1047 assert.Equal("later", tokens[17])
1048 assert.Equal("published", tokens[18])
1049 assert.Equal("in", tokens[19])
1050 assert.Equal("Nutr.", tokens[20])
1051 assert.Equal("Rheumatol.", tokens[21])
1052 assert.Equal("No.", tokens[22])
1053 assert.Equal("12", tokens[23])
1054 assert.Equal("and", tokens[24])
1055 assert.Equal("Nº.", tokens[25])
1056 assert.Equal("13.", tokens[26])
1057 assert.Equal(",", tokens[27])
1058 assert.Equal("pp.", tokens[28])
1059 assert.Equal("17-18", tokens[29])
1060 assert.Equal(".", tokens[30])
Akronbe3d3662023-04-26 13:22:38 +02001061
Akrond0dfea82023-04-26 19:24:17 +02001062 // englishTokenizerCanGuessWhetherIIsAbbrev
1063 tokens = ttokenize(mat_en, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
1064 assert.Equal("I.", tokens[1])
1065 assert.Equal("I", tokens[8])
1066 assert.Equal(".", tokens[9])
1067 assert.Equal("I", tokens[12])
1068 assert.Equal(".", tokens[13])
Akronbe3d3662023-04-26 13:22:38 +02001069
Akron72a64222023-04-26 17:00:45 +02001070 // englishTokenizerSeparatesEnglishContractionsAndClitics
1071 tokens = ttokenize(mat_en, w, "I've we'll you'd I'm we're Peter's isn't who'll've")
1072 assert.Equal("I", tokens[0])
1073 assert.Equal("'ve", tokens[1])
1074 assert.Equal("'ll", tokens[3])
1075 assert.Equal("'d", tokens[5])
1076 assert.Equal("'m", tokens[7])
1077 assert.Equal("'re", tokens[9])
1078 assert.Equal("'s", tokens[11])
1079 assert.Equal("is", tokens[12])
1080 assert.Equal("n't", tokens[13])
1081 assert.Equal("who", tokens[14])
1082 assert.Equal("'ll", tokens[15])
1083 assert.Equal("'ve", tokens[16])
1084 assert.Equal(17, len(tokens))
Akron28031b72021-10-02 13:07:25 +02001085 /*
1086 @Test
Akron28031b72021-10-02 13:07:25 +02001087 public void frenchTokenizerKnowsFrenchAbbreviations () {
1088 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
1089 tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
1090 assert.Equal("Approx.", tokens[0]);
1091 assert.Equal("juill.", tokens[2]);
1092 assert.Equal("prof.", tokens[5]);
1093 assert.Equal("exerc.", tokens[15]);
1094 assert.Equal("no.", tokens[16]);
1095 assert.Equal("pp.", tokens[21]);
1096 }
1097
1098 @Test
1099 public void frenchTokenizerKnowsFrenchContractions () {
1100 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
1101 tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
1102 assert.Equal("J'", tokens[0]);
1103 assert.Equal("j'", tokens[2]);
1104 assert.Equal("qu'", tokens[4]);
1105 assert.Equal("d'", tokens[6]);
1106 assert.Equal("jusqu'", tokens[8]);
1107 assert.Equal("Aujourd'hui", tokens[10]);
1108 assert.Equal("D'", tokens[11]); // ’
1109 assert.Equal("Quelqu'un", tokens[13]); // ’
1110 assert.Equal("Presqu'île", tokens[14]); // ’
1111 }
1112
1113 @Test
1114 public void frenchTokenizerKnowsFrenchClitics () {
1115 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
1116 tokens = tokenize(dat, w, "suis-je sont-elles ")
1117 assert.Equal("suis", tokens[0]);
1118 assert.Equal("-je", tokens[1]);
1119 assert.Equal("sont", tokens[2]);
1120 assert.Equal("-elles", tokens[3]);
1121 }
1122
Akron28031b72021-10-02 13:07:25 +02001123
1124 @Test
1125 public void testZipOuputArchive () {
1126
1127 final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
1128 System.setOut(new PrintStream(clearOut));
1129 tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
1130 assert.Equal(0, len(tokens));
1131 }
1132 */
1133 /*
1134
1135 @Test
1136 public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
1137 DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
1138 .tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
1139 .printOffsets(true)
1140 .build();
1141 Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
1142 assert.Equal("Text1", tokens[0].getType());
1143 assert.Equal(len(tokens), 9 );
1144 }
1145 */
1146}
1147
Akronb98e4cf2022-03-27 23:56:49 +02001148func TestMatrixEmoticons(t *testing.T) {
1149 assert := assert.New(t)
1150
Akronbe3d3662023-04-26 13:22:38 +02001151 if mat_de == nil {
Akron0139bc52023-08-31 16:35:58 +02001152 mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
Akronb98e4cf2022-03-27 23:56:49 +02001153 }
1154
Akronbe3d3662023-04-26 13:22:38 +02001155 assert.NotNil(mat_de)
Akronb98e4cf2022-03-27 23:56:49 +02001156
1157 b := make([]byte, 0, 2048)
1158 w := bytes.NewBuffer(b)
1159 var tokens []string
1160
Akronbe3d3662023-04-26 13:22:38 +02001161 tokens = ttokenize(mat_de, w, ":-* ;) :)) :*( ^___^ T__T ^^; -_-;;; -_-^")
Akronb98e4cf2022-03-27 23:56:49 +02001162 assert.Equal(tokens[0], ":-*")
1163 assert.Equal(tokens[1], ";)")
1164 assert.Equal(tokens[2], ":))")
1165 assert.Equal(tokens[3], ":*(")
1166 assert.Equal(tokens[4], "^___^")
1167 assert.Equal(tokens[5], "T__T")
1168 assert.Equal(tokens[6], "^^;")
1169 assert.Equal(tokens[7], "-_-;;;")
1170 assert.Equal(tokens[8], "-_-^")
1171 assert.Equal(len(tokens), 9)
Akron6dcb6ce2022-04-09 16:09:51 +02001172
Akronbe3d3662023-04-26 13:22:38 +02001173 tokens = ttokenize(mat_de, w, "das -> Lustig<-!")
Akron6dcb6ce2022-04-09 16:09:51 +02001174 assert.Equal("das", tokens[0])
1175 assert.Equal("->", tokens[1])
1176 assert.Equal("Lustig", tokens[2])
1177 assert.Equal("<-", tokens[3])
1178 assert.Equal("!", tokens[4])
1179 assert.Equal(5, len(tokens))
Akronb98e4cf2022-03-27 23:56:49 +02001180}
1181
Akronc9c0eae2021-10-22 19:49:43 +02001182func TestMatrixFullTokenizerXML(t *testing.T) {
Akron28031b72021-10-02 13:07:25 +02001183 assert := assert.New(t)
1184
Akronbe3d3662023-04-26 13:22:38 +02001185 if mat_de == nil {
Akron0139bc52023-08-31 16:35:58 +02001186 mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
Akron9fb63af2021-10-28 01:15:53 +02001187 }
Akron28031b72021-10-02 13:07:25 +02001188
Akronbe3d3662023-04-26 13:22:38 +02001189 assert.NotNil(mat_de)
Akron28031b72021-10-02 13:07:25 +02001190
1191 b := make([]byte, 0, 2048)
1192 w := bytes.NewBuffer(b)
1193 var tokens []string
1194
Akronbe3d3662023-04-26 13:22:38 +02001195 tokens = ttokenize(mat_de, w, "Das <b>beste</b> Fußballspiel")
Akron28031b72021-10-02 13:07:25 +02001196 assert.Equal("Das", tokens[0])
1197 assert.Equal("<b>", tokens[1])
1198 assert.Equal("beste", tokens[2])
1199 assert.Equal("</b>", tokens[3])
1200 assert.Equal("Fußballspiel", tokens[4])
1201 assert.Equal(5, len(tokens))
1202
Akronbe3d3662023-04-26 13:22:38 +02001203 tokens = ttokenize(mat_de, w, "Das <b class=\"c\">beste</b> Fußballspiel")
Akron28031b72021-10-02 13:07:25 +02001204 assert.Equal("Das", tokens[0])
1205 assert.Equal("<b class=\"c\">", tokens[1])
1206 assert.Equal("beste", tokens[2])
1207 assert.Equal("</b>", tokens[3])
1208 assert.Equal("Fußballspiel", tokens[4])
1209 assert.Equal(5, len(tokens))
1210
Akronbe3d3662023-04-26 13:22:38 +02001211 tokens = ttokenize(mat_de, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
Akron28031b72021-10-02 13:07:25 +02001212 assert.Equal("der", tokens[0])
1213 assert.Equal("<x y=\"alte \">", tokens[1])
1214 assert.Equal("<x x>", tokens[2])
1215 assert.Equal("alte", tokens[3])
1216 assert.Equal("</x>", tokens[4])
1217 assert.Equal("etc.", tokens[5])
1218 assert.Equal("et", tokens[6])
1219 assert.Equal(".", tokens[7])
1220 assert.Equal("Mann", tokens[8])
1221 assert.Equal(".", tokens[9])
1222 assert.Equal(10, len(tokens))
Akron066d99c2021-10-28 19:04:59 +02001223
Akronbe3d3662023-04-26 13:22:38 +02001224 tokens = ttokenize(mat_de, w, "das<br class=\"br\" />ging.")
Akron066d99c2021-10-28 19:04:59 +02001225 assert.Equal("das", tokens[0])
1226 assert.Equal("<br class=\"br\" />", tokens[1])
1227 assert.Equal("ging", tokens[2])
1228 assert.Equal(".", tokens[3])
1229 assert.Equal(4, len(tokens))
Akrond47c67e2022-04-10 11:02:59 +02001230
Akronbe3d3662023-04-26 13:22:38 +02001231 tokens = ttokenize(mat_de, w, "das <?robot xgh ?> <!-- hm hm --> <![CDATA[ cdata ]]> <br />")
Akrond47c67e2022-04-10 11:02:59 +02001232 assert.Equal("das", tokens[0])
1233 assert.Equal("<?robot", tokens[1])
1234 assert.Equal("xgh", tokens[2])
1235 assert.Equal("?>", tokens[3])
1236 assert.Equal("<!--", tokens[4])
1237 assert.Equal("hm", tokens[5])
1238 assert.Equal("hm", tokens[6])
1239 assert.Equal("-->", tokens[7])
1240 assert.Equal("<![CDATA[", tokens[8])
1241 assert.Equal("cdata", tokens[9])
1242 assert.Equal("]]>", tokens[10])
1243 assert.Equal("<br />", tokens[11])
1244 assert.Equal(12, len(tokens))
1245
Akron28031b72021-10-02 13:07:25 +02001246}
1247
Akronabcb6a52021-10-09 15:52:08 +02001248func TestMatokDatokEquivalence(t *testing.T) {
1249 assert := assert.New(t)
1250
Akronbe3d3662023-04-26 13:22:38 +02001251 if mat_de == nil {
Akron0139bc52023-08-31 16:35:58 +02001252 mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
Akron9fb63af2021-10-28 01:15:53 +02001253 }
Akron0139bc52023-08-31 16:35:58 +02001254 dat := LoadDatokFile("testdata/tokenizer_de.datok")
Akronabcb6a52021-10-09 15:52:08 +02001255
1256 r := strings.NewReader(s)
1257
1258 tb := make([]byte, 0, 2048)
1259 w := bytes.NewBuffer(tb)
1260
1261 // Transduce with double array representation
1262 dat.Transduce(r, w)
1263
1264 datStr := w.String()
1265
1266 r.Reset(s)
1267 w.Reset()
1268
1269 // Transduce with matrix representation
Akronbe3d3662023-04-26 13:22:38 +02001270 mat_de.Transduce(r, w)
Akronabcb6a52021-10-09 15:52:08 +02001271
1272 matStr := w.String()
1273
1274 assert.Equal(datStr, matStr)
1275}
1276
Akronc9c0eae2021-10-22 19:49:43 +02001277func TestMatrixFullTokenizerCallbackTransduce(t *testing.T) {
Akrone396a932021-10-19 01:06:13 +02001278 assert := assert.New(t)
1279
Akronbe3d3662023-04-26 13:22:38 +02001280 if mat_de == nil {
Akron0139bc52023-08-31 16:35:58 +02001281 mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
Akron9fb63af2021-10-28 01:15:53 +02001282 }
Akrone396a932021-10-19 01:06:13 +02001283
Akronbe3d3662023-04-26 13:22:38 +02001284 assert.NotNil(mat_de)
Akrone396a932021-10-19 01:06:13 +02001285
1286 b := make([]byte, 0, 2048)
1287 w := bytes.NewBuffer(b)
Akrone396a932021-10-19 01:06:13 +02001288
Akronbe3d3662023-04-26 13:22:38 +02001289 assert.True(mat_de.Transduce(strings.NewReader("Der alte Baum. Er war schon alt."), w))
Akrona854faa2021-10-22 19:31:08 +02001290
1291 matStr := w.String()
1292
1293 assert.Equal("Der\nalte\nBaum\n.\n\nEr\nwar\nschon\nalt\n.\n\n\n", matStr)
1294}
1295
Akronc9c0eae2021-10-22 19:49:43 +02001296func TestMatrixFullTokenizerTextTreatment(t *testing.T) {
Akrona854faa2021-10-22 19:31:08 +02001297 assert := assert.New(t)
1298
Akronbe3d3662023-04-26 13:22:38 +02001299 if mat_de == nil {
Akron0139bc52023-08-31 16:35:58 +02001300 mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
Akron9fb63af2021-10-28 01:15:53 +02001301 }
Akrona854faa2021-10-22 19:31:08 +02001302
Akronbe3d3662023-04-26 13:22:38 +02001303 assert.NotNil(mat_de)
Akrona854faa2021-10-22 19:31:08 +02001304
1305 b := make([]byte, 0, 2048)
1306 w := bytes.NewBuffer(b)
1307
Akronbe3d3662023-04-26 13:22:38 +02001308 assert.True(mat_de.Transduce(strings.NewReader("Erste.\n\n\n\n\x04\x0aNächst.\x04"), w))
Akrona854faa2021-10-22 19:31:08 +02001309 matStr := w.String()
1310 assert.Equal("Erste\n.\n\n\nNächst\n.\n\n\n", matStr)
Akronf6bdfdb2021-10-23 15:56:53 +02001311}
Akrona854faa2021-10-22 19:31:08 +02001312
Akron22c565a2021-11-28 17:31:36 +01001313func TestMatrixFullTokenizerLongText(t *testing.T) {
1314 assert := assert.New(t)
1315
Akronbe3d3662023-04-26 13:22:38 +02001316 if mat_de == nil {
Akron0139bc52023-08-31 16:35:58 +02001317 mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
Akron22c565a2021-11-28 17:31:36 +01001318 }
1319
Akronbe3d3662023-04-26 13:22:38 +02001320 assert.NotNil(mat_de)
Akron22c565a2021-11-28 17:31:36 +01001321
1322 b := make([]byte, 0, 2048)
1323 w := bytes.NewBuffer(b)
1324
1325 text := `The Project Gutenberg EBook of Effi Briest, by Theodor Fontane
1326
1327Copyright laws are changing all over the world. Be sure to check the
1328copyright laws for your country before downloading or redistributing
1329this or any other Project Gutenberg eBook.
1330
1331This header should be the first thing seen when viewing this Project
1332Gutenberg file. Please do not remove it. Do not change or edit the
1333header without written permission.
1334
1335Please read the "legal small print," and other information about the
1336eBook and Project Gutenberg at the bottom of this file. Included is
1337important information about your specific rights and restrictions in
1338how the file may be used. You can also find out about how to make a
1339donation to Project Gutenberg, and how to get involved.
1340
1341
1342**Welcome To The World of Free Plain Vanilla Electronic Texts**
1343
1344**eBooks Readable By Both Humans and By Computers, Since 1971**
1345
1346*****These eBooks Were Prepared By Thousands of Volunteers!*****
1347
1348
1349Title: Effi Briest
1350
1351Author: Theodor Fontane
1352
1353Release Date: March, 2004 [EBook #5323]
1354`
1355
Akronbe3d3662023-04-26 13:22:38 +02001356 assert.True(mat_de.Transduce(strings.NewReader(text), w))
Akron22c565a2021-11-28 17:31:36 +01001357
1358 assert.True(strings.Contains(w.String(), "Release"))
1359}
1360
Akronf6bdfdb2021-10-23 15:56:53 +02001361func TestMatrixTrimming(t *testing.T) {
1362 assert := assert.New(t)
1363
Akronbe3d3662023-04-26 13:22:38 +02001364 if mat_de == nil {
Akron0139bc52023-08-31 16:35:58 +02001365 mat_de = LoadMatrixFile("testdata/tokenizer_de.matok")
Akron9fb63af2021-10-28 01:15:53 +02001366 }
Akronf6bdfdb2021-10-23 15:56:53 +02001367
Akronbe3d3662023-04-26 13:22:38 +02001368 assert.NotNil(mat_de)
Akronf6bdfdb2021-10-23 15:56:53 +02001369
1370 b := make([]byte, 0, 2048)
1371 w := bytes.NewBuffer(b)
1372
Akronbe3d3662023-04-26 13:22:38 +02001373 assert.True(mat_de.Transduce(strings.NewReader(" Erste."), w))
Akronf6bdfdb2021-10-23 15:56:53 +02001374 matStr := w.String()
1375 assert.Equal("Erste\n.\n\n\n", matStr)
Akrone396a932021-10-19 01:06:13 +02001376}
1377
Akronc9c0eae2021-10-22 19:49:43 +02001378func BenchmarkMatrixTransduce(b *testing.B) {
Akron28031b72021-10-02 13:07:25 +02001379 bu := make([]byte, 0, 2048)
1380 w := bytes.NewBuffer(bu)
1381
Akron28031b72021-10-02 13:07:25 +02001382 r := strings.NewReader(s)
1383
Akron0139bc52023-08-31 16:35:58 +02001384 mat := LoadMatrixFile("testdata/tokenizer_de.matok")
Akron28031b72021-10-02 13:07:25 +02001385
1386 b.ResetTimer()
1387
1388 for i := 0; i < b.N; i++ {
1389 w.Reset()
1390 r.Reset(s)
1391 ok := mat.Transduce(r, w)
1392 if !ok {
1393 fmt.Println("Fail!")
1394 fmt.Println(w.String())
1395 os.Exit(1)
1396 }
1397 }
1398}