package datok

import (
	"bytes"
	"fmt"
	"os"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

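// s is a small German test corpus covering abbreviations, URLs, e-mail
// addresses, IP addresses, dates, omission words, and contractions.
// It is shared by the datok/matok equivalence test and the transduction
// benchmark below.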
var s string = `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
Der Termin ist am 5.9.2018.
Ich habe die readme.txt heruntergeladen.
Ausschalten!!! Hast Du nicht gehört???
Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
Er sagte: \"Es geht mir gut!\", daraufhin ging er. "Das ist von C&A!" Früher bzw. später ... Sie erreichte den 1. Platz!
Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
Mach's macht's was'n ist's haste willste kannste biste kriegste.`

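// mat_de and mat_en hold the German and English matrix tokenizers.
// They are loaded lazily by the first test that needs them and then
// shared across the remaining tests.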
var mat_de, mat_en *MatrixTokenizer

func TestMatrixFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal(len(tokens), 11)
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])
	assert.Equal("!", tokens[7])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal(7, len(tokens))
}
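
// A minimal usage sketch of the API exercised in these tests, using the
// matrix file loaded further below. As the assertions in the sentence
// splitter tests show, the writer receives one token per line, an empty
// line marks a sentence boundary, and the text ends with a final empty
// line:
//
//	mat := LoadMatrixFile("testdata/tokenizer.matok")
//	w := bytes.NewBuffer(nil)
//	mat.Transduce(strings.NewReader("Der alte Mann."), w)
//	// w.String() == "Der\nalte\nMann\n.\n\n\n"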
72
func TestMatrixSimpleString(t *testing.T) {
	assert := assert.New(t)
	// bau | bauamt
	tok := LoadFomaFile("testdata/bauamt.fst")
	mat := tok.ToMatrix()

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat, w, "ibauamt")
	assert.Equal("i", tokens[0])
	assert.Equal("bauamt", tokens[1])

	tokens = ttokenize(mat, w, "ibbauamt")
	assert.Equal("i", tokens[0])
	assert.Equal("b", tokens[1])
	assert.Equal("bauamt", tokens[2])

	tokens = ttokenize(mat, w, "bau")
	assert.Equal("bau", tokens[0])

	tokens = ttokenize(mat, w, "baum")
	assert.Equal("bau", tokens[0])
	assert.Equal("m", tokens[1])

	tokens = ttokenize(mat, w, "baudibauamt")
	assert.Equal("bau", tokens[0])
	assert.Equal("d", tokens[1])
	assert.Equal("i", tokens[2])
	assert.Equal("bauamt", tokens[3])
}
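
// The cases above illustrate the strictly greedy longest-match strategy:
// with only "bau" and "bauamt" in the automaton, "baum" is split into
// "bau" and the remainder "m".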
106
func TestMatrixReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	assert.Equal(ttokenizeStr(mat, "bau"), "bau")
	assert.Equal(ttokenizeStr(mat, "bad"), "bad")
	assert.Equal(ttokenizeStr(mat, "wald gehen"), "wald\ngehen")
	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := mat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(230), n)
	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	assert.Equal(mat.array, mat2.array)
	assert.Equal(ttokenizeStr(mat2, "bau"), "bau")
	assert.Equal(ttokenizeStr(mat2, "bad"), "bad")
	assert.Equal(ttokenizeStr(mat2, "wald gehen"), "wald\ngehen")
}
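
// WriteTo and ParseMatrix are the serialization pair exercised above:
// a matrix written to a buffer can be reconstructed from it with
// alphabet, special symbols, state count, and transition array intact.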
136
func TestMatrixIgnorableMCS(t *testing.T) {
	assert := assert.New(t)

	// This test relies on final states, which is why it no longer
	// works correctly.

	// File has MCS in sigma but not in net
	tok := LoadFomaFile("testdata/ignorable_mcs.fst")
	assert.NotNil(tok)
	mat := tok.ToMatrix()
	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// Only unambiguous when transducing strictly greedily!
	assert.True(mat.Transduce(strings.NewReader("ab<ab>a"), w))
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
	assert.Equal("a", tokens[0])
	assert.Equal("b", tokens[1])
	assert.Equal("<ab>a", tokens[2])
	assert.Equal(6, len(tokens))
}
162
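// The leading "x" hides the following test from the Go testing
// framework; it is disabled but kept for reference.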
func xTestMatrixReadWriteFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/tokenizer.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	assert.True(mat.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n\n", w.String())

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	_, err := mat.WriteTo(buf)
	assert.Nil(err)
	w.Reset()
	// assert.Equal(int64(248), n)

	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	// assert.Equal(mat.array, mat2.array)

	assert.True(mat2.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n\n", w.String())
}
197
func TestMatrixFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	if mat_de == nil {
		mat_de = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat_de)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(mat_de.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(9, len(tokens))

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
}
229
func TestMatrixFullTokenizerMatrixSentenceSplitter(t *testing.T) {
	assert := assert.New(t)

	if mat_de == nil {
		mat_de = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(mat_de.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Der Vorsitzende der F.D.P. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nF.D.P.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("\n", sentences[2])

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("\"Alter!\", sagte er: \"Komm nicht wieder!\" Geh!!! \"Lass!\" Dann ging er."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 5)
	assert.Equal("\"\nAlter\n!\n\"\n,\nsagte\ner\n:\n\"\nKomm\nnicht\nwieder\n!\n\"", sentences[0])
	assert.Equal("Geh\n!!!", sentences[1])
	assert.Equal("\"\nLass\n!\n\"", sentences[2])
	assert.Equal("Dann\nging\ner\n.", sentences[3])

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("\"\nAusschalten\n!!!\n\"\n,\nsagte\ner\n.", sentences[0])
	assert.Equal("\"\nHast\nDu\nnicht\ngehört\n???\n\"", sentences[1])

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("“Ausschalten!!!”, sagte er. «Hast Du nicht gehört???»"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("“\nAusschalten\n!!!\n”\n,\nsagte\ner\n.", sentences[0])
	assert.Equal("«\nHast\nDu\nnicht\ngehört\n???\n»", sentences[1])
353
	text := `»Meinetwegen. Denkst du, daß ich darauf warte? Das fehlte noch.
Übrigens, ich kriege schon einen und vielleicht bald. Da ist mir nicht
bange. Neulich erst hat mir der kleine Ventivegni von drüben gesagt:
'Fräulein Effi, was gilt die Wette, wir sind hier noch in diesem Jahre
zu Polterabend und Hochzeit.'«

»Und was sagtest du da?«`

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader(text), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 8)
	assert.Equal("Neulich\nerst\nhat\nmir\nder\nkleine\nVentivegni\nvon\ndrüben\ngesagt\n:\n'\nFräulein\nEffi\n,\nwas\ngilt\ndie\nWette\n,\nwir\nsind\nhier\nnoch\nin\ndiesem\nJahre\nzu\nPolterabend\nund\nHochzeit\n.\n'\n«", sentences[5])
	assert.Equal("»\nUnd\nwas\nsagtest\ndu\nda\n?\n«", sentences[6])

	text = `»Nun, gib dich zufrieden, ich fange schon an ... Also Baron
Innstetten!`

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader(text), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("»\nNun\n,\ngib\ndich\nzufrieden\n,\nich\nfange\nschon\nan\n...", sentences[0])
	assert.Equal("Also\nBaron\nInnstetten\n!", sentences[1])

	// Check parentheses at the end of the sentence
	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("(Er ging.) Und kam (später)."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("(\nEr\nging\n.\n)", sentences[0])
	assert.Equal("Und\nkam\n(\nspäter\n)\n.", sentences[1])

	// Check parentheses and quotes at the end of the sentence
	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("(Er sagte: \"Hallo!\") Dann ging er."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("(\nEr\nsagte\n:\n\"\nHallo\n!\n\"\n)", sentences[0])
	assert.Equal("Dann\nging\ner\n.", sentences[1])
}
396
func TestMatrixFullTokenizerMatrixSentenceSplitterBug1(t *testing.T) {
	assert := assert.New(t)

	if mat_de == nil {
		mat_de = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	text := `Wüllersdorf war aufgestanden. »Ich finde es furchtbar, daß Sie recht haben, aber Sie haben recht. Ich quäle Sie nicht länger mit meinem 'Muß es sein?'. Die Welt ist einmal, wie sie ist, und die Dinge verlaufen nicht, wie wir wollen, sondern wie die andern wollen. Das mit dem 'Gottesgericht', wie manche hochtrabend versichern, ist freilich ein Unsinn, nichts davon, umgekehrt, unser Ehrenkultus ist ein Götzendienst, aber wir müssen uns ihm unterwerfen, solange der Götze gilt.«`

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader(text), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 6)
	assert.Equal("Wüllersdorf\nwar\naufgestanden\n.", sentences[0])
	assert.Equal("»\nIch\nfinde\nes\nfurchtbar\n,\ndaß\nSie\nrecht\nhaben\n,\naber\nSie\nhaben\nrecht\n.", sentences[1])
	assert.Equal("Ich\nquäle\nSie\nnicht\nlänger\nmit\nmeinem\n'\nMuß\nes\nsein\n?\n'\n.", sentences[2])
	assert.Equal("Die\nWelt\nist\neinmal\n,\nwie\nsie\nist\n,\nund\ndie\nDinge\nverlaufen\nnicht\n,\nwie\nwir\nwollen\n,\nsondern\nwie\ndie\nandern\nwollen\n.", sentences[3])
	assert.Equal("Das\nmit\ndem\n'\nGottesgericht\n'\n,\nwie\nmanche\nhochtrabend\nversichern\n,\nist\nfreilich\nein\nUnsinn\n,\nnichts\ndavon\n,\numgekehrt\n,\nunser\nEhrenkultus\nist\nein\nGötzendienst\n,\naber\nwir\nmüssen\nuns\nihm\nunterwerfen\n,\nsolange\nder\nGötze\ngilt\n.\n«", sentences[4])
}
Akron28031b72021-10-02 13:07:25 +0200420
func TestMatrixFullTokenizerTokenSplitter(t *testing.T) {
	assert := assert.New(t)

	if mat_de == nil {
		mat_de = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = ttokenize(mat_de, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = ttokenize(mat_de, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = ttokenize(mat_de, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = ttokenize(mat_de, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = ttokenize(mat_de, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = ttokenize(mat_de, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = ttokenize(mat_de, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = ttokenize(mat_de, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = ttokenize(mat_de, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = ttokenize(mat_de, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = ttokenize(mat_de, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = ttokenize(mat_de, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = ttokenize(mat_de, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = ttokenize(mat_de, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = ttokenize(mat_de, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = ttokenize(mat_de, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = ttokenize(mat_de, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = ttokenize(mat_de, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = ttokenize(mat_de, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = ttokenize(mat_de, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = ttokenize(mat_de, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = ttokenize(mat_de, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = ttokenize(mat_de, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = ttokenize(mat_de, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = ttokenize(mat_de, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = ttokenize(mat_de, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = ttokenize(mat_de, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = ttokenize(mat_de, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1
	tokens = ttokenize(mat_de, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = ttokenize(mat_de, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = ttokenize(mat_de, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = ttokenize(mat_de, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	tokens = ttokenize(mat_de, w, "&quot;Das ist von C&A!&quot;")
	assert.Equal(tokens[0], "&quot;")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "&quot;")
	assert.Equal(len(tokens), 7)

	// testTokenizerLongEnd
	tokens = ttokenize(mat_de, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = ttokenize(mat_de, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = ttokenize(mat_de, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = ttokenize(mat_de, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = ttokenize(mat_de, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOuputArchive
	tokens = ttokenize(mat_de, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = ttokenize(mat_de, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = ttokenize(mat_de, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = ttokenize(mat_de, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	tokens = ttokenize(mat_de, w, "Es ist gleich 2:30 Uhr.")
	assert.Equal("Es", tokens[0])
	assert.Equal("ist", tokens[1])
	assert.Equal("gleich", tokens[2])
	assert.Equal("2:30", tokens[3])
	assert.Equal("Uhr", tokens[4])
	assert.Equal(".", tokens[5])
	assert.Equal(6, len(tokens))

	tokens = ttokenize(mat_de, w, "Sie schwamm die Strecke in 00:00:57,34 00:57,341 0:57 Stunden.")
	assert.Equal("Sie", tokens[0])
	assert.Equal("schwamm", tokens[1])
	assert.Equal("die", tokens[2])
	assert.Equal("Strecke", tokens[3])
	assert.Equal("in", tokens[4])
	assert.Equal("00:00:57,34", tokens[5])
	assert.Equal("00:57,341", tokens[6])
	assert.Equal("0:57", tokens[7])
	assert.Equal("Stunden", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))

	// waste example
	tokens = ttokenize(mat_de, w, "Am 24.1.1806 feierte E. T. A. Hoffmann seinen 30. Geburtstag.")
	assert.Equal(tokens[0], "Am")
	assert.Equal(tokens[1], "24.1.1806")
	assert.Equal(tokens[2], "feierte")
	assert.Equal(tokens[3], "E.")
	assert.Equal(tokens[4], "T.")
	assert.Equal(tokens[5], "A.")
	assert.Equal(tokens[6], "Hoffmann")
	assert.Equal(tokens[7], "seinen")
	assert.Equal(tokens[8], "30.")
	assert.Equal(tokens[9], "Geburtstag")
	assert.Equal(tokens[10], ".")
	assert.Equal(11, len(tokens))

	// IP test
	tokens = ttokenize(mat_de, w, "Meine IP ist 192.178.168.55.")
	assert.Equal(tokens[0], "Meine")
	assert.Equal(tokens[1], "IP")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "192.178.168.55")
	assert.Equal(tokens[4], ".")
	assert.Equal(5, len(tokens))

	// XML entities
	tokens = ttokenize(mat_de, w, "Das ist&nbsp;1:30 Stunden&20 Minuten zu spät &GT;.")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "&nbsp;")
	assert.Equal(tokens[3], "1:30")
	assert.Equal(tokens[4], "Stunden")
	assert.Equal(tokens[5], "&")
	assert.Equal(tokens[6], "20")
	assert.Equal(tokens[7], "Minuten")
	assert.Equal(tokens[8], "zu")
	assert.Equal(tokens[9], "spät")
	assert.Equal(tokens[10], "&GT;")
	assert.Equal(tokens[11], ".")
	assert.Equal(12, len(tokens))

	// Plusampersand compounds (1)
	tokens = ttokenize(mat_de, w, "Die 2G+-Regel soll weitere Covid-19-Erkrankungen reduzieren.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "2G+-Regel")
	assert.Equal(tokens[2], "soll")
	assert.Equal(tokens[3], "weitere")
	assert.Equal(tokens[4], "Covid-19-Erkrankungen")
	assert.Equal(tokens[5], "reduzieren")
	assert.Equal(tokens[6], ".")
	assert.Equal(7, len(tokens))

	// Plusampersand compounds (2)
	tokens = ttokenize(mat_de, w, "Der Neu-C++-Programmierer.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Neu-C++-Programmierer")
	assert.Equal(tokens[2], ".")
	assert.Equal(3, len(tokens))

	// z.B.
	tokens = ttokenize(mat_de, w, "Dies sind z.B. zwei Wörter - z. B. auch.")
	assert.Equal(tokens[0], "Dies")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "z.")
	assert.Equal(tokens[3], "B.")
	assert.Equal(tokens[4], "zwei")
	assert.Equal(tokens[5], "Wörter")
	assert.Equal(tokens[6], "-")
	assert.Equal(tokens[7], "z.")
	assert.Equal(tokens[8], "B.")
	assert.Equal(tokens[9], "auch")
	assert.Equal(tokens[10], ".")
	assert.Equal(11, len(tokens))
945
	// Single quote handling
	tokens = ttokenize(mat_de, w, "Es heißt 'Leitungssportteams' und nicht anders.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "heißt")
	assert.Equal(tokens[2], "'")
	assert.Equal(tokens[3], "Leitungssportteams")
	assert.Equal(tokens[4], "'")
	assert.Equal(tokens[5], "und")
	assert.Equal(tokens[6], "nicht")
	assert.Equal(tokens[7], "anders")
	assert.Equal(tokens[8], ".")
	assert.Equal(9, len(tokens))

	// Apostrophe handling
	tokens = ttokenize(mat_de, w, "Das ist Nils’ Einkaufskorb bei McDonald's.")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "Nils’")
	assert.Equal(tokens[3], "Einkaufskorb")
	assert.Equal(tokens[4], "bei")
	assert.Equal(tokens[5], "McDonald's")
	assert.Equal(tokens[6], ".")
	assert.Equal(7, len(tokens))
}
971
func TestMatrixFullTokenizerTokenSplitterEN(t *testing.T) {
	assert := assert.New(t)

	if mat_en == nil {
		mat_en = LoadMatrixFile("testdata/tokenizer_en.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testEnglishTokenizerScienceAbbreviations
	tokens = ttokenize(mat_en, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
	assert.Equal("Approx.", tokens[0])
	assert.Equal("in", tokens[1])
	assert.Equal("Sept.", tokens[2])
	assert.Equal("1954", tokens[3])
	assert.Equal(",", tokens[4])
	assert.Equal("Assoc.", tokens[5])
	assert.Equal("Prof.", tokens[6])
	assert.Equal("Dr.", tokens[7])
	assert.Equal("R.", tokens[8])
	assert.Equal("J.", tokens[9])
	assert.Equal("Ewing", tokens[10])
	assert.Equal("reviewed", tokens[11])
	assert.Equal("articles", tokens[12])
	assert.Equal("on", tokens[13])
	assert.Equal("Enzymol.", tokens[14])
	assert.Equal("Bacteriol.", tokens[15])
	assert.Equal("effects", tokens[16])
	assert.Equal("later", tokens[17])
	assert.Equal("published", tokens[18])
	assert.Equal("in", tokens[19])
	assert.Equal("Nutr.", tokens[20])
	assert.Equal("Rheumatol.", tokens[21])
	assert.Equal("No.", tokens[22])
	assert.Equal("12", tokens[23])
	assert.Equal("and", tokens[24])
	assert.Equal("Nº.", tokens[25])
	assert.Equal("13.", tokens[26])
	assert.Equal(",", tokens[27])
	assert.Equal("pp.", tokens[28])
	assert.Equal("17-18", tokens[29])
	assert.Equal(".", tokens[30])

	// englishTokenizerCanGuessWhetherIIsAbbrev
	tokens = ttokenize(mat_en, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
	assert.Equal("I.", tokens[1])
	assert.Equal("I", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal("I", tokens[12])
	assert.Equal(".", tokens[13])

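	// The commented-out cases below are carried over from the Java
	// KorAP-Tokenizer test suite (DerekoDfaTokenizer_*) and have not
	// been ported yet; they are kept for reference.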
	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testZipOuputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*

		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9);
		}
	*/
}
1102
func TestMatrixEmoticons(t *testing.T) {
	assert := assert.New(t)

	if mat_de == nil {
		mat_de = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat_de)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat_de, w, ":-* ;) :)) :*( ^___^ T__T ^^; -_-;;; -_-^")
	assert.Equal(tokens[0], ":-*")
	assert.Equal(tokens[1], ";)")
	assert.Equal(tokens[2], ":))")
	assert.Equal(tokens[3], ":*(")
	assert.Equal(tokens[4], "^___^")
	assert.Equal(tokens[5], "T__T")
	assert.Equal(tokens[6], "^^;")
	assert.Equal(tokens[7], "-_-;;;")
	assert.Equal(tokens[8], "-_-^")
	assert.Equal(len(tokens), 9)

	tokens = ttokenize(mat_de, w, "das -> Lustig<-!")
	assert.Equal("das", tokens[0])
	assert.Equal("->", tokens[1])
	assert.Equal("Lustig", tokens[2])
	assert.Equal("<-", tokens[3])
	assert.Equal("!", tokens[4])
	assert.Equal(5, len(tokens))
}
1136
func TestMatrixFullTokenizerXML(t *testing.T) {
	assert := assert.New(t)

	if mat_de == nil {
		mat_de = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat_de)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat_de, w, "Das <b>beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b>", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat_de, w, "Das <b class=\"c\">beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b class=\"c\">", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat_de, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
	assert.Equal("der", tokens[0])
	assert.Equal("<x y=\"alte \">", tokens[1])
	assert.Equal("<x x>", tokens[2])
	assert.Equal("alte", tokens[3])
	assert.Equal("</x>", tokens[4])
	assert.Equal("etc.", tokens[5])
	assert.Equal("et", tokens[6])
	assert.Equal(".", tokens[7])
	assert.Equal("Mann", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))

	tokens = ttokenize(mat_de, w, "das<br class=\"br\" />ging.")
	assert.Equal("das", tokens[0])
	assert.Equal("<br class=\"br\" />", tokens[1])
	assert.Equal("ging", tokens[2])
	assert.Equal(".", tokens[3])
	assert.Equal(4, len(tokens))

	tokens = ttokenize(mat_de, w, "das <?robot xgh ?> <!-- hm hm --> <![CDATA[ cdata ]]> <br />")
	assert.Equal("das", tokens[0])
	assert.Equal("<?robot", tokens[1])
	assert.Equal("xgh", tokens[2])
	assert.Equal("?>", tokens[3])
	assert.Equal("<!--", tokens[4])
	assert.Equal("hm", tokens[5])
	assert.Equal("hm", tokens[6])
	assert.Equal("-->", tokens[7])
	assert.Equal("<![CDATA[", tokens[8])
	assert.Equal("cdata", tokens[9])
	assert.Equal("]]>", tokens[10])
	assert.Equal("<br />", tokens[11])
	assert.Equal(12, len(tokens))
}
Akronabcb6a52021-10-09 15:52:08 +02001203func TestMatokDatokEquivalence(t *testing.T) {
1204 assert := assert.New(t)
1205
Akronbe3d3662023-04-26 13:22:38 +02001206 if mat_de == nil {
1207 mat_de = LoadMatrixFile("testdata/tokenizer.matok")
Akron9fb63af2021-10-28 01:15:53 +02001208 }
Akronabcb6a52021-10-09 15:52:08 +02001209 dat := LoadDatokFile("testdata/tokenizer.datok")
1210
1211 r := strings.NewReader(s)
1212
1213 tb := make([]byte, 0, 2048)
1214 w := bytes.NewBuffer(tb)
1215
1216 // Transduce with double array representation
1217 dat.Transduce(r, w)
1218
1219 datStr := w.String()
1220
1221 r.Reset(s)
1222 w.Reset()
1223
1224 // Transduce with matrix representation
Akronbe3d3662023-04-26 13:22:38 +02001225 mat_de.Transduce(r, w)
Akronabcb6a52021-10-09 15:52:08 +02001226
1227 matStr := w.String()
1228
1229 assert.Equal(datStr, matStr)
1230}
1231
func TestMatrixFullTokenizerCallbackTransduce(t *testing.T) {
	assert := assert.New(t)

	if mat_de == nil {
		mat_de = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat_de)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat_de.Transduce(strings.NewReader("Der alte Baum. Er war schon alt."), w))

	matStr := w.String()

	assert.Equal("Der\nalte\nBaum\n.\n\nEr\nwar\nschon\nalt\n.\n\n\n", matStr)
}
1250
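// Texts are delimited by the EOT character (\x04); as asserted below,
// each text is tokenized independently and terminated by an empty line.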
func TestMatrixFullTokenizerTextTreatment(t *testing.T) {
	assert := assert.New(t)

	if mat_de == nil {
		mat_de = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat_de)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat_de.Transduce(strings.NewReader("Erste.\n\n\n\n\x04\x0aNächst.\x04"), w))
	matStr := w.String()
	assert.Equal("Erste\n.\n\n\nNächst\n.\n\n\n", matStr)
}
Akrona854faa2021-10-22 19:31:08 +02001267
func TestMatrixFullTokenizerLongText(t *testing.T) {
	assert := assert.New(t)

	if mat_de == nil {
		mat_de = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat_de)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	text := `The Project Gutenberg EBook of Effi Briest, by Theodor Fontane

Copyright laws are changing all over the world. Be sure to check the
copyright laws for your country before downloading or redistributing
this or any other Project Gutenberg eBook.

This header should be the first thing seen when viewing this Project
Gutenberg file. Please do not remove it. Do not change or edit the
header without written permission.

Please read the "legal small print," and other information about the
eBook and Project Gutenberg at the bottom of this file. Included is
important information about your specific rights and restrictions in
how the file may be used. You can also find out about how to make a
donation to Project Gutenberg, and how to get involved.


**Welcome To The World of Free Plain Vanilla Electronic Texts**

**eBooks Readable By Both Humans and By Computers, Since 1971**

*****These eBooks Were Prepared By Thousands of Volunteers!*****


Title: Effi Briest

Author: Theodor Fontane

Release Date: March, 2004 [EBook #5323]
`

	assert.True(mat_de.Transduce(strings.NewReader(text), w))

	assert.True(strings.Contains(w.String(), "Release"))
}
1315
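// Leading whitespace should be trimmed, so the token stream must not
// begin with an empty token (see the assertion below).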
func TestMatrixTrimming(t *testing.T) {
	assert := assert.New(t)

	if mat_de == nil {
		mat_de = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat_de)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat_de.Transduce(strings.NewReader(" Erste."), w))
	matStr := w.String()
	assert.Equal("Erste\n.\n\n\n", matStr)
}
1332
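// Run with: go test -bench=MatrixTransduce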
func BenchmarkMatrixTransduce(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	r := strings.NewReader(s)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		w.Reset()
		r.Reset(s)
		ok := mat.Transduce(r, w)
		if !ok {
			fmt.Println("Fail!")
			fmt.Println(w.String())
			os.Exit(1)
		}
	}
}