blob: 37a61b291a01abadf5758c15f410e9bcae524508 [file] [log] [blame]
Akron1c34ce62021-09-23 23:27:39 +02001package datok
2
3import (
4 "bytes"
5 "strings"
6 "testing"
7
8 "github.com/stretchr/testify/assert"
9)
10
11func TestFullTokenizerMatrix(t *testing.T) {
12 assert := assert.New(t)
13 foma := LoadFomaFile("testdata/simpletok.fst")
14 assert.NotNil(foma)
15
16 mat := foma.ToMatrix()
17
18 r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
19 b := make([]byte, 0, 2048)
20 w := bytes.NewBuffer(b)
21 var tokens []string
22 mat.Transduce(r, w)
23 tokens = strings.Split(w.String(), "\n")
Akron5c82a922021-09-24 19:11:29 +020024 assert.Equal(len(tokens), 10)
Akron1c34ce62021-09-23 23:27:39 +020025 assert.Equal("wald", tokens[0])
26 assert.Equal("gehen", tokens[1])
27 assert.Equal("Da", tokens[2])
28 assert.Equal("kann", tokens[3])
29 assert.Equal("man", tokens[4])
30 assert.Equal("was", tokens[5])
31 assert.Equal("\"erleben\"", tokens[6])
32 assert.Equal("!", tokens[7])
Akron5c82a922021-09-24 19:11:29 +020033
34 r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
35 w.Reset()
36 mat.Transduce(r, w)
37 tokens = strings.Split(w.String(), "\n")
38 assert.Equal("In", tokens[0])
39 assert.Equal("den", tokens[1])
40 assert.Equal("Wald", tokens[2])
41 assert.Equal("gehen", tokens[3])
42 assert.Equal("?", tokens[4])
43 assert.Equal("--", tokens[5])
44
45 r = strings.NewReader(" g? -- D")
46 w.Reset()
47 mat.Transduce(r, w)
48 tokens = strings.Split(w.String(), "\n")
49 assert.Equal("g", tokens[0])
50 assert.Equal("?", tokens[1])
51 assert.Equal("--", tokens[2])
52 assert.Equal("D", tokens[3])
53 assert.Equal("", tokens[4])
54 assert.Equal("", tokens[5])
55 assert.Equal(6, len(tokens))
56}
57
Akron16c312e2021-09-26 13:11:12 +020058func TestReadWriteMatrixTokenizer(t *testing.T) {
59 assert := assert.New(t)
60 foma := LoadFomaFile("testdata/simpletok.fst")
61 assert.NotNil(foma)
62
63 mat := foma.ToMatrix()
64 assert.NotNil(foma)
65
66 assert.True(tmatch(mat, "bau"))
67 assert.True(tmatch(mat, "bad"))
68 assert.True(tmatch(mat, "wald gehen"))
69 b := make([]byte, 0, 1024)
70 buf := bytes.NewBuffer(b)
71 n, err := mat.WriteTo(buf)
72 assert.Nil(err)
73 assert.Equal(int64(248), n)
74 mat2 := ParseMatrix(buf)
75 assert.NotNil(mat2)
76 assert.Equal(mat.sigma, mat2.sigma)
77 assert.Equal(mat.epsilon, mat2.epsilon)
78 assert.Equal(mat.unknown, mat2.unknown)
79 assert.Equal(mat.identity, mat2.identity)
80 assert.Equal(mat.stateCount, mat2.stateCount)
81 assert.Equal(len(mat.array), len(mat2.array))
82 assert.Equal(mat.array, mat2.array)
83 assert.True(tmatch(mat2, "bau"))
84 assert.True(tmatch(mat2, "bad"))
85 assert.True(tmatch(mat2, "wald gehen"))
86}
87
Akron5c82a922021-09-24 19:11:29 +020088func TestFullTokenizerMatrixSentenceSplitter(t *testing.T) {
89 assert := assert.New(t)
90 foma := LoadFomaFile("testdata/tokenizer.fst")
91 assert.NotNil(foma)
92
93 mat := foma.ToMatrix()
94
95 b := make([]byte, 0, 2048)
96 w := bytes.NewBuffer(b)
97 var sentences []string
98
99 // testSentSplitterSimple
100 assert.True(mat.Transduce(strings.NewReader("Der alte Mann."), w))
101 sentences = strings.Split(w.String(), "\n\n")
102
103 assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
104 assert.Equal("Der\nalte\nMann\n.", sentences[0])
105 assert.Equal("", sentences[1])
106 assert.Equal(len(sentences), 2)
107
108 w.Reset()
109 assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
110 sentences = strings.Split(w.String(), "\n\n")
111 assert.Equal(len(sentences), 2)
112 assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
113 assert.Equal("", sentences[1])
114
115 /*
116
117 w.Reset()
118 assert.True(mat.Transduce(strings.NewReader(""), w))
119 sentences = strings.Split(w.String(), "\n\n")
120 assert.Equal(len(sentences), 1)
121 assert.Equal("\n", sentences[0])
122
123 w.Reset()
124 assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
125 sentences = strings.Split(w.String(), "\n\n")
126 assert.Equal(len(sentences), 2)
127
128 w.Reset()
129 assert.True(mat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
130 sentences = strings.Split(w.String(), "\n\n")
131 assert.Equal(len(sentences), 2)
132
133 w.Reset()
134 assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
135 sentences = strings.Split(w.String(), "\n\n")
136 assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
137 assert.Equal("", sentences[1])
138 assert.Equal(len(sentences), 2)
139
140 w.Reset()
141 assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
142 sentences = strings.Split(w.String(), "\n\n")
143 assert.Equal("", sentences[1])
144 assert.Equal(len(sentences), 2)
145
146 w.Reset()
147 assert.True(mat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
148 sentences = strings.Split(w.String(), "\n\n")
149 assert.Equal(len(sentences), 2)
150
151 w.Reset()
152 assert.True(mat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
153 sentences = strings.Split(w.String(), "\n\n")
154 assert.Equal(len(sentences), 2)
155
156 w.Reset()
157 assert.True(mat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
158 sentences = strings.Split(w.String(), "\n\n")
159 assert.Equal(len(sentences), 2)
160 assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
161 assert.Equal("", sentences[1])
162
163 w.Reset()
164 assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
165 sentences = strings.Split(w.String(), "\n\n")
166 assert.Equal(len(sentences), 3)
167 assert.Equal("Ausschalten\n!!!", sentences[0])
168 assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
169 assert.Equal("", sentences[2])
170
171 w.Reset()
172 assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
173 sentences = strings.Split(w.String(), "\n\n")
174 assert.Equal(len(sentences), 2)
175 */
176 /*
177 Test:
178 "\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
179 */
Akron1c34ce62021-09-23 23:27:39 +0200180}