blob: 356efa9de3c3db2de8c546dbe2bff52cfaae8789 [file] [log] [blame]
Akron1c34ce62021-09-23 23:27:39 +02001package datok
2
3import (
4 "bufio"
5 "fmt"
6 "io"
7)
8
9type MatrixTokenizer struct {
10 sigma map[rune]int
11 sigmaASCII [256]int
12 array []int
13 stateCount int
14
15 // Special symbols in sigma
16 epsilon int
17 unknown int
18 identity int
19 final int
20 tokenend int
21}
22
23// ToMatrix turns the intermediate tokenizer into a
24// matrix representation.
25func (auto *Automaton) ToMatrix() *MatrixTokenizer {
26
27 mat := &MatrixTokenizer{
28 sigma: make(map[rune]int),
29 final: auto.final,
30 unknown: auto.unknown,
31 identity: auto.identity,
32 epsilon: auto.epsilon,
33 tokenend: auto.tokenend,
34 stateCount: auto.stateCount,
35 }
36
37 mat.array = make([]int, (auto.stateCount+1)*(auto.sigmaCount+1))
38
39 for num, sym := range auto.sigmaRev {
40 if int(sym) < 256 {
41 mat.sigmaASCII[int(sym)] = num
42 }
43 mat.sigma[sym] = num
44 if num > auto.sigmaCount {
45 panic("sigmaCount is smaller")
46 }
47 }
48 remember := make([]bool, auto.stateCount+2)
49
50 // Store all transitions in matrix
51 var toMatrix func([]int, int)
52
53 toMatrix = func(matrix []int, start int) {
54 if start > auto.stateCount {
55 panic("stateCount is smaller")
56 }
57 if remember[start] {
58 return
59 }
60 remember[start] = true
61 for alpha, t := range auto.transitions[start] {
62 matrix[(alpha-1)*auto.stateCount+start] = t.end
63
64 // Mark nontoken transitions
65 if t.nontoken {
66 matrix[(alpha-1)*auto.stateCount+start] *= -1
67 }
68
69 toMatrix(matrix, t.end)
70 }
71 }
72
73 toMatrix(mat.array, 1)
74
75 return mat
76}
77
78func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
79 var a int
80 var t0 int
81 t := int(1) // Initial state
82 var ok, rewindBuffer bool
83
84 // Remember the last position of a possible tokenend,
85 // in case the automaton fails.
86 epsilonState := int(0)
87 epsilonOffset := 0
88
Akron5c82a922021-09-24 19:11:29 +020089 // Remember if the last transition was epsilon
90 sentenceEnd := false
91
Akron1c34ce62021-09-23 23:27:39 +020092 buffer := make([]rune, 1024)
93 buffo := 0 // Buffer offset
94 buffi := 0 // Buffer length
95
96 reader := bufio.NewReader(r)
97 writer := bufio.NewWriter(w)
98 defer writer.Flush()
99
100 var char rune
101
102 var err error
103 eof := false
104 newchar := true
105
106PARSECHARM:
107 for {
108
109 if newchar {
110 // Get from reader if buffer is empty
111 if buffo >= buffi {
112 if eof {
113 break
114 }
115 char, _, err = reader.ReadRune()
116
117 // No more runes to read
118 if err != nil {
119 eof = true
120 break
121 }
122 buffer[buffi] = char
123 buffi++
124 }
125
126 char = buffer[buffo]
127
128 if DEBUG {
129 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
130 }
131
132 // TODO:
133 // Better not repeatedly check for a!
134 // Possibly keep a buffer with a.
135 if int(char) < 256 {
136 a = mat.sigmaASCII[int(char)]
137 } else {
138 a, ok = mat.sigma[char]
139 if !ok {
140 a = 0
141 }
142 }
143
144 // Use identity symbol if character is not in sigma
145 if a == 0 && mat.identity != -1 {
146 a = mat.identity
147 }
148
149 t0 = t
150
151 // Check for epsilon transitions and remember
152
153 if mat.array[(mat.epsilon-1)*mat.stateCount+t0] != 0 {
154 // Remember state for backtracking to last tokenend state
155 epsilonState = t0
156 epsilonOffset = buffo
157 }
158 }
159
160 // Checks a transition based on t0, a and buffo
161 t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
162 // t = mat.array[t0].getBase() + uint32(a)
163 // ta := dat.array[t]
164
165 if DEBUG {
166 // Char is only relevant if set
167 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
168 /*
169 if false {
170 fmt.Println(dat.outgoing(t0))
171 }
172 */
173 }
174
175 // Check if the transition is invalid according to the double array
176 // if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
177 if t == 0 {
178
179 if DEBUG {
180 fmt.Println("Match is not fine!")
181 }
182
183 if !ok && a == mat.identity {
184
185 // Try again with unknown symbol, in case identity failed
186 // Char is only relevant when set
187 if DEBUG {
188 fmt.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
189 }
190 a = mat.unknown
191
192 } else if a != mat.epsilon {
193
194 // Try again with epsilon symbol, in case everything else failed
195 t0 = epsilonState
196 epsilonState = 0 // reset
197 buffo = epsilonOffset
198 a = mat.epsilon
199
200 if DEBUG {
201 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
202 }
203
204 } else {
205 break
206 }
207
208 newchar = false
209 continue
210 }
211
212 // Transition was successful
213 rewindBuffer = false
214
215 // Transition consumes a character
216 if a != mat.epsilon {
217
218 buffo++
219
220 // Transition does not produce a character
221 // if buffo == 1 && ta.isNonToken() {
222 if buffo == 1 && t < 0 {
223 if DEBUG {
224 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
225 }
226 rewindBuffer = true
227 }
228
229 } else {
230 // Transition marks the end of a token - so flush the buffer
231
232 if buffi > 0 {
233 if DEBUG {
234 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
235 }
236 writer.WriteString(string(buffer[:buffo]))
237 rewindBuffer = true
Akron5c82a922021-09-24 19:11:29 +0200238 sentenceEnd = false
239 } else {
240 sentenceEnd = true
Akron1c34ce62021-09-23 23:27:39 +0200241 }
242 if DEBUG {
243 fmt.Println("-> Newline")
244 }
245 writer.WriteRune('\n')
246 }
247
248 // Rewind the buffer if necessary
249 if rewindBuffer {
250
251 // TODO: Better as a ring buffer
252 for x, i := range buffer[buffo:buffi] {
253 buffer[x] = i
254 }
255
256 buffi -= buffo
257 epsilonOffset -= buffo
258 buffo = 0
259 if DEBUG {
260 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
261 }
262 }
263
264 // Move to representative state
265 /*
266 if ta.isSeparate() {
267 t = ta.getBase()
268 ta = dat.array[t]
269
270 if DEBUG {
271 fmt.Println("Representative pointing to", t)
272 }
273 }
274 */
275
276 // Ignore nontoken mark
277 if t < 0 {
278 t *= -1
279 }
280
281 newchar = true
282
283 // TODO:
284 // Prevent endless epsilon loops!
285 }
286
287 // Input reader is not yet finished
288 if !eof {
289 if DEBUG {
290 fmt.Println("Not at the end")
291 }
292 return false
293 }
294
295 if DEBUG {
296 fmt.Println("Entering final check")
297 }
298 /*
299 // Automaton is in a final state, so flush the buffer and return
300 x := dat.array[t].getBase() + uint32(dat.final)
301
302 if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
303
304 if buffi > 0 {
305 if DEBUG {
306 fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
307 }
308 writer.WriteString(string(buffer[:buffi]))
309
310 if dat.array[t].isTokenEnd() {
311 writer.WriteRune('\n')
312 if DEBUG {
313 fmt.Println("-> Newline")
314 }
315 }
316 }
317
318 // Add an additional sentence ending, if the file is over but no explicit
319 // sentence split was reached. This may be controversial and therefore
320 // optional via parameter.
321 if !dat.array[t0].isTokenEnd() {
322 writer.WriteRune('\n')
323 if DEBUG {
324 fmt.Println("-> Newline")
325 }
326 }
327
328 // TODO:
329 // There may be a new line at the end, from an epsilon,
330 // so we may need to go on!
331 return true
332 }
333 */
334
335 // Check epsilon transitions until a final state is reached
336 t0 = t
337 // t = dat.array[t0].getBase() + uint32(dat.epsilon)
338 t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
339 a = mat.epsilon
340 newchar = false
341 // if dat.array[t].getCheck() == t0 {
342 // t can't be < 0
343 if t > 0 {
344 // Remember state for backtracking to last tokenend state
345 goto PARSECHARM
346
347 } else if epsilonState != 0 {
348 t0 = epsilonState
349 epsilonState = 0 // reset
350 buffo = epsilonOffset
351 if DEBUG {
352 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
353 }
354 goto PARSECHARM
355 }
Akron1c34ce62021-09-23 23:27:39 +0200356
Akron5c82a922021-09-24 19:11:29 +0200357 // Add an additional sentence ending, if the file is over but no explicit
358 // sentence split was reached. This may be controversial and therefore
359 // optional via parameter.
360 if !sentenceEnd {
361 writer.WriteRune('\n')
362 if DEBUG {
363 fmt.Println("-> Newline")
364 }
365 }
366
367 return true
Akron1c34ce62021-09-23 23:27:39 +0200368}