blob: 0ec940e200344d9e62dfb55487f8b81748c8a652 [file] [log] [blame]
Akron1c34ce62021-09-23 23:27:39 +02001package datok
2
3import (
4 "bufio"
5 "fmt"
6 "io"
7)
8
9type MatrixTokenizer struct {
10 sigma map[rune]int
11 sigmaASCII [256]int
12 array []int
13 stateCount int
14
15 // Special symbols in sigma
16 epsilon int
17 unknown int
18 identity int
19 final int
20 tokenend int
21}
22
23// ToMatrix turns the intermediate tokenizer into a
24// matrix representation.
25func (auto *Automaton) ToMatrix() *MatrixTokenizer {
26
27 mat := &MatrixTokenizer{
28 sigma: make(map[rune]int),
29 final: auto.final,
30 unknown: auto.unknown,
31 identity: auto.identity,
32 epsilon: auto.epsilon,
33 tokenend: auto.tokenend,
34 stateCount: auto.stateCount,
35 }
36
37 mat.array = make([]int, (auto.stateCount+1)*(auto.sigmaCount+1))
38
39 for num, sym := range auto.sigmaRev {
40 if int(sym) < 256 {
41 mat.sigmaASCII[int(sym)] = num
42 }
43 mat.sigma[sym] = num
44 if num > auto.sigmaCount {
45 panic("sigmaCount is smaller")
46 }
47 }
48 remember := make([]bool, auto.stateCount+2)
49
50 // Store all transitions in matrix
51 var toMatrix func([]int, int)
52
53 toMatrix = func(matrix []int, start int) {
54 if start > auto.stateCount {
55 panic("stateCount is smaller")
56 }
57 if remember[start] {
58 return
59 }
60 remember[start] = true
61 for alpha, t := range auto.transitions[start] {
62 matrix[(alpha-1)*auto.stateCount+start] = t.end
63
64 // Mark nontoken transitions
65 if t.nontoken {
66 matrix[(alpha-1)*auto.stateCount+start] *= -1
67 }
68
69 toMatrix(matrix, t.end)
70 }
71 }
72
73 toMatrix(mat.array, 1)
74
75 return mat
76}
77
78func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
79 var a int
80 var t0 int
81 t := int(1) // Initial state
82 var ok, rewindBuffer bool
83
84 // Remember the last position of a possible tokenend,
85 // in case the automaton fails.
86 epsilonState := int(0)
87 epsilonOffset := 0
88
89 buffer := make([]rune, 1024)
90 buffo := 0 // Buffer offset
91 buffi := 0 // Buffer length
92
93 reader := bufio.NewReader(r)
94 writer := bufio.NewWriter(w)
95 defer writer.Flush()
96
97 var char rune
98
99 var err error
100 eof := false
101 newchar := true
102
103PARSECHARM:
104 for {
105
106 if newchar {
107 // Get from reader if buffer is empty
108 if buffo >= buffi {
109 if eof {
110 break
111 }
112 char, _, err = reader.ReadRune()
113
114 // No more runes to read
115 if err != nil {
116 eof = true
117 break
118 }
119 buffer[buffi] = char
120 buffi++
121 }
122
123 char = buffer[buffo]
124
125 if DEBUG {
126 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
127 }
128
129 // TODO:
130 // Better not repeatedly check for a!
131 // Possibly keep a buffer with a.
132 if int(char) < 256 {
133 a = mat.sigmaASCII[int(char)]
134 } else {
135 a, ok = mat.sigma[char]
136 if !ok {
137 a = 0
138 }
139 }
140
141 // Use identity symbol if character is not in sigma
142 if a == 0 && mat.identity != -1 {
143 a = mat.identity
144 }
145
146 t0 = t
147
148 // Check for epsilon transitions and remember
149
150 if mat.array[(mat.epsilon-1)*mat.stateCount+t0] != 0 {
151 // Remember state for backtracking to last tokenend state
152 epsilonState = t0
153 epsilonOffset = buffo
154 }
155 }
156
157 // Checks a transition based on t0, a and buffo
158 t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
159 // t = mat.array[t0].getBase() + uint32(a)
160 // ta := dat.array[t]
161
162 if DEBUG {
163 // Char is only relevant if set
164 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
165 /*
166 if false {
167 fmt.Println(dat.outgoing(t0))
168 }
169 */
170 }
171
172 // Check if the transition is invalid according to the double array
173 // if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
174 if t == 0 {
175
176 if DEBUG {
177 fmt.Println("Match is not fine!")
178 }
179
180 if !ok && a == mat.identity {
181
182 // Try again with unknown symbol, in case identity failed
183 // Char is only relevant when set
184 if DEBUG {
185 fmt.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
186 }
187 a = mat.unknown
188
189 } else if a != mat.epsilon {
190
191 // Try again with epsilon symbol, in case everything else failed
192 t0 = epsilonState
193 epsilonState = 0 // reset
194 buffo = epsilonOffset
195 a = mat.epsilon
196
197 if DEBUG {
198 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
199 }
200
201 } else {
202 break
203 }
204
205 newchar = false
206 continue
207 }
208
209 // Transition was successful
210 rewindBuffer = false
211
212 // Transition consumes a character
213 if a != mat.epsilon {
214
215 buffo++
216
217 // Transition does not produce a character
218 // if buffo == 1 && ta.isNonToken() {
219 if buffo == 1 && t < 0 {
220 if DEBUG {
221 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
222 }
223 rewindBuffer = true
224 }
225
226 } else {
227 // Transition marks the end of a token - so flush the buffer
228
229 if buffi > 0 {
230 if DEBUG {
231 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
232 }
233 writer.WriteString(string(buffer[:buffo]))
234 rewindBuffer = true
235 }
236 if DEBUG {
237 fmt.Println("-> Newline")
238 }
239 writer.WriteRune('\n')
240 }
241
242 // Rewind the buffer if necessary
243 if rewindBuffer {
244
245 // TODO: Better as a ring buffer
246 for x, i := range buffer[buffo:buffi] {
247 buffer[x] = i
248 }
249
250 buffi -= buffo
251 epsilonOffset -= buffo
252 buffo = 0
253 if DEBUG {
254 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
255 }
256 }
257
258 // Move to representative state
259 /*
260 if ta.isSeparate() {
261 t = ta.getBase()
262 ta = dat.array[t]
263
264 if DEBUG {
265 fmt.Println("Representative pointing to", t)
266 }
267 }
268 */
269
270 // Ignore nontoken mark
271 if t < 0 {
272 t *= -1
273 }
274
275 newchar = true
276
277 // TODO:
278 // Prevent endless epsilon loops!
279 }
280
281 // Input reader is not yet finished
282 if !eof {
283 if DEBUG {
284 fmt.Println("Not at the end")
285 }
286 return false
287 }
288
289 if DEBUG {
290 fmt.Println("Entering final check")
291 }
292 /*
293 // Automaton is in a final state, so flush the buffer and return
294 x := dat.array[t].getBase() + uint32(dat.final)
295
296 if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
297
298 if buffi > 0 {
299 if DEBUG {
300 fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
301 }
302 writer.WriteString(string(buffer[:buffi]))
303
304 if dat.array[t].isTokenEnd() {
305 writer.WriteRune('\n')
306 if DEBUG {
307 fmt.Println("-> Newline")
308 }
309 }
310 }
311
312 // Add an additional sentence ending, if the file is over but no explicit
313 // sentence split was reached. This may be controversial and therefore
314 // optional via parameter.
315 if !dat.array[t0].isTokenEnd() {
316 writer.WriteRune('\n')
317 if DEBUG {
318 fmt.Println("-> Newline")
319 }
320 }
321
322 // TODO:
323 // There may be a new line at the end, from an epsilon,
324 // so we may need to go on!
325 return true
326 }
327 */
328
329 // Check epsilon transitions until a final state is reached
330 t0 = t
331 // t = dat.array[t0].getBase() + uint32(dat.epsilon)
332 t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
333 a = mat.epsilon
334 newchar = false
335 // if dat.array[t].getCheck() == t0 {
336 // t can't be < 0
337 if t > 0 {
338 // Remember state for backtracking to last tokenend state
339 goto PARSECHARM
340
341 } else if epsilonState != 0 {
342 t0 = epsilonState
343 epsilonState = 0 // reset
344 buffo = epsilonOffset
345 if DEBUG {
346 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
347 }
348 goto PARSECHARM
349 }
350 return false
351
352}