blob: b800d1a472cd381eec7d777aec7a99ddd2d2f051 [file] [log] [blame]
Akron1c34ce62021-09-23 23:27:39 +02001package datok
2
3import (
4 "bufio"
Akron16c312e2021-09-26 13:11:12 +02005 "compress/gzip"
Akron1c34ce62021-09-23 23:27:39 +02006 "fmt"
7 "io"
Akron16c312e2021-09-26 13:11:12 +02008 "log"
9 "os"
10)
11
12const (
13 MAMAGIC = "MATOK"
Akron1c34ce62021-09-23 23:27:39 +020014)
15
16type MatrixTokenizer struct {
17 sigma map[rune]int
18 sigmaASCII [256]int
Akron16c312e2021-09-26 13:11:12 +020019 array []uint32
Akron1c34ce62021-09-23 23:27:39 +020020 stateCount int
21
22 // Special symbols in sigma
23 epsilon int
24 unknown int
25 identity int
Akron1c34ce62021-09-23 23:27:39 +020026}
27
28// ToMatrix turns the intermediate tokenizer into a
29// matrix representation.
30func (auto *Automaton) ToMatrix() *MatrixTokenizer {
31
32 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +020033 sigma: make(map[rune]int),
34 unknown: auto.unknown,
35 identity: auto.identity,
36 epsilon: auto.epsilon,
Akron1c34ce62021-09-23 23:27:39 +020037 stateCount: auto.stateCount,
38 }
39
Akron28031b72021-10-02 13:07:25 +020040 max := 0
Akron1c34ce62021-09-23 23:27:39 +020041 for num, sym := range auto.sigmaRev {
42 if int(sym) < 256 {
43 mat.sigmaASCII[int(sym)] = num
44 }
45 mat.sigma[sym] = num
46 if num > auto.sigmaCount {
47 panic("sigmaCount is smaller")
48 }
Akron28031b72021-10-02 13:07:25 +020049 if num > max {
50 max = num
51 }
Akron1c34ce62021-09-23 23:27:39 +020052 }
Akron28031b72021-10-02 13:07:25 +020053 // Add final entry to the list (maybe not necessary actually)
54
Akron1c34ce62021-09-23 23:27:39 +020055 remember := make([]bool, auto.stateCount+2)
56
Akron28031b72021-10-02 13:07:25 +020057 // lower sigmaCount, as no final value exists
58 mat.array = make([]uint32, (auto.stateCount+1)*(max+1))
59
Akron1c34ce62021-09-23 23:27:39 +020060 // Store all transitions in matrix
Akron16c312e2021-09-26 13:11:12 +020061 var toMatrix func([]uint32, int)
Akron1c34ce62021-09-23 23:27:39 +020062
Akron16c312e2021-09-26 13:11:12 +020063 toMatrix = func(matrix []uint32, start int) {
Akron1c34ce62021-09-23 23:27:39 +020064 if start > auto.stateCount {
65 panic("stateCount is smaller")
66 }
67 if remember[start] {
68 return
69 }
70 remember[start] = true
71 for alpha, t := range auto.transitions[start] {
Akron16c312e2021-09-26 13:11:12 +020072 matrix[(alpha-1)*auto.stateCount+start] = uint32(t.end)
Akron1c34ce62021-09-23 23:27:39 +020073
74 // Mark nontoken transitions
75 if t.nontoken {
Akron16c312e2021-09-26 13:11:12 +020076 matrix[(alpha-1)*auto.stateCount+start] |= FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +020077 }
78
79 toMatrix(matrix, t.end)
80 }
81 }
82
83 toMatrix(mat.array, 1)
84
85 return mat
86}
87
Akron941f2152021-09-26 15:14:25 +020088// Type of tokenizer
89func (MatrixTokenizer) Type() string {
90 return MAMAGIC
91}
92
Akron16c312e2021-09-26 13:11:12 +020093// Save stores the matrix data in a file
94func (mat *MatrixTokenizer) Save(file string) (n int64, err error) {
95 f, err := os.Create(file)
96 if err != nil {
97 log.Println(err)
98 return 0, err
99 }
100 defer f.Close()
101 gz := gzip.NewWriter(f)
102 defer gz.Close()
103 n, err = mat.WriteTo(gz)
104 if err != nil {
105 log.Println(err)
106 return n, err
107 }
108 gz.Flush()
109 return n, nil
110}
111
112// WriteTo stores the matrix data in an io.Writer.
113func (mat *MatrixTokenizer) WriteTo(w io.Writer) (n int64, err error) {
114
115 wb := bufio.NewWriter(w)
116 defer wb.Flush()
117
118 // Store magical header
119 all, err := wb.Write([]byte(MAMAGIC))
120 if err != nil {
121 log.Println(err)
122 return int64(all), err
123 }
124
125 // Get sigma as a list
Akron28031b72021-10-02 13:07:25 +0200126 // In datok it's 16 - 4*4
127 sigmalist := make([]rune, len(mat.sigma)+16)
Akron16c312e2021-09-26 13:11:12 +0200128 max := 0
129 for sym, num := range mat.sigma {
130 sigmalist[num] = sym
131 if num > max {
132 max = num
133 }
134 }
135
Akron28031b72021-10-02 13:07:25 +0200136 // Add final entry to the list (maybe not necessary actually)
Akron16c312e2021-09-26 13:11:12 +0200137 sigmalist = sigmalist[:max+1]
138
Akron28031b72021-10-02 13:07:25 +0200139 buf := make([]byte, 0, 14)
Akron16c312e2021-09-26 13:11:12 +0200140 bo.PutUint16(buf[0:2], VERSION)
141 bo.PutUint16(buf[2:4], uint16(mat.epsilon))
142 bo.PutUint16(buf[4:6], uint16(mat.unknown))
143 bo.PutUint16(buf[6:8], uint16(mat.identity))
Akron28031b72021-10-02 13:07:25 +0200144 bo.PutUint32(buf[8:12], uint32(mat.stateCount))
145 bo.PutUint16(buf[12:14], uint16(len(sigmalist)))
146 more, err := wb.Write(buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200147 if err != nil {
148 log.Println(err)
149 return int64(all), err
150 }
151
152 all += more
153
154 // Write sigma
155 for _, sym := range sigmalist {
156
157 more, err = wb.WriteRune(sym)
158 if err != nil {
159 log.Println(err)
160 return int64(all), err
161 }
162 all += more
163 }
164
165 if err != nil {
166 log.Println(err)
167 return int64(all), err
168 }
169
170 // Test marker - could be checksum
171 more, err = wb.Write([]byte("M"))
172 if err != nil {
173 log.Println(err)
174 return int64(all), err
175 }
176 all += more
177
Akron16c312e2021-09-26 13:11:12 +0200178 for _, x := range mat.array {
179 bo.PutUint32(buf[0:4], uint32(x))
180 more, err = wb.Write(buf[0:4])
181 if err != nil {
182 log.Println(err)
183 return int64(all), err
184 }
185 all += more
186 if more != 4 {
187 log.Println("Can not write base uint32")
188 return int64(all), err
189 }
Akron16c312e2021-09-26 13:11:12 +0200190 }
191
192 return int64(all), err
193}
194
195// LoadDatokFile reads a double array represented tokenizer
196// from a file.
197func LoadMatrixFile(file string) *MatrixTokenizer {
198 f, err := os.Open(file)
199 if err != nil {
200 log.Println(err)
201 return nil
202 }
203 defer f.Close()
204
205 gz, err := gzip.NewReader(f)
206 if err != nil {
207 log.Println(err)
208 return nil
209 }
210 defer gz.Close()
211
212 // Todo: Read the whole file!
213 return ParseMatrix(gz)
214}
215
216// LoadMatrixFile reads a matrix represented tokenizer
217// from an io.Reader
218func ParseMatrix(ior io.Reader) *MatrixTokenizer {
219
220 // Initialize tokenizer with default values
221 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +0200222 sigma: make(map[rune]int),
223 epsilon: 0,
224 unknown: 0,
225 identity: 0,
Akron16c312e2021-09-26 13:11:12 +0200226 stateCount: 0,
Akron16c312e2021-09-26 13:11:12 +0200227 }
228
229 r := bufio.NewReader(ior)
230
231 buf := make([]byte, 1024)
232 buf = buf[0:len(MAMAGIC)]
233
234 _, err := r.Read(buf)
235
236 if err != nil {
237 log.Println(err)
238 return nil
239 }
240
241 if string(MAMAGIC) != string(buf) {
242 log.Println("Not a matok file")
243 return nil
244 }
245
Akron28031b72021-10-02 13:07:25 +0200246 more, err := io.ReadFull(r, buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200247 if err != nil {
248 log.Println(err)
249 return nil
250 }
251
Akron28031b72021-10-02 13:07:25 +0200252 if more != 14 {
Akron16c312e2021-09-26 13:11:12 +0200253 log.Println("Read bytes do not fit")
254 return nil
255 }
256
257 version := bo.Uint16(buf[0:2])
258
259 if version != VERSION {
260 log.Println("Version not compatible")
261 return nil
262 }
263
264 mat.epsilon = int(bo.Uint16(buf[2:4]))
265 mat.unknown = int(bo.Uint16(buf[4:6]))
266 mat.identity = int(bo.Uint16(buf[6:8]))
Akron28031b72021-10-02 13:07:25 +0200267 mat.stateCount = int(bo.Uint32(buf[8:12]))
268 sigmaCount := int(bo.Uint16(buf[12:14]))
269 arraySize := (mat.stateCount + 1) * sigmaCount
Akron16c312e2021-09-26 13:11:12 +0200270
Akron16c312e2021-09-26 13:11:12 +0200271 for x := 0; x < sigmaCount; x++ {
272 sym, _, err := r.ReadRune()
273 if err == nil && sym != 0 {
274 if int(sym) < 256 {
275 mat.sigmaASCII[int(sym)] = x
276 }
277 mat.sigma[sym] = x
278 }
279 }
280
281 _, err = io.ReadFull(r, buf[0:1])
282
283 if err != nil {
284 log.Print(err)
285 return nil
286 }
287
288 if string("M") != string(buf[0:1]) {
289 log.Println("Not a matok file")
290 return nil
291 }
292
293 // Read based on length
294 mat.array = make([]uint32, arraySize)
295
296 dataArray, err := io.ReadAll(r)
297
298 if err == io.EOF {
299 log.Println(err)
300 return nil
301 }
302
303 if len(dataArray) < arraySize*4 {
Akron28031b72021-10-02 13:07:25 +0200304 log.Println("Not enough bytes read", len(dataArray), arraySize*4)
Akron16c312e2021-09-26 13:11:12 +0200305 return nil
306 }
307
308 for x := 0; x < arraySize; x++ {
Akron16c312e2021-09-26 13:11:12 +0200309 mat.array[x] = bo.Uint32(dataArray[x*4 : (x*4)+4])
310 }
311
312 return mat
313}
314
Akron1c34ce62021-09-23 23:27:39 +0200315func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
316 var a int
Akron16c312e2021-09-26 13:11:12 +0200317 var t0 uint32
318 t := uint32(1) // Initial state
Akron1c34ce62021-09-23 23:27:39 +0200319 var ok, rewindBuffer bool
320
321 // Remember the last position of a possible tokenend,
322 // in case the automaton fails.
Akron16c312e2021-09-26 13:11:12 +0200323 epsilonState := uint32(0)
Akron1c34ce62021-09-23 23:27:39 +0200324 epsilonOffset := 0
325
Akron5c82a922021-09-24 19:11:29 +0200326 // Remember if the last transition was epsilon
327 sentenceEnd := false
328
Akron1c34ce62021-09-23 23:27:39 +0200329 buffer := make([]rune, 1024)
330 buffo := 0 // Buffer offset
331 buffi := 0 // Buffer length
332
333 reader := bufio.NewReader(r)
334 writer := bufio.NewWriter(w)
335 defer writer.Flush()
336
337 var char rune
338
339 var err error
340 eof := false
341 newchar := true
342
343PARSECHARM:
344 for {
345
346 if newchar {
347 // Get from reader if buffer is empty
348 if buffo >= buffi {
349 if eof {
350 break
351 }
352 char, _, err = reader.ReadRune()
353
354 // No more runes to read
355 if err != nil {
356 eof = true
357 break
358 }
359 buffer[buffi] = char
360 buffi++
361 }
362
363 char = buffer[buffo]
364
365 if DEBUG {
366 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
367 }
368
369 // TODO:
370 // Better not repeatedly check for a!
371 // Possibly keep a buffer with a.
372 if int(char) < 256 {
373 a = mat.sigmaASCII[int(char)]
374 } else {
375 a, ok = mat.sigma[char]
376 if !ok {
377 a = 0
378 }
379 }
380
381 // Use identity symbol if character is not in sigma
382 if a == 0 && mat.identity != -1 {
383 a = mat.identity
384 }
385
386 t0 = t
387
388 // Check for epsilon transitions and remember
389
Akron16c312e2021-09-26 13:11:12 +0200390 // TODO: Can t0 be negative here?
391 if mat.array[(mat.epsilon-1)*mat.stateCount+int(t0)] != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200392 // Remember state for backtracking to last tokenend state
Akron16c312e2021-09-26 13:11:12 +0200393
394 // Maybe not necessary - and should be simpler!
395 // Just Remove
396 t0 &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200397 epsilonState = t0
398 epsilonOffset = buffo
Akron16c312e2021-09-26 13:11:12 +0200399
400 if DEBUG {
401 fmt.Println("epsilonOffset is set to", buffo)
402 }
Akron1c34ce62021-09-23 23:27:39 +0200403 }
404 }
405
406 // Checks a transition based on t0, a and buffo
407 t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
Akron1c34ce62021-09-23 23:27:39 +0200408
409 if DEBUG {
410 // Char is only relevant if set
411 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
Akron1c34ce62021-09-23 23:27:39 +0200412 }
413
414 // Check if the transition is invalid according to the double array
Akron1c34ce62021-09-23 23:27:39 +0200415 if t == 0 {
416
417 if DEBUG {
418 fmt.Println("Match is not fine!")
419 }
420
421 if !ok && a == mat.identity {
422
423 // Try again with unknown symbol, in case identity failed
424 // Char is only relevant when set
425 if DEBUG {
426 fmt.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
427 }
428 a = mat.unknown
429
430 } else if a != mat.epsilon {
431
432 // Try again with epsilon symbol, in case everything else failed
433 t0 = epsilonState
434 epsilonState = 0 // reset
435 buffo = epsilonOffset
436 a = mat.epsilon
437
438 if DEBUG {
439 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
440 }
441
442 } else {
443 break
444 }
445
446 newchar = false
447 continue
448 }
449
450 // Transition was successful
451 rewindBuffer = false
452
453 // Transition consumes a character
454 if a != mat.epsilon {
455
456 buffo++
457
458 // Transition does not produce a character
Akron16c312e2021-09-26 13:11:12 +0200459 if buffo == 1 && (t&FIRSTBIT) != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200460 if DEBUG {
461 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
462 }
463 rewindBuffer = true
464 }
465
466 } else {
467 // Transition marks the end of a token - so flush the buffer
Akron1c34ce62021-09-23 23:27:39 +0200468 if buffi > 0 {
469 if DEBUG {
470 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
471 }
472 writer.WriteString(string(buffer[:buffo]))
473 rewindBuffer = true
Akron5c82a922021-09-24 19:11:29 +0200474 sentenceEnd = false
475 } else {
476 sentenceEnd = true
Akron1c34ce62021-09-23 23:27:39 +0200477 }
478 if DEBUG {
479 fmt.Println("-> Newline")
480 }
481 writer.WriteRune('\n')
482 }
483
484 // Rewind the buffer if necessary
485 if rewindBuffer {
486
Akron16c312e2021-09-26 13:11:12 +0200487 if DEBUG {
488 fmt.Println("-> Rewind buffer", buffo, buffi, epsilonOffset)
489 }
490
Akron1c34ce62021-09-23 23:27:39 +0200491 // TODO: Better as a ring buffer
492 for x, i := range buffer[buffo:buffi] {
493 buffer[x] = i
494 }
495
496 buffi -= buffo
Akron16c312e2021-09-26 13:11:12 +0200497 // epsilonOffset -= buffo
498 epsilonOffset = 0
499 epsilonState = 0
500
Akron1c34ce62021-09-23 23:27:39 +0200501 buffo = 0
502 if DEBUG {
503 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
504 }
505 }
506
Akron16c312e2021-09-26 13:11:12 +0200507 t &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200508
509 newchar = true
510
511 // TODO:
512 // Prevent endless epsilon loops!
513 }
514
515 // Input reader is not yet finished
516 if !eof {
517 if DEBUG {
518 fmt.Println("Not at the end")
519 }
520 return false
521 }
522
523 if DEBUG {
524 fmt.Println("Entering final check")
525 }
Akron1c34ce62021-09-23 23:27:39 +0200526
527 // Check epsilon transitions until a final state is reached
528 t0 = t
Akron1c34ce62021-09-23 23:27:39 +0200529 t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
530 a = mat.epsilon
531 newchar = false
Akron1c34ce62021-09-23 23:27:39 +0200532 // t can't be < 0
Akron16c312e2021-09-26 13:11:12 +0200533 if t != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200534 // Remember state for backtracking to last tokenend state
535 goto PARSECHARM
536
537 } else if epsilonState != 0 {
538 t0 = epsilonState
539 epsilonState = 0 // reset
540 buffo = epsilonOffset
541 if DEBUG {
542 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
543 }
544 goto PARSECHARM
545 }
Akron1c34ce62021-09-23 23:27:39 +0200546
Akron5c82a922021-09-24 19:11:29 +0200547 // Add an additional sentence ending, if the file is over but no explicit
548 // sentence split was reached. This may be controversial and therefore
549 // optional via parameter.
550 if !sentenceEnd {
551 writer.WriteRune('\n')
552 if DEBUG {
553 fmt.Println("-> Newline")
554 }
555 }
556
557 return true
Akron1c34ce62021-09-23 23:27:39 +0200558}