blob: ac53b9afe96630576bdf9cb7d60ed834759a5b7e [file] [log] [blame]
Akron1c34ce62021-09-23 23:27:39 +02001package datok
2
3import (
4 "bufio"
Akron16c312e2021-09-26 13:11:12 +02005 "compress/gzip"
Akron1c34ce62021-09-23 23:27:39 +02006 "fmt"
7 "io"
Akron16c312e2021-09-26 13:11:12 +02008 "log"
9 "os"
10)
11
12const (
13 MAMAGIC = "MATOK"
Akron1c34ce62021-09-23 23:27:39 +020014)
15
16type MatrixTokenizer struct {
17 sigma map[rune]int
18 sigmaASCII [256]int
Akron16c312e2021-09-26 13:11:12 +020019 array []uint32
Akron1c34ce62021-09-23 23:27:39 +020020 stateCount int
21
22 // Special symbols in sigma
23 epsilon int
24 unknown int
25 identity int
Akron1c34ce62021-09-23 23:27:39 +020026}
27
28// ToMatrix turns the intermediate tokenizer into a
29// matrix representation.
30func (auto *Automaton) ToMatrix() *MatrixTokenizer {
31
32 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +020033 sigma: make(map[rune]int),
34 unknown: auto.unknown,
35 identity: auto.identity,
36 epsilon: auto.epsilon,
Akron1c34ce62021-09-23 23:27:39 +020037 stateCount: auto.stateCount,
38 }
39
Akron28031b72021-10-02 13:07:25 +020040 max := 0
Akron1c34ce62021-09-23 23:27:39 +020041 for num, sym := range auto.sigmaRev {
42 if int(sym) < 256 {
43 mat.sigmaASCII[int(sym)] = num
44 }
45 mat.sigma[sym] = num
46 if num > auto.sigmaCount {
47 panic("sigmaCount is smaller")
48 }
Akron28031b72021-10-02 13:07:25 +020049 if num > max {
50 max = num
51 }
Akron1c34ce62021-09-23 23:27:39 +020052 }
Akron28031b72021-10-02 13:07:25 +020053 // Add final entry to the list (maybe not necessary actually)
54
Akron1c34ce62021-09-23 23:27:39 +020055 remember := make([]bool, auto.stateCount+2)
56
Akron28031b72021-10-02 13:07:25 +020057 // lower sigmaCount, as no final value exists
58 mat.array = make([]uint32, (auto.stateCount+1)*(max+1))
59
Akron1c34ce62021-09-23 23:27:39 +020060 // Store all transitions in matrix
Akron16c312e2021-09-26 13:11:12 +020061 var toMatrix func([]uint32, int)
Akron1c34ce62021-09-23 23:27:39 +020062
Akron16c312e2021-09-26 13:11:12 +020063 toMatrix = func(matrix []uint32, start int) {
Akron1c34ce62021-09-23 23:27:39 +020064 if start > auto.stateCount {
65 panic("stateCount is smaller")
66 }
67 if remember[start] {
68 return
69 }
70 remember[start] = true
71 for alpha, t := range auto.transitions[start] {
Akron16c312e2021-09-26 13:11:12 +020072 matrix[(alpha-1)*auto.stateCount+start] = uint32(t.end)
Akron1c34ce62021-09-23 23:27:39 +020073
74 // Mark nontoken transitions
75 if t.nontoken {
Akron16c312e2021-09-26 13:11:12 +020076 matrix[(alpha-1)*auto.stateCount+start] |= FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +020077 }
78
79 toMatrix(matrix, t.end)
80 }
81 }
82
83 toMatrix(mat.array, 1)
84
85 return mat
86}
87
Akron941f2152021-09-26 15:14:25 +020088// Type of tokenizer
89func (MatrixTokenizer) Type() string {
90 return MAMAGIC
91}
92
Akron16c312e2021-09-26 13:11:12 +020093// Save stores the matrix data in a file
94func (mat *MatrixTokenizer) Save(file string) (n int64, err error) {
95 f, err := os.Create(file)
96 if err != nil {
97 log.Println(err)
98 return 0, err
99 }
100 defer f.Close()
101 gz := gzip.NewWriter(f)
102 defer gz.Close()
103 n, err = mat.WriteTo(gz)
104 if err != nil {
105 log.Println(err)
106 return n, err
107 }
108 gz.Flush()
109 return n, nil
110}
111
112// WriteTo stores the matrix data in an io.Writer.
113func (mat *MatrixTokenizer) WriteTo(w io.Writer) (n int64, err error) {
114
115 wb := bufio.NewWriter(w)
116 defer wb.Flush()
117
118 // Store magical header
119 all, err := wb.Write([]byte(MAMAGIC))
120 if err != nil {
121 log.Println(err)
122 return int64(all), err
123 }
124
125 // Get sigma as a list
Akron28031b72021-10-02 13:07:25 +0200126 // In datok it's 16 - 4*4
127 sigmalist := make([]rune, len(mat.sigma)+16)
Akron16c312e2021-09-26 13:11:12 +0200128 max := 0
129 for sym, num := range mat.sigma {
130 sigmalist[num] = sym
131 if num > max {
132 max = num
133 }
134 }
135
Akron28031b72021-10-02 13:07:25 +0200136 // Add final entry to the list (maybe not necessary actually)
Akron16c312e2021-09-26 13:11:12 +0200137 sigmalist = sigmalist[:max+1]
138
Akron28031b72021-10-02 13:07:25 +0200139 buf := make([]byte, 0, 14)
Akron16c312e2021-09-26 13:11:12 +0200140 bo.PutUint16(buf[0:2], VERSION)
141 bo.PutUint16(buf[2:4], uint16(mat.epsilon))
142 bo.PutUint16(buf[4:6], uint16(mat.unknown))
143 bo.PutUint16(buf[6:8], uint16(mat.identity))
Akron28031b72021-10-02 13:07:25 +0200144 bo.PutUint32(buf[8:12], uint32(mat.stateCount))
145 bo.PutUint16(buf[12:14], uint16(len(sigmalist)))
146 more, err := wb.Write(buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200147 if err != nil {
148 log.Println(err)
149 return int64(all), err
150 }
151
152 all += more
153
154 // Write sigma
155 for _, sym := range sigmalist {
156
157 more, err = wb.WriteRune(sym)
158 if err != nil {
159 log.Println(err)
160 return int64(all), err
161 }
162 all += more
163 }
164
165 if err != nil {
166 log.Println(err)
167 return int64(all), err
168 }
169
170 // Test marker - could be checksum
171 more, err = wb.Write([]byte("M"))
172 if err != nil {
173 log.Println(err)
174 return int64(all), err
175 }
176 all += more
177
Akron16c312e2021-09-26 13:11:12 +0200178 for _, x := range mat.array {
179 bo.PutUint32(buf[0:4], uint32(x))
180 more, err = wb.Write(buf[0:4])
181 if err != nil {
182 log.Println(err)
183 return int64(all), err
184 }
185 all += more
186 if more != 4 {
187 log.Println("Can not write base uint32")
188 return int64(all), err
189 }
Akron16c312e2021-09-26 13:11:12 +0200190 }
191
192 return int64(all), err
193}
194
195// LoadDatokFile reads a double array represented tokenizer
196// from a file.
197func LoadMatrixFile(file string) *MatrixTokenizer {
198 f, err := os.Open(file)
199 if err != nil {
200 log.Println(err)
201 return nil
202 }
203 defer f.Close()
204
205 gz, err := gzip.NewReader(f)
206 if err != nil {
207 log.Println(err)
208 return nil
209 }
210 defer gz.Close()
211
212 // Todo: Read the whole file!
213 return ParseMatrix(gz)
214}
215
216// LoadMatrixFile reads a matrix represented tokenizer
217// from an io.Reader
218func ParseMatrix(ior io.Reader) *MatrixTokenizer {
219
220 // Initialize tokenizer with default values
221 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +0200222 sigma: make(map[rune]int),
223 epsilon: 0,
224 unknown: 0,
225 identity: 0,
Akron16c312e2021-09-26 13:11:12 +0200226 stateCount: 0,
Akron16c312e2021-09-26 13:11:12 +0200227 }
228
229 r := bufio.NewReader(ior)
230
231 buf := make([]byte, 1024)
232 buf = buf[0:len(MAMAGIC)]
233
234 _, err := r.Read(buf)
235
236 if err != nil {
237 log.Println(err)
238 return nil
239 }
240
241 if string(MAMAGIC) != string(buf) {
242 log.Println("Not a matok file")
243 return nil
244 }
245
Akron28031b72021-10-02 13:07:25 +0200246 more, err := io.ReadFull(r, buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200247 if err != nil {
248 log.Println(err)
249 return nil
250 }
251
Akron28031b72021-10-02 13:07:25 +0200252 if more != 14 {
Akron16c312e2021-09-26 13:11:12 +0200253 log.Println("Read bytes do not fit")
254 return nil
255 }
256
257 version := bo.Uint16(buf[0:2])
258
259 if version != VERSION {
260 log.Println("Version not compatible")
261 return nil
262 }
263
264 mat.epsilon = int(bo.Uint16(buf[2:4]))
265 mat.unknown = int(bo.Uint16(buf[4:6]))
266 mat.identity = int(bo.Uint16(buf[6:8]))
Akron28031b72021-10-02 13:07:25 +0200267 mat.stateCount = int(bo.Uint32(buf[8:12]))
268 sigmaCount := int(bo.Uint16(buf[12:14]))
269 arraySize := (mat.stateCount + 1) * sigmaCount
Akron16c312e2021-09-26 13:11:12 +0200270
271 // Shouldn't be relevant though
272 // mat.maxSize = arraySize - 1
273
274 for x := 0; x < sigmaCount; x++ {
275 sym, _, err := r.ReadRune()
276 if err == nil && sym != 0 {
277 if int(sym) < 256 {
278 mat.sigmaASCII[int(sym)] = x
279 }
280 mat.sigma[sym] = x
281 }
282 }
283
284 _, err = io.ReadFull(r, buf[0:1])
285
286 if err != nil {
287 log.Print(err)
288 return nil
289 }
290
291 if string("M") != string(buf[0:1]) {
292 log.Println("Not a matok file")
293 return nil
294 }
295
296 // Read based on length
297 mat.array = make([]uint32, arraySize)
298
299 dataArray, err := io.ReadAll(r)
300
301 if err == io.EOF {
302 log.Println(err)
303 return nil
304 }
305
306 if len(dataArray) < arraySize*4 {
Akron28031b72021-10-02 13:07:25 +0200307 log.Println("Not enough bytes read", len(dataArray), arraySize*4)
Akron16c312e2021-09-26 13:11:12 +0200308 return nil
309 }
310
311 for x := 0; x < arraySize; x++ {
Akron16c312e2021-09-26 13:11:12 +0200312 mat.array[x] = bo.Uint32(dataArray[x*4 : (x*4)+4])
313 }
314
315 return mat
316}
317
Akron1c34ce62021-09-23 23:27:39 +0200318func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
319 var a int
Akron16c312e2021-09-26 13:11:12 +0200320 var t0 uint32
321 t := uint32(1) // Initial state
Akron1c34ce62021-09-23 23:27:39 +0200322 var ok, rewindBuffer bool
323
324 // Remember the last position of a possible tokenend,
325 // in case the automaton fails.
Akron16c312e2021-09-26 13:11:12 +0200326 epsilonState := uint32(0)
Akron1c34ce62021-09-23 23:27:39 +0200327 epsilonOffset := 0
328
Akron5c82a922021-09-24 19:11:29 +0200329 // Remember if the last transition was epsilon
330 sentenceEnd := false
331
Akron1c34ce62021-09-23 23:27:39 +0200332 buffer := make([]rune, 1024)
333 buffo := 0 // Buffer offset
334 buffi := 0 // Buffer length
335
336 reader := bufio.NewReader(r)
337 writer := bufio.NewWriter(w)
338 defer writer.Flush()
339
340 var char rune
341
342 var err error
343 eof := false
344 newchar := true
345
346PARSECHARM:
347 for {
348
349 if newchar {
350 // Get from reader if buffer is empty
351 if buffo >= buffi {
352 if eof {
353 break
354 }
355 char, _, err = reader.ReadRune()
356
357 // No more runes to read
358 if err != nil {
359 eof = true
360 break
361 }
362 buffer[buffi] = char
363 buffi++
364 }
365
366 char = buffer[buffo]
367
368 if DEBUG {
369 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
370 }
371
372 // TODO:
373 // Better not repeatedly check for a!
374 // Possibly keep a buffer with a.
375 if int(char) < 256 {
376 a = mat.sigmaASCII[int(char)]
377 } else {
378 a, ok = mat.sigma[char]
379 if !ok {
380 a = 0
381 }
382 }
383
384 // Use identity symbol if character is not in sigma
385 if a == 0 && mat.identity != -1 {
386 a = mat.identity
387 }
388
389 t0 = t
390
391 // Check for epsilon transitions and remember
392
Akron16c312e2021-09-26 13:11:12 +0200393 // TODO: Can t0 be negative here?
394 if mat.array[(mat.epsilon-1)*mat.stateCount+int(t0)] != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200395 // Remember state for backtracking to last tokenend state
Akron16c312e2021-09-26 13:11:12 +0200396
397 // Maybe not necessary - and should be simpler!
398 // Just Remove
399 t0 &= ^FIRSTBIT
400 /*
401 if (t0 & FIRSTBIT) != 0 {
402 t0 ^= FIRSTBIT
403 }
404 */
Akron1c34ce62021-09-23 23:27:39 +0200405 epsilonState = t0
406 epsilonOffset = buffo
Akron16c312e2021-09-26 13:11:12 +0200407
408 if DEBUG {
409 fmt.Println("epsilonOffset is set to", buffo)
410 }
Akron1c34ce62021-09-23 23:27:39 +0200411 }
412 }
413
414 // Checks a transition based on t0, a and buffo
415 t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
416 // t = mat.array[t0].getBase() + uint32(a)
417 // ta := dat.array[t]
418
419 if DEBUG {
420 // Char is only relevant if set
421 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
422 /*
423 if false {
424 fmt.Println(dat.outgoing(t0))
425 }
426 */
427 }
428
429 // Check if the transition is invalid according to the double array
430 // if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
431 if t == 0 {
432
433 if DEBUG {
434 fmt.Println("Match is not fine!")
435 }
436
437 if !ok && a == mat.identity {
438
439 // Try again with unknown symbol, in case identity failed
440 // Char is only relevant when set
441 if DEBUG {
442 fmt.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
443 }
444 a = mat.unknown
445
446 } else if a != mat.epsilon {
447
448 // Try again with epsilon symbol, in case everything else failed
449 t0 = epsilonState
450 epsilonState = 0 // reset
451 buffo = epsilonOffset
452 a = mat.epsilon
453
454 if DEBUG {
455 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
456 }
457
458 } else {
459 break
460 }
461
462 newchar = false
463 continue
464 }
465
466 // Transition was successful
467 rewindBuffer = false
468
469 // Transition consumes a character
470 if a != mat.epsilon {
471
472 buffo++
473
474 // Transition does not produce a character
475 // if buffo == 1 && ta.isNonToken() {
Akron16c312e2021-09-26 13:11:12 +0200476 if buffo == 1 && (t&FIRSTBIT) != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200477 if DEBUG {
478 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
479 }
480 rewindBuffer = true
481 }
482
483 } else {
484 // Transition marks the end of a token - so flush the buffer
485
486 if buffi > 0 {
487 if DEBUG {
488 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
489 }
490 writer.WriteString(string(buffer[:buffo]))
491 rewindBuffer = true
Akron5c82a922021-09-24 19:11:29 +0200492 sentenceEnd = false
493 } else {
494 sentenceEnd = true
Akron1c34ce62021-09-23 23:27:39 +0200495 }
496 if DEBUG {
497 fmt.Println("-> Newline")
498 }
499 writer.WriteRune('\n')
500 }
501
502 // Rewind the buffer if necessary
503 if rewindBuffer {
504
Akron16c312e2021-09-26 13:11:12 +0200505 if DEBUG {
506 fmt.Println("-> Rewind buffer", buffo, buffi, epsilonOffset)
507 }
508
Akron1c34ce62021-09-23 23:27:39 +0200509 // TODO: Better as a ring buffer
510 for x, i := range buffer[buffo:buffi] {
511 buffer[x] = i
512 }
513
514 buffi -= buffo
Akron16c312e2021-09-26 13:11:12 +0200515 // epsilonOffset -= buffo
516 epsilonOffset = 0
517 epsilonState = 0
518
Akron1c34ce62021-09-23 23:27:39 +0200519 buffo = 0
520 if DEBUG {
521 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
522 }
523 }
524
525 // Move to representative state
526 /*
527 if ta.isSeparate() {
528 t = ta.getBase()
529 ta = dat.array[t]
530
531 if DEBUG {
532 fmt.Println("Representative pointing to", t)
533 }
534 }
535 */
536
537 // Ignore nontoken mark
Akron16c312e2021-09-26 13:11:12 +0200538 /*
539 if t < 0 {
540 t *= -1
541 }
542 */
543 t &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200544
545 newchar = true
546
547 // TODO:
548 // Prevent endless epsilon loops!
549 }
550
551 // Input reader is not yet finished
552 if !eof {
553 if DEBUG {
554 fmt.Println("Not at the end")
555 }
556 return false
557 }
558
559 if DEBUG {
560 fmt.Println("Entering final check")
561 }
562 /*
563 // Automaton is in a final state, so flush the buffer and return
564 x := dat.array[t].getBase() + uint32(dat.final)
565
566 if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
567
568 if buffi > 0 {
569 if DEBUG {
570 fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
571 }
572 writer.WriteString(string(buffer[:buffi]))
573
574 if dat.array[t].isTokenEnd() {
575 writer.WriteRune('\n')
576 if DEBUG {
577 fmt.Println("-> Newline")
578 }
579 }
580 }
581
582 // Add an additional sentence ending, if the file is over but no explicit
583 // sentence split was reached. This may be controversial and therefore
584 // optional via parameter.
585 if !dat.array[t0].isTokenEnd() {
586 writer.WriteRune('\n')
587 if DEBUG {
588 fmt.Println("-> Newline")
589 }
590 }
591
592 // TODO:
593 // There may be a new line at the end, from an epsilon,
594 // so we may need to go on!
595 return true
596 }
597 */
598
599 // Check epsilon transitions until a final state is reached
600 t0 = t
601 // t = dat.array[t0].getBase() + uint32(dat.epsilon)
602 t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
603 a = mat.epsilon
604 newchar = false
605 // if dat.array[t].getCheck() == t0 {
606 // t can't be < 0
Akron16c312e2021-09-26 13:11:12 +0200607 if t != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200608 // Remember state for backtracking to last tokenend state
609 goto PARSECHARM
610
611 } else if epsilonState != 0 {
612 t0 = epsilonState
613 epsilonState = 0 // reset
614 buffo = epsilonOffset
615 if DEBUG {
616 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
617 }
618 goto PARSECHARM
619 }
Akron1c34ce62021-09-23 23:27:39 +0200620
Akron5c82a922021-09-24 19:11:29 +0200621 // Add an additional sentence ending, if the file is over but no explicit
622 // sentence split was reached. This may be controversial and therefore
623 // optional via parameter.
624 if !sentenceEnd {
625 writer.WriteRune('\n')
626 if DEBUG {
627 fmt.Println("-> Newline")
628 }
629 }
630
631 return true
Akron1c34ce62021-09-23 23:27:39 +0200632}