blob: 4ce82c9ddb4dcd258c6529fde27540e95edb8cde [file] [log] [blame]
Akron1c34ce62021-09-23 23:27:39 +02001package datok
2
3import (
4 "bufio"
Akron16c312e2021-09-26 13:11:12 +02005 "compress/gzip"
Akron1c34ce62021-09-23 23:27:39 +02006 "fmt"
7 "io"
Akron16c312e2021-09-26 13:11:12 +02008 "log"
9 "os"
10)
11
12const (
13 MAMAGIC = "MATOK"
Akron1c34ce62021-09-23 23:27:39 +020014)
15
16type MatrixTokenizer struct {
17 sigma map[rune]int
18 sigmaASCII [256]int
Akron16c312e2021-09-26 13:11:12 +020019 array []uint32
Akron1c34ce62021-09-23 23:27:39 +020020 stateCount int
21
22 // Special symbols in sigma
23 epsilon int
24 unknown int
25 identity int
Akron16c312e2021-09-26 13:11:12 +020026 // final int
27 // tokenend int
Akron1c34ce62021-09-23 23:27:39 +020028}
29
30// ToMatrix turns the intermediate tokenizer into a
31// matrix representation.
32func (auto *Automaton) ToMatrix() *MatrixTokenizer {
33
34 mat := &MatrixTokenizer{
Akron16c312e2021-09-26 13:11:12 +020035 sigma: make(map[rune]int),
36 // final: auto.final,
37 unknown: auto.unknown,
38 identity: auto.identity,
39 epsilon: auto.epsilon,
40 // tokenend: auto.tokenend,
Akron1c34ce62021-09-23 23:27:39 +020041 stateCount: auto.stateCount,
42 }
43
Akron16c312e2021-09-26 13:11:12 +020044 mat.array = make([]uint32, (auto.stateCount+1)*(auto.sigmaCount))
Akron1c34ce62021-09-23 23:27:39 +020045
46 for num, sym := range auto.sigmaRev {
47 if int(sym) < 256 {
48 mat.sigmaASCII[int(sym)] = num
49 }
50 mat.sigma[sym] = num
51 if num > auto.sigmaCount {
52 panic("sigmaCount is smaller")
53 }
54 }
55 remember := make([]bool, auto.stateCount+2)
56
57 // Store all transitions in matrix
Akron16c312e2021-09-26 13:11:12 +020058 var toMatrix func([]uint32, int)
Akron1c34ce62021-09-23 23:27:39 +020059
Akron16c312e2021-09-26 13:11:12 +020060 toMatrix = func(matrix []uint32, start int) {
Akron1c34ce62021-09-23 23:27:39 +020061 if start > auto.stateCount {
62 panic("stateCount is smaller")
63 }
64 if remember[start] {
65 return
66 }
67 remember[start] = true
68 for alpha, t := range auto.transitions[start] {
Akron16c312e2021-09-26 13:11:12 +020069 matrix[(alpha-1)*auto.stateCount+start] = uint32(t.end)
Akron1c34ce62021-09-23 23:27:39 +020070
71 // Mark nontoken transitions
72 if t.nontoken {
Akron16c312e2021-09-26 13:11:12 +020073 matrix[(alpha-1)*auto.stateCount+start] |= FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +020074 }
75
76 toMatrix(matrix, t.end)
77 }
78 }
79
80 toMatrix(mat.array, 1)
81
82 return mat
83}
84
Akron16c312e2021-09-26 13:11:12 +020085// Save stores the matrix data in a file
86func (mat *MatrixTokenizer) Save(file string) (n int64, err error) {
87 f, err := os.Create(file)
88 if err != nil {
89 log.Println(err)
90 return 0, err
91 }
92 defer f.Close()
93 gz := gzip.NewWriter(f)
94 defer gz.Close()
95 n, err = mat.WriteTo(gz)
96 if err != nil {
97 log.Println(err)
98 return n, err
99 }
100 gz.Flush()
101 return n, nil
102}
103
104// WriteTo stores the matrix data in an io.Writer.
105func (mat *MatrixTokenizer) WriteTo(w io.Writer) (n int64, err error) {
106
107 wb := bufio.NewWriter(w)
108 defer wb.Flush()
109
110 // Store magical header
111 all, err := wb.Write([]byte(MAMAGIC))
112 if err != nil {
113 log.Println(err)
114 return int64(all), err
115 }
116
117 // Get sigma as a list
118 sigmalist := make([]rune, len(mat.sigma)+12)
119 max := 0
120 for sym, num := range mat.sigma {
121 sigmalist[num] = sym
122 if num > max {
123 max = num
124 }
125 }
126
127 sigmalist = sigmalist[:max+1]
128
129 buf := make([]byte, 0, 12)
130 bo.PutUint16(buf[0:2], VERSION)
131 bo.PutUint16(buf[2:4], uint16(mat.epsilon))
132 bo.PutUint16(buf[4:6], uint16(mat.unknown))
133 bo.PutUint16(buf[6:8], uint16(mat.identity))
134 bo.PutUint16(buf[8:10], uint16(mat.stateCount))
135 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
136 // bo.PutUint32(buf[12:16], uint32(len(mat.array)*2)) // Legacy support
137 more, err := wb.Write(buf[0:12])
138 if err != nil {
139 log.Println(err)
140 return int64(all), err
141 }
142
143 all += more
144
145 // Write sigma
146 for _, sym := range sigmalist {
147
148 more, err = wb.WriteRune(sym)
149 if err != nil {
150 log.Println(err)
151 return int64(all), err
152 }
153 all += more
154 }
155
156 if err != nil {
157 log.Println(err)
158 return int64(all), err
159 }
160
161 // Test marker - could be checksum
162 more, err = wb.Write([]byte("M"))
163 if err != nil {
164 log.Println(err)
165 return int64(all), err
166 }
167 all += more
168
169 // for x := 0; x < len(dat.array); x++ {
170 for _, x := range mat.array {
171 bo.PutUint32(buf[0:4], uint32(x))
172 more, err = wb.Write(buf[0:4])
173 if err != nil {
174 log.Println(err)
175 return int64(all), err
176 }
177 all += more
178 if more != 4 {
179 log.Println("Can not write base uint32")
180 return int64(all), err
181 }
182 /*
183 bo.PutUint32(buf[0:4], bc.check)
184 more, err = wb.Write(buf[0:4])
185 if err != nil {
186 log.Println(err)
187 return int64(all), err
188 }
189 all += more
190 if more != 4 {
191 log.Println("Can not write check uint32")
192 return int64(all), err
193 }
194 */
195 }
196
197 return int64(all), err
198}
199
200// LoadDatokFile reads a double array represented tokenizer
201// from a file.
202func LoadMatrixFile(file string) *MatrixTokenizer {
203 f, err := os.Open(file)
204 if err != nil {
205 log.Println(err)
206 return nil
207 }
208 defer f.Close()
209
210 gz, err := gzip.NewReader(f)
211 if err != nil {
212 log.Println(err)
213 return nil
214 }
215 defer gz.Close()
216
217 // Todo: Read the whole file!
218 return ParseMatrix(gz)
219}
220
221// LoadMatrixFile reads a matrix represented tokenizer
222// from an io.Reader
223func ParseMatrix(ior io.Reader) *MatrixTokenizer {
224
225 // Initialize tokenizer with default values
226 mat := &MatrixTokenizer{
227 sigma: make(map[rune]int),
228 epsilon: 0,
229 unknown: 0,
230 identity: 0,
231 // final: 0,
232 stateCount: 0,
233 // transCount: 0,
234 }
235
236 r := bufio.NewReader(ior)
237
238 buf := make([]byte, 1024)
239 buf = buf[0:len(MAMAGIC)]
240
241 _, err := r.Read(buf)
242
243 if err != nil {
244 log.Println(err)
245 return nil
246 }
247
248 if string(MAMAGIC) != string(buf) {
249 log.Println("Not a matok file")
250 return nil
251 }
252
253 more, err := io.ReadFull(r, buf[0:12])
254 if err != nil {
255 log.Println(err)
256 return nil
257 }
258
259 if more != 12 {
260 log.Println("Read bytes do not fit")
261 return nil
262 }
263
264 version := bo.Uint16(buf[0:2])
265
266 if version != VERSION {
267 log.Println("Version not compatible")
268 return nil
269 }
270
271 mat.epsilon = int(bo.Uint16(buf[2:4]))
272 mat.unknown = int(bo.Uint16(buf[4:6]))
273 mat.identity = int(bo.Uint16(buf[6:8]))
274 mat.stateCount = int(bo.Uint16(buf[8:10]))
275
276 sigmaCount := int(bo.Uint16(buf[10:12]))
277 arraySize := (mat.stateCount + 1) * (sigmaCount + 1)
278 // int(bo.Uint32(buf[12:16]))
279
280 // Shouldn't be relevant though
281 // mat.maxSize = arraySize - 1
282
283 for x := 0; x < sigmaCount; x++ {
284 sym, _, err := r.ReadRune()
285 if err == nil && sym != 0 {
286 if int(sym) < 256 {
287 mat.sigmaASCII[int(sym)] = x
288 }
289 mat.sigma[sym] = x
290 }
291 }
292
293 _, err = io.ReadFull(r, buf[0:1])
294
295 if err != nil {
296 log.Print(err)
297 return nil
298 }
299
300 if string("M") != string(buf[0:1]) {
301 log.Println("Not a matok file")
302 return nil
303 }
304
305 // Read based on length
306 mat.array = make([]uint32, arraySize)
307
308 dataArray, err := io.ReadAll(r)
309
310 if err == io.EOF {
311 log.Println(err)
312 return nil
313 }
314
315 if len(dataArray) < arraySize*4 {
316 log.Println("Not enough bytes read", len(dataArray), arraySize)
317 return nil
318 }
319
320 for x := 0; x < arraySize; x++ {
321 // mat.array[x] = bo.Uint32(dataArray[x*8 : (x*8)+4])
322 mat.array[x] = bo.Uint32(dataArray[x*4 : (x*4)+4])
323 }
324
325 return mat
326}
327
Akron1c34ce62021-09-23 23:27:39 +0200328func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
329 var a int
Akron16c312e2021-09-26 13:11:12 +0200330 var t0 uint32
331 t := uint32(1) // Initial state
Akron1c34ce62021-09-23 23:27:39 +0200332 var ok, rewindBuffer bool
333
334 // Remember the last position of a possible tokenend,
335 // in case the automaton fails.
Akron16c312e2021-09-26 13:11:12 +0200336 epsilonState := uint32(0)
Akron1c34ce62021-09-23 23:27:39 +0200337 epsilonOffset := 0
338
Akron5c82a922021-09-24 19:11:29 +0200339 // Remember if the last transition was epsilon
340 sentenceEnd := false
341
Akron1c34ce62021-09-23 23:27:39 +0200342 buffer := make([]rune, 1024)
343 buffo := 0 // Buffer offset
344 buffi := 0 // Buffer length
345
346 reader := bufio.NewReader(r)
347 writer := bufio.NewWriter(w)
348 defer writer.Flush()
349
350 var char rune
351
352 var err error
353 eof := false
354 newchar := true
355
356PARSECHARM:
357 for {
358
359 if newchar {
360 // Get from reader if buffer is empty
361 if buffo >= buffi {
362 if eof {
363 break
364 }
365 char, _, err = reader.ReadRune()
366
367 // No more runes to read
368 if err != nil {
369 eof = true
370 break
371 }
372 buffer[buffi] = char
373 buffi++
374 }
375
376 char = buffer[buffo]
377
378 if DEBUG {
379 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
380 }
381
382 // TODO:
383 // Better not repeatedly check for a!
384 // Possibly keep a buffer with a.
385 if int(char) < 256 {
386 a = mat.sigmaASCII[int(char)]
387 } else {
388 a, ok = mat.sigma[char]
389 if !ok {
390 a = 0
391 }
392 }
393
394 // Use identity symbol if character is not in sigma
395 if a == 0 && mat.identity != -1 {
396 a = mat.identity
397 }
398
399 t0 = t
400
401 // Check for epsilon transitions and remember
402
Akron16c312e2021-09-26 13:11:12 +0200403 // TODO: Can t0 be negative here?
404 if mat.array[(mat.epsilon-1)*mat.stateCount+int(t0)] != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200405 // Remember state for backtracking to last tokenend state
Akron16c312e2021-09-26 13:11:12 +0200406
407 // Maybe not necessary - and should be simpler!
408 // Just Remove
409 t0 &= ^FIRSTBIT
410 /*
411 if (t0 & FIRSTBIT) != 0 {
412 t0 ^= FIRSTBIT
413 }
414 */
Akron1c34ce62021-09-23 23:27:39 +0200415 epsilonState = t0
416 epsilonOffset = buffo
Akron16c312e2021-09-26 13:11:12 +0200417
418 if DEBUG {
419 fmt.Println("epsilonOffset is set to", buffo)
420 }
Akron1c34ce62021-09-23 23:27:39 +0200421 }
422 }
423
424 // Checks a transition based on t0, a and buffo
425 t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
426 // t = mat.array[t0].getBase() + uint32(a)
427 // ta := dat.array[t]
428
429 if DEBUG {
430 // Char is only relevant if set
431 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
432 /*
433 if false {
434 fmt.Println(dat.outgoing(t0))
435 }
436 */
437 }
438
439 // Check if the transition is invalid according to the double array
440 // if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
441 if t == 0 {
442
443 if DEBUG {
444 fmt.Println("Match is not fine!")
445 }
446
447 if !ok && a == mat.identity {
448
449 // Try again with unknown symbol, in case identity failed
450 // Char is only relevant when set
451 if DEBUG {
452 fmt.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
453 }
454 a = mat.unknown
455
456 } else if a != mat.epsilon {
457
458 // Try again with epsilon symbol, in case everything else failed
459 t0 = epsilonState
460 epsilonState = 0 // reset
461 buffo = epsilonOffset
462 a = mat.epsilon
463
464 if DEBUG {
465 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
466 }
467
468 } else {
469 break
470 }
471
472 newchar = false
473 continue
474 }
475
476 // Transition was successful
477 rewindBuffer = false
478
479 // Transition consumes a character
480 if a != mat.epsilon {
481
482 buffo++
483
484 // Transition does not produce a character
485 // if buffo == 1 && ta.isNonToken() {
Akron16c312e2021-09-26 13:11:12 +0200486 if buffo == 1 && (t&FIRSTBIT) != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200487 if DEBUG {
488 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
489 }
490 rewindBuffer = true
491 }
492
493 } else {
494 // Transition marks the end of a token - so flush the buffer
495
496 if buffi > 0 {
497 if DEBUG {
498 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
499 }
500 writer.WriteString(string(buffer[:buffo]))
501 rewindBuffer = true
Akron5c82a922021-09-24 19:11:29 +0200502 sentenceEnd = false
503 } else {
504 sentenceEnd = true
Akron1c34ce62021-09-23 23:27:39 +0200505 }
506 if DEBUG {
507 fmt.Println("-> Newline")
508 }
509 writer.WriteRune('\n')
510 }
511
512 // Rewind the buffer if necessary
513 if rewindBuffer {
514
Akron16c312e2021-09-26 13:11:12 +0200515 if DEBUG {
516 fmt.Println("-> Rewind buffer", buffo, buffi, epsilonOffset)
517 }
518
Akron1c34ce62021-09-23 23:27:39 +0200519 // TODO: Better as a ring buffer
520 for x, i := range buffer[buffo:buffi] {
521 buffer[x] = i
522 }
523
524 buffi -= buffo
Akron16c312e2021-09-26 13:11:12 +0200525 // epsilonOffset -= buffo
526 epsilonOffset = 0
527 epsilonState = 0
528
Akron1c34ce62021-09-23 23:27:39 +0200529 buffo = 0
530 if DEBUG {
531 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
532 }
533 }
534
535 // Move to representative state
536 /*
537 if ta.isSeparate() {
538 t = ta.getBase()
539 ta = dat.array[t]
540
541 if DEBUG {
542 fmt.Println("Representative pointing to", t)
543 }
544 }
545 */
546
547 // Ignore nontoken mark
Akron16c312e2021-09-26 13:11:12 +0200548 /*
549 if t < 0 {
550 t *= -1
551 }
552 */
553 t &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200554
555 newchar = true
556
557 // TODO:
558 // Prevent endless epsilon loops!
559 }
560
561 // Input reader is not yet finished
562 if !eof {
563 if DEBUG {
564 fmt.Println("Not at the end")
565 }
566 return false
567 }
568
569 if DEBUG {
570 fmt.Println("Entering final check")
571 }
572 /*
573 // Automaton is in a final state, so flush the buffer and return
574 x := dat.array[t].getBase() + uint32(dat.final)
575
576 if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
577
578 if buffi > 0 {
579 if DEBUG {
580 fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
581 }
582 writer.WriteString(string(buffer[:buffi]))
583
584 if dat.array[t].isTokenEnd() {
585 writer.WriteRune('\n')
586 if DEBUG {
587 fmt.Println("-> Newline")
588 }
589 }
590 }
591
592 // Add an additional sentence ending, if the file is over but no explicit
593 // sentence split was reached. This may be controversial and therefore
594 // optional via parameter.
595 if !dat.array[t0].isTokenEnd() {
596 writer.WriteRune('\n')
597 if DEBUG {
598 fmt.Println("-> Newline")
599 }
600 }
601
602 // TODO:
603 // There may be a new line at the end, from an epsilon,
604 // so we may need to go on!
605 return true
606 }
607 */
608
609 // Check epsilon transitions until a final state is reached
610 t0 = t
611 // t = dat.array[t0].getBase() + uint32(dat.epsilon)
612 t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
613 a = mat.epsilon
614 newchar = false
615 // if dat.array[t].getCheck() == t0 {
616 // t can't be < 0
Akron16c312e2021-09-26 13:11:12 +0200617 if t != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200618 // Remember state for backtracking to last tokenend state
619 goto PARSECHARM
620
621 } else if epsilonState != 0 {
622 t0 = epsilonState
623 epsilonState = 0 // reset
624 buffo = epsilonOffset
625 if DEBUG {
626 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
627 }
628 goto PARSECHARM
629 }
Akron1c34ce62021-09-23 23:27:39 +0200630
Akron5c82a922021-09-24 19:11:29 +0200631 // Add an additional sentence ending, if the file is over but no explicit
632 // sentence split was reached. This may be controversial and therefore
633 // optional via parameter.
634 if !sentenceEnd {
635 writer.WriteRune('\n')
636 if DEBUG {
637 fmt.Println("-> Newline")
638 }
639 }
640
641 return true
Akron1c34ce62021-09-23 23:27:39 +0200642}