blob: 4b439d0f27bfcbeb61a04fe60d03b79216c127b7 [file] [log] [blame]
Akron1c34ce62021-09-23 23:27:39 +02001package datok
2
3import (
4 "bufio"
Akron16c312e2021-09-26 13:11:12 +02005 "compress/gzip"
Akron1c34ce62021-09-23 23:27:39 +02006 "fmt"
7 "io"
Akron16c312e2021-09-26 13:11:12 +02008 "log"
9 "os"
10)
11
12const (
13 MAMAGIC = "MATOK"
Akron1c34ce62021-09-23 23:27:39 +020014)
15
16type MatrixTokenizer struct {
17 sigma map[rune]int
18 sigmaASCII [256]int
Akron16c312e2021-09-26 13:11:12 +020019 array []uint32
Akron1c34ce62021-09-23 23:27:39 +020020 stateCount int
21
22 // Special symbols in sigma
23 epsilon int
24 unknown int
25 identity int
Akron16c312e2021-09-26 13:11:12 +020026 // final int
27 // tokenend int
Akron1c34ce62021-09-23 23:27:39 +020028}
29
30// ToMatrix turns the intermediate tokenizer into a
31// matrix representation.
32func (auto *Automaton) ToMatrix() *MatrixTokenizer {
33
34 mat := &MatrixTokenizer{
Akron16c312e2021-09-26 13:11:12 +020035 sigma: make(map[rune]int),
36 // final: auto.final,
37 unknown: auto.unknown,
38 identity: auto.identity,
39 epsilon: auto.epsilon,
40 // tokenend: auto.tokenend,
Akron1c34ce62021-09-23 23:27:39 +020041 stateCount: auto.stateCount,
42 }
43
Akron16c312e2021-09-26 13:11:12 +020044 mat.array = make([]uint32, (auto.stateCount+1)*(auto.sigmaCount))
Akron1c34ce62021-09-23 23:27:39 +020045
46 for num, sym := range auto.sigmaRev {
47 if int(sym) < 256 {
48 mat.sigmaASCII[int(sym)] = num
49 }
50 mat.sigma[sym] = num
51 if num > auto.sigmaCount {
52 panic("sigmaCount is smaller")
53 }
54 }
55 remember := make([]bool, auto.stateCount+2)
56
57 // Store all transitions in matrix
Akron16c312e2021-09-26 13:11:12 +020058 var toMatrix func([]uint32, int)
Akron1c34ce62021-09-23 23:27:39 +020059
Akron16c312e2021-09-26 13:11:12 +020060 toMatrix = func(matrix []uint32, start int) {
Akron1c34ce62021-09-23 23:27:39 +020061 if start > auto.stateCount {
62 panic("stateCount is smaller")
63 }
64 if remember[start] {
65 return
66 }
67 remember[start] = true
68 for alpha, t := range auto.transitions[start] {
Akron16c312e2021-09-26 13:11:12 +020069 matrix[(alpha-1)*auto.stateCount+start] = uint32(t.end)
Akron1c34ce62021-09-23 23:27:39 +020070
71 // Mark nontoken transitions
72 if t.nontoken {
Akron16c312e2021-09-26 13:11:12 +020073 matrix[(alpha-1)*auto.stateCount+start] |= FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +020074 }
75
76 toMatrix(matrix, t.end)
77 }
78 }
79
80 toMatrix(mat.array, 1)
81
82 return mat
83}
84
Akron941f2152021-09-26 15:14:25 +020085// Type of tokenizer
86func (MatrixTokenizer) Type() string {
87 return MAMAGIC
88}
89
Akron16c312e2021-09-26 13:11:12 +020090// Save stores the matrix data in a file
91func (mat *MatrixTokenizer) Save(file string) (n int64, err error) {
92 f, err := os.Create(file)
93 if err != nil {
94 log.Println(err)
95 return 0, err
96 }
97 defer f.Close()
98 gz := gzip.NewWriter(f)
99 defer gz.Close()
100 n, err = mat.WriteTo(gz)
101 if err != nil {
102 log.Println(err)
103 return n, err
104 }
105 gz.Flush()
106 return n, nil
107}
108
109// WriteTo stores the matrix data in an io.Writer.
110func (mat *MatrixTokenizer) WriteTo(w io.Writer) (n int64, err error) {
111
112 wb := bufio.NewWriter(w)
113 defer wb.Flush()
114
115 // Store magical header
116 all, err := wb.Write([]byte(MAMAGIC))
117 if err != nil {
118 log.Println(err)
119 return int64(all), err
120 }
121
122 // Get sigma as a list
123 sigmalist := make([]rune, len(mat.sigma)+12)
124 max := 0
125 for sym, num := range mat.sigma {
126 sigmalist[num] = sym
127 if num > max {
128 max = num
129 }
130 }
131
132 sigmalist = sigmalist[:max+1]
133
134 buf := make([]byte, 0, 12)
135 bo.PutUint16(buf[0:2], VERSION)
136 bo.PutUint16(buf[2:4], uint16(mat.epsilon))
137 bo.PutUint16(buf[4:6], uint16(mat.unknown))
138 bo.PutUint16(buf[6:8], uint16(mat.identity))
139 bo.PutUint16(buf[8:10], uint16(mat.stateCount))
140 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
141 // bo.PutUint32(buf[12:16], uint32(len(mat.array)*2)) // Legacy support
142 more, err := wb.Write(buf[0:12])
143 if err != nil {
144 log.Println(err)
145 return int64(all), err
146 }
147
148 all += more
149
150 // Write sigma
151 for _, sym := range sigmalist {
152
153 more, err = wb.WriteRune(sym)
154 if err != nil {
155 log.Println(err)
156 return int64(all), err
157 }
158 all += more
159 }
160
161 if err != nil {
162 log.Println(err)
163 return int64(all), err
164 }
165
166 // Test marker - could be checksum
167 more, err = wb.Write([]byte("M"))
168 if err != nil {
169 log.Println(err)
170 return int64(all), err
171 }
172 all += more
173
174 // for x := 0; x < len(dat.array); x++ {
175 for _, x := range mat.array {
176 bo.PutUint32(buf[0:4], uint32(x))
177 more, err = wb.Write(buf[0:4])
178 if err != nil {
179 log.Println(err)
180 return int64(all), err
181 }
182 all += more
183 if more != 4 {
184 log.Println("Can not write base uint32")
185 return int64(all), err
186 }
187 /*
188 bo.PutUint32(buf[0:4], bc.check)
189 more, err = wb.Write(buf[0:4])
190 if err != nil {
191 log.Println(err)
192 return int64(all), err
193 }
194 all += more
195 if more != 4 {
196 log.Println("Can not write check uint32")
197 return int64(all), err
198 }
199 */
200 }
201
202 return int64(all), err
203}
204
205// LoadDatokFile reads a double array represented tokenizer
206// from a file.
207func LoadMatrixFile(file string) *MatrixTokenizer {
208 f, err := os.Open(file)
209 if err != nil {
210 log.Println(err)
211 return nil
212 }
213 defer f.Close()
214
215 gz, err := gzip.NewReader(f)
216 if err != nil {
217 log.Println(err)
218 return nil
219 }
220 defer gz.Close()
221
222 // Todo: Read the whole file!
223 return ParseMatrix(gz)
224}
225
226// LoadMatrixFile reads a matrix represented tokenizer
227// from an io.Reader
228func ParseMatrix(ior io.Reader) *MatrixTokenizer {
229
230 // Initialize tokenizer with default values
231 mat := &MatrixTokenizer{
232 sigma: make(map[rune]int),
233 epsilon: 0,
234 unknown: 0,
235 identity: 0,
236 // final: 0,
237 stateCount: 0,
238 // transCount: 0,
239 }
240
241 r := bufio.NewReader(ior)
242
243 buf := make([]byte, 1024)
244 buf = buf[0:len(MAMAGIC)]
245
246 _, err := r.Read(buf)
247
248 if err != nil {
249 log.Println(err)
250 return nil
251 }
252
253 if string(MAMAGIC) != string(buf) {
254 log.Println("Not a matok file")
255 return nil
256 }
257
258 more, err := io.ReadFull(r, buf[0:12])
259 if err != nil {
260 log.Println(err)
261 return nil
262 }
263
264 if more != 12 {
265 log.Println("Read bytes do not fit")
266 return nil
267 }
268
269 version := bo.Uint16(buf[0:2])
270
271 if version != VERSION {
272 log.Println("Version not compatible")
273 return nil
274 }
275
276 mat.epsilon = int(bo.Uint16(buf[2:4]))
277 mat.unknown = int(bo.Uint16(buf[4:6]))
278 mat.identity = int(bo.Uint16(buf[6:8]))
279 mat.stateCount = int(bo.Uint16(buf[8:10]))
280
281 sigmaCount := int(bo.Uint16(buf[10:12]))
282 arraySize := (mat.stateCount + 1) * (sigmaCount + 1)
283 // int(bo.Uint32(buf[12:16]))
284
285 // Shouldn't be relevant though
286 // mat.maxSize = arraySize - 1
287
288 for x := 0; x < sigmaCount; x++ {
289 sym, _, err := r.ReadRune()
290 if err == nil && sym != 0 {
291 if int(sym) < 256 {
292 mat.sigmaASCII[int(sym)] = x
293 }
294 mat.sigma[sym] = x
295 }
296 }
297
298 _, err = io.ReadFull(r, buf[0:1])
299
300 if err != nil {
301 log.Print(err)
302 return nil
303 }
304
305 if string("M") != string(buf[0:1]) {
306 log.Println("Not a matok file")
307 return nil
308 }
309
310 // Read based on length
311 mat.array = make([]uint32, arraySize)
312
313 dataArray, err := io.ReadAll(r)
314
315 if err == io.EOF {
316 log.Println(err)
317 return nil
318 }
319
320 if len(dataArray) < arraySize*4 {
321 log.Println("Not enough bytes read", len(dataArray), arraySize)
322 return nil
323 }
324
325 for x := 0; x < arraySize; x++ {
326 // mat.array[x] = bo.Uint32(dataArray[x*8 : (x*8)+4])
327 mat.array[x] = bo.Uint32(dataArray[x*4 : (x*4)+4])
328 }
329
330 return mat
331}
332
Akron1c34ce62021-09-23 23:27:39 +0200333func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
334 var a int
Akron16c312e2021-09-26 13:11:12 +0200335 var t0 uint32
336 t := uint32(1) // Initial state
Akron1c34ce62021-09-23 23:27:39 +0200337 var ok, rewindBuffer bool
338
339 // Remember the last position of a possible tokenend,
340 // in case the automaton fails.
Akron16c312e2021-09-26 13:11:12 +0200341 epsilonState := uint32(0)
Akron1c34ce62021-09-23 23:27:39 +0200342 epsilonOffset := 0
343
Akron5c82a922021-09-24 19:11:29 +0200344 // Remember if the last transition was epsilon
345 sentenceEnd := false
346
Akron1c34ce62021-09-23 23:27:39 +0200347 buffer := make([]rune, 1024)
348 buffo := 0 // Buffer offset
349 buffi := 0 // Buffer length
350
351 reader := bufio.NewReader(r)
352 writer := bufio.NewWriter(w)
353 defer writer.Flush()
354
355 var char rune
356
357 var err error
358 eof := false
359 newchar := true
360
361PARSECHARM:
362 for {
363
364 if newchar {
365 // Get from reader if buffer is empty
366 if buffo >= buffi {
367 if eof {
368 break
369 }
370 char, _, err = reader.ReadRune()
371
372 // No more runes to read
373 if err != nil {
374 eof = true
375 break
376 }
377 buffer[buffi] = char
378 buffi++
379 }
380
381 char = buffer[buffo]
382
383 if DEBUG {
384 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
385 }
386
387 // TODO:
388 // Better not repeatedly check for a!
389 // Possibly keep a buffer with a.
390 if int(char) < 256 {
391 a = mat.sigmaASCII[int(char)]
392 } else {
393 a, ok = mat.sigma[char]
394 if !ok {
395 a = 0
396 }
397 }
398
399 // Use identity symbol if character is not in sigma
400 if a == 0 && mat.identity != -1 {
401 a = mat.identity
402 }
403
404 t0 = t
405
406 // Check for epsilon transitions and remember
407
Akron16c312e2021-09-26 13:11:12 +0200408 // TODO: Can t0 be negative here?
409 if mat.array[(mat.epsilon-1)*mat.stateCount+int(t0)] != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200410 // Remember state for backtracking to last tokenend state
Akron16c312e2021-09-26 13:11:12 +0200411
412 // Maybe not necessary - and should be simpler!
413 // Just Remove
414 t0 &= ^FIRSTBIT
415 /*
416 if (t0 & FIRSTBIT) != 0 {
417 t0 ^= FIRSTBIT
418 }
419 */
Akron1c34ce62021-09-23 23:27:39 +0200420 epsilonState = t0
421 epsilonOffset = buffo
Akron16c312e2021-09-26 13:11:12 +0200422
423 if DEBUG {
424 fmt.Println("epsilonOffset is set to", buffo)
425 }
Akron1c34ce62021-09-23 23:27:39 +0200426 }
427 }
428
429 // Checks a transition based on t0, a and buffo
430 t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
431 // t = mat.array[t0].getBase() + uint32(a)
432 // ta := dat.array[t]
433
434 if DEBUG {
435 // Char is only relevant if set
436 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
437 /*
438 if false {
439 fmt.Println(dat.outgoing(t0))
440 }
441 */
442 }
443
444 // Check if the transition is invalid according to the double array
445 // if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
446 if t == 0 {
447
448 if DEBUG {
449 fmt.Println("Match is not fine!")
450 }
451
452 if !ok && a == mat.identity {
453
454 // Try again with unknown symbol, in case identity failed
455 // Char is only relevant when set
456 if DEBUG {
457 fmt.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
458 }
459 a = mat.unknown
460
461 } else if a != mat.epsilon {
462
463 // Try again with epsilon symbol, in case everything else failed
464 t0 = epsilonState
465 epsilonState = 0 // reset
466 buffo = epsilonOffset
467 a = mat.epsilon
468
469 if DEBUG {
470 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
471 }
472
473 } else {
474 break
475 }
476
477 newchar = false
478 continue
479 }
480
481 // Transition was successful
482 rewindBuffer = false
483
484 // Transition consumes a character
485 if a != mat.epsilon {
486
487 buffo++
488
489 // Transition does not produce a character
490 // if buffo == 1 && ta.isNonToken() {
Akron16c312e2021-09-26 13:11:12 +0200491 if buffo == 1 && (t&FIRSTBIT) != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200492 if DEBUG {
493 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
494 }
495 rewindBuffer = true
496 }
497
498 } else {
499 // Transition marks the end of a token - so flush the buffer
500
501 if buffi > 0 {
502 if DEBUG {
503 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
504 }
505 writer.WriteString(string(buffer[:buffo]))
506 rewindBuffer = true
Akron5c82a922021-09-24 19:11:29 +0200507 sentenceEnd = false
508 } else {
509 sentenceEnd = true
Akron1c34ce62021-09-23 23:27:39 +0200510 }
511 if DEBUG {
512 fmt.Println("-> Newline")
513 }
514 writer.WriteRune('\n')
515 }
516
517 // Rewind the buffer if necessary
518 if rewindBuffer {
519
Akron16c312e2021-09-26 13:11:12 +0200520 if DEBUG {
521 fmt.Println("-> Rewind buffer", buffo, buffi, epsilonOffset)
522 }
523
Akron1c34ce62021-09-23 23:27:39 +0200524 // TODO: Better as a ring buffer
525 for x, i := range buffer[buffo:buffi] {
526 buffer[x] = i
527 }
528
529 buffi -= buffo
Akron16c312e2021-09-26 13:11:12 +0200530 // epsilonOffset -= buffo
531 epsilonOffset = 0
532 epsilonState = 0
533
Akron1c34ce62021-09-23 23:27:39 +0200534 buffo = 0
535 if DEBUG {
536 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
537 }
538 }
539
540 // Move to representative state
541 /*
542 if ta.isSeparate() {
543 t = ta.getBase()
544 ta = dat.array[t]
545
546 if DEBUG {
547 fmt.Println("Representative pointing to", t)
548 }
549 }
550 */
551
552 // Ignore nontoken mark
Akron16c312e2021-09-26 13:11:12 +0200553 /*
554 if t < 0 {
555 t *= -1
556 }
557 */
558 t &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200559
560 newchar = true
561
562 // TODO:
563 // Prevent endless epsilon loops!
564 }
565
566 // Input reader is not yet finished
567 if !eof {
568 if DEBUG {
569 fmt.Println("Not at the end")
570 }
571 return false
572 }
573
574 if DEBUG {
575 fmt.Println("Entering final check")
576 }
577 /*
578 // Automaton is in a final state, so flush the buffer and return
579 x := dat.array[t].getBase() + uint32(dat.final)
580
581 if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
582
583 if buffi > 0 {
584 if DEBUG {
585 fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
586 }
587 writer.WriteString(string(buffer[:buffi]))
588
589 if dat.array[t].isTokenEnd() {
590 writer.WriteRune('\n')
591 if DEBUG {
592 fmt.Println("-> Newline")
593 }
594 }
595 }
596
597 // Add an additional sentence ending, if the file is over but no explicit
598 // sentence split was reached. This may be controversial and therefore
599 // optional via parameter.
600 if !dat.array[t0].isTokenEnd() {
601 writer.WriteRune('\n')
602 if DEBUG {
603 fmt.Println("-> Newline")
604 }
605 }
606
607 // TODO:
608 // There may be a new line at the end, from an epsilon,
609 // so we may need to go on!
610 return true
611 }
612 */
613
614 // Check epsilon transitions until a final state is reached
615 t0 = t
616 // t = dat.array[t0].getBase() + uint32(dat.epsilon)
617 t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
618 a = mat.epsilon
619 newchar = false
620 // if dat.array[t].getCheck() == t0 {
621 // t can't be < 0
Akron16c312e2021-09-26 13:11:12 +0200622 if t != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200623 // Remember state for backtracking to last tokenend state
624 goto PARSECHARM
625
626 } else if epsilonState != 0 {
627 t0 = epsilonState
628 epsilonState = 0 // reset
629 buffo = epsilonOffset
630 if DEBUG {
631 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
632 }
633 goto PARSECHARM
634 }
Akron1c34ce62021-09-23 23:27:39 +0200635
Akron5c82a922021-09-24 19:11:29 +0200636 // Add an additional sentence ending, if the file is over but no explicit
637 // sentence split was reached. This may be controversial and therefore
638 // optional via parameter.
639 if !sentenceEnd {
640 writer.WriteRune('\n')
641 if DEBUG {
642 fmt.Println("-> Newline")
643 }
644 }
645
646 return true
Akron1c34ce62021-09-23 23:27:39 +0200647}