blob: 8c68959d4bcf20cc85ebf3eaab1f038d6024b107 [file] [log] [blame]
Akron1c34ce62021-09-23 23:27:39 +02001package datok
2
3import (
4 "bufio"
Akron16c312e2021-09-26 13:11:12 +02005 "compress/gzip"
Akron1c34ce62021-09-23 23:27:39 +02006 "fmt"
7 "io"
Akron16c312e2021-09-26 13:11:12 +02008 "log"
9 "os"
10)
11
12const (
13 MAMAGIC = "MATOK"
Akrona854faa2021-10-22 19:31:08 +020014 EOT = 4
Akron1c34ce62021-09-23 23:27:39 +020015)
16
17type MatrixTokenizer struct {
18 sigma map[rune]int
19 sigmaASCII [256]int
Akron16c312e2021-09-26 13:11:12 +020020 array []uint32
Akron1c34ce62021-09-23 23:27:39 +020021 stateCount int
22
23 // Special symbols in sigma
24 epsilon int
25 unknown int
26 identity int
Akron1c34ce62021-09-23 23:27:39 +020027}
28
29// ToMatrix turns the intermediate tokenizer into a
30// matrix representation.
31func (auto *Automaton) ToMatrix() *MatrixTokenizer {
32
33 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +020034 sigma: make(map[rune]int),
35 unknown: auto.unknown,
36 identity: auto.identity,
37 epsilon: auto.epsilon,
Akron1c34ce62021-09-23 23:27:39 +020038 stateCount: auto.stateCount,
39 }
40
Akron28031b72021-10-02 13:07:25 +020041 max := 0
Akron1c34ce62021-09-23 23:27:39 +020042 for num, sym := range auto.sigmaRev {
43 if int(sym) < 256 {
44 mat.sigmaASCII[int(sym)] = num
45 }
46 mat.sigma[sym] = num
47 if num > auto.sigmaCount {
48 panic("sigmaCount is smaller")
49 }
Akron28031b72021-10-02 13:07:25 +020050 if num > max {
51 max = num
52 }
Akron1c34ce62021-09-23 23:27:39 +020053 }
Akron28031b72021-10-02 13:07:25 +020054 // Add final entry to the list (maybe not necessary actually)
55
Akron1c34ce62021-09-23 23:27:39 +020056 remember := make([]bool, auto.stateCount+2)
57
Akron28031b72021-10-02 13:07:25 +020058 // lower sigmaCount, as no final value exists
59 mat.array = make([]uint32, (auto.stateCount+1)*(max+1))
60
Akron1c34ce62021-09-23 23:27:39 +020061 // Store all transitions in matrix
Akron16c312e2021-09-26 13:11:12 +020062 var toMatrix func([]uint32, int)
Akron1c34ce62021-09-23 23:27:39 +020063
Akron16c312e2021-09-26 13:11:12 +020064 toMatrix = func(matrix []uint32, start int) {
Akron1c34ce62021-09-23 23:27:39 +020065 if start > auto.stateCount {
66 panic("stateCount is smaller")
67 }
68 if remember[start] {
69 return
70 }
71 remember[start] = true
72 for alpha, t := range auto.transitions[start] {
Akron16c312e2021-09-26 13:11:12 +020073 matrix[(alpha-1)*auto.stateCount+start] = uint32(t.end)
Akron1c34ce62021-09-23 23:27:39 +020074
75 // Mark nontoken transitions
76 if t.nontoken {
Akron16c312e2021-09-26 13:11:12 +020077 matrix[(alpha-1)*auto.stateCount+start] |= FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +020078 }
79
80 toMatrix(matrix, t.end)
81 }
82 }
83
84 toMatrix(mat.array, 1)
85
86 return mat
87}
88
Akron941f2152021-09-26 15:14:25 +020089// Type of tokenizer
90func (MatrixTokenizer) Type() string {
91 return MAMAGIC
92}
93
Akron16c312e2021-09-26 13:11:12 +020094// Save stores the matrix data in a file
95func (mat *MatrixTokenizer) Save(file string) (n int64, err error) {
96 f, err := os.Create(file)
97 if err != nil {
98 log.Println(err)
99 return 0, err
100 }
101 defer f.Close()
102 gz := gzip.NewWriter(f)
103 defer gz.Close()
104 n, err = mat.WriteTo(gz)
105 if err != nil {
106 log.Println(err)
107 return n, err
108 }
109 gz.Flush()
110 return n, nil
111}
112
113// WriteTo stores the matrix data in an io.Writer.
114func (mat *MatrixTokenizer) WriteTo(w io.Writer) (n int64, err error) {
115
116 wb := bufio.NewWriter(w)
117 defer wb.Flush()
118
119 // Store magical header
120 all, err := wb.Write([]byte(MAMAGIC))
121 if err != nil {
122 log.Println(err)
123 return int64(all), err
124 }
125
126 // Get sigma as a list
Akron28031b72021-10-02 13:07:25 +0200127 // In datok it's 16 - 4*4
128 sigmalist := make([]rune, len(mat.sigma)+16)
Akron16c312e2021-09-26 13:11:12 +0200129 max := 0
130 for sym, num := range mat.sigma {
131 sigmalist[num] = sym
132 if num > max {
133 max = num
134 }
135 }
136
Akron28031b72021-10-02 13:07:25 +0200137 // Add final entry to the list (maybe not necessary actually)
Akron16c312e2021-09-26 13:11:12 +0200138 sigmalist = sigmalist[:max+1]
139
Akron28031b72021-10-02 13:07:25 +0200140 buf := make([]byte, 0, 14)
Akron16c312e2021-09-26 13:11:12 +0200141 bo.PutUint16(buf[0:2], VERSION)
142 bo.PutUint16(buf[2:4], uint16(mat.epsilon))
143 bo.PutUint16(buf[4:6], uint16(mat.unknown))
144 bo.PutUint16(buf[6:8], uint16(mat.identity))
Akron28031b72021-10-02 13:07:25 +0200145 bo.PutUint32(buf[8:12], uint32(mat.stateCount))
146 bo.PutUint16(buf[12:14], uint16(len(sigmalist)))
147 more, err := wb.Write(buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200148 if err != nil {
149 log.Println(err)
150 return int64(all), err
151 }
152
153 all += more
154
155 // Write sigma
156 for _, sym := range sigmalist {
157
158 more, err = wb.WriteRune(sym)
159 if err != nil {
160 log.Println(err)
161 return int64(all), err
162 }
163 all += more
164 }
165
166 if err != nil {
167 log.Println(err)
168 return int64(all), err
169 }
170
171 // Test marker - could be checksum
172 more, err = wb.Write([]byte("M"))
173 if err != nil {
174 log.Println(err)
175 return int64(all), err
176 }
177 all += more
178
Akron16c312e2021-09-26 13:11:12 +0200179 for _, x := range mat.array {
180 bo.PutUint32(buf[0:4], uint32(x))
181 more, err = wb.Write(buf[0:4])
182 if err != nil {
183 log.Println(err)
184 return int64(all), err
185 }
186 all += more
187 if more != 4 {
188 log.Println("Can not write base uint32")
189 return int64(all), err
190 }
Akron16c312e2021-09-26 13:11:12 +0200191 }
192
193 return int64(all), err
194}
195
196// LoadDatokFile reads a double array represented tokenizer
197// from a file.
198func LoadMatrixFile(file string) *MatrixTokenizer {
199 f, err := os.Open(file)
200 if err != nil {
201 log.Println(err)
202 return nil
203 }
204 defer f.Close()
205
206 gz, err := gzip.NewReader(f)
207 if err != nil {
208 log.Println(err)
209 return nil
210 }
211 defer gz.Close()
212
213 // Todo: Read the whole file!
214 return ParseMatrix(gz)
215}
216
217// LoadMatrixFile reads a matrix represented tokenizer
218// from an io.Reader
219func ParseMatrix(ior io.Reader) *MatrixTokenizer {
220
221 // Initialize tokenizer with default values
222 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +0200223 sigma: make(map[rune]int),
224 epsilon: 0,
225 unknown: 0,
226 identity: 0,
Akron16c312e2021-09-26 13:11:12 +0200227 stateCount: 0,
Akron16c312e2021-09-26 13:11:12 +0200228 }
229
230 r := bufio.NewReader(ior)
231
232 buf := make([]byte, 1024)
233 buf = buf[0:len(MAMAGIC)]
234
235 _, err := r.Read(buf)
236
237 if err != nil {
238 log.Println(err)
239 return nil
240 }
241
242 if string(MAMAGIC) != string(buf) {
243 log.Println("Not a matok file")
244 return nil
245 }
246
Akron28031b72021-10-02 13:07:25 +0200247 more, err := io.ReadFull(r, buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200248 if err != nil {
249 log.Println(err)
250 return nil
251 }
252
Akron28031b72021-10-02 13:07:25 +0200253 if more != 14 {
Akron16c312e2021-09-26 13:11:12 +0200254 log.Println("Read bytes do not fit")
255 return nil
256 }
257
258 version := bo.Uint16(buf[0:2])
259
260 if version != VERSION {
261 log.Println("Version not compatible")
262 return nil
263 }
264
265 mat.epsilon = int(bo.Uint16(buf[2:4]))
266 mat.unknown = int(bo.Uint16(buf[4:6]))
267 mat.identity = int(bo.Uint16(buf[6:8]))
Akron28031b72021-10-02 13:07:25 +0200268 mat.stateCount = int(bo.Uint32(buf[8:12]))
269 sigmaCount := int(bo.Uint16(buf[12:14]))
270 arraySize := (mat.stateCount + 1) * sigmaCount
Akron16c312e2021-09-26 13:11:12 +0200271
Akron16c312e2021-09-26 13:11:12 +0200272 for x := 0; x < sigmaCount; x++ {
273 sym, _, err := r.ReadRune()
274 if err == nil && sym != 0 {
275 if int(sym) < 256 {
276 mat.sigmaASCII[int(sym)] = x
277 }
278 mat.sigma[sym] = x
279 }
280 }
281
282 _, err = io.ReadFull(r, buf[0:1])
283
284 if err != nil {
285 log.Print(err)
286 return nil
287 }
288
289 if string("M") != string(buf[0:1]) {
290 log.Println("Not a matok file")
291 return nil
292 }
293
294 // Read based on length
295 mat.array = make([]uint32, arraySize)
296
297 dataArray, err := io.ReadAll(r)
298
299 if err == io.EOF {
300 log.Println(err)
301 return nil
302 }
303
304 if len(dataArray) < arraySize*4 {
Akron28031b72021-10-02 13:07:25 +0200305 log.Println("Not enough bytes read", len(dataArray), arraySize*4)
Akron16c312e2021-09-26 13:11:12 +0200306 return nil
307 }
308
309 for x := 0; x < arraySize; x++ {
Akron16c312e2021-09-26 13:11:12 +0200310 mat.array[x] = bo.Uint32(dataArray[x*4 : (x*4)+4])
311 }
312
313 return mat
314}
315
Akron1c34ce62021-09-23 23:27:39 +0200316func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akrone396a932021-10-19 01:06:13 +0200317 return mat.TransduceTokenWriter(r, NewTokenWriterSimple(w))
318}
319
320func (mat *MatrixTokenizer) TransduceTokenWriter(r io.Reader, w TokenWriterI) bool {
Akron1c34ce62021-09-23 23:27:39 +0200321 var a int
Akron16c312e2021-09-26 13:11:12 +0200322 var t0 uint32
323 t := uint32(1) // Initial state
Akron1c34ce62021-09-23 23:27:39 +0200324 var ok, rewindBuffer bool
325
326 // Remember the last position of a possible tokenend,
327 // in case the automaton fails.
Akron16c312e2021-09-26 13:11:12 +0200328 epsilonState := uint32(0)
Akron1c34ce62021-09-23 23:27:39 +0200329 epsilonOffset := 0
330
Akron5c82a922021-09-24 19:11:29 +0200331 // Remember if the last transition was epsilon
332 sentenceEnd := false
333
Akrona854faa2021-10-22 19:31:08 +0200334 // Remember if a text end was already set
335 textEnd := false
336
Akron1c34ce62021-09-23 23:27:39 +0200337 buffer := make([]rune, 1024)
338 buffo := 0 // Buffer offset
339 buffi := 0 // Buffer length
340
341 reader := bufio.NewReader(r)
Akrone396a932021-10-19 01:06:13 +0200342 defer w.Flush()
Akron1c34ce62021-09-23 23:27:39 +0200343
344 var char rune
345
346 var err error
347 eof := false
Akrona854faa2021-10-22 19:31:08 +0200348 eot := false
Akron1c34ce62021-09-23 23:27:39 +0200349 newchar := true
350
351PARSECHARM:
352 for {
353
354 if newchar {
355 // Get from reader if buffer is empty
356 if buffo >= buffi {
357 if eof {
358 break
359 }
360 char, _, err = reader.ReadRune()
361
362 // No more runes to read
363 if err != nil {
364 eof = true
365 break
366 }
367 buffer[buffi] = char
368 buffi++
369 }
370
371 char = buffer[buffo]
372
373 if DEBUG {
Akrona854faa2021-10-22 19:31:08 +0200374 fmt.Println("Current char", string(char), int(char), showBuffer(buffer, buffo, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200375 }
376
Akrona854faa2021-10-22 19:31:08 +0200377 eot = false
378
Akron1c34ce62021-09-23 23:27:39 +0200379 // TODO:
380 // Better not repeatedly check for a!
381 // Possibly keep a buffer with a.
382 if int(char) < 256 {
Akrona854faa2021-10-22 19:31:08 +0200383 if int(char) == EOT {
384 eot = true
385 }
Akron1c34ce62021-09-23 23:27:39 +0200386 a = mat.sigmaASCII[int(char)]
387 } else {
388 a, ok = mat.sigma[char]
389 if !ok {
390 a = 0
391 }
392 }
393
394 // Use identity symbol if character is not in sigma
395 if a == 0 && mat.identity != -1 {
396 a = mat.identity
397 }
398
399 t0 = t
400
401 // Check for epsilon transitions and remember
402
Akron16c312e2021-09-26 13:11:12 +0200403 // TODO: Can t0 be negative here?
404 if mat.array[(mat.epsilon-1)*mat.stateCount+int(t0)] != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200405 // Remember state for backtracking to last tokenend state
Akron16c312e2021-09-26 13:11:12 +0200406
407 // Maybe not necessary - and should be simpler!
408 // Just Remove
409 t0 &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200410 epsilonState = t0
411 epsilonOffset = buffo
Akron16c312e2021-09-26 13:11:12 +0200412
413 if DEBUG {
414 fmt.Println("epsilonOffset is set to", buffo)
415 }
Akron1c34ce62021-09-23 23:27:39 +0200416 }
417 }
418
419 // Checks a transition based on t0, a and buffo
420 t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
Akron1c34ce62021-09-23 23:27:39 +0200421
422 if DEBUG {
423 // Char is only relevant if set
424 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
Akron1c34ce62021-09-23 23:27:39 +0200425 }
426
Akrone396a932021-10-19 01:06:13 +0200427 // Check if the transition is invalid according to the matrix
Akron1c34ce62021-09-23 23:27:39 +0200428 if t == 0 {
429
430 if DEBUG {
431 fmt.Println("Match is not fine!")
432 }
433
434 if !ok && a == mat.identity {
435
436 // Try again with unknown symbol, in case identity failed
437 // Char is only relevant when set
438 if DEBUG {
439 fmt.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
440 }
441 a = mat.unknown
442
443 } else if a != mat.epsilon {
444
445 // Try again with epsilon symbol, in case everything else failed
446 t0 = epsilonState
447 epsilonState = 0 // reset
448 buffo = epsilonOffset
449 a = mat.epsilon
450
451 if DEBUG {
452 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
453 }
454
455 } else {
456 break
457 }
458
459 newchar = false
Akrona854faa2021-10-22 19:31:08 +0200460 eot = false
Akron1c34ce62021-09-23 23:27:39 +0200461 continue
462 }
463
464 // Transition was successful
465 rewindBuffer = false
466
467 // Transition consumes a character
468 if a != mat.epsilon {
469
470 buffo++
471
472 // Transition does not produce a character
Akron16c312e2021-09-26 13:11:12 +0200473 if buffo == 1 && (t&FIRSTBIT) != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200474 if DEBUG {
475 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
476 }
477 rewindBuffer = true
478 }
479
480 } else {
481 // Transition marks the end of a token - so flush the buffer
Akrone396a932021-10-19 01:06:13 +0200482 if buffo > 0 {
Akron1c34ce62021-09-23 23:27:39 +0200483 if DEBUG {
484 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
485 }
Akrone396a932021-10-19 01:06:13 +0200486 w.Token(0, buffer[:buffo])
Akron1c34ce62021-09-23 23:27:39 +0200487 rewindBuffer = true
Akron5c82a922021-09-24 19:11:29 +0200488 sentenceEnd = false
Akrona854faa2021-10-22 19:31:08 +0200489 textEnd = false
Akron5c82a922021-09-24 19:11:29 +0200490 } else {
491 sentenceEnd = true
Akrona854faa2021-10-22 19:31:08 +0200492 w.SentenceEnd(0)
Akron1c34ce62021-09-23 23:27:39 +0200493 }
494 if DEBUG {
495 fmt.Println("-> Newline")
496 }
Akrone396a932021-10-19 01:06:13 +0200497 // writer.WriteRune('\n')
Akron1c34ce62021-09-23 23:27:39 +0200498 }
499
500 // Rewind the buffer if necessary
501 if rewindBuffer {
502
Akron16c312e2021-09-26 13:11:12 +0200503 if DEBUG {
504 fmt.Println("-> Rewind buffer", buffo, buffi, epsilonOffset)
505 }
506
Akron1c34ce62021-09-23 23:27:39 +0200507 // TODO: Better as a ring buffer
508 for x, i := range buffer[buffo:buffi] {
509 buffer[x] = i
510 }
511
512 buffi -= buffo
Akron16c312e2021-09-26 13:11:12 +0200513 // epsilonOffset -= buffo
514 epsilonOffset = 0
515 epsilonState = 0
516
Akron1c34ce62021-09-23 23:27:39 +0200517 buffo = 0
518 if DEBUG {
519 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
520 }
Akrona854faa2021-10-22 19:31:08 +0200521
522 if eot {
523 eot = false
524 textEnd = true
525 w.TextEnd(0)
526 if DEBUG {
527 fmt.Println("END OF TEXT")
528 }
529 }
Akron1c34ce62021-09-23 23:27:39 +0200530 }
531
Akron16c312e2021-09-26 13:11:12 +0200532 t &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200533
534 newchar = true
535
536 // TODO:
537 // Prevent endless epsilon loops!
538 }
539
540 // Input reader is not yet finished
541 if !eof {
542 if DEBUG {
543 fmt.Println("Not at the end")
544 }
545 return false
546 }
547
548 if DEBUG {
549 fmt.Println("Entering final check")
550 }
Akron1c34ce62021-09-23 23:27:39 +0200551
Akrona854faa2021-10-22 19:31:08 +0200552 // Check epsilon transitions as long as possible
Akron1c34ce62021-09-23 23:27:39 +0200553 t0 = t
Akron1c34ce62021-09-23 23:27:39 +0200554 t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
555 a = mat.epsilon
556 newchar = false
Akron1c34ce62021-09-23 23:27:39 +0200557 // t can't be < 0
Akron16c312e2021-09-26 13:11:12 +0200558 if t != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200559 // Remember state for backtracking to last tokenend state
560 goto PARSECHARM
561
562 } else if epsilonState != 0 {
563 t0 = epsilonState
564 epsilonState = 0 // reset
565 buffo = epsilonOffset
566 if DEBUG {
567 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
568 }
569 goto PARSECHARM
570 }
Akron1c34ce62021-09-23 23:27:39 +0200571
Akron5c82a922021-09-24 19:11:29 +0200572 // Add an additional sentence ending, if the file is over but no explicit
573 // sentence split was reached. This may be controversial and therefore
574 // optional via parameter.
575 if !sentenceEnd {
Akrona854faa2021-10-22 19:31:08 +0200576 w.SentenceEnd(0)
Akron5c82a922021-09-24 19:11:29 +0200577 if DEBUG {
Akrona854faa2021-10-22 19:31:08 +0200578 fmt.Println("Sentence end")
579 }
580 }
581
582 if !textEnd {
583 w.TextEnd(0)
584
585 if DEBUG {
586 fmt.Println("Text end")
Akron5c82a922021-09-24 19:11:29 +0200587 }
588 }
589
590 return true
Akron1c34ce62021-09-23 23:27:39 +0200591}