blob: 9e130d5375e10660c79de39f5899e1d71b3858b4 [file] [log] [blame]
Akron1c34ce62021-09-23 23:27:39 +02001package datok
2
3import (
4 "bufio"
Akron16c312e2021-09-26 13:11:12 +02005 "compress/gzip"
Akron1c34ce62021-09-23 23:27:39 +02006 "fmt"
7 "io"
Akron16c312e2021-09-26 13:11:12 +02008 "log"
9 "os"
10)
11
12const (
13 MAMAGIC = "MATOK"
Akrona854faa2021-10-22 19:31:08 +020014 EOT = 4
Akron1c34ce62021-09-23 23:27:39 +020015)
16
17type MatrixTokenizer struct {
18 sigma map[rune]int
19 sigmaASCII [256]int
Akron16c312e2021-09-26 13:11:12 +020020 array []uint32
Akron1c34ce62021-09-23 23:27:39 +020021 stateCount int
22
23 // Special symbols in sigma
24 epsilon int
25 unknown int
26 identity int
Akron1c34ce62021-09-23 23:27:39 +020027}
28
29// ToMatrix turns the intermediate tokenizer into a
30// matrix representation.
31func (auto *Automaton) ToMatrix() *MatrixTokenizer {
32
33 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +020034 sigma: make(map[rune]int),
35 unknown: auto.unknown,
36 identity: auto.identity,
37 epsilon: auto.epsilon,
Akron1c34ce62021-09-23 23:27:39 +020038 stateCount: auto.stateCount,
39 }
40
Akron28031b72021-10-02 13:07:25 +020041 max := 0
Akron1c34ce62021-09-23 23:27:39 +020042 for num, sym := range auto.sigmaRev {
43 if int(sym) < 256 {
44 mat.sigmaASCII[int(sym)] = num
45 }
46 mat.sigma[sym] = num
47 if num > auto.sigmaCount {
48 panic("sigmaCount is smaller")
49 }
Akron28031b72021-10-02 13:07:25 +020050 if num > max {
51 max = num
52 }
Akron1c34ce62021-09-23 23:27:39 +020053 }
Akron28031b72021-10-02 13:07:25 +020054 // Add final entry to the list (maybe not necessary actually)
55
Akron1c34ce62021-09-23 23:27:39 +020056 remember := make([]bool, auto.stateCount+2)
57
Akron28031b72021-10-02 13:07:25 +020058 // lower sigmaCount, as no final value exists
59 mat.array = make([]uint32, (auto.stateCount+1)*(max+1))
60
Akron1c34ce62021-09-23 23:27:39 +020061 // Store all transitions in matrix
Akron16c312e2021-09-26 13:11:12 +020062 var toMatrix func([]uint32, int)
Akron1c34ce62021-09-23 23:27:39 +020063
Akron16c312e2021-09-26 13:11:12 +020064 toMatrix = func(matrix []uint32, start int) {
Akron1c34ce62021-09-23 23:27:39 +020065 if start > auto.stateCount {
66 panic("stateCount is smaller")
67 }
68 if remember[start] {
69 return
70 }
71 remember[start] = true
72 for alpha, t := range auto.transitions[start] {
Akron16c312e2021-09-26 13:11:12 +020073 matrix[(alpha-1)*auto.stateCount+start] = uint32(t.end)
Akron1c34ce62021-09-23 23:27:39 +020074
75 // Mark nontoken transitions
76 if t.nontoken {
Akron16c312e2021-09-26 13:11:12 +020077 matrix[(alpha-1)*auto.stateCount+start] |= FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +020078 }
79
80 toMatrix(matrix, t.end)
81 }
82 }
83
84 toMatrix(mat.array, 1)
85
86 return mat
87}
88
Akron941f2152021-09-26 15:14:25 +020089// Type of tokenizer
90func (MatrixTokenizer) Type() string {
91 return MAMAGIC
92}
93
Akron16c312e2021-09-26 13:11:12 +020094// Save stores the matrix data in a file
95func (mat *MatrixTokenizer) Save(file string) (n int64, err error) {
96 f, err := os.Create(file)
97 if err != nil {
98 log.Println(err)
99 return 0, err
100 }
101 defer f.Close()
102 gz := gzip.NewWriter(f)
103 defer gz.Close()
104 n, err = mat.WriteTo(gz)
105 if err != nil {
106 log.Println(err)
107 return n, err
108 }
109 gz.Flush()
110 return n, nil
111}
112
113// WriteTo stores the matrix data in an io.Writer.
114func (mat *MatrixTokenizer) WriteTo(w io.Writer) (n int64, err error) {
115
116 wb := bufio.NewWriter(w)
117 defer wb.Flush()
118
119 // Store magical header
120 all, err := wb.Write([]byte(MAMAGIC))
121 if err != nil {
122 log.Println(err)
123 return int64(all), err
124 }
125
126 // Get sigma as a list
Akron28031b72021-10-02 13:07:25 +0200127 // In datok it's 16 - 4*4
128 sigmalist := make([]rune, len(mat.sigma)+16)
Akron16c312e2021-09-26 13:11:12 +0200129 max := 0
130 for sym, num := range mat.sigma {
131 sigmalist[num] = sym
132 if num > max {
133 max = num
134 }
135 }
136
Akron28031b72021-10-02 13:07:25 +0200137 // Add final entry to the list (maybe not necessary actually)
Akron16c312e2021-09-26 13:11:12 +0200138 sigmalist = sigmalist[:max+1]
139
Akron28031b72021-10-02 13:07:25 +0200140 buf := make([]byte, 0, 14)
Akron16c312e2021-09-26 13:11:12 +0200141 bo.PutUint16(buf[0:2], VERSION)
142 bo.PutUint16(buf[2:4], uint16(mat.epsilon))
143 bo.PutUint16(buf[4:6], uint16(mat.unknown))
144 bo.PutUint16(buf[6:8], uint16(mat.identity))
Akron28031b72021-10-02 13:07:25 +0200145 bo.PutUint32(buf[8:12], uint32(mat.stateCount))
146 bo.PutUint16(buf[12:14], uint16(len(sigmalist)))
147 more, err := wb.Write(buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200148 if err != nil {
149 log.Println(err)
150 return int64(all), err
151 }
152
153 all += more
154
155 // Write sigma
156 for _, sym := range sigmalist {
157
158 more, err = wb.WriteRune(sym)
159 if err != nil {
160 log.Println(err)
161 return int64(all), err
162 }
163 all += more
164 }
165
166 if err != nil {
167 log.Println(err)
168 return int64(all), err
169 }
170
171 // Test marker - could be checksum
172 more, err = wb.Write([]byte("M"))
173 if err != nil {
174 log.Println(err)
175 return int64(all), err
176 }
177 all += more
178
Akron16c312e2021-09-26 13:11:12 +0200179 for _, x := range mat.array {
180 bo.PutUint32(buf[0:4], uint32(x))
181 more, err = wb.Write(buf[0:4])
182 if err != nil {
183 log.Println(err)
184 return int64(all), err
185 }
186 all += more
187 if more != 4 {
188 log.Println("Can not write base uint32")
189 return int64(all), err
190 }
Akron16c312e2021-09-26 13:11:12 +0200191 }
192
193 return int64(all), err
194}
195
196// LoadDatokFile reads a double array represented tokenizer
197// from a file.
198func LoadMatrixFile(file string) *MatrixTokenizer {
199 f, err := os.Open(file)
200 if err != nil {
201 log.Println(err)
202 return nil
203 }
204 defer f.Close()
205
206 gz, err := gzip.NewReader(f)
207 if err != nil {
208 log.Println(err)
209 return nil
210 }
211 defer gz.Close()
212
213 // Todo: Read the whole file!
214 return ParseMatrix(gz)
215}
216
217// LoadMatrixFile reads a matrix represented tokenizer
218// from an io.Reader
219func ParseMatrix(ior io.Reader) *MatrixTokenizer {
220
221 // Initialize tokenizer with default values
222 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +0200223 sigma: make(map[rune]int),
224 epsilon: 0,
225 unknown: 0,
226 identity: 0,
Akron16c312e2021-09-26 13:11:12 +0200227 stateCount: 0,
Akron16c312e2021-09-26 13:11:12 +0200228 }
229
230 r := bufio.NewReader(ior)
231
232 buf := make([]byte, 1024)
233 buf = buf[0:len(MAMAGIC)]
234
235 _, err := r.Read(buf)
236
237 if err != nil {
238 log.Println(err)
239 return nil
240 }
241
242 if string(MAMAGIC) != string(buf) {
243 log.Println("Not a matok file")
244 return nil
245 }
246
Akron28031b72021-10-02 13:07:25 +0200247 more, err := io.ReadFull(r, buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200248 if err != nil {
249 log.Println(err)
250 return nil
251 }
252
Akron28031b72021-10-02 13:07:25 +0200253 if more != 14 {
Akron16c312e2021-09-26 13:11:12 +0200254 log.Println("Read bytes do not fit")
255 return nil
256 }
257
258 version := bo.Uint16(buf[0:2])
259
260 if version != VERSION {
261 log.Println("Version not compatible")
262 return nil
263 }
264
265 mat.epsilon = int(bo.Uint16(buf[2:4]))
266 mat.unknown = int(bo.Uint16(buf[4:6]))
267 mat.identity = int(bo.Uint16(buf[6:8]))
Akron28031b72021-10-02 13:07:25 +0200268 mat.stateCount = int(bo.Uint32(buf[8:12]))
269 sigmaCount := int(bo.Uint16(buf[12:14]))
270 arraySize := (mat.stateCount + 1) * sigmaCount
Akron16c312e2021-09-26 13:11:12 +0200271
Akron16c312e2021-09-26 13:11:12 +0200272 for x := 0; x < sigmaCount; x++ {
273 sym, _, err := r.ReadRune()
274 if err == nil && sym != 0 {
275 if int(sym) < 256 {
276 mat.sigmaASCII[int(sym)] = x
277 }
278 mat.sigma[sym] = x
279 }
280 }
281
282 _, err = io.ReadFull(r, buf[0:1])
283
284 if err != nil {
285 log.Print(err)
286 return nil
287 }
288
289 if string("M") != string(buf[0:1]) {
290 log.Println("Not a matok file")
291 return nil
292 }
293
294 // Read based on length
295 mat.array = make([]uint32, arraySize)
296
297 dataArray, err := io.ReadAll(r)
298
299 if err == io.EOF {
300 log.Println(err)
301 return nil
302 }
303
304 if len(dataArray) < arraySize*4 {
Akron28031b72021-10-02 13:07:25 +0200305 log.Println("Not enough bytes read", len(dataArray), arraySize*4)
Akron16c312e2021-09-26 13:11:12 +0200306 return nil
307 }
308
309 for x := 0; x < arraySize; x++ {
Akron16c312e2021-09-26 13:11:12 +0200310 mat.array[x] = bo.Uint32(dataArray[x*4 : (x*4)+4])
311 }
312
313 return mat
314}
315
Akron1c34ce62021-09-23 23:27:39 +0200316func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akrone396a932021-10-19 01:06:13 +0200317 return mat.TransduceTokenWriter(r, NewTokenWriterSimple(w))
318}
319
320func (mat *MatrixTokenizer) TransduceTokenWriter(r io.Reader, w TokenWriterI) bool {
Akron1c34ce62021-09-23 23:27:39 +0200321 var a int
Akron16c312e2021-09-26 13:11:12 +0200322 var t0 uint32
323 t := uint32(1) // Initial state
Akron1c34ce62021-09-23 23:27:39 +0200324 var ok, rewindBuffer bool
325
326 // Remember the last position of a possible tokenend,
327 // in case the automaton fails.
Akron16c312e2021-09-26 13:11:12 +0200328 epsilonState := uint32(0)
Akron1c34ce62021-09-23 23:27:39 +0200329 epsilonOffset := 0
330
Akrona854faa2021-10-22 19:31:08 +0200331 // TEMP
332 loopcounter := 0
333
Akron5c82a922021-09-24 19:11:29 +0200334 // Remember if the last transition was epsilon
335 sentenceEnd := false
336
Akrona854faa2021-10-22 19:31:08 +0200337 // Remember if a text end was already set
338 textEnd := false
339
Akron1c34ce62021-09-23 23:27:39 +0200340 buffer := make([]rune, 1024)
341 buffo := 0 // Buffer offset
342 buffi := 0 // Buffer length
343
344 reader := bufio.NewReader(r)
Akrone396a932021-10-19 01:06:13 +0200345 defer w.Flush()
Akron1c34ce62021-09-23 23:27:39 +0200346
347 var char rune
348
349 var err error
350 eof := false
Akrona854faa2021-10-22 19:31:08 +0200351 eot := false
Akron1c34ce62021-09-23 23:27:39 +0200352 newchar := true
353
354PARSECHARM:
355 for {
356
357 if newchar {
358 // Get from reader if buffer is empty
359 if buffo >= buffi {
360 if eof {
361 break
362 }
363 char, _, err = reader.ReadRune()
364
365 // No more runes to read
366 if err != nil {
367 eof = true
368 break
369 }
370 buffer[buffi] = char
371 buffi++
372 }
373
374 char = buffer[buffo]
375
376 if DEBUG {
Akrona854faa2021-10-22 19:31:08 +0200377 fmt.Println("Current char", string(char), int(char), showBuffer(buffer, buffo, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200378 }
379
Akrona854faa2021-10-22 19:31:08 +0200380 eot = false
381
Akron1c34ce62021-09-23 23:27:39 +0200382 // TODO:
383 // Better not repeatedly check for a!
384 // Possibly keep a buffer with a.
385 if int(char) < 256 {
Akrona854faa2021-10-22 19:31:08 +0200386 if int(char) == EOT {
387 eot = true
388 }
Akron1c34ce62021-09-23 23:27:39 +0200389 a = mat.sigmaASCII[int(char)]
390 } else {
391 a, ok = mat.sigma[char]
392 if !ok {
393 a = 0
394 }
395 }
396
397 // Use identity symbol if character is not in sigma
398 if a == 0 && mat.identity != -1 {
399 a = mat.identity
400 }
401
402 t0 = t
403
404 // Check for epsilon transitions and remember
405
Akron16c312e2021-09-26 13:11:12 +0200406 // TODO: Can t0 be negative here?
407 if mat.array[(mat.epsilon-1)*mat.stateCount+int(t0)] != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200408 // Remember state for backtracking to last tokenend state
Akron16c312e2021-09-26 13:11:12 +0200409
410 // Maybe not necessary - and should be simpler!
411 // Just Remove
412 t0 &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200413 epsilonState = t0
414 epsilonOffset = buffo
Akron16c312e2021-09-26 13:11:12 +0200415
416 if DEBUG {
417 fmt.Println("epsilonOffset is set to", buffo)
418 }
Akron1c34ce62021-09-23 23:27:39 +0200419 }
420 }
421
422 // Checks a transition based on t0, a and buffo
423 t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
Akron1c34ce62021-09-23 23:27:39 +0200424
425 if DEBUG {
426 // Char is only relevant if set
427 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
Akron1c34ce62021-09-23 23:27:39 +0200428 }
429
Akrone396a932021-10-19 01:06:13 +0200430 // Check if the transition is invalid according to the matrix
Akron1c34ce62021-09-23 23:27:39 +0200431 if t == 0 {
432
433 if DEBUG {
434 fmt.Println("Match is not fine!")
435 }
436
437 if !ok && a == mat.identity {
438
439 // Try again with unknown symbol, in case identity failed
440 // Char is only relevant when set
441 if DEBUG {
442 fmt.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
443 }
444 a = mat.unknown
445
446 } else if a != mat.epsilon {
447
448 // Try again with epsilon symbol, in case everything else failed
449 t0 = epsilonState
450 epsilonState = 0 // reset
451 buffo = epsilonOffset
452 a = mat.epsilon
453
454 if DEBUG {
455 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
456 }
457
458 } else {
459 break
460 }
461
462 newchar = false
Akrona854faa2021-10-22 19:31:08 +0200463 eot = false
Akron1c34ce62021-09-23 23:27:39 +0200464 continue
465 }
466
467 // Transition was successful
468 rewindBuffer = false
469
470 // Transition consumes a character
471 if a != mat.epsilon {
472
473 buffo++
474
475 // Transition does not produce a character
Akron16c312e2021-09-26 13:11:12 +0200476 if buffo == 1 && (t&FIRSTBIT) != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200477 if DEBUG {
478 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
479 }
480 rewindBuffer = true
481 }
482
483 } else {
484 // Transition marks the end of a token - so flush the buffer
Akrone396a932021-10-19 01:06:13 +0200485 if buffo > 0 {
Akron1c34ce62021-09-23 23:27:39 +0200486 if DEBUG {
487 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
488 }
Akrone396a932021-10-19 01:06:13 +0200489 w.Token(0, buffer[:buffo])
Akron1c34ce62021-09-23 23:27:39 +0200490 rewindBuffer = true
Akron5c82a922021-09-24 19:11:29 +0200491 sentenceEnd = false
Akrona854faa2021-10-22 19:31:08 +0200492 textEnd = false
Akron5c82a922021-09-24 19:11:29 +0200493 } else {
494 sentenceEnd = true
Akrona854faa2021-10-22 19:31:08 +0200495 w.SentenceEnd(0)
Akron1c34ce62021-09-23 23:27:39 +0200496 }
497 if DEBUG {
498 fmt.Println("-> Newline")
499 }
Akrone396a932021-10-19 01:06:13 +0200500 // writer.WriteRune('\n')
Akron1c34ce62021-09-23 23:27:39 +0200501 }
502
503 // Rewind the buffer if necessary
504 if rewindBuffer {
505
Akron16c312e2021-09-26 13:11:12 +0200506 if DEBUG {
507 fmt.Println("-> Rewind buffer", buffo, buffi, epsilonOffset)
508 }
509
Akron1c34ce62021-09-23 23:27:39 +0200510 // TODO: Better as a ring buffer
511 for x, i := range buffer[buffo:buffi] {
512 buffer[x] = i
513 }
514
515 buffi -= buffo
Akron16c312e2021-09-26 13:11:12 +0200516 // epsilonOffset -= buffo
517 epsilonOffset = 0
518 epsilonState = 0
519
Akron1c34ce62021-09-23 23:27:39 +0200520 buffo = 0
521 if DEBUG {
522 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
523 }
Akrona854faa2021-10-22 19:31:08 +0200524
525 if eot {
526 eot = false
527 textEnd = true
528 w.TextEnd(0)
529 if DEBUG {
530 fmt.Println("END OF TEXT")
531 }
532 }
Akron1c34ce62021-09-23 23:27:39 +0200533 }
534
Akron16c312e2021-09-26 13:11:12 +0200535 t &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200536
537 newchar = true
538
539 // TODO:
540 // Prevent endless epsilon loops!
541 }
542
Akrona854faa2021-10-22 19:31:08 +0200543 if loopcounter > 100 {
544 return false
545 }
546 loopcounter++
547
Akron1c34ce62021-09-23 23:27:39 +0200548 // Input reader is not yet finished
549 if !eof {
550 if DEBUG {
551 fmt.Println("Not at the end")
552 }
553 return false
554 }
555
556 if DEBUG {
557 fmt.Println("Entering final check")
558 }
Akron1c34ce62021-09-23 23:27:39 +0200559
Akrona854faa2021-10-22 19:31:08 +0200560 // Check epsilon transitions as long as possible
Akron1c34ce62021-09-23 23:27:39 +0200561 t0 = t
Akron1c34ce62021-09-23 23:27:39 +0200562 t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
563 a = mat.epsilon
564 newchar = false
Akron1c34ce62021-09-23 23:27:39 +0200565 // t can't be < 0
Akron16c312e2021-09-26 13:11:12 +0200566 if t != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200567 // Remember state for backtracking to last tokenend state
568 goto PARSECHARM
569
570 } else if epsilonState != 0 {
571 t0 = epsilonState
572 epsilonState = 0 // reset
573 buffo = epsilonOffset
574 if DEBUG {
575 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
576 }
577 goto PARSECHARM
578 }
Akron1c34ce62021-09-23 23:27:39 +0200579
Akron5c82a922021-09-24 19:11:29 +0200580 // Add an additional sentence ending, if the file is over but no explicit
581 // sentence split was reached. This may be controversial and therefore
582 // optional via parameter.
583 if !sentenceEnd {
Akrona854faa2021-10-22 19:31:08 +0200584 w.SentenceEnd(0)
Akron5c82a922021-09-24 19:11:29 +0200585 if DEBUG {
Akrona854faa2021-10-22 19:31:08 +0200586 fmt.Println("Sentence end")
587 }
588 }
589
590 if !textEnd {
591 w.TextEnd(0)
592
593 if DEBUG {
594 fmt.Println("Text end")
Akron5c82a922021-09-24 19:11:29 +0200595 }
596 }
597
598 return true
Akron1c34ce62021-09-23 23:27:39 +0200599}