blob: 545cd07b63ed60b2b44dfcc635cadd78d7d86cca [file] [log] [blame]
Akron1c34ce62021-09-23 23:27:39 +02001package datok
2
3import (
4 "bufio"
Akron16c312e2021-09-26 13:11:12 +02005 "compress/gzip"
Akron1c34ce62021-09-23 23:27:39 +02006 "fmt"
7 "io"
Akron16c312e2021-09-26 13:11:12 +02008 "log"
9 "os"
10)
11
12const (
13 MAMAGIC = "MATOK"
Akrona854faa2021-10-22 19:31:08 +020014 EOT = 4
Akron1c34ce62021-09-23 23:27:39 +020015)
16
17type MatrixTokenizer struct {
18 sigma map[rune]int
19 sigmaASCII [256]int
Akron16c312e2021-09-26 13:11:12 +020020 array []uint32
Akron1c34ce62021-09-23 23:27:39 +020021 stateCount int
22
23 // Special symbols in sigma
24 epsilon int
25 unknown int
26 identity int
Akron1c34ce62021-09-23 23:27:39 +020027}
28
29// ToMatrix turns the intermediate tokenizer into a
30// matrix representation.
31func (auto *Automaton) ToMatrix() *MatrixTokenizer {
32
33 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +020034 sigma: make(map[rune]int),
35 unknown: auto.unknown,
36 identity: auto.identity,
37 epsilon: auto.epsilon,
Akron1c34ce62021-09-23 23:27:39 +020038 stateCount: auto.stateCount,
39 }
40
Akron28031b72021-10-02 13:07:25 +020041 max := 0
Akron1c34ce62021-09-23 23:27:39 +020042 for num, sym := range auto.sigmaRev {
43 if int(sym) < 256 {
44 mat.sigmaASCII[int(sym)] = num
45 }
46 mat.sigma[sym] = num
47 if num > auto.sigmaCount {
48 panic("sigmaCount is smaller")
49 }
Akron28031b72021-10-02 13:07:25 +020050 if num > max {
51 max = num
52 }
Akron1c34ce62021-09-23 23:27:39 +020053 }
Akron28031b72021-10-02 13:07:25 +020054 // Add final entry to the list (maybe not necessary actually)
55
Akron1c34ce62021-09-23 23:27:39 +020056 remember := make([]bool, auto.stateCount+2)
57
Akron28031b72021-10-02 13:07:25 +020058 // lower sigmaCount, as no final value exists
59 mat.array = make([]uint32, (auto.stateCount+1)*(max+1))
60
Akron1c34ce62021-09-23 23:27:39 +020061 // Store all transitions in matrix
Akron16c312e2021-09-26 13:11:12 +020062 var toMatrix func([]uint32, int)
Akron1c34ce62021-09-23 23:27:39 +020063
Akron16c312e2021-09-26 13:11:12 +020064 toMatrix = func(matrix []uint32, start int) {
Akron1c34ce62021-09-23 23:27:39 +020065 if start > auto.stateCount {
66 panic("stateCount is smaller")
67 }
68 if remember[start] {
69 return
70 }
71 remember[start] = true
72 for alpha, t := range auto.transitions[start] {
Akron16c312e2021-09-26 13:11:12 +020073 matrix[(alpha-1)*auto.stateCount+start] = uint32(t.end)
Akron1c34ce62021-09-23 23:27:39 +020074
75 // Mark nontoken transitions
76 if t.nontoken {
Akron16c312e2021-09-26 13:11:12 +020077 matrix[(alpha-1)*auto.stateCount+start] |= FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +020078 }
79
80 toMatrix(matrix, t.end)
81 }
82 }
83
84 toMatrix(mat.array, 1)
85
86 return mat
87}
88
Akron941f2152021-09-26 15:14:25 +020089// Type of tokenizer
90func (MatrixTokenizer) Type() string {
91 return MAMAGIC
92}
93
Akron16c312e2021-09-26 13:11:12 +020094// Save stores the matrix data in a file
95func (mat *MatrixTokenizer) Save(file string) (n int64, err error) {
96 f, err := os.Create(file)
97 if err != nil {
98 log.Println(err)
99 return 0, err
100 }
101 defer f.Close()
102 gz := gzip.NewWriter(f)
103 defer gz.Close()
104 n, err = mat.WriteTo(gz)
105 if err != nil {
106 log.Println(err)
107 return n, err
108 }
109 gz.Flush()
110 return n, nil
111}
112
113// WriteTo stores the matrix data in an io.Writer.
114func (mat *MatrixTokenizer) WriteTo(w io.Writer) (n int64, err error) {
115
116 wb := bufio.NewWriter(w)
117 defer wb.Flush()
118
119 // Store magical header
120 all, err := wb.Write([]byte(MAMAGIC))
121 if err != nil {
122 log.Println(err)
123 return int64(all), err
124 }
125
126 // Get sigma as a list
Akron28031b72021-10-02 13:07:25 +0200127 // In datok it's 16 - 4*4
128 sigmalist := make([]rune, len(mat.sigma)+16)
Akron16c312e2021-09-26 13:11:12 +0200129 max := 0
130 for sym, num := range mat.sigma {
131 sigmalist[num] = sym
132 if num > max {
133 max = num
134 }
135 }
136
Akron28031b72021-10-02 13:07:25 +0200137 // Add final entry to the list (maybe not necessary actually)
Akron16c312e2021-09-26 13:11:12 +0200138 sigmalist = sigmalist[:max+1]
139
Akron28031b72021-10-02 13:07:25 +0200140 buf := make([]byte, 0, 14)
Akron16c312e2021-09-26 13:11:12 +0200141 bo.PutUint16(buf[0:2], VERSION)
142 bo.PutUint16(buf[2:4], uint16(mat.epsilon))
143 bo.PutUint16(buf[4:6], uint16(mat.unknown))
144 bo.PutUint16(buf[6:8], uint16(mat.identity))
Akron28031b72021-10-02 13:07:25 +0200145 bo.PutUint32(buf[8:12], uint32(mat.stateCount))
146 bo.PutUint16(buf[12:14], uint16(len(sigmalist)))
147 more, err := wb.Write(buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200148 if err != nil {
149 log.Println(err)
150 return int64(all), err
151 }
152
153 all += more
154
155 // Write sigma
156 for _, sym := range sigmalist {
157
158 more, err = wb.WriteRune(sym)
159 if err != nil {
160 log.Println(err)
161 return int64(all), err
162 }
163 all += more
164 }
165
166 if err != nil {
167 log.Println(err)
168 return int64(all), err
169 }
170
171 // Test marker - could be checksum
172 more, err = wb.Write([]byte("M"))
173 if err != nil {
174 log.Println(err)
175 return int64(all), err
176 }
177 all += more
178
Akron16c312e2021-09-26 13:11:12 +0200179 for _, x := range mat.array {
180 bo.PutUint32(buf[0:4], uint32(x))
181 more, err = wb.Write(buf[0:4])
182 if err != nil {
183 log.Println(err)
184 return int64(all), err
185 }
186 all += more
187 if more != 4 {
188 log.Println("Can not write base uint32")
189 return int64(all), err
190 }
Akron16c312e2021-09-26 13:11:12 +0200191 }
192
193 return int64(all), err
194}
195
196// LoadDatokFile reads a double array represented tokenizer
197// from a file.
198func LoadMatrixFile(file string) *MatrixTokenizer {
199 f, err := os.Open(file)
200 if err != nil {
201 log.Println(err)
202 return nil
203 }
204 defer f.Close()
205
206 gz, err := gzip.NewReader(f)
207 if err != nil {
208 log.Println(err)
209 return nil
210 }
211 defer gz.Close()
212
213 // Todo: Read the whole file!
214 return ParseMatrix(gz)
215}
216
217// LoadMatrixFile reads a matrix represented tokenizer
218// from an io.Reader
219func ParseMatrix(ior io.Reader) *MatrixTokenizer {
220
221 // Initialize tokenizer with default values
222 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +0200223 sigma: make(map[rune]int),
224 epsilon: 0,
225 unknown: 0,
226 identity: 0,
Akron16c312e2021-09-26 13:11:12 +0200227 stateCount: 0,
Akron16c312e2021-09-26 13:11:12 +0200228 }
229
230 r := bufio.NewReader(ior)
231
232 buf := make([]byte, 1024)
233 buf = buf[0:len(MAMAGIC)]
234
235 _, err := r.Read(buf)
236
237 if err != nil {
238 log.Println(err)
239 return nil
240 }
241
242 if string(MAMAGIC) != string(buf) {
243 log.Println("Not a matok file")
244 return nil
245 }
246
Akron28031b72021-10-02 13:07:25 +0200247 more, err := io.ReadFull(r, buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200248 if err != nil {
249 log.Println(err)
250 return nil
251 }
252
Akron28031b72021-10-02 13:07:25 +0200253 if more != 14 {
Akron16c312e2021-09-26 13:11:12 +0200254 log.Println("Read bytes do not fit")
255 return nil
256 }
257
258 version := bo.Uint16(buf[0:2])
259
260 if version != VERSION {
261 log.Println("Version not compatible")
262 return nil
263 }
264
265 mat.epsilon = int(bo.Uint16(buf[2:4]))
266 mat.unknown = int(bo.Uint16(buf[4:6]))
267 mat.identity = int(bo.Uint16(buf[6:8]))
Akron28031b72021-10-02 13:07:25 +0200268 mat.stateCount = int(bo.Uint32(buf[8:12]))
269 sigmaCount := int(bo.Uint16(buf[12:14]))
270 arraySize := (mat.stateCount + 1) * sigmaCount
Akron16c312e2021-09-26 13:11:12 +0200271
Akron16c312e2021-09-26 13:11:12 +0200272 for x := 0; x < sigmaCount; x++ {
273 sym, _, err := r.ReadRune()
274 if err == nil && sym != 0 {
275 if int(sym) < 256 {
276 mat.sigmaASCII[int(sym)] = x
277 }
278 mat.sigma[sym] = x
279 }
280 }
281
282 _, err = io.ReadFull(r, buf[0:1])
283
284 if err != nil {
285 log.Print(err)
286 return nil
287 }
288
289 if string("M") != string(buf[0:1]) {
290 log.Println("Not a matok file")
291 return nil
292 }
293
294 // Read based on length
295 mat.array = make([]uint32, arraySize)
296
297 dataArray, err := io.ReadAll(r)
298
299 if err == io.EOF {
300 log.Println(err)
301 return nil
302 }
303
304 if len(dataArray) < arraySize*4 {
Akron28031b72021-10-02 13:07:25 +0200305 log.Println("Not enough bytes read", len(dataArray), arraySize*4)
Akron16c312e2021-09-26 13:11:12 +0200306 return nil
307 }
308
309 for x := 0; x < arraySize; x++ {
Akron16c312e2021-09-26 13:11:12 +0200310 mat.array[x] = bo.Uint32(dataArray[x*4 : (x*4)+4])
311 }
312
313 return mat
314}
315
Akron98fbfef2021-10-23 17:02:11 +0200316// Transduce input to ouutput
Akron1c34ce62021-09-23 23:27:39 +0200317func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akrone396a932021-10-19 01:06:13 +0200318 return mat.TransduceTokenWriter(r, NewTokenWriterSimple(w))
319}
320
Akron98fbfef2021-10-23 17:02:11 +0200321// TransduceTokenWriter transduces an input string against
322// the matrix FSA. The rules are always greedy. If the
323// automaton fails, it takes the last possible token ending
324// branch.
Akrone396a932021-10-19 01:06:13 +0200325func (mat *MatrixTokenizer) TransduceTokenWriter(r io.Reader, w TokenWriterI) bool {
Akron1c34ce62021-09-23 23:27:39 +0200326 var a int
Akron16c312e2021-09-26 13:11:12 +0200327 var t0 uint32
328 t := uint32(1) // Initial state
Akron1c34ce62021-09-23 23:27:39 +0200329 var ok, rewindBuffer bool
330
331 // Remember the last position of a possible tokenend,
332 // in case the automaton fails.
Akron16c312e2021-09-26 13:11:12 +0200333 epsilonState := uint32(0)
Akron1c34ce62021-09-23 23:27:39 +0200334 epsilonOffset := 0
335
Akron5c82a922021-09-24 19:11:29 +0200336 // Remember if the last transition was epsilon
337 sentenceEnd := false
338
Akrona854faa2021-10-22 19:31:08 +0200339 // Remember if a text end was already set
340 textEnd := false
341
Akron1c34ce62021-09-23 23:27:39 +0200342 buffer := make([]rune, 1024)
Akron98fbfef2021-10-23 17:02:11 +0200343 bufft := 0 // Buffer token offset
344 buffc := 0 // Buffer current symbol
Akron1c34ce62021-09-23 23:27:39 +0200345 buffi := 0 // Buffer length
346
Akron98fbfef2021-10-23 17:02:11 +0200347 // The buffer is organized as follows:
348 // [ t[....c..]..i]
349
Akron1c34ce62021-09-23 23:27:39 +0200350 reader := bufio.NewReader(r)
Akrone396a932021-10-19 01:06:13 +0200351 defer w.Flush()
Akron1c34ce62021-09-23 23:27:39 +0200352
353 var char rune
354
355 var err error
356 eof := false
Akrona854faa2021-10-22 19:31:08 +0200357 eot := false
Akron1c34ce62021-09-23 23:27:39 +0200358 newchar := true
359
360PARSECHARM:
361 for {
362
363 if newchar {
364 // Get from reader if buffer is empty
Akron98fbfef2021-10-23 17:02:11 +0200365 if buffc >= buffi {
Akron1c34ce62021-09-23 23:27:39 +0200366 if eof {
367 break
368 }
369 char, _, err = reader.ReadRune()
370
371 // No more runes to read
372 if err != nil {
373 eof = true
374 break
375 }
376 buffer[buffi] = char
377 buffi++
378 }
379
Akron98fbfef2021-10-23 17:02:11 +0200380 char = buffer[buffc]
Akron1c34ce62021-09-23 23:27:39 +0200381
382 if DEBUG {
Akron98fbfef2021-10-23 17:02:11 +0200383 fmt.Println("Current char", string(char), int(char), showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200384 }
385
Akrona854faa2021-10-22 19:31:08 +0200386 eot = false
387
Akron1c34ce62021-09-23 23:27:39 +0200388 // TODO:
389 // Better not repeatedly check for a!
390 // Possibly keep a buffer with a.
391 if int(char) < 256 {
Akrona854faa2021-10-22 19:31:08 +0200392 if int(char) == EOT {
393 eot = true
394 }
Akron1c34ce62021-09-23 23:27:39 +0200395 a = mat.sigmaASCII[int(char)]
396 } else {
397 a, ok = mat.sigma[char]
398 if !ok {
399 a = 0
400 }
401 }
402
403 // Use identity symbol if character is not in sigma
404 if a == 0 && mat.identity != -1 {
405 a = mat.identity
406 }
407
408 t0 = t
409
410 // Check for epsilon transitions and remember
411
Akron16c312e2021-09-26 13:11:12 +0200412 // TODO: Can t0 be negative here?
413 if mat.array[(mat.epsilon-1)*mat.stateCount+int(t0)] != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200414 // Remember state for backtracking to last tokenend state
Akron16c312e2021-09-26 13:11:12 +0200415
416 // Maybe not necessary - and should be simpler!
417 // Just Remove
418 t0 &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200419 epsilonState = t0
Akron98fbfef2021-10-23 17:02:11 +0200420 epsilonOffset = buffc
Akron16c312e2021-09-26 13:11:12 +0200421
422 if DEBUG {
Akron98fbfef2021-10-23 17:02:11 +0200423 fmt.Println("epsilonOffset is set to", buffc)
Akron16c312e2021-09-26 13:11:12 +0200424 }
Akron1c34ce62021-09-23 23:27:39 +0200425 }
426 }
427
428 // Checks a transition based on t0, a and buffo
429 t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
Akron1c34ce62021-09-23 23:27:39 +0200430
431 if DEBUG {
432 // Char is only relevant if set
433 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
Akron1c34ce62021-09-23 23:27:39 +0200434 }
435
Akrone396a932021-10-19 01:06:13 +0200436 // Check if the transition is invalid according to the matrix
Akron1c34ce62021-09-23 23:27:39 +0200437 if t == 0 {
438
439 if DEBUG {
440 fmt.Println("Match is not fine!")
441 }
442
443 if !ok && a == mat.identity {
444
445 // Try again with unknown symbol, in case identity failed
446 // Char is only relevant when set
447 if DEBUG {
448 fmt.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
449 }
450 a = mat.unknown
451
452 } else if a != mat.epsilon {
453
454 // Try again with epsilon symbol, in case everything else failed
455 t0 = epsilonState
456 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +0200457 buffc = epsilonOffset
Akron1c34ce62021-09-23 23:27:39 +0200458 a = mat.epsilon
459
460 if DEBUG {
Akron98fbfef2021-10-23 17:02:11 +0200461 fmt.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200462 }
463
464 } else {
465 break
466 }
467
468 newchar = false
Akrona854faa2021-10-22 19:31:08 +0200469 eot = false
Akron1c34ce62021-09-23 23:27:39 +0200470 continue
471 }
472
473 // Transition was successful
474 rewindBuffer = false
475
476 // Transition consumes a character
477 if a != mat.epsilon {
478
Akron98fbfef2021-10-23 17:02:11 +0200479 buffc++
Akron1c34ce62021-09-23 23:27:39 +0200480
481 // Transition does not produce a character
Akron98fbfef2021-10-23 17:02:11 +0200482 if buffc-bufft == 1 && (t&FIRSTBIT) != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200483 if DEBUG {
Akron98fbfef2021-10-23 17:02:11 +0200484 fmt.Println("Nontoken forward", showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200485 }
Akron98fbfef2021-10-23 17:02:11 +0200486 bufft++
487 // rewindBuffer = true
Akron1c34ce62021-09-23 23:27:39 +0200488 }
489
490 } else {
491 // Transition marks the end of a token - so flush the buffer
Akron98fbfef2021-10-23 17:02:11 +0200492 if buffc-bufft > 0 {
Akron1c34ce62021-09-23 23:27:39 +0200493 if DEBUG {
Akron98fbfef2021-10-23 17:02:11 +0200494 fmt.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200495 }
Akron32416ce2021-10-23 17:09:41 +0200496 w.Token(bufft, buffer[:buffc])
Akron1c34ce62021-09-23 23:27:39 +0200497 rewindBuffer = true
Akron5c82a922021-09-24 19:11:29 +0200498 sentenceEnd = false
Akrona854faa2021-10-22 19:31:08 +0200499 textEnd = false
Akron5c82a922021-09-24 19:11:29 +0200500 } else {
501 sentenceEnd = true
Akrona854faa2021-10-22 19:31:08 +0200502 w.SentenceEnd(0)
Akron1c34ce62021-09-23 23:27:39 +0200503 }
Akron1c34ce62021-09-23 23:27:39 +0200504 }
505
506 // Rewind the buffer if necessary
507 if rewindBuffer {
508
Akron16c312e2021-09-26 13:11:12 +0200509 if DEBUG {
Akron98fbfef2021-10-23 17:02:11 +0200510 fmt.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
Akron16c312e2021-09-26 13:11:12 +0200511 }
512
Akron1c34ce62021-09-23 23:27:39 +0200513 // TODO: Better as a ring buffer
Akron98fbfef2021-10-23 17:02:11 +0200514 for x, i := range buffer[buffc:buffi] {
Akron1c34ce62021-09-23 23:27:39 +0200515 buffer[x] = i
516 }
517
Akron98fbfef2021-10-23 17:02:11 +0200518 buffi -= buffc
Akron16c312e2021-09-26 13:11:12 +0200519 // epsilonOffset -= buffo
520 epsilonOffset = 0
521 epsilonState = 0
522
Akron98fbfef2021-10-23 17:02:11 +0200523 buffc = 0
524 bufft = 0
Akrona854faa2021-10-22 19:31:08 +0200525
Akron98fbfef2021-10-23 17:02:11 +0200526 if DEBUG {
527 fmt.Println("Remaining:", showBufferNew(buffer, bufft, buffc, buffi))
Akrona854faa2021-10-22 19:31:08 +0200528 }
Akron1c34ce62021-09-23 23:27:39 +0200529 }
530
Akron98fbfef2021-10-23 17:02:11 +0200531 if eot {
532 eot = false
533 textEnd = true
534 w.TextEnd(0)
535 if DEBUG {
536 fmt.Println("END OF TEXT")
537 }
538 }
Akron16c312e2021-09-26 13:11:12 +0200539 t &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200540
541 newchar = true
542
543 // TODO:
544 // Prevent endless epsilon loops!
545 }
546
547 // Input reader is not yet finished
548 if !eof {
549 if DEBUG {
550 fmt.Println("Not at the end")
551 }
552 return false
553 }
554
555 if DEBUG {
556 fmt.Println("Entering final check")
557 }
Akron1c34ce62021-09-23 23:27:39 +0200558
Akrona854faa2021-10-22 19:31:08 +0200559 // Check epsilon transitions as long as possible
Akron1c34ce62021-09-23 23:27:39 +0200560 t0 = t
Akron1c34ce62021-09-23 23:27:39 +0200561 t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
562 a = mat.epsilon
563 newchar = false
Akron1c34ce62021-09-23 23:27:39 +0200564 // t can't be < 0
Akron16c312e2021-09-26 13:11:12 +0200565 if t != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200566 // Remember state for backtracking to last tokenend state
567 goto PARSECHARM
568
569 } else if epsilonState != 0 {
570 t0 = epsilonState
571 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +0200572 buffc = epsilonOffset
Akron1c34ce62021-09-23 23:27:39 +0200573 if DEBUG {
Akron98fbfef2021-10-23 17:02:11 +0200574 fmt.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200575 }
576 goto PARSECHARM
577 }
Akron1c34ce62021-09-23 23:27:39 +0200578
Akron5c82a922021-09-24 19:11:29 +0200579 // Add an additional sentence ending, if the file is over but no explicit
580 // sentence split was reached. This may be controversial and therefore
581 // optional via parameter.
582 if !sentenceEnd {
Akrona854faa2021-10-22 19:31:08 +0200583 w.SentenceEnd(0)
Akron5c82a922021-09-24 19:11:29 +0200584 if DEBUG {
Akrona854faa2021-10-22 19:31:08 +0200585 fmt.Println("Sentence end")
586 }
587 }
588
589 if !textEnd {
590 w.TextEnd(0)
591
592 if DEBUG {
593 fmt.Println("Text end")
Akron5c82a922021-09-24 19:11:29 +0200594 }
595 }
596
597 return true
Akron1c34ce62021-09-23 23:27:39 +0200598}