blob: 10680c3b6cf449285fda389a99fc5223c92e7c58 [file] [log] [blame]
Akron1c34ce62021-09-23 23:27:39 +02001package datok
2
3import (
4 "bufio"
Akron16c312e2021-09-26 13:11:12 +02005 "compress/gzip"
Akron1c34ce62021-09-23 23:27:39 +02006 "fmt"
7 "io"
Akron16c312e2021-09-26 13:11:12 +02008 "log"
9 "os"
10)
11
12const (
13 MAMAGIC = "MATOK"
Akron1c34ce62021-09-23 23:27:39 +020014)
15
16type MatrixTokenizer struct {
17 sigma map[rune]int
18 sigmaASCII [256]int
Akron16c312e2021-09-26 13:11:12 +020019 array []uint32
Akron1c34ce62021-09-23 23:27:39 +020020 stateCount int
21
22 // Special symbols in sigma
23 epsilon int
24 unknown int
25 identity int
Akron1c34ce62021-09-23 23:27:39 +020026}
27
28// ToMatrix turns the intermediate tokenizer into a
29// matrix representation.
30func (auto *Automaton) ToMatrix() *MatrixTokenizer {
31
32 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +020033 sigma: make(map[rune]int),
34 unknown: auto.unknown,
35 identity: auto.identity,
36 epsilon: auto.epsilon,
Akron1c34ce62021-09-23 23:27:39 +020037 stateCount: auto.stateCount,
38 }
39
Akron28031b72021-10-02 13:07:25 +020040 max := 0
Akron1c34ce62021-09-23 23:27:39 +020041 for num, sym := range auto.sigmaRev {
42 if int(sym) < 256 {
43 mat.sigmaASCII[int(sym)] = num
44 }
45 mat.sigma[sym] = num
46 if num > auto.sigmaCount {
47 panic("sigmaCount is smaller")
48 }
Akron28031b72021-10-02 13:07:25 +020049 if num > max {
50 max = num
51 }
Akron1c34ce62021-09-23 23:27:39 +020052 }
Akron28031b72021-10-02 13:07:25 +020053 // Add final entry to the list (maybe not necessary actually)
54
Akron1c34ce62021-09-23 23:27:39 +020055 remember := make([]bool, auto.stateCount+2)
56
Akron28031b72021-10-02 13:07:25 +020057 // lower sigmaCount, as no final value exists
58 mat.array = make([]uint32, (auto.stateCount+1)*(max+1))
59
Akron1c34ce62021-09-23 23:27:39 +020060 // Store all transitions in matrix
Akron16c312e2021-09-26 13:11:12 +020061 var toMatrix func([]uint32, int)
Akron1c34ce62021-09-23 23:27:39 +020062
Akron16c312e2021-09-26 13:11:12 +020063 toMatrix = func(matrix []uint32, start int) {
Akron1c34ce62021-09-23 23:27:39 +020064 if start > auto.stateCount {
65 panic("stateCount is smaller")
66 }
67 if remember[start] {
68 return
69 }
70 remember[start] = true
71 for alpha, t := range auto.transitions[start] {
Akron16c312e2021-09-26 13:11:12 +020072 matrix[(alpha-1)*auto.stateCount+start] = uint32(t.end)
Akron1c34ce62021-09-23 23:27:39 +020073
74 // Mark nontoken transitions
75 if t.nontoken {
Akron16c312e2021-09-26 13:11:12 +020076 matrix[(alpha-1)*auto.stateCount+start] |= FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +020077 }
78
79 toMatrix(matrix, t.end)
80 }
81 }
82
83 toMatrix(mat.array, 1)
84
85 return mat
86}
87
Akron941f2152021-09-26 15:14:25 +020088// Type of tokenizer
89func (MatrixTokenizer) Type() string {
90 return MAMAGIC
91}
92
Akron16c312e2021-09-26 13:11:12 +020093// Save stores the matrix data in a file
94func (mat *MatrixTokenizer) Save(file string) (n int64, err error) {
95 f, err := os.Create(file)
96 if err != nil {
97 log.Println(err)
98 return 0, err
99 }
100 defer f.Close()
101 gz := gzip.NewWriter(f)
102 defer gz.Close()
103 n, err = mat.WriteTo(gz)
104 if err != nil {
105 log.Println(err)
106 return n, err
107 }
108 gz.Flush()
109 return n, nil
110}
111
112// WriteTo stores the matrix data in an io.Writer.
113func (mat *MatrixTokenizer) WriteTo(w io.Writer) (n int64, err error) {
114
115 wb := bufio.NewWriter(w)
116 defer wb.Flush()
117
118 // Store magical header
119 all, err := wb.Write([]byte(MAMAGIC))
120 if err != nil {
121 log.Println(err)
122 return int64(all), err
123 }
124
125 // Get sigma as a list
Akron28031b72021-10-02 13:07:25 +0200126 // In datok it's 16 - 4*4
127 sigmalist := make([]rune, len(mat.sigma)+16)
Akron16c312e2021-09-26 13:11:12 +0200128 max := 0
129 for sym, num := range mat.sigma {
130 sigmalist[num] = sym
131 if num > max {
132 max = num
133 }
134 }
135
Akron28031b72021-10-02 13:07:25 +0200136 // Add final entry to the list (maybe not necessary actually)
Akron16c312e2021-09-26 13:11:12 +0200137 sigmalist = sigmalist[:max+1]
138
Akron28031b72021-10-02 13:07:25 +0200139 buf := make([]byte, 0, 14)
Akron16c312e2021-09-26 13:11:12 +0200140 bo.PutUint16(buf[0:2], VERSION)
141 bo.PutUint16(buf[2:4], uint16(mat.epsilon))
142 bo.PutUint16(buf[4:6], uint16(mat.unknown))
143 bo.PutUint16(buf[6:8], uint16(mat.identity))
Akron28031b72021-10-02 13:07:25 +0200144 bo.PutUint32(buf[8:12], uint32(mat.stateCount))
145 bo.PutUint16(buf[12:14], uint16(len(sigmalist)))
146 more, err := wb.Write(buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200147 if err != nil {
148 log.Println(err)
149 return int64(all), err
150 }
151
152 all += more
153
154 // Write sigma
155 for _, sym := range sigmalist {
156
157 more, err = wb.WriteRune(sym)
158 if err != nil {
159 log.Println(err)
160 return int64(all), err
161 }
162 all += more
163 }
164
165 if err != nil {
166 log.Println(err)
167 return int64(all), err
168 }
169
170 // Test marker - could be checksum
171 more, err = wb.Write([]byte("M"))
172 if err != nil {
173 log.Println(err)
174 return int64(all), err
175 }
176 all += more
177
Akron16c312e2021-09-26 13:11:12 +0200178 for _, x := range mat.array {
179 bo.PutUint32(buf[0:4], uint32(x))
180 more, err = wb.Write(buf[0:4])
181 if err != nil {
182 log.Println(err)
183 return int64(all), err
184 }
185 all += more
186 if more != 4 {
187 log.Println("Can not write base uint32")
188 return int64(all), err
189 }
Akron16c312e2021-09-26 13:11:12 +0200190 }
191
192 return int64(all), err
193}
194
195// LoadDatokFile reads a double array represented tokenizer
196// from a file.
197func LoadMatrixFile(file string) *MatrixTokenizer {
198 f, err := os.Open(file)
199 if err != nil {
200 log.Println(err)
201 return nil
202 }
203 defer f.Close()
204
205 gz, err := gzip.NewReader(f)
206 if err != nil {
207 log.Println(err)
208 return nil
209 }
210 defer gz.Close()
211
212 // Todo: Read the whole file!
213 return ParseMatrix(gz)
214}
215
216// LoadMatrixFile reads a matrix represented tokenizer
217// from an io.Reader
218func ParseMatrix(ior io.Reader) *MatrixTokenizer {
219
220 // Initialize tokenizer with default values
221 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +0200222 sigma: make(map[rune]int),
223 epsilon: 0,
224 unknown: 0,
225 identity: 0,
Akron16c312e2021-09-26 13:11:12 +0200226 stateCount: 0,
Akron16c312e2021-09-26 13:11:12 +0200227 }
228
229 r := bufio.NewReader(ior)
230
231 buf := make([]byte, 1024)
232 buf = buf[0:len(MAMAGIC)]
233
234 _, err := r.Read(buf)
235
236 if err != nil {
237 log.Println(err)
238 return nil
239 }
240
241 if string(MAMAGIC) != string(buf) {
242 log.Println("Not a matok file")
243 return nil
244 }
245
Akron28031b72021-10-02 13:07:25 +0200246 more, err := io.ReadFull(r, buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200247 if err != nil {
248 log.Println(err)
249 return nil
250 }
251
Akron28031b72021-10-02 13:07:25 +0200252 if more != 14 {
Akron16c312e2021-09-26 13:11:12 +0200253 log.Println("Read bytes do not fit")
254 return nil
255 }
256
257 version := bo.Uint16(buf[0:2])
258
259 if version != VERSION {
260 log.Println("Version not compatible")
261 return nil
262 }
263
264 mat.epsilon = int(bo.Uint16(buf[2:4]))
265 mat.unknown = int(bo.Uint16(buf[4:6]))
266 mat.identity = int(bo.Uint16(buf[6:8]))
Akron28031b72021-10-02 13:07:25 +0200267 mat.stateCount = int(bo.Uint32(buf[8:12]))
268 sigmaCount := int(bo.Uint16(buf[12:14]))
269 arraySize := (mat.stateCount + 1) * sigmaCount
Akron16c312e2021-09-26 13:11:12 +0200270
Akron16c312e2021-09-26 13:11:12 +0200271 for x := 0; x < sigmaCount; x++ {
272 sym, _, err := r.ReadRune()
273 if err == nil && sym != 0 {
274 if int(sym) < 256 {
275 mat.sigmaASCII[int(sym)] = x
276 }
277 mat.sigma[sym] = x
278 }
279 }
280
281 _, err = io.ReadFull(r, buf[0:1])
282
283 if err != nil {
284 log.Print(err)
285 return nil
286 }
287
288 if string("M") != string(buf[0:1]) {
289 log.Println("Not a matok file")
290 return nil
291 }
292
293 // Read based on length
294 mat.array = make([]uint32, arraySize)
295
296 dataArray, err := io.ReadAll(r)
297
298 if err == io.EOF {
299 log.Println(err)
300 return nil
301 }
302
303 if len(dataArray) < arraySize*4 {
Akron28031b72021-10-02 13:07:25 +0200304 log.Println("Not enough bytes read", len(dataArray), arraySize*4)
Akron16c312e2021-09-26 13:11:12 +0200305 return nil
306 }
307
308 for x := 0; x < arraySize; x++ {
Akron16c312e2021-09-26 13:11:12 +0200309 mat.array[x] = bo.Uint32(dataArray[x*4 : (x*4)+4])
310 }
311
312 return mat
313}
314
Akron1c34ce62021-09-23 23:27:39 +0200315func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akrone396a932021-10-19 01:06:13 +0200316 return mat.TransduceTokenWriter(r, NewTokenWriterSimple(w))
317}
318
319func (mat *MatrixTokenizer) TransduceTokenWriter(r io.Reader, w TokenWriterI) bool {
Akron1c34ce62021-09-23 23:27:39 +0200320 var a int
Akron16c312e2021-09-26 13:11:12 +0200321 var t0 uint32
322 t := uint32(1) // Initial state
Akron1c34ce62021-09-23 23:27:39 +0200323 var ok, rewindBuffer bool
324
325 // Remember the last position of a possible tokenend,
326 // in case the automaton fails.
Akron16c312e2021-09-26 13:11:12 +0200327 epsilonState := uint32(0)
Akron1c34ce62021-09-23 23:27:39 +0200328 epsilonOffset := 0
329
Akron5c82a922021-09-24 19:11:29 +0200330 // Remember if the last transition was epsilon
331 sentenceEnd := false
332
Akron1c34ce62021-09-23 23:27:39 +0200333 buffer := make([]rune, 1024)
334 buffo := 0 // Buffer offset
335 buffi := 0 // Buffer length
336
337 reader := bufio.NewReader(r)
Akrone396a932021-10-19 01:06:13 +0200338 defer w.Flush()
Akron1c34ce62021-09-23 23:27:39 +0200339
340 var char rune
341
342 var err error
343 eof := false
344 newchar := true
345
346PARSECHARM:
347 for {
348
349 if newchar {
350 // Get from reader if buffer is empty
351 if buffo >= buffi {
352 if eof {
353 break
354 }
355 char, _, err = reader.ReadRune()
356
357 // No more runes to read
358 if err != nil {
359 eof = true
360 break
361 }
362 buffer[buffi] = char
363 buffi++
364 }
365
366 char = buffer[buffo]
367
368 if DEBUG {
369 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
370 }
371
372 // TODO:
373 // Better not repeatedly check for a!
374 // Possibly keep a buffer with a.
375 if int(char) < 256 {
376 a = mat.sigmaASCII[int(char)]
377 } else {
378 a, ok = mat.sigma[char]
379 if !ok {
380 a = 0
381 }
382 }
383
384 // Use identity symbol if character is not in sigma
385 if a == 0 && mat.identity != -1 {
386 a = mat.identity
387 }
388
389 t0 = t
390
391 // Check for epsilon transitions and remember
392
Akron16c312e2021-09-26 13:11:12 +0200393 // TODO: Can t0 be negative here?
394 if mat.array[(mat.epsilon-1)*mat.stateCount+int(t0)] != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200395 // Remember state for backtracking to last tokenend state
Akron16c312e2021-09-26 13:11:12 +0200396
397 // Maybe not necessary - and should be simpler!
398 // Just Remove
399 t0 &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200400 epsilonState = t0
401 epsilonOffset = buffo
Akron16c312e2021-09-26 13:11:12 +0200402
403 if DEBUG {
404 fmt.Println("epsilonOffset is set to", buffo)
405 }
Akron1c34ce62021-09-23 23:27:39 +0200406 }
407 }
408
409 // Checks a transition based on t0, a and buffo
410 t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
Akron1c34ce62021-09-23 23:27:39 +0200411
412 if DEBUG {
413 // Char is only relevant if set
414 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
Akron1c34ce62021-09-23 23:27:39 +0200415 }
416
Akrone396a932021-10-19 01:06:13 +0200417 // Check if the transition is invalid according to the matrix
Akron1c34ce62021-09-23 23:27:39 +0200418 if t == 0 {
419
420 if DEBUG {
421 fmt.Println("Match is not fine!")
422 }
423
424 if !ok && a == mat.identity {
425
426 // Try again with unknown symbol, in case identity failed
427 // Char is only relevant when set
428 if DEBUG {
429 fmt.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
430 }
431 a = mat.unknown
432
433 } else if a != mat.epsilon {
434
435 // Try again with epsilon symbol, in case everything else failed
436 t0 = epsilonState
437 epsilonState = 0 // reset
438 buffo = epsilonOffset
439 a = mat.epsilon
440
441 if DEBUG {
442 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
443 }
444
445 } else {
446 break
447 }
448
449 newchar = false
450 continue
451 }
452
453 // Transition was successful
454 rewindBuffer = false
455
456 // Transition consumes a character
457 if a != mat.epsilon {
458
459 buffo++
460
461 // Transition does not produce a character
Akron16c312e2021-09-26 13:11:12 +0200462 if buffo == 1 && (t&FIRSTBIT) != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200463 if DEBUG {
464 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
465 }
466 rewindBuffer = true
467 }
468
469 } else {
470 // Transition marks the end of a token - so flush the buffer
Akrone396a932021-10-19 01:06:13 +0200471 if buffo > 0 {
Akron1c34ce62021-09-23 23:27:39 +0200472 if DEBUG {
473 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
474 }
Akrone396a932021-10-19 01:06:13 +0200475 w.Token(0, buffer[:buffo])
Akron1c34ce62021-09-23 23:27:39 +0200476 rewindBuffer = true
Akron5c82a922021-09-24 19:11:29 +0200477 sentenceEnd = false
478 } else {
479 sentenceEnd = true
Akrone396a932021-10-19 01:06:13 +0200480 w.SentenceEnd()
Akron1c34ce62021-09-23 23:27:39 +0200481 }
482 if DEBUG {
483 fmt.Println("-> Newline")
484 }
Akrone396a932021-10-19 01:06:13 +0200485 // writer.WriteRune('\n')
Akron1c34ce62021-09-23 23:27:39 +0200486 }
487
488 // Rewind the buffer if necessary
489 if rewindBuffer {
490
Akron16c312e2021-09-26 13:11:12 +0200491 if DEBUG {
492 fmt.Println("-> Rewind buffer", buffo, buffi, epsilonOffset)
493 }
494
Akron1c34ce62021-09-23 23:27:39 +0200495 // TODO: Better as a ring buffer
496 for x, i := range buffer[buffo:buffi] {
497 buffer[x] = i
498 }
499
500 buffi -= buffo
Akron16c312e2021-09-26 13:11:12 +0200501 // epsilonOffset -= buffo
502 epsilonOffset = 0
503 epsilonState = 0
504
Akron1c34ce62021-09-23 23:27:39 +0200505 buffo = 0
506 if DEBUG {
507 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
508 }
509 }
510
Akron16c312e2021-09-26 13:11:12 +0200511 t &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200512
513 newchar = true
514
515 // TODO:
516 // Prevent endless epsilon loops!
517 }
518
519 // Input reader is not yet finished
520 if !eof {
521 if DEBUG {
522 fmt.Println("Not at the end")
523 }
524 return false
525 }
526
527 if DEBUG {
528 fmt.Println("Entering final check")
529 }
Akron1c34ce62021-09-23 23:27:39 +0200530
531 // Check epsilon transitions until a final state is reached
532 t0 = t
Akron1c34ce62021-09-23 23:27:39 +0200533 t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
534 a = mat.epsilon
535 newchar = false
Akron1c34ce62021-09-23 23:27:39 +0200536 // t can't be < 0
Akron16c312e2021-09-26 13:11:12 +0200537 if t != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200538 // Remember state for backtracking to last tokenend state
539 goto PARSECHARM
540
541 } else if epsilonState != 0 {
542 t0 = epsilonState
543 epsilonState = 0 // reset
544 buffo = epsilonOffset
545 if DEBUG {
546 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
547 }
548 goto PARSECHARM
549 }
Akron1c34ce62021-09-23 23:27:39 +0200550
Akron5c82a922021-09-24 19:11:29 +0200551 // Add an additional sentence ending, if the file is over but no explicit
552 // sentence split was reached. This may be controversial and therefore
553 // optional via parameter.
554 if !sentenceEnd {
Akrone396a932021-10-19 01:06:13 +0200555 // writer.WriteRune('\n')
556 // ::Sentenceend
557 w.SentenceEnd()
Akron5c82a922021-09-24 19:11:29 +0200558 if DEBUG {
559 fmt.Println("-> Newline")
560 }
561 }
562
563 return true
Akron1c34ce62021-09-23 23:27:39 +0200564}