blob: eb88086bce09da2c6f202cfc0ab445aa788b5e8a [file] [log] [blame]
Akron1c34ce62021-09-23 23:27:39 +02001package datok
2
3import (
4 "bufio"
Akron16c312e2021-09-26 13:11:12 +02005 "compress/gzip"
Akron1c34ce62021-09-23 23:27:39 +02006 "io"
Akron16c312e2021-09-26 13:11:12 +02007 "log"
8 "os"
9)
10
const (
	// MAMAGIC is the magic header written at the start of serialized
	// matrix tokenizer data and checked again when parsing it. It is
	// also the value reported by MatrixTokenizer.Type.
	MAMAGIC = "MATOK"
	// EOT is the code point (4) that the transducer treats as an
	// explicit end-of-text marker in its input.
	EOT = 4
)
15
// MatrixTokenizer is a tokenizer whose transitions are stored in a
// flat matrix that is addressed by symbol number and state.
type MatrixTokenizer struct {
	// sigma maps input runes to their symbol numbers.
	sigma map[rune]int
	// sigmaASCII is a direct lookup table for runes below 256.
	sigmaASCII [256]int
	// array holds the transitions: the target of symbol a in state s
	// is stored at array[(a-1)*stateCount+s]. A value of 0 means that
	// there is no transition; FIRSTBIT marks nontoken transitions.
	array []uint32
	// stateCount is the state count taken over from the automaton.
	stateCount int

	// Special symbols in sigma
	epsilon  int
	unknown  int
	identity int
}
27
28// ToMatrix turns the intermediate tokenizer into a
29// matrix representation.
30func (auto *Automaton) ToMatrix() *MatrixTokenizer {
31
32 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +020033 sigma: make(map[rune]int),
34 unknown: auto.unknown,
35 identity: auto.identity,
36 epsilon: auto.epsilon,
Akron1c34ce62021-09-23 23:27:39 +020037 stateCount: auto.stateCount,
38 }
39
Akron00cecd12021-12-05 13:14:03 +010040 max := 0
41
42 // Init with identity
43 if mat.identity != -1 {
44 for i := 0; i < 256; i++ {
45 mat.sigmaASCII[i] = mat.identity
46 }
47 max = mat.identity
Akron4880fb62021-12-05 12:03:05 +010048 }
49
Akron1c34ce62021-09-23 23:27:39 +020050 for num, sym := range auto.sigmaRev {
51 if int(sym) < 256 {
52 mat.sigmaASCII[int(sym)] = num
53 }
54 mat.sigma[sym] = num
55 if num > auto.sigmaCount {
56 panic("sigmaCount is smaller")
57 }
Akron28031b72021-10-02 13:07:25 +020058 if num > max {
59 max = num
60 }
Akron1c34ce62021-09-23 23:27:39 +020061 }
Akron28031b72021-10-02 13:07:25 +020062 // Add final entry to the list (maybe not necessary actually)
63
Akron1c34ce62021-09-23 23:27:39 +020064 remember := make([]bool, auto.stateCount+2)
65
Akron28031b72021-10-02 13:07:25 +020066 // lower sigmaCount, as no final value exists
67 mat.array = make([]uint32, (auto.stateCount+1)*(max+1))
68
Akron1c34ce62021-09-23 23:27:39 +020069 // Store all transitions in matrix
Akron16c312e2021-09-26 13:11:12 +020070 var toMatrix func([]uint32, int)
Akron1c34ce62021-09-23 23:27:39 +020071
Akron16c312e2021-09-26 13:11:12 +020072 toMatrix = func(matrix []uint32, start int) {
Akron1c34ce62021-09-23 23:27:39 +020073 if start > auto.stateCount {
74 panic("stateCount is smaller")
75 }
76 if remember[start] {
77 return
78 }
79 remember[start] = true
80 for alpha, t := range auto.transitions[start] {
Akron16c312e2021-09-26 13:11:12 +020081 matrix[(alpha-1)*auto.stateCount+start] = uint32(t.end)
Akron1c34ce62021-09-23 23:27:39 +020082
83 // Mark nontoken transitions
84 if t.nontoken {
Akron16c312e2021-09-26 13:11:12 +020085 matrix[(alpha-1)*auto.stateCount+start] |= FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +020086 }
87
88 toMatrix(matrix, t.end)
89 }
90 }
91
92 toMatrix(mat.array, 1)
93
94 return mat
95}
96
Akron941f2152021-09-26 15:14:25 +020097// Type of tokenizer
98func (MatrixTokenizer) Type() string {
99 return MAMAGIC
100}
101
Akron16c312e2021-09-26 13:11:12 +0200102// Save stores the matrix data in a file
103func (mat *MatrixTokenizer) Save(file string) (n int64, err error) {
104 f, err := os.Create(file)
105 if err != nil {
106 log.Println(err)
107 return 0, err
108 }
109 defer f.Close()
110 gz := gzip.NewWriter(f)
111 defer gz.Close()
112 n, err = mat.WriteTo(gz)
113 if err != nil {
114 log.Println(err)
115 return n, err
116 }
117 gz.Flush()
118 return n, nil
119}
120
121// WriteTo stores the matrix data in an io.Writer.
122func (mat *MatrixTokenizer) WriteTo(w io.Writer) (n int64, err error) {
123
124 wb := bufio.NewWriter(w)
125 defer wb.Flush()
126
127 // Store magical header
128 all, err := wb.Write([]byte(MAMAGIC))
129 if err != nil {
130 log.Println(err)
131 return int64(all), err
132 }
133
134 // Get sigma as a list
Akron28031b72021-10-02 13:07:25 +0200135 // In datok it's 16 - 4*4
136 sigmalist := make([]rune, len(mat.sigma)+16)
Akron16c312e2021-09-26 13:11:12 +0200137 max := 0
138 for sym, num := range mat.sigma {
139 sigmalist[num] = sym
140 if num > max {
141 max = num
142 }
143 }
144
Akron28031b72021-10-02 13:07:25 +0200145 // Add final entry to the list (maybe not necessary actually)
Akron16c312e2021-09-26 13:11:12 +0200146 sigmalist = sigmalist[:max+1]
147
Akron28031b72021-10-02 13:07:25 +0200148 buf := make([]byte, 0, 14)
Akron16c312e2021-09-26 13:11:12 +0200149 bo.PutUint16(buf[0:2], VERSION)
150 bo.PutUint16(buf[2:4], uint16(mat.epsilon))
151 bo.PutUint16(buf[4:6], uint16(mat.unknown))
152 bo.PutUint16(buf[6:8], uint16(mat.identity))
Akron28031b72021-10-02 13:07:25 +0200153 bo.PutUint32(buf[8:12], uint32(mat.stateCount))
154 bo.PutUint16(buf[12:14], uint16(len(sigmalist)))
155 more, err := wb.Write(buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200156 if err != nil {
157 log.Println(err)
158 return int64(all), err
159 }
160
161 all += more
162
163 // Write sigma
164 for _, sym := range sigmalist {
165
166 more, err = wb.WriteRune(sym)
167 if err != nil {
168 log.Println(err)
169 return int64(all), err
170 }
171 all += more
172 }
173
174 if err != nil {
175 log.Println(err)
176 return int64(all), err
177 }
178
179 // Test marker - could be checksum
180 more, err = wb.Write([]byte("M"))
181 if err != nil {
182 log.Println(err)
183 return int64(all), err
184 }
185 all += more
186
Akron16c312e2021-09-26 13:11:12 +0200187 for _, x := range mat.array {
188 bo.PutUint32(buf[0:4], uint32(x))
189 more, err = wb.Write(buf[0:4])
190 if err != nil {
191 log.Println(err)
192 return int64(all), err
193 }
194 all += more
195 if more != 4 {
196 log.Println("Can not write base uint32")
197 return int64(all), err
198 }
Akron16c312e2021-09-26 13:11:12 +0200199 }
200
201 return int64(all), err
202}
203
204// LoadDatokFile reads a double array represented tokenizer
205// from a file.
206func LoadMatrixFile(file string) *MatrixTokenizer {
207 f, err := os.Open(file)
208 if err != nil {
209 log.Println(err)
210 return nil
211 }
212 defer f.Close()
213
214 gz, err := gzip.NewReader(f)
215 if err != nil {
216 log.Println(err)
217 return nil
218 }
219 defer gz.Close()
220
221 // Todo: Read the whole file!
222 return ParseMatrix(gz)
223}
224
225// LoadMatrixFile reads a matrix represented tokenizer
226// from an io.Reader
227func ParseMatrix(ior io.Reader) *MatrixTokenizer {
228
229 // Initialize tokenizer with default values
230 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +0200231 sigma: make(map[rune]int),
232 epsilon: 0,
233 unknown: 0,
234 identity: 0,
Akron16c312e2021-09-26 13:11:12 +0200235 stateCount: 0,
Akron16c312e2021-09-26 13:11:12 +0200236 }
237
238 r := bufio.NewReader(ior)
239
240 buf := make([]byte, 1024)
241 buf = buf[0:len(MAMAGIC)]
242
243 _, err := r.Read(buf)
244
245 if err != nil {
246 log.Println(err)
247 return nil
248 }
249
250 if string(MAMAGIC) != string(buf) {
251 log.Println("Not a matok file")
252 return nil
253 }
254
Akron28031b72021-10-02 13:07:25 +0200255 more, err := io.ReadFull(r, buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200256 if err != nil {
257 log.Println(err)
258 return nil
259 }
260
Akron28031b72021-10-02 13:07:25 +0200261 if more != 14 {
Akron16c312e2021-09-26 13:11:12 +0200262 log.Println("Read bytes do not fit")
263 return nil
264 }
265
266 version := bo.Uint16(buf[0:2])
267
268 if version != VERSION {
269 log.Println("Version not compatible")
270 return nil
271 }
272
273 mat.epsilon = int(bo.Uint16(buf[2:4]))
274 mat.unknown = int(bo.Uint16(buf[4:6]))
275 mat.identity = int(bo.Uint16(buf[6:8]))
Akron28031b72021-10-02 13:07:25 +0200276 mat.stateCount = int(bo.Uint32(buf[8:12]))
277 sigmaCount := int(bo.Uint16(buf[12:14]))
278 arraySize := (mat.stateCount + 1) * sigmaCount
Akron16c312e2021-09-26 13:11:12 +0200279
Akron00cecd12021-12-05 13:14:03 +0100280 // Init with identity
281 if mat.identity != -1 {
282 for i := 0; i < 256; i++ {
283 mat.sigmaASCII[i] = mat.identity
284 }
285 }
286
Akron16c312e2021-09-26 13:11:12 +0200287 for x := 0; x < sigmaCount; x++ {
288 sym, _, err := r.ReadRune()
289 if err == nil && sym != 0 {
290 if int(sym) < 256 {
291 mat.sigmaASCII[int(sym)] = x
292 }
293 mat.sigma[sym] = x
294 }
295 }
296
297 _, err = io.ReadFull(r, buf[0:1])
298
299 if err != nil {
300 log.Print(err)
301 return nil
302 }
303
304 if string("M") != string(buf[0:1]) {
305 log.Println("Not a matok file")
306 return nil
307 }
308
309 // Read based on length
310 mat.array = make([]uint32, arraySize)
311
312 dataArray, err := io.ReadAll(r)
313
314 if err == io.EOF {
315 log.Println(err)
316 return nil
317 }
318
319 if len(dataArray) < arraySize*4 {
Akron28031b72021-10-02 13:07:25 +0200320 log.Println("Not enough bytes read", len(dataArray), arraySize*4)
Akron16c312e2021-09-26 13:11:12 +0200321 return nil
322 }
323
324 for x := 0; x < arraySize; x++ {
Akron16c312e2021-09-26 13:11:12 +0200325 mat.array[x] = bo.Uint32(dataArray[x*4 : (x*4)+4])
326 }
327
328 return mat
329}
330
Akron98fbfef2021-10-23 17:02:11 +0200331// Transduce input to ouutput
Akron1c34ce62021-09-23 23:27:39 +0200332func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron96fdc9b2021-10-27 21:11:17 +0200333 return mat.TransduceTokenWriter(r, NewTokenWriter(w, SIMPLE))
Akrone396a932021-10-19 01:06:13 +0200334}
335
Akron98fbfef2021-10-23 17:02:11 +0200336// TransduceTokenWriter transduces an input string against
337// the matrix FSA. The rules are always greedy. If the
338// automaton fails, it takes the last possible token ending
339// branch.
Akron4f6b28c2021-10-25 00:52:03 +0200340func (mat *MatrixTokenizer) TransduceTokenWriter(r io.Reader, w *TokenWriter) bool {
Akron1c34ce62021-09-23 23:27:39 +0200341 var a int
Akron16c312e2021-09-26 13:11:12 +0200342 var t0 uint32
343 t := uint32(1) // Initial state
Akron1c34ce62021-09-23 23:27:39 +0200344 var ok, rewindBuffer bool
345
346 // Remember the last position of a possible tokenend,
347 // in case the automaton fails.
Akron16c312e2021-09-26 13:11:12 +0200348 epsilonState := uint32(0)
Akron1c34ce62021-09-23 23:27:39 +0200349 epsilonOffset := 0
350
Akron5c82a922021-09-24 19:11:29 +0200351 // Remember if the last transition was epsilon
352 sentenceEnd := false
353
Akrona854faa2021-10-22 19:31:08 +0200354 // Remember if a text end was already set
355 textEnd := false
356
Akron1c34ce62021-09-23 23:27:39 +0200357 buffer := make([]rune, 1024)
Akron98fbfef2021-10-23 17:02:11 +0200358 bufft := 0 // Buffer token offset
359 buffc := 0 // Buffer current symbol
Akron1c34ce62021-09-23 23:27:39 +0200360 buffi := 0 // Buffer length
361
Akron98fbfef2021-10-23 17:02:11 +0200362 // The buffer is organized as follows:
363 // [ t[....c..]..i]
364
Akron1c34ce62021-09-23 23:27:39 +0200365 reader := bufio.NewReader(r)
Akrone396a932021-10-19 01:06:13 +0200366 defer w.Flush()
Akron1c34ce62021-09-23 23:27:39 +0200367
368 var char rune
369
370 var err error
371 eof := false
Akrona854faa2021-10-22 19:31:08 +0200372 eot := false
Akron1c34ce62021-09-23 23:27:39 +0200373 newchar := true
374
375PARSECHARM:
376 for {
377
378 if newchar {
379 // Get from reader if buffer is empty
Akron98fbfef2021-10-23 17:02:11 +0200380 if buffc >= buffi {
Akron1c34ce62021-09-23 23:27:39 +0200381 if eof {
382 break
383 }
384 char, _, err = reader.ReadRune()
385
386 // No more runes to read
387 if err != nil {
Akron274600e2021-11-03 20:09:06 +0100388 if err == io.EOF {
389 eof = true
390 break
391 }
392
393 log.Fatalln(err)
394 os.Exit(1)
395 return false
Akron1c34ce62021-09-23 23:27:39 +0200396 }
Akron274600e2021-11-03 20:09:06 +0100397
Akron1c34ce62021-09-23 23:27:39 +0200398 buffer[buffi] = char
399 buffi++
400 }
401
Akron98fbfef2021-10-23 17:02:11 +0200402 char = buffer[buffc]
Akron1c34ce62021-09-23 23:27:39 +0200403
404 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100405 log.Println("Current char", string(char), int(char), showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200406 }
407
Akrona854faa2021-10-22 19:31:08 +0200408 eot = false
409
Akron1c34ce62021-09-23 23:27:39 +0200410 // TODO:
411 // Better not repeatedly check for a!
412 // Possibly keep a buffer with a.
413 if int(char) < 256 {
Akrona854faa2021-10-22 19:31:08 +0200414 if int(char) == EOT {
415 eot = true
416 }
Akrondf275812022-03-27 12:54:46 +0200417
418 // mat.SigmaASCII[] is initialized with mat.identity
Akron1c34ce62021-09-23 23:27:39 +0200419 a = mat.sigmaASCII[int(char)]
420 } else {
421 a, ok = mat.sigma[char]
Akron1c34ce62021-09-23 23:27:39 +0200422
Akron4880fb62021-12-05 12:03:05 +0100423 // Use identity symbol if character is not in sigma
424 if !ok && mat.identity != -1 {
Akrondf275812022-03-27 12:54:46 +0200425
426 // TODO: Maybe use unknown?
Akron4880fb62021-12-05 12:03:05 +0100427 a = mat.identity
428 }
Akron1c34ce62021-09-23 23:27:39 +0200429 }
430
431 t0 = t
432
433 // Check for epsilon transitions and remember
434
Akron16c312e2021-09-26 13:11:12 +0200435 // TODO: Can t0 be negative here?
436 if mat.array[(mat.epsilon-1)*mat.stateCount+int(t0)] != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200437 // Remember state for backtracking to last tokenend state
Akron16c312e2021-09-26 13:11:12 +0200438
439 // Maybe not necessary - and should be simpler!
440 // Just Remove
Akrondf275812022-03-27 12:54:46 +0200441 // t0 &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200442 epsilonState = t0
Akron98fbfef2021-10-23 17:02:11 +0200443 epsilonOffset = buffc
Akron16c312e2021-09-26 13:11:12 +0200444
445 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100446 log.Println("epsilonOffset is set to", buffc)
Akron16c312e2021-09-26 13:11:12 +0200447 }
Akron1c34ce62021-09-23 23:27:39 +0200448 }
449 }
450
Akrondf275812022-03-27 12:54:46 +0200451 // can happen when no identity is defined.
452 // This shouldn't be tested in every loop
453 if a == 0 {
454 t = 0
455 } else {
456 // Checks a transition based on t0, a and buffo
457 t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
458 }
Akron1c34ce62021-09-23 23:27:39 +0200459
460 if DEBUG {
461 // Char is only relevant if set
Akron9c3bf7f2021-11-03 19:52:12 +0100462 log.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
Akron1c34ce62021-09-23 23:27:39 +0200463 }
464
Akrone396a932021-10-19 01:06:13 +0200465 // Check if the transition is invalid according to the matrix
Akron1c34ce62021-09-23 23:27:39 +0200466 if t == 0 {
467
468 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100469 log.Println("Match is not fine!")
Akron1c34ce62021-09-23 23:27:39 +0200470 }
471
472 if !ok && a == mat.identity {
473
474 // Try again with unknown symbol, in case identity failed
475 // Char is only relevant when set
476 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100477 log.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
Akron1c34ce62021-09-23 23:27:39 +0200478 }
479 a = mat.unknown
480
Akrondf275812022-03-27 12:54:46 +0200481 } else if a != mat.epsilon && epsilonState != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200482
483 // Try again with epsilon symbol, in case everything else failed
484 t0 = epsilonState
485 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +0200486 buffc = epsilonOffset
Akron1c34ce62021-09-23 23:27:39 +0200487 a = mat.epsilon
488
489 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100490 log.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200491 }
492
493 } else {
Akrondf275812022-03-27 12:54:46 +0200494
495 if DEBUG {
496 log.Println("Fail!")
497 }
498
499 // w.Fail(bufft)
500
501 // The following procedure means the automaton fails to consume a certain character.
502 // In the tokenization scenario, this means, the tokenizer will drop the old or current data as a
503 // token and start blank at the root node of the automaton for the remaining data.
504 // It may be beneficial to have something like a "drop()" event to capture these cases,
505 // as they are likely the result of a bad automaton design.
Akroncae39112023-04-26 19:43:16 +0200506
507 // fmt.Println("Problem", len(buffer), buffc, bufft)
508
509 if buffc-bufft <= 0 {
Akrondf275812022-03-27 12:54:46 +0200510 buffc++
Akroncae39112023-04-26 19:43:16 +0200511 if buffc == 0 {
512 eof = true
513 break
514 }
Akrondf275812022-03-27 12:54:46 +0200515 }
516
517 if DEBUG {
518 log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
519 }
Akroncae39112023-04-26 19:43:16 +0200520
Akrondf275812022-03-27 12:54:46 +0200521 w.Token(bufft, buffer[:buffc])
522
523 sentenceEnd = false
524 textEnd = false
525
526 if DEBUG {
527 log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
528 }
529
530 for x, i := range buffer[buffc:buffi] {
531 buffer[x] = i
532 }
533
534 buffi -= buffc
535 epsilonState = 0
536
537 buffc = 0
538 bufft = 0
539
540 a = mat.epsilon
541
542 // Restart from root state
543 t = uint32(1)
544 newchar = true
545 // goto PARSECHARM
546 continue
Akron1c34ce62021-09-23 23:27:39 +0200547 }
548
549 newchar = false
Akrona854faa2021-10-22 19:31:08 +0200550 eot = false
Akron1c34ce62021-09-23 23:27:39 +0200551 continue
552 }
553
554 // Transition was successful
555 rewindBuffer = false
556
Akron90aa45b2021-11-16 23:28:17 +0100557 // Transition consumes no character
558 if a == mat.epsilon {
Akron1c34ce62021-09-23 23:27:39 +0200559 // Transition marks the end of a token - so flush the buffer
Akron98fbfef2021-10-23 17:02:11 +0200560 if buffc-bufft > 0 {
Akron1c34ce62021-09-23 23:27:39 +0200561 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100562 log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200563 }
Akron32416ce2021-10-23 17:09:41 +0200564 w.Token(bufft, buffer[:buffc])
Akron1c34ce62021-09-23 23:27:39 +0200565 rewindBuffer = true
Akron5c82a922021-09-24 19:11:29 +0200566 sentenceEnd = false
Akrona854faa2021-10-22 19:31:08 +0200567 textEnd = false
Akron5c82a922021-09-24 19:11:29 +0200568 } else {
569 sentenceEnd = true
Akron4f6b28c2021-10-25 00:52:03 +0200570 w.SentenceEnd(buffc)
Akron1c34ce62021-09-23 23:27:39 +0200571 }
Akron90aa45b2021-11-16 23:28:17 +0100572
573 // Transition consumes a character
574 } else {
575 buffc++
576
577 // Transition does not produce a character
578 if buffc-bufft == 1 && (t&FIRSTBIT) != 0 {
579 if DEBUG {
580 log.Println("Nontoken forward", showBufferNew(buffer, bufft, buffc, buffi))
581 }
582 bufft++
583 // rewindBuffer = true
584 }
Akron1c34ce62021-09-23 23:27:39 +0200585 }
586
Akron8cc2dd92021-10-25 19:49:41 +0200587 if eot {
588 eot = false
589 textEnd = true
590 w.TextEnd(buffc)
591 rewindBuffer = true
592 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100593 log.Println("END OF TEXT")
Akron8cc2dd92021-10-25 19:49:41 +0200594 }
595 }
596
Akron1c34ce62021-09-23 23:27:39 +0200597 // Rewind the buffer if necessary
598 if rewindBuffer {
599
Akron16c312e2021-09-26 13:11:12 +0200600 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100601 log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
Akron16c312e2021-09-26 13:11:12 +0200602 }
603
Akron22c565a2021-11-28 17:31:36 +0100604 // buffer = buffer[buffc:]
605 for x, i := range buffer[buffc:buffi] {
606 buffer[x] = i
607 }
Akron1c34ce62021-09-23 23:27:39 +0200608
Akron98fbfef2021-10-23 17:02:11 +0200609 buffi -= buffc
Akron16c312e2021-09-26 13:11:12 +0200610 // epsilonOffset -= buffo
611 epsilonOffset = 0
612 epsilonState = 0
613
Akron98fbfef2021-10-23 17:02:11 +0200614 buffc = 0
615 bufft = 0
Akrona854faa2021-10-22 19:31:08 +0200616
Akron98fbfef2021-10-23 17:02:11 +0200617 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100618 log.Println("Remaining:", showBufferNew(buffer, bufft, buffc, buffi))
Akrona854faa2021-10-22 19:31:08 +0200619 }
Akron1c34ce62021-09-23 23:27:39 +0200620 }
621
Akron16c312e2021-09-26 13:11:12 +0200622 t &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200623
624 newchar = true
625
626 // TODO:
627 // Prevent endless epsilon loops!
628 }
629
630 // Input reader is not yet finished
631 if !eof {
632 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100633 log.Println("Not at the end")
Akron1c34ce62021-09-23 23:27:39 +0200634 }
Akrondf275812022-03-27 12:54:46 +0200635 // This should never happen
Akron1c34ce62021-09-23 23:27:39 +0200636 return false
637 }
638
639 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100640 log.Println("Entering final check")
Akron1c34ce62021-09-23 23:27:39 +0200641 }
Akron1c34ce62021-09-23 23:27:39 +0200642
Akrona854faa2021-10-22 19:31:08 +0200643 // Check epsilon transitions as long as possible
Akron1c34ce62021-09-23 23:27:39 +0200644 t0 = t
Akron1c34ce62021-09-23 23:27:39 +0200645 t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
646 a = mat.epsilon
647 newchar = false
Akron1c34ce62021-09-23 23:27:39 +0200648 // t can't be < 0
Akron16c312e2021-09-26 13:11:12 +0200649 if t != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200650 // Remember state for backtracking to last tokenend state
651 goto PARSECHARM
652
653 } else if epsilonState != 0 {
654 t0 = epsilonState
655 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +0200656 buffc = epsilonOffset
Akron1c34ce62021-09-23 23:27:39 +0200657 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100658 log.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200659 }
660 goto PARSECHARM
661 }
Akron1c34ce62021-09-23 23:27:39 +0200662
Akrondf275812022-03-27 12:54:46 +0200663 // something left in buffer
664 if buffc-bufft > 0 {
665 if DEBUG {
666 log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
667 }
668 w.Token(bufft, buffer[:buffc])
669 sentenceEnd = false
670 textEnd = false
671 }
672
Akron5c82a922021-09-24 19:11:29 +0200673 // Add an additional sentence ending, if the file is over but no explicit
674 // sentence split was reached. This may be controversial and therefore
675 // optional via parameter.
676 if !sentenceEnd {
Akron4f6b28c2021-10-25 00:52:03 +0200677 w.SentenceEnd(buffc)
Akron5c82a922021-09-24 19:11:29 +0200678 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100679 log.Println("Sentence end")
Akrona854faa2021-10-22 19:31:08 +0200680 }
681 }
682
683 if !textEnd {
Akron4f6b28c2021-10-25 00:52:03 +0200684 w.TextEnd(buffc)
Akrona854faa2021-10-22 19:31:08 +0200685 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100686 log.Println("Text end")
Akron5c82a922021-09-24 19:11:29 +0200687 }
688 }
689
690 return true
Akron1c34ce62021-09-23 23:27:39 +0200691}