blob: 52e3ad71507d5233b9050f2ca8c781fe44f0634c [file] [log] [blame]
Akron1c34ce62021-09-23 23:27:39 +02001package datok
2
import (
	"bufio"
	"compress/gzip"
	"errors"
	"io"
	"log"
	"os"
)
10
const (
	// MAMAGIC is the magic header that is written at the very start of a
	// serialized matrix tokenizer. It is also the value returned by Type.
	MAMAGIC = "MATOK"

	// EOT is the code point that input characters are compared against
	// to detect an explicit end of text.
	EOT = 0x04
)
15
// MatrixTokenizer is a tokenizer whose transitions are stored
// in a flat matrix.
type MatrixTokenizer struct {
	// sigma maps a symbol to its symbol number.
	sigma map[rune]int

	// sigmaASCII is the same mapping as a lookup table
	// for code points below 256.
	sigmaASCII [256]int

	// array holds the target state of a transition at index
	// (symbol-1)*stateCount+state. A value of 0 means that
	// no transition exists.
	array []uint32

	// stateCount is the row stride used to index array.
	stateCount int

	// Special symbols in sigma
	epsilon, unknown, identity int
}
27
28// ToMatrix turns the intermediate tokenizer into a
29// matrix representation.
30func (auto *Automaton) ToMatrix() *MatrixTokenizer {
31
32 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +020033 sigma: make(map[rune]int),
34 unknown: auto.unknown,
35 identity: auto.identity,
36 epsilon: auto.epsilon,
Akron1c34ce62021-09-23 23:27:39 +020037 stateCount: auto.stateCount,
38 }
39
Akron00cecd12021-12-05 13:14:03 +010040 max := 0
41
42 // Init with identity
43 if mat.identity != -1 {
44 for i := 0; i < 256; i++ {
45 mat.sigmaASCII[i] = mat.identity
46 }
47 max = mat.identity
Akron4880fb62021-12-05 12:03:05 +010048 }
49
Akron1c34ce62021-09-23 23:27:39 +020050 for num, sym := range auto.sigmaRev {
51 if int(sym) < 256 {
52 mat.sigmaASCII[int(sym)] = num
53 }
54 mat.sigma[sym] = num
55 if num > auto.sigmaCount {
56 panic("sigmaCount is smaller")
57 }
Akron8e803932023-04-18 10:19:19 +020058
59 // Find max
60 // see https://dev.to/jobinrjohnson/branchless-programming-does-it-really-matter-20j4
61 max -= ((max - num) & ((max - num) >> 31))
62 // if num > max {
63 // max = num
64 // }
Akron1c34ce62021-09-23 23:27:39 +020065 }
Akron28031b72021-10-02 13:07:25 +020066 // Add final entry to the list (maybe not necessary actually)
67
Akron1c34ce62021-09-23 23:27:39 +020068 remember := make([]bool, auto.stateCount+2)
69
Akron28031b72021-10-02 13:07:25 +020070 // lower sigmaCount, as no final value exists
71 mat.array = make([]uint32, (auto.stateCount+1)*(max+1))
72
Akron1c34ce62021-09-23 23:27:39 +020073 // Store all transitions in matrix
Akron16c312e2021-09-26 13:11:12 +020074 var toMatrix func([]uint32, int)
Akron1c34ce62021-09-23 23:27:39 +020075
Akron16c312e2021-09-26 13:11:12 +020076 toMatrix = func(matrix []uint32, start int) {
Akron1c34ce62021-09-23 23:27:39 +020077 if start > auto.stateCount {
78 panic("stateCount is smaller")
79 }
80 if remember[start] {
81 return
82 }
83 remember[start] = true
84 for alpha, t := range auto.transitions[start] {
Akron16c312e2021-09-26 13:11:12 +020085 matrix[(alpha-1)*auto.stateCount+start] = uint32(t.end)
Akron1c34ce62021-09-23 23:27:39 +020086
87 // Mark nontoken transitions
88 if t.nontoken {
Akron16c312e2021-09-26 13:11:12 +020089 matrix[(alpha-1)*auto.stateCount+start] |= FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +020090 }
91
92 toMatrix(matrix, t.end)
93 }
94 }
95
96 toMatrix(mat.array, 1)
97
98 return mat
99}
100
Akron941f2152021-09-26 15:14:25 +0200101// Type of tokenizer
102func (MatrixTokenizer) Type() string {
103 return MAMAGIC
104}
105
Akron16c312e2021-09-26 13:11:12 +0200106// Save stores the matrix data in a file
107func (mat *MatrixTokenizer) Save(file string) (n int64, err error) {
108 f, err := os.Create(file)
109 if err != nil {
110 log.Println(err)
111 return 0, err
112 }
113 defer f.Close()
114 gz := gzip.NewWriter(f)
115 defer gz.Close()
116 n, err = mat.WriteTo(gz)
117 if err != nil {
118 log.Println(err)
119 return n, err
120 }
121 gz.Flush()
122 return n, nil
123}
124
125// WriteTo stores the matrix data in an io.Writer.
126func (mat *MatrixTokenizer) WriteTo(w io.Writer) (n int64, err error) {
127
128 wb := bufio.NewWriter(w)
129 defer wb.Flush()
130
131 // Store magical header
132 all, err := wb.Write([]byte(MAMAGIC))
133 if err != nil {
134 log.Println(err)
135 return int64(all), err
136 }
137
138 // Get sigma as a list
Akron28031b72021-10-02 13:07:25 +0200139 // In datok it's 16 - 4*4
140 sigmalist := make([]rune, len(mat.sigma)+16)
Akron16c312e2021-09-26 13:11:12 +0200141 max := 0
142 for sym, num := range mat.sigma {
143 sigmalist[num] = sym
Akron8e803932023-04-18 10:19:19 +0200144
145 // Find max
146 // see https://dev.to/jobinrjohnson/branchless-programming-does-it-really-matter-20j4
147 max -= ((max - num) & ((max - num) >> 31))
148 // if num > max {
149 // max = num
150 // }
Akron16c312e2021-09-26 13:11:12 +0200151 }
152
Akron28031b72021-10-02 13:07:25 +0200153 // Add final entry to the list (maybe not necessary actually)
Akron16c312e2021-09-26 13:11:12 +0200154 sigmalist = sigmalist[:max+1]
155
Akron28031b72021-10-02 13:07:25 +0200156 buf := make([]byte, 0, 14)
Akron16c312e2021-09-26 13:11:12 +0200157 bo.PutUint16(buf[0:2], VERSION)
158 bo.PutUint16(buf[2:4], uint16(mat.epsilon))
159 bo.PutUint16(buf[4:6], uint16(mat.unknown))
160 bo.PutUint16(buf[6:8], uint16(mat.identity))
Akron28031b72021-10-02 13:07:25 +0200161 bo.PutUint32(buf[8:12], uint32(mat.stateCount))
162 bo.PutUint16(buf[12:14], uint16(len(sigmalist)))
163 more, err := wb.Write(buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200164 if err != nil {
165 log.Println(err)
166 return int64(all), err
167 }
168
169 all += more
170
171 // Write sigma
172 for _, sym := range sigmalist {
173
174 more, err = wb.WriteRune(sym)
175 if err != nil {
176 log.Println(err)
177 return int64(all), err
178 }
179 all += more
180 }
181
182 if err != nil {
183 log.Println(err)
184 return int64(all), err
185 }
186
187 // Test marker - could be checksum
188 more, err = wb.Write([]byte("M"))
189 if err != nil {
190 log.Println(err)
191 return int64(all), err
192 }
193 all += more
194
Akron16c312e2021-09-26 13:11:12 +0200195 for _, x := range mat.array {
196 bo.PutUint32(buf[0:4], uint32(x))
197 more, err = wb.Write(buf[0:4])
198 if err != nil {
199 log.Println(err)
200 return int64(all), err
201 }
202 all += more
203 if more != 4 {
204 log.Println("Can not write base uint32")
205 return int64(all), err
206 }
Akron16c312e2021-09-26 13:11:12 +0200207 }
208
209 return int64(all), err
210}
211
Akronb84d4692024-08-20 14:33:00 +0200212// LoadMatrixFile reads a matrix represented tokenizer
Akron16c312e2021-09-26 13:11:12 +0200213// from a file.
214func LoadMatrixFile(file string) *MatrixTokenizer {
215 f, err := os.Open(file)
216 if err != nil {
217 log.Println(err)
218 return nil
219 }
220 defer f.Close()
221
222 gz, err := gzip.NewReader(f)
223 if err != nil {
224 log.Println(err)
225 return nil
226 }
227 defer gz.Close()
228
229 // Todo: Read the whole file!
230 return ParseMatrix(gz)
231}
232
Akronb84d4692024-08-20 14:33:00 +0200233// ParseMatrix reads a matrix represented tokenizer
Akron16c312e2021-09-26 13:11:12 +0200234// from an io.Reader
235func ParseMatrix(ior io.Reader) *MatrixTokenizer {
236
237 // Initialize tokenizer with default values
238 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +0200239 sigma: make(map[rune]int),
240 epsilon: 0,
241 unknown: 0,
242 identity: 0,
Akron16c312e2021-09-26 13:11:12 +0200243 stateCount: 0,
Akron16c312e2021-09-26 13:11:12 +0200244 }
245
246 r := bufio.NewReader(ior)
247
248 buf := make([]byte, 1024)
249 buf = buf[0:len(MAMAGIC)]
250
251 _, err := r.Read(buf)
252
253 if err != nil {
254 log.Println(err)
255 return nil
256 }
257
258 if string(MAMAGIC) != string(buf) {
259 log.Println("Not a matok file")
260 return nil
261 }
262
Akron28031b72021-10-02 13:07:25 +0200263 more, err := io.ReadFull(r, buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200264 if err != nil {
265 log.Println(err)
266 return nil
267 }
268
Akron28031b72021-10-02 13:07:25 +0200269 if more != 14 {
Akron16c312e2021-09-26 13:11:12 +0200270 log.Println("Read bytes do not fit")
271 return nil
272 }
273
274 version := bo.Uint16(buf[0:2])
275
276 if version != VERSION {
277 log.Println("Version not compatible")
278 return nil
279 }
280
281 mat.epsilon = int(bo.Uint16(buf[2:4]))
282 mat.unknown = int(bo.Uint16(buf[4:6]))
283 mat.identity = int(bo.Uint16(buf[6:8]))
Akron28031b72021-10-02 13:07:25 +0200284 mat.stateCount = int(bo.Uint32(buf[8:12]))
285 sigmaCount := int(bo.Uint16(buf[12:14]))
286 arraySize := (mat.stateCount + 1) * sigmaCount
Akron16c312e2021-09-26 13:11:12 +0200287
Akron00cecd12021-12-05 13:14:03 +0100288 // Init with identity
289 if mat.identity != -1 {
290 for i := 0; i < 256; i++ {
291 mat.sigmaASCII[i] = mat.identity
292 }
293 }
294
Akron16c312e2021-09-26 13:11:12 +0200295 for x := 0; x < sigmaCount; x++ {
296 sym, _, err := r.ReadRune()
297 if err == nil && sym != 0 {
298 if int(sym) < 256 {
299 mat.sigmaASCII[int(sym)] = x
300 }
301 mat.sigma[sym] = x
302 }
303 }
304
305 _, err = io.ReadFull(r, buf[0:1])
306
307 if err != nil {
308 log.Print(err)
309 return nil
310 }
311
312 if string("M") != string(buf[0:1]) {
313 log.Println("Not a matok file")
314 return nil
315 }
316
317 // Read based on length
318 mat.array = make([]uint32, arraySize)
319
320 dataArray, err := io.ReadAll(r)
321
322 if err == io.EOF {
323 log.Println(err)
324 return nil
325 }
326
327 if len(dataArray) < arraySize*4 {
Akron28031b72021-10-02 13:07:25 +0200328 log.Println("Not enough bytes read", len(dataArray), arraySize*4)
Akron16c312e2021-09-26 13:11:12 +0200329 return nil
330 }
331
332 for x := 0; x < arraySize; x++ {
Akron16c312e2021-09-26 13:11:12 +0200333 mat.array[x] = bo.Uint32(dataArray[x*4 : (x*4)+4])
334 }
335
336 return mat
337}
338
Akron98fbfef2021-10-23 17:02:11 +0200339// Transduce input to ouutput
Akron1c34ce62021-09-23 23:27:39 +0200340func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron96fdc9b2021-10-27 21:11:17 +0200341 return mat.TransduceTokenWriter(r, NewTokenWriter(w, SIMPLE))
Akrone396a932021-10-19 01:06:13 +0200342}
343
Akron98fbfef2021-10-23 17:02:11 +0200344// TransduceTokenWriter transduces an input string against
345// the matrix FSA. The rules are always greedy. If the
346// automaton fails, it takes the last possible token ending
347// branch.
Akron4f6b28c2021-10-25 00:52:03 +0200348func (mat *MatrixTokenizer) TransduceTokenWriter(r io.Reader, w *TokenWriter) bool {
Akron1c34ce62021-09-23 23:27:39 +0200349 var a int
Akron16c312e2021-09-26 13:11:12 +0200350 var t0 uint32
351 t := uint32(1) // Initial state
Akron1c34ce62021-09-23 23:27:39 +0200352 var ok, rewindBuffer bool
353
354 // Remember the last position of a possible tokenend,
355 // in case the automaton fails.
Akron16c312e2021-09-26 13:11:12 +0200356 epsilonState := uint32(0)
Akron1c34ce62021-09-23 23:27:39 +0200357 epsilonOffset := 0
358
Akron5c82a922021-09-24 19:11:29 +0200359 // Remember if the last transition was epsilon
360 sentenceEnd := false
361
Akrona854faa2021-10-22 19:31:08 +0200362 // Remember if a text end was already set
363 textEnd := false
364
Akron1c34ce62021-09-23 23:27:39 +0200365 buffer := make([]rune, 1024)
Akron98fbfef2021-10-23 17:02:11 +0200366 bufft := 0 // Buffer token offset
367 buffc := 0 // Buffer current symbol
Akron1c34ce62021-09-23 23:27:39 +0200368 buffi := 0 // Buffer length
369
Akron98fbfef2021-10-23 17:02:11 +0200370 // The buffer is organized as follows:
371 // [ t[....c..]..i]
372
Akron1c34ce62021-09-23 23:27:39 +0200373 reader := bufio.NewReader(r)
Akrone396a932021-10-19 01:06:13 +0200374 defer w.Flush()
Akron1c34ce62021-09-23 23:27:39 +0200375
376 var char rune
377
378 var err error
379 eof := false
Akrona854faa2021-10-22 19:31:08 +0200380 eot := false
Akron1c34ce62021-09-23 23:27:39 +0200381 newchar := true
382
383PARSECHARM:
384 for {
385
386 if newchar {
387 // Get from reader if buffer is empty
Akron98fbfef2021-10-23 17:02:11 +0200388 if buffc >= buffi {
Akron1c34ce62021-09-23 23:27:39 +0200389 if eof {
390 break
391 }
392 char, _, err = reader.ReadRune()
393
394 // No more runes to read
395 if err != nil {
Akron274600e2021-11-03 20:09:06 +0100396 if err == io.EOF {
397 eof = true
398 break
399 }
400
401 log.Fatalln(err)
402 os.Exit(1)
403 return false
Akron1c34ce62021-09-23 23:27:39 +0200404 }
Akron274600e2021-11-03 20:09:06 +0100405
Akron1c34ce62021-09-23 23:27:39 +0200406 buffer[buffi] = char
407 buffi++
408 }
409
Akron98fbfef2021-10-23 17:02:11 +0200410 char = buffer[buffc]
Akron1c34ce62021-09-23 23:27:39 +0200411
412 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100413 log.Println("Current char", string(char), int(char), showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200414 }
415
Akrona854faa2021-10-22 19:31:08 +0200416 eot = false
417
Akron1c34ce62021-09-23 23:27:39 +0200418 // TODO:
419 // Better not repeatedly check for a!
420 // Possibly keep a buffer with a.
421 if int(char) < 256 {
Akron8e803932023-04-18 10:19:19 +0200422 eot = int(char) == EOT
Akrondf275812022-03-27 12:54:46 +0200423
424 // mat.SigmaASCII[] is initialized with mat.identity
Akron1c34ce62021-09-23 23:27:39 +0200425 a = mat.sigmaASCII[int(char)]
426 } else {
427 a, ok = mat.sigma[char]
Akron1c34ce62021-09-23 23:27:39 +0200428
Akron4880fb62021-12-05 12:03:05 +0100429 // Use identity symbol if character is not in sigma
430 if !ok && mat.identity != -1 {
Akrondf275812022-03-27 12:54:46 +0200431
432 // TODO: Maybe use unknown?
Akron4880fb62021-12-05 12:03:05 +0100433 a = mat.identity
434 }
Akron1c34ce62021-09-23 23:27:39 +0200435 }
436
437 t0 = t
438
439 // Check for epsilon transitions and remember
440
Akron16c312e2021-09-26 13:11:12 +0200441 // TODO: Can t0 be negative here?
442 if mat.array[(mat.epsilon-1)*mat.stateCount+int(t0)] != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200443 // Remember state for backtracking to last tokenend state
Akron16c312e2021-09-26 13:11:12 +0200444
445 // Maybe not necessary - and should be simpler!
446 // Just Remove
Akrondf275812022-03-27 12:54:46 +0200447 // t0 &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200448 epsilonState = t0
Akron98fbfef2021-10-23 17:02:11 +0200449 epsilonOffset = buffc
Akron16c312e2021-09-26 13:11:12 +0200450
451 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100452 log.Println("epsilonOffset is set to", buffc)
Akron16c312e2021-09-26 13:11:12 +0200453 }
Akron1c34ce62021-09-23 23:27:39 +0200454 }
455 }
456
Akrondf275812022-03-27 12:54:46 +0200457 // can happen when no identity is defined.
458 // This shouldn't be tested in every loop
459 if a == 0 {
460 t = 0
461 } else {
462 // Checks a transition based on t0, a and buffo
463 t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
464 }
Akron1c34ce62021-09-23 23:27:39 +0200465
466 if DEBUG {
467 // Char is only relevant if set
Akron9c3bf7f2021-11-03 19:52:12 +0100468 log.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
Akron1c34ce62021-09-23 23:27:39 +0200469 }
470
Akrone396a932021-10-19 01:06:13 +0200471 // Check if the transition is invalid according to the matrix
Akron1c34ce62021-09-23 23:27:39 +0200472 if t == 0 {
473
474 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100475 log.Println("Match is not fine!")
Akron1c34ce62021-09-23 23:27:39 +0200476 }
477
478 if !ok && a == mat.identity {
479
480 // Try again with unknown symbol, in case identity failed
481 // Char is only relevant when set
482 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100483 log.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
Akron1c34ce62021-09-23 23:27:39 +0200484 }
485 a = mat.unknown
486
Akrondf275812022-03-27 12:54:46 +0200487 } else if a != mat.epsilon && epsilonState != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200488
489 // Try again with epsilon symbol, in case everything else failed
490 t0 = epsilonState
491 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +0200492 buffc = epsilonOffset
Akron1c34ce62021-09-23 23:27:39 +0200493 a = mat.epsilon
494
495 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100496 log.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200497 }
498
499 } else {
Akrondf275812022-03-27 12:54:46 +0200500
501 if DEBUG {
502 log.Println("Fail!")
503 }
504
505 // w.Fail(bufft)
506
507 // The following procedure means the automaton fails to consume a certain character.
508 // In the tokenization scenario, this means, the tokenizer will drop the old or current data as a
509 // token and start blank at the root node of the automaton for the remaining data.
510 // It may be beneficial to have something like a "drop()" event to capture these cases,
511 // as they are likely the result of a bad automaton design.
Akroncae39112023-04-26 19:43:16 +0200512
513 // fmt.Println("Problem", len(buffer), buffc, bufft)
514
515 if buffc-bufft <= 0 {
Akrondf275812022-03-27 12:54:46 +0200516 buffc++
Akroncae39112023-04-26 19:43:16 +0200517 if buffc == 0 {
518 eof = true
519 break
520 }
Akrondf275812022-03-27 12:54:46 +0200521 }
Akron8e803932023-04-18 10:19:19 +0200522 // This will hopefully be branchless by the compiler
Akrondf275812022-03-27 12:54:46 +0200523
524 if DEBUG {
525 log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
526 }
Akroncae39112023-04-26 19:43:16 +0200527
Akrondf275812022-03-27 12:54:46 +0200528 w.Token(bufft, buffer[:buffc])
529
530 sentenceEnd = false
531 textEnd = false
532
533 if DEBUG {
534 log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
535 }
536
Akron8e803932023-04-18 10:19:19 +0200537 copy(buffer[0:], buffer[buffc:buffi])
Akrondf275812022-03-27 12:54:46 +0200538
539 buffi -= buffc
540 epsilonState = 0
541
542 buffc = 0
543 bufft = 0
544
545 a = mat.epsilon
546
547 // Restart from root state
548 t = uint32(1)
549 newchar = true
550 // goto PARSECHARM
551 continue
Akron1c34ce62021-09-23 23:27:39 +0200552 }
553
554 newchar = false
Akrona854faa2021-10-22 19:31:08 +0200555 eot = false
Akron1c34ce62021-09-23 23:27:39 +0200556 continue
557 }
558
559 // Transition was successful
560 rewindBuffer = false
561
Akron90aa45b2021-11-16 23:28:17 +0100562 // Transition consumes no character
563 if a == mat.epsilon {
Akron1c34ce62021-09-23 23:27:39 +0200564 // Transition marks the end of a token - so flush the buffer
Akron98fbfef2021-10-23 17:02:11 +0200565 if buffc-bufft > 0 {
Akron1c34ce62021-09-23 23:27:39 +0200566 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100567 log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200568 }
Akron32416ce2021-10-23 17:09:41 +0200569 w.Token(bufft, buffer[:buffc])
Akron1c34ce62021-09-23 23:27:39 +0200570 rewindBuffer = true
Akron5c82a922021-09-24 19:11:29 +0200571 sentenceEnd = false
Akrona854faa2021-10-22 19:31:08 +0200572 textEnd = false
Akron5c82a922021-09-24 19:11:29 +0200573 } else {
574 sentenceEnd = true
Akron4f6b28c2021-10-25 00:52:03 +0200575 w.SentenceEnd(buffc)
Akron1c34ce62021-09-23 23:27:39 +0200576 }
Akron90aa45b2021-11-16 23:28:17 +0100577
578 // Transition consumes a character
579 } else {
580 buffc++
581
582 // Transition does not produce a character
Akron8e803932023-04-18 10:19:19 +0200583 // Hopefully generated branchless code
Akron90aa45b2021-11-16 23:28:17 +0100584 if buffc-bufft == 1 && (t&FIRSTBIT) != 0 {
585 if DEBUG {
586 log.Println("Nontoken forward", showBufferNew(buffer, bufft, buffc, buffi))
587 }
588 bufft++
589 // rewindBuffer = true
590 }
Akron1c34ce62021-09-23 23:27:39 +0200591 }
592
Akron8cc2dd92021-10-25 19:49:41 +0200593 if eot {
594 eot = false
Akronf66dc142023-09-06 20:00:47 +0200595 if !sentenceEnd {
596 sentenceEnd = true
597 w.SentenceEnd(buffc)
598 }
Akron8cc2dd92021-10-25 19:49:41 +0200599 textEnd = true
600 w.TextEnd(buffc)
601 rewindBuffer = true
602 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100603 log.Println("END OF TEXT")
Akron8cc2dd92021-10-25 19:49:41 +0200604 }
605 }
606
Akron1c34ce62021-09-23 23:27:39 +0200607 // Rewind the buffer if necessary
608 if rewindBuffer {
609
Akron16c312e2021-09-26 13:11:12 +0200610 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100611 log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
Akron16c312e2021-09-26 13:11:12 +0200612 }
613
Akron8e803932023-04-18 10:19:19 +0200614 copy(buffer[0:], buffer[buffc:buffi])
Akron1c34ce62021-09-23 23:27:39 +0200615
Akron98fbfef2021-10-23 17:02:11 +0200616 buffi -= buffc
Akron16c312e2021-09-26 13:11:12 +0200617 // epsilonOffset -= buffo
618 epsilonOffset = 0
619 epsilonState = 0
620
Akron98fbfef2021-10-23 17:02:11 +0200621 buffc = 0
622 bufft = 0
Akrona854faa2021-10-22 19:31:08 +0200623
Akron98fbfef2021-10-23 17:02:11 +0200624 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100625 log.Println("Remaining:", showBufferNew(buffer, bufft, buffc, buffi))
Akrona854faa2021-10-22 19:31:08 +0200626 }
Akron1c34ce62021-09-23 23:27:39 +0200627 }
628
Akron16c312e2021-09-26 13:11:12 +0200629 t &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200630
631 newchar = true
632
633 // TODO:
634 // Prevent endless epsilon loops!
635 }
636
637 // Input reader is not yet finished
638 if !eof {
639 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100640 log.Println("Not at the end")
Akron1c34ce62021-09-23 23:27:39 +0200641 }
Akrondf275812022-03-27 12:54:46 +0200642 // This should never happen
Akron1c34ce62021-09-23 23:27:39 +0200643 return false
644 }
645
646 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100647 log.Println("Entering final check")
Akron1c34ce62021-09-23 23:27:39 +0200648 }
Akron1c34ce62021-09-23 23:27:39 +0200649
Akrona854faa2021-10-22 19:31:08 +0200650 // Check epsilon transitions as long as possible
Akron1c34ce62021-09-23 23:27:39 +0200651 t0 = t
Akron1c34ce62021-09-23 23:27:39 +0200652 t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
653 a = mat.epsilon
654 newchar = false
Akron1c34ce62021-09-23 23:27:39 +0200655 // t can't be < 0
Akron16c312e2021-09-26 13:11:12 +0200656 if t != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200657 // Remember state for backtracking to last tokenend state
658 goto PARSECHARM
659
660 } else if epsilonState != 0 {
661 t0 = epsilonState
662 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +0200663 buffc = epsilonOffset
Akron1c34ce62021-09-23 23:27:39 +0200664 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100665 log.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200666 }
667 goto PARSECHARM
668 }
Akron1c34ce62021-09-23 23:27:39 +0200669
Akrondf275812022-03-27 12:54:46 +0200670 // something left in buffer
671 if buffc-bufft > 0 {
672 if DEBUG {
673 log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
674 }
675 w.Token(bufft, buffer[:buffc])
676 sentenceEnd = false
677 textEnd = false
678 }
679
Akron5c82a922021-09-24 19:11:29 +0200680 // Add an additional sentence ending, if the file is over but no explicit
681 // sentence split was reached. This may be controversial and therefore
682 // optional via parameter.
683 if !sentenceEnd {
Akron4f6b28c2021-10-25 00:52:03 +0200684 w.SentenceEnd(buffc)
Akron5c82a922021-09-24 19:11:29 +0200685 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100686 log.Println("Sentence end")
Akrona854faa2021-10-22 19:31:08 +0200687 }
688 }
689
690 if !textEnd {
Akron4f6b28c2021-10-25 00:52:03 +0200691 w.TextEnd(buffc)
Akrona854faa2021-10-22 19:31:08 +0200692 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100693 log.Println("Text end")
Akron5c82a922021-09-24 19:11:29 +0200694 }
695 }
696
697 return true
Akron1c34ce62021-09-23 23:27:39 +0200698}