blob: 84484746b43c4dee5f24da20abdab5d99ce3e204 [file] [log] [blame]
Akron1c34ce62021-09-23 23:27:39 +02001package datok
2
3import (
4 "bufio"
Akron16c312e2021-09-26 13:11:12 +02005 "compress/gzip"
Akron1c34ce62021-09-23 23:27:39 +02006 "io"
Akron16c312e2021-09-26 13:11:12 +02007 "log"
8 "os"
9)
10
// Constants used by the matrix tokenizer.
const (
	// MAMAGIC is the magic string written at the very beginning of a
	// serialized matrix tokenizer; it is also what Type reports.
	MAMAGIC = "MATOK"

	// EOT is the code point that TransduceTokenWriter treats as an
	// explicit end-of-text marker in the input.
	EOT = 4

	// BUFSIZE is the number of runes TransduceTokenWriter allocates
	// for its input buffer.
	BUFSIZE = 4096
)
16
// MatrixTokenizer is a tokenizer whose transition function is stored
// as a flat matrix of target states.
type MatrixTokenizer struct {
	// sigma maps every known rune to its symbol number.
	sigma map[rune]int
	// sigmaASCII holds the symbol numbers of all code points below 256,
	// so these can be looked up without touching the map.
	sigmaASCII [256]int
	// array is the transition table. The entry for symbol a and state s
	// is stored at (a-1)*stateCount+s and contains the target state;
	// FIRSTBIT is set on entries that belong to nontoken transitions.
	array []uint32
	// stateCount is the number of states of the underlying automaton.
	stateCount int

	// Special symbols in sigma
	epsilon  int
	unknown  int
	identity int
}
28
29// ToMatrix turns the intermediate tokenizer into a
30// matrix representation.
31func (auto *Automaton) ToMatrix() *MatrixTokenizer {
32
33 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +020034 sigma: make(map[rune]int),
35 unknown: auto.unknown,
36 identity: auto.identity,
37 epsilon: auto.epsilon,
Akron1c34ce62021-09-23 23:27:39 +020038 stateCount: auto.stateCount,
39 }
40
Akron28031b72021-10-02 13:07:25 +020041 max := 0
Akron1c34ce62021-09-23 23:27:39 +020042 for num, sym := range auto.sigmaRev {
43 if int(sym) < 256 {
44 mat.sigmaASCII[int(sym)] = num
45 }
46 mat.sigma[sym] = num
47 if num > auto.sigmaCount {
48 panic("sigmaCount is smaller")
49 }
Akron28031b72021-10-02 13:07:25 +020050 if num > max {
51 max = num
52 }
Akron1c34ce62021-09-23 23:27:39 +020053 }
Akron28031b72021-10-02 13:07:25 +020054 // Add final entry to the list (maybe not necessary actually)
55
Akron1c34ce62021-09-23 23:27:39 +020056 remember := make([]bool, auto.stateCount+2)
57
Akron28031b72021-10-02 13:07:25 +020058 // lower sigmaCount, as no final value exists
59 mat.array = make([]uint32, (auto.stateCount+1)*(max+1))
60
Akron1c34ce62021-09-23 23:27:39 +020061 // Store all transitions in matrix
Akron16c312e2021-09-26 13:11:12 +020062 var toMatrix func([]uint32, int)
Akron1c34ce62021-09-23 23:27:39 +020063
Akron16c312e2021-09-26 13:11:12 +020064 toMatrix = func(matrix []uint32, start int) {
Akron1c34ce62021-09-23 23:27:39 +020065 if start > auto.stateCount {
66 panic("stateCount is smaller")
67 }
68 if remember[start] {
69 return
70 }
71 remember[start] = true
72 for alpha, t := range auto.transitions[start] {
Akron16c312e2021-09-26 13:11:12 +020073 matrix[(alpha-1)*auto.stateCount+start] = uint32(t.end)
Akron1c34ce62021-09-23 23:27:39 +020074
75 // Mark nontoken transitions
76 if t.nontoken {
Akron16c312e2021-09-26 13:11:12 +020077 matrix[(alpha-1)*auto.stateCount+start] |= FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +020078 }
79
80 toMatrix(matrix, t.end)
81 }
82 }
83
84 toMatrix(mat.array, 1)
85
86 return mat
87}
88
Akron941f2152021-09-26 15:14:25 +020089// Type of tokenizer
90func (MatrixTokenizer) Type() string {
91 return MAMAGIC
92}
93
Akron16c312e2021-09-26 13:11:12 +020094// Save stores the matrix data in a file
95func (mat *MatrixTokenizer) Save(file string) (n int64, err error) {
96 f, err := os.Create(file)
97 if err != nil {
98 log.Println(err)
99 return 0, err
100 }
101 defer f.Close()
102 gz := gzip.NewWriter(f)
103 defer gz.Close()
104 n, err = mat.WriteTo(gz)
105 if err != nil {
106 log.Println(err)
107 return n, err
108 }
109 gz.Flush()
110 return n, nil
111}
112
113// WriteTo stores the matrix data in an io.Writer.
114func (mat *MatrixTokenizer) WriteTo(w io.Writer) (n int64, err error) {
115
116 wb := bufio.NewWriter(w)
117 defer wb.Flush()
118
119 // Store magical header
120 all, err := wb.Write([]byte(MAMAGIC))
121 if err != nil {
122 log.Println(err)
123 return int64(all), err
124 }
125
126 // Get sigma as a list
Akron28031b72021-10-02 13:07:25 +0200127 // In datok it's 16 - 4*4
128 sigmalist := make([]rune, len(mat.sigma)+16)
Akron16c312e2021-09-26 13:11:12 +0200129 max := 0
130 for sym, num := range mat.sigma {
131 sigmalist[num] = sym
132 if num > max {
133 max = num
134 }
135 }
136
Akron28031b72021-10-02 13:07:25 +0200137 // Add final entry to the list (maybe not necessary actually)
Akron16c312e2021-09-26 13:11:12 +0200138 sigmalist = sigmalist[:max+1]
139
Akron28031b72021-10-02 13:07:25 +0200140 buf := make([]byte, 0, 14)
Akron16c312e2021-09-26 13:11:12 +0200141 bo.PutUint16(buf[0:2], VERSION)
142 bo.PutUint16(buf[2:4], uint16(mat.epsilon))
143 bo.PutUint16(buf[4:6], uint16(mat.unknown))
144 bo.PutUint16(buf[6:8], uint16(mat.identity))
Akron28031b72021-10-02 13:07:25 +0200145 bo.PutUint32(buf[8:12], uint32(mat.stateCount))
146 bo.PutUint16(buf[12:14], uint16(len(sigmalist)))
147 more, err := wb.Write(buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200148 if err != nil {
149 log.Println(err)
150 return int64(all), err
151 }
152
153 all += more
154
155 // Write sigma
156 for _, sym := range sigmalist {
157
158 more, err = wb.WriteRune(sym)
159 if err != nil {
160 log.Println(err)
161 return int64(all), err
162 }
163 all += more
164 }
165
166 if err != nil {
167 log.Println(err)
168 return int64(all), err
169 }
170
171 // Test marker - could be checksum
172 more, err = wb.Write([]byte("M"))
173 if err != nil {
174 log.Println(err)
175 return int64(all), err
176 }
177 all += more
178
Akron16c312e2021-09-26 13:11:12 +0200179 for _, x := range mat.array {
180 bo.PutUint32(buf[0:4], uint32(x))
181 more, err = wb.Write(buf[0:4])
182 if err != nil {
183 log.Println(err)
184 return int64(all), err
185 }
186 all += more
187 if more != 4 {
188 log.Println("Can not write base uint32")
189 return int64(all), err
190 }
Akron16c312e2021-09-26 13:11:12 +0200191 }
192
193 return int64(all), err
194}
195
196// LoadDatokFile reads a double array represented tokenizer
197// from a file.
198func LoadMatrixFile(file string) *MatrixTokenizer {
199 f, err := os.Open(file)
200 if err != nil {
201 log.Println(err)
202 return nil
203 }
204 defer f.Close()
205
206 gz, err := gzip.NewReader(f)
207 if err != nil {
208 log.Println(err)
209 return nil
210 }
211 defer gz.Close()
212
213 // Todo: Read the whole file!
214 return ParseMatrix(gz)
215}
216
217// LoadMatrixFile reads a matrix represented tokenizer
218// from an io.Reader
219func ParseMatrix(ior io.Reader) *MatrixTokenizer {
220
221 // Initialize tokenizer with default values
222 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +0200223 sigma: make(map[rune]int),
224 epsilon: 0,
225 unknown: 0,
226 identity: 0,
Akron16c312e2021-09-26 13:11:12 +0200227 stateCount: 0,
Akron16c312e2021-09-26 13:11:12 +0200228 }
229
230 r := bufio.NewReader(ior)
231
232 buf := make([]byte, 1024)
233 buf = buf[0:len(MAMAGIC)]
234
235 _, err := r.Read(buf)
236
237 if err != nil {
238 log.Println(err)
239 return nil
240 }
241
242 if string(MAMAGIC) != string(buf) {
243 log.Println("Not a matok file")
244 return nil
245 }
246
Akron28031b72021-10-02 13:07:25 +0200247 more, err := io.ReadFull(r, buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200248 if err != nil {
249 log.Println(err)
250 return nil
251 }
252
Akron28031b72021-10-02 13:07:25 +0200253 if more != 14 {
Akron16c312e2021-09-26 13:11:12 +0200254 log.Println("Read bytes do not fit")
255 return nil
256 }
257
258 version := bo.Uint16(buf[0:2])
259
260 if version != VERSION {
261 log.Println("Version not compatible")
262 return nil
263 }
264
265 mat.epsilon = int(bo.Uint16(buf[2:4]))
266 mat.unknown = int(bo.Uint16(buf[4:6]))
267 mat.identity = int(bo.Uint16(buf[6:8]))
Akron28031b72021-10-02 13:07:25 +0200268 mat.stateCount = int(bo.Uint32(buf[8:12]))
269 sigmaCount := int(bo.Uint16(buf[12:14]))
270 arraySize := (mat.stateCount + 1) * sigmaCount
Akron16c312e2021-09-26 13:11:12 +0200271
Akron16c312e2021-09-26 13:11:12 +0200272 for x := 0; x < sigmaCount; x++ {
273 sym, _, err := r.ReadRune()
274 if err == nil && sym != 0 {
275 if int(sym) < 256 {
276 mat.sigmaASCII[int(sym)] = x
277 }
278 mat.sigma[sym] = x
279 }
280 }
281
282 _, err = io.ReadFull(r, buf[0:1])
283
284 if err != nil {
285 log.Print(err)
286 return nil
287 }
288
289 if string("M") != string(buf[0:1]) {
290 log.Println("Not a matok file")
291 return nil
292 }
293
294 // Read based on length
295 mat.array = make([]uint32, arraySize)
296
297 dataArray, err := io.ReadAll(r)
298
299 if err == io.EOF {
300 log.Println(err)
301 return nil
302 }
303
304 if len(dataArray) < arraySize*4 {
Akron28031b72021-10-02 13:07:25 +0200305 log.Println("Not enough bytes read", len(dataArray), arraySize*4)
Akron16c312e2021-09-26 13:11:12 +0200306 return nil
307 }
308
309 for x := 0; x < arraySize; x++ {
Akron16c312e2021-09-26 13:11:12 +0200310 mat.array[x] = bo.Uint32(dataArray[x*4 : (x*4)+4])
311 }
312
313 return mat
314}
315
Akron98fbfef2021-10-23 17:02:11 +0200316// Transduce input to ouutput
Akron1c34ce62021-09-23 23:27:39 +0200317func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron96fdc9b2021-10-27 21:11:17 +0200318 return mat.TransduceTokenWriter(r, NewTokenWriter(w, SIMPLE))
Akrone396a932021-10-19 01:06:13 +0200319}
320
Akron98fbfef2021-10-23 17:02:11 +0200321// TransduceTokenWriter transduces an input string against
322// the matrix FSA. The rules are always greedy. If the
323// automaton fails, it takes the last possible token ending
324// branch.
Akron4f6b28c2021-10-25 00:52:03 +0200325func (mat *MatrixTokenizer) TransduceTokenWriter(r io.Reader, w *TokenWriter) bool {
Akron1c34ce62021-09-23 23:27:39 +0200326 var a int
Akron16c312e2021-09-26 13:11:12 +0200327 var t0 uint32
328 t := uint32(1) // Initial state
Akron1c34ce62021-09-23 23:27:39 +0200329 var ok, rewindBuffer bool
330
331 // Remember the last position of a possible tokenend,
332 // in case the automaton fails.
Akron16c312e2021-09-26 13:11:12 +0200333 epsilonState := uint32(0)
Akron1c34ce62021-09-23 23:27:39 +0200334 epsilonOffset := 0
335
Akron5c82a922021-09-24 19:11:29 +0200336 // Remember if the last transition was epsilon
337 sentenceEnd := false
338
Akrona854faa2021-10-22 19:31:08 +0200339 // Remember if a text end was already set
340 textEnd := false
341
Akron65113a82021-11-13 10:50:53 +0100342 buffer := make([]rune, BUFSIZE)
343 buffo := 0 // Buffer absolute offset
Akron98fbfef2021-10-23 17:02:11 +0200344 bufft := 0 // Buffer token offset
345 buffc := 0 // Buffer current symbol
Akron1c34ce62021-09-23 23:27:39 +0200346 buffi := 0 // Buffer length
347
Akron98fbfef2021-10-23 17:02:11 +0200348 // The buffer is organized as follows:
Akron65113a82021-11-13 10:50:53 +0100349 // ...o[...t[....c..]..i]
Akron98fbfef2021-10-23 17:02:11 +0200350
Akron1c34ce62021-09-23 23:27:39 +0200351 reader := bufio.NewReader(r)
Akrone396a932021-10-19 01:06:13 +0200352 defer w.Flush()
Akron1c34ce62021-09-23 23:27:39 +0200353
354 var char rune
355
356 var err error
357 eof := false
Akrona854faa2021-10-22 19:31:08 +0200358 eot := false
Akron1c34ce62021-09-23 23:27:39 +0200359 newchar := true
360
361PARSECHARM:
362 for {
363
364 if newchar {
365 // Get from reader if buffer is empty
Akron98fbfef2021-10-23 17:02:11 +0200366 if buffc >= buffi {
Akron1c34ce62021-09-23 23:27:39 +0200367 if eof {
368 break
369 }
370 char, _, err = reader.ReadRune()
371
372 // No more runes to read
373 if err != nil {
Akron274600e2021-11-03 20:09:06 +0100374 if err == io.EOF {
375 eof = true
376 break
377 }
378
379 log.Fatalln(err)
380 os.Exit(1)
381 return false
Akron1c34ce62021-09-23 23:27:39 +0200382 }
Akron274600e2021-11-03 20:09:06 +0100383
Akron1c34ce62021-09-23 23:27:39 +0200384 buffer[buffi] = char
385 buffi++
386 }
387
Akron98fbfef2021-10-23 17:02:11 +0200388 char = buffer[buffc]
Akron1c34ce62021-09-23 23:27:39 +0200389
390 if DEBUG {
Akron65113a82021-11-13 10:50:53 +0100391 log.Println("Current char", string(char), int(char), showBufferNew2(buffer, buffo, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200392 }
393
Akrona854faa2021-10-22 19:31:08 +0200394 eot = false
395
Akron1c34ce62021-09-23 23:27:39 +0200396 // TODO:
397 // Better not repeatedly check for a!
398 // Possibly keep a buffer with a.
399 if int(char) < 256 {
Akrona854faa2021-10-22 19:31:08 +0200400 if int(char) == EOT {
401 eot = true
402 }
Akron1c34ce62021-09-23 23:27:39 +0200403 a = mat.sigmaASCII[int(char)]
404 } else {
405 a, ok = mat.sigma[char]
406 if !ok {
407 a = 0
408 }
409 }
410
411 // Use identity symbol if character is not in sigma
412 if a == 0 && mat.identity != -1 {
413 a = mat.identity
414 }
415
416 t0 = t
417
418 // Check for epsilon transitions and remember
419
Akron16c312e2021-09-26 13:11:12 +0200420 // TODO: Can t0 be negative here?
421 if mat.array[(mat.epsilon-1)*mat.stateCount+int(t0)] != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200422 // Remember state for backtracking to last tokenend state
Akron16c312e2021-09-26 13:11:12 +0200423
424 // Maybe not necessary - and should be simpler!
425 // Just Remove
426 t0 &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200427 epsilonState = t0
Akron98fbfef2021-10-23 17:02:11 +0200428 epsilonOffset = buffc
Akron16c312e2021-09-26 13:11:12 +0200429
430 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100431 log.Println("epsilonOffset is set to", buffc)
Akron16c312e2021-09-26 13:11:12 +0200432 }
Akron1c34ce62021-09-23 23:27:39 +0200433 }
434 }
435
436 // Checks a transition based on t0, a and buffo
437 t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
Akron1c34ce62021-09-23 23:27:39 +0200438
439 if DEBUG {
440 // Char is only relevant if set
Akron9c3bf7f2021-11-03 19:52:12 +0100441 log.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
Akron1c34ce62021-09-23 23:27:39 +0200442 }
443
Akrone396a932021-10-19 01:06:13 +0200444 // Check if the transition is invalid according to the matrix
Akron1c34ce62021-09-23 23:27:39 +0200445 if t == 0 {
446
447 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100448 log.Println("Match is not fine!")
Akron1c34ce62021-09-23 23:27:39 +0200449 }
450
451 if !ok && a == mat.identity {
452
453 // Try again with unknown symbol, in case identity failed
454 // Char is only relevant when set
455 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100456 log.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
Akron1c34ce62021-09-23 23:27:39 +0200457 }
458 a = mat.unknown
459
460 } else if a != mat.epsilon {
461
462 // Try again with epsilon symbol, in case everything else failed
463 t0 = epsilonState
464 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +0200465 buffc = epsilonOffset
Akron1c34ce62021-09-23 23:27:39 +0200466 a = mat.epsilon
467
468 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100469 log.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200470 }
471
472 } else {
473 break
474 }
475
476 newchar = false
Akrona854faa2021-10-22 19:31:08 +0200477 eot = false
Akron1c34ce62021-09-23 23:27:39 +0200478 continue
479 }
480
481 // Transition was successful
482 rewindBuffer = false
483
484 // Transition consumes a character
485 if a != mat.epsilon {
486
Akron98fbfef2021-10-23 17:02:11 +0200487 buffc++
Akron1c34ce62021-09-23 23:27:39 +0200488
489 // Transition does not produce a character
Akron98fbfef2021-10-23 17:02:11 +0200490 if buffc-bufft == 1 && (t&FIRSTBIT) != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200491 if DEBUG {
Akron65113a82021-11-13 10:50:53 +0100492 log.Println("Nontoken forward", showBufferNew2(buffer, buffo, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200493 }
Akron98fbfef2021-10-23 17:02:11 +0200494 bufft++
495 // rewindBuffer = true
Akron1c34ce62021-09-23 23:27:39 +0200496 }
497
498 } else {
499 // Transition marks the end of a token - so flush the buffer
Akron98fbfef2021-10-23 17:02:11 +0200500 if buffc-bufft > 0 {
Akron1c34ce62021-09-23 23:27:39 +0200501 if DEBUG {
Akron65113a82021-11-13 10:50:53 +0100502 log.Println("-> Buffer values", buffo, bufft, buffc, buffi, epsilonOffset)
503 log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew2(buffer, buffo, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200504 }
Akron65113a82021-11-13 10:50:53 +0100505 w.Token(bufft-buffo, buffer[buffo:buffc])
Akron1c34ce62021-09-23 23:27:39 +0200506 rewindBuffer = true
Akron5c82a922021-09-24 19:11:29 +0200507 sentenceEnd = false
Akrona854faa2021-10-22 19:31:08 +0200508 textEnd = false
Akron5c82a922021-09-24 19:11:29 +0200509 } else {
510 sentenceEnd = true
Akron65113a82021-11-13 10:50:53 +0100511 w.SentenceEnd(buffc - buffo)
Akron1c34ce62021-09-23 23:27:39 +0200512 }
Akron1c34ce62021-09-23 23:27:39 +0200513 }
514
Akron8cc2dd92021-10-25 19:49:41 +0200515 if eot {
516 eot = false
517 textEnd = true
Akron65113a82021-11-13 10:50:53 +0100518 w.TextEnd(buffc - buffo)
Akron8cc2dd92021-10-25 19:49:41 +0200519 rewindBuffer = true
520 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100521 log.Println("END OF TEXT")
Akron8cc2dd92021-10-25 19:49:41 +0200522 }
523 }
524
Akron1c34ce62021-09-23 23:27:39 +0200525 // Rewind the buffer if necessary
526 if rewindBuffer {
527
Akron65113a82021-11-13 10:50:53 +0100528 if buffo < (BUFSIZE - 128) {
529 buffo = buffc
530 bufft = buffc
531 epsilonOffset = 0
532 epsilonState = 0
533 } else {
Akron16c312e2021-09-26 13:11:12 +0200534
Akron65113a82021-11-13 10:50:53 +0100535 if DEBUG {
536 log.Println("-> Rewind buffer", buffo, bufft, buffc, buffi, epsilonOffset)
537 }
Akron1c34ce62021-09-23 23:27:39 +0200538
Akron65113a82021-11-13 10:50:53 +0100539 // TODO: Better as a ring buffer
540 // buffer = buffer[buffc:] !slower
541 for x, i := range buffer[buffc:buffi] {
542 buffer[x] = i
543 }
Akron16c312e2021-09-26 13:11:12 +0200544
Akron65113a82021-11-13 10:50:53 +0100545 buffo = 0
Akrona854faa2021-10-22 19:31:08 +0200546
Akron65113a82021-11-13 10:50:53 +0100547 buffi -= buffc
548 epsilonOffset = 0
549 epsilonState = 0
550
551 buffc = 0
552 bufft = 0
553
554 if DEBUG {
555 log.Println("Remaining:", showBufferNew2(buffer, buffo, bufft, buffc, buffi))
556 }
Akrona854faa2021-10-22 19:31:08 +0200557 }
Akron1c34ce62021-09-23 23:27:39 +0200558 }
559
Akron16c312e2021-09-26 13:11:12 +0200560 t &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200561
562 newchar = true
563
564 // TODO:
565 // Prevent endless epsilon loops!
566 }
567
568 // Input reader is not yet finished
569 if !eof {
570 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100571 log.Println("Not at the end")
Akron1c34ce62021-09-23 23:27:39 +0200572 }
573 return false
574 }
575
576 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100577 log.Println("Entering final check")
Akron1c34ce62021-09-23 23:27:39 +0200578 }
Akron1c34ce62021-09-23 23:27:39 +0200579
Akrona854faa2021-10-22 19:31:08 +0200580 // Check epsilon transitions as long as possible
Akron1c34ce62021-09-23 23:27:39 +0200581 t0 = t
Akron1c34ce62021-09-23 23:27:39 +0200582 t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
583 a = mat.epsilon
584 newchar = false
Akron1c34ce62021-09-23 23:27:39 +0200585 // t can't be < 0
Akron16c312e2021-09-26 13:11:12 +0200586 if t != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200587 // Remember state for backtracking to last tokenend state
588 goto PARSECHARM
589
590 } else if epsilonState != 0 {
591 t0 = epsilonState
592 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +0200593 buffc = epsilonOffset
Akron1c34ce62021-09-23 23:27:39 +0200594 if DEBUG {
Akron65113a82021-11-13 10:50:53 +0100595 log.Println("Get from epsilon stack and set buffc!", showBufferNew2(buffer, buffo, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200596 }
597 goto PARSECHARM
598 }
Akron1c34ce62021-09-23 23:27:39 +0200599
Akron5c82a922021-09-24 19:11:29 +0200600 // Add an additional sentence ending, if the file is over but no explicit
601 // sentence split was reached. This may be controversial and therefore
602 // optional via parameter.
603 if !sentenceEnd {
Akron65113a82021-11-13 10:50:53 +0100604 w.SentenceEnd(buffc - buffo)
Akron5c82a922021-09-24 19:11:29 +0200605 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100606 log.Println("Sentence end")
Akrona854faa2021-10-22 19:31:08 +0200607 }
608 }
609
610 if !textEnd {
Akron65113a82021-11-13 10:50:53 +0100611 w.TextEnd(buffc - buffo)
Akrona854faa2021-10-22 19:31:08 +0200612
613 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100614 log.Println("Text end")
Akron5c82a922021-09-24 19:11:29 +0200615 }
616 }
617
618 return true
Akron1c34ce62021-09-23 23:27:39 +0200619}