blob: 567430ba08e728cc8cd9af882433971f8b9de66e [file] [log] [blame]
Akron1c34ce62021-09-23 23:27:39 +02001package datok
2
3import (
4 "bufio"
Akron16c312e2021-09-26 13:11:12 +02005 "compress/gzip"
Akron1c34ce62021-09-23 23:27:39 +02006 "io"
Akron16c312e2021-09-26 13:11:12 +02007 "log"
8 "os"
9)
10
// MAMAGIC is the magic string that WriteTo emits at the very start of
// a serialized matrix and that ParseMatrix requires when reading one.
// It is also the value reported by Type.
const MAMAGIC = "MATOK"

// EOT is the code point (4) that TransduceTokenWriter treats as an
// explicit end-of-text marker in the input stream.
const EOT = 4
15
// MatrixTokenizer is a tokenizer whose transitions are stored in a
// single flat matrix of target states.
type MatrixTokenizer struct {
	// sigma maps an input rune to its symbol number.
	sigma map[rune]int

	// sigmaASCII is a direct lookup table from runes below 256 to
	// their symbol number.
	sigmaASCII [256]int

	// array holds the transitions. The target of symbol a in state t
	// is stored at index (a-1)*stateCount+t; FIRSTBIT is set on the
	// value when the transition is a nontoken transition and 0 means
	// that there is no transition.
	array []uint32

	// stateCount is the number of states of the automaton.
	stateCount int

	// Symbol numbers of the special symbols in sigma.
	// identity is -1 if the automaton has no identity symbol.
	epsilon, unknown, identity int
}
27
28// ToMatrix turns the intermediate tokenizer into a
29// matrix representation.
30func (auto *Automaton) ToMatrix() *MatrixTokenizer {
31
32 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +020033 sigma: make(map[rune]int),
34 unknown: auto.unknown,
35 identity: auto.identity,
36 epsilon: auto.epsilon,
Akron1c34ce62021-09-23 23:27:39 +020037 stateCount: auto.stateCount,
38 }
39
Akron00cecd12021-12-05 13:14:03 +010040 max := 0
41
42 // Init with identity
43 if mat.identity != -1 {
44 for i := 0; i < 256; i++ {
45 mat.sigmaASCII[i] = mat.identity
46 }
47 max = mat.identity
Akron4880fb62021-12-05 12:03:05 +010048 }
49
Akron1c34ce62021-09-23 23:27:39 +020050 for num, sym := range auto.sigmaRev {
51 if int(sym) < 256 {
52 mat.sigmaASCII[int(sym)] = num
53 }
54 mat.sigma[sym] = num
55 if num > auto.sigmaCount {
56 panic("sigmaCount is smaller")
57 }
Akron28031b72021-10-02 13:07:25 +020058 if num > max {
59 max = num
60 }
Akron1c34ce62021-09-23 23:27:39 +020061 }
Akron28031b72021-10-02 13:07:25 +020062 // Add final entry to the list (maybe not necessary actually)
63
Akron1c34ce62021-09-23 23:27:39 +020064 remember := make([]bool, auto.stateCount+2)
65
Akron28031b72021-10-02 13:07:25 +020066 // lower sigmaCount, as no final value exists
67 mat.array = make([]uint32, (auto.stateCount+1)*(max+1))
68
Akron1c34ce62021-09-23 23:27:39 +020069 // Store all transitions in matrix
Akron16c312e2021-09-26 13:11:12 +020070 var toMatrix func([]uint32, int)
Akron1c34ce62021-09-23 23:27:39 +020071
Akron16c312e2021-09-26 13:11:12 +020072 toMatrix = func(matrix []uint32, start int) {
Akron1c34ce62021-09-23 23:27:39 +020073 if start > auto.stateCount {
74 panic("stateCount is smaller")
75 }
76 if remember[start] {
77 return
78 }
79 remember[start] = true
80 for alpha, t := range auto.transitions[start] {
Akron16c312e2021-09-26 13:11:12 +020081 matrix[(alpha-1)*auto.stateCount+start] = uint32(t.end)
Akron1c34ce62021-09-23 23:27:39 +020082
83 // Mark nontoken transitions
84 if t.nontoken {
Akron16c312e2021-09-26 13:11:12 +020085 matrix[(alpha-1)*auto.stateCount+start] |= FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +020086 }
87
88 toMatrix(matrix, t.end)
89 }
90 }
91
92 toMatrix(mat.array, 1)
93
94 return mat
95}
96
Akron941f2152021-09-26 15:14:25 +020097// Type of tokenizer
98func (MatrixTokenizer) Type() string {
99 return MAMAGIC
100}
101
Akron16c312e2021-09-26 13:11:12 +0200102// Save stores the matrix data in a file
103func (mat *MatrixTokenizer) Save(file string) (n int64, err error) {
104 f, err := os.Create(file)
105 if err != nil {
106 log.Println(err)
107 return 0, err
108 }
109 defer f.Close()
110 gz := gzip.NewWriter(f)
111 defer gz.Close()
112 n, err = mat.WriteTo(gz)
113 if err != nil {
114 log.Println(err)
115 return n, err
116 }
117 gz.Flush()
118 return n, nil
119}
120
121// WriteTo stores the matrix data in an io.Writer.
122func (mat *MatrixTokenizer) WriteTo(w io.Writer) (n int64, err error) {
123
124 wb := bufio.NewWriter(w)
125 defer wb.Flush()
126
127 // Store magical header
128 all, err := wb.Write([]byte(MAMAGIC))
129 if err != nil {
130 log.Println(err)
131 return int64(all), err
132 }
133
134 // Get sigma as a list
Akron28031b72021-10-02 13:07:25 +0200135 // In datok it's 16 - 4*4
136 sigmalist := make([]rune, len(mat.sigma)+16)
Akron16c312e2021-09-26 13:11:12 +0200137 max := 0
138 for sym, num := range mat.sigma {
139 sigmalist[num] = sym
140 if num > max {
141 max = num
142 }
143 }
144
Akron28031b72021-10-02 13:07:25 +0200145 // Add final entry to the list (maybe not necessary actually)
Akron16c312e2021-09-26 13:11:12 +0200146 sigmalist = sigmalist[:max+1]
147
Akron28031b72021-10-02 13:07:25 +0200148 buf := make([]byte, 0, 14)
Akron16c312e2021-09-26 13:11:12 +0200149 bo.PutUint16(buf[0:2], VERSION)
150 bo.PutUint16(buf[2:4], uint16(mat.epsilon))
151 bo.PutUint16(buf[4:6], uint16(mat.unknown))
152 bo.PutUint16(buf[6:8], uint16(mat.identity))
Akron28031b72021-10-02 13:07:25 +0200153 bo.PutUint32(buf[8:12], uint32(mat.stateCount))
154 bo.PutUint16(buf[12:14], uint16(len(sigmalist)))
155 more, err := wb.Write(buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200156 if err != nil {
157 log.Println(err)
158 return int64(all), err
159 }
160
161 all += more
162
163 // Write sigma
164 for _, sym := range sigmalist {
165
166 more, err = wb.WriteRune(sym)
167 if err != nil {
168 log.Println(err)
169 return int64(all), err
170 }
171 all += more
172 }
173
174 if err != nil {
175 log.Println(err)
176 return int64(all), err
177 }
178
179 // Test marker - could be checksum
180 more, err = wb.Write([]byte("M"))
181 if err != nil {
182 log.Println(err)
183 return int64(all), err
184 }
185 all += more
186
Akron16c312e2021-09-26 13:11:12 +0200187 for _, x := range mat.array {
188 bo.PutUint32(buf[0:4], uint32(x))
189 more, err = wb.Write(buf[0:4])
190 if err != nil {
191 log.Println(err)
192 return int64(all), err
193 }
194 all += more
195 if more != 4 {
196 log.Println("Can not write base uint32")
197 return int64(all), err
198 }
Akron16c312e2021-09-26 13:11:12 +0200199 }
200
201 return int64(all), err
202}
203
204// LoadDatokFile reads a double array represented tokenizer
205// from a file.
206func LoadMatrixFile(file string) *MatrixTokenizer {
207 f, err := os.Open(file)
208 if err != nil {
209 log.Println(err)
210 return nil
211 }
212 defer f.Close()
213
214 gz, err := gzip.NewReader(f)
215 if err != nil {
216 log.Println(err)
217 return nil
218 }
219 defer gz.Close()
220
221 // Todo: Read the whole file!
222 return ParseMatrix(gz)
223}
224
225// LoadMatrixFile reads a matrix represented tokenizer
226// from an io.Reader
227func ParseMatrix(ior io.Reader) *MatrixTokenizer {
228
229 // Initialize tokenizer with default values
230 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +0200231 sigma: make(map[rune]int),
232 epsilon: 0,
233 unknown: 0,
234 identity: 0,
Akron16c312e2021-09-26 13:11:12 +0200235 stateCount: 0,
Akron16c312e2021-09-26 13:11:12 +0200236 }
237
238 r := bufio.NewReader(ior)
239
240 buf := make([]byte, 1024)
241 buf = buf[0:len(MAMAGIC)]
242
243 _, err := r.Read(buf)
244
245 if err != nil {
246 log.Println(err)
247 return nil
248 }
249
250 if string(MAMAGIC) != string(buf) {
251 log.Println("Not a matok file")
252 return nil
253 }
254
Akron28031b72021-10-02 13:07:25 +0200255 more, err := io.ReadFull(r, buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200256 if err != nil {
257 log.Println(err)
258 return nil
259 }
260
Akron28031b72021-10-02 13:07:25 +0200261 if more != 14 {
Akron16c312e2021-09-26 13:11:12 +0200262 log.Println("Read bytes do not fit")
263 return nil
264 }
265
266 version := bo.Uint16(buf[0:2])
267
268 if version != VERSION {
269 log.Println("Version not compatible")
270 return nil
271 }
272
273 mat.epsilon = int(bo.Uint16(buf[2:4]))
274 mat.unknown = int(bo.Uint16(buf[4:6]))
275 mat.identity = int(bo.Uint16(buf[6:8]))
Akron28031b72021-10-02 13:07:25 +0200276 mat.stateCount = int(bo.Uint32(buf[8:12]))
277 sigmaCount := int(bo.Uint16(buf[12:14]))
278 arraySize := (mat.stateCount + 1) * sigmaCount
Akron16c312e2021-09-26 13:11:12 +0200279
Akron00cecd12021-12-05 13:14:03 +0100280 // Init with identity
281 if mat.identity != -1 {
282 for i := 0; i < 256; i++ {
283 mat.sigmaASCII[i] = mat.identity
284 }
285 }
286
Akron16c312e2021-09-26 13:11:12 +0200287 for x := 0; x < sigmaCount; x++ {
288 sym, _, err := r.ReadRune()
289 if err == nil && sym != 0 {
290 if int(sym) < 256 {
291 mat.sigmaASCII[int(sym)] = x
292 }
293 mat.sigma[sym] = x
294 }
295 }
296
297 _, err = io.ReadFull(r, buf[0:1])
298
299 if err != nil {
300 log.Print(err)
301 return nil
302 }
303
304 if string("M") != string(buf[0:1]) {
305 log.Println("Not a matok file")
306 return nil
307 }
308
309 // Read based on length
310 mat.array = make([]uint32, arraySize)
311
312 dataArray, err := io.ReadAll(r)
313
314 if err == io.EOF {
315 log.Println(err)
316 return nil
317 }
318
319 if len(dataArray) < arraySize*4 {
Akron28031b72021-10-02 13:07:25 +0200320 log.Println("Not enough bytes read", len(dataArray), arraySize*4)
Akron16c312e2021-09-26 13:11:12 +0200321 return nil
322 }
323
324 for x := 0; x < arraySize; x++ {
Akron16c312e2021-09-26 13:11:12 +0200325 mat.array[x] = bo.Uint32(dataArray[x*4 : (x*4)+4])
326 }
327
328 return mat
329}
330
Akron98fbfef2021-10-23 17:02:11 +0200331// Transduce input to ouutput
Akron1c34ce62021-09-23 23:27:39 +0200332func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron96fdc9b2021-10-27 21:11:17 +0200333 return mat.TransduceTokenWriter(r, NewTokenWriter(w, SIMPLE))
Akrone396a932021-10-19 01:06:13 +0200334}
335
Akron98fbfef2021-10-23 17:02:11 +0200336// TransduceTokenWriter transduces an input string against
337// the matrix FSA. The rules are always greedy. If the
338// automaton fails, it takes the last possible token ending
339// branch.
Akron4f6b28c2021-10-25 00:52:03 +0200340func (mat *MatrixTokenizer) TransduceTokenWriter(r io.Reader, w *TokenWriter) bool {
Akron1c34ce62021-09-23 23:27:39 +0200341 var a int
Akron16c312e2021-09-26 13:11:12 +0200342 var t0 uint32
343 t := uint32(1) // Initial state
Akron1c34ce62021-09-23 23:27:39 +0200344 var ok, rewindBuffer bool
345
346 // Remember the last position of a possible tokenend,
347 // in case the automaton fails.
Akron16c312e2021-09-26 13:11:12 +0200348 epsilonState := uint32(0)
Akron1c34ce62021-09-23 23:27:39 +0200349 epsilonOffset := 0
350
Akron5c82a922021-09-24 19:11:29 +0200351 // Remember if the last transition was epsilon
352 sentenceEnd := false
353
Akrona854faa2021-10-22 19:31:08 +0200354 // Remember if a text end was already set
355 textEnd := false
356
Akron1c34ce62021-09-23 23:27:39 +0200357 buffer := make([]rune, 1024)
Akron98fbfef2021-10-23 17:02:11 +0200358 bufft := 0 // Buffer token offset
359 buffc := 0 // Buffer current symbol
Akron1c34ce62021-09-23 23:27:39 +0200360 buffi := 0 // Buffer length
361
Akron98fbfef2021-10-23 17:02:11 +0200362 // The buffer is organized as follows:
363 // [ t[....c..]..i]
364
Akron1c34ce62021-09-23 23:27:39 +0200365 reader := bufio.NewReader(r)
Akrone396a932021-10-19 01:06:13 +0200366 defer w.Flush()
Akron1c34ce62021-09-23 23:27:39 +0200367
368 var char rune
369
370 var err error
371 eof := false
Akrona854faa2021-10-22 19:31:08 +0200372 eot := false
Akron1c34ce62021-09-23 23:27:39 +0200373 newchar := true
374
375PARSECHARM:
376 for {
377
378 if newchar {
379 // Get from reader if buffer is empty
Akron98fbfef2021-10-23 17:02:11 +0200380 if buffc >= buffi {
Akron1c34ce62021-09-23 23:27:39 +0200381 if eof {
382 break
383 }
384 char, _, err = reader.ReadRune()
385
386 // No more runes to read
387 if err != nil {
Akron274600e2021-11-03 20:09:06 +0100388 if err == io.EOF {
389 eof = true
390 break
391 }
392
393 log.Fatalln(err)
394 os.Exit(1)
395 return false
Akron1c34ce62021-09-23 23:27:39 +0200396 }
Akron274600e2021-11-03 20:09:06 +0100397
Akron1c34ce62021-09-23 23:27:39 +0200398 buffer[buffi] = char
399 buffi++
400 }
401
Akron98fbfef2021-10-23 17:02:11 +0200402 char = buffer[buffc]
Akron1c34ce62021-09-23 23:27:39 +0200403
404 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100405 log.Println("Current char", string(char), int(char), showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200406 }
407
Akrona854faa2021-10-22 19:31:08 +0200408 eot = false
409
Akron1c34ce62021-09-23 23:27:39 +0200410 // TODO:
411 // Better not repeatedly check for a!
412 // Possibly keep a buffer with a.
413 if int(char) < 256 {
Akrona854faa2021-10-22 19:31:08 +0200414 if int(char) == EOT {
415 eot = true
416 }
Akron1c34ce62021-09-23 23:27:39 +0200417 a = mat.sigmaASCII[int(char)]
418 } else {
419 a, ok = mat.sigma[char]
Akron1c34ce62021-09-23 23:27:39 +0200420
Akron4880fb62021-12-05 12:03:05 +0100421 // Use identity symbol if character is not in sigma
422 if !ok && mat.identity != -1 {
423 a = mat.identity
424 }
Akron1c34ce62021-09-23 23:27:39 +0200425 }
426
427 t0 = t
428
429 // Check for epsilon transitions and remember
430
Akron16c312e2021-09-26 13:11:12 +0200431 // TODO: Can t0 be negative here?
432 if mat.array[(mat.epsilon-1)*mat.stateCount+int(t0)] != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200433 // Remember state for backtracking to last tokenend state
Akron16c312e2021-09-26 13:11:12 +0200434
435 // Maybe not necessary - and should be simpler!
436 // Just Remove
437 t0 &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200438 epsilonState = t0
Akron98fbfef2021-10-23 17:02:11 +0200439 epsilonOffset = buffc
Akron16c312e2021-09-26 13:11:12 +0200440
441 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100442 log.Println("epsilonOffset is set to", buffc)
Akron16c312e2021-09-26 13:11:12 +0200443 }
Akron1c34ce62021-09-23 23:27:39 +0200444 }
445 }
446
447 // Checks a transition based on t0, a and buffo
448 t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
Akron1c34ce62021-09-23 23:27:39 +0200449
450 if DEBUG {
451 // Char is only relevant if set
Akron9c3bf7f2021-11-03 19:52:12 +0100452 log.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
Akron1c34ce62021-09-23 23:27:39 +0200453 }
454
Akrone396a932021-10-19 01:06:13 +0200455 // Check if the transition is invalid according to the matrix
Akron1c34ce62021-09-23 23:27:39 +0200456 if t == 0 {
457
458 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100459 log.Println("Match is not fine!")
Akron1c34ce62021-09-23 23:27:39 +0200460 }
461
462 if !ok && a == mat.identity {
463
464 // Try again with unknown symbol, in case identity failed
465 // Char is only relevant when set
466 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100467 log.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
Akron1c34ce62021-09-23 23:27:39 +0200468 }
469 a = mat.unknown
470
471 } else if a != mat.epsilon {
472
473 // Try again with epsilon symbol, in case everything else failed
474 t0 = epsilonState
475 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +0200476 buffc = epsilonOffset
Akron1c34ce62021-09-23 23:27:39 +0200477 a = mat.epsilon
478
479 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100480 log.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200481 }
482
483 } else {
484 break
485 }
486
487 newchar = false
Akrona854faa2021-10-22 19:31:08 +0200488 eot = false
Akron1c34ce62021-09-23 23:27:39 +0200489 continue
490 }
491
492 // Transition was successful
493 rewindBuffer = false
494
Akron90aa45b2021-11-16 23:28:17 +0100495 // Transition consumes no character
496 if a == mat.epsilon {
Akron1c34ce62021-09-23 23:27:39 +0200497 // Transition marks the end of a token - so flush the buffer
Akron98fbfef2021-10-23 17:02:11 +0200498 if buffc-bufft > 0 {
Akron1c34ce62021-09-23 23:27:39 +0200499 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100500 log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200501 }
Akron32416ce2021-10-23 17:09:41 +0200502 w.Token(bufft, buffer[:buffc])
Akron1c34ce62021-09-23 23:27:39 +0200503 rewindBuffer = true
Akron5c82a922021-09-24 19:11:29 +0200504 sentenceEnd = false
Akrona854faa2021-10-22 19:31:08 +0200505 textEnd = false
Akron5c82a922021-09-24 19:11:29 +0200506 } else {
507 sentenceEnd = true
Akron4f6b28c2021-10-25 00:52:03 +0200508 w.SentenceEnd(buffc)
Akron1c34ce62021-09-23 23:27:39 +0200509 }
Akron90aa45b2021-11-16 23:28:17 +0100510
511 // Transition consumes a character
512 } else {
513 buffc++
514
515 // Transition does not produce a character
516 if buffc-bufft == 1 && (t&FIRSTBIT) != 0 {
517 if DEBUG {
518 log.Println("Nontoken forward", showBufferNew(buffer, bufft, buffc, buffi))
519 }
520 bufft++
521 // rewindBuffer = true
522 }
Akron1c34ce62021-09-23 23:27:39 +0200523 }
524
Akron8cc2dd92021-10-25 19:49:41 +0200525 if eot {
526 eot = false
527 textEnd = true
528 w.TextEnd(buffc)
529 rewindBuffer = true
530 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100531 log.Println("END OF TEXT")
Akron8cc2dd92021-10-25 19:49:41 +0200532 }
533 }
534
Akron1c34ce62021-09-23 23:27:39 +0200535 // Rewind the buffer if necessary
536 if rewindBuffer {
537
Akron16c312e2021-09-26 13:11:12 +0200538 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100539 log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
Akron16c312e2021-09-26 13:11:12 +0200540 }
541
Akron22c565a2021-11-28 17:31:36 +0100542 // buffer = buffer[buffc:]
543 for x, i := range buffer[buffc:buffi] {
544 buffer[x] = i
545 }
Akron1c34ce62021-09-23 23:27:39 +0200546
Akron98fbfef2021-10-23 17:02:11 +0200547 buffi -= buffc
Akron16c312e2021-09-26 13:11:12 +0200548 // epsilonOffset -= buffo
549 epsilonOffset = 0
550 epsilonState = 0
551
Akron98fbfef2021-10-23 17:02:11 +0200552 buffc = 0
553 bufft = 0
Akrona854faa2021-10-22 19:31:08 +0200554
Akron98fbfef2021-10-23 17:02:11 +0200555 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100556 log.Println("Remaining:", showBufferNew(buffer, bufft, buffc, buffi))
Akrona854faa2021-10-22 19:31:08 +0200557 }
Akron1c34ce62021-09-23 23:27:39 +0200558 }
559
Akron16c312e2021-09-26 13:11:12 +0200560 t &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200561
562 newchar = true
563
564 // TODO:
565 // Prevent endless epsilon loops!
566 }
567
568 // Input reader is not yet finished
569 if !eof {
570 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100571 log.Println("Not at the end")
Akron1c34ce62021-09-23 23:27:39 +0200572 }
573 return false
574 }
575
576 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100577 log.Println("Entering final check")
Akron1c34ce62021-09-23 23:27:39 +0200578 }
Akron1c34ce62021-09-23 23:27:39 +0200579
Akrona854faa2021-10-22 19:31:08 +0200580 // Check epsilon transitions as long as possible
Akron1c34ce62021-09-23 23:27:39 +0200581 t0 = t
Akron1c34ce62021-09-23 23:27:39 +0200582 t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
583 a = mat.epsilon
584 newchar = false
Akron1c34ce62021-09-23 23:27:39 +0200585 // t can't be < 0
Akron16c312e2021-09-26 13:11:12 +0200586 if t != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200587 // Remember state for backtracking to last tokenend state
588 goto PARSECHARM
589
590 } else if epsilonState != 0 {
591 t0 = epsilonState
592 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +0200593 buffc = epsilonOffset
Akron1c34ce62021-09-23 23:27:39 +0200594 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100595 log.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200596 }
597 goto PARSECHARM
598 }
Akron1c34ce62021-09-23 23:27:39 +0200599
Akron5c82a922021-09-24 19:11:29 +0200600 // Add an additional sentence ending, if the file is over but no explicit
601 // sentence split was reached. This may be controversial and therefore
602 // optional via parameter.
603 if !sentenceEnd {
Akron4f6b28c2021-10-25 00:52:03 +0200604 w.SentenceEnd(buffc)
Akron5c82a922021-09-24 19:11:29 +0200605 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100606 log.Println("Sentence end")
Akrona854faa2021-10-22 19:31:08 +0200607 }
608 }
609
610 if !textEnd {
Akron4f6b28c2021-10-25 00:52:03 +0200611 w.TextEnd(buffc)
Akrona854faa2021-10-22 19:31:08 +0200612 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100613 log.Println("Text end")
Akron5c82a922021-09-24 19:11:29 +0200614 }
615 }
616
617 return true
Akron1c34ce62021-09-23 23:27:39 +0200618}