blob: ecb22be46ab1b8646653f0d1c0e8c3ab594095f7 [file] [log] [blame]
Akron1c34ce62021-09-23 23:27:39 +02001package datok
2
3import (
4 "bufio"
Akron16c312e2021-09-26 13:11:12 +02005 "compress/gzip"
Akron1c34ce62021-09-23 23:27:39 +02006 "io"
Akron16c312e2021-09-26 13:11:12 +02007 "log"
8 "os"
9)
10
// MAMAGIC is the magic string that opens every serialized matrix
// tokenizer. It is also reported as the tokenizer type.
const MAMAGIC = "MATOK"

// EOT is the code point that marks the end of a text
// in the input stream.
const EOT = 4
15
// MatrixTokenizer is a tokenizer whose transition function is
// stored as a flat matrix indexed by alphabet symbol and state.
type MatrixTokenizer struct {
	// sigma maps every known rune to its symbol number.
	sigma map[rune]int

	// sigmaASCII is a direct lookup table for all runes below 256.
	sigmaASCII [256]int

	// array holds the transition matrix in a single slice.
	array []uint32

	// stateCount is the row stride used to address array.
	stateCount int

	// Symbol numbers of the special symbols in sigma.
	epsilon  int
	unknown  int
	identity int
}
27
28// ToMatrix turns the intermediate tokenizer into a
29// matrix representation.
30func (auto *Automaton) ToMatrix() *MatrixTokenizer {
31
32 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +020033 sigma: make(map[rune]int),
34 unknown: auto.unknown,
35 identity: auto.identity,
36 epsilon: auto.epsilon,
Akron1c34ce62021-09-23 23:27:39 +020037 stateCount: auto.stateCount,
38 }
39
Akron28031b72021-10-02 13:07:25 +020040 max := 0
Akron1c34ce62021-09-23 23:27:39 +020041 for num, sym := range auto.sigmaRev {
42 if int(sym) < 256 {
43 mat.sigmaASCII[int(sym)] = num
44 }
45 mat.sigma[sym] = num
46 if num > auto.sigmaCount {
47 panic("sigmaCount is smaller")
48 }
Akron28031b72021-10-02 13:07:25 +020049 if num > max {
50 max = num
51 }
Akron1c34ce62021-09-23 23:27:39 +020052 }
Akron28031b72021-10-02 13:07:25 +020053 // Add final entry to the list (maybe not necessary actually)
54
Akron1c34ce62021-09-23 23:27:39 +020055 remember := make([]bool, auto.stateCount+2)
56
Akron28031b72021-10-02 13:07:25 +020057 // lower sigmaCount, as no final value exists
58 mat.array = make([]uint32, (auto.stateCount+1)*(max+1))
59
Akron1c34ce62021-09-23 23:27:39 +020060 // Store all transitions in matrix
Akron16c312e2021-09-26 13:11:12 +020061 var toMatrix func([]uint32, int)
Akron1c34ce62021-09-23 23:27:39 +020062
Akron16c312e2021-09-26 13:11:12 +020063 toMatrix = func(matrix []uint32, start int) {
Akron1c34ce62021-09-23 23:27:39 +020064 if start > auto.stateCount {
65 panic("stateCount is smaller")
66 }
67 if remember[start] {
68 return
69 }
70 remember[start] = true
71 for alpha, t := range auto.transitions[start] {
Akron16c312e2021-09-26 13:11:12 +020072 matrix[(alpha-1)*auto.stateCount+start] = uint32(t.end)
Akron1c34ce62021-09-23 23:27:39 +020073
74 // Mark nontoken transitions
75 if t.nontoken {
Akron16c312e2021-09-26 13:11:12 +020076 matrix[(alpha-1)*auto.stateCount+start] |= FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +020077 }
78
79 toMatrix(matrix, t.end)
80 }
81 }
82
83 toMatrix(mat.array, 1)
84
85 return mat
86}
87
Akron941f2152021-09-26 15:14:25 +020088// Type of tokenizer
89func (MatrixTokenizer) Type() string {
90 return MAMAGIC
91}
92
Akron16c312e2021-09-26 13:11:12 +020093// Save stores the matrix data in a file
94func (mat *MatrixTokenizer) Save(file string) (n int64, err error) {
95 f, err := os.Create(file)
96 if err != nil {
97 log.Println(err)
98 return 0, err
99 }
100 defer f.Close()
101 gz := gzip.NewWriter(f)
102 defer gz.Close()
103 n, err = mat.WriteTo(gz)
104 if err != nil {
105 log.Println(err)
106 return n, err
107 }
108 gz.Flush()
109 return n, nil
110}
111
112// WriteTo stores the matrix data in an io.Writer.
113func (mat *MatrixTokenizer) WriteTo(w io.Writer) (n int64, err error) {
114
115 wb := bufio.NewWriter(w)
116 defer wb.Flush()
117
118 // Store magical header
119 all, err := wb.Write([]byte(MAMAGIC))
120 if err != nil {
121 log.Println(err)
122 return int64(all), err
123 }
124
125 // Get sigma as a list
Akron28031b72021-10-02 13:07:25 +0200126 // In datok it's 16 - 4*4
127 sigmalist := make([]rune, len(mat.sigma)+16)
Akron16c312e2021-09-26 13:11:12 +0200128 max := 0
129 for sym, num := range mat.sigma {
130 sigmalist[num] = sym
131 if num > max {
132 max = num
133 }
134 }
135
Akron28031b72021-10-02 13:07:25 +0200136 // Add final entry to the list (maybe not necessary actually)
Akron16c312e2021-09-26 13:11:12 +0200137 sigmalist = sigmalist[:max+1]
138
Akron28031b72021-10-02 13:07:25 +0200139 buf := make([]byte, 0, 14)
Akron16c312e2021-09-26 13:11:12 +0200140 bo.PutUint16(buf[0:2], VERSION)
141 bo.PutUint16(buf[2:4], uint16(mat.epsilon))
142 bo.PutUint16(buf[4:6], uint16(mat.unknown))
143 bo.PutUint16(buf[6:8], uint16(mat.identity))
Akron28031b72021-10-02 13:07:25 +0200144 bo.PutUint32(buf[8:12], uint32(mat.stateCount))
145 bo.PutUint16(buf[12:14], uint16(len(sigmalist)))
146 more, err := wb.Write(buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200147 if err != nil {
148 log.Println(err)
149 return int64(all), err
150 }
151
152 all += more
153
154 // Write sigma
155 for _, sym := range sigmalist {
156
157 more, err = wb.WriteRune(sym)
158 if err != nil {
159 log.Println(err)
160 return int64(all), err
161 }
162 all += more
163 }
164
165 if err != nil {
166 log.Println(err)
167 return int64(all), err
168 }
169
170 // Test marker - could be checksum
171 more, err = wb.Write([]byte("M"))
172 if err != nil {
173 log.Println(err)
174 return int64(all), err
175 }
176 all += more
177
Akron16c312e2021-09-26 13:11:12 +0200178 for _, x := range mat.array {
179 bo.PutUint32(buf[0:4], uint32(x))
180 more, err = wb.Write(buf[0:4])
181 if err != nil {
182 log.Println(err)
183 return int64(all), err
184 }
185 all += more
186 if more != 4 {
187 log.Println("Can not write base uint32")
188 return int64(all), err
189 }
Akron16c312e2021-09-26 13:11:12 +0200190 }
191
192 return int64(all), err
193}
194
195// LoadDatokFile reads a double array represented tokenizer
196// from a file.
197func LoadMatrixFile(file string) *MatrixTokenizer {
198 f, err := os.Open(file)
199 if err != nil {
200 log.Println(err)
201 return nil
202 }
203 defer f.Close()
204
205 gz, err := gzip.NewReader(f)
206 if err != nil {
207 log.Println(err)
208 return nil
209 }
210 defer gz.Close()
211
212 // Todo: Read the whole file!
213 return ParseMatrix(gz)
214}
215
216// LoadMatrixFile reads a matrix represented tokenizer
217// from an io.Reader
218func ParseMatrix(ior io.Reader) *MatrixTokenizer {
219
220 // Initialize tokenizer with default values
221 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +0200222 sigma: make(map[rune]int),
223 epsilon: 0,
224 unknown: 0,
225 identity: 0,
Akron16c312e2021-09-26 13:11:12 +0200226 stateCount: 0,
Akron16c312e2021-09-26 13:11:12 +0200227 }
228
229 r := bufio.NewReader(ior)
230
231 buf := make([]byte, 1024)
232 buf = buf[0:len(MAMAGIC)]
233
234 _, err := r.Read(buf)
235
236 if err != nil {
237 log.Println(err)
238 return nil
239 }
240
241 if string(MAMAGIC) != string(buf) {
242 log.Println("Not a matok file")
243 return nil
244 }
245
Akron28031b72021-10-02 13:07:25 +0200246 more, err := io.ReadFull(r, buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200247 if err != nil {
248 log.Println(err)
249 return nil
250 }
251
Akron28031b72021-10-02 13:07:25 +0200252 if more != 14 {
Akron16c312e2021-09-26 13:11:12 +0200253 log.Println("Read bytes do not fit")
254 return nil
255 }
256
257 version := bo.Uint16(buf[0:2])
258
259 if version != VERSION {
260 log.Println("Version not compatible")
261 return nil
262 }
263
264 mat.epsilon = int(bo.Uint16(buf[2:4]))
265 mat.unknown = int(bo.Uint16(buf[4:6]))
266 mat.identity = int(bo.Uint16(buf[6:8]))
Akron28031b72021-10-02 13:07:25 +0200267 mat.stateCount = int(bo.Uint32(buf[8:12]))
268 sigmaCount := int(bo.Uint16(buf[12:14]))
269 arraySize := (mat.stateCount + 1) * sigmaCount
Akron16c312e2021-09-26 13:11:12 +0200270
Akron16c312e2021-09-26 13:11:12 +0200271 for x := 0; x < sigmaCount; x++ {
272 sym, _, err := r.ReadRune()
273 if err == nil && sym != 0 {
274 if int(sym) < 256 {
275 mat.sigmaASCII[int(sym)] = x
276 }
277 mat.sigma[sym] = x
278 }
279 }
280
281 _, err = io.ReadFull(r, buf[0:1])
282
283 if err != nil {
284 log.Print(err)
285 return nil
286 }
287
288 if string("M") != string(buf[0:1]) {
289 log.Println("Not a matok file")
290 return nil
291 }
292
293 // Read based on length
294 mat.array = make([]uint32, arraySize)
295
296 dataArray, err := io.ReadAll(r)
297
298 if err == io.EOF {
299 log.Println(err)
300 return nil
301 }
302
303 if len(dataArray) < arraySize*4 {
Akron28031b72021-10-02 13:07:25 +0200304 log.Println("Not enough bytes read", len(dataArray), arraySize*4)
Akron16c312e2021-09-26 13:11:12 +0200305 return nil
306 }
307
308 for x := 0; x < arraySize; x++ {
Akron16c312e2021-09-26 13:11:12 +0200309 mat.array[x] = bo.Uint32(dataArray[x*4 : (x*4)+4])
310 }
311
312 return mat
313}
314
Akron98fbfef2021-10-23 17:02:11 +0200315// Transduce input to ouutput
Akron1c34ce62021-09-23 23:27:39 +0200316func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron96fdc9b2021-10-27 21:11:17 +0200317 return mat.TransduceTokenWriter(r, NewTokenWriter(w, SIMPLE))
Akrone396a932021-10-19 01:06:13 +0200318}
319
Akron98fbfef2021-10-23 17:02:11 +0200320// TransduceTokenWriter transduces an input string against
321// the matrix FSA. The rules are always greedy. If the
322// automaton fails, it takes the last possible token ending
323// branch.
Akron4f6b28c2021-10-25 00:52:03 +0200324func (mat *MatrixTokenizer) TransduceTokenWriter(r io.Reader, w *TokenWriter) bool {
Akron1c34ce62021-09-23 23:27:39 +0200325 var a int
Akron16c312e2021-09-26 13:11:12 +0200326 var t0 uint32
327 t := uint32(1) // Initial state
Akron1c34ce62021-09-23 23:27:39 +0200328 var ok, rewindBuffer bool
329
330 // Remember the last position of a possible tokenend,
331 // in case the automaton fails.
Akron16c312e2021-09-26 13:11:12 +0200332 epsilonState := uint32(0)
Akron1c34ce62021-09-23 23:27:39 +0200333 epsilonOffset := 0
334
Akron5c82a922021-09-24 19:11:29 +0200335 // Remember if the last transition was epsilon
336 sentenceEnd := false
337
Akrona854faa2021-10-22 19:31:08 +0200338 // Remember if a text end was already set
339 textEnd := false
340
Akron1c34ce62021-09-23 23:27:39 +0200341 buffer := make([]rune, 1024)
Akron98fbfef2021-10-23 17:02:11 +0200342 bufft := 0 // Buffer token offset
343 buffc := 0 // Buffer current symbol
Akron1c34ce62021-09-23 23:27:39 +0200344 buffi := 0 // Buffer length
345
Akron98fbfef2021-10-23 17:02:11 +0200346 // The buffer is organized as follows:
347 // [ t[....c..]..i]
348
Akron1c34ce62021-09-23 23:27:39 +0200349 reader := bufio.NewReader(r)
Akrone396a932021-10-19 01:06:13 +0200350 defer w.Flush()
Akron1c34ce62021-09-23 23:27:39 +0200351
352 var char rune
353
354 var err error
355 eof := false
Akrona854faa2021-10-22 19:31:08 +0200356 eot := false
Akron1c34ce62021-09-23 23:27:39 +0200357 newchar := true
358
359PARSECHARM:
360 for {
361
362 if newchar {
363 // Get from reader if buffer is empty
Akron98fbfef2021-10-23 17:02:11 +0200364 if buffc >= buffi {
Akron1c34ce62021-09-23 23:27:39 +0200365 if eof {
366 break
367 }
368 char, _, err = reader.ReadRune()
369
370 // No more runes to read
371 if err != nil {
Akron274600e2021-11-03 20:09:06 +0100372 if err == io.EOF {
373 eof = true
374 break
375 }
376
377 log.Fatalln(err)
378 os.Exit(1)
379 return false
Akron1c34ce62021-09-23 23:27:39 +0200380 }
Akron274600e2021-11-03 20:09:06 +0100381
Akron1c34ce62021-09-23 23:27:39 +0200382 buffer[buffi] = char
383 buffi++
384 }
385
Akron98fbfef2021-10-23 17:02:11 +0200386 char = buffer[buffc]
Akron1c34ce62021-09-23 23:27:39 +0200387
388 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100389 log.Println("Current char", string(char), int(char), showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200390 }
391
Akrona854faa2021-10-22 19:31:08 +0200392 eot = false
393
Akron1c34ce62021-09-23 23:27:39 +0200394 // TODO:
395 // Better not repeatedly check for a!
396 // Possibly keep a buffer with a.
397 if int(char) < 256 {
Akrona854faa2021-10-22 19:31:08 +0200398 if int(char) == EOT {
399 eot = true
400 }
Akron1c34ce62021-09-23 23:27:39 +0200401 a = mat.sigmaASCII[int(char)]
402 } else {
403 a, ok = mat.sigma[char]
404 if !ok {
405 a = 0
406 }
407 }
408
409 // Use identity symbol if character is not in sigma
410 if a == 0 && mat.identity != -1 {
411 a = mat.identity
412 }
413
414 t0 = t
415
416 // Check for epsilon transitions and remember
417
Akron16c312e2021-09-26 13:11:12 +0200418 // TODO: Can t0 be negative here?
419 if mat.array[(mat.epsilon-1)*mat.stateCount+int(t0)] != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200420 // Remember state for backtracking to last tokenend state
Akron16c312e2021-09-26 13:11:12 +0200421
422 // Maybe not necessary - and should be simpler!
423 // Just Remove
424 t0 &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200425 epsilonState = t0
Akron98fbfef2021-10-23 17:02:11 +0200426 epsilonOffset = buffc
Akron16c312e2021-09-26 13:11:12 +0200427
428 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100429 log.Println("epsilonOffset is set to", buffc)
Akron16c312e2021-09-26 13:11:12 +0200430 }
Akron1c34ce62021-09-23 23:27:39 +0200431 }
432 }
433
434 // Checks a transition based on t0, a and buffo
435 t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
Akron1c34ce62021-09-23 23:27:39 +0200436
437 if DEBUG {
438 // Char is only relevant if set
Akron9c3bf7f2021-11-03 19:52:12 +0100439 log.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
Akron1c34ce62021-09-23 23:27:39 +0200440 }
441
Akrone396a932021-10-19 01:06:13 +0200442 // Check if the transition is invalid according to the matrix
Akron1c34ce62021-09-23 23:27:39 +0200443 if t == 0 {
444
445 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100446 log.Println("Match is not fine!")
Akron1c34ce62021-09-23 23:27:39 +0200447 }
448
449 if !ok && a == mat.identity {
450
451 // Try again with unknown symbol, in case identity failed
452 // Char is only relevant when set
453 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100454 log.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
Akron1c34ce62021-09-23 23:27:39 +0200455 }
456 a = mat.unknown
457
458 } else if a != mat.epsilon {
459
460 // Try again with epsilon symbol, in case everything else failed
461 t0 = epsilonState
462 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +0200463 buffc = epsilonOffset
Akron1c34ce62021-09-23 23:27:39 +0200464 a = mat.epsilon
465
466 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100467 log.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200468 }
469
470 } else {
471 break
472 }
473
474 newchar = false
Akrona854faa2021-10-22 19:31:08 +0200475 eot = false
Akron1c34ce62021-09-23 23:27:39 +0200476 continue
477 }
478
479 // Transition was successful
480 rewindBuffer = false
481
482 // Transition consumes a character
483 if a != mat.epsilon {
484
Akron98fbfef2021-10-23 17:02:11 +0200485 buffc++
Akron1c34ce62021-09-23 23:27:39 +0200486
487 // Transition does not produce a character
Akron98fbfef2021-10-23 17:02:11 +0200488 if buffc-bufft == 1 && (t&FIRSTBIT) != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200489 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100490 log.Println("Nontoken forward", showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200491 }
Akron98fbfef2021-10-23 17:02:11 +0200492 bufft++
493 // rewindBuffer = true
Akron1c34ce62021-09-23 23:27:39 +0200494 }
495
496 } else {
497 // Transition marks the end of a token - so flush the buffer
Akron98fbfef2021-10-23 17:02:11 +0200498 if buffc-bufft > 0 {
Akron1c34ce62021-09-23 23:27:39 +0200499 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100500 log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200501 }
Akron32416ce2021-10-23 17:09:41 +0200502 w.Token(bufft, buffer[:buffc])
Akron1c34ce62021-09-23 23:27:39 +0200503 rewindBuffer = true
Akron5c82a922021-09-24 19:11:29 +0200504 sentenceEnd = false
Akrona854faa2021-10-22 19:31:08 +0200505 textEnd = false
Akron5c82a922021-09-24 19:11:29 +0200506 } else {
507 sentenceEnd = true
Akron4f6b28c2021-10-25 00:52:03 +0200508 w.SentenceEnd(buffc)
Akron1c34ce62021-09-23 23:27:39 +0200509 }
Akron1c34ce62021-09-23 23:27:39 +0200510 }
511
Akron8cc2dd92021-10-25 19:49:41 +0200512 if eot {
513 eot = false
514 textEnd = true
515 w.TextEnd(buffc)
516 rewindBuffer = true
517 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100518 log.Println("END OF TEXT")
Akron8cc2dd92021-10-25 19:49:41 +0200519 }
520 }
521
Akron1c34ce62021-09-23 23:27:39 +0200522 // Rewind the buffer if necessary
523 if rewindBuffer {
524
Akron16c312e2021-09-26 13:11:12 +0200525 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100526 log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
Akron16c312e2021-09-26 13:11:12 +0200527 }
528
Akron1c34ce62021-09-23 23:27:39 +0200529 // TODO: Better as a ring buffer
Akron9c3bf7f2021-11-03 19:52:12 +0100530 // buffer = buffer[buffc:] !slower
Akron98fbfef2021-10-23 17:02:11 +0200531 for x, i := range buffer[buffc:buffi] {
Akron1c34ce62021-09-23 23:27:39 +0200532 buffer[x] = i
533 }
534
Akron98fbfef2021-10-23 17:02:11 +0200535 buffi -= buffc
Akron16c312e2021-09-26 13:11:12 +0200536 // epsilonOffset -= buffo
537 epsilonOffset = 0
538 epsilonState = 0
539
Akron98fbfef2021-10-23 17:02:11 +0200540 buffc = 0
541 bufft = 0
Akrona854faa2021-10-22 19:31:08 +0200542
Akron98fbfef2021-10-23 17:02:11 +0200543 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100544 log.Println("Remaining:", showBufferNew(buffer, bufft, buffc, buffi))
Akrona854faa2021-10-22 19:31:08 +0200545 }
Akron1c34ce62021-09-23 23:27:39 +0200546 }
547
Akron16c312e2021-09-26 13:11:12 +0200548 t &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200549
550 newchar = true
551
552 // TODO:
553 // Prevent endless epsilon loops!
554 }
555
556 // Input reader is not yet finished
557 if !eof {
558 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100559 log.Println("Not at the end")
Akron1c34ce62021-09-23 23:27:39 +0200560 }
561 return false
562 }
563
564 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100565 log.Println("Entering final check")
Akron1c34ce62021-09-23 23:27:39 +0200566 }
Akron1c34ce62021-09-23 23:27:39 +0200567
Akrona854faa2021-10-22 19:31:08 +0200568 // Check epsilon transitions as long as possible
Akron1c34ce62021-09-23 23:27:39 +0200569 t0 = t
Akron1c34ce62021-09-23 23:27:39 +0200570 t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
571 a = mat.epsilon
572 newchar = false
Akron1c34ce62021-09-23 23:27:39 +0200573 // t can't be < 0
Akron16c312e2021-09-26 13:11:12 +0200574 if t != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200575 // Remember state for backtracking to last tokenend state
576 goto PARSECHARM
577
578 } else if epsilonState != 0 {
579 t0 = epsilonState
580 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +0200581 buffc = epsilonOffset
Akron1c34ce62021-09-23 23:27:39 +0200582 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100583 log.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200584 }
585 goto PARSECHARM
586 }
Akron1c34ce62021-09-23 23:27:39 +0200587
Akron5c82a922021-09-24 19:11:29 +0200588 // Add an additional sentence ending, if the file is over but no explicit
589 // sentence split was reached. This may be controversial and therefore
590 // optional via parameter.
591 if !sentenceEnd {
Akron4f6b28c2021-10-25 00:52:03 +0200592 w.SentenceEnd(buffc)
Akron5c82a922021-09-24 19:11:29 +0200593 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100594 log.Println("Sentence end")
Akrona854faa2021-10-22 19:31:08 +0200595 }
596 }
597
598 if !textEnd {
Akron4f6b28c2021-10-25 00:52:03 +0200599 w.TextEnd(buffc)
Akrona854faa2021-10-22 19:31:08 +0200600
601 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100602 log.Println("Text end")
Akron5c82a922021-09-24 19:11:29 +0200603 }
604 }
605
606 return true
Akron1c34ce62021-09-23 23:27:39 +0200607}