blob: 565de87665efc762f49030ad1dd88ac5ea4efc7a [file] [log] [blame]
Akron1c34ce62021-09-23 23:27:39 +02001package datok
2
3import (
4 "bufio"
Akron16c312e2021-09-26 13:11:12 +02005 "compress/gzip"
Akron1c34ce62021-09-23 23:27:39 +02006 "io"
Akron16c312e2021-09-26 13:11:12 +02007 "log"
8 "os"
9)
10
// MAMAGIC is the magic header written at the start of a serialized
// matrix tokenizer and returned by MatrixTokenizer.Type.
const MAMAGIC = "MATOK"

// EOT is the code point (ASCII 4) that is treated as an explicit
// end-of-text marker while transducing.
const EOT = 4
15
// MatrixTokenizer is a tokenizer whose transition function is stored
// as a flat matrix of states and symbols.
type MatrixTokenizer struct {
	// sigma maps every known rune to its symbol number.
	sigma map[rune]int

	// sigmaASCII is a direct lookup table from runes below 256
	// to their symbol number.
	sigmaASCII [256]int

	// array holds the transitions; the cell for symbol a and state s
	// is located at (a-1)*stateCount+s.
	array []uint32

	// stateCount is the number of states of the automaton.
	stateCount int

	// Special symbols in sigma
	epsilon, unknown, identity int
}
27
28// ToMatrix turns the intermediate tokenizer into a
29// matrix representation.
30func (auto *Automaton) ToMatrix() *MatrixTokenizer {
31
32 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +020033 sigma: make(map[rune]int),
34 unknown: auto.unknown,
35 identity: auto.identity,
36 epsilon: auto.epsilon,
Akron1c34ce62021-09-23 23:27:39 +020037 stateCount: auto.stateCount,
38 }
39
Akron4880fb62021-12-05 12:03:05 +010040 for i := 0; i < 256; i++ {
41 mat.sigmaASCII[i] = mat.identity
42 }
43
Akron28031b72021-10-02 13:07:25 +020044 max := 0
Akron1c34ce62021-09-23 23:27:39 +020045 for num, sym := range auto.sigmaRev {
46 if int(sym) < 256 {
47 mat.sigmaASCII[int(sym)] = num
48 }
49 mat.sigma[sym] = num
50 if num > auto.sigmaCount {
51 panic("sigmaCount is smaller")
52 }
Akron28031b72021-10-02 13:07:25 +020053 if num > max {
54 max = num
55 }
Akron1c34ce62021-09-23 23:27:39 +020056 }
Akron28031b72021-10-02 13:07:25 +020057 // Add final entry to the list (maybe not necessary actually)
58
Akron1c34ce62021-09-23 23:27:39 +020059 remember := make([]bool, auto.stateCount+2)
60
Akron28031b72021-10-02 13:07:25 +020061 // lower sigmaCount, as no final value exists
62 mat.array = make([]uint32, (auto.stateCount+1)*(max+1))
63
Akron1c34ce62021-09-23 23:27:39 +020064 // Store all transitions in matrix
Akron16c312e2021-09-26 13:11:12 +020065 var toMatrix func([]uint32, int)
Akron1c34ce62021-09-23 23:27:39 +020066
Akron16c312e2021-09-26 13:11:12 +020067 toMatrix = func(matrix []uint32, start int) {
Akron1c34ce62021-09-23 23:27:39 +020068 if start > auto.stateCount {
69 panic("stateCount is smaller")
70 }
71 if remember[start] {
72 return
73 }
74 remember[start] = true
75 for alpha, t := range auto.transitions[start] {
Akron16c312e2021-09-26 13:11:12 +020076 matrix[(alpha-1)*auto.stateCount+start] = uint32(t.end)
Akron1c34ce62021-09-23 23:27:39 +020077
78 // Mark nontoken transitions
79 if t.nontoken {
Akron16c312e2021-09-26 13:11:12 +020080 matrix[(alpha-1)*auto.stateCount+start] |= FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +020081 }
82
83 toMatrix(matrix, t.end)
84 }
85 }
86
87 toMatrix(mat.array, 1)
88
89 return mat
90}
91
Akron941f2152021-09-26 15:14:25 +020092// Type of tokenizer
93func (MatrixTokenizer) Type() string {
94 return MAMAGIC
95}
96
Akron16c312e2021-09-26 13:11:12 +020097// Save stores the matrix data in a file
98func (mat *MatrixTokenizer) Save(file string) (n int64, err error) {
99 f, err := os.Create(file)
100 if err != nil {
101 log.Println(err)
102 return 0, err
103 }
104 defer f.Close()
105 gz := gzip.NewWriter(f)
106 defer gz.Close()
107 n, err = mat.WriteTo(gz)
108 if err != nil {
109 log.Println(err)
110 return n, err
111 }
112 gz.Flush()
113 return n, nil
114}
115
116// WriteTo stores the matrix data in an io.Writer.
117func (mat *MatrixTokenizer) WriteTo(w io.Writer) (n int64, err error) {
118
119 wb := bufio.NewWriter(w)
120 defer wb.Flush()
121
122 // Store magical header
123 all, err := wb.Write([]byte(MAMAGIC))
124 if err != nil {
125 log.Println(err)
126 return int64(all), err
127 }
128
129 // Get sigma as a list
Akron28031b72021-10-02 13:07:25 +0200130 // In datok it's 16 - 4*4
131 sigmalist := make([]rune, len(mat.sigma)+16)
Akron16c312e2021-09-26 13:11:12 +0200132 max := 0
133 for sym, num := range mat.sigma {
134 sigmalist[num] = sym
135 if num > max {
136 max = num
137 }
138 }
139
Akron28031b72021-10-02 13:07:25 +0200140 // Add final entry to the list (maybe not necessary actually)
Akron16c312e2021-09-26 13:11:12 +0200141 sigmalist = sigmalist[:max+1]
142
Akron28031b72021-10-02 13:07:25 +0200143 buf := make([]byte, 0, 14)
Akron16c312e2021-09-26 13:11:12 +0200144 bo.PutUint16(buf[0:2], VERSION)
145 bo.PutUint16(buf[2:4], uint16(mat.epsilon))
146 bo.PutUint16(buf[4:6], uint16(mat.unknown))
147 bo.PutUint16(buf[6:8], uint16(mat.identity))
Akron28031b72021-10-02 13:07:25 +0200148 bo.PutUint32(buf[8:12], uint32(mat.stateCount))
149 bo.PutUint16(buf[12:14], uint16(len(sigmalist)))
150 more, err := wb.Write(buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200151 if err != nil {
152 log.Println(err)
153 return int64(all), err
154 }
155
156 all += more
157
158 // Write sigma
159 for _, sym := range sigmalist {
160
161 more, err = wb.WriteRune(sym)
162 if err != nil {
163 log.Println(err)
164 return int64(all), err
165 }
166 all += more
167 }
168
169 if err != nil {
170 log.Println(err)
171 return int64(all), err
172 }
173
174 // Test marker - could be checksum
175 more, err = wb.Write([]byte("M"))
176 if err != nil {
177 log.Println(err)
178 return int64(all), err
179 }
180 all += more
181
Akron16c312e2021-09-26 13:11:12 +0200182 for _, x := range mat.array {
183 bo.PutUint32(buf[0:4], uint32(x))
184 more, err = wb.Write(buf[0:4])
185 if err != nil {
186 log.Println(err)
187 return int64(all), err
188 }
189 all += more
190 if more != 4 {
191 log.Println("Can not write base uint32")
192 return int64(all), err
193 }
Akron16c312e2021-09-26 13:11:12 +0200194 }
195
196 return int64(all), err
197}
198
199// LoadDatokFile reads a double array represented tokenizer
200// from a file.
201func LoadMatrixFile(file string) *MatrixTokenizer {
202 f, err := os.Open(file)
203 if err != nil {
204 log.Println(err)
205 return nil
206 }
207 defer f.Close()
208
209 gz, err := gzip.NewReader(f)
210 if err != nil {
211 log.Println(err)
212 return nil
213 }
214 defer gz.Close()
215
216 // Todo: Read the whole file!
217 return ParseMatrix(gz)
218}
219
220// LoadMatrixFile reads a matrix represented tokenizer
221// from an io.Reader
222func ParseMatrix(ior io.Reader) *MatrixTokenizer {
223
224 // Initialize tokenizer with default values
225 mat := &MatrixTokenizer{
Akron28031b72021-10-02 13:07:25 +0200226 sigma: make(map[rune]int),
227 epsilon: 0,
228 unknown: 0,
229 identity: 0,
Akron16c312e2021-09-26 13:11:12 +0200230 stateCount: 0,
Akron16c312e2021-09-26 13:11:12 +0200231 }
232
233 r := bufio.NewReader(ior)
234
235 buf := make([]byte, 1024)
236 buf = buf[0:len(MAMAGIC)]
237
238 _, err := r.Read(buf)
239
240 if err != nil {
241 log.Println(err)
242 return nil
243 }
244
245 if string(MAMAGIC) != string(buf) {
246 log.Println("Not a matok file")
247 return nil
248 }
249
Akron28031b72021-10-02 13:07:25 +0200250 more, err := io.ReadFull(r, buf[0:14])
Akron16c312e2021-09-26 13:11:12 +0200251 if err != nil {
252 log.Println(err)
253 return nil
254 }
255
Akron28031b72021-10-02 13:07:25 +0200256 if more != 14 {
Akron16c312e2021-09-26 13:11:12 +0200257 log.Println("Read bytes do not fit")
258 return nil
259 }
260
261 version := bo.Uint16(buf[0:2])
262
263 if version != VERSION {
264 log.Println("Version not compatible")
265 return nil
266 }
267
268 mat.epsilon = int(bo.Uint16(buf[2:4]))
269 mat.unknown = int(bo.Uint16(buf[4:6]))
270 mat.identity = int(bo.Uint16(buf[6:8]))
Akron28031b72021-10-02 13:07:25 +0200271 mat.stateCount = int(bo.Uint32(buf[8:12]))
272 sigmaCount := int(bo.Uint16(buf[12:14]))
273 arraySize := (mat.stateCount + 1) * sigmaCount
Akron16c312e2021-09-26 13:11:12 +0200274
Akron16c312e2021-09-26 13:11:12 +0200275 for x := 0; x < sigmaCount; x++ {
276 sym, _, err := r.ReadRune()
277 if err == nil && sym != 0 {
278 if int(sym) < 256 {
279 mat.sigmaASCII[int(sym)] = x
280 }
281 mat.sigma[sym] = x
282 }
283 }
284
285 _, err = io.ReadFull(r, buf[0:1])
286
287 if err != nil {
288 log.Print(err)
289 return nil
290 }
291
292 if string("M") != string(buf[0:1]) {
293 log.Println("Not a matok file")
294 return nil
295 }
296
297 // Read based on length
298 mat.array = make([]uint32, arraySize)
299
300 dataArray, err := io.ReadAll(r)
301
302 if err == io.EOF {
303 log.Println(err)
304 return nil
305 }
306
307 if len(dataArray) < arraySize*4 {
Akron28031b72021-10-02 13:07:25 +0200308 log.Println("Not enough bytes read", len(dataArray), arraySize*4)
Akron16c312e2021-09-26 13:11:12 +0200309 return nil
310 }
311
312 for x := 0; x < arraySize; x++ {
Akron16c312e2021-09-26 13:11:12 +0200313 mat.array[x] = bo.Uint32(dataArray[x*4 : (x*4)+4])
314 }
315
316 return mat
317}
318
Akron98fbfef2021-10-23 17:02:11 +0200319// Transduce input to ouutput
Akron1c34ce62021-09-23 23:27:39 +0200320func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron96fdc9b2021-10-27 21:11:17 +0200321 return mat.TransduceTokenWriter(r, NewTokenWriter(w, SIMPLE))
Akrone396a932021-10-19 01:06:13 +0200322}
323
Akron98fbfef2021-10-23 17:02:11 +0200324// TransduceTokenWriter transduces an input string against
325// the matrix FSA. The rules are always greedy. If the
326// automaton fails, it takes the last possible token ending
327// branch.
Akron4f6b28c2021-10-25 00:52:03 +0200328func (mat *MatrixTokenizer) TransduceTokenWriter(r io.Reader, w *TokenWriter) bool {
Akron1c34ce62021-09-23 23:27:39 +0200329 var a int
Akron16c312e2021-09-26 13:11:12 +0200330 var t0 uint32
331 t := uint32(1) // Initial state
Akron1c34ce62021-09-23 23:27:39 +0200332 var ok, rewindBuffer bool
333
334 // Remember the last position of a possible tokenend,
335 // in case the automaton fails.
Akron16c312e2021-09-26 13:11:12 +0200336 epsilonState := uint32(0)
Akron1c34ce62021-09-23 23:27:39 +0200337 epsilonOffset := 0
338
Akron5c82a922021-09-24 19:11:29 +0200339 // Remember if the last transition was epsilon
340 sentenceEnd := false
341
Akrona854faa2021-10-22 19:31:08 +0200342 // Remember if a text end was already set
343 textEnd := false
344
Akron1c34ce62021-09-23 23:27:39 +0200345 buffer := make([]rune, 1024)
Akron98fbfef2021-10-23 17:02:11 +0200346 bufft := 0 // Buffer token offset
347 buffc := 0 // Buffer current symbol
Akron1c34ce62021-09-23 23:27:39 +0200348 buffi := 0 // Buffer length
349
Akron98fbfef2021-10-23 17:02:11 +0200350 // The buffer is organized as follows:
351 // [ t[....c..]..i]
352
Akron1c34ce62021-09-23 23:27:39 +0200353 reader := bufio.NewReader(r)
Akrone396a932021-10-19 01:06:13 +0200354 defer w.Flush()
Akron1c34ce62021-09-23 23:27:39 +0200355
356 var char rune
357
358 var err error
359 eof := false
Akrona854faa2021-10-22 19:31:08 +0200360 eot := false
Akron1c34ce62021-09-23 23:27:39 +0200361 newchar := true
362
363PARSECHARM:
364 for {
365
366 if newchar {
367 // Get from reader if buffer is empty
Akron98fbfef2021-10-23 17:02:11 +0200368 if buffc >= buffi {
Akron1c34ce62021-09-23 23:27:39 +0200369 if eof {
370 break
371 }
372 char, _, err = reader.ReadRune()
373
374 // No more runes to read
375 if err != nil {
Akron274600e2021-11-03 20:09:06 +0100376 if err == io.EOF {
377 eof = true
378 break
379 }
380
381 log.Fatalln(err)
382 os.Exit(1)
383 return false
Akron1c34ce62021-09-23 23:27:39 +0200384 }
Akron274600e2021-11-03 20:09:06 +0100385
Akron1c34ce62021-09-23 23:27:39 +0200386 buffer[buffi] = char
387 buffi++
388 }
389
Akron98fbfef2021-10-23 17:02:11 +0200390 char = buffer[buffc]
Akron1c34ce62021-09-23 23:27:39 +0200391
392 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100393 log.Println("Current char", string(char), int(char), showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200394 }
395
Akrona854faa2021-10-22 19:31:08 +0200396 eot = false
397
Akron1c34ce62021-09-23 23:27:39 +0200398 // TODO:
399 // Better not repeatedly check for a!
400 // Possibly keep a buffer with a.
401 if int(char) < 256 {
Akrona854faa2021-10-22 19:31:08 +0200402 if int(char) == EOT {
403 eot = true
404 }
Akron1c34ce62021-09-23 23:27:39 +0200405 a = mat.sigmaASCII[int(char)]
Akron4880fb62021-12-05 12:03:05 +0100406
407 if a == 0 && mat.identity != -1 {
408 a = mat.identity
409 }
410
Akron1c34ce62021-09-23 23:27:39 +0200411 } else {
412 a, ok = mat.sigma[char]
Akron1c34ce62021-09-23 23:27:39 +0200413
Akron4880fb62021-12-05 12:03:05 +0100414 // Use identity symbol if character is not in sigma
415 if !ok && mat.identity != -1 {
416 a = mat.identity
417 }
Akron1c34ce62021-09-23 23:27:39 +0200418 }
419
420 t0 = t
421
422 // Check for epsilon transitions and remember
423
Akron16c312e2021-09-26 13:11:12 +0200424 // TODO: Can t0 be negative here?
425 if mat.array[(mat.epsilon-1)*mat.stateCount+int(t0)] != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200426 // Remember state for backtracking to last tokenend state
Akron16c312e2021-09-26 13:11:12 +0200427
428 // Maybe not necessary - and should be simpler!
429 // Just Remove
430 t0 &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200431 epsilonState = t0
Akron98fbfef2021-10-23 17:02:11 +0200432 epsilonOffset = buffc
Akron16c312e2021-09-26 13:11:12 +0200433
434 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100435 log.Println("epsilonOffset is set to", buffc)
Akron16c312e2021-09-26 13:11:12 +0200436 }
Akron1c34ce62021-09-23 23:27:39 +0200437 }
438 }
439
440 // Checks a transition based on t0, a and buffo
441 t = mat.array[(int(a)-1)*mat.stateCount+int(t0)]
Akron1c34ce62021-09-23 23:27:39 +0200442
443 if DEBUG {
444 // Char is only relevant if set
Akron9c3bf7f2021-11-03 19:52:12 +0100445 log.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
Akron1c34ce62021-09-23 23:27:39 +0200446 }
447
Akrone396a932021-10-19 01:06:13 +0200448 // Check if the transition is invalid according to the matrix
Akron1c34ce62021-09-23 23:27:39 +0200449 if t == 0 {
450
451 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100452 log.Println("Match is not fine!")
Akron1c34ce62021-09-23 23:27:39 +0200453 }
454
455 if !ok && a == mat.identity {
456
457 // Try again with unknown symbol, in case identity failed
458 // Char is only relevant when set
459 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100460 log.Println("UNKNOWN symbol", string(char), "->", mat.unknown)
Akron1c34ce62021-09-23 23:27:39 +0200461 }
462 a = mat.unknown
463
464 } else if a != mat.epsilon {
465
466 // Try again with epsilon symbol, in case everything else failed
467 t0 = epsilonState
468 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +0200469 buffc = epsilonOffset
Akron1c34ce62021-09-23 23:27:39 +0200470 a = mat.epsilon
471
472 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100473 log.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200474 }
475
476 } else {
477 break
478 }
479
480 newchar = false
Akrona854faa2021-10-22 19:31:08 +0200481 eot = false
Akron1c34ce62021-09-23 23:27:39 +0200482 continue
483 }
484
485 // Transition was successful
486 rewindBuffer = false
487
Akron90aa45b2021-11-16 23:28:17 +0100488 // Transition consumes no character
489 if a == mat.epsilon {
Akron1c34ce62021-09-23 23:27:39 +0200490 // Transition marks the end of a token - so flush the buffer
Akron98fbfef2021-10-23 17:02:11 +0200491 if buffc-bufft > 0 {
Akron1c34ce62021-09-23 23:27:39 +0200492 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100493 log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200494 }
Akron32416ce2021-10-23 17:09:41 +0200495 w.Token(bufft, buffer[:buffc])
Akron1c34ce62021-09-23 23:27:39 +0200496 rewindBuffer = true
Akron5c82a922021-09-24 19:11:29 +0200497 sentenceEnd = false
Akrona854faa2021-10-22 19:31:08 +0200498 textEnd = false
Akron5c82a922021-09-24 19:11:29 +0200499 } else {
500 sentenceEnd = true
Akron4f6b28c2021-10-25 00:52:03 +0200501 w.SentenceEnd(buffc)
Akron1c34ce62021-09-23 23:27:39 +0200502 }
Akron90aa45b2021-11-16 23:28:17 +0100503
504 // Transition consumes a character
505 } else {
506 buffc++
507
508 // Transition does not produce a character
509 if buffc-bufft == 1 && (t&FIRSTBIT) != 0 {
510 if DEBUG {
511 log.Println("Nontoken forward", showBufferNew(buffer, bufft, buffc, buffi))
512 }
513 bufft++
514 // rewindBuffer = true
515 }
Akron1c34ce62021-09-23 23:27:39 +0200516 }
517
Akron8cc2dd92021-10-25 19:49:41 +0200518 if eot {
519 eot = false
520 textEnd = true
521 w.TextEnd(buffc)
522 rewindBuffer = true
523 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100524 log.Println("END OF TEXT")
Akron8cc2dd92021-10-25 19:49:41 +0200525 }
526 }
527
Akron1c34ce62021-09-23 23:27:39 +0200528 // Rewind the buffer if necessary
529 if rewindBuffer {
530
Akron16c312e2021-09-26 13:11:12 +0200531 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100532 log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
Akron16c312e2021-09-26 13:11:12 +0200533 }
534
Akron22c565a2021-11-28 17:31:36 +0100535 // buffer = buffer[buffc:]
536 for x, i := range buffer[buffc:buffi] {
537 buffer[x] = i
538 }
Akron1c34ce62021-09-23 23:27:39 +0200539
Akron98fbfef2021-10-23 17:02:11 +0200540 buffi -= buffc
Akron16c312e2021-09-26 13:11:12 +0200541 // epsilonOffset -= buffo
542 epsilonOffset = 0
543 epsilonState = 0
544
Akron98fbfef2021-10-23 17:02:11 +0200545 buffc = 0
546 bufft = 0
Akrona854faa2021-10-22 19:31:08 +0200547
Akron98fbfef2021-10-23 17:02:11 +0200548 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100549 log.Println("Remaining:", showBufferNew(buffer, bufft, buffc, buffi))
Akrona854faa2021-10-22 19:31:08 +0200550 }
Akron1c34ce62021-09-23 23:27:39 +0200551 }
552
Akron16c312e2021-09-26 13:11:12 +0200553 t &= ^FIRSTBIT
Akron1c34ce62021-09-23 23:27:39 +0200554
555 newchar = true
556
557 // TODO:
558 // Prevent endless epsilon loops!
559 }
560
561 // Input reader is not yet finished
562 if !eof {
563 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100564 log.Println("Not at the end")
Akron1c34ce62021-09-23 23:27:39 +0200565 }
566 return false
567 }
568
569 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100570 log.Println("Entering final check")
Akron1c34ce62021-09-23 23:27:39 +0200571 }
Akron1c34ce62021-09-23 23:27:39 +0200572
Akrona854faa2021-10-22 19:31:08 +0200573 // Check epsilon transitions as long as possible
Akron1c34ce62021-09-23 23:27:39 +0200574 t0 = t
Akron1c34ce62021-09-23 23:27:39 +0200575 t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
576 a = mat.epsilon
577 newchar = false
Akron1c34ce62021-09-23 23:27:39 +0200578 // t can't be < 0
Akron16c312e2021-09-26 13:11:12 +0200579 if t != 0 {
Akron1c34ce62021-09-23 23:27:39 +0200580 // Remember state for backtracking to last tokenend state
581 goto PARSECHARM
582
583 } else if epsilonState != 0 {
584 t0 = epsilonState
585 epsilonState = 0 // reset
Akron98fbfef2021-10-23 17:02:11 +0200586 buffc = epsilonOffset
Akron1c34ce62021-09-23 23:27:39 +0200587 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100588 log.Println("Get from epsilon stack and set buffo!", showBufferNew(buffer, bufft, buffc, buffi))
Akron1c34ce62021-09-23 23:27:39 +0200589 }
590 goto PARSECHARM
591 }
Akron1c34ce62021-09-23 23:27:39 +0200592
Akron5c82a922021-09-24 19:11:29 +0200593 // Add an additional sentence ending, if the file is over but no explicit
594 // sentence split was reached. This may be controversial and therefore
595 // optional via parameter.
596 if !sentenceEnd {
Akron4f6b28c2021-10-25 00:52:03 +0200597 w.SentenceEnd(buffc)
Akron5c82a922021-09-24 19:11:29 +0200598 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100599 log.Println("Sentence end")
Akrona854faa2021-10-22 19:31:08 +0200600 }
601 }
602
603 if !textEnd {
Akron4f6b28c2021-10-25 00:52:03 +0200604 w.TextEnd(buffc)
Akrona854faa2021-10-22 19:31:08 +0200605 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100606 log.Println("Text end")
Akron5c82a922021-09-24 19:11:29 +0200607 }
608 }
609
610 return true
Akron1c34ce62021-09-23 23:27:39 +0200611}