blob: a213b33fed48950c5be119abd8fc1df3bde4ea9b [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
Akron03c92fe2021-08-09 14:07:57 +020013//
14// Serialization is little endian.
Akron83e75a22021-08-04 13:14:06 +020015
Akron740f3d72021-08-03 12:12:34 +020016// TODO:
17// - replace maxSize with the check value
Akron7b1faa62021-09-02 16:10:21 +020018// - Check if final states can be optional.
19// - Introduce ELM (Morita et al. 2001) to speed
20// up construction. Can be ignored on serialization
21// to improve compression.
Akron3a063ef2021-08-05 19:36:35 +020022// - Add checksum to serialization.
Akrone61380b2021-08-16 10:10:46 +020023// - Replace/Enhance table with a map
24// - Provide a bufio.Scanner compatible interface.
Akron8e1d69b2021-08-12 17:38:49 +020025// - Mark epsilon transitions in bytes
Akron740f3d72021-08-03 12:12:34 +020026
Akron8ef408b2021-08-02 22:11:04 +020027import (
28 "bufio"
29 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020030 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020031 "fmt"
32 "io"
33 "os"
Akronc9d84a62021-08-03 15:56:03 +020034 "sort"
Akron8ef408b2021-08-02 22:11:04 +020035 "strconv"
36 "strings"
37 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020038
Akron527c10c2021-08-13 01:45:18 +020039 "log"
Akron8ef408b2021-08-02 22:11:04 +020040)
41
42const (
Akron2a4b9292021-08-04 15:35:22 +020043 PROPS = 1
44 SIGMA = 2
45 STATES = 3
46 NONE = 4
Akron2a4b9292021-08-04 15:35:22 +020047 DEBUG = false
48 MAGIC = "DATOK"
49 VERSION = uint16(1)
Akron03c92fe2021-08-09 14:07:57 +020050 FIRSTBIT uint32 = 1 << 31
51 SECONDBIT uint32 = 1 << 30
52 RESTBIT uint32 = ^uint32(0) &^ (FIRSTBIT | SECONDBIT)
Akron8ef408b2021-08-02 22:11:04 +020053)
54
Akron03c92fe2021-08-09 14:07:57 +020055// Serialization is always little endian
Akron6247a5d2021-08-03 19:18:28 +020056var bo binary.ByteOrder = binary.LittleEndian
57
Akron8ef408b2021-08-02 22:11:04 +020058type mapping struct {
59 source int
Akron3fdfec62021-08-04 11:40:10 +020060 target uint32
Akron8ef408b2021-08-02 22:11:04 +020061}
62
63type edge struct {
Akron83e75a22021-08-04 13:14:06 +020064 inSym int
65 outSym int
66 end int
67 nontoken bool
68 tokenend bool
Akron8ef408b2021-08-02 22:11:04 +020069}
70
Akron03c92fe2021-08-09 14:07:57 +020071// Tokenizer is the intermediate representation
72// of the tokenizer.
Akron8ef408b2021-08-02 22:11:04 +020073type Tokenizer struct {
Akron740f3d72021-08-03 12:12:34 +020074 sigmaRev map[int]rune
Akron31f3c062021-08-27 10:15:13 +020075 sigmaMCS map[int]string
Akron740f3d72021-08-03 12:12:34 +020076 arcCount int
Akron740f3d72021-08-03 12:12:34 +020077 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020078 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020079
80 // Special symbols in sigma
81 epsilon int
82 unknown int
83 identity int
84 final int
Akron03c92fe2021-08-09 14:07:57 +020085 tokenend int
Akron8ef408b2021-08-02 22:11:04 +020086}
87
Akronf1a16502021-08-16 15:24:38 +020088type bc struct {
89 base uint32
90 check uint32
91}
92
Akron03c92fe2021-08-09 14:07:57 +020093// DaTokenizer represents a tokenizer implemented as a
94// Double Array FSA.
Akronf2120ca2021-08-03 16:26:41 +020095type DaTokenizer struct {
Akronea46e8a2021-08-17 00:36:31 +020096 sigma map[rune]int
97 sigmaASCII [256]int
Akron03a3c612021-08-04 11:51:27 +020098 maxSize int
Akron4fa28b32021-08-27 10:55:41 +020099 transCount int
Akronf1a16502021-08-16 15:24:38 +0200100 array []bc
Akronc17f1ca2021-08-03 19:47:27 +0200101
102 // Special symbols in sigma
103 epsilon int
104 unknown int
105 identity int
106 final int
Akron03c92fe2021-08-09 14:07:57 +0200107 tokenend int
Akronf2120ca2021-08-03 16:26:41 +0200108}
109
Akron03c92fe2021-08-09 14:07:57 +0200110// ParseFoma reads the FST from a foma file
111// and creates an internal representation,
112// in case it follows the tokenizer's convention.
Akron64ffd9a2021-08-03 19:55:21 +0200113func LoadFomaFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200114 f, err := os.Open(file)
115 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200116 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200117 return nil
Akron8ef408b2021-08-02 22:11:04 +0200118 }
119 defer f.Close()
120
121 gz, err := gzip.NewReader(f)
122 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200123 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200124 return nil
Akron8ef408b2021-08-02 22:11:04 +0200125 }
126 defer gz.Close()
127
Akron3fdfec62021-08-04 11:40:10 +0200128 return ParseFoma(gz)
Akron8ef408b2021-08-02 22:11:04 +0200129}
130
Akron03c92fe2021-08-09 14:07:57 +0200131// ParseFoma reads the FST from a foma file reader
132// and creates an internal representation,
133// in case it follows the tokenizer's convention.
Akron3fdfec62021-08-04 11:40:10 +0200134func ParseFoma(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200135 r := bufio.NewReader(ior)
136
137 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200138 sigmaRev: make(map[int]rune),
Akron31f3c062021-08-27 10:15:13 +0200139 sigmaMCS: make(map[int]string),
Akronc17f1ca2021-08-03 19:47:27 +0200140 epsilon: -1,
141 unknown: -1,
142 identity: -1,
143 final: -1,
Akron03c92fe2021-08-09 14:07:57 +0200144 tokenend: -1,
Akron8ef408b2021-08-02 22:11:04 +0200145 }
146
Akron740f3d72021-08-03 12:12:34 +0200147 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200148
149 mode := 0
150 var elem []string
151 var elemint [5]int
152
Akron03c92fe2021-08-09 14:07:57 +0200153 // Iterate over all lines of the file.
154 // This is mainly based on foma2js,
155 // licensed under the Apache License, version 2,
156 // and written by Mans Hulden.
Akron8ef408b2021-08-02 22:11:04 +0200157 for {
158 line, err := r.ReadString('\n')
159 if err != nil {
160 if err == io.EOF {
161 break
162 }
Akron527c10c2021-08-13 01:45:18 +0200163 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200164 return nil
Akron8ef408b2021-08-02 22:11:04 +0200165 }
Akron8ef408b2021-08-02 22:11:04 +0200166
Akron439f4ec2021-08-09 15:45:38 +0200167 // Read parser mode for the following lines
168 if strings.HasPrefix(line, "##") {
169 if strings.HasPrefix(line, "##props##") {
170 mode = PROPS
171
172 } else if strings.HasPrefix(line, "##states##") {
173 mode = STATES
174
175 // Adds a final transition symbol to sigma
176 // written as '#' in Mizobuchi et al (2000)
177 tok.sigmaCount++
178 tok.final = tok.sigmaCount
179
180 } else if strings.HasPrefix(line, "##sigma##") {
181
182 mode = SIGMA
183
184 } else if strings.HasPrefix(line, "##end##") {
185
186 mode = NONE
187
188 } else if !strings.HasPrefix(line, "##foma-net") {
Akron527c10c2021-08-13 01:45:18 +0200189 log.Print("Unknown input line")
Akron439f4ec2021-08-09 15:45:38 +0200190 break
191 }
Akron8ef408b2021-08-02 22:11:04 +0200192 continue
193 }
194
Akron439f4ec2021-08-09 15:45:38 +0200195 // Based on the current parser mode, interpret the lines
Akron8ef408b2021-08-02 22:11:04 +0200196 switch mode {
197 case PROPS:
198 {
199 elem = strings.Split(line, " ")
200 /*
201 fmt.Println("arity: " + elem[0])
202 fmt.Println("arccount: " + elem[1])
203 fmt.Println("statecount: " + elem[2])
204 fmt.Println("linecount: " + elem[3])
205 fmt.Println("finalcount: " + elem[4])
206 fmt.Println("pathcount: " + elem[5])
207 fmt.Println("is_deterministic: " + elem[6])
208 fmt.Println("is_pruned: " + elem[7])
209 fmt.Println("is_minimized: " + elem[8])
210 fmt.Println("is_epsilon_free: " + elem[9])
211 fmt.Println("is_loop_free: " + elem[10])
212 fmt.Println("extras: " + elem[11])
213 fmt.Println("name: " + elem[12])
214 */
215 if elem[6] != "1" {
Akron527c10c2021-08-13 01:45:18 +0200216 log.Print("The FST needs to be deterministic")
Akron4db3ecf2021-08-11 18:49:03 +0200217 return nil
Akron8ef408b2021-08-02 22:11:04 +0200218 }
Akron439f4ec2021-08-09 15:45:38 +0200219
Akron8ef408b2021-08-02 22:11:04 +0200220 if elem[9] != "1" {
Akron527c10c2021-08-13 01:45:18 +0200221 log.Print("The FST needs to be epsilon free")
Akron4db3ecf2021-08-11 18:49:03 +0200222 return nil
Akron8ef408b2021-08-02 22:11:04 +0200223 }
224
225 elemint[0], err = strconv.Atoi(elem[1])
226 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200227 log.Print("Can't read arccount")
Akron4db3ecf2021-08-11 18:49:03 +0200228 return nil
Akron8ef408b2021-08-02 22:11:04 +0200229 }
Akron740f3d72021-08-03 12:12:34 +0200230 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200231
Akron8ef408b2021-08-02 22:11:04 +0200232 elemint[0], err = strconv.Atoi(elem[2])
233 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200234 log.Print("Can't read statecount")
Akron4db3ecf2021-08-11 18:49:03 +0200235 return nil
Akron8ef408b2021-08-02 22:11:04 +0200236 }
Akron439f4ec2021-08-09 15:45:38 +0200237
238 // States start at 1 in Mizobuchi et al (2000),
239 // as the state 0 is associated with a fail.
240 // Initialize states and transitions
Akron8ef408b2021-08-02 22:11:04 +0200241 tok.transitions = make([]map[int]*edge, elemint[0]+1)
242 continue
243 }
244 case STATES:
245 {
246 elem = strings.Split(line[0:len(line)-1], " ")
247 if elem[0] == "-1" {
Akron3610f102021-08-08 14:13:25 +0200248 if DEBUG {
249 fmt.Println("Skip", elem)
250 }
Akron8ef408b2021-08-02 22:11:04 +0200251 continue
252 }
253 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200254 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200255 fmt.Println("Unable to translate", elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200256 break
257 }
Akron8ef408b2021-08-02 22:11:04 +0200258
259 if len(elem) > 1 {
260 elemint[1], err = strconv.Atoi(elem[1])
261 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200262 fmt.Println("Unable to translate", elem[1])
Akron8ef408b2021-08-02 22:11:04 +0200263 break
264 }
265 if len(elem) > 2 {
266 elemint[2], err = strconv.Atoi(elem[2])
267 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200268 fmt.Println("Unable to translate", elem[2])
Akron8ef408b2021-08-02 22:11:04 +0200269 break
270 }
271 if len(elem) > 3 {
272 elemint[3], err = strconv.Atoi(elem[3])
273 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200274 fmt.Println("Unable to translate", elem[3])
Akron8ef408b2021-08-02 22:11:04 +0200275 break
276 }
277 if len(elem) > 4 {
278 elemint[4], err = strconv.Atoi(elem[4])
279 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200280 fmt.Println("Unable to translate", elem[4])
Akron8ef408b2021-08-02 22:11:04 +0200281 break
282 }
283 }
284 }
285 }
286 }
287
288 switch len(elem) {
289 case 5:
290 {
Akron740f3d72021-08-03 12:12:34 +0200291 state = elemint[0]
292 inSym = elemint[1]
293 outSym = elemint[2]
294 end = elemint[3]
295 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200296 }
297 case 4:
298 {
299 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200300 state = elemint[0]
301 final = elemint[3]
Akron0630be52021-08-28 09:06:16 +0200302
303 // Final state that has no outgoing edges
304 if final == 1 {
305
306 // Initialize outgoing states
307 if tok.transitions[state+1] == nil {
308 tok.transitions[state+1] = make(map[int]*edge)
309 }
310
311 // TODO:
312 // Maybe this is less relevant for tokenizers
313 tok.transitions[state+1][tok.final] = &edge{}
314 }
315 continue
Akron8ef408b2021-08-02 22:11:04 +0200316 } else {
Akron740f3d72021-08-03 12:12:34 +0200317 state = elemint[0]
318 inSym = elemint[1]
319 end = elemint[2]
320 final = elemint[3]
321 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200322 }
323 }
324 case 3:
325 {
Akron740f3d72021-08-03 12:12:34 +0200326 inSym = elemint[0]
327 outSym = elemint[1]
328 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200329 }
330 case 2:
331 {
Akron740f3d72021-08-03 12:12:34 +0200332 inSym = elemint[0]
333 end = elemint[1]
334 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200335 }
336 }
337
Akron83e75a22021-08-04 13:14:06 +0200338 nontoken := false
339 tokenend := false
340
Akron439f4ec2021-08-09 15:45:38 +0200341 // While the states in foma start with 0, the states in the
342 // Mizobuchi FSA start with one - so we increase every state by 1.
343 // We also increase sigma by 1, so there are no 0 transitions.
Akron524c5432021-08-05 14:14:27 +0200344 inSym++
345 outSym++
346
Akron439f4ec2021-08-09 15:45:38 +0200347 // Only a limited list of transitions are allowed
Akron740f3d72021-08-03 12:12:34 +0200348 if inSym != outSym {
Akron01912fc2021-08-12 11:41:58 +0200349 if outSym == tok.tokenend && inSym == tok.epsilon {
Akron83e75a22021-08-04 13:14:06 +0200350 tokenend = true
351 } else if outSym == tok.epsilon {
352 nontoken = true
353 } else {
Akron527c10c2021-08-13 01:45:18 +0200354 log.Println(
Akron740f3d72021-08-03 12:12:34 +0200355 "Unsupported transition: " +
356 strconv.Itoa(state) +
357 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200358 " (" +
Akron740f3d72021-08-03 12:12:34 +0200359 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200360 ":" +
Akron740f3d72021-08-03 12:12:34 +0200361 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200362 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200363 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200364 ":" +
Akron740f3d72021-08-03 12:12:34 +0200365 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200366 ")")
Akron4db3ecf2021-08-11 18:49:03 +0200367 return nil
Akron75ebe7f2021-08-03 10:34:10 +0200368 }
Akron92704eb2021-08-27 10:59:46 +0200369 } else if inSym == tok.tokenend {
370 // Ignore tokenend accepting arcs
371 continue
Akron83e75a22021-08-04 13:14:06 +0200372 } else if inSym == tok.epsilon {
Akron527c10c2021-08-13 01:45:18 +0200373 log.Println("General epsilon transitions are not supported")
Akron4db3ecf2021-08-11 18:49:03 +0200374 return nil
Akron31f3c062021-08-27 10:15:13 +0200375 } else if tok.sigmaMCS[inSym] != "" {
Akron34dbe972021-08-29 17:44:34 +0200376 // log.Fatalln("Non supported character", tok.sigmaMCS[inSym])
377 // Ignore MCS transitions
378 continue
Akron8ef408b2021-08-02 22:11:04 +0200379 }
380
Akron03c92fe2021-08-09 14:07:57 +0200381 // Create an edge based on the collected information
Akron8ef408b2021-08-02 22:11:04 +0200382 targetObj := &edge{
Akron83e75a22021-08-04 13:14:06 +0200383 inSym: inSym,
384 outSym: outSym,
385 end: end + 1,
386 tokenend: tokenend,
387 nontoken: nontoken,
Akron8ef408b2021-08-02 22:11:04 +0200388 }
389
Akron740f3d72021-08-03 12:12:34 +0200390 // Initialize outgoing states
391 if tok.transitions[state+1] == nil {
392 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200393 }
394
Akron740f3d72021-08-03 12:12:34 +0200395 // Ignore transitions with invalid symbols
396 if inSym >= 0 {
397 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200398 }
Akron8ef408b2021-08-02 22:11:04 +0200399
Akron740f3d72021-08-03 12:12:34 +0200400 // Add final transition
401 if final == 1 {
Akron03c92fe2021-08-09 14:07:57 +0200402 // TODO:
403 // Maybe this is less relevant for tokenizers
Akronc17f1ca2021-08-03 19:47:27 +0200404 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200405 }
406
Akronb4bbb472021-08-09 11:49:38 +0200407 if DEBUG {
Akron740f3d72021-08-03 12:12:34 +0200408 fmt.Println("Add",
409 state+1, "->", end+1,
410 "(",
411 inSym,
412 ":",
413 outSym,
414 ") (",
415 string(tok.sigmaRev[inSym]),
416 ":",
417 string(tok.sigmaRev[outSym]),
Akron524c5432021-08-05 14:14:27 +0200418 ")",
419 ";",
420 "TE:", tokenend,
Akron3610f102021-08-08 14:13:25 +0200421 "NT:", nontoken,
422 "FIN:", final)
Akron740f3d72021-08-03 12:12:34 +0200423 }
Akron75ebe7f2021-08-03 10:34:10 +0200424
Akron8ef408b2021-08-02 22:11:04 +0200425 continue
426 }
427 case SIGMA:
428 {
429 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
430
431 // Turn string into sigma id
432 number, err := strconv.Atoi(elem[0])
433
Akron524c5432021-08-05 14:14:27 +0200434 // ID needs to be > 1
435 number++
436
Akron8ef408b2021-08-02 22:11:04 +0200437 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200438 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200439 return nil
Akron8ef408b2021-08-02 22:11:04 +0200440 }
441
Akron740f3d72021-08-03 12:12:34 +0200442 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200443
444 var symbol rune
445
446 // Read rune
447 if utf8.RuneCountInString(elem[1]) == 1 {
448 symbol = []rune(elem[1])[0]
449
Akron8ef408b2021-08-02 22:11:04 +0200450 } else if utf8.RuneCountInString(elem[1]) > 1 {
Akron439f4ec2021-08-09 15:45:38 +0200451
452 // Probably a MCS
Akron8ef408b2021-08-02 22:11:04 +0200453 switch elem[1] {
454 case "@_EPSILON_SYMBOL_@":
455 {
Akronc17f1ca2021-08-03 19:47:27 +0200456 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200457 }
458 case "@_UNKNOWN_SYMBOL_@":
459 {
Akronc17f1ca2021-08-03 19:47:27 +0200460 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200461 }
462
463 case "@_IDENTITY_SYMBOL_@":
464 {
Akronc17f1ca2021-08-03 19:47:27 +0200465 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200466 }
Akron03c92fe2021-08-09 14:07:57 +0200467
468 case "@_TOKEN_SYMBOL_@":
469 {
470 tok.tokenend = number
Akron03c92fe2021-08-09 14:07:57 +0200471 }
Akron8ef408b2021-08-02 22:11:04 +0200472 default:
Akron740f3d72021-08-03 12:12:34 +0200473 {
Akron31f3c062021-08-27 10:15:13 +0200474 // MCS not supported
475 tok.sigmaMCS[number] = line
Akron740f3d72021-08-03 12:12:34 +0200476 }
Akron8ef408b2021-08-02 22:11:04 +0200477 }
Akron439f4ec2021-08-09 15:45:38 +0200478 continue
Akron8ef408b2021-08-02 22:11:04 +0200479
Akron740f3d72021-08-03 12:12:34 +0200480 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200481 line, err = r.ReadString('\n')
482 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200483 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200484 return nil
Akron8ef408b2021-08-02 22:11:04 +0200485 }
486 if len(line) != 1 {
Akron31f3c062021-08-27 10:15:13 +0200487 // MCS not supported
488 tok.sigmaMCS[number] = line
489 continue
Akron8ef408b2021-08-02 22:11:04 +0200490 }
Akron03c92fe2021-08-09 14:07:57 +0200491 symbol = rune('\n')
Akron8ef408b2021-08-02 22:11:04 +0200492 }
493
Akron740f3d72021-08-03 12:12:34 +0200494 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200495 }
496 }
497 }
Akron31f3c062021-08-27 10:15:13 +0200498 tok.sigmaMCS = nil
Akron8ef408b2021-08-02 22:11:04 +0200499 return tok
500}
501
Akron64ffd9a2021-08-03 19:55:21 +0200502// Set alphabet A to the list of all symbols
503// outgoing from s
Akron439f4ec2021-08-09 15:45:38 +0200504func (tok *Tokenizer) getSet(s int, A *[]int) {
Akron64ffd9a2021-08-03 19:55:21 +0200505 for a := range tok.transitions[s] {
506 *A = append(*A, a)
507 }
508
509 // Not required, but simplifies bug hunting
Akron439f4ec2021-08-09 15:45:38 +0200510 // sort.Ints(*A)
Akron64ffd9a2021-08-03 19:55:21 +0200511}
512
Akron439f4ec2021-08-09 15:45:38 +0200513// ToDoubleArray turns the intermediate tokenizer representation
514// into a double array representation.
515//
516// This is based on Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200517func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
518
519 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +0200520 sigma: make(map[rune]int),
Akron4fa28b32021-08-27 10:55:41 +0200521 transCount: -1,
Akron03a3c612021-08-04 11:51:27 +0200522 final: tok.final,
523 unknown: tok.unknown,
524 identity: tok.identity,
525 epsilon: tok.epsilon,
Akron03c92fe2021-08-09 14:07:57 +0200526 tokenend: tok.tokenend,
Akronf2120ca2021-08-03 16:26:41 +0200527 }
528
Akronf1a16502021-08-16 15:24:38 +0200529 dat.resize(dat.final)
530
Akronf2120ca2021-08-03 16:26:41 +0200531 for num, sym := range tok.sigmaRev {
Akronea46e8a2021-08-17 00:36:31 +0200532 if int(sym) < 256 {
533 dat.sigmaASCII[int(sym)] = num
534 }
Akronf2120ca2021-08-03 16:26:41 +0200535 dat.sigma[sym] = num
536 }
Akron8ef408b2021-08-02 22:11:04 +0200537
538 mark := 0
539 size := 0
Akron6f1c16c2021-08-17 10:45:42 +0200540 var base uint32
Akronde18e902021-08-27 09:34:12 +0200541 var atrans *edge
542 var s, s1 int
543 var t, t1 uint32
Akron8ef408b2021-08-02 22:11:04 +0200544
Akron439f4ec2021-08-09 15:45:38 +0200545 // Create a mapping from s (in Ms aka Intermediate FSA)
546 // to t (in Mt aka Double Array FSA)
Akron740f3d72021-08-03 12:12:34 +0200547 table := make([]*mapping, tok.arcCount+1)
Akron7b1faa62021-09-02 16:10:21 +0200548 // tableQueue := make([]int, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200549
Akron439f4ec2021-08-09 15:45:38 +0200550 // Initialize with the start state
Akron8ef408b2021-08-02 22:11:04 +0200551 table[size] = &mapping{source: 1, target: 1}
Akron7b1faa62021-09-02 16:10:21 +0200552 // tableQueue[size] = 1
Akron8ef408b2021-08-02 22:11:04 +0200553 size++
554
Akron740f3d72021-08-03 12:12:34 +0200555 // Allocate space for the outgoing symbol range
556 A := make([]int, 0, tok.sigmaCount)
Akron7b1faa62021-09-02 16:10:21 +0200557 // tableLookup := make([]uint32, tok.arcCount+2) // make(map[int]uint32)
558 // tableLookup[1] = 1
Akron8ef408b2021-08-02 22:11:04 +0200559
Akron7b1faa62021-09-02 16:10:21 +0200560 // block_begin_pos := uint32(1)
Akronde18e902021-08-27 09:34:12 +0200561
Akron8ef408b2021-08-02 22:11:04 +0200562 for mark < size {
Akronde18e902021-08-27 09:34:12 +0200563 s = table[mark].source // This is a state in Ms
564 t = table[mark].target // This is a state in Mt
Akron7b1faa62021-09-02 16:10:21 +0200565 // s = tableQueue[mark]
566 // t = tableLookup[s]
Akron8ef408b2021-08-02 22:11:04 +0200567 mark++
Akron740f3d72021-08-03 12:12:34 +0200568
569 // Following the paper, here the state t can be remembered
570 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200571 A = A[:0]
Akron439f4ec2021-08-09 15:45:38 +0200572 tok.getSet(s, &A)
Akron8ef408b2021-08-02 22:11:04 +0200573
Akron740f3d72021-08-03 12:12:34 +0200574 // Set base to the first free slot in the double array
Akron6f1c16c2021-08-17 10:45:42 +0200575 base = dat.xCheck(A)
Akron7b1faa62021-09-02 16:10:21 +0200576 // base = dat.xCheckNiu(A, &block_begin_pos)
Akron6f1c16c2021-08-17 10:45:42 +0200577 dat.array[t].setBase(base)
Akron8ef408b2021-08-02 22:11:04 +0200578
Akron773b1ef2021-08-03 17:37:20 +0200579 // TODO:
Akron068874c2021-08-04 15:19:56 +0200580 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200581 // outdegree of .end
582
Akron740f3d72021-08-03 12:12:34 +0200583 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200584 for _, a := range A {
585
Akronc17f1ca2021-08-03 19:47:27 +0200586 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200587
Akronde18e902021-08-27 09:34:12 +0200588 atrans = tok.transitions[s][a]
589
Akron740f3d72021-08-03 12:12:34 +0200590 // Aka g(s, a)
Akronde18e902021-08-27 09:34:12 +0200591 s1 = atrans.end
Akron8ef408b2021-08-02 22:11:04 +0200592
Akron740f3d72021-08-03 12:12:34 +0200593 // Store the transition
Akronde18e902021-08-27 09:34:12 +0200594 t1 = base + uint32(a)
Akronf1a16502021-08-16 15:24:38 +0200595 dat.array[t1].setCheck(t)
596
597 // Set maxSize
598 if dat.maxSize < int(t1) {
599 dat.maxSize = int(t1)
600 }
Akron8ef408b2021-08-02 22:11:04 +0200601
Akron439f4ec2021-08-09 15:45:38 +0200602 if DEBUG {
Akron524c5432021-08-05 14:14:27 +0200603 fmt.Println("Translate transition",
604 s, "->", s1, "(", a, ")", "to", t, "->", t1)
605 }
606
Akron83e75a22021-08-04 13:14:06 +0200607 // Mark the state as being the target of a nontoken transition
Akronde18e902021-08-27 09:34:12 +0200608 if atrans.nontoken {
Akronf1a16502021-08-16 15:24:38 +0200609 dat.array[t1].setNonToken(true)
Akron524c5432021-08-05 14:14:27 +0200610 if DEBUG {
611 fmt.Println("Set", t1, "to nontoken")
612 }
Akron83e75a22021-08-04 13:14:06 +0200613 }
614
Akron84d68e62021-08-04 17:06:52 +0200615 // Mark the state as being the target of a tokenend transition
Akronde18e902021-08-27 09:34:12 +0200616 if atrans.tokenend {
Akronf1a16502021-08-16 15:24:38 +0200617 dat.array[t1].setTokenEnd(true)
Akron524c5432021-08-05 14:14:27 +0200618 if DEBUG {
619 fmt.Println("Set", t1, "to tokenend")
620 }
Akron84d68e62021-08-04 17:06:52 +0200621 }
622
Akron740f3d72021-08-03 12:12:34 +0200623 // Check for representative states
Akron439f4ec2021-08-09 15:45:38 +0200624 r := stateAlreadyInTable(s1, table, size)
Akronde18e902021-08-27 09:34:12 +0200625 // r := tableLookup[s1]
Akron740f3d72021-08-03 12:12:34 +0200626
Akron439f4ec2021-08-09 15:45:38 +0200627 // No representative found
Akron8ef408b2021-08-02 22:11:04 +0200628 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200629 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200630 table[size] = &mapping{source: s1, target: t1}
Akron7b1faa62021-09-02 16:10:21 +0200631 // tableQueue[size] = s1
Akronde18e902021-08-27 09:34:12 +0200632 // tableLookup[s1] = t1
Akron8ef408b2021-08-02 22:11:04 +0200633 size++
634 } else {
Akron740f3d72021-08-03 12:12:34 +0200635 // Overwrite with the representative state
Akronf1a16502021-08-16 15:24:38 +0200636 dat.array[t1].setBase(r)
637 dat.array[t1].setSeparate(true)
Akron8ef408b2021-08-02 22:11:04 +0200638 }
639 } else {
Akron740f3d72021-08-03 12:12:34 +0200640 // Store a final transition
Akron6f1c16c2021-08-17 10:45:42 +0200641 dat.array[base+uint32(dat.final)].setCheck(t)
Akronf1a16502021-08-16 15:24:38 +0200642
Akronde18e902021-08-27 09:34:12 +0200643 if dat.maxSize < int(base)+dat.final {
644 dat.maxSize = int(base) + dat.final
Akronf1a16502021-08-16 15:24:38 +0200645 }
Akron8ef408b2021-08-02 22:11:04 +0200646 }
647 }
648 }
649
650 // Following Mizobuchi et al (2000) the size of the
651 // FSA should be stored in check(1).
Akronf1a16502021-08-16 15:24:38 +0200652 // We make the size a bit smaller so we never have to check for boundaries.
Akron3fdfec62021-08-04 11:40:10 +0200653 dat.setSize(dat.maxSize + 1)
Akronf2120ca2021-08-03 16:26:41 +0200654 dat.array = dat.array[:dat.maxSize+1]
655 return dat
Akron8ef408b2021-08-02 22:11:04 +0200656}
657
Akron8ef408b2021-08-02 22:11:04 +0200658// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200659// exists and return this as a representative.
660// Currently iterates through the whole table
661// in a bruteforce manner.
Akron439f4ec2021-08-09 15:45:38 +0200662func stateAlreadyInTable(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200663 for x := 0; x < size; x++ {
664 if table[x].source == s {
665 return table[x].target
666 }
667 }
668 return 0
669}
670
Akron64ffd9a2021-08-03 19:55:21 +0200671// Resize double array when necessary
672func (dat *DaTokenizer) resize(l int) {
673 // TODO:
674 // This is a bit too aggressive atm and should be calmed down.
675 if len(dat.array) <= l {
Akronf1a16502021-08-16 15:24:38 +0200676 dat.array = append(dat.array, make([]bc, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200677 }
Akron64ffd9a2021-08-03 19:55:21 +0200678}
Akronc9d84a62021-08-03 15:56:03 +0200679
Akron64ffd9a2021-08-03 19:55:21 +0200680// Set base value in double array
Akronf1a16502021-08-16 15:24:38 +0200681func (bc *bc) setBase(v uint32) {
682 bc.base = v
Akron439f4ec2021-08-09 15:45:38 +0200683}
684
685// Get base value in double array
Akronf1a16502021-08-16 15:24:38 +0200686func (bc *bc) getBase() uint32 {
687 return bc.base & RESTBIT
Akron439f4ec2021-08-09 15:45:38 +0200688}
689
690// Set check value in double array
Akronf1a16502021-08-16 15:24:38 +0200691func (bc *bc) setCheck(v uint32) {
692 bc.check = v
Akron439f4ec2021-08-09 15:45:38 +0200693}
694
695// Get check value in double array
Akronf1a16502021-08-16 15:24:38 +0200696func (bc *bc) getCheck() uint32 {
697 return bc.check & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200698}
699
Akron3fdfec62021-08-04 11:40:10 +0200700// Returns true if a state is separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200701func (bc *bc) isSeparate() bool {
702 return bc.base&FIRSTBIT != 0
Akron3fdfec62021-08-04 11:40:10 +0200703}
704
705// Mark a state as separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200706func (bc *bc) setSeparate(sep bool) {
Akron3fdfec62021-08-04 11:40:10 +0200707 if sep {
Akronf1a16502021-08-16 15:24:38 +0200708 bc.base |= FIRSTBIT
Akron3fdfec62021-08-04 11:40:10 +0200709 } else {
Akronf1a16502021-08-16 15:24:38 +0200710 bc.base &= (RESTBIT | SECONDBIT)
Akron3fdfec62021-08-04 11:40:10 +0200711 }
712}
713
Akron83e75a22021-08-04 13:14:06 +0200714// Returns true if a state is the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200715func (bc *bc) isNonToken() bool {
716 return bc.check&FIRSTBIT != 0
Akron83e75a22021-08-04 13:14:06 +0200717}
718
719// Mark a state as being the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200720func (bc *bc) setNonToken(sep bool) {
Akron83e75a22021-08-04 13:14:06 +0200721 if sep {
Akronf1a16502021-08-16 15:24:38 +0200722 bc.check |= FIRSTBIT
Akron83e75a22021-08-04 13:14:06 +0200723 } else {
Akronf1a16502021-08-16 15:24:38 +0200724 bc.check &= (RESTBIT | SECONDBIT)
Akron83e75a22021-08-04 13:14:06 +0200725 }
726}
727
Akron84d68e62021-08-04 17:06:52 +0200728// Returns true if a state is the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200729func (bc *bc) isTokenEnd() bool {
730 return bc.check&SECONDBIT != 0
Akron84d68e62021-08-04 17:06:52 +0200731}
732
733// Mark a state as being the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200734func (bc *bc) setTokenEnd(sep bool) {
Akron84d68e62021-08-04 17:06:52 +0200735 if sep {
Akronf1a16502021-08-16 15:24:38 +0200736 bc.check |= SECONDBIT
Akron84d68e62021-08-04 17:06:52 +0200737 } else {
Akronf1a16502021-08-16 15:24:38 +0200738 bc.check &= (RESTBIT | FIRSTBIT)
Akron84d68e62021-08-04 17:06:52 +0200739 }
740}
741
Akron64ffd9a2021-08-03 19:55:21 +0200742// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200743func (dat *DaTokenizer) setSize(v int) {
Akronf1a16502021-08-16 15:24:38 +0200744 dat.array[1].setCheck(uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200745}
746
747// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200748func (dat *DaTokenizer) GetSize() int {
Akronf1a16502021-08-16 15:24:38 +0200749 return int(dat.array[1].getCheck())
Akron8ef408b2021-08-02 22:11:04 +0200750}
751
752// Based on Mizobuchi et al (2000), p. 124
753// This iterates for every state through the complete double array
754// structure until it finds a gap that fits all outgoing transitions
755// of the state. This is extremely slow, but is only necessary in the
756// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200757func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200758
759 // Start at the first entry of the double array list
Akron6f1c16c2021-08-17 10:45:42 +0200760 base := uint32(1)
761
Akron8ef408b2021-08-02 22:11:04 +0200762OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200763 // Resize the array if necessary
Akronf1a16502021-08-16 15:24:38 +0200764 dat.resize(int(base) + dat.final)
Akron8ef408b2021-08-02 22:11:04 +0200765 for _, a := range symbols {
Akronf1a16502021-08-16 15:24:38 +0200766 if dat.array[int(base)+a].getCheck() != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200767 base++
768 goto OVERLAP
769 }
770 }
Akron8ef408b2021-08-02 22:11:04 +0200771 return base
772}
773
Akron7b1faa62021-09-02 16:10:21 +0200774// This is an implementation of xCheck wit an improvement
775// proposed by Niu et al. (2013)
776func (dat *DaTokenizer) xCheckNiu(symbols []int, block_begin_pos *uint32) uint32 {
777
778 // Start at the first entry of the double array list
779 base := uint32(1)
780
781 if len(symbols) > 3 {
782 sort.Ints(symbols)
783 if *block_begin_pos > uint32(symbols[0]) {
784 dat.resize(int(*block_begin_pos) + dat.final)
785 *block_begin_pos += uint32(symbols[len(symbols)-1] + 1)
786 return *block_begin_pos - uint32(symbols[0])
787 }
788 }
789
790OVERLAP:
791 // Resize the array if necessary
792 dat.resize(int(base) + dat.final)
793 for _, a := range symbols {
794 if dat.array[int(base)+a].getCheck() != 0 {
795 base++
796 goto OVERLAP
797 }
798 }
799 return base
800}
801
Akron3610f102021-08-08 14:13:25 +0200802// List all outgoing transitions for a state
803// for testing purposes
804func (dat *DaTokenizer) outgoing(t uint32) []int {
805
806 valid := make([]int, 0, len(dat.sigma))
807
808 for _, a := range dat.sigma {
Akronf1a16502021-08-16 15:24:38 +0200809 t1 := dat.array[t].getBase() + uint32(a)
810 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200811 valid = append(valid, a)
812 }
813 }
814
815 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
Akronf1a16502021-08-16 15:24:38 +0200816 t1 := dat.array[t].getBase() + uint32(a)
817 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200818 valid = append(valid, -1*a)
819 }
820 }
821
822 sort.Ints(valid)
823
824 return valid
825}
826
Akron4fa28b32021-08-27 10:55:41 +0200827// TransCount as the number of transitions aka arcs in the
828// finite state automaton
829func (dat *DaTokenizer) TransCount() int {
830 // Cache the transCount
831 if dat.transCount > 0 {
832 return dat.transCount
833 }
834
835 dat.transCount = 0
836 for x := 1; x < len(dat.array); x++ {
837 if dat.array[x].getBase() != 0 {
838 dat.transCount++
839 }
840 }
841
842 return dat.transCount
843}
844
Akron03a3c612021-08-04 11:51:27 +0200845// LoadFactor as defined in Kanda et al (2018),
846// i.e. the proportion of non-empty elements to all elements.
847func (dat *DaTokenizer) LoadFactor() float64 {
Akron4fa28b32021-08-27 10:55:41 +0200848 return float64(dat.TransCount()) / float64(len(dat.array)) * 100
Akrond66a9262021-08-03 17:09:09 +0200849}
850
Akron439f4ec2021-08-09 15:45:38 +0200851// Save stores the double array data in a file
Akron3a063ef2021-08-05 19:36:35 +0200852func (dat *DaTokenizer) Save(file string) (n int64, err error) {
853 f, err := os.Create(file)
854 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200855 log.Println(err)
Akron439f4ec2021-08-09 15:45:38 +0200856 return 0, err
Akron3a063ef2021-08-05 19:36:35 +0200857 }
858 defer f.Close()
859 gz := gzip.NewWriter(f)
860 defer gz.Close()
861 n, err = dat.WriteTo(gz)
862 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200863 log.Println(err)
Akron3a063ef2021-08-05 19:36:35 +0200864 return n, err
865 }
866 gz.Flush()
867 return n, nil
868}
869
870// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200871func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
872
Akron3a063ef2021-08-05 19:36:35 +0200873 wb := bufio.NewWriter(w)
874 defer wb.Flush()
875
Akron6247a5d2021-08-03 19:18:28 +0200876 // Store magical header
Akron3a063ef2021-08-05 19:36:35 +0200877 all, err := wb.Write([]byte(MAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200878 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200879 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200880 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200881 }
882
883 // Get sigma as a list
884 sigmalist := make([]rune, len(dat.sigma)+16)
885 max := 0
886 for sym, num := range dat.sigma {
887 sigmalist[num] = sym
888 if num > max {
889 max = num
890 }
891 }
892
893 sigmalist = sigmalist[:max+1]
894
Akron3f8571a2021-08-05 11:18:10 +0200895 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200896 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200897 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
898 bo.PutUint16(buf[4:6], uint16(dat.unknown))
899 bo.PutUint16(buf[6:8], uint16(dat.identity))
900 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200901 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akronf1a16502021-08-16 15:24:38 +0200902 bo.PutUint32(buf[12:16], uint32(len(dat.array)*2)) // Legacy support
Akron3a063ef2021-08-05 19:36:35 +0200903 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200904 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200905 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200906 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200907 }
908
909 all += more
910
Akron6247a5d2021-08-03 19:18:28 +0200911 // Write sigma
912 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200913
Akron3a063ef2021-08-05 19:36:35 +0200914 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200915 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200916 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200917 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200918 }
919 all += more
920 }
Akron439f4ec2021-08-09 15:45:38 +0200921
Akron6247a5d2021-08-03 19:18:28 +0200922 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200923 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200924 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200925 }
Akron6247a5d2021-08-03 19:18:28 +0200926
927 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200928 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200929 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200930 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200931 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200932 }
933 all += more
934
Akronf1a16502021-08-16 15:24:38 +0200935 // for x := 0; x < len(dat.array); x++ {
936 for _, bc := range dat.array {
937 bo.PutUint32(buf[0:4], bc.base)
938 more, err = wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200939 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200940 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200941 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200942 }
Akron439f4ec2021-08-09 15:45:38 +0200943 all += more
Akron3a063ef2021-08-05 19:36:35 +0200944 if more != 4 {
Akronf1a16502021-08-16 15:24:38 +0200945 log.Println("Can not write base uint32")
946 return int64(all), err
947 }
948 bo.PutUint32(buf[0:4], bc.check)
949 more, err = wb.Write(buf[0:4])
950 if err != nil {
951 log.Println(err)
952 return int64(all), err
953 }
954 all += more
955 if more != 4 {
956 log.Println("Can not write check uint32")
Akron3a063ef2021-08-05 19:36:35 +0200957 return int64(all), err
958 }
Akron6247a5d2021-08-03 19:18:28 +0200959 }
960
961 return int64(all), err
962}
963
Akron439f4ec2021-08-09 15:45:38 +0200964// LoadDatokFile reads a double array represented tokenizer
965// from a file.
Akron3f8571a2021-08-05 11:18:10 +0200966func LoadDatokFile(file string) *DaTokenizer {
967 f, err := os.Open(file)
968 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200969 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200970 return nil
Akron3f8571a2021-08-05 11:18:10 +0200971 }
972 defer f.Close()
973
974 gz, err := gzip.NewReader(f)
975 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200976 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200977 return nil
Akron3f8571a2021-08-05 11:18:10 +0200978 }
979 defer gz.Close()
980
Akron3a063ef2021-08-05 19:36:35 +0200981 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +0200982 return ParseDatok(gz)
983}
984
Akron439f4ec2021-08-09 15:45:38 +0200985// LoadDatokFile reads a double array represented tokenizer
986// from an io.Reader
Akron3f8571a2021-08-05 11:18:10 +0200987func ParseDatok(ior io.Reader) *DaTokenizer {
988
Akron439f4ec2021-08-09 15:45:38 +0200989 // Initialize tokenizer with default values
Akron3f8571a2021-08-05 11:18:10 +0200990 dat := &DaTokenizer{
991 sigma: make(map[rune]int),
992 epsilon: 0,
993 unknown: 0,
994 identity: 0,
995 final: 0,
Akron4fa28b32021-08-27 10:55:41 +0200996 transCount: 0,
Akron3f8571a2021-08-05 11:18:10 +0200997 }
998
999 r := bufio.NewReader(ior)
1000
Akron3f8571a2021-08-05 11:18:10 +02001001 buf := make([]byte, 1024)
1002 buf = buf[0:len(MAGIC)]
1003
Akron439f4ec2021-08-09 15:45:38 +02001004 _, err := r.Read(buf)
Akron3f8571a2021-08-05 11:18:10 +02001005
1006 if err != nil {
Akron527c10c2021-08-13 01:45:18 +02001007 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +02001008 return nil
1009 }
1010
Akron3f8571a2021-08-05 11:18:10 +02001011 if string(MAGIC) != string(buf) {
Akron527c10c2021-08-13 01:45:18 +02001012 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +02001013 return nil
1014 }
1015
Akron439f4ec2021-08-09 15:45:38 +02001016 more, err := io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +02001017 if err != nil {
Akron527c10c2021-08-13 01:45:18 +02001018 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +02001019 return nil
1020 }
1021
Akron439f4ec2021-08-09 15:45:38 +02001022 if more != 16 {
Akron527c10c2021-08-13 01:45:18 +02001023 log.Println("Read bytes do not fit")
Akron439f4ec2021-08-09 15:45:38 +02001024 return nil
1025 }
Akron3f8571a2021-08-05 11:18:10 +02001026
Akron3a063ef2021-08-05 19:36:35 +02001027 version := bo.Uint16(buf[0:2])
1028
1029 if version != VERSION {
Akron527c10c2021-08-13 01:45:18 +02001030 log.Println("Version not compatible")
Akron3a063ef2021-08-05 19:36:35 +02001031 return nil
1032 }
1033
Akron3f8571a2021-08-05 11:18:10 +02001034 dat.epsilon = int(bo.Uint16(buf[2:4]))
1035 dat.unknown = int(bo.Uint16(buf[4:6]))
1036 dat.identity = int(bo.Uint16(buf[6:8]))
1037 dat.final = int(bo.Uint16(buf[8:10]))
1038
1039 sigmaCount := int(bo.Uint16(buf[10:12]))
Akronf1a16502021-08-16 15:24:38 +02001040 arraySize := int(bo.Uint32(buf[12:16])) / 2 // Legacy support
Akron3f8571a2021-08-05 11:18:10 +02001041
Akron3a063ef2021-08-05 19:36:35 +02001042 // Shouldn't be relevant though
1043 dat.maxSize = arraySize - 1
1044
Akron3f8571a2021-08-05 11:18:10 +02001045 for x := 0; x < sigmaCount; x++ {
Akron439f4ec2021-08-09 15:45:38 +02001046 sym, _, err := r.ReadRune()
Akron3f8571a2021-08-05 11:18:10 +02001047 if err == nil && sym != 0 {
Akronea46e8a2021-08-17 00:36:31 +02001048 if int(sym) < 256 {
1049 dat.sigmaASCII[int(sym)] = x
1050 }
Akron3f8571a2021-08-05 11:18:10 +02001051 dat.sigma[sym] = x
1052 }
Akron3f8571a2021-08-05 11:18:10 +02001053 }
1054
Akron439f4ec2021-08-09 15:45:38 +02001055 _, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +02001056
1057 if err != nil {
Akron527c10c2021-08-13 01:45:18 +02001058 log.Print(err)
Akron3f8571a2021-08-05 11:18:10 +02001059 return nil
1060 }
1061
Akron3f8571a2021-08-05 11:18:10 +02001062 if string("T") != string(buf[0:1]) {
Akron527c10c2021-08-13 01:45:18 +02001063 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +02001064 return nil
1065 }
1066
1067 // Read based on length
Akronf1a16502021-08-16 15:24:38 +02001068 dat.array = make([]bc, arraySize)
Akron3f8571a2021-08-05 11:18:10 +02001069
Akronbb4aac52021-08-13 00:52:27 +02001070 dataArray, err := io.ReadAll(r)
Akron439f4ec2021-08-09 15:45:38 +02001071
Akronbb4aac52021-08-13 00:52:27 +02001072 if err == io.EOF {
Akron527c10c2021-08-13 01:45:18 +02001073 log.Println(err)
Akronbb4aac52021-08-13 00:52:27 +02001074 return nil
1075 }
1076
Akronf1a16502021-08-16 15:24:38 +02001077 if len(dataArray) < arraySize*8 {
Akron527c10c2021-08-13 01:45:18 +02001078 log.Println("Not enough bytes read")
Akronbb4aac52021-08-13 00:52:27 +02001079 return nil
1080 }
1081
1082 for x := 0; x < arraySize; x++ {
Akronf1a16502021-08-16 15:24:38 +02001083 dat.array[x].base = bo.Uint32(dataArray[x*8 : (x*8)+4])
1084 dat.array[x].check = bo.Uint32(dataArray[(x*8)+4 : (x*8)+8])
Akron3f8571a2021-08-05 11:18:10 +02001085 }
1086
1087 return dat
1088}
1089
Akron439f4ec2021-08-09 15:45:38 +02001090// Show the current state of the buffer,
1091// for testing puroses
Akron3610f102021-08-08 14:13:25 +02001092func showBuffer(buffer []rune, buffo int, buffi int) string {
1093 out := make([]rune, 0, 1024)
1094 for x := 0; x < len(buffer); x++ {
1095 if buffi == x {
1096 out = append(out, '^')
1097 }
1098 if buffo == x {
1099 out = append(out, '[', buffer[x], ']')
1100 } else {
1101 out = append(out, buffer[x])
1102 }
1103 }
1104 return string(out)
1105}
1106
Akron84d68e62021-08-04 17:06:52 +02001107// Transduce an input string against the double array
Akron3610f102021-08-08 14:13:25 +02001108// FSA. The rules are always greedy. If the automaton fails,
1109// it takes the last possible token ending branch.
Akron068874c2021-08-04 15:19:56 +02001110//
Akron4db3ecf2021-08-11 18:49:03 +02001111// Based on Mizobuchi et al (2000), p. 129,
1112// with additional support for IDENTITY, UNKNOWN
1113// and EPSILON transitions and NONTOKEN and TOKENEND handling.
Akron3f8571a2021-08-05 11:18:10 +02001114func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron068874c2021-08-04 15:19:56 +02001115 var a int
Akronb4bbb472021-08-09 11:49:38 +02001116 var t0 uint32
Akronb7e1f132021-08-10 11:52:31 +02001117 t := uint32(1) // Initial state
1118 var ok, rewindBuffer bool
Akron068874c2021-08-04 15:19:56 +02001119
Akron3610f102021-08-08 14:13:25 +02001120 // Remember the last position of a possible tokenend,
1121 // in case the automaton fails.
1122 epsilonState := uint32(0)
1123 epsilonOffset := 0
1124
1125 // Implement a low level buffer for full control,
1126 // however - it is probably better to introduce
1127 // this on a higher level with a io.Reader interface
1128 // The buffer stores a single word and may have white
1129 // space at the end (but not at the beginning).
1130 //
1131 // This is the only backtracking requirement because of
1132 // epsilon transitions, to support tokenizations like:
1133 // "this is an example|.| And it works." vs
1134 // "this is an example.com| application."
Akronb7e1f132021-08-10 11:52:31 +02001135 //
1136 // TODO:
1137 // Store a translation buffer as well, so characters don't
1138 // have to be translated multiple times!
Akron3610f102021-08-08 14:13:25 +02001139 buffer := make([]rune, 1024)
1140 buffo := 0 // Buffer offset
1141 buffi := 0 // Buffer length
1142
Akron3f8571a2021-08-05 11:18:10 +02001143 reader := bufio.NewReader(r)
1144 writer := bufio.NewWriter(w)
1145 defer writer.Flush()
Akron068874c2021-08-04 15:19:56 +02001146
Akron3f8571a2021-08-05 11:18:10 +02001147 var char rune
1148 var err error
1149 eof := false
Akronb7e1f132021-08-10 11:52:31 +02001150 newchar := true
Akron3f8571a2021-08-05 11:18:10 +02001151
Akronc5d8d432021-08-10 16:48:44 +02001152PARSECHAR:
Akron3f8571a2021-08-05 11:18:10 +02001153 for {
1154
Akronb7e1f132021-08-10 11:52:31 +02001155 if newchar {
1156 // Get from reader if buffer is empty
1157 if buffo >= buffi {
Akron1594cb82021-08-11 11:14:56 +02001158 if eof {
1159 break
1160 }
Akronb7e1f132021-08-10 11:52:31 +02001161 char, _, err = reader.ReadRune()
Akron439f4ec2021-08-09 15:45:38 +02001162
Akronb7e1f132021-08-10 11:52:31 +02001163 // No more runes to read
1164 if err != nil {
1165 eof = true
1166 break
1167 }
1168 buffer[buffi] = char
1169 buffi++
Akron3f8571a2021-08-05 11:18:10 +02001170 }
Akronb7e1f132021-08-10 11:52:31 +02001171
1172 char = buffer[buffo]
1173
1174 if DEBUG {
1175 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
1176 }
1177
Akron6f1c16c2021-08-17 10:45:42 +02001178 // TODO:
1179 // Better not repeatedly check for a!
1180 // Possibly keep a buffer with a.
Akronea46e8a2021-08-17 00:36:31 +02001181 if int(char) < 256 {
1182 a = dat.sigmaASCII[int(char)]
1183 } else {
1184 a, ok = dat.sigma[char]
1185 if !ok {
1186 a = 0
1187 }
1188 }
Akronb7e1f132021-08-10 11:52:31 +02001189
1190 // Use identity symbol if character is not in sigma
Akronea46e8a2021-08-17 00:36:31 +02001191 if a == 0 && dat.identity != -1 {
Akronb7e1f132021-08-10 11:52:31 +02001192 a = dat.identity
1193 }
1194
1195 t0 = t
1196
1197 // Check for epsilon transitions and remember
Akronf1a16502021-08-16 15:24:38 +02001198 if dat.array[dat.array[t0].getBase()+uint32(dat.epsilon)].getCheck() == t0 {
Akronb7e1f132021-08-10 11:52:31 +02001199 // Remember state for backtracking to last tokenend state
1200 epsilonState = t0
1201 epsilonOffset = buffo
1202 }
Akron3f8571a2021-08-05 11:18:10 +02001203 }
Akron3610f102021-08-08 14:13:25 +02001204
Akronb7e1f132021-08-10 11:52:31 +02001205 // Checks a transition based on t0, a and buffo
Akronf1a16502021-08-16 15:24:38 +02001206 t = dat.array[t0].getBase() + uint32(a)
1207 ta := dat.array[t]
Akron068874c2021-08-04 15:19:56 +02001208
Akron524c5432021-08-05 14:14:27 +02001209 if DEBUG {
Akronb7e1f132021-08-10 11:52:31 +02001210 // Char is only relevant if set
1211 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
1212 if false {
1213 fmt.Println(dat.outgoing(t0))
1214 }
Akron524c5432021-08-05 14:14:27 +02001215 }
1216
Akronb7e1f132021-08-10 11:52:31 +02001217 // Check if the transition is invalid according to the double array
Akronf1a16502021-08-16 15:24:38 +02001218 if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
Akron068874c2021-08-04 15:19:56 +02001219
1220 if DEBUG {
Akronf1a16502021-08-16 15:24:38 +02001221 fmt.Println("Match is not fine!", t, "and", ta.getCheck(), "vs", t0)
Akron068874c2021-08-04 15:19:56 +02001222 }
1223
1224 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +02001225
Akron068874c2021-08-04 15:19:56 +02001226 // Try again with unknown symbol, in case identity failed
Akronb7e1f132021-08-10 11:52:31 +02001227 // Char is only relevant when set
Akron068874c2021-08-04 15:19:56 +02001228 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001229 fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +02001230 }
1231 a = dat.unknown
1232
1233 } else if a != dat.epsilon {
Akronb4bbb472021-08-09 11:49:38 +02001234
Akron068874c2021-08-04 15:19:56 +02001235 // Try again with epsilon symbol, in case everything else failed
Akronb4bbb472021-08-09 11:49:38 +02001236 t0 = epsilonState
Akron3610f102021-08-08 14:13:25 +02001237 epsilonState = 0 // reset
1238 buffo = epsilonOffset
Akron439f4ec2021-08-09 15:45:38 +02001239 a = dat.epsilon
1240
Akron3610f102021-08-08 14:13:25 +02001241 if DEBUG {
1242 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
1243 }
Akronb4bbb472021-08-09 11:49:38 +02001244
Akron068874c2021-08-04 15:19:56 +02001245 } else {
1246 break
1247 }
Akron068874c2021-08-04 15:19:56 +02001248
Akronb7e1f132021-08-10 11:52:31 +02001249 newchar = false
1250 continue
Akronb4bbb472021-08-09 11:49:38 +02001251 }
1252
Akronb7e1f132021-08-10 11:52:31 +02001253 // Transition was successful
1254 rewindBuffer = false
Akron439f4ec2021-08-09 15:45:38 +02001255
1256 // Transition consumes a character
Akronb4bbb472021-08-09 11:49:38 +02001257 if a != dat.epsilon {
1258
Akron3610f102021-08-08 14:13:25 +02001259 buffo++
Akronb4bbb472021-08-09 11:49:38 +02001260
Akron439f4ec2021-08-09 15:45:38 +02001261 // Transition does not produce a character
Akronf1a16502021-08-16 15:24:38 +02001262 if buffo == 1 && ta.isNonToken() {
Akron3610f102021-08-08 14:13:25 +02001263 if DEBUG {
1264 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
1265 }
Akron439f4ec2021-08-09 15:45:38 +02001266 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001267 }
Akron3f8571a2021-08-05 11:18:10 +02001268 }
Akron068874c2021-08-04 15:19:56 +02001269
Akronc5d8d432021-08-10 16:48:44 +02001270 // Transition marks the end of a token - so flush the buffer
Akronf1a16502021-08-16 15:24:38 +02001271 if ta.isTokenEnd() {
Akron524c5432021-08-05 14:14:27 +02001272
Akronc5d8d432021-08-10 16:48:44 +02001273 if buffi > 0 {
Akronc5d8d432021-08-10 16:48:44 +02001274 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +02001275 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
Akronc5d8d432021-08-10 16:48:44 +02001276 }
Akron01912fc2021-08-12 11:41:58 +02001277 writer.WriteString(string(buffer[:buffo]))
Akronc5d8d432021-08-10 16:48:44 +02001278 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001279 }
Akron1594cb82021-08-11 11:14:56 +02001280 if DEBUG {
1281 fmt.Println("-> Newline")
1282 }
1283 writer.WriteRune('\n')
Akron439f4ec2021-08-09 15:45:38 +02001284 }
Akron3610f102021-08-08 14:13:25 +02001285
Akronc5d8d432021-08-10 16:48:44 +02001286 // Rewind the buffer if necessary
Akron439f4ec2021-08-09 15:45:38 +02001287 if rewindBuffer {
1288
1289 // TODO: Better as a ring buffer
Akron3610f102021-08-08 14:13:25 +02001290 for x, i := range buffer[buffo:buffi] {
1291 buffer[x] = i
1292 }
Akronb4bbb472021-08-09 11:49:38 +02001293
Akron3610f102021-08-08 14:13:25 +02001294 buffi -= buffo
1295 epsilonOffset -= buffo
1296 buffo = 0
1297 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001298 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
Akron3610f102021-08-08 14:13:25 +02001299 }
Akron84d68e62021-08-04 17:06:52 +02001300 }
1301
Akronb7e1f132021-08-10 11:52:31 +02001302 // Move to representative state
Akronf1a16502021-08-16 15:24:38 +02001303 if ta.isSeparate() {
1304 t = ta.getBase()
1305 ta = dat.array[t]
Akronb7e1f132021-08-10 11:52:31 +02001306
1307 if DEBUG {
1308 fmt.Println("Representative pointing to", t)
1309 }
1310 }
1311
Akronc5d8d432021-08-10 16:48:44 +02001312 newchar = true
1313
Akron068874c2021-08-04 15:19:56 +02001314 // TODO:
1315 // Prevent endless epsilon loops!
1316 }
1317
Akron439f4ec2021-08-09 15:45:38 +02001318 // Input reader is not yet finished
Akron3f8571a2021-08-05 11:18:10 +02001319 if !eof {
Akron068874c2021-08-04 15:19:56 +02001320 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001321 fmt.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +02001322 }
1323 return false
1324 }
1325
Akronb7e1f132021-08-10 11:52:31 +02001326 if DEBUG {
1327 fmt.Println("Entering final check")
1328 }
1329
Akronc5d8d432021-08-10 16:48:44 +02001330 // Automaton is in a final state, so flush the buffer and return
Akronf1a16502021-08-16 15:24:38 +02001331 x := dat.array[t].getBase() + uint32(dat.final)
1332
1333 if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
Akronb4bbb472021-08-09 11:49:38 +02001334
1335 if buffi > 0 {
Akronb4bbb472021-08-09 11:49:38 +02001336 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +02001337 fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
Akron3f8571a2021-08-05 11:18:10 +02001338 }
Akron01912fc2021-08-12 11:41:58 +02001339 writer.WriteString(string(buffer[:buffi]))
Akron6e70dc82021-08-11 11:33:18 +02001340
Akronf1a16502021-08-16 15:24:38 +02001341 if dat.array[t].isTokenEnd() {
Akrondf0a3ef2021-08-09 15:53:45 +02001342 writer.WriteRune('\n')
Akronc5d8d432021-08-10 16:48:44 +02001343 if DEBUG {
1344 fmt.Println("-> Newline")
1345 }
Akrondf0a3ef2021-08-09 15:53:45 +02001346 }
Akron84d68e62021-08-04 17:06:52 +02001347 }
1348
Akron6e70dc82021-08-11 11:33:18 +02001349 // Add an additional sentence ending, if the file is over but no explicit
1350 // sentence split was reached. This may be controversial and therefore
1351 // optional via parameter.
Akronf1a16502021-08-16 15:24:38 +02001352 if !dat.array[t0].isTokenEnd() {
Akron6e70dc82021-08-11 11:33:18 +02001353 writer.WriteRune('\n')
1354 if DEBUG {
1355 fmt.Println("-> Newline")
1356 }
1357 }
1358
Akrone61380b2021-08-16 10:10:46 +02001359 // TODO:
1360 // There may be a new line at the end, from an epsilon,
1361 // so we may need to go on!
Akron068874c2021-08-04 15:19:56 +02001362 return true
1363 }
1364
Akronc5d8d432021-08-10 16:48:44 +02001365 // Check epsilon transitions until a final state is reached
1366 t0 = t
Akronf1a16502021-08-16 15:24:38 +02001367 t = dat.array[t0].getBase() + uint32(dat.epsilon)
Akron01912fc2021-08-12 11:41:58 +02001368 a = dat.epsilon
1369 newchar = false
Akronf1a16502021-08-16 15:24:38 +02001370 if dat.array[t].getCheck() == t0 {
Akronc5d8d432021-08-10 16:48:44 +02001371 // Remember state for backtracking to last tokenend state
Akronc5d8d432021-08-10 16:48:44 +02001372 goto PARSECHAR
Akrone61380b2021-08-16 10:10:46 +02001373
Akronc5d8d432021-08-10 16:48:44 +02001374 } else if epsilonState != 0 {
Akronb7e1f132021-08-10 11:52:31 +02001375 t0 = epsilonState
1376 epsilonState = 0 // reset
1377 buffo = epsilonOffset
Akron068874c2021-08-04 15:19:56 +02001378 if DEBUG {
Akronc5d8d432021-08-10 16:48:44 +02001379 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
Akron068874c2021-08-04 15:19:56 +02001380 }
Akronc5d8d432021-08-10 16:48:44 +02001381 goto PARSECHAR
Akron068874c2021-08-04 15:19:56 +02001382 }
Akronc5d8d432021-08-10 16:48:44 +02001383 return false
Akron068874c2021-08-04 15:19:56 +02001384}