blob: 1d5291b3f24a5ebe89afcac8d928e9f4a41380b5 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akron83e75a22021-08-04 13:14:06 +02009// The maximum number of states is 107.3741.823 (30bit),
10// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
13
Akron740f3d72021-08-03 12:12:34 +020014// TODO:
15// - replace maxSize with the check value
16// - Strip first state and make everything start with 0!
Akron740f3d72021-08-03 12:12:34 +020017
Akron8ef408b2021-08-02 22:11:04 +020018import (
19 "bufio"
Akron6247a5d2021-08-03 19:18:28 +020020 "bytes"
Akron8ef408b2021-08-02 22:11:04 +020021 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020022 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020023 "fmt"
24 "io"
25 "os"
Akronc9d84a62021-08-03 15:56:03 +020026 "sort"
Akron8ef408b2021-08-02 22:11:04 +020027 "strconv"
28 "strings"
29 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020030
31 "github.com/rs/zerolog/log"
Akron8ef408b2021-08-02 22:11:04 +020032)
33
34const (
Akron2a4b9292021-08-04 15:35:22 +020035 PROPS = 1
36 SIGMA = 2
37 STATES = 3
38 NONE = 4
39 NEWLINE = '\u000a'
40 DEBUG = false
41 MAGIC = "DATOK"
42 VERSION = uint16(1)
43 firstBit uint32 = 1 << 31
44 secondBit uint32 = 1 << 30
45 restBit uint32 = ^uint32(0) &^ (firstBit | secondBit)
Akron8ef408b2021-08-02 22:11:04 +020046)
47
Akron6247a5d2021-08-03 19:18:28 +020048var bo binary.ByteOrder = binary.LittleEndian
49
Akron8ef408b2021-08-02 22:11:04 +020050type mapping struct {
51 source int
Akron3fdfec62021-08-04 11:40:10 +020052 target uint32
Akron8ef408b2021-08-02 22:11:04 +020053}
54
55type edge struct {
Akron83e75a22021-08-04 13:14:06 +020056 inSym int
57 outSym int
58 end int
59 nontoken bool
60 tokenend bool
Akron8ef408b2021-08-02 22:11:04 +020061}
62
63type Tokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020064 // sigma map[rune]int
Akron740f3d72021-08-03 12:12:34 +020065 sigmaRev map[int]rune
66 arcCount int
67 stateCount int
68 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020069 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020070
71 // Special symbols in sigma
72 epsilon int
73 unknown int
74 identity int
75 final int
Akron8ef408b2021-08-02 22:11:04 +020076}
77
Akronf2120ca2021-08-03 16:26:41 +020078type DaTokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020079 // sigmaRev map[int]rune
Akron03a3c612021-08-04 11:51:27 +020080 sigma map[rune]int
81 maxSize int
82 loadFactor float64
83 array []uint32
Akron3f8571a2021-08-05 11:18:10 +020084 // lastFilledBase uint32
Akronc17f1ca2021-08-03 19:47:27 +020085
86 // Special symbols in sigma
87 epsilon int
88 unknown int
89 identity int
90 final int
Akronf2120ca2021-08-03 16:26:41 +020091}
92
Akron64ffd9a2021-08-03 19:55:21 +020093func LoadFomaFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +020094 f, err := os.Open(file)
95 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020096 log.Error().Err(err)
97 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020098 }
99 defer f.Close()
100
101 gz, err := gzip.NewReader(f)
102 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200103 log.Error().Err(err)
104 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200105 }
106 defer gz.Close()
107
Akron3fdfec62021-08-04 11:40:10 +0200108 return ParseFoma(gz)
Akron8ef408b2021-08-02 22:11:04 +0200109}
110
Akron3fdfec62021-08-04 11:40:10 +0200111func ParseFoma(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200112 r := bufio.NewReader(ior)
113
114 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200115 sigmaRev: make(map[int]rune),
Akronc17f1ca2021-08-03 19:47:27 +0200116 epsilon: -1,
117 unknown: -1,
118 identity: -1,
119 final: -1,
Akron8ef408b2021-08-02 22:11:04 +0200120 }
121
Akron740f3d72021-08-03 12:12:34 +0200122 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200123
124 mode := 0
125 var elem []string
126 var elemint [5]int
127
128 for {
129 line, err := r.ReadString('\n')
130 if err != nil {
131 if err == io.EOF {
132 break
133 }
Akron740f3d72021-08-03 12:12:34 +0200134 log.Error().Err(err)
135 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200136 }
137 if strings.HasPrefix(line, "##foma-net") {
138 continue
139 }
140 if strings.HasPrefix(line, "##props##") {
141 mode = PROPS
142 continue
143 }
144 if strings.HasPrefix(line, "##states##") {
145 mode = STATES
146
147 // Adds a final transition symbol to sigma
148 // written as '#' in Mizobuchi et al (2000)
Akron740f3d72021-08-03 12:12:34 +0200149 tok.sigmaCount++
Akronc17f1ca2021-08-03 19:47:27 +0200150 tok.final = tok.sigmaCount
Akron8ef408b2021-08-02 22:11:04 +0200151 continue
152 }
153 if strings.HasPrefix(line, "##sigma##") {
154 mode = SIGMA
155 continue
156 }
157 if strings.HasPrefix(line, "##end##") {
158 mode = NONE
159 continue
160 }
161
162 switch mode {
163 case PROPS:
164 {
165 elem = strings.Split(line, " ")
166 /*
167 fmt.Println("arity: " + elem[0])
168 fmt.Println("arccount: " + elem[1])
169 fmt.Println("statecount: " + elem[2])
170 fmt.Println("linecount: " + elem[3])
171 fmt.Println("finalcount: " + elem[4])
172 fmt.Println("pathcount: " + elem[5])
173 fmt.Println("is_deterministic: " + elem[6])
174 fmt.Println("is_pruned: " + elem[7])
175 fmt.Println("is_minimized: " + elem[8])
176 fmt.Println("is_epsilon_free: " + elem[9])
177 fmt.Println("is_loop_free: " + elem[10])
178 fmt.Println("extras: " + elem[11])
179 fmt.Println("name: " + elem[12])
180 */
181 if elem[6] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200182 log.Error().Msg("The FST needs to be deterministic")
183 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200184 }
185 if elem[9] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200186 log.Error().Msg("The FST needs to be epsilon free")
187 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200188 }
189
190 elemint[0], err = strconv.Atoi(elem[1])
191 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200192 log.Error().Msg("Can't read arccount")
193 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200194 }
Akron740f3d72021-08-03 12:12:34 +0200195 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200196
197 // States start at 1 in Mizobuchi et al (2000),
198 // as the state 0 is associated with a fail.
199 // Initialize states and transitions
200 elemint[0], err = strconv.Atoi(elem[2])
201 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200202 log.Error().Msg("Can't read statecount")
203 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200204 }
Akron740f3d72021-08-03 12:12:34 +0200205 tok.stateCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200206 tok.transitions = make([]map[int]*edge, elemint[0]+1)
207 continue
208 }
209 case STATES:
210 {
211 elem = strings.Split(line[0:len(line)-1], " ")
212 if elem[0] == "-1" {
213 continue
214 }
215 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200216 if err != nil {
217 break
218 }
Akron8ef408b2021-08-02 22:11:04 +0200219
220 if len(elem) > 1 {
221 elemint[1], err = strconv.Atoi(elem[1])
222 if err != nil {
223 break
224 }
225 if len(elem) > 2 {
226 elemint[2], err = strconv.Atoi(elem[2])
227 if err != nil {
228 break
229 }
230 if len(elem) > 3 {
231 elemint[3], err = strconv.Atoi(elem[3])
232 if err != nil {
233 break
234 }
235 if len(elem) > 4 {
236 elemint[4], err = strconv.Atoi(elem[4])
237 if err != nil {
238 break
239 }
240 }
241 }
242 }
243 }
244
245 switch len(elem) {
246 case 5:
247 {
Akron740f3d72021-08-03 12:12:34 +0200248 state = elemint[0]
249 inSym = elemint[1]
250 outSym = elemint[2]
251 end = elemint[3]
252 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200253 }
254 case 4:
255 {
256 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200257 state = elemint[0]
258 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200259 } else {
Akron740f3d72021-08-03 12:12:34 +0200260 state = elemint[0]
261 inSym = elemint[1]
262 end = elemint[2]
263 final = elemint[3]
264 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200265 }
266 }
267 case 3:
268 {
Akron740f3d72021-08-03 12:12:34 +0200269 inSym = elemint[0]
270 outSym = elemint[1]
271 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200272 }
273 case 2:
274 {
Akron740f3d72021-08-03 12:12:34 +0200275 inSym = elemint[0]
276 end = elemint[1]
277 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200278 }
279 }
280
Akron8ef408b2021-08-02 22:11:04 +0200281 // While the states in foma start with 0, the states in the
282 // Mizobuchi FSA start with one - so we increase every state by 1.
283
Akron83e75a22021-08-04 13:14:06 +0200284 nontoken := false
285 tokenend := false
286
Akron740f3d72021-08-03 12:12:34 +0200287 if inSym != outSym {
288
Akron83e75a22021-08-04 13:14:06 +0200289 if tok.sigmaRev[outSym] == NEWLINE {
290 tokenend = true
291 } else if outSym == tok.epsilon {
292 nontoken = true
293 } else {
Akron740f3d72021-08-03 12:12:34 +0200294 log.Error().Msg(
295 "Unsupported transition: " +
296 strconv.Itoa(state) +
297 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200298 " (" +
Akron740f3d72021-08-03 12:12:34 +0200299 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200300 ":" +
Akron740f3d72021-08-03 12:12:34 +0200301 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200302 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200303 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200304 ":" +
Akron740f3d72021-08-03 12:12:34 +0200305 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200306 ")")
Akron740f3d72021-08-03 12:12:34 +0200307 os.Exit(1)
Akron75ebe7f2021-08-03 10:34:10 +0200308 }
Akron83e75a22021-08-04 13:14:06 +0200309
Akron83e75a22021-08-04 13:14:06 +0200310 } else if inSym == tok.epsilon {
Akron068874c2021-08-04 15:19:56 +0200311 log.Error().Msg("Epsilon transitions not supported")
312 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200313 }
314
Akron740f3d72021-08-03 12:12:34 +0200315 // This collects all edges until arrstate changes
316
Akron8ef408b2021-08-02 22:11:04 +0200317 // TODO:
318 // if arrin == EPSILON && arrout == TOKENEND, mark state as newline
319 // if the next transition is the same, remove TOKENEND and add SENTENCEEND
320 // This requires to remove the transition alltogether and marks the state instead.
321
322 // TODO:
323 // if arrout == EPSILON, mark the transition as NOTOKEN
324
325 targetObj := &edge{
Akron83e75a22021-08-04 13:14:06 +0200326 inSym: inSym,
327 outSym: outSym,
328 end: end + 1,
329 tokenend: tokenend,
330 nontoken: nontoken,
Akron8ef408b2021-08-02 22:11:04 +0200331 }
332
Akron740f3d72021-08-03 12:12:34 +0200333 // Initialize outgoing states
334 if tok.transitions[state+1] == nil {
335 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200336 }
337
Akron740f3d72021-08-03 12:12:34 +0200338 // Ignore transitions with invalid symbols
339 if inSym >= 0 {
340 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200341 }
Akron8ef408b2021-08-02 22:11:04 +0200342
Akron740f3d72021-08-03 12:12:34 +0200343 // Add final transition
344 if final == 1 {
Akronc17f1ca2021-08-03 19:47:27 +0200345 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200346 }
347
Akron740f3d72021-08-03 12:12:34 +0200348 if DEBUG {
349 fmt.Println("Add",
350 state+1, "->", end+1,
351 "(",
352 inSym,
353 ":",
354 outSym,
355 ") (",
356 string(tok.sigmaRev[inSym]),
357 ":",
358 string(tok.sigmaRev[outSym]),
359 ")")
360 }
Akron75ebe7f2021-08-03 10:34:10 +0200361
Akron8ef408b2021-08-02 22:11:04 +0200362 continue
363 }
364 case SIGMA:
365 {
366 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
367
368 // Turn string into sigma id
369 number, err := strconv.Atoi(elem[0])
370
371 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200372 log.Error().Err(err)
373 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200374 }
375
Akron740f3d72021-08-03 12:12:34 +0200376 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200377
378 var symbol rune
379
380 // Read rune
381 if utf8.RuneCountInString(elem[1]) == 1 {
382 symbol = []rune(elem[1])[0]
383
384 // Probably a MCS
385 } else if utf8.RuneCountInString(elem[1]) > 1 {
386 switch elem[1] {
387 case "@_EPSILON_SYMBOL_@":
388 {
Akronc17f1ca2021-08-03 19:47:27 +0200389 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200390 continue
391 }
392 case "@_UNKNOWN_SYMBOL_@":
393 {
Akronc17f1ca2021-08-03 19:47:27 +0200394 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200395 continue
396 }
397
398 case "@_IDENTITY_SYMBOL_@":
399 {
Akronc17f1ca2021-08-03 19:47:27 +0200400 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200401 continue
402 }
403 default:
Akron740f3d72021-08-03 12:12:34 +0200404 {
405 log.Error().Msg("MCS not supported: " + line)
406 os.Exit(1)
407 }
Akron8ef408b2021-08-02 22:11:04 +0200408 }
409
Akron740f3d72021-08-03 12:12:34 +0200410 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200411 line, err = r.ReadString('\n')
412 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200413 log.Error().Err(err)
414 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200415 }
416 if len(line) != 1 {
Akron740f3d72021-08-03 12:12:34 +0200417 log.Error().Msg("MCS not supported:" + line)
418 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200419 }
Akron740f3d72021-08-03 12:12:34 +0200420 symbol = rune(NEWLINE)
Akron8ef408b2021-08-02 22:11:04 +0200421 }
422
Akron740f3d72021-08-03 12:12:34 +0200423 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200424 }
425 }
426 }
427
428 return tok
429}
430
Akron64ffd9a2021-08-03 19:55:21 +0200431// Set alphabet A to the list of all symbols
432// outgoing from s
433func (tok *Tokenizer) get_set(s int, A *[]int) {
434 for a := range tok.transitions[s] {
435 *A = append(*A, a)
436 }
437
438 // Not required, but simplifies bug hunting
439 sort.Ints(*A)
440}
441
Akron8ef408b2021-08-02 22:11:04 +0200442// Implementation of Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200443func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
444
445 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +0200446 sigma: make(map[rune]int),
447 loadFactor: -1,
448 final: tok.final,
449 unknown: tok.unknown,
450 identity: tok.identity,
451 epsilon: tok.epsilon,
Akron3f8571a2021-08-05 11:18:10 +0200452 // lastFilledBase: 1,
Akronf2120ca2021-08-03 16:26:41 +0200453 }
454
455 for num, sym := range tok.sigmaRev {
456 dat.sigma[sym] = num
457 }
Akron8ef408b2021-08-02 22:11:04 +0200458
459 mark := 0
460 size := 0
461
462 // Create a mapping from s to t
Akron740f3d72021-08-03 12:12:34 +0200463 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200464
465 table[size] = &mapping{source: 1, target: 1}
466 size++
467
Akron740f3d72021-08-03 12:12:34 +0200468 // Allocate space for the outgoing symbol range
469 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200470
471 for mark < size {
472 s := table[mark].source // This is a state in Ms
473 t := table[mark].target // This is a state in Mt
474 mark++
Akron740f3d72021-08-03 12:12:34 +0200475
476 // Following the paper, here the state t can be remembered
477 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200478 A = A[:0]
479 tok.get_set(s, &A)
480
Akron740f3d72021-08-03 12:12:34 +0200481 // Set base to the first free slot in the double array
Akronf2120ca2021-08-03 16:26:41 +0200482 dat.setBase(t, dat.xCheck(A))
Akron8ef408b2021-08-02 22:11:04 +0200483
Akron773b1ef2021-08-03 17:37:20 +0200484 // TODO:
Akron068874c2021-08-04 15:19:56 +0200485 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200486 // outdegree of .end
487
Akron740f3d72021-08-03 12:12:34 +0200488 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200489 for _, a := range A {
490
Akronc17f1ca2021-08-03 19:47:27 +0200491 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200492
Akron740f3d72021-08-03 12:12:34 +0200493 // Aka g(s, a)
494 s1 := tok.transitions[s][a].end
Akron8ef408b2021-08-02 22:11:04 +0200495
Akron740f3d72021-08-03 12:12:34 +0200496 // Store the transition
Akron3fdfec62021-08-04 11:40:10 +0200497 t1 := dat.getBase(t) + uint32(a)
Akronf2120ca2021-08-03 16:26:41 +0200498 dat.setCheck(t1, t)
Akron8ef408b2021-08-02 22:11:04 +0200499
Akron83e75a22021-08-04 13:14:06 +0200500 // Mark the state as being the target of a nontoken transition
501 if tok.transitions[s][a].nontoken {
Akron068874c2021-08-04 15:19:56 +0200502 dat.setNonToken(t1, true)
Akron83e75a22021-08-04 13:14:06 +0200503 }
504
Akron84d68e62021-08-04 17:06:52 +0200505 // Mark the state as being the target of a tokenend transition
506 if tok.transitions[s][a].tokenend {
507 dat.setTokenEnd(t1, true)
508 }
509
Akron740f3d72021-08-03 12:12:34 +0200510 // Check for representative states
Akron8ef408b2021-08-02 22:11:04 +0200511 r := in_table(s1, table, size)
Akron740f3d72021-08-03 12:12:34 +0200512
Akron8ef408b2021-08-02 22:11:04 +0200513 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200514 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200515 table[size] = &mapping{source: s1, target: t1}
516 size++
517 } else {
Akron740f3d72021-08-03 12:12:34 +0200518 // Overwrite with the representative state
Akron3fdfec62021-08-04 11:40:10 +0200519 dat.setBase(t1, r)
520 dat.setSeparate(t1, true)
Akron8ef408b2021-08-02 22:11:04 +0200521 }
522 } else {
Akron740f3d72021-08-03 12:12:34 +0200523 // Store a final transition
Akron3fdfec62021-08-04 11:40:10 +0200524 dat.setCheck(dat.getBase(t)+uint32(dat.final), t)
Akron8ef408b2021-08-02 22:11:04 +0200525 }
526 }
527 }
528
529 // Following Mizobuchi et al (2000) the size of the
530 // FSA should be stored in check(1).
Akron3fdfec62021-08-04 11:40:10 +0200531 dat.setSize(dat.maxSize + 1)
Akronf2120ca2021-08-03 16:26:41 +0200532 dat.array = dat.array[:dat.maxSize+1]
533 return dat
Akron8ef408b2021-08-02 22:11:04 +0200534}
535
Akron8ef408b2021-08-02 22:11:04 +0200536// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200537// exists and return this as a representative.
538// Currently iterates through the whole table
539// in a bruteforce manner.
Akron3fdfec62021-08-04 11:40:10 +0200540func in_table(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200541 for x := 0; x < size; x++ {
542 if table[x].source == s {
543 return table[x].target
544 }
545 }
546 return 0
547}
548
Akron64ffd9a2021-08-03 19:55:21 +0200549// Resize double array when necessary
550func (dat *DaTokenizer) resize(l int) {
551 // TODO:
552 // This is a bit too aggressive atm and should be calmed down.
553 if len(dat.array) <= l {
Akron3fdfec62021-08-04 11:40:10 +0200554 dat.array = append(dat.array, make([]uint32, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200555 }
Akron64ffd9a2021-08-03 19:55:21 +0200556}
Akronc9d84a62021-08-03 15:56:03 +0200557
Akron64ffd9a2021-08-03 19:55:21 +0200558// Set base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200559func (dat *DaTokenizer) setBase(p uint32, v uint32) {
560 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200561 dat.resize(l)
562 if dat.maxSize < l {
563 dat.maxSize = l
564 }
565 dat.array[p*2] = v
566}
567
Akron3fdfec62021-08-04 11:40:10 +0200568// Returns true if a state is separate pointing to a representative
569func (dat *DaTokenizer) isSeparate(p uint32) bool {
Akron2a4b9292021-08-04 15:35:22 +0200570 return dat.array[p*2]&firstBit != 0
Akron3fdfec62021-08-04 11:40:10 +0200571}
572
573// Mark a state as separate pointing to a representative
574func (dat *DaTokenizer) setSeparate(p uint32, sep bool) {
575 if sep {
Akron2a4b9292021-08-04 15:35:22 +0200576 dat.array[p*2] |= firstBit
Akron3fdfec62021-08-04 11:40:10 +0200577 } else {
Akron2a4b9292021-08-04 15:35:22 +0200578 dat.array[p*2] &= (restBit | secondBit)
Akron3fdfec62021-08-04 11:40:10 +0200579 }
580}
581
Akron83e75a22021-08-04 13:14:06 +0200582// Returns true if a state is the target of a nontoken transition
583func (dat *DaTokenizer) isNonToken(p uint32) bool {
Akron2a4b9292021-08-04 15:35:22 +0200584 return dat.array[p*2+1]&firstBit != 0
Akron83e75a22021-08-04 13:14:06 +0200585}
586
587// Mark a state as being the target of a nontoken transition
588func (dat *DaTokenizer) setNonToken(p uint32, sep bool) {
589 if sep {
Akron2a4b9292021-08-04 15:35:22 +0200590 dat.array[p*2+1] |= firstBit
Akron83e75a22021-08-04 13:14:06 +0200591 } else {
Akron2a4b9292021-08-04 15:35:22 +0200592 dat.array[p*2+1] &= (restBit | secondBit)
Akron83e75a22021-08-04 13:14:06 +0200593 }
594}
595
Akron84d68e62021-08-04 17:06:52 +0200596// Returns true if a state is the target of a tokenend transition
597func (dat *DaTokenizer) isTokenEnd(p uint32) bool {
598 return dat.array[p*2+1]&secondBit != 0
599}
600
601// Mark a state as being the target of a tokenend transition
602func (dat *DaTokenizer) setTokenEnd(p uint32, sep bool) {
603 if sep {
604 dat.array[p*2+1] |= secondBit
605 } else {
606 dat.array[p*2+1] &= (restBit | firstBit)
607 }
608}
609
Akron64ffd9a2021-08-03 19:55:21 +0200610// Get base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200611func (dat *DaTokenizer) getBase(p uint32) uint32 {
612 if int(p*2) >= len(dat.array) {
Akron64ffd9a2021-08-03 19:55:21 +0200613 return 0
614 }
Akron3fdfec62021-08-04 11:40:10 +0200615 return dat.array[p*2] & restBit
Akron64ffd9a2021-08-03 19:55:21 +0200616}
617
618// Set check value in double array
Akron3fdfec62021-08-04 11:40:10 +0200619func (dat *DaTokenizer) setCheck(p uint32, v uint32) {
620 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200621 dat.resize(l)
622 if dat.maxSize < l {
623 dat.maxSize = l
624 }
625 dat.array[(p*2)+1] = v
626}
627
628// Get check value in double array
Akron3fdfec62021-08-04 11:40:10 +0200629func (dat *DaTokenizer) getCheck(p uint32) uint32 {
630 if int((p*2)+1) >= len(dat.array) {
Akron64ffd9a2021-08-03 19:55:21 +0200631 return 0
632 }
Akron3fdfec62021-08-04 11:40:10 +0200633 return dat.array[(p*2)+1] & restBit
Akron64ffd9a2021-08-03 19:55:21 +0200634}
635
636// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200637func (dat *DaTokenizer) setSize(v int) {
638 dat.setCheck(1, uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200639}
640
641// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200642func (dat *DaTokenizer) GetSize() int {
643 return int(dat.getCheck(1))
Akron8ef408b2021-08-02 22:11:04 +0200644}
645
646// Based on Mizobuchi et al (2000), p. 124
647// This iterates for every state through the complete double array
648// structure until it finds a gap that fits all outgoing transitions
649// of the state. This is extremely slow, but is only necessary in the
650// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200651func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200652
653 // Start at the first entry of the double array list
Akron3f8571a2021-08-05 11:18:10 +0200654 base := uint32(1) // dat.lastFilledBase
655 // skip := false
Akron8ef408b2021-08-02 22:11:04 +0200656OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200657
Akron3f8571a2021-08-05 11:18:10 +0200658 /*
659 if !skip {
660 if dat.getCheck(base) != 0 {
661 dat.lastFilledBase = base
662 } else {
663 skip = true
664 }
665 }
666 */
667
Akron740f3d72021-08-03 12:12:34 +0200668 // Resize the array if necessary
Akron3fdfec62021-08-04 11:40:10 +0200669 dat.resize((int(base) + dat.final) * 2)
Akron8ef408b2021-08-02 22:11:04 +0200670 for _, a := range symbols {
Akron3fdfec62021-08-04 11:40:10 +0200671 if dat.getCheck(base+uint32(a)) != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200672 base++
673 goto OVERLAP
674 }
675 }
Akron8ef408b2021-08-02 22:11:04 +0200676 return base
677}
678
Akron03a3c612021-08-04 11:51:27 +0200679// LoadFactor as defined in Kanda et al (2018),
680// i.e. the proportion of non-empty elements to all elements.
681func (dat *DaTokenizer) LoadFactor() float64 {
Akrond66a9262021-08-03 17:09:09 +0200682
Akron03a3c612021-08-04 11:51:27 +0200683 // Cache the loadfactor
Akron3f8571a2021-08-05 11:18:10 +0200684 if dat.loadFactor > 0 {
Akron03a3c612021-08-04 11:51:27 +0200685 return dat.loadFactor
Akron773b1ef2021-08-03 17:37:20 +0200686 }
Akrond66a9262021-08-03 17:09:09 +0200687 nonEmpty := 0
688 all := len(dat.array) / 2
689 for x := 1; x <= len(dat.array); x = x + 2 {
690 if dat.array[x] != 0 {
691 nonEmpty++
692 }
693 }
Akron03a3c612021-08-04 11:51:27 +0200694 dat.loadFactor = float64(nonEmpty) / float64(all) * 100
695 return dat.loadFactor
Akrond66a9262021-08-03 17:09:09 +0200696}
697
Akron6247a5d2021-08-03 19:18:28 +0200698// WriteTo stores the double array data in an io.Writer.
699func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
700
701 // Store magical header
702 all, err := w.Write([]byte(MAGIC))
703 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200704 log.Error().Err(err)
705 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200706 }
707
708 // Get sigma as a list
709 sigmalist := make([]rune, len(dat.sigma)+16)
710 max := 0
711 for sym, num := range dat.sigma {
712 sigmalist[num] = sym
713 if num > max {
714 max = num
715 }
716 }
717
718 sigmalist = sigmalist[:max+1]
719
Akron3f8571a2021-08-05 11:18:10 +0200720 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200721 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200722 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
723 bo.PutUint16(buf[4:6], uint16(dat.unknown))
724 bo.PutUint16(buf[6:8], uint16(dat.identity))
725 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200726 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akron3f8571a2021-08-05 11:18:10 +0200727 bo.PutUint32(buf[12:16], uint32(len(dat.array)))
728 more, err := w.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200729 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200730 log.Error().Err(err)
731 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200732 }
733
734 all += more
735
736 wbuf := bytes.NewBuffer(nil)
737 wbufWrap := bufio.NewWriter(wbuf)
738
739 // Write sigma
740 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200741
Akron6247a5d2021-08-03 19:18:28 +0200742 more, err = wbufWrap.WriteRune(sym)
743 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200744 log.Error().Err(err)
745 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200746 }
747 all += more
748 }
749 wbufWrap.Flush()
750 more, err = w.Write(wbuf.Bytes())
751 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200752 log.Error().Err(err)
753 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200754 }
755 all += more
756
757 // Test marker - could be checksum
758 more, err = w.Write([]byte("T"))
759 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200760 log.Error().Err(err)
761 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200762 }
763 all += more
764
765 wbuf.Reset()
766
767 for _, d := range dat.array {
Akron3fdfec62021-08-04 11:40:10 +0200768 bo.PutUint32(buf[0:4], d)
Akron6247a5d2021-08-03 19:18:28 +0200769 more, err := w.Write(buf[0:4])
770 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200771 log.Error().Err(err)
772 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200773 }
774 all += more
775 }
776
777 return int64(all), err
778}
779
Akron3f8571a2021-08-05 11:18:10 +0200780func LoadDatokFile(file string) *DaTokenizer {
781 f, err := os.Open(file)
782 if err != nil {
783 log.Error().Err(err)
784 os.Exit(0)
785 }
786 defer f.Close()
787
788 gz, err := gzip.NewReader(f)
789 if err != nil {
790 log.Error().Err(err)
791 os.Exit(0)
792 }
793 defer gz.Close()
794
795 return ParseDatok(gz)
796}
797
798func ParseDatok(ior io.Reader) *DaTokenizer {
799
800 dat := &DaTokenizer{
801 sigma: make(map[rune]int),
802 epsilon: 0,
803 unknown: 0,
804 identity: 0,
805 final: 0,
806 loadFactor: 0,
807 }
808
809 r := bufio.NewReader(ior)
810
811 all := 0
812
813 buf := make([]byte, 1024)
814 buf = buf[0:len(MAGIC)]
815
816 more, err := r.Read(buf)
817
818 if err != nil {
819 log.Error().Err(err)
820 return nil
821 }
822
823 all += more
824
825 if string(MAGIC) != string(buf) {
826 log.Error().Msg("Not a datok file")
827 return nil
828 }
829
830 more, err = r.Read(buf[0:16])
831 if err != nil {
832 log.Error().Err(err)
833 return nil
834 }
835
836 all += more
837
838 // version := bo.Uint16(buf[0:2])
839 dat.epsilon = int(bo.Uint16(buf[2:4]))
840 dat.unknown = int(bo.Uint16(buf[4:6]))
841 dat.identity = int(bo.Uint16(buf[6:8]))
842 dat.final = int(bo.Uint16(buf[8:10]))
843
844 sigmaCount := int(bo.Uint16(buf[10:12]))
845 arraySize := int(bo.Uint32(buf[12:16]))
846
847 for x := 0; x < sigmaCount; x++ {
848 sym, more, err := r.ReadRune()
849 if err == nil && sym != 0 {
850 dat.sigma[sym] = x
851 }
852 all += more
853 }
854
855 more, err = r.Read(buf[0:1])
856
857 if err != nil {
858 log.Error().Err(err)
859 return nil
860 }
861
862 all += more
863
864 if string("T") != string(buf[0:1]) {
865 log.Error().Msg("Not a datok file")
866 return nil
867 }
868
869 // Read based on length
870 dat.array = make([]uint32, arraySize)
871
872 for x := 0; x < arraySize; x++ {
873 more, err = r.Read(buf[0:4])
874 if err != nil {
875 log.Error().Err(err)
876 return nil
877 }
878 all += more
879 dat.array[x] = bo.Uint32(buf[0:4])
880 }
881
882 return dat
883}
884
Akron740f3d72021-08-03 12:12:34 +0200885// Match an input string against the double array
886// FSA.
887//
888// Based on Mizobuchi et al (2000), p. 129,
889// with additional support for IDENTITY, UNKNOWN
890// and EPSILON transitions.
Akron64ffd9a2021-08-03 19:55:21 +0200891func (dat *DaTokenizer) Match(input string) bool {
Akron465a0992021-08-03 11:28:48 +0200892 var a int
Akron3fdfec62021-08-04 11:40:10 +0200893 var tu uint32
Akron465a0992021-08-03 11:28:48 +0200894 var ok bool
895
Akron3fdfec62021-08-04 11:40:10 +0200896 t := uint32(1) // Initial state
Akron740f3d72021-08-03 12:12:34 +0200897 chars := []rune(input)
898 i := 0
899
Akron49d27ee2021-08-03 11:58:13 +0200900 for i < len(chars) {
Akron64ffd9a2021-08-03 19:55:21 +0200901 a, ok = dat.sigma[chars[i]]
Akron730a79c2021-08-03 11:05:29 +0200902
Akron740f3d72021-08-03 12:12:34 +0200903 // Support identity symbol if character is not in sigma
Akron64ffd9a2021-08-03 19:55:21 +0200904 if !ok && dat.identity != -1 {
Akron740f3d72021-08-03 12:12:34 +0200905 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200906 fmt.Println("IDENTITY symbol", string(chars[i]), "->", dat.identity)
Akron740f3d72021-08-03 12:12:34 +0200907 }
Akron64ffd9a2021-08-03 19:55:21 +0200908 a = dat.identity
Akron740f3d72021-08-03 12:12:34 +0200909 } else if DEBUG {
Akron49d27ee2021-08-03 11:58:13 +0200910 fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
Akron730a79c2021-08-03 11:05:29 +0200911 }
Akron465a0992021-08-03 11:28:48 +0200912 tu = t
Akron730a79c2021-08-03 11:05:29 +0200913 CHECK:
Akron3fdfec62021-08-04 11:40:10 +0200914 t = dat.getBase(tu) + uint32(a)
Akron730a79c2021-08-03 11:05:29 +0200915
Akron740f3d72021-08-03 12:12:34 +0200916 // Check if the transition is valid according to the double array
Akron64ffd9a2021-08-03 19:55:21 +0200917 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +0200918
919 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200920 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron730a79c2021-08-03 11:05:29 +0200921 }
Akron740f3d72021-08-03 12:12:34 +0200922
Akron64ffd9a2021-08-03 19:55:21 +0200923 if !ok && a == dat.identity {
Akron740f3d72021-08-03 12:12:34 +0200924 // Try again with unknown symbol, in case identity failed
925 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200926 fmt.Println("UNKNOWN symbol", string(chars[i]), "->", dat.unknown)
Akron740f3d72021-08-03 12:12:34 +0200927 }
Akron64ffd9a2021-08-03 19:55:21 +0200928 a = dat.unknown
Akron740f3d72021-08-03 12:12:34 +0200929
Akron64ffd9a2021-08-03 19:55:21 +0200930 } else if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +0200931 // Try again with epsilon symbol, in case everything else failed
932 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200933 fmt.Println("EPSILON symbol", string(chars[i]), "->", dat.epsilon)
Akron740f3d72021-08-03 12:12:34 +0200934 }
Akron64ffd9a2021-08-03 19:55:21 +0200935 a = dat.epsilon
Akron740f3d72021-08-03 12:12:34 +0200936 } else {
937 break
938 }
939 goto CHECK
Akron3fdfec62021-08-04 11:40:10 +0200940 } else if dat.isSeparate(t) {
Akron730a79c2021-08-03 11:05:29 +0200941 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +0200942 t = dat.getBase(t)
Akron8ef408b2021-08-02 22:11:04 +0200943 }
Akron740f3d72021-08-03 12:12:34 +0200944
945 // Transition is fine
Akron64ffd9a2021-08-03 19:55:21 +0200946 if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +0200947 // Character consumed
Akron49d27ee2021-08-03 11:58:13 +0200948 i++
949 }
Akron83e75a22021-08-04 13:14:06 +0200950
Akron740f3d72021-08-03 12:12:34 +0200951 // TODO:
952 // Prevent endless epsilon loops!
Akron8ef408b2021-08-02 22:11:04 +0200953 }
954
Akron740f3d72021-08-03 12:12:34 +0200955 if i != len(chars) {
956 if DEBUG {
957 fmt.Println("Not at the end")
958 }
Akron8ef408b2021-08-02 22:11:04 +0200959 return false
960 }
961
Akron465a0992021-08-03 11:28:48 +0200962FINALCHECK:
Akron740f3d72021-08-03 12:12:34 +0200963
964 // Automaton is in a final state
Akron3fdfec62021-08-04 11:40:10 +0200965 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
Akron8ef408b2021-08-02 22:11:04 +0200966 return true
967 }
Akron465a0992021-08-03 11:28:48 +0200968
Akron740f3d72021-08-03 12:12:34 +0200969 // Check epsilon transitions until a final state is reached
Akron465a0992021-08-03 11:28:48 +0200970 tu = t
Akron3fdfec62021-08-04 11:40:10 +0200971 t = dat.getBase(tu) + uint32(dat.epsilon)
Akron465a0992021-08-03 11:28:48 +0200972
Akron740f3d72021-08-03 12:12:34 +0200973 // Epsilon transition failed
Akron64ffd9a2021-08-03 19:55:21 +0200974 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +0200975 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200976 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron740f3d72021-08-03 12:12:34 +0200977 }
Akron465a0992021-08-03 11:28:48 +0200978 return false
Akron740f3d72021-08-03 12:12:34 +0200979
Akron3fdfec62021-08-04 11:40:10 +0200980 } else if dat.isSeparate(t) {
Akron465a0992021-08-03 11:28:48 +0200981 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +0200982 t = dat.getBase(t)
Akron465a0992021-08-03 11:28:48 +0200983 }
Akron740f3d72021-08-03 12:12:34 +0200984
Akron465a0992021-08-03 11:28:48 +0200985 goto FINALCHECK
Akron8ef408b2021-08-02 22:11:04 +0200986}
Akron068874c2021-08-04 15:19:56 +0200987
Akron84d68e62021-08-04 17:06:52 +0200988// Transduce an input string against the double array
Akron068874c2021-08-04 15:19:56 +0200989// FSA.
990//
991// Based on Match with additional support
Akron84d68e62021-08-04 17:06:52 +0200992// for NONTOKEN and TOKENEND handling
Akron3f8571a2021-08-05 11:18:10 +0200993func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron068874c2021-08-04 15:19:56 +0200994 var a int
995 var tu uint32
Akron84d68e62021-08-04 17:06:52 +0200996 var ok, nontoken, tokenend bool
Akron068874c2021-08-04 15:19:56 +0200997
Akron3f8571a2021-08-05 11:18:10 +0200998 reader := bufio.NewReader(r)
999 writer := bufio.NewWriter(w)
1000 defer writer.Flush()
Akron068874c2021-08-04 15:19:56 +02001001
Akron3f8571a2021-08-05 11:18:10 +02001002 t := uint32(1) // Initial state
1003 // chars := []rune(input)
1004 skip := false
1005
1006 var char rune
1007 var err error
1008 eof := false
1009
1010 for {
1011
1012 if !skip {
1013 char, _, err = reader.ReadRune()
1014 if err != nil {
1015 eof = true
1016 break
1017 }
1018 }
1019 skip = false
1020
1021 a, ok = dat.sigma[char]
Akron068874c2021-08-04 15:19:56 +02001022
1023 // Support identity symbol if character is not in sigma
1024 if !ok && dat.identity != -1 {
1025 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001026 fmt.Println("IDENTITY symbol", string(char), "->", dat.identity)
Akron068874c2021-08-04 15:19:56 +02001027 }
1028 a = dat.identity
1029 } else if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001030 fmt.Println("Sigma transition is okay for [", string(char), "]")
Akron068874c2021-08-04 15:19:56 +02001031 }
1032 tu = t
1033 CHECK:
1034 nontoken = false
Akron84d68e62021-08-04 17:06:52 +02001035 tokenend = false
1036
Akron068874c2021-08-04 15:19:56 +02001037 t = dat.getBase(tu) + uint32(a)
1038
1039 // Check if the transition is valid according to the double array
1040 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
1041
1042 if DEBUG {
1043 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
1044 }
1045
1046 if !ok && a == dat.identity {
1047 // Try again with unknown symbol, in case identity failed
1048 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001049 fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +02001050 }
1051 a = dat.unknown
1052
1053 } else if a != dat.epsilon {
1054 // Try again with epsilon symbol, in case everything else failed
1055 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001056 fmt.Println("EPSILON symbol", string(char), "->", dat.epsilon)
Akron068874c2021-08-04 15:19:56 +02001057 }
1058 a = dat.epsilon
1059 } else {
1060 break
1061 }
1062 goto CHECK
1063 } else if dat.isSeparate(t) {
1064 // Move to representative state
1065 nontoken = dat.isNonToken(t)
Akron84d68e62021-08-04 17:06:52 +02001066 tokenend = dat.isTokenEnd(t)
Akron068874c2021-08-04 15:19:56 +02001067
1068 t = dat.getBase(t)
Akron84d68e62021-08-04 17:06:52 +02001069
Akron068874c2021-08-04 15:19:56 +02001070 } else {
1071 nontoken = dat.isNonToken(t)
Akron84d68e62021-08-04 17:06:52 +02001072 tokenend = dat.isTokenEnd(t)
Akron068874c2021-08-04 15:19:56 +02001073 }
1074
1075 // Transition is fine
Akron3f8571a2021-08-05 11:18:10 +02001076 if a == dat.epsilon {
1077 skip = true
Akron068874c2021-08-04 15:19:56 +02001078 // Character consumed
Akron3f8571a2021-08-05 11:18:10 +02001079 } else if !nontoken {
1080 writer.WriteRune(char)
1081 }
Akron068874c2021-08-04 15:19:56 +02001082
Akron3f8571a2021-08-05 11:18:10 +02001083 /*
1084 if nontoken {
1085 writer.WriteRune(("<|>")
Akron068874c2021-08-04 15:19:56 +02001086 }
Akron3f8571a2021-08-05 11:18:10 +02001087 */
Akron068874c2021-08-04 15:19:56 +02001088
Akron84d68e62021-08-04 17:06:52 +02001089 if tokenend {
Akron3f8571a2021-08-05 11:18:10 +02001090 writer.WriteRune('\n')
Akron84d68e62021-08-04 17:06:52 +02001091 }
1092
Akron068874c2021-08-04 15:19:56 +02001093 // TODO:
1094 // Prevent endless epsilon loops!
1095 }
1096
Akron3f8571a2021-08-05 11:18:10 +02001097 if !eof {
Akron068874c2021-08-04 15:19:56 +02001098 if DEBUG {
1099 fmt.Println("Not at the end")
1100 }
1101 return false
1102 }
1103
1104FINALCHECK:
1105
1106 // Automaton is in a final state
1107 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
Akron3f8571a2021-08-05 11:18:10 +02001108 /*
1109 if dat.isNonToken(t) {
1110 fmt.Print("<|>")
1111 }
1112 */
Akron84d68e62021-08-04 17:06:52 +02001113 if dat.isTokenEnd(t) {
Akron3f8571a2021-08-05 11:18:10 +02001114 writer.WriteRune('\n')
Akron84d68e62021-08-04 17:06:52 +02001115 }
1116
1117 // There may be a new line at the end, from an epsilon, so we go on!
Akron068874c2021-08-04 15:19:56 +02001118 return true
1119 }
1120
1121 // Check epsilon transitions until a final state is reached
1122 tu = t
1123 t = dat.getBase(tu) + uint32(dat.epsilon)
1124
1125 // Epsilon transition failed
1126 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
1127 if DEBUG {
1128 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
1129 }
1130 return false
1131
1132 } else if dat.isSeparate(t) {
Akron3f8571a2021-08-05 11:18:10 +02001133 // nontoken = dat.isNonToken(t)
1134 tokenend = dat.isTokenEnd(t)
1135
Akron068874c2021-08-04 15:19:56 +02001136 // Move to representative state
1137 t = dat.getBase(t)
1138 } else {
Akron3f8571a2021-08-05 11:18:10 +02001139 tokenend = dat.isTokenEnd(t)
1140 // nontoken = dat.isNonToken(t)
Akron068874c2021-08-04 15:19:56 +02001141 }
1142
Akron3f8571a2021-08-05 11:18:10 +02001143 /*
1144 if nontoken {
1145 fmt.Print("<|>")
1146 }
1147 */
1148
1149 if tokenend {
1150 writer.WriteRune('\n')
Akron068874c2021-08-04 15:19:56 +02001151 }
1152
1153 goto FINALCHECK
1154}