blob: e128d49b0c0758cc96f9597507f01f45f77ae0dd [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akron740f3d72021-08-03 12:12:34 +02009// TODO:
10// - replace maxSize with the check value
11// - Strip first state and make everything start with 0!
12// - Serialize!
13// - Split Tokenizer and DATokenizer
14
Akron8ef408b2021-08-02 22:11:04 +020015import (
16 "bufio"
17 "compress/gzip"
18 "fmt"
19 "io"
20 "os"
21 "strconv"
22 "strings"
23 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020024
25 "github.com/rs/zerolog/log"
Akron8ef408b2021-08-02 22:11:04 +020026)
27
28const (
Akron75ebe7f2021-08-03 10:34:10 +020029 PROPS = 1
30 SIGMA = 2
31 STATES = 3
32 NONE = 4
33 NEWLINE = '\u000a'
Akron740f3d72021-08-03 12:12:34 +020034 DEBUG = false
Akron8ef408b2021-08-02 22:11:04 +020035)
36
37// Special symbols in sigma
Akron730a79c2021-08-03 11:05:29 +020038var EPSILON = -1
39var UNKNOWN = -1
40var IDENTITY = -1
41var FINAL = -1
Akron8ef408b2021-08-02 22:11:04 +020042
43type mapping struct {
44 source int
45 target int
46}
47
48type edge struct {
Akron740f3d72021-08-03 12:12:34 +020049 inSym int
50 outSym int
51 end int
Akron8ef408b2021-08-02 22:11:04 +020052}
53
54type Tokenizer struct {
55 sigma map[rune]int
Akron740f3d72021-08-03 12:12:34 +020056 sigmaRev map[int]rune
57 arcCount int
58 stateCount int
59 sigmaCount int
60 maxSize int
Akron8ef408b2021-08-02 22:11:04 +020061 array []int
62 transitions []map[int]*edge
63}
64
Akron740f3d72021-08-03 12:12:34 +020065func ParseFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +020066 f, err := os.Open(file)
67 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020068 log.Error().Err(err)
69 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020070 }
71 defer f.Close()
72
73 gz, err := gzip.NewReader(f)
74 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020075 log.Error().Err(err)
76 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020077 }
78 defer gz.Close()
79
Akron740f3d72021-08-03 12:12:34 +020080 return Parse(gz)
Akron8ef408b2021-08-02 22:11:04 +020081}
82
Akron740f3d72021-08-03 12:12:34 +020083func Parse(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +020084 r := bufio.NewReader(ior)
85
86 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +020087 sigma: make(map[rune]int),
88 sigmaRev: make(map[int]rune),
Akron8ef408b2021-08-02 22:11:04 +020089 }
90
Akron740f3d72021-08-03 12:12:34 +020091 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +020092
93 mode := 0
94 var elem []string
95 var elemint [5]int
96
97 for {
98 line, err := r.ReadString('\n')
99 if err != nil {
100 if err == io.EOF {
101 break
102 }
Akron740f3d72021-08-03 12:12:34 +0200103 log.Error().Err(err)
104 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200105 }
106 if strings.HasPrefix(line, "##foma-net") {
107 continue
108 }
109 if strings.HasPrefix(line, "##props##") {
110 mode = PROPS
111 continue
112 }
113 if strings.HasPrefix(line, "##states##") {
114 mode = STATES
115
116 // Adds a final transition symbol to sigma
117 // written as '#' in Mizobuchi et al (2000)
Akron740f3d72021-08-03 12:12:34 +0200118 tok.sigmaCount++
119 FINAL = tok.sigmaCount
Akron8ef408b2021-08-02 22:11:04 +0200120 continue
121 }
122 if strings.HasPrefix(line, "##sigma##") {
123 mode = SIGMA
124 continue
125 }
126 if strings.HasPrefix(line, "##end##") {
127 mode = NONE
128 continue
129 }
130
131 switch mode {
132 case PROPS:
133 {
134 elem = strings.Split(line, " ")
135 /*
136 fmt.Println("arity: " + elem[0])
137 fmt.Println("arccount: " + elem[1])
138 fmt.Println("statecount: " + elem[2])
139 fmt.Println("linecount: " + elem[3])
140 fmt.Println("finalcount: " + elem[4])
141 fmt.Println("pathcount: " + elem[5])
142 fmt.Println("is_deterministic: " + elem[6])
143 fmt.Println("is_pruned: " + elem[7])
144 fmt.Println("is_minimized: " + elem[8])
145 fmt.Println("is_epsilon_free: " + elem[9])
146 fmt.Println("is_loop_free: " + elem[10])
147 fmt.Println("extras: " + elem[11])
148 fmt.Println("name: " + elem[12])
149 */
150 if elem[6] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200151 log.Error().Msg("The FST needs to be deterministic")
152 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200153 }
154 if elem[9] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200155 log.Error().Msg("The FST needs to be epsilon free")
156 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200157 }
158
159 elemint[0], err = strconv.Atoi(elem[1])
160 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200161 log.Error().Msg("Can't read arccount")
162 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200163 }
Akron740f3d72021-08-03 12:12:34 +0200164 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200165
166 // States start at 1 in Mizobuchi et al (2000),
167 // as the state 0 is associated with a fail.
168 // Initialize states and transitions
169 elemint[0], err = strconv.Atoi(elem[2])
170 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200171 log.Error().Msg("Can't read statecount")
172 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200173 }
Akron740f3d72021-08-03 12:12:34 +0200174 tok.stateCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200175 tok.transitions = make([]map[int]*edge, elemint[0]+1)
176 continue
177 }
178 case STATES:
179 {
180 elem = strings.Split(line[0:len(line)-1], " ")
181 if elem[0] == "-1" {
182 continue
183 }
184 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200185 if err != nil {
186 break
187 }
Akron8ef408b2021-08-02 22:11:04 +0200188
189 if len(elem) > 1 {
190 elemint[1], err = strconv.Atoi(elem[1])
191 if err != nil {
192 break
193 }
194 if len(elem) > 2 {
195 elemint[2], err = strconv.Atoi(elem[2])
196 if err != nil {
197 break
198 }
199 if len(elem) > 3 {
200 elemint[3], err = strconv.Atoi(elem[3])
201 if err != nil {
202 break
203 }
204 if len(elem) > 4 {
205 elemint[4], err = strconv.Atoi(elem[4])
206 if err != nil {
207 break
208 }
209 }
210 }
211 }
212 }
213
214 switch len(elem) {
215 case 5:
216 {
Akron740f3d72021-08-03 12:12:34 +0200217 state = elemint[0]
218 inSym = elemint[1]
219 outSym = elemint[2]
220 end = elemint[3]
221 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200222 }
223 case 4:
224 {
225 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200226 state = elemint[0]
227 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200228 } else {
Akron740f3d72021-08-03 12:12:34 +0200229 state = elemint[0]
230 inSym = elemint[1]
231 end = elemint[2]
232 final = elemint[3]
233 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200234 }
235 }
236 case 3:
237 {
Akron740f3d72021-08-03 12:12:34 +0200238 inSym = elemint[0]
239 outSym = elemint[1]
240 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200241 }
242 case 2:
243 {
Akron740f3d72021-08-03 12:12:34 +0200244 inSym = elemint[0]
245 end = elemint[1]
246 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200247 }
248 }
249
Akron8ef408b2021-08-02 22:11:04 +0200250 // While the states in foma start with 0, the states in the
251 // Mizobuchi FSA start with one - so we increase every state by 1.
252
Akron740f3d72021-08-03 12:12:34 +0200253 if inSym != outSym {
254
255 // Allow any epsilon to become a newline
256 if !(inSym == EPSILON && tok.sigmaRev[outSym] == NEWLINE) &&
257
258 // Allow any whitespace to be ignored
259 !(inSym != EPSILON && outSym == EPSILON) &&
260
261 // Allow any whitespace to become a new line
262 !(tok.sigmaRev[outSym] == NEWLINE) {
263
264 log.Error().Msg(
265 "Unsupported transition: " +
266 strconv.Itoa(state) +
267 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200268 " (" +
Akron740f3d72021-08-03 12:12:34 +0200269 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200270 ":" +
Akron740f3d72021-08-03 12:12:34 +0200271 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200272 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200273 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200274 ":" +
Akron740f3d72021-08-03 12:12:34 +0200275 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200276 ")")
Akron740f3d72021-08-03 12:12:34 +0200277 os.Exit(1)
Akron75ebe7f2021-08-03 10:34:10 +0200278 }
Akron8ef408b2021-08-02 22:11:04 +0200279 }
280
Akron740f3d72021-08-03 12:12:34 +0200281 // This collects all edges until arrstate changes
282
Akron8ef408b2021-08-02 22:11:04 +0200283 // TODO:
284 // if arrin == EPSILON && arrout == TOKENEND, mark state as newline
285 // if the next transition is the same, remove TOKENEND and add SENTENCEEND
286 // This requires to remove the transition alltogether and marks the state instead.
287
288 // TODO:
289 // if arrout == EPSILON, mark the transition as NOTOKEN
290
291 targetObj := &edge{
Akron740f3d72021-08-03 12:12:34 +0200292 inSym: inSym,
293 outSym: outSym,
294 end: end + 1,
Akron8ef408b2021-08-02 22:11:04 +0200295 }
296
Akron740f3d72021-08-03 12:12:34 +0200297 // Initialize outgoing states
298 if tok.transitions[state+1] == nil {
299 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200300 }
301
Akron740f3d72021-08-03 12:12:34 +0200302 // Ignore transitions with invalid symbols
303 if inSym >= 0 {
304 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200305 }
Akron8ef408b2021-08-02 22:11:04 +0200306
Akron740f3d72021-08-03 12:12:34 +0200307 // Add final transition
308 if final == 1 {
309 tok.transitions[state+1][FINAL] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200310 }
311
Akron740f3d72021-08-03 12:12:34 +0200312 if DEBUG {
313 fmt.Println("Add",
314 state+1, "->", end+1,
315 "(",
316 inSym,
317 ":",
318 outSym,
319 ") (",
320 string(tok.sigmaRev[inSym]),
321 ":",
322 string(tok.sigmaRev[outSym]),
323 ")")
324 }
Akron75ebe7f2021-08-03 10:34:10 +0200325
Akron8ef408b2021-08-02 22:11:04 +0200326 continue
327 }
328 case SIGMA:
329 {
330 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
331
332 // Turn string into sigma id
333 number, err := strconv.Atoi(elem[0])
334
335 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200336 log.Error().Err(err)
337 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200338 }
339
Akron740f3d72021-08-03 12:12:34 +0200340 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200341
342 var symbol rune
343
344 // Read rune
345 if utf8.RuneCountInString(elem[1]) == 1 {
346 symbol = []rune(elem[1])[0]
347
348 // Probably a MCS
349 } else if utf8.RuneCountInString(elem[1]) > 1 {
350 switch elem[1] {
351 case "@_EPSILON_SYMBOL_@":
352 {
353 EPSILON = number
354 continue
355 }
356 case "@_UNKNOWN_SYMBOL_@":
357 {
358 UNKNOWN = number
359 continue
360 }
361
362 case "@_IDENTITY_SYMBOL_@":
363 {
364 IDENTITY = number
365 continue
366 }
367 default:
Akron740f3d72021-08-03 12:12:34 +0200368 {
369 log.Error().Msg("MCS not supported: " + line)
370 os.Exit(1)
371 }
Akron8ef408b2021-08-02 22:11:04 +0200372 }
373
Akron740f3d72021-08-03 12:12:34 +0200374 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200375 line, err = r.ReadString('\n')
376 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200377 log.Error().Err(err)
378 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200379 }
380 if len(line) != 1 {
Akron740f3d72021-08-03 12:12:34 +0200381 log.Error().Msg("MCS not supported:" + line)
382 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200383 }
Akron740f3d72021-08-03 12:12:34 +0200384 symbol = rune(NEWLINE)
Akron8ef408b2021-08-02 22:11:04 +0200385 }
386
387 tok.sigma[symbol] = number
Akron740f3d72021-08-03 12:12:34 +0200388 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200389 }
390 }
391 }
392
393 return tok
394}
395
396// Implementation of Mizobuchi et al (2000), p.128
Akron740f3d72021-08-03 12:12:34 +0200397func (tok *Tokenizer) ToDoubleArray() *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200398
399 mark := 0
400 size := 0
401
402 // Create a mapping from s to t
Akron740f3d72021-08-03 12:12:34 +0200403 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200404
405 table[size] = &mapping{source: 1, target: 1}
406 size++
407
Akron740f3d72021-08-03 12:12:34 +0200408 // Allocate space for the outgoing symbol range
409 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200410
411 for mark < size {
412 s := table[mark].source // This is a state in Ms
413 t := table[mark].target // This is a state in Mt
414 mark++
Akron740f3d72021-08-03 12:12:34 +0200415
416 // Following the paper, here the state t can be remembered
417 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200418 A = A[:0]
419 tok.get_set(s, &A)
420
Akron740f3d72021-08-03 12:12:34 +0200421 // Set base to the first free slot in the double array
422 tok.setBase(t, tok.xCheck(A))
Akron8ef408b2021-08-02 22:11:04 +0200423
Akron740f3d72021-08-03 12:12:34 +0200424 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200425 for _, a := range A {
426
427 if a != FINAL {
Akron8ef408b2021-08-02 22:11:04 +0200428
Akron740f3d72021-08-03 12:12:34 +0200429 // Aka g(s, a)
430 s1 := tok.transitions[s][a].end
Akron8ef408b2021-08-02 22:11:04 +0200431
Akron740f3d72021-08-03 12:12:34 +0200432 // Store the transition
433 t1 := tok.getBase(t) + a
434 tok.setCheck(t1, t)
Akron8ef408b2021-08-02 22:11:04 +0200435
Akron740f3d72021-08-03 12:12:34 +0200436 // Check for representative states
Akron8ef408b2021-08-02 22:11:04 +0200437 r := in_table(s1, table, size)
Akron740f3d72021-08-03 12:12:34 +0200438
Akron8ef408b2021-08-02 22:11:04 +0200439 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200440 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200441 table[size] = &mapping{source: s1, target: t1}
442 size++
443 } else {
Akron740f3d72021-08-03 12:12:34 +0200444 // Overwrite with the representative state
445 tok.setBase(t1, -1*r)
Akron8ef408b2021-08-02 22:11:04 +0200446 }
447 } else {
Akron740f3d72021-08-03 12:12:34 +0200448 // Store a final transition
449 tok.setCheck(tok.getBase(t)+FINAL, t)
Akron8ef408b2021-08-02 22:11:04 +0200450 }
451 }
452 }
453
454 // Following Mizobuchi et al (2000) the size of the
455 // FSA should be stored in check(1).
Akron740f3d72021-08-03 12:12:34 +0200456 tok.setCheck(1, tok.maxSize+1)
457 tok.array = tok.array[:tok.maxSize+1]
Akron8ef408b2021-08-02 22:11:04 +0200458 return tok
459}
460
Akron740f3d72021-08-03 12:12:34 +0200461// Resize double array when necessary
Akron8ef408b2021-08-02 22:11:04 +0200462func (tok *Tokenizer) resize(l int) {
Akron740f3d72021-08-03 12:12:34 +0200463 // TODO:
464 // This is a bit too aggressive atm and should be calmed down.
Akron75ebe7f2021-08-03 10:34:10 +0200465 if len(tok.array) <= l {
Akron8ef408b2021-08-02 22:11:04 +0200466 tok.array = append(tok.array, make([]int, l)...)
467 }
468}
469
Akron740f3d72021-08-03 12:12:34 +0200470// Set base value in double array
471func (tok *Tokenizer) setBase(p int, v int) {
Akron8ef408b2021-08-02 22:11:04 +0200472 l := p*2 + 1
473 tok.resize(l)
Akron740f3d72021-08-03 12:12:34 +0200474 if tok.maxSize < l {
475 tok.maxSize = l
Akron8ef408b2021-08-02 22:11:04 +0200476 }
477 tok.array[p*2] = v
478}
479
Akron740f3d72021-08-03 12:12:34 +0200480// Get base value in double array
481func (tok *Tokenizer) getBase(p int) int {
Akron8ef408b2021-08-02 22:11:04 +0200482 if p*2 >= len(tok.array) {
483 return 0
484 }
485 return tok.array[p*2]
486}
487
Akron740f3d72021-08-03 12:12:34 +0200488// Set check value in double array
489func (tok *Tokenizer) setCheck(p int, v int) {
Akron8ef408b2021-08-02 22:11:04 +0200490 l := p*2 + 1
491 tok.resize(l)
Akron740f3d72021-08-03 12:12:34 +0200492 if tok.maxSize < l {
493 tok.maxSize = l
Akron8ef408b2021-08-02 22:11:04 +0200494 }
495 tok.array[(p*2)+1] = v
496}
497
Akron740f3d72021-08-03 12:12:34 +0200498// Get check value in double array
499func (tok *Tokenizer) getCheck(p int) int {
Akron8ef408b2021-08-02 22:11:04 +0200500 if (p*2)+1 >= len(tok.array) {
501 return 0
502 }
503 return tok.array[(p*2)+1]
504}
505
Akron740f3d72021-08-03 12:12:34 +0200506// Set size of double array
507func (tok *Tokenizer) setSize(p, v int) {
508 tok.setCheck(1, v)
509}
510
511// Get size of double array
512func (tok *Tokenizer) getSize(p int) int {
513 return tok.getCheck(1)
514}
515
Akron8ef408b2021-08-02 22:11:04 +0200516// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200517// exists and return this as a representative.
518// Currently iterates through the whole table
519// in a bruteforce manner.
Akron8ef408b2021-08-02 22:11:04 +0200520func in_table(s int, table []*mapping, size int) int {
521 for x := 0; x < size; x++ {
522 if table[x].source == s {
523 return table[x].target
524 }
525 }
526 return 0
527}
528
529// Set alphabet A to the list of all symbols
530// outgoing from s
531func (tok *Tokenizer) get_set(s int, A *[]int) {
Akron75ebe7f2021-08-03 10:34:10 +0200532 for a := range tok.transitions[s] {
Akron8ef408b2021-08-02 22:11:04 +0200533 *A = append(*A, a)
534 }
535}
536
537// Based on Mizobuchi et al (2000), p. 124
538// This iterates for every state through the complete double array
539// structure until it finds a gap that fits all outgoing transitions
540// of the state. This is extremely slow, but is only necessary in the
541// construction phase of the tokenizer.
Akron740f3d72021-08-03 12:12:34 +0200542func (tok *Tokenizer) xCheck(symbols []int) int {
543
544 // Start at the first entry of the double array list
Akron8ef408b2021-08-02 22:11:04 +0200545 base := 1
546
Akron8ef408b2021-08-02 22:11:04 +0200547OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200548
549 // Resize the array if necessary
Akron8ef408b2021-08-02 22:11:04 +0200550 tok.resize((base + FINAL) * 2)
551 for _, a := range symbols {
Akron740f3d72021-08-03 12:12:34 +0200552 if tok.getCheck(base+a) != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200553 base++
554 goto OVERLAP
555 }
556 }
Akron8ef408b2021-08-02 22:11:04 +0200557 return base
558}
559
Akron740f3d72021-08-03 12:12:34 +0200560// Match an input string against the double array
561// FSA.
562//
563// Based on Mizobuchi et al (2000), p. 129,
564// with additional support for IDENTITY, UNKNOWN
565// and EPSILON transitions.
566func (tok *Tokenizer) Match(input string) bool {
Akron465a0992021-08-03 11:28:48 +0200567 var a int
568 var tu int
569 var ok bool
570
Akron740f3d72021-08-03 12:12:34 +0200571 t := 1 // Initial state
572 chars := []rune(input)
573 i := 0
574
Akron49d27ee2021-08-03 11:58:13 +0200575 for i < len(chars) {
Akron465a0992021-08-03 11:28:48 +0200576 a, ok = tok.sigma[chars[i]]
Akron730a79c2021-08-03 11:05:29 +0200577
Akron740f3d72021-08-03 12:12:34 +0200578 // Support identity symbol if character is not in sigma
Akron730a79c2021-08-03 11:05:29 +0200579 if !ok && IDENTITY != -1 {
Akron740f3d72021-08-03 12:12:34 +0200580 if DEBUG {
581 fmt.Println("IDENTITY symbol", string(chars[i]), "->", IDENTITY)
582 }
Akron730a79c2021-08-03 11:05:29 +0200583 a = IDENTITY
Akron740f3d72021-08-03 12:12:34 +0200584 } else if DEBUG {
Akron49d27ee2021-08-03 11:58:13 +0200585 fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
Akron730a79c2021-08-03 11:05:29 +0200586 }
Akron465a0992021-08-03 11:28:48 +0200587 tu = t
Akron730a79c2021-08-03 11:05:29 +0200588 CHECK:
Akron740f3d72021-08-03 12:12:34 +0200589 t = tok.getBase(tu) + a
Akron730a79c2021-08-03 11:05:29 +0200590
Akron740f3d72021-08-03 12:12:34 +0200591 // Check if the transition is valid according to the double array
592 if t > tok.getCheck(1) || tok.getCheck(t) != tu {
593
594 if DEBUG {
595 fmt.Println("Match is not fine!", t, "and", tok.getCheck(t), "vs", tu)
Akron730a79c2021-08-03 11:05:29 +0200596 }
Akron740f3d72021-08-03 12:12:34 +0200597
598 if !ok && a == IDENTITY {
599 // Try again with unknown symbol, in case identity failed
600 if DEBUG {
601 fmt.Println("UNKNOWN symbol", string(chars[i]), "->", UNKNOWN)
602 }
603 a = UNKNOWN
604
605 } else if a != EPSILON {
606 // Try again with epsilon symbol, in case everything else failed
607 if DEBUG {
608 fmt.Println("EPSILON symbol", string(chars[i]), "->", EPSILON)
609 }
610 a = EPSILON
611 } else {
612 break
613 }
614 goto CHECK
615 } else if tok.getBase(t) < 0 {
Akron730a79c2021-08-03 11:05:29 +0200616 // Move to representative state
Akron740f3d72021-08-03 12:12:34 +0200617 t = -1 * tok.getBase(t)
Akron8ef408b2021-08-02 22:11:04 +0200618 }
Akron740f3d72021-08-03 12:12:34 +0200619
620 // Transition is fine
Akron49d27ee2021-08-03 11:58:13 +0200621 if a != EPSILON {
Akron740f3d72021-08-03 12:12:34 +0200622 // Character consumed
Akron49d27ee2021-08-03 11:58:13 +0200623 i++
624 }
Akron740f3d72021-08-03 12:12:34 +0200625 // TODO:
626 // Prevent endless epsilon loops!
Akron8ef408b2021-08-02 22:11:04 +0200627 }
628
Akron740f3d72021-08-03 12:12:34 +0200629 if i != len(chars) {
630 if DEBUG {
631 fmt.Println("Not at the end")
632 }
Akron8ef408b2021-08-02 22:11:04 +0200633 return false
634 }
635
Akron465a0992021-08-03 11:28:48 +0200636FINALCHECK:
Akron740f3d72021-08-03 12:12:34 +0200637
638 // Automaton is in a final state
639 if tok.getCheck(tok.getBase(t)+FINAL) == t {
Akron8ef408b2021-08-02 22:11:04 +0200640 return true
641 }
Akron465a0992021-08-03 11:28:48 +0200642
Akron740f3d72021-08-03 12:12:34 +0200643 // Check epsilon transitions until a final state is reached
Akron465a0992021-08-03 11:28:48 +0200644 tu = t
645 a = EPSILON
Akron740f3d72021-08-03 12:12:34 +0200646 t = tok.getBase(tu) + a
Akron465a0992021-08-03 11:28:48 +0200647
Akron740f3d72021-08-03 12:12:34 +0200648 // Epsilon transition failed
649 if t > tok.getCheck(1) || tok.getCheck(t) != tu {
650 if DEBUG {
651 fmt.Println("Match is not fine!", t, "and", tok.getCheck(t), "vs", tu)
652 }
Akron465a0992021-08-03 11:28:48 +0200653 return false
Akron740f3d72021-08-03 12:12:34 +0200654
655 } else if tok.getBase(t) < 0 {
Akron465a0992021-08-03 11:28:48 +0200656 // Move to representative state
Akron740f3d72021-08-03 12:12:34 +0200657 t = -1 * tok.getBase(t)
Akron465a0992021-08-03 11:28:48 +0200658 }
Akron740f3d72021-08-03 12:12:34 +0200659
Akron465a0992021-08-03 11:28:48 +0200660 goto FINALCHECK
Akron8ef408b2021-08-02 22:11:04 +0200661}