blob: b6e877bf82f30d5cde7a6b7fb16efe0515164287 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akron740f3d72021-08-03 12:12:34 +02009// TODO:
10// - replace maxSize with the check value
11// - Strip first state and make everything start with 0!
12// - Serialize!
13// - Split Tokenizer and DATokenizer
14
Akron8ef408b2021-08-02 22:11:04 +020015import (
16 "bufio"
17 "compress/gzip"
18 "fmt"
19 "io"
20 "os"
Akronc9d84a62021-08-03 15:56:03 +020021 "sort"
Akron8ef408b2021-08-02 22:11:04 +020022 "strconv"
23 "strings"
24 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020025
26 "github.com/rs/zerolog/log"
Akron8ef408b2021-08-02 22:11:04 +020027)
28
29const (
Akron75ebe7f2021-08-03 10:34:10 +020030 PROPS = 1
31 SIGMA = 2
32 STATES = 3
33 NONE = 4
34 NEWLINE = '\u000a'
Akron740f3d72021-08-03 12:12:34 +020035 DEBUG = false
Akron8ef408b2021-08-02 22:11:04 +020036)
37
38// Special symbols in sigma
Akron730a79c2021-08-03 11:05:29 +020039var EPSILON = -1
40var UNKNOWN = -1
41var IDENTITY = -1
42var FINAL = -1
Akron8ef408b2021-08-02 22:11:04 +020043
44type mapping struct {
45 source int
46 target int
47}
48
49type edge struct {
Akron740f3d72021-08-03 12:12:34 +020050 inSym int
51 outSym int
52 end int
Akron8ef408b2021-08-02 22:11:04 +020053}
54
55type Tokenizer struct {
56 sigma map[rune]int
Akron740f3d72021-08-03 12:12:34 +020057 sigmaRev map[int]rune
58 arcCount int
59 stateCount int
60 sigmaCount int
61 maxSize int
Akron8ef408b2021-08-02 22:11:04 +020062 array []int
63 transitions []map[int]*edge
64}
65
Akron740f3d72021-08-03 12:12:34 +020066func ParseFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +020067 f, err := os.Open(file)
68 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020069 log.Error().Err(err)
70 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020071 }
72 defer f.Close()
73
74 gz, err := gzip.NewReader(f)
75 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020076 log.Error().Err(err)
77 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020078 }
79 defer gz.Close()
80
Akron740f3d72021-08-03 12:12:34 +020081 return Parse(gz)
Akron8ef408b2021-08-02 22:11:04 +020082}
83
Akron740f3d72021-08-03 12:12:34 +020084func Parse(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +020085 r := bufio.NewReader(ior)
86
87 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +020088 sigma: make(map[rune]int),
89 sigmaRev: make(map[int]rune),
Akron8ef408b2021-08-02 22:11:04 +020090 }
91
Akron740f3d72021-08-03 12:12:34 +020092 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +020093
94 mode := 0
95 var elem []string
96 var elemint [5]int
97
98 for {
99 line, err := r.ReadString('\n')
100 if err != nil {
101 if err == io.EOF {
102 break
103 }
Akron740f3d72021-08-03 12:12:34 +0200104 log.Error().Err(err)
105 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200106 }
107 if strings.HasPrefix(line, "##foma-net") {
108 continue
109 }
110 if strings.HasPrefix(line, "##props##") {
111 mode = PROPS
112 continue
113 }
114 if strings.HasPrefix(line, "##states##") {
115 mode = STATES
116
117 // Adds a final transition symbol to sigma
118 // written as '#' in Mizobuchi et al (2000)
Akron740f3d72021-08-03 12:12:34 +0200119 tok.sigmaCount++
120 FINAL = tok.sigmaCount
Akron8ef408b2021-08-02 22:11:04 +0200121 continue
122 }
123 if strings.HasPrefix(line, "##sigma##") {
124 mode = SIGMA
125 continue
126 }
127 if strings.HasPrefix(line, "##end##") {
128 mode = NONE
129 continue
130 }
131
132 switch mode {
133 case PROPS:
134 {
135 elem = strings.Split(line, " ")
136 /*
137 fmt.Println("arity: " + elem[0])
138 fmt.Println("arccount: " + elem[1])
139 fmt.Println("statecount: " + elem[2])
140 fmt.Println("linecount: " + elem[3])
141 fmt.Println("finalcount: " + elem[4])
142 fmt.Println("pathcount: " + elem[5])
143 fmt.Println("is_deterministic: " + elem[6])
144 fmt.Println("is_pruned: " + elem[7])
145 fmt.Println("is_minimized: " + elem[8])
146 fmt.Println("is_epsilon_free: " + elem[9])
147 fmt.Println("is_loop_free: " + elem[10])
148 fmt.Println("extras: " + elem[11])
149 fmt.Println("name: " + elem[12])
150 */
151 if elem[6] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200152 log.Error().Msg("The FST needs to be deterministic")
153 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200154 }
155 if elem[9] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200156 log.Error().Msg("The FST needs to be epsilon free")
157 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200158 }
159
160 elemint[0], err = strconv.Atoi(elem[1])
161 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200162 log.Error().Msg("Can't read arccount")
163 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200164 }
Akron740f3d72021-08-03 12:12:34 +0200165 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200166
167 // States start at 1 in Mizobuchi et al (2000),
168 // as the state 0 is associated with a fail.
169 // Initialize states and transitions
170 elemint[0], err = strconv.Atoi(elem[2])
171 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200172 log.Error().Msg("Can't read statecount")
173 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200174 }
Akron740f3d72021-08-03 12:12:34 +0200175 tok.stateCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200176 tok.transitions = make([]map[int]*edge, elemint[0]+1)
177 continue
178 }
179 case STATES:
180 {
181 elem = strings.Split(line[0:len(line)-1], " ")
182 if elem[0] == "-1" {
183 continue
184 }
185 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200186 if err != nil {
187 break
188 }
Akron8ef408b2021-08-02 22:11:04 +0200189
190 if len(elem) > 1 {
191 elemint[1], err = strconv.Atoi(elem[1])
192 if err != nil {
193 break
194 }
195 if len(elem) > 2 {
196 elemint[2], err = strconv.Atoi(elem[2])
197 if err != nil {
198 break
199 }
200 if len(elem) > 3 {
201 elemint[3], err = strconv.Atoi(elem[3])
202 if err != nil {
203 break
204 }
205 if len(elem) > 4 {
206 elemint[4], err = strconv.Atoi(elem[4])
207 if err != nil {
208 break
209 }
210 }
211 }
212 }
213 }
214
215 switch len(elem) {
216 case 5:
217 {
Akron740f3d72021-08-03 12:12:34 +0200218 state = elemint[0]
219 inSym = elemint[1]
220 outSym = elemint[2]
221 end = elemint[3]
222 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200223 }
224 case 4:
225 {
226 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200227 state = elemint[0]
228 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200229 } else {
Akron740f3d72021-08-03 12:12:34 +0200230 state = elemint[0]
231 inSym = elemint[1]
232 end = elemint[2]
233 final = elemint[3]
234 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200235 }
236 }
237 case 3:
238 {
Akron740f3d72021-08-03 12:12:34 +0200239 inSym = elemint[0]
240 outSym = elemint[1]
241 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200242 }
243 case 2:
244 {
Akron740f3d72021-08-03 12:12:34 +0200245 inSym = elemint[0]
246 end = elemint[1]
247 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200248 }
249 }
250
Akron8ef408b2021-08-02 22:11:04 +0200251 // While the states in foma start with 0, the states in the
252 // Mizobuchi FSA start with one - so we increase every state by 1.
253
Akron740f3d72021-08-03 12:12:34 +0200254 if inSym != outSym {
255
256 // Allow any epsilon to become a newline
257 if !(inSym == EPSILON && tok.sigmaRev[outSym] == NEWLINE) &&
258
259 // Allow any whitespace to be ignored
260 !(inSym != EPSILON && outSym == EPSILON) &&
261
262 // Allow any whitespace to become a new line
263 !(tok.sigmaRev[outSym] == NEWLINE) {
264
265 log.Error().Msg(
266 "Unsupported transition: " +
267 strconv.Itoa(state) +
268 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200269 " (" +
Akron740f3d72021-08-03 12:12:34 +0200270 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200271 ":" +
Akron740f3d72021-08-03 12:12:34 +0200272 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200273 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200274 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200275 ":" +
Akron740f3d72021-08-03 12:12:34 +0200276 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200277 ")")
Akron740f3d72021-08-03 12:12:34 +0200278 os.Exit(1)
Akron75ebe7f2021-08-03 10:34:10 +0200279 }
Akron8ef408b2021-08-02 22:11:04 +0200280 }
281
Akron740f3d72021-08-03 12:12:34 +0200282 // This collects all edges until arrstate changes
283
Akron8ef408b2021-08-02 22:11:04 +0200284 // TODO:
285 // if arrin == EPSILON && arrout == TOKENEND, mark state as newline
286 // if the next transition is the same, remove TOKENEND and add SENTENCEEND
287 // This requires to remove the transition alltogether and marks the state instead.
288
289 // TODO:
290 // if arrout == EPSILON, mark the transition as NOTOKEN
291
292 targetObj := &edge{
Akron740f3d72021-08-03 12:12:34 +0200293 inSym: inSym,
294 outSym: outSym,
295 end: end + 1,
Akron8ef408b2021-08-02 22:11:04 +0200296 }
297
Akron740f3d72021-08-03 12:12:34 +0200298 // Initialize outgoing states
299 if tok.transitions[state+1] == nil {
300 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200301 }
302
Akron740f3d72021-08-03 12:12:34 +0200303 // Ignore transitions with invalid symbols
304 if inSym >= 0 {
305 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200306 }
Akron8ef408b2021-08-02 22:11:04 +0200307
Akron740f3d72021-08-03 12:12:34 +0200308 // Add final transition
309 if final == 1 {
310 tok.transitions[state+1][FINAL] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200311 }
312
Akron740f3d72021-08-03 12:12:34 +0200313 if DEBUG {
314 fmt.Println("Add",
315 state+1, "->", end+1,
316 "(",
317 inSym,
318 ":",
319 outSym,
320 ") (",
321 string(tok.sigmaRev[inSym]),
322 ":",
323 string(tok.sigmaRev[outSym]),
324 ")")
325 }
Akron75ebe7f2021-08-03 10:34:10 +0200326
Akron8ef408b2021-08-02 22:11:04 +0200327 continue
328 }
329 case SIGMA:
330 {
331 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
332
333 // Turn string into sigma id
334 number, err := strconv.Atoi(elem[0])
335
336 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200337 log.Error().Err(err)
338 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200339 }
340
Akron740f3d72021-08-03 12:12:34 +0200341 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200342
343 var symbol rune
344
345 // Read rune
346 if utf8.RuneCountInString(elem[1]) == 1 {
347 symbol = []rune(elem[1])[0]
348
349 // Probably a MCS
350 } else if utf8.RuneCountInString(elem[1]) > 1 {
351 switch elem[1] {
352 case "@_EPSILON_SYMBOL_@":
353 {
354 EPSILON = number
355 continue
356 }
357 case "@_UNKNOWN_SYMBOL_@":
358 {
359 UNKNOWN = number
360 continue
361 }
362
363 case "@_IDENTITY_SYMBOL_@":
364 {
365 IDENTITY = number
366 continue
367 }
368 default:
Akron740f3d72021-08-03 12:12:34 +0200369 {
370 log.Error().Msg("MCS not supported: " + line)
371 os.Exit(1)
372 }
Akron8ef408b2021-08-02 22:11:04 +0200373 }
374
Akron740f3d72021-08-03 12:12:34 +0200375 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200376 line, err = r.ReadString('\n')
377 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200378 log.Error().Err(err)
379 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200380 }
381 if len(line) != 1 {
Akron740f3d72021-08-03 12:12:34 +0200382 log.Error().Msg("MCS not supported:" + line)
383 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200384 }
Akron740f3d72021-08-03 12:12:34 +0200385 symbol = rune(NEWLINE)
Akron8ef408b2021-08-02 22:11:04 +0200386 }
387
388 tok.sigma[symbol] = number
Akron740f3d72021-08-03 12:12:34 +0200389 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200390 }
391 }
392 }
393
394 return tok
395}
396
397// Implementation of Mizobuchi et al (2000), p.128
Akron740f3d72021-08-03 12:12:34 +0200398func (tok *Tokenizer) ToDoubleArray() *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200399
400 mark := 0
401 size := 0
402
403 // Create a mapping from s to t
Akron740f3d72021-08-03 12:12:34 +0200404 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200405
406 table[size] = &mapping{source: 1, target: 1}
407 size++
408
Akron740f3d72021-08-03 12:12:34 +0200409 // Allocate space for the outgoing symbol range
410 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200411
412 for mark < size {
413 s := table[mark].source // This is a state in Ms
414 t := table[mark].target // This is a state in Mt
415 mark++
Akron740f3d72021-08-03 12:12:34 +0200416
417 // Following the paper, here the state t can be remembered
418 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200419 A = A[:0]
420 tok.get_set(s, &A)
421
Akron740f3d72021-08-03 12:12:34 +0200422 // Set base to the first free slot in the double array
423 tok.setBase(t, tok.xCheck(A))
Akron8ef408b2021-08-02 22:11:04 +0200424
Akron740f3d72021-08-03 12:12:34 +0200425 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200426 for _, a := range A {
427
428 if a != FINAL {
Akron8ef408b2021-08-02 22:11:04 +0200429
Akron740f3d72021-08-03 12:12:34 +0200430 // Aka g(s, a)
431 s1 := tok.transitions[s][a].end
Akron8ef408b2021-08-02 22:11:04 +0200432
Akron740f3d72021-08-03 12:12:34 +0200433 // Store the transition
434 t1 := tok.getBase(t) + a
435 tok.setCheck(t1, t)
Akron8ef408b2021-08-02 22:11:04 +0200436
Akron740f3d72021-08-03 12:12:34 +0200437 // Check for representative states
Akron8ef408b2021-08-02 22:11:04 +0200438 r := in_table(s1, table, size)
Akron740f3d72021-08-03 12:12:34 +0200439
Akron8ef408b2021-08-02 22:11:04 +0200440 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200441 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200442 table[size] = &mapping{source: s1, target: t1}
443 size++
444 } else {
Akron740f3d72021-08-03 12:12:34 +0200445 // Overwrite with the representative state
446 tok.setBase(t1, -1*r)
Akron8ef408b2021-08-02 22:11:04 +0200447 }
448 } else {
Akron740f3d72021-08-03 12:12:34 +0200449 // Store a final transition
450 tok.setCheck(tok.getBase(t)+FINAL, t)
Akron8ef408b2021-08-02 22:11:04 +0200451 }
452 }
453 }
454
455 // Following Mizobuchi et al (2000) the size of the
456 // FSA should be stored in check(1).
Akron740f3d72021-08-03 12:12:34 +0200457 tok.setCheck(1, tok.maxSize+1)
458 tok.array = tok.array[:tok.maxSize+1]
Akron8ef408b2021-08-02 22:11:04 +0200459 return tok
460}
461
Akron740f3d72021-08-03 12:12:34 +0200462// Resize double array when necessary
Akron8ef408b2021-08-02 22:11:04 +0200463func (tok *Tokenizer) resize(l int) {
Akron740f3d72021-08-03 12:12:34 +0200464 // TODO:
465 // This is a bit too aggressive atm and should be calmed down.
Akron75ebe7f2021-08-03 10:34:10 +0200466 if len(tok.array) <= l {
Akron8ef408b2021-08-02 22:11:04 +0200467 tok.array = append(tok.array, make([]int, l)...)
468 }
469}
470
Akron740f3d72021-08-03 12:12:34 +0200471// Set base value in double array
472func (tok *Tokenizer) setBase(p int, v int) {
Akron8ef408b2021-08-02 22:11:04 +0200473 l := p*2 + 1
474 tok.resize(l)
Akron740f3d72021-08-03 12:12:34 +0200475 if tok.maxSize < l {
476 tok.maxSize = l
Akron8ef408b2021-08-02 22:11:04 +0200477 }
478 tok.array[p*2] = v
479}
480
Akron740f3d72021-08-03 12:12:34 +0200481// Get base value in double array
482func (tok *Tokenizer) getBase(p int) int {
Akron8ef408b2021-08-02 22:11:04 +0200483 if p*2 >= len(tok.array) {
484 return 0
485 }
486 return tok.array[p*2]
487}
488
Akron740f3d72021-08-03 12:12:34 +0200489// Set check value in double array
490func (tok *Tokenizer) setCheck(p int, v int) {
Akron8ef408b2021-08-02 22:11:04 +0200491 l := p*2 + 1
492 tok.resize(l)
Akron740f3d72021-08-03 12:12:34 +0200493 if tok.maxSize < l {
494 tok.maxSize = l
Akron8ef408b2021-08-02 22:11:04 +0200495 }
496 tok.array[(p*2)+1] = v
497}
498
Akron740f3d72021-08-03 12:12:34 +0200499// Get check value in double array
500func (tok *Tokenizer) getCheck(p int) int {
Akron8ef408b2021-08-02 22:11:04 +0200501 if (p*2)+1 >= len(tok.array) {
502 return 0
503 }
504 return tok.array[(p*2)+1]
505}
506
Akron740f3d72021-08-03 12:12:34 +0200507// Set size of double array
508func (tok *Tokenizer) setSize(p, v int) {
509 tok.setCheck(1, v)
510}
511
512// Get size of double array
513func (tok *Tokenizer) getSize(p int) int {
514 return tok.getCheck(1)
515}
516
Akron8ef408b2021-08-02 22:11:04 +0200517// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200518// exists and return this as a representative.
519// Currently iterates through the whole table
520// in a bruteforce manner.
Akron8ef408b2021-08-02 22:11:04 +0200521func in_table(s int, table []*mapping, size int) int {
522 for x := 0; x < size; x++ {
523 if table[x].source == s {
524 return table[x].target
525 }
526 }
527 return 0
528}
529
530// Set alphabet A to the list of all symbols
531// outgoing from s
532func (tok *Tokenizer) get_set(s int, A *[]int) {
Akron75ebe7f2021-08-03 10:34:10 +0200533 for a := range tok.transitions[s] {
Akron8ef408b2021-08-02 22:11:04 +0200534 *A = append(*A, a)
535 }
Akronc9d84a62021-08-03 15:56:03 +0200536
537 // Not required, but simplifies bug hunting
538 sort.Ints(*A)
Akron8ef408b2021-08-02 22:11:04 +0200539}
540
541// Based on Mizobuchi et al (2000), p. 124
542// This iterates for every state through the complete double array
543// structure until it finds a gap that fits all outgoing transitions
544// of the state. This is extremely slow, but is only necessary in the
545// construction phase of the tokenizer.
Akron740f3d72021-08-03 12:12:34 +0200546func (tok *Tokenizer) xCheck(symbols []int) int {
547
548 // Start at the first entry of the double array list
Akron8ef408b2021-08-02 22:11:04 +0200549 base := 1
550
Akron8ef408b2021-08-02 22:11:04 +0200551OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200552
553 // Resize the array if necessary
Akron8ef408b2021-08-02 22:11:04 +0200554 tok.resize((base + FINAL) * 2)
555 for _, a := range symbols {
Akron740f3d72021-08-03 12:12:34 +0200556 if tok.getCheck(base+a) != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200557 base++
558 goto OVERLAP
559 }
560 }
Akron8ef408b2021-08-02 22:11:04 +0200561 return base
562}
563
Akron740f3d72021-08-03 12:12:34 +0200564// Match an input string against the double array
565// FSA.
566//
567// Based on Mizobuchi et al (2000), p. 129,
568// with additional support for IDENTITY, UNKNOWN
569// and EPSILON transitions.
570func (tok *Tokenizer) Match(input string) bool {
Akron465a0992021-08-03 11:28:48 +0200571 var a int
572 var tu int
573 var ok bool
574
Akron740f3d72021-08-03 12:12:34 +0200575 t := 1 // Initial state
576 chars := []rune(input)
577 i := 0
578
Akron49d27ee2021-08-03 11:58:13 +0200579 for i < len(chars) {
Akron465a0992021-08-03 11:28:48 +0200580 a, ok = tok.sigma[chars[i]]
Akron730a79c2021-08-03 11:05:29 +0200581
Akron740f3d72021-08-03 12:12:34 +0200582 // Support identity symbol if character is not in sigma
Akron730a79c2021-08-03 11:05:29 +0200583 if !ok && IDENTITY != -1 {
Akron740f3d72021-08-03 12:12:34 +0200584 if DEBUG {
585 fmt.Println("IDENTITY symbol", string(chars[i]), "->", IDENTITY)
586 }
Akron730a79c2021-08-03 11:05:29 +0200587 a = IDENTITY
Akron740f3d72021-08-03 12:12:34 +0200588 } else if DEBUG {
Akron49d27ee2021-08-03 11:58:13 +0200589 fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
Akron730a79c2021-08-03 11:05:29 +0200590 }
Akron465a0992021-08-03 11:28:48 +0200591 tu = t
Akron730a79c2021-08-03 11:05:29 +0200592 CHECK:
Akron740f3d72021-08-03 12:12:34 +0200593 t = tok.getBase(tu) + a
Akron730a79c2021-08-03 11:05:29 +0200594
Akron740f3d72021-08-03 12:12:34 +0200595 // Check if the transition is valid according to the double array
596 if t > tok.getCheck(1) || tok.getCheck(t) != tu {
597
598 if DEBUG {
599 fmt.Println("Match is not fine!", t, "and", tok.getCheck(t), "vs", tu)
Akron730a79c2021-08-03 11:05:29 +0200600 }
Akron740f3d72021-08-03 12:12:34 +0200601
602 if !ok && a == IDENTITY {
603 // Try again with unknown symbol, in case identity failed
604 if DEBUG {
605 fmt.Println("UNKNOWN symbol", string(chars[i]), "->", UNKNOWN)
606 }
607 a = UNKNOWN
608
609 } else if a != EPSILON {
610 // Try again with epsilon symbol, in case everything else failed
611 if DEBUG {
612 fmt.Println("EPSILON symbol", string(chars[i]), "->", EPSILON)
613 }
614 a = EPSILON
615 } else {
616 break
617 }
618 goto CHECK
619 } else if tok.getBase(t) < 0 {
Akron730a79c2021-08-03 11:05:29 +0200620 // Move to representative state
Akron740f3d72021-08-03 12:12:34 +0200621 t = -1 * tok.getBase(t)
Akron8ef408b2021-08-02 22:11:04 +0200622 }
Akron740f3d72021-08-03 12:12:34 +0200623
624 // Transition is fine
Akron49d27ee2021-08-03 11:58:13 +0200625 if a != EPSILON {
Akron740f3d72021-08-03 12:12:34 +0200626 // Character consumed
Akron49d27ee2021-08-03 11:58:13 +0200627 i++
628 }
Akron740f3d72021-08-03 12:12:34 +0200629 // TODO:
630 // Prevent endless epsilon loops!
Akron8ef408b2021-08-02 22:11:04 +0200631 }
632
Akron740f3d72021-08-03 12:12:34 +0200633 if i != len(chars) {
634 if DEBUG {
635 fmt.Println("Not at the end")
636 }
Akron8ef408b2021-08-02 22:11:04 +0200637 return false
638 }
639
Akron465a0992021-08-03 11:28:48 +0200640FINALCHECK:
Akron740f3d72021-08-03 12:12:34 +0200641
642 // Automaton is in a final state
643 if tok.getCheck(tok.getBase(t)+FINAL) == t {
Akron8ef408b2021-08-02 22:11:04 +0200644 return true
645 }
Akron465a0992021-08-03 11:28:48 +0200646
Akron740f3d72021-08-03 12:12:34 +0200647 // Check epsilon transitions until a final state is reached
Akron465a0992021-08-03 11:28:48 +0200648 tu = t
649 a = EPSILON
Akron740f3d72021-08-03 12:12:34 +0200650 t = tok.getBase(tu) + a
Akron465a0992021-08-03 11:28:48 +0200651
Akron740f3d72021-08-03 12:12:34 +0200652 // Epsilon transition failed
653 if t > tok.getCheck(1) || tok.getCheck(t) != tu {
654 if DEBUG {
655 fmt.Println("Match is not fine!", t, "and", tok.getCheck(t), "vs", tu)
656 }
Akron465a0992021-08-03 11:28:48 +0200657 return false
Akron740f3d72021-08-03 12:12:34 +0200658
659 } else if tok.getBase(t) < 0 {
Akron465a0992021-08-03 11:28:48 +0200660 // Move to representative state
Akron740f3d72021-08-03 12:12:34 +0200661 t = -1 * tok.getBase(t)
Akron465a0992021-08-03 11:28:48 +0200662 }
Akron740f3d72021-08-03 12:12:34 +0200663
Akron465a0992021-08-03 11:28:48 +0200664 goto FINALCHECK
Akron8ef408b2021-08-02 22:11:04 +0200665}