blob: 9a8b7925523fbeb63d41fcc862eb7b38c82c0d8f [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akron740f3d72021-08-03 12:12:34 +02009// TODO:
10// - replace maxSize with the check value
11// - Strip first state and make everything start with 0!
12// - Serialize!
13// - Split Tokenizer and DATokenizer
14
Akron8ef408b2021-08-02 22:11:04 +020015import (
16 "bufio"
17 "compress/gzip"
18 "fmt"
19 "io"
20 "os"
Akronc9d84a62021-08-03 15:56:03 +020021 "sort"
Akron8ef408b2021-08-02 22:11:04 +020022 "strconv"
23 "strings"
24 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020025
26 "github.com/rs/zerolog/log"
Akron8ef408b2021-08-02 22:11:04 +020027)
28
29const (
Akron75ebe7f2021-08-03 10:34:10 +020030 PROPS = 1
31 SIGMA = 2
32 STATES = 3
33 NONE = 4
34 NEWLINE = '\u000a'
Akron740f3d72021-08-03 12:12:34 +020035 DEBUG = false
Akron8ef408b2021-08-02 22:11:04 +020036)
37
38// Special symbols in sigma
Akron730a79c2021-08-03 11:05:29 +020039var EPSILON = -1
40var UNKNOWN = -1
41var IDENTITY = -1
42var FINAL = -1
Akron8ef408b2021-08-02 22:11:04 +020043
44type mapping struct {
45 source int
46 target int
47}
48
49type edge struct {
Akron740f3d72021-08-03 12:12:34 +020050 inSym int
51 outSym int
52 end int
Akron8ef408b2021-08-02 22:11:04 +020053}
54
55type Tokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020056 // sigma map[rune]int
Akron740f3d72021-08-03 12:12:34 +020057 sigmaRev map[int]rune
58 arcCount int
59 stateCount int
60 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020061 transitions []map[int]*edge
62}
63
Akronf2120ca2021-08-03 16:26:41 +020064type DaTokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020065 // sigmaRev map[int]rune
Akron773b1ef2021-08-03 17:37:20 +020066 sigma map[rune]int
67 maxSize int
68 loadLevel float64
69 array []int
Akronf2120ca2021-08-03 16:26:41 +020070}
71
Akron740f3d72021-08-03 12:12:34 +020072func ParseFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +020073 f, err := os.Open(file)
74 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020075 log.Error().Err(err)
76 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020077 }
78 defer f.Close()
79
80 gz, err := gzip.NewReader(f)
81 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020082 log.Error().Err(err)
83 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020084 }
85 defer gz.Close()
86
Akron740f3d72021-08-03 12:12:34 +020087 return Parse(gz)
Akron8ef408b2021-08-02 22:11:04 +020088}
89
Akron740f3d72021-08-03 12:12:34 +020090func Parse(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +020091 r := bufio.NewReader(ior)
92
93 tok := &Tokenizer{
Akronf2120ca2021-08-03 16:26:41 +020094 // sigma: make(map[rune]int),
Akron740f3d72021-08-03 12:12:34 +020095 sigmaRev: make(map[int]rune),
Akron8ef408b2021-08-02 22:11:04 +020096 }
97
Akron740f3d72021-08-03 12:12:34 +020098 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +020099
100 mode := 0
101 var elem []string
102 var elemint [5]int
103
104 for {
105 line, err := r.ReadString('\n')
106 if err != nil {
107 if err == io.EOF {
108 break
109 }
Akron740f3d72021-08-03 12:12:34 +0200110 log.Error().Err(err)
111 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200112 }
113 if strings.HasPrefix(line, "##foma-net") {
114 continue
115 }
116 if strings.HasPrefix(line, "##props##") {
117 mode = PROPS
118 continue
119 }
120 if strings.HasPrefix(line, "##states##") {
121 mode = STATES
122
123 // Adds a final transition symbol to sigma
124 // written as '#' in Mizobuchi et al (2000)
Akron740f3d72021-08-03 12:12:34 +0200125 tok.sigmaCount++
126 FINAL = tok.sigmaCount
Akron8ef408b2021-08-02 22:11:04 +0200127 continue
128 }
129 if strings.HasPrefix(line, "##sigma##") {
130 mode = SIGMA
131 continue
132 }
133 if strings.HasPrefix(line, "##end##") {
134 mode = NONE
135 continue
136 }
137
138 switch mode {
139 case PROPS:
140 {
141 elem = strings.Split(line, " ")
142 /*
143 fmt.Println("arity: " + elem[0])
144 fmt.Println("arccount: " + elem[1])
145 fmt.Println("statecount: " + elem[2])
146 fmt.Println("linecount: " + elem[3])
147 fmt.Println("finalcount: " + elem[4])
148 fmt.Println("pathcount: " + elem[5])
149 fmt.Println("is_deterministic: " + elem[6])
150 fmt.Println("is_pruned: " + elem[7])
151 fmt.Println("is_minimized: " + elem[8])
152 fmt.Println("is_epsilon_free: " + elem[9])
153 fmt.Println("is_loop_free: " + elem[10])
154 fmt.Println("extras: " + elem[11])
155 fmt.Println("name: " + elem[12])
156 */
157 if elem[6] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200158 log.Error().Msg("The FST needs to be deterministic")
159 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200160 }
161 if elem[9] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200162 log.Error().Msg("The FST needs to be epsilon free")
163 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200164 }
165
166 elemint[0], err = strconv.Atoi(elem[1])
167 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200168 log.Error().Msg("Can't read arccount")
169 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200170 }
Akron740f3d72021-08-03 12:12:34 +0200171 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200172
173 // States start at 1 in Mizobuchi et al (2000),
174 // as the state 0 is associated with a fail.
175 // Initialize states and transitions
176 elemint[0], err = strconv.Atoi(elem[2])
177 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200178 log.Error().Msg("Can't read statecount")
179 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200180 }
Akron740f3d72021-08-03 12:12:34 +0200181 tok.stateCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200182 tok.transitions = make([]map[int]*edge, elemint[0]+1)
183 continue
184 }
185 case STATES:
186 {
187 elem = strings.Split(line[0:len(line)-1], " ")
188 if elem[0] == "-1" {
189 continue
190 }
191 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200192 if err != nil {
193 break
194 }
Akron8ef408b2021-08-02 22:11:04 +0200195
196 if len(elem) > 1 {
197 elemint[1], err = strconv.Atoi(elem[1])
198 if err != nil {
199 break
200 }
201 if len(elem) > 2 {
202 elemint[2], err = strconv.Atoi(elem[2])
203 if err != nil {
204 break
205 }
206 if len(elem) > 3 {
207 elemint[3], err = strconv.Atoi(elem[3])
208 if err != nil {
209 break
210 }
211 if len(elem) > 4 {
212 elemint[4], err = strconv.Atoi(elem[4])
213 if err != nil {
214 break
215 }
216 }
217 }
218 }
219 }
220
221 switch len(elem) {
222 case 5:
223 {
Akron740f3d72021-08-03 12:12:34 +0200224 state = elemint[0]
225 inSym = elemint[1]
226 outSym = elemint[2]
227 end = elemint[3]
228 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200229 }
230 case 4:
231 {
232 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200233 state = elemint[0]
234 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200235 } else {
Akron740f3d72021-08-03 12:12:34 +0200236 state = elemint[0]
237 inSym = elemint[1]
238 end = elemint[2]
239 final = elemint[3]
240 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200241 }
242 }
243 case 3:
244 {
Akron740f3d72021-08-03 12:12:34 +0200245 inSym = elemint[0]
246 outSym = elemint[1]
247 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200248 }
249 case 2:
250 {
Akron740f3d72021-08-03 12:12:34 +0200251 inSym = elemint[0]
252 end = elemint[1]
253 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200254 }
255 }
256
Akron8ef408b2021-08-02 22:11:04 +0200257 // While the states in foma start with 0, the states in the
258 // Mizobuchi FSA start with one - so we increase every state by 1.
259
Akron740f3d72021-08-03 12:12:34 +0200260 if inSym != outSym {
261
262 // Allow any epsilon to become a newline
263 if !(inSym == EPSILON && tok.sigmaRev[outSym] == NEWLINE) &&
264
265 // Allow any whitespace to be ignored
266 !(inSym != EPSILON && outSym == EPSILON) &&
267
268 // Allow any whitespace to become a new line
269 !(tok.sigmaRev[outSym] == NEWLINE) {
270
271 log.Error().Msg(
272 "Unsupported transition: " +
273 strconv.Itoa(state) +
274 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200275 " (" +
Akron740f3d72021-08-03 12:12:34 +0200276 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200277 ":" +
Akron740f3d72021-08-03 12:12:34 +0200278 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200279 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200280 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200281 ":" +
Akron740f3d72021-08-03 12:12:34 +0200282 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200283 ")")
Akron740f3d72021-08-03 12:12:34 +0200284 os.Exit(1)
Akron75ebe7f2021-08-03 10:34:10 +0200285 }
Akron8ef408b2021-08-02 22:11:04 +0200286 }
287
Akron740f3d72021-08-03 12:12:34 +0200288 // This collects all edges until arrstate changes
289
Akron8ef408b2021-08-02 22:11:04 +0200290 // TODO:
291 // if arrin == EPSILON && arrout == TOKENEND, mark state as newline
292 // if the next transition is the same, remove TOKENEND and add SENTENCEEND
293 // This requires to remove the transition alltogether and marks the state instead.
294
295 // TODO:
296 // if arrout == EPSILON, mark the transition as NOTOKEN
297
298 targetObj := &edge{
Akron740f3d72021-08-03 12:12:34 +0200299 inSym: inSym,
300 outSym: outSym,
301 end: end + 1,
Akron8ef408b2021-08-02 22:11:04 +0200302 }
303
Akron740f3d72021-08-03 12:12:34 +0200304 // Initialize outgoing states
305 if tok.transitions[state+1] == nil {
306 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200307 }
308
Akron740f3d72021-08-03 12:12:34 +0200309 // Ignore transitions with invalid symbols
310 if inSym >= 0 {
311 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200312 }
Akron8ef408b2021-08-02 22:11:04 +0200313
Akron740f3d72021-08-03 12:12:34 +0200314 // Add final transition
315 if final == 1 {
316 tok.transitions[state+1][FINAL] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200317 }
318
Akron740f3d72021-08-03 12:12:34 +0200319 if DEBUG {
320 fmt.Println("Add",
321 state+1, "->", end+1,
322 "(",
323 inSym,
324 ":",
325 outSym,
326 ") (",
327 string(tok.sigmaRev[inSym]),
328 ":",
329 string(tok.sigmaRev[outSym]),
330 ")")
331 }
Akron75ebe7f2021-08-03 10:34:10 +0200332
Akron8ef408b2021-08-02 22:11:04 +0200333 continue
334 }
335 case SIGMA:
336 {
337 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
338
339 // Turn string into sigma id
340 number, err := strconv.Atoi(elem[0])
341
342 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200343 log.Error().Err(err)
344 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200345 }
346
Akron740f3d72021-08-03 12:12:34 +0200347 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200348
349 var symbol rune
350
351 // Read rune
352 if utf8.RuneCountInString(elem[1]) == 1 {
353 symbol = []rune(elem[1])[0]
354
355 // Probably a MCS
356 } else if utf8.RuneCountInString(elem[1]) > 1 {
357 switch elem[1] {
358 case "@_EPSILON_SYMBOL_@":
359 {
360 EPSILON = number
361 continue
362 }
363 case "@_UNKNOWN_SYMBOL_@":
364 {
365 UNKNOWN = number
366 continue
367 }
368
369 case "@_IDENTITY_SYMBOL_@":
370 {
371 IDENTITY = number
372 continue
373 }
374 default:
Akron740f3d72021-08-03 12:12:34 +0200375 {
376 log.Error().Msg("MCS not supported: " + line)
377 os.Exit(1)
378 }
Akron8ef408b2021-08-02 22:11:04 +0200379 }
380
Akron740f3d72021-08-03 12:12:34 +0200381 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200382 line, err = r.ReadString('\n')
383 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200384 log.Error().Err(err)
385 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200386 }
387 if len(line) != 1 {
Akron740f3d72021-08-03 12:12:34 +0200388 log.Error().Msg("MCS not supported:" + line)
389 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200390 }
Akron740f3d72021-08-03 12:12:34 +0200391 symbol = rune(NEWLINE)
Akron8ef408b2021-08-02 22:11:04 +0200392 }
393
Akron740f3d72021-08-03 12:12:34 +0200394 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200395 }
396 }
397 }
398
399 return tok
400}
401
402// Implementation of Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200403func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
404
405 dat := &DaTokenizer{
406 sigma: make(map[rune]int),
407 }
408
409 for num, sym := range tok.sigmaRev {
410 dat.sigma[sym] = num
411 }
Akron8ef408b2021-08-02 22:11:04 +0200412
413 mark := 0
414 size := 0
415
416 // Create a mapping from s to t
Akron740f3d72021-08-03 12:12:34 +0200417 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200418
419 table[size] = &mapping{source: 1, target: 1}
420 size++
421
Akron740f3d72021-08-03 12:12:34 +0200422 // Allocate space for the outgoing symbol range
423 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200424
425 for mark < size {
426 s := table[mark].source // This is a state in Ms
427 t := table[mark].target // This is a state in Mt
428 mark++
Akron740f3d72021-08-03 12:12:34 +0200429
430 // Following the paper, here the state t can be remembered
431 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200432 A = A[:0]
433 tok.get_set(s, &A)
434
Akron740f3d72021-08-03 12:12:34 +0200435 // Set base to the first free slot in the double array
Akronf2120ca2021-08-03 16:26:41 +0200436 dat.setBase(t, dat.xCheck(A))
Akron8ef408b2021-08-02 22:11:04 +0200437
Akron773b1ef2021-08-03 17:37:20 +0200438 // TODO:
439 // Sort the outgoing transitions based onm the
440 // outdegree of .end
441
Akron740f3d72021-08-03 12:12:34 +0200442 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200443 for _, a := range A {
444
445 if a != FINAL {
Akron8ef408b2021-08-02 22:11:04 +0200446
Akron740f3d72021-08-03 12:12:34 +0200447 // Aka g(s, a)
448 s1 := tok.transitions[s][a].end
Akron8ef408b2021-08-02 22:11:04 +0200449
Akron740f3d72021-08-03 12:12:34 +0200450 // Store the transition
Akronf2120ca2021-08-03 16:26:41 +0200451 t1 := dat.getBase(t) + a
452 dat.setCheck(t1, t)
Akron8ef408b2021-08-02 22:11:04 +0200453
Akron740f3d72021-08-03 12:12:34 +0200454 // Check for representative states
Akron8ef408b2021-08-02 22:11:04 +0200455 r := in_table(s1, table, size)
Akron740f3d72021-08-03 12:12:34 +0200456
Akron8ef408b2021-08-02 22:11:04 +0200457 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200458 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200459 table[size] = &mapping{source: s1, target: t1}
460 size++
461 } else {
Akron740f3d72021-08-03 12:12:34 +0200462 // Overwrite with the representative state
Akronf2120ca2021-08-03 16:26:41 +0200463 dat.setBase(t1, -1*r)
Akron8ef408b2021-08-02 22:11:04 +0200464 }
465 } else {
Akron740f3d72021-08-03 12:12:34 +0200466 // Store a final transition
Akronf2120ca2021-08-03 16:26:41 +0200467 dat.setCheck(dat.getBase(t)+FINAL, t)
Akron8ef408b2021-08-02 22:11:04 +0200468 }
469 }
470 }
471
472 // Following Mizobuchi et al (2000) the size of the
473 // FSA should be stored in check(1).
Akronf2120ca2021-08-03 16:26:41 +0200474 dat.setCheck(1, dat.maxSize+1)
475 dat.array = dat.array[:dat.maxSize+1]
476 return dat
Akron8ef408b2021-08-02 22:11:04 +0200477}
478
Akron740f3d72021-08-03 12:12:34 +0200479// Resize double array when necessary
Akronf2120ca2021-08-03 16:26:41 +0200480func (tok *DaTokenizer) resize(l int) {
Akron740f3d72021-08-03 12:12:34 +0200481 // TODO:
482 // This is a bit too aggressive atm and should be calmed down.
Akron75ebe7f2021-08-03 10:34:10 +0200483 if len(tok.array) <= l {
Akron8ef408b2021-08-02 22:11:04 +0200484 tok.array = append(tok.array, make([]int, l)...)
485 }
486}
487
Akron740f3d72021-08-03 12:12:34 +0200488// Set base value in double array
Akronf2120ca2021-08-03 16:26:41 +0200489func (tok *DaTokenizer) setBase(p int, v int) {
Akron8ef408b2021-08-02 22:11:04 +0200490 l := p*2 + 1
491 tok.resize(l)
Akron740f3d72021-08-03 12:12:34 +0200492 if tok.maxSize < l {
493 tok.maxSize = l
Akron8ef408b2021-08-02 22:11:04 +0200494 }
495 tok.array[p*2] = v
496}
497
Akron740f3d72021-08-03 12:12:34 +0200498// Get base value in double array
Akronf2120ca2021-08-03 16:26:41 +0200499func (tok *DaTokenizer) getBase(p int) int {
Akron8ef408b2021-08-02 22:11:04 +0200500 if p*2 >= len(tok.array) {
501 return 0
502 }
503 return tok.array[p*2]
504}
505
Akron740f3d72021-08-03 12:12:34 +0200506// Set check value in double array
Akronf2120ca2021-08-03 16:26:41 +0200507func (tok *DaTokenizer) setCheck(p int, v int) {
Akron8ef408b2021-08-02 22:11:04 +0200508 l := p*2 + 1
509 tok.resize(l)
Akron740f3d72021-08-03 12:12:34 +0200510 if tok.maxSize < l {
511 tok.maxSize = l
Akron8ef408b2021-08-02 22:11:04 +0200512 }
513 tok.array[(p*2)+1] = v
514}
515
Akron740f3d72021-08-03 12:12:34 +0200516// Get check value in double array
Akronf2120ca2021-08-03 16:26:41 +0200517func (tok *DaTokenizer) getCheck(p int) int {
Akron8ef408b2021-08-02 22:11:04 +0200518 if (p*2)+1 >= len(tok.array) {
519 return 0
520 }
521 return tok.array[(p*2)+1]
522}
523
Akron740f3d72021-08-03 12:12:34 +0200524// Set size of double array
Akronf2120ca2021-08-03 16:26:41 +0200525func (tok *DaTokenizer) setSize(p, v int) {
Akron740f3d72021-08-03 12:12:34 +0200526 tok.setCheck(1, v)
527}
528
529// Get size of double array
Akron773b1ef2021-08-03 17:37:20 +0200530func (tok *DaTokenizer) GetSize(p int) int {
Akron740f3d72021-08-03 12:12:34 +0200531 return tok.getCheck(1)
532}
533
Akron8ef408b2021-08-02 22:11:04 +0200534// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200535// exists and return this as a representative.
536// Currently iterates through the whole table
537// in a bruteforce manner.
Akron8ef408b2021-08-02 22:11:04 +0200538func in_table(s int, table []*mapping, size int) int {
539 for x := 0; x < size; x++ {
540 if table[x].source == s {
541 return table[x].target
542 }
543 }
544 return 0
545}
546
547// Set alphabet A to the list of all symbols
548// outgoing from s
549func (tok *Tokenizer) get_set(s int, A *[]int) {
Akron75ebe7f2021-08-03 10:34:10 +0200550 for a := range tok.transitions[s] {
Akron8ef408b2021-08-02 22:11:04 +0200551 *A = append(*A, a)
552 }
Akronc9d84a62021-08-03 15:56:03 +0200553
554 // Not required, but simplifies bug hunting
555 sort.Ints(*A)
Akron8ef408b2021-08-02 22:11:04 +0200556}
557
558// Based on Mizobuchi et al (2000), p. 124
559// This iterates for every state through the complete double array
560// structure until it finds a gap that fits all outgoing transitions
561// of the state. This is extremely slow, but is only necessary in the
562// construction phase of the tokenizer.
Akronf2120ca2021-08-03 16:26:41 +0200563func (dat *DaTokenizer) xCheck(symbols []int) int {
Akron740f3d72021-08-03 12:12:34 +0200564
565 // Start at the first entry of the double array list
Akron8ef408b2021-08-02 22:11:04 +0200566 base := 1
567
Akron8ef408b2021-08-02 22:11:04 +0200568OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200569
570 // Resize the array if necessary
Akronf2120ca2021-08-03 16:26:41 +0200571 dat.resize((base + FINAL) * 2)
Akron8ef408b2021-08-02 22:11:04 +0200572 for _, a := range symbols {
Akronf2120ca2021-08-03 16:26:41 +0200573 if dat.getCheck(base+a) != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200574 base++
575 goto OVERLAP
576 }
577 }
Akron8ef408b2021-08-02 22:11:04 +0200578 return base
579}
580
Akron773b1ef2021-08-03 17:37:20 +0200581func (dat *DaTokenizer) LoadLevel() float64 {
Akrond66a9262021-08-03 17:09:09 +0200582
Akron773b1ef2021-08-03 17:37:20 +0200583 if dat.loadLevel >= 0 {
584 return dat.loadLevel
585 }
Akrond66a9262021-08-03 17:09:09 +0200586 nonEmpty := 0
587 all := len(dat.array) / 2
588 for x := 1; x <= len(dat.array); x = x + 2 {
589 if dat.array[x] != 0 {
590 nonEmpty++
591 }
592 }
Akron773b1ef2021-08-03 17:37:20 +0200593 dat.loadLevel = float64(nonEmpty) / float64(all) * 100
594 return dat.loadLevel
Akrond66a9262021-08-03 17:09:09 +0200595}
596
Akron740f3d72021-08-03 12:12:34 +0200597// Match an input string against the double array
598// FSA.
599//
600// Based on Mizobuchi et al (2000), p. 129,
601// with additional support for IDENTITY, UNKNOWN
602// and EPSILON transitions.
Akronf2120ca2021-08-03 16:26:41 +0200603func (tok *DaTokenizer) Match(input string) bool {
Akron465a0992021-08-03 11:28:48 +0200604 var a int
605 var tu int
606 var ok bool
607
Akron740f3d72021-08-03 12:12:34 +0200608 t := 1 // Initial state
609 chars := []rune(input)
610 i := 0
611
Akron49d27ee2021-08-03 11:58:13 +0200612 for i < len(chars) {
Akron465a0992021-08-03 11:28:48 +0200613 a, ok = tok.sigma[chars[i]]
Akron730a79c2021-08-03 11:05:29 +0200614
Akron740f3d72021-08-03 12:12:34 +0200615 // Support identity symbol if character is not in sigma
Akron730a79c2021-08-03 11:05:29 +0200616 if !ok && IDENTITY != -1 {
Akron740f3d72021-08-03 12:12:34 +0200617 if DEBUG {
618 fmt.Println("IDENTITY symbol", string(chars[i]), "->", IDENTITY)
619 }
Akron730a79c2021-08-03 11:05:29 +0200620 a = IDENTITY
Akron740f3d72021-08-03 12:12:34 +0200621 } else if DEBUG {
Akron49d27ee2021-08-03 11:58:13 +0200622 fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
Akron730a79c2021-08-03 11:05:29 +0200623 }
Akron465a0992021-08-03 11:28:48 +0200624 tu = t
Akron730a79c2021-08-03 11:05:29 +0200625 CHECK:
Akron740f3d72021-08-03 12:12:34 +0200626 t = tok.getBase(tu) + a
Akron730a79c2021-08-03 11:05:29 +0200627
Akron740f3d72021-08-03 12:12:34 +0200628 // Check if the transition is valid according to the double array
629 if t > tok.getCheck(1) || tok.getCheck(t) != tu {
630
631 if DEBUG {
632 fmt.Println("Match is not fine!", t, "and", tok.getCheck(t), "vs", tu)
Akron730a79c2021-08-03 11:05:29 +0200633 }
Akron740f3d72021-08-03 12:12:34 +0200634
635 if !ok && a == IDENTITY {
636 // Try again with unknown symbol, in case identity failed
637 if DEBUG {
638 fmt.Println("UNKNOWN symbol", string(chars[i]), "->", UNKNOWN)
639 }
640 a = UNKNOWN
641
642 } else if a != EPSILON {
643 // Try again with epsilon symbol, in case everything else failed
644 if DEBUG {
645 fmt.Println("EPSILON symbol", string(chars[i]), "->", EPSILON)
646 }
647 a = EPSILON
648 } else {
649 break
650 }
651 goto CHECK
652 } else if tok.getBase(t) < 0 {
Akron730a79c2021-08-03 11:05:29 +0200653 // Move to representative state
Akron740f3d72021-08-03 12:12:34 +0200654 t = -1 * tok.getBase(t)
Akron8ef408b2021-08-02 22:11:04 +0200655 }
Akron740f3d72021-08-03 12:12:34 +0200656
657 // Transition is fine
Akron49d27ee2021-08-03 11:58:13 +0200658 if a != EPSILON {
Akron740f3d72021-08-03 12:12:34 +0200659 // Character consumed
Akron49d27ee2021-08-03 11:58:13 +0200660 i++
661 }
Akron740f3d72021-08-03 12:12:34 +0200662 // TODO:
663 // Prevent endless epsilon loops!
Akron8ef408b2021-08-02 22:11:04 +0200664 }
665
Akron740f3d72021-08-03 12:12:34 +0200666 if i != len(chars) {
667 if DEBUG {
668 fmt.Println("Not at the end")
669 }
Akron8ef408b2021-08-02 22:11:04 +0200670 return false
671 }
672
Akron465a0992021-08-03 11:28:48 +0200673FINALCHECK:
Akron740f3d72021-08-03 12:12:34 +0200674
675 // Automaton is in a final state
676 if tok.getCheck(tok.getBase(t)+FINAL) == t {
Akron8ef408b2021-08-02 22:11:04 +0200677 return true
678 }
Akron465a0992021-08-03 11:28:48 +0200679
Akron740f3d72021-08-03 12:12:34 +0200680 // Check epsilon transitions until a final state is reached
Akron465a0992021-08-03 11:28:48 +0200681 tu = t
Akron773b1ef2021-08-03 17:37:20 +0200682 t = tok.getBase(tu) + EPSILON
Akron465a0992021-08-03 11:28:48 +0200683
Akron740f3d72021-08-03 12:12:34 +0200684 // Epsilon transition failed
685 if t > tok.getCheck(1) || tok.getCheck(t) != tu {
686 if DEBUG {
687 fmt.Println("Match is not fine!", t, "and", tok.getCheck(t), "vs", tu)
688 }
Akron465a0992021-08-03 11:28:48 +0200689 return false
Akron740f3d72021-08-03 12:12:34 +0200690
691 } else if tok.getBase(t) < 0 {
Akron465a0992021-08-03 11:28:48 +0200692 // Move to representative state
Akron740f3d72021-08-03 12:12:34 +0200693 t = -1 * tok.getBase(t)
Akron465a0992021-08-03 11:28:48 +0200694 }
Akron740f3d72021-08-03 12:12:34 +0200695
Akron465a0992021-08-03 11:28:48 +0200696 goto FINALCHECK
Akron8ef408b2021-08-02 22:11:04 +0200697}