blob: e52c8c374a094f97d2786ba853ca690292cc0621 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akron740f3d72021-08-03 12:12:34 +02009// TODO:
10// - replace maxSize with the check value
11// - Strip first state and make everything start with 0!
12// - Serialize!
13// - Split Tokenizer and DATokenizer
14
Akron8ef408b2021-08-02 22:11:04 +020015import (
16 "bufio"
17 "compress/gzip"
18 "fmt"
19 "io"
20 "os"
Akronc9d84a62021-08-03 15:56:03 +020021 "sort"
Akron8ef408b2021-08-02 22:11:04 +020022 "strconv"
23 "strings"
24 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020025
26 "github.com/rs/zerolog/log"
Akron8ef408b2021-08-02 22:11:04 +020027)
28
29const (
Akron75ebe7f2021-08-03 10:34:10 +020030 PROPS = 1
31 SIGMA = 2
32 STATES = 3
33 NONE = 4
34 NEWLINE = '\u000a'
Akron740f3d72021-08-03 12:12:34 +020035 DEBUG = false
Akron8ef408b2021-08-02 22:11:04 +020036)
37
38// Special symbols in sigma
Akron730a79c2021-08-03 11:05:29 +020039var EPSILON = -1
40var UNKNOWN = -1
41var IDENTITY = -1
42var FINAL = -1
Akron8ef408b2021-08-02 22:11:04 +020043
44type mapping struct {
45 source int
46 target int
47}
48
49type edge struct {
Akron740f3d72021-08-03 12:12:34 +020050 inSym int
51 outSym int
52 end int
Akron8ef408b2021-08-02 22:11:04 +020053}
54
55type Tokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020056 // sigma map[rune]int
Akron740f3d72021-08-03 12:12:34 +020057 sigmaRev map[int]rune
58 arcCount int
59 stateCount int
60 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020061 transitions []map[int]*edge
62}
63
Akronf2120ca2021-08-03 16:26:41 +020064type DaTokenizer struct {
65 sigma map[rune]int
66 // sigmaRev map[int]rune
67 maxSize int
68 array []int
69}
70
Akron740f3d72021-08-03 12:12:34 +020071func ParseFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +020072 f, err := os.Open(file)
73 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020074 log.Error().Err(err)
75 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020076 }
77 defer f.Close()
78
79 gz, err := gzip.NewReader(f)
80 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020081 log.Error().Err(err)
82 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020083 }
84 defer gz.Close()
85
Akron740f3d72021-08-03 12:12:34 +020086 return Parse(gz)
Akron8ef408b2021-08-02 22:11:04 +020087}
88
Akron740f3d72021-08-03 12:12:34 +020089func Parse(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +020090 r := bufio.NewReader(ior)
91
92 tok := &Tokenizer{
Akronf2120ca2021-08-03 16:26:41 +020093 // sigma: make(map[rune]int),
Akron740f3d72021-08-03 12:12:34 +020094 sigmaRev: make(map[int]rune),
Akron8ef408b2021-08-02 22:11:04 +020095 }
96
Akron740f3d72021-08-03 12:12:34 +020097 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +020098
99 mode := 0
100 var elem []string
101 var elemint [5]int
102
103 for {
104 line, err := r.ReadString('\n')
105 if err != nil {
106 if err == io.EOF {
107 break
108 }
Akron740f3d72021-08-03 12:12:34 +0200109 log.Error().Err(err)
110 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200111 }
112 if strings.HasPrefix(line, "##foma-net") {
113 continue
114 }
115 if strings.HasPrefix(line, "##props##") {
116 mode = PROPS
117 continue
118 }
119 if strings.HasPrefix(line, "##states##") {
120 mode = STATES
121
122 // Adds a final transition symbol to sigma
123 // written as '#' in Mizobuchi et al (2000)
Akron740f3d72021-08-03 12:12:34 +0200124 tok.sigmaCount++
125 FINAL = tok.sigmaCount
Akron8ef408b2021-08-02 22:11:04 +0200126 continue
127 }
128 if strings.HasPrefix(line, "##sigma##") {
129 mode = SIGMA
130 continue
131 }
132 if strings.HasPrefix(line, "##end##") {
133 mode = NONE
134 continue
135 }
136
137 switch mode {
138 case PROPS:
139 {
140 elem = strings.Split(line, " ")
141 /*
142 fmt.Println("arity: " + elem[0])
143 fmt.Println("arccount: " + elem[1])
144 fmt.Println("statecount: " + elem[2])
145 fmt.Println("linecount: " + elem[3])
146 fmt.Println("finalcount: " + elem[4])
147 fmt.Println("pathcount: " + elem[5])
148 fmt.Println("is_deterministic: " + elem[6])
149 fmt.Println("is_pruned: " + elem[7])
150 fmt.Println("is_minimized: " + elem[8])
151 fmt.Println("is_epsilon_free: " + elem[9])
152 fmt.Println("is_loop_free: " + elem[10])
153 fmt.Println("extras: " + elem[11])
154 fmt.Println("name: " + elem[12])
155 */
156 if elem[6] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200157 log.Error().Msg("The FST needs to be deterministic")
158 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200159 }
160 if elem[9] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200161 log.Error().Msg("The FST needs to be epsilon free")
162 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200163 }
164
165 elemint[0], err = strconv.Atoi(elem[1])
166 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200167 log.Error().Msg("Can't read arccount")
168 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200169 }
Akron740f3d72021-08-03 12:12:34 +0200170 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200171
172 // States start at 1 in Mizobuchi et al (2000),
173 // as the state 0 is associated with a fail.
174 // Initialize states and transitions
175 elemint[0], err = strconv.Atoi(elem[2])
176 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200177 log.Error().Msg("Can't read statecount")
178 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200179 }
Akron740f3d72021-08-03 12:12:34 +0200180 tok.stateCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200181 tok.transitions = make([]map[int]*edge, elemint[0]+1)
182 continue
183 }
184 case STATES:
185 {
186 elem = strings.Split(line[0:len(line)-1], " ")
187 if elem[0] == "-1" {
188 continue
189 }
190 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200191 if err != nil {
192 break
193 }
Akron8ef408b2021-08-02 22:11:04 +0200194
195 if len(elem) > 1 {
196 elemint[1], err = strconv.Atoi(elem[1])
197 if err != nil {
198 break
199 }
200 if len(elem) > 2 {
201 elemint[2], err = strconv.Atoi(elem[2])
202 if err != nil {
203 break
204 }
205 if len(elem) > 3 {
206 elemint[3], err = strconv.Atoi(elem[3])
207 if err != nil {
208 break
209 }
210 if len(elem) > 4 {
211 elemint[4], err = strconv.Atoi(elem[4])
212 if err != nil {
213 break
214 }
215 }
216 }
217 }
218 }
219
220 switch len(elem) {
221 case 5:
222 {
Akron740f3d72021-08-03 12:12:34 +0200223 state = elemint[0]
224 inSym = elemint[1]
225 outSym = elemint[2]
226 end = elemint[3]
227 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200228 }
229 case 4:
230 {
231 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200232 state = elemint[0]
233 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200234 } else {
Akron740f3d72021-08-03 12:12:34 +0200235 state = elemint[0]
236 inSym = elemint[1]
237 end = elemint[2]
238 final = elemint[3]
239 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200240 }
241 }
242 case 3:
243 {
Akron740f3d72021-08-03 12:12:34 +0200244 inSym = elemint[0]
245 outSym = elemint[1]
246 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200247 }
248 case 2:
249 {
Akron740f3d72021-08-03 12:12:34 +0200250 inSym = elemint[0]
251 end = elemint[1]
252 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200253 }
254 }
255
Akron8ef408b2021-08-02 22:11:04 +0200256 // While the states in foma start with 0, the states in the
257 // Mizobuchi FSA start with one - so we increase every state by 1.
258
Akron740f3d72021-08-03 12:12:34 +0200259 if inSym != outSym {
260
261 // Allow any epsilon to become a newline
262 if !(inSym == EPSILON && tok.sigmaRev[outSym] == NEWLINE) &&
263
264 // Allow any whitespace to be ignored
265 !(inSym != EPSILON && outSym == EPSILON) &&
266
267 // Allow any whitespace to become a new line
268 !(tok.sigmaRev[outSym] == NEWLINE) {
269
270 log.Error().Msg(
271 "Unsupported transition: " +
272 strconv.Itoa(state) +
273 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200274 " (" +
Akron740f3d72021-08-03 12:12:34 +0200275 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200276 ":" +
Akron740f3d72021-08-03 12:12:34 +0200277 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200278 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200279 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200280 ":" +
Akron740f3d72021-08-03 12:12:34 +0200281 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200282 ")")
Akron740f3d72021-08-03 12:12:34 +0200283 os.Exit(1)
Akron75ebe7f2021-08-03 10:34:10 +0200284 }
Akron8ef408b2021-08-02 22:11:04 +0200285 }
286
Akron740f3d72021-08-03 12:12:34 +0200287 // This collects all edges until arrstate changes
288
Akron8ef408b2021-08-02 22:11:04 +0200289 // TODO:
290 // if arrin == EPSILON && arrout == TOKENEND, mark state as newline
291 // if the next transition is the same, remove TOKENEND and add SENTENCEEND
292 // This requires to remove the transition alltogether and marks the state instead.
293
294 // TODO:
295 // if arrout == EPSILON, mark the transition as NOTOKEN
296
297 targetObj := &edge{
Akron740f3d72021-08-03 12:12:34 +0200298 inSym: inSym,
299 outSym: outSym,
300 end: end + 1,
Akron8ef408b2021-08-02 22:11:04 +0200301 }
302
Akron740f3d72021-08-03 12:12:34 +0200303 // Initialize outgoing states
304 if tok.transitions[state+1] == nil {
305 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200306 }
307
Akron740f3d72021-08-03 12:12:34 +0200308 // Ignore transitions with invalid symbols
309 if inSym >= 0 {
310 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200311 }
Akron8ef408b2021-08-02 22:11:04 +0200312
Akron740f3d72021-08-03 12:12:34 +0200313 // Add final transition
314 if final == 1 {
315 tok.transitions[state+1][FINAL] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200316 }
317
Akron740f3d72021-08-03 12:12:34 +0200318 if DEBUG {
319 fmt.Println("Add",
320 state+1, "->", end+1,
321 "(",
322 inSym,
323 ":",
324 outSym,
325 ") (",
326 string(tok.sigmaRev[inSym]),
327 ":",
328 string(tok.sigmaRev[outSym]),
329 ")")
330 }
Akron75ebe7f2021-08-03 10:34:10 +0200331
Akron8ef408b2021-08-02 22:11:04 +0200332 continue
333 }
334 case SIGMA:
335 {
336 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
337
338 // Turn string into sigma id
339 number, err := strconv.Atoi(elem[0])
340
341 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200342 log.Error().Err(err)
343 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200344 }
345
Akron740f3d72021-08-03 12:12:34 +0200346 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200347
348 var symbol rune
349
350 // Read rune
351 if utf8.RuneCountInString(elem[1]) == 1 {
352 symbol = []rune(elem[1])[0]
353
354 // Probably a MCS
355 } else if utf8.RuneCountInString(elem[1]) > 1 {
356 switch elem[1] {
357 case "@_EPSILON_SYMBOL_@":
358 {
359 EPSILON = number
360 continue
361 }
362 case "@_UNKNOWN_SYMBOL_@":
363 {
364 UNKNOWN = number
365 continue
366 }
367
368 case "@_IDENTITY_SYMBOL_@":
369 {
370 IDENTITY = number
371 continue
372 }
373 default:
Akron740f3d72021-08-03 12:12:34 +0200374 {
375 log.Error().Msg("MCS not supported: " + line)
376 os.Exit(1)
377 }
Akron8ef408b2021-08-02 22:11:04 +0200378 }
379
Akron740f3d72021-08-03 12:12:34 +0200380 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200381 line, err = r.ReadString('\n')
382 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200383 log.Error().Err(err)
384 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200385 }
386 if len(line) != 1 {
Akron740f3d72021-08-03 12:12:34 +0200387 log.Error().Msg("MCS not supported:" + line)
388 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200389 }
Akron740f3d72021-08-03 12:12:34 +0200390 symbol = rune(NEWLINE)
Akron8ef408b2021-08-02 22:11:04 +0200391 }
392
Akron740f3d72021-08-03 12:12:34 +0200393 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200394 }
395 }
396 }
397
398 return tok
399}
400
401// Implementation of Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200402func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
403
404 dat := &DaTokenizer{
405 sigma: make(map[rune]int),
406 }
407
408 for num, sym := range tok.sigmaRev {
409 dat.sigma[sym] = num
410 }
Akron8ef408b2021-08-02 22:11:04 +0200411
412 mark := 0
413 size := 0
414
415 // Create a mapping from s to t
Akron740f3d72021-08-03 12:12:34 +0200416 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200417
418 table[size] = &mapping{source: 1, target: 1}
419 size++
420
Akron740f3d72021-08-03 12:12:34 +0200421 // Allocate space for the outgoing symbol range
422 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200423
424 for mark < size {
425 s := table[mark].source // This is a state in Ms
426 t := table[mark].target // This is a state in Mt
427 mark++
Akron740f3d72021-08-03 12:12:34 +0200428
429 // Following the paper, here the state t can be remembered
430 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200431 A = A[:0]
432 tok.get_set(s, &A)
433
Akron740f3d72021-08-03 12:12:34 +0200434 // Set base to the first free slot in the double array
Akronf2120ca2021-08-03 16:26:41 +0200435 dat.setBase(t, dat.xCheck(A))
Akron8ef408b2021-08-02 22:11:04 +0200436
Akron740f3d72021-08-03 12:12:34 +0200437 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200438 for _, a := range A {
439
440 if a != FINAL {
Akron8ef408b2021-08-02 22:11:04 +0200441
Akron740f3d72021-08-03 12:12:34 +0200442 // Aka g(s, a)
443 s1 := tok.transitions[s][a].end
Akron8ef408b2021-08-02 22:11:04 +0200444
Akron740f3d72021-08-03 12:12:34 +0200445 // Store the transition
Akronf2120ca2021-08-03 16:26:41 +0200446 t1 := dat.getBase(t) + a
447 dat.setCheck(t1, t)
Akron8ef408b2021-08-02 22:11:04 +0200448
Akron740f3d72021-08-03 12:12:34 +0200449 // Check for representative states
Akron8ef408b2021-08-02 22:11:04 +0200450 r := in_table(s1, table, size)
Akron740f3d72021-08-03 12:12:34 +0200451
Akron8ef408b2021-08-02 22:11:04 +0200452 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200453 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200454 table[size] = &mapping{source: s1, target: t1}
455 size++
456 } else {
Akron740f3d72021-08-03 12:12:34 +0200457 // Overwrite with the representative state
Akronf2120ca2021-08-03 16:26:41 +0200458 dat.setBase(t1, -1*r)
Akron8ef408b2021-08-02 22:11:04 +0200459 }
460 } else {
Akron740f3d72021-08-03 12:12:34 +0200461 // Store a final transition
Akronf2120ca2021-08-03 16:26:41 +0200462 dat.setCheck(dat.getBase(t)+FINAL, t)
Akron8ef408b2021-08-02 22:11:04 +0200463 }
464 }
465 }
466
467 // Following Mizobuchi et al (2000) the size of the
468 // FSA should be stored in check(1).
Akronf2120ca2021-08-03 16:26:41 +0200469 dat.setCheck(1, dat.maxSize+1)
470 dat.array = dat.array[:dat.maxSize+1]
471 return dat
Akron8ef408b2021-08-02 22:11:04 +0200472}
473
Akron740f3d72021-08-03 12:12:34 +0200474// Resize double array when necessary
Akronf2120ca2021-08-03 16:26:41 +0200475func (tok *DaTokenizer) resize(l int) {
Akron740f3d72021-08-03 12:12:34 +0200476 // TODO:
477 // This is a bit too aggressive atm and should be calmed down.
Akron75ebe7f2021-08-03 10:34:10 +0200478 if len(tok.array) <= l {
Akron8ef408b2021-08-02 22:11:04 +0200479 tok.array = append(tok.array, make([]int, l)...)
480 }
481}
482
Akron740f3d72021-08-03 12:12:34 +0200483// Set base value in double array
Akronf2120ca2021-08-03 16:26:41 +0200484func (tok *DaTokenizer) setBase(p int, v int) {
Akron8ef408b2021-08-02 22:11:04 +0200485 l := p*2 + 1
486 tok.resize(l)
Akron740f3d72021-08-03 12:12:34 +0200487 if tok.maxSize < l {
488 tok.maxSize = l
Akron8ef408b2021-08-02 22:11:04 +0200489 }
490 tok.array[p*2] = v
491}
492
Akron740f3d72021-08-03 12:12:34 +0200493// Get base value in double array
Akronf2120ca2021-08-03 16:26:41 +0200494func (tok *DaTokenizer) getBase(p int) int {
Akron8ef408b2021-08-02 22:11:04 +0200495 if p*2 >= len(tok.array) {
496 return 0
497 }
498 return tok.array[p*2]
499}
500
Akron740f3d72021-08-03 12:12:34 +0200501// Set check value in double array
Akronf2120ca2021-08-03 16:26:41 +0200502func (tok *DaTokenizer) setCheck(p int, v int) {
Akron8ef408b2021-08-02 22:11:04 +0200503 l := p*2 + 1
504 tok.resize(l)
Akron740f3d72021-08-03 12:12:34 +0200505 if tok.maxSize < l {
506 tok.maxSize = l
Akron8ef408b2021-08-02 22:11:04 +0200507 }
508 tok.array[(p*2)+1] = v
509}
510
Akron740f3d72021-08-03 12:12:34 +0200511// Get check value in double array
Akronf2120ca2021-08-03 16:26:41 +0200512func (tok *DaTokenizer) getCheck(p int) int {
Akron8ef408b2021-08-02 22:11:04 +0200513 if (p*2)+1 >= len(tok.array) {
514 return 0
515 }
516 return tok.array[(p*2)+1]
517}
518
Akron740f3d72021-08-03 12:12:34 +0200519// Set size of double array
Akronf2120ca2021-08-03 16:26:41 +0200520func (tok *DaTokenizer) setSize(p, v int) {
Akron740f3d72021-08-03 12:12:34 +0200521 tok.setCheck(1, v)
522}
523
524// Get size of double array
Akronf2120ca2021-08-03 16:26:41 +0200525func (tok *DaTokenizer) getSize(p int) int {
Akron740f3d72021-08-03 12:12:34 +0200526 return tok.getCheck(1)
527}
528
Akron8ef408b2021-08-02 22:11:04 +0200529// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200530// exists and return this as a representative.
531// Currently iterates through the whole table
532// in a bruteforce manner.
Akron8ef408b2021-08-02 22:11:04 +0200533func in_table(s int, table []*mapping, size int) int {
534 for x := 0; x < size; x++ {
535 if table[x].source == s {
536 return table[x].target
537 }
538 }
539 return 0
540}
541
542// Set alphabet A to the list of all symbols
543// outgoing from s
544func (tok *Tokenizer) get_set(s int, A *[]int) {
Akron75ebe7f2021-08-03 10:34:10 +0200545 for a := range tok.transitions[s] {
Akron8ef408b2021-08-02 22:11:04 +0200546 *A = append(*A, a)
547 }
Akronc9d84a62021-08-03 15:56:03 +0200548
549 // Not required, but simplifies bug hunting
550 sort.Ints(*A)
Akron8ef408b2021-08-02 22:11:04 +0200551}
552
553// Based on Mizobuchi et al (2000), p. 124
554// This iterates for every state through the complete double array
555// structure until it finds a gap that fits all outgoing transitions
556// of the state. This is extremely slow, but is only necessary in the
557// construction phase of the tokenizer.
Akronf2120ca2021-08-03 16:26:41 +0200558func (dat *DaTokenizer) xCheck(symbols []int) int {
Akron740f3d72021-08-03 12:12:34 +0200559
560 // Start at the first entry of the double array list
Akron8ef408b2021-08-02 22:11:04 +0200561 base := 1
562
Akron8ef408b2021-08-02 22:11:04 +0200563OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200564
565 // Resize the array if necessary
Akronf2120ca2021-08-03 16:26:41 +0200566 dat.resize((base + FINAL) * 2)
Akron8ef408b2021-08-02 22:11:04 +0200567 for _, a := range symbols {
Akronf2120ca2021-08-03 16:26:41 +0200568 if dat.getCheck(base+a) != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200569 base++
570 goto OVERLAP
571 }
572 }
Akron8ef408b2021-08-02 22:11:04 +0200573 return base
574}
575
Akrond66a9262021-08-03 17:09:09 +0200576func (dat *DaTokenizer) loadLevel() float64 {
577
578 nonEmpty := 0
579 all := len(dat.array) / 2
580 for x := 1; x <= len(dat.array); x = x + 2 {
581 if dat.array[x] != 0 {
582 nonEmpty++
583 }
584 }
585 fmt.Println("all:", all, "nonEmpty", nonEmpty)
586 return float64(nonEmpty) / float64(all) * 100
587}
588
Akron740f3d72021-08-03 12:12:34 +0200589// Match an input string against the double array
590// FSA.
591//
592// Based on Mizobuchi et al (2000), p. 129,
593// with additional support for IDENTITY, UNKNOWN
594// and EPSILON transitions.
Akronf2120ca2021-08-03 16:26:41 +0200595func (tok *DaTokenizer) Match(input string) bool {
Akron465a0992021-08-03 11:28:48 +0200596 var a int
597 var tu int
598 var ok bool
599
Akron740f3d72021-08-03 12:12:34 +0200600 t := 1 // Initial state
601 chars := []rune(input)
602 i := 0
603
Akron49d27ee2021-08-03 11:58:13 +0200604 for i < len(chars) {
Akron465a0992021-08-03 11:28:48 +0200605 a, ok = tok.sigma[chars[i]]
Akron730a79c2021-08-03 11:05:29 +0200606
Akron740f3d72021-08-03 12:12:34 +0200607 // Support identity symbol if character is not in sigma
Akron730a79c2021-08-03 11:05:29 +0200608 if !ok && IDENTITY != -1 {
Akron740f3d72021-08-03 12:12:34 +0200609 if DEBUG {
610 fmt.Println("IDENTITY symbol", string(chars[i]), "->", IDENTITY)
611 }
Akron730a79c2021-08-03 11:05:29 +0200612 a = IDENTITY
Akron740f3d72021-08-03 12:12:34 +0200613 } else if DEBUG {
Akron49d27ee2021-08-03 11:58:13 +0200614 fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
Akron730a79c2021-08-03 11:05:29 +0200615 }
Akron465a0992021-08-03 11:28:48 +0200616 tu = t
Akron730a79c2021-08-03 11:05:29 +0200617 CHECK:
Akron740f3d72021-08-03 12:12:34 +0200618 t = tok.getBase(tu) + a
Akron730a79c2021-08-03 11:05:29 +0200619
Akron740f3d72021-08-03 12:12:34 +0200620 // Check if the transition is valid according to the double array
621 if t > tok.getCheck(1) || tok.getCheck(t) != tu {
622
623 if DEBUG {
624 fmt.Println("Match is not fine!", t, "and", tok.getCheck(t), "vs", tu)
Akron730a79c2021-08-03 11:05:29 +0200625 }
Akron740f3d72021-08-03 12:12:34 +0200626
627 if !ok && a == IDENTITY {
628 // Try again with unknown symbol, in case identity failed
629 if DEBUG {
630 fmt.Println("UNKNOWN symbol", string(chars[i]), "->", UNKNOWN)
631 }
632 a = UNKNOWN
633
634 } else if a != EPSILON {
635 // Try again with epsilon symbol, in case everything else failed
636 if DEBUG {
637 fmt.Println("EPSILON symbol", string(chars[i]), "->", EPSILON)
638 }
639 a = EPSILON
640 } else {
641 break
642 }
643 goto CHECK
644 } else if tok.getBase(t) < 0 {
Akron730a79c2021-08-03 11:05:29 +0200645 // Move to representative state
Akron740f3d72021-08-03 12:12:34 +0200646 t = -1 * tok.getBase(t)
Akron8ef408b2021-08-02 22:11:04 +0200647 }
Akron740f3d72021-08-03 12:12:34 +0200648
649 // Transition is fine
Akron49d27ee2021-08-03 11:58:13 +0200650 if a != EPSILON {
Akron740f3d72021-08-03 12:12:34 +0200651 // Character consumed
Akron49d27ee2021-08-03 11:58:13 +0200652 i++
653 }
Akron740f3d72021-08-03 12:12:34 +0200654 // TODO:
655 // Prevent endless epsilon loops!
Akron8ef408b2021-08-02 22:11:04 +0200656 }
657
Akron740f3d72021-08-03 12:12:34 +0200658 if i != len(chars) {
659 if DEBUG {
660 fmt.Println("Not at the end")
661 }
Akron8ef408b2021-08-02 22:11:04 +0200662 return false
663 }
664
Akron465a0992021-08-03 11:28:48 +0200665FINALCHECK:
Akron740f3d72021-08-03 12:12:34 +0200666
667 // Automaton is in a final state
668 if tok.getCheck(tok.getBase(t)+FINAL) == t {
Akron8ef408b2021-08-02 22:11:04 +0200669 return true
670 }
Akron465a0992021-08-03 11:28:48 +0200671
Akron740f3d72021-08-03 12:12:34 +0200672 // Check epsilon transitions until a final state is reached
Akron465a0992021-08-03 11:28:48 +0200673 tu = t
674 a = EPSILON
Akron740f3d72021-08-03 12:12:34 +0200675 t = tok.getBase(tu) + a
Akron465a0992021-08-03 11:28:48 +0200676
Akron740f3d72021-08-03 12:12:34 +0200677 // Epsilon transition failed
678 if t > tok.getCheck(1) || tok.getCheck(t) != tu {
679 if DEBUG {
680 fmt.Println("Match is not fine!", t, "and", tok.getCheck(t), "vs", tu)
681 }
Akron465a0992021-08-03 11:28:48 +0200682 return false
Akron740f3d72021-08-03 12:12:34 +0200683
684 } else if tok.getBase(t) < 0 {
Akron465a0992021-08-03 11:28:48 +0200685 // Move to representative state
Akron740f3d72021-08-03 12:12:34 +0200686 t = -1 * tok.getBase(t)
Akron465a0992021-08-03 11:28:48 +0200687 }
Akron740f3d72021-08-03 12:12:34 +0200688
Akron465a0992021-08-03 11:28:48 +0200689 goto FINALCHECK
Akron8ef408b2021-08-02 22:11:04 +0200690}