blob: 1ebcd91f28a105d06748cff09f62a22194430a2d [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akron740f3d72021-08-03 12:12:34 +02009// TODO:
10// - replace maxSize with the check value
11// - Strip first state and make everything start with 0!
12// - Serialize!
13// - Split Tokenizer and DATokenizer
Akron6247a5d2021-08-03 19:18:28 +020014// - Make epsilon etc. properties
Akron740f3d72021-08-03 12:12:34 +020015
Akron8ef408b2021-08-02 22:11:04 +020016import (
17 "bufio"
Akron6247a5d2021-08-03 19:18:28 +020018 "bytes"
Akron8ef408b2021-08-02 22:11:04 +020019 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020020 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020021 "fmt"
22 "io"
23 "os"
Akronc9d84a62021-08-03 15:56:03 +020024 "sort"
Akron8ef408b2021-08-02 22:11:04 +020025 "strconv"
26 "strings"
27 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020028
29 "github.com/rs/zerolog/log"
Akron8ef408b2021-08-02 22:11:04 +020030)
31
32const (
Akron75ebe7f2021-08-03 10:34:10 +020033 PROPS = 1
34 SIGMA = 2
35 STATES = 3
36 NONE = 4
37 NEWLINE = '\u000a'
Akron740f3d72021-08-03 12:12:34 +020038 DEBUG = false
Akron6247a5d2021-08-03 19:18:28 +020039 MAGIC = "DATOK"
40 VERSION = uint16(1)
Akron8ef408b2021-08-02 22:11:04 +020041)
42
43// Special symbols in sigma
Akron730a79c2021-08-03 11:05:29 +020044var EPSILON = -1
45var UNKNOWN = -1
46var IDENTITY = -1
47var FINAL = -1
Akron8ef408b2021-08-02 22:11:04 +020048
Akron6247a5d2021-08-03 19:18:28 +020049var bo binary.ByteOrder = binary.LittleEndian
50
Akron8ef408b2021-08-02 22:11:04 +020051type mapping struct {
52 source int
53 target int
54}
55
56type edge struct {
Akron740f3d72021-08-03 12:12:34 +020057 inSym int
58 outSym int
59 end int
Akron8ef408b2021-08-02 22:11:04 +020060}
61
62type Tokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020063 // sigma map[rune]int
Akron740f3d72021-08-03 12:12:34 +020064 sigmaRev map[int]rune
65 arcCount int
66 stateCount int
67 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020068 transitions []map[int]*edge
69}
70
Akronf2120ca2021-08-03 16:26:41 +020071type DaTokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020072 // sigmaRev map[int]rune
Akron773b1ef2021-08-03 17:37:20 +020073 sigma map[rune]int
74 maxSize int
75 loadLevel float64
76 array []int
Akronf2120ca2021-08-03 16:26:41 +020077}
78
Akron740f3d72021-08-03 12:12:34 +020079func ParseFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +020080 f, err := os.Open(file)
81 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020082 log.Error().Err(err)
83 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020084 }
85 defer f.Close()
86
87 gz, err := gzip.NewReader(f)
88 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020089 log.Error().Err(err)
90 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020091 }
92 defer gz.Close()
93
Akron740f3d72021-08-03 12:12:34 +020094 return Parse(gz)
Akron8ef408b2021-08-02 22:11:04 +020095}
96
Akron740f3d72021-08-03 12:12:34 +020097func Parse(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +020098 r := bufio.NewReader(ior)
99
100 tok := &Tokenizer{
Akronf2120ca2021-08-03 16:26:41 +0200101 // sigma: make(map[rune]int),
Akron740f3d72021-08-03 12:12:34 +0200102 sigmaRev: make(map[int]rune),
Akron8ef408b2021-08-02 22:11:04 +0200103 }
104
Akron740f3d72021-08-03 12:12:34 +0200105 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200106
107 mode := 0
108 var elem []string
109 var elemint [5]int
110
111 for {
112 line, err := r.ReadString('\n')
113 if err != nil {
114 if err == io.EOF {
115 break
116 }
Akron740f3d72021-08-03 12:12:34 +0200117 log.Error().Err(err)
118 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200119 }
120 if strings.HasPrefix(line, "##foma-net") {
121 continue
122 }
123 if strings.HasPrefix(line, "##props##") {
124 mode = PROPS
125 continue
126 }
127 if strings.HasPrefix(line, "##states##") {
128 mode = STATES
129
130 // Adds a final transition symbol to sigma
131 // written as '#' in Mizobuchi et al (2000)
Akron740f3d72021-08-03 12:12:34 +0200132 tok.sigmaCount++
133 FINAL = tok.sigmaCount
Akron8ef408b2021-08-02 22:11:04 +0200134 continue
135 }
136 if strings.HasPrefix(line, "##sigma##") {
137 mode = SIGMA
138 continue
139 }
140 if strings.HasPrefix(line, "##end##") {
141 mode = NONE
142 continue
143 }
144
145 switch mode {
146 case PROPS:
147 {
148 elem = strings.Split(line, " ")
149 /*
150 fmt.Println("arity: " + elem[0])
151 fmt.Println("arccount: " + elem[1])
152 fmt.Println("statecount: " + elem[2])
153 fmt.Println("linecount: " + elem[3])
154 fmt.Println("finalcount: " + elem[4])
155 fmt.Println("pathcount: " + elem[5])
156 fmt.Println("is_deterministic: " + elem[6])
157 fmt.Println("is_pruned: " + elem[7])
158 fmt.Println("is_minimized: " + elem[8])
159 fmt.Println("is_epsilon_free: " + elem[9])
160 fmt.Println("is_loop_free: " + elem[10])
161 fmt.Println("extras: " + elem[11])
162 fmt.Println("name: " + elem[12])
163 */
164 if elem[6] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200165 log.Error().Msg("The FST needs to be deterministic")
166 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200167 }
168 if elem[9] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200169 log.Error().Msg("The FST needs to be epsilon free")
170 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200171 }
172
173 elemint[0], err = strconv.Atoi(elem[1])
174 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200175 log.Error().Msg("Can't read arccount")
176 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200177 }
Akron740f3d72021-08-03 12:12:34 +0200178 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200179
180 // States start at 1 in Mizobuchi et al (2000),
181 // as the state 0 is associated with a fail.
182 // Initialize states and transitions
183 elemint[0], err = strconv.Atoi(elem[2])
184 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200185 log.Error().Msg("Can't read statecount")
186 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200187 }
Akron740f3d72021-08-03 12:12:34 +0200188 tok.stateCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200189 tok.transitions = make([]map[int]*edge, elemint[0]+1)
190 continue
191 }
192 case STATES:
193 {
194 elem = strings.Split(line[0:len(line)-1], " ")
195 if elem[0] == "-1" {
196 continue
197 }
198 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200199 if err != nil {
200 break
201 }
Akron8ef408b2021-08-02 22:11:04 +0200202
203 if len(elem) > 1 {
204 elemint[1], err = strconv.Atoi(elem[1])
205 if err != nil {
206 break
207 }
208 if len(elem) > 2 {
209 elemint[2], err = strconv.Atoi(elem[2])
210 if err != nil {
211 break
212 }
213 if len(elem) > 3 {
214 elemint[3], err = strconv.Atoi(elem[3])
215 if err != nil {
216 break
217 }
218 if len(elem) > 4 {
219 elemint[4], err = strconv.Atoi(elem[4])
220 if err != nil {
221 break
222 }
223 }
224 }
225 }
226 }
227
228 switch len(elem) {
229 case 5:
230 {
Akron740f3d72021-08-03 12:12:34 +0200231 state = elemint[0]
232 inSym = elemint[1]
233 outSym = elemint[2]
234 end = elemint[3]
235 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200236 }
237 case 4:
238 {
239 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200240 state = elemint[0]
241 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200242 } else {
Akron740f3d72021-08-03 12:12:34 +0200243 state = elemint[0]
244 inSym = elemint[1]
245 end = elemint[2]
246 final = elemint[3]
247 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200248 }
249 }
250 case 3:
251 {
Akron740f3d72021-08-03 12:12:34 +0200252 inSym = elemint[0]
253 outSym = elemint[1]
254 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200255 }
256 case 2:
257 {
Akron740f3d72021-08-03 12:12:34 +0200258 inSym = elemint[0]
259 end = elemint[1]
260 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200261 }
262 }
263
Akron8ef408b2021-08-02 22:11:04 +0200264 // While the states in foma start with 0, the states in the
265 // Mizobuchi FSA start with one - so we increase every state by 1.
266
Akron740f3d72021-08-03 12:12:34 +0200267 if inSym != outSym {
268
269 // Allow any epsilon to become a newline
270 if !(inSym == EPSILON && tok.sigmaRev[outSym] == NEWLINE) &&
271
272 // Allow any whitespace to be ignored
273 !(inSym != EPSILON && outSym == EPSILON) &&
274
275 // Allow any whitespace to become a new line
276 !(tok.sigmaRev[outSym] == NEWLINE) {
277
278 log.Error().Msg(
279 "Unsupported transition: " +
280 strconv.Itoa(state) +
281 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200282 " (" +
Akron740f3d72021-08-03 12:12:34 +0200283 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200284 ":" +
Akron740f3d72021-08-03 12:12:34 +0200285 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200286 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200287 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200288 ":" +
Akron740f3d72021-08-03 12:12:34 +0200289 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200290 ")")
Akron740f3d72021-08-03 12:12:34 +0200291 os.Exit(1)
Akron75ebe7f2021-08-03 10:34:10 +0200292 }
Akron8ef408b2021-08-02 22:11:04 +0200293 }
294
Akron740f3d72021-08-03 12:12:34 +0200295 // This collects all edges until arrstate changes
296
Akron8ef408b2021-08-02 22:11:04 +0200297 // TODO:
298 // if arrin == EPSILON && arrout == TOKENEND, mark state as newline
299 // if the next transition is the same, remove TOKENEND and add SENTENCEEND
300 // This requires to remove the transition alltogether and marks the state instead.
301
302 // TODO:
303 // if arrout == EPSILON, mark the transition as NOTOKEN
304
305 targetObj := &edge{
Akron740f3d72021-08-03 12:12:34 +0200306 inSym: inSym,
307 outSym: outSym,
308 end: end + 1,
Akron8ef408b2021-08-02 22:11:04 +0200309 }
310
Akron740f3d72021-08-03 12:12:34 +0200311 // Initialize outgoing states
312 if tok.transitions[state+1] == nil {
313 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200314 }
315
Akron740f3d72021-08-03 12:12:34 +0200316 // Ignore transitions with invalid symbols
317 if inSym >= 0 {
318 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200319 }
Akron8ef408b2021-08-02 22:11:04 +0200320
Akron740f3d72021-08-03 12:12:34 +0200321 // Add final transition
322 if final == 1 {
323 tok.transitions[state+1][FINAL] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200324 }
325
Akron740f3d72021-08-03 12:12:34 +0200326 if DEBUG {
327 fmt.Println("Add",
328 state+1, "->", end+1,
329 "(",
330 inSym,
331 ":",
332 outSym,
333 ") (",
334 string(tok.sigmaRev[inSym]),
335 ":",
336 string(tok.sigmaRev[outSym]),
337 ")")
338 }
Akron75ebe7f2021-08-03 10:34:10 +0200339
Akron8ef408b2021-08-02 22:11:04 +0200340 continue
341 }
342 case SIGMA:
343 {
344 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
345
346 // Turn string into sigma id
347 number, err := strconv.Atoi(elem[0])
348
349 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200350 log.Error().Err(err)
351 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200352 }
353
Akron740f3d72021-08-03 12:12:34 +0200354 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200355
356 var symbol rune
357
358 // Read rune
359 if utf8.RuneCountInString(elem[1]) == 1 {
360 symbol = []rune(elem[1])[0]
361
362 // Probably a MCS
363 } else if utf8.RuneCountInString(elem[1]) > 1 {
364 switch elem[1] {
365 case "@_EPSILON_SYMBOL_@":
366 {
367 EPSILON = number
368 continue
369 }
370 case "@_UNKNOWN_SYMBOL_@":
371 {
372 UNKNOWN = number
373 continue
374 }
375
376 case "@_IDENTITY_SYMBOL_@":
377 {
378 IDENTITY = number
379 continue
380 }
381 default:
Akron740f3d72021-08-03 12:12:34 +0200382 {
383 log.Error().Msg("MCS not supported: " + line)
384 os.Exit(1)
385 }
Akron8ef408b2021-08-02 22:11:04 +0200386 }
387
Akron740f3d72021-08-03 12:12:34 +0200388 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200389 line, err = r.ReadString('\n')
390 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200391 log.Error().Err(err)
392 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200393 }
394 if len(line) != 1 {
Akron740f3d72021-08-03 12:12:34 +0200395 log.Error().Msg("MCS not supported:" + line)
396 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200397 }
Akron740f3d72021-08-03 12:12:34 +0200398 symbol = rune(NEWLINE)
Akron8ef408b2021-08-02 22:11:04 +0200399 }
400
Akron740f3d72021-08-03 12:12:34 +0200401 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200402 }
403 }
404 }
405
406 return tok
407}
408
409// Implementation of Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200410func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
411
412 dat := &DaTokenizer{
Akron6247a5d2021-08-03 19:18:28 +0200413 sigma: make(map[rune]int),
414 loadLevel: -1,
Akronf2120ca2021-08-03 16:26:41 +0200415 }
416
417 for num, sym := range tok.sigmaRev {
418 dat.sigma[sym] = num
419 }
Akron8ef408b2021-08-02 22:11:04 +0200420
421 mark := 0
422 size := 0
423
424 // Create a mapping from s to t
Akron740f3d72021-08-03 12:12:34 +0200425 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200426
427 table[size] = &mapping{source: 1, target: 1}
428 size++
429
Akron740f3d72021-08-03 12:12:34 +0200430 // Allocate space for the outgoing symbol range
431 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200432
433 for mark < size {
434 s := table[mark].source // This is a state in Ms
435 t := table[mark].target // This is a state in Mt
436 mark++
Akron740f3d72021-08-03 12:12:34 +0200437
438 // Following the paper, here the state t can be remembered
439 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200440 A = A[:0]
441 tok.get_set(s, &A)
442
Akron740f3d72021-08-03 12:12:34 +0200443 // Set base to the first free slot in the double array
Akronf2120ca2021-08-03 16:26:41 +0200444 dat.setBase(t, dat.xCheck(A))
Akron8ef408b2021-08-02 22:11:04 +0200445
Akron773b1ef2021-08-03 17:37:20 +0200446 // TODO:
447 // Sort the outgoing transitions based onm the
448 // outdegree of .end
449
Akron740f3d72021-08-03 12:12:34 +0200450 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200451 for _, a := range A {
452
453 if a != FINAL {
Akron8ef408b2021-08-02 22:11:04 +0200454
Akron740f3d72021-08-03 12:12:34 +0200455 // Aka g(s, a)
456 s1 := tok.transitions[s][a].end
Akron8ef408b2021-08-02 22:11:04 +0200457
Akron740f3d72021-08-03 12:12:34 +0200458 // Store the transition
Akronf2120ca2021-08-03 16:26:41 +0200459 t1 := dat.getBase(t) + a
460 dat.setCheck(t1, t)
Akron8ef408b2021-08-02 22:11:04 +0200461
Akron740f3d72021-08-03 12:12:34 +0200462 // Check for representative states
Akron8ef408b2021-08-02 22:11:04 +0200463 r := in_table(s1, table, size)
Akron740f3d72021-08-03 12:12:34 +0200464
Akron8ef408b2021-08-02 22:11:04 +0200465 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200466 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200467 table[size] = &mapping{source: s1, target: t1}
468 size++
469 } else {
Akron740f3d72021-08-03 12:12:34 +0200470 // Overwrite with the representative state
Akronf2120ca2021-08-03 16:26:41 +0200471 dat.setBase(t1, -1*r)
Akron8ef408b2021-08-02 22:11:04 +0200472 }
473 } else {
Akron740f3d72021-08-03 12:12:34 +0200474 // Store a final transition
Akronf2120ca2021-08-03 16:26:41 +0200475 dat.setCheck(dat.getBase(t)+FINAL, t)
Akron8ef408b2021-08-02 22:11:04 +0200476 }
477 }
478 }
479
480 // Following Mizobuchi et al (2000) the size of the
481 // FSA should be stored in check(1).
Akronf2120ca2021-08-03 16:26:41 +0200482 dat.setCheck(1, dat.maxSize+1)
483 dat.array = dat.array[:dat.maxSize+1]
484 return dat
Akron8ef408b2021-08-02 22:11:04 +0200485}
486
Akron740f3d72021-08-03 12:12:34 +0200487// Resize double array when necessary
Akronf2120ca2021-08-03 16:26:41 +0200488func (tok *DaTokenizer) resize(l int) {
Akron740f3d72021-08-03 12:12:34 +0200489 // TODO:
490 // This is a bit too aggressive atm and should be calmed down.
Akron75ebe7f2021-08-03 10:34:10 +0200491 if len(tok.array) <= l {
Akron8ef408b2021-08-02 22:11:04 +0200492 tok.array = append(tok.array, make([]int, l)...)
493 }
494}
495
Akron740f3d72021-08-03 12:12:34 +0200496// Set base value in double array
Akronf2120ca2021-08-03 16:26:41 +0200497func (tok *DaTokenizer) setBase(p int, v int) {
Akron8ef408b2021-08-02 22:11:04 +0200498 l := p*2 + 1
499 tok.resize(l)
Akron740f3d72021-08-03 12:12:34 +0200500 if tok.maxSize < l {
501 tok.maxSize = l
Akron8ef408b2021-08-02 22:11:04 +0200502 }
503 tok.array[p*2] = v
504}
505
Akron740f3d72021-08-03 12:12:34 +0200506// Get base value in double array
Akronf2120ca2021-08-03 16:26:41 +0200507func (tok *DaTokenizer) getBase(p int) int {
Akron8ef408b2021-08-02 22:11:04 +0200508 if p*2 >= len(tok.array) {
509 return 0
510 }
511 return tok.array[p*2]
512}
513
Akron740f3d72021-08-03 12:12:34 +0200514// Set check value in double array
Akronf2120ca2021-08-03 16:26:41 +0200515func (tok *DaTokenizer) setCheck(p int, v int) {
Akron8ef408b2021-08-02 22:11:04 +0200516 l := p*2 + 1
517 tok.resize(l)
Akron740f3d72021-08-03 12:12:34 +0200518 if tok.maxSize < l {
519 tok.maxSize = l
Akron8ef408b2021-08-02 22:11:04 +0200520 }
521 tok.array[(p*2)+1] = v
522}
523
Akron740f3d72021-08-03 12:12:34 +0200524// Get check value in double array
Akronf2120ca2021-08-03 16:26:41 +0200525func (tok *DaTokenizer) getCheck(p int) int {
Akron8ef408b2021-08-02 22:11:04 +0200526 if (p*2)+1 >= len(tok.array) {
527 return 0
528 }
529 return tok.array[(p*2)+1]
530}
531
Akron740f3d72021-08-03 12:12:34 +0200532// Set size of double array
Akronf2120ca2021-08-03 16:26:41 +0200533func (tok *DaTokenizer) setSize(p, v int) {
Akron740f3d72021-08-03 12:12:34 +0200534 tok.setCheck(1, v)
535}
536
537// Get size of double array
Akron773b1ef2021-08-03 17:37:20 +0200538func (tok *DaTokenizer) GetSize(p int) int {
Akron740f3d72021-08-03 12:12:34 +0200539 return tok.getCheck(1)
540}
541
Akron8ef408b2021-08-02 22:11:04 +0200542// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200543// exists and return this as a representative.
544// Currently iterates through the whole table
545// in a bruteforce manner.
Akron8ef408b2021-08-02 22:11:04 +0200546func in_table(s int, table []*mapping, size int) int {
547 for x := 0; x < size; x++ {
548 if table[x].source == s {
549 return table[x].target
550 }
551 }
552 return 0
553}
554
555// Set alphabet A to the list of all symbols
556// outgoing from s
557func (tok *Tokenizer) get_set(s int, A *[]int) {
Akron75ebe7f2021-08-03 10:34:10 +0200558 for a := range tok.transitions[s] {
Akron8ef408b2021-08-02 22:11:04 +0200559 *A = append(*A, a)
560 }
Akronc9d84a62021-08-03 15:56:03 +0200561
562 // Not required, but simplifies bug hunting
563 sort.Ints(*A)
Akron8ef408b2021-08-02 22:11:04 +0200564}
565
566// Based on Mizobuchi et al (2000), p. 124
567// This iterates for every state through the complete double array
568// structure until it finds a gap that fits all outgoing transitions
569// of the state. This is extremely slow, but is only necessary in the
570// construction phase of the tokenizer.
Akronf2120ca2021-08-03 16:26:41 +0200571func (dat *DaTokenizer) xCheck(symbols []int) int {
Akron740f3d72021-08-03 12:12:34 +0200572
573 // Start at the first entry of the double array list
Akron8ef408b2021-08-02 22:11:04 +0200574 base := 1
575
Akron8ef408b2021-08-02 22:11:04 +0200576OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200577
578 // Resize the array if necessary
Akronf2120ca2021-08-03 16:26:41 +0200579 dat.resize((base + FINAL) * 2)
Akron8ef408b2021-08-02 22:11:04 +0200580 for _, a := range symbols {
Akronf2120ca2021-08-03 16:26:41 +0200581 if dat.getCheck(base+a) != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200582 base++
583 goto OVERLAP
584 }
585 }
Akron8ef408b2021-08-02 22:11:04 +0200586 return base
587}
588
Akron773b1ef2021-08-03 17:37:20 +0200589func (dat *DaTokenizer) LoadLevel() float64 {
Akrond66a9262021-08-03 17:09:09 +0200590
Akron773b1ef2021-08-03 17:37:20 +0200591 if dat.loadLevel >= 0 {
592 return dat.loadLevel
593 }
Akrond66a9262021-08-03 17:09:09 +0200594 nonEmpty := 0
595 all := len(dat.array) / 2
596 for x := 1; x <= len(dat.array); x = x + 2 {
597 if dat.array[x] != 0 {
598 nonEmpty++
599 }
600 }
Akron773b1ef2021-08-03 17:37:20 +0200601 dat.loadLevel = float64(nonEmpty) / float64(all) * 100
602 return dat.loadLevel
Akrond66a9262021-08-03 17:09:09 +0200603}
604
Akron6247a5d2021-08-03 19:18:28 +0200605// WriteTo stores the double array data in an io.Writer.
606func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
607
608 // Store magical header
609 all, err := w.Write([]byte(MAGIC))
610 if err != nil {
611 log.Error().Msg("Unable to write data")
612 }
613
614 // Get sigma as a list
615 sigmalist := make([]rune, len(dat.sigma)+16)
616 max := 0
617 for sym, num := range dat.sigma {
618 sigmalist[num] = sym
619 if num > max {
620 max = num
621 }
622 }
623
624 sigmalist = sigmalist[:max+1]
625
626 buf := make([]byte, 0, 12)
627 bo.PutUint16(buf[0:2], VERSION)
628 bo.PutUint16(buf[2:4], uint16(EPSILON))
629 bo.PutUint16(buf[4:6], uint16(UNKNOWN))
630 bo.PutUint16(buf[6:8], uint16(IDENTITY))
631 bo.PutUint16(buf[8:10], uint16(FINAL))
632 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
633 more, err := w.Write(buf[0:12])
634 if err != nil {
635 log.Error().Msg("Unable to write data")
636 }
637
638 all += more
639
640 wbuf := bytes.NewBuffer(nil)
641 wbufWrap := bufio.NewWriter(wbuf)
642
643 // Write sigma
644 for _, sym := range sigmalist {
645 more, err = wbufWrap.WriteRune(sym)
646 if err != nil {
647 log.Error().Msg("Unable to write data")
648 }
649 all += more
650 }
651 wbufWrap.Flush()
652 more, err = w.Write(wbuf.Bytes())
653 if err != nil {
654 log.Error().Msg("Unable to write data")
655 }
656 all += more
657
658 // Test marker - could be checksum
659 more, err = w.Write([]byte("T"))
660 if err != nil {
661 log.Error().Msg("Unable to write data")
662 }
663 all += more
664
665 wbuf.Reset()
666
667 for _, d := range dat.array {
668 bo.PutUint32(buf[0:4], uint32(d))
669 more, err := w.Write(buf[0:4])
670 if err != nil {
671 log.Error().Msg("Unable to write data")
672 }
673 all += more
674 }
675
676 return int64(all), err
677}
678
Akron740f3d72021-08-03 12:12:34 +0200679// Match an input string against the double array
680// FSA.
681//
682// Based on Mizobuchi et al (2000), p. 129,
683// with additional support for IDENTITY, UNKNOWN
684// and EPSILON transitions.
Akronf2120ca2021-08-03 16:26:41 +0200685func (tok *DaTokenizer) Match(input string) bool {
Akron465a0992021-08-03 11:28:48 +0200686 var a int
687 var tu int
688 var ok bool
689
Akron740f3d72021-08-03 12:12:34 +0200690 t := 1 // Initial state
691 chars := []rune(input)
692 i := 0
693
Akron49d27ee2021-08-03 11:58:13 +0200694 for i < len(chars) {
Akron465a0992021-08-03 11:28:48 +0200695 a, ok = tok.sigma[chars[i]]
Akron730a79c2021-08-03 11:05:29 +0200696
Akron740f3d72021-08-03 12:12:34 +0200697 // Support identity symbol if character is not in sigma
Akron730a79c2021-08-03 11:05:29 +0200698 if !ok && IDENTITY != -1 {
Akron740f3d72021-08-03 12:12:34 +0200699 if DEBUG {
700 fmt.Println("IDENTITY symbol", string(chars[i]), "->", IDENTITY)
701 }
Akron730a79c2021-08-03 11:05:29 +0200702 a = IDENTITY
Akron740f3d72021-08-03 12:12:34 +0200703 } else if DEBUG {
Akron49d27ee2021-08-03 11:58:13 +0200704 fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
Akron730a79c2021-08-03 11:05:29 +0200705 }
Akron465a0992021-08-03 11:28:48 +0200706 tu = t
Akron730a79c2021-08-03 11:05:29 +0200707 CHECK:
Akron740f3d72021-08-03 12:12:34 +0200708 t = tok.getBase(tu) + a
Akron730a79c2021-08-03 11:05:29 +0200709
Akron740f3d72021-08-03 12:12:34 +0200710 // Check if the transition is valid according to the double array
711 if t > tok.getCheck(1) || tok.getCheck(t) != tu {
712
713 if DEBUG {
714 fmt.Println("Match is not fine!", t, "and", tok.getCheck(t), "vs", tu)
Akron730a79c2021-08-03 11:05:29 +0200715 }
Akron740f3d72021-08-03 12:12:34 +0200716
717 if !ok && a == IDENTITY {
718 // Try again with unknown symbol, in case identity failed
719 if DEBUG {
720 fmt.Println("UNKNOWN symbol", string(chars[i]), "->", UNKNOWN)
721 }
722 a = UNKNOWN
723
724 } else if a != EPSILON {
725 // Try again with epsilon symbol, in case everything else failed
726 if DEBUG {
727 fmt.Println("EPSILON symbol", string(chars[i]), "->", EPSILON)
728 }
729 a = EPSILON
730 } else {
731 break
732 }
733 goto CHECK
734 } else if tok.getBase(t) < 0 {
Akron730a79c2021-08-03 11:05:29 +0200735 // Move to representative state
Akron740f3d72021-08-03 12:12:34 +0200736 t = -1 * tok.getBase(t)
Akron8ef408b2021-08-02 22:11:04 +0200737 }
Akron740f3d72021-08-03 12:12:34 +0200738
739 // Transition is fine
Akron49d27ee2021-08-03 11:58:13 +0200740 if a != EPSILON {
Akron740f3d72021-08-03 12:12:34 +0200741 // Character consumed
Akron49d27ee2021-08-03 11:58:13 +0200742 i++
743 }
Akron740f3d72021-08-03 12:12:34 +0200744 // TODO:
745 // Prevent endless epsilon loops!
Akron8ef408b2021-08-02 22:11:04 +0200746 }
747
Akron740f3d72021-08-03 12:12:34 +0200748 if i != len(chars) {
749 if DEBUG {
750 fmt.Println("Not at the end")
751 }
Akron8ef408b2021-08-02 22:11:04 +0200752 return false
753 }
754
Akron465a0992021-08-03 11:28:48 +0200755FINALCHECK:
Akron740f3d72021-08-03 12:12:34 +0200756
757 // Automaton is in a final state
758 if tok.getCheck(tok.getBase(t)+FINAL) == t {
Akron8ef408b2021-08-02 22:11:04 +0200759 return true
760 }
Akron465a0992021-08-03 11:28:48 +0200761
Akron740f3d72021-08-03 12:12:34 +0200762 // Check epsilon transitions until a final state is reached
Akron465a0992021-08-03 11:28:48 +0200763 tu = t
Akron773b1ef2021-08-03 17:37:20 +0200764 t = tok.getBase(tu) + EPSILON
Akron465a0992021-08-03 11:28:48 +0200765
Akron740f3d72021-08-03 12:12:34 +0200766 // Epsilon transition failed
767 if t > tok.getCheck(1) || tok.getCheck(t) != tu {
768 if DEBUG {
769 fmt.Println("Match is not fine!", t, "and", tok.getCheck(t), "vs", tu)
770 }
Akron465a0992021-08-03 11:28:48 +0200771 return false
Akron740f3d72021-08-03 12:12:34 +0200772
773 } else if tok.getBase(t) < 0 {
Akron465a0992021-08-03 11:28:48 +0200774 // Move to representative state
Akron740f3d72021-08-03 12:12:34 +0200775 t = -1 * tok.getBase(t)
Akron465a0992021-08-03 11:28:48 +0200776 }
Akron740f3d72021-08-03 12:12:34 +0200777
Akron465a0992021-08-03 11:28:48 +0200778 goto FINALCHECK
Akron8ef408b2021-08-02 22:11:04 +0200779}