blob: ca18618cfd6cf5279ba46754d9cd80a88b141ad2 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akron740f3d72021-08-03 12:12:34 +02009// TODO:
10// - replace maxSize with the check value
11// - Strip first state and make everything start with 0!
Akron740f3d72021-08-03 12:12:34 +020012
Akron8ef408b2021-08-02 22:11:04 +020013import (
14 "bufio"
Akron6247a5d2021-08-03 19:18:28 +020015 "bytes"
Akron8ef408b2021-08-02 22:11:04 +020016 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020017 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020018 "fmt"
19 "io"
20 "os"
Akronc9d84a62021-08-03 15:56:03 +020021 "sort"
Akron8ef408b2021-08-02 22:11:04 +020022 "strconv"
23 "strings"
24 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020025
26 "github.com/rs/zerolog/log"
Akron8ef408b2021-08-02 22:11:04 +020027)
28
29const (
Akron3fdfec62021-08-04 11:40:10 +020030 PROPS = 1
31 SIGMA = 2
32 STATES = 3
33 NONE = 4
34 NEWLINE = '\u000a'
35 DEBUG = false
36 MAGIC = "DATOK"
37 VERSION = uint16(1)
38 leadingBit uint32 = 1 << 31
39 restBit uint32 = ^uint32(0) &^ (1 << 31)
Akron8ef408b2021-08-02 22:11:04 +020040)
41
Akron6247a5d2021-08-03 19:18:28 +020042var bo binary.ByteOrder = binary.LittleEndian
43
Akron8ef408b2021-08-02 22:11:04 +020044type mapping struct {
45 source int
Akron3fdfec62021-08-04 11:40:10 +020046 target uint32
Akron8ef408b2021-08-02 22:11:04 +020047}
48
49type edge struct {
Akron740f3d72021-08-03 12:12:34 +020050 inSym int
51 outSym int
52 end int
Akron8ef408b2021-08-02 22:11:04 +020053}
54
55type Tokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020056 // sigma map[rune]int
Akron740f3d72021-08-03 12:12:34 +020057 sigmaRev map[int]rune
58 arcCount int
59 stateCount int
60 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020061 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020062
63 // Special symbols in sigma
64 epsilon int
65 unknown int
66 identity int
67 final int
Akron8ef408b2021-08-02 22:11:04 +020068}
69
Akronf2120ca2021-08-03 16:26:41 +020070type DaTokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020071 // sigmaRev map[int]rune
Akron773b1ef2021-08-03 17:37:20 +020072 sigma map[rune]int
73 maxSize int
74 loadLevel float64
Akron3fdfec62021-08-04 11:40:10 +020075 array []uint32
Akronc17f1ca2021-08-03 19:47:27 +020076
77 // Special symbols in sigma
78 epsilon int
79 unknown int
80 identity int
81 final int
Akronf2120ca2021-08-03 16:26:41 +020082}
83
Akron64ffd9a2021-08-03 19:55:21 +020084func LoadFomaFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +020085 f, err := os.Open(file)
86 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020087 log.Error().Err(err)
88 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020089 }
90 defer f.Close()
91
92 gz, err := gzip.NewReader(f)
93 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020094 log.Error().Err(err)
95 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020096 }
97 defer gz.Close()
98
Akron3fdfec62021-08-04 11:40:10 +020099 return ParseFoma(gz)
Akron8ef408b2021-08-02 22:11:04 +0200100}
101
Akron3fdfec62021-08-04 11:40:10 +0200102func ParseFoma(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200103 r := bufio.NewReader(ior)
104
105 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200106 sigmaRev: make(map[int]rune),
Akronc17f1ca2021-08-03 19:47:27 +0200107 epsilon: -1,
108 unknown: -1,
109 identity: -1,
110 final: -1,
Akron8ef408b2021-08-02 22:11:04 +0200111 }
112
Akron740f3d72021-08-03 12:12:34 +0200113 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200114
115 mode := 0
116 var elem []string
117 var elemint [5]int
118
119 for {
120 line, err := r.ReadString('\n')
121 if err != nil {
122 if err == io.EOF {
123 break
124 }
Akron740f3d72021-08-03 12:12:34 +0200125 log.Error().Err(err)
126 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200127 }
128 if strings.HasPrefix(line, "##foma-net") {
129 continue
130 }
131 if strings.HasPrefix(line, "##props##") {
132 mode = PROPS
133 continue
134 }
135 if strings.HasPrefix(line, "##states##") {
136 mode = STATES
137
138 // Adds a final transition symbol to sigma
139 // written as '#' in Mizobuchi et al (2000)
Akron740f3d72021-08-03 12:12:34 +0200140 tok.sigmaCount++
Akronc17f1ca2021-08-03 19:47:27 +0200141 tok.final = tok.sigmaCount
Akron8ef408b2021-08-02 22:11:04 +0200142 continue
143 }
144 if strings.HasPrefix(line, "##sigma##") {
145 mode = SIGMA
146 continue
147 }
148 if strings.HasPrefix(line, "##end##") {
149 mode = NONE
150 continue
151 }
152
153 switch mode {
154 case PROPS:
155 {
156 elem = strings.Split(line, " ")
157 /*
158 fmt.Println("arity: " + elem[0])
159 fmt.Println("arccount: " + elem[1])
160 fmt.Println("statecount: " + elem[2])
161 fmt.Println("linecount: " + elem[3])
162 fmt.Println("finalcount: " + elem[4])
163 fmt.Println("pathcount: " + elem[5])
164 fmt.Println("is_deterministic: " + elem[6])
165 fmt.Println("is_pruned: " + elem[7])
166 fmt.Println("is_minimized: " + elem[8])
167 fmt.Println("is_epsilon_free: " + elem[9])
168 fmt.Println("is_loop_free: " + elem[10])
169 fmt.Println("extras: " + elem[11])
170 fmt.Println("name: " + elem[12])
171 */
172 if elem[6] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200173 log.Error().Msg("The FST needs to be deterministic")
174 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200175 }
176 if elem[9] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200177 log.Error().Msg("The FST needs to be epsilon free")
178 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200179 }
180
181 elemint[0], err = strconv.Atoi(elem[1])
182 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200183 log.Error().Msg("Can't read arccount")
184 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200185 }
Akron740f3d72021-08-03 12:12:34 +0200186 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200187
188 // States start at 1 in Mizobuchi et al (2000),
189 // as the state 0 is associated with a fail.
190 // Initialize states and transitions
191 elemint[0], err = strconv.Atoi(elem[2])
192 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200193 log.Error().Msg("Can't read statecount")
194 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200195 }
Akron740f3d72021-08-03 12:12:34 +0200196 tok.stateCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200197 tok.transitions = make([]map[int]*edge, elemint[0]+1)
198 continue
199 }
200 case STATES:
201 {
202 elem = strings.Split(line[0:len(line)-1], " ")
203 if elem[0] == "-1" {
204 continue
205 }
206 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200207 if err != nil {
208 break
209 }
Akron8ef408b2021-08-02 22:11:04 +0200210
211 if len(elem) > 1 {
212 elemint[1], err = strconv.Atoi(elem[1])
213 if err != nil {
214 break
215 }
216 if len(elem) > 2 {
217 elemint[2], err = strconv.Atoi(elem[2])
218 if err != nil {
219 break
220 }
221 if len(elem) > 3 {
222 elemint[3], err = strconv.Atoi(elem[3])
223 if err != nil {
224 break
225 }
226 if len(elem) > 4 {
227 elemint[4], err = strconv.Atoi(elem[4])
228 if err != nil {
229 break
230 }
231 }
232 }
233 }
234 }
235
236 switch len(elem) {
237 case 5:
238 {
Akron740f3d72021-08-03 12:12:34 +0200239 state = elemint[0]
240 inSym = elemint[1]
241 outSym = elemint[2]
242 end = elemint[3]
243 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200244 }
245 case 4:
246 {
247 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200248 state = elemint[0]
249 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200250 } else {
Akron740f3d72021-08-03 12:12:34 +0200251 state = elemint[0]
252 inSym = elemint[1]
253 end = elemint[2]
254 final = elemint[3]
255 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200256 }
257 }
258 case 3:
259 {
Akron740f3d72021-08-03 12:12:34 +0200260 inSym = elemint[0]
261 outSym = elemint[1]
262 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200263 }
264 case 2:
265 {
Akron740f3d72021-08-03 12:12:34 +0200266 inSym = elemint[0]
267 end = elemint[1]
268 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200269 }
270 }
271
Akron8ef408b2021-08-02 22:11:04 +0200272 // While the states in foma start with 0, the states in the
273 // Mizobuchi FSA start with one - so we increase every state by 1.
274
Akron740f3d72021-08-03 12:12:34 +0200275 if inSym != outSym {
276
277 // Allow any epsilon to become a newline
Akronc17f1ca2021-08-03 19:47:27 +0200278 if !(inSym == tok.epsilon && tok.sigmaRev[outSym] == NEWLINE) &&
Akron740f3d72021-08-03 12:12:34 +0200279
280 // Allow any whitespace to be ignored
Akronc17f1ca2021-08-03 19:47:27 +0200281 !(inSym != tok.epsilon && outSym == tok.epsilon) &&
Akron740f3d72021-08-03 12:12:34 +0200282
283 // Allow any whitespace to become a new line
284 !(tok.sigmaRev[outSym] == NEWLINE) {
285
286 log.Error().Msg(
287 "Unsupported transition: " +
288 strconv.Itoa(state) +
289 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200290 " (" +
Akron740f3d72021-08-03 12:12:34 +0200291 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200292 ":" +
Akron740f3d72021-08-03 12:12:34 +0200293 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200294 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200295 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200296 ":" +
Akron740f3d72021-08-03 12:12:34 +0200297 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200298 ")")
Akron740f3d72021-08-03 12:12:34 +0200299 os.Exit(1)
Akron75ebe7f2021-08-03 10:34:10 +0200300 }
Akron8ef408b2021-08-02 22:11:04 +0200301 }
302
Akron740f3d72021-08-03 12:12:34 +0200303 // This collects all edges until arrstate changes
304
Akron8ef408b2021-08-02 22:11:04 +0200305 // TODO:
306 // if arrin == EPSILON && arrout == TOKENEND, mark state as newline
307 // if the next transition is the same, remove TOKENEND and add SENTENCEEND
308 // This requires to remove the transition alltogether and marks the state instead.
309
310 // TODO:
311 // if arrout == EPSILON, mark the transition as NOTOKEN
312
313 targetObj := &edge{
Akron740f3d72021-08-03 12:12:34 +0200314 inSym: inSym,
315 outSym: outSym,
316 end: end + 1,
Akron8ef408b2021-08-02 22:11:04 +0200317 }
318
Akron740f3d72021-08-03 12:12:34 +0200319 // Initialize outgoing states
320 if tok.transitions[state+1] == nil {
321 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200322 }
323
Akron740f3d72021-08-03 12:12:34 +0200324 // Ignore transitions with invalid symbols
325 if inSym >= 0 {
326 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200327 }
Akron8ef408b2021-08-02 22:11:04 +0200328
Akron740f3d72021-08-03 12:12:34 +0200329 // Add final transition
330 if final == 1 {
Akronc17f1ca2021-08-03 19:47:27 +0200331 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200332 }
333
Akron740f3d72021-08-03 12:12:34 +0200334 if DEBUG {
335 fmt.Println("Add",
336 state+1, "->", end+1,
337 "(",
338 inSym,
339 ":",
340 outSym,
341 ") (",
342 string(tok.sigmaRev[inSym]),
343 ":",
344 string(tok.sigmaRev[outSym]),
345 ")")
346 }
Akron75ebe7f2021-08-03 10:34:10 +0200347
Akron8ef408b2021-08-02 22:11:04 +0200348 continue
349 }
350 case SIGMA:
351 {
352 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
353
354 // Turn string into sigma id
355 number, err := strconv.Atoi(elem[0])
356
357 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200358 log.Error().Err(err)
359 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200360 }
361
Akron740f3d72021-08-03 12:12:34 +0200362 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200363
364 var symbol rune
365
366 // Read rune
367 if utf8.RuneCountInString(elem[1]) == 1 {
368 symbol = []rune(elem[1])[0]
369
370 // Probably a MCS
371 } else if utf8.RuneCountInString(elem[1]) > 1 {
372 switch elem[1] {
373 case "@_EPSILON_SYMBOL_@":
374 {
Akronc17f1ca2021-08-03 19:47:27 +0200375 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200376 continue
377 }
378 case "@_UNKNOWN_SYMBOL_@":
379 {
Akronc17f1ca2021-08-03 19:47:27 +0200380 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200381 continue
382 }
383
384 case "@_IDENTITY_SYMBOL_@":
385 {
Akronc17f1ca2021-08-03 19:47:27 +0200386 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200387 continue
388 }
389 default:
Akron740f3d72021-08-03 12:12:34 +0200390 {
391 log.Error().Msg("MCS not supported: " + line)
392 os.Exit(1)
393 }
Akron8ef408b2021-08-02 22:11:04 +0200394 }
395
Akron740f3d72021-08-03 12:12:34 +0200396 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200397 line, err = r.ReadString('\n')
398 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200399 log.Error().Err(err)
400 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200401 }
402 if len(line) != 1 {
Akron740f3d72021-08-03 12:12:34 +0200403 log.Error().Msg("MCS not supported:" + line)
404 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200405 }
Akron740f3d72021-08-03 12:12:34 +0200406 symbol = rune(NEWLINE)
Akron8ef408b2021-08-02 22:11:04 +0200407 }
408
Akron740f3d72021-08-03 12:12:34 +0200409 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200410 }
411 }
412 }
413
414 return tok
415}
416
Akron64ffd9a2021-08-03 19:55:21 +0200417// Set alphabet A to the list of all symbols
418// outgoing from s
419func (tok *Tokenizer) get_set(s int, A *[]int) {
420 for a := range tok.transitions[s] {
421 *A = append(*A, a)
422 }
423
424 // Not required, but simplifies bug hunting
425 sort.Ints(*A)
426}
427
Akron8ef408b2021-08-02 22:11:04 +0200428// Implementation of Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200429func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
430
431 dat := &DaTokenizer{
Akron6247a5d2021-08-03 19:18:28 +0200432 sigma: make(map[rune]int),
433 loadLevel: -1,
Akronc17f1ca2021-08-03 19:47:27 +0200434 final: tok.final,
435 unknown: tok.unknown,
436 identity: tok.identity,
437 epsilon: tok.epsilon,
Akronf2120ca2021-08-03 16:26:41 +0200438 }
439
440 for num, sym := range tok.sigmaRev {
441 dat.sigma[sym] = num
442 }
Akron8ef408b2021-08-02 22:11:04 +0200443
444 mark := 0
445 size := 0
446
447 // Create a mapping from s to t
Akron740f3d72021-08-03 12:12:34 +0200448 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200449
450 table[size] = &mapping{source: 1, target: 1}
451 size++
452
Akron740f3d72021-08-03 12:12:34 +0200453 // Allocate space for the outgoing symbol range
454 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200455
456 for mark < size {
457 s := table[mark].source // This is a state in Ms
458 t := table[mark].target // This is a state in Mt
459 mark++
Akron740f3d72021-08-03 12:12:34 +0200460
461 // Following the paper, here the state t can be remembered
462 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200463 A = A[:0]
464 tok.get_set(s, &A)
465
Akron740f3d72021-08-03 12:12:34 +0200466 // Set base to the first free slot in the double array
Akronf2120ca2021-08-03 16:26:41 +0200467 dat.setBase(t, dat.xCheck(A))
Akron8ef408b2021-08-02 22:11:04 +0200468
Akron773b1ef2021-08-03 17:37:20 +0200469 // TODO:
470 // Sort the outgoing transitions based onm the
471 // outdegree of .end
472
Akron740f3d72021-08-03 12:12:34 +0200473 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200474 for _, a := range A {
475
Akronc17f1ca2021-08-03 19:47:27 +0200476 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200477
Akron740f3d72021-08-03 12:12:34 +0200478 // Aka g(s, a)
479 s1 := tok.transitions[s][a].end
Akron8ef408b2021-08-02 22:11:04 +0200480
Akron740f3d72021-08-03 12:12:34 +0200481 // Store the transition
Akron3fdfec62021-08-04 11:40:10 +0200482 t1 := dat.getBase(t) + uint32(a)
Akronf2120ca2021-08-03 16:26:41 +0200483 dat.setCheck(t1, t)
Akron8ef408b2021-08-02 22:11:04 +0200484
Akron740f3d72021-08-03 12:12:34 +0200485 // Check for representative states
Akron8ef408b2021-08-02 22:11:04 +0200486 r := in_table(s1, table, size)
Akron740f3d72021-08-03 12:12:34 +0200487
Akron8ef408b2021-08-02 22:11:04 +0200488 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200489 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200490 table[size] = &mapping{source: s1, target: t1}
491 size++
492 } else {
Akron740f3d72021-08-03 12:12:34 +0200493 // Overwrite with the representative state
Akron3fdfec62021-08-04 11:40:10 +0200494 dat.setBase(t1, r)
495 dat.setSeparate(t1, true)
Akron8ef408b2021-08-02 22:11:04 +0200496 }
497 } else {
Akron740f3d72021-08-03 12:12:34 +0200498 // Store a final transition
Akron3fdfec62021-08-04 11:40:10 +0200499 dat.setCheck(dat.getBase(t)+uint32(dat.final), t)
Akron8ef408b2021-08-02 22:11:04 +0200500 }
501 }
502 }
503
504 // Following Mizobuchi et al (2000) the size of the
505 // FSA should be stored in check(1).
Akron3fdfec62021-08-04 11:40:10 +0200506 dat.setSize(dat.maxSize + 1)
Akronf2120ca2021-08-03 16:26:41 +0200507 dat.array = dat.array[:dat.maxSize+1]
508 return dat
Akron8ef408b2021-08-02 22:11:04 +0200509}
510
Akron8ef408b2021-08-02 22:11:04 +0200511// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200512// exists and return this as a representative.
513// Currently iterates through the whole table
514// in a bruteforce manner.
Akron3fdfec62021-08-04 11:40:10 +0200515func in_table(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200516 for x := 0; x < size; x++ {
517 if table[x].source == s {
518 return table[x].target
519 }
520 }
521 return 0
522}
523
Akron64ffd9a2021-08-03 19:55:21 +0200524// Resize double array when necessary
525func (dat *DaTokenizer) resize(l int) {
526 // TODO:
527 // This is a bit too aggressive atm and should be calmed down.
528 if len(dat.array) <= l {
Akron3fdfec62021-08-04 11:40:10 +0200529 dat.array = append(dat.array, make([]uint32, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200530 }
Akron64ffd9a2021-08-03 19:55:21 +0200531}
Akronc9d84a62021-08-03 15:56:03 +0200532
Akron64ffd9a2021-08-03 19:55:21 +0200533// Set base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200534func (dat *DaTokenizer) setBase(p uint32, v uint32) {
535 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200536 dat.resize(l)
537 if dat.maxSize < l {
538 dat.maxSize = l
539 }
540 dat.array[p*2] = v
541}
542
Akron3fdfec62021-08-04 11:40:10 +0200543// Returns true if a state is separate pointing to a representative
544func (dat *DaTokenizer) isSeparate(p uint32) bool {
545 return dat.array[p*2]&leadingBit != 0
546}
547
548// Mark a state as separate pointing to a representative
549func (dat *DaTokenizer) setSeparate(p uint32, sep bool) {
550 if sep {
551 dat.array[p*2] |= leadingBit
552 } else {
553 dat.array[p*2] &= restBit
554 }
555}
556
Akron64ffd9a2021-08-03 19:55:21 +0200557// Get base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200558func (dat *DaTokenizer) getBase(p uint32) uint32 {
559 if int(p*2) >= len(dat.array) {
Akron64ffd9a2021-08-03 19:55:21 +0200560 return 0
561 }
Akron3fdfec62021-08-04 11:40:10 +0200562 return dat.array[p*2] & restBit
Akron64ffd9a2021-08-03 19:55:21 +0200563}
564
565// Set check value in double array
Akron3fdfec62021-08-04 11:40:10 +0200566func (dat *DaTokenizer) setCheck(p uint32, v uint32) {
567 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200568 dat.resize(l)
569 if dat.maxSize < l {
570 dat.maxSize = l
571 }
572 dat.array[(p*2)+1] = v
573}
574
575// Get check value in double array
Akron3fdfec62021-08-04 11:40:10 +0200576func (dat *DaTokenizer) getCheck(p uint32) uint32 {
577 if int((p*2)+1) >= len(dat.array) {
Akron64ffd9a2021-08-03 19:55:21 +0200578 return 0
579 }
Akron3fdfec62021-08-04 11:40:10 +0200580 return dat.array[(p*2)+1] & restBit
Akron64ffd9a2021-08-03 19:55:21 +0200581}
582
583// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200584func (dat *DaTokenizer) setSize(v int) {
585 dat.setCheck(1, uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200586}
587
588// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200589func (dat *DaTokenizer) GetSize() int {
590 return int(dat.getCheck(1))
Akron8ef408b2021-08-02 22:11:04 +0200591}
592
593// Based on Mizobuchi et al (2000), p. 124
594// This iterates for every state through the complete double array
595// structure until it finds a gap that fits all outgoing transitions
596// of the state. This is extremely slow, but is only necessary in the
597// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200598func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200599
600 // Start at the first entry of the double array list
Akron3fdfec62021-08-04 11:40:10 +0200601 base := uint32(1)
Akron8ef408b2021-08-02 22:11:04 +0200602
Akron8ef408b2021-08-02 22:11:04 +0200603OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200604
605 // Resize the array if necessary
Akron3fdfec62021-08-04 11:40:10 +0200606 dat.resize((int(base) + dat.final) * 2)
Akron8ef408b2021-08-02 22:11:04 +0200607 for _, a := range symbols {
Akron3fdfec62021-08-04 11:40:10 +0200608 if dat.getCheck(base+uint32(a)) != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200609 base++
610 goto OVERLAP
611 }
612 }
Akron8ef408b2021-08-02 22:11:04 +0200613 return base
614}
615
Akron773b1ef2021-08-03 17:37:20 +0200616func (dat *DaTokenizer) LoadLevel() float64 {
Akrond66a9262021-08-03 17:09:09 +0200617
Akron773b1ef2021-08-03 17:37:20 +0200618 if dat.loadLevel >= 0 {
619 return dat.loadLevel
620 }
Akrond66a9262021-08-03 17:09:09 +0200621 nonEmpty := 0
622 all := len(dat.array) / 2
623 for x := 1; x <= len(dat.array); x = x + 2 {
624 if dat.array[x] != 0 {
625 nonEmpty++
626 }
627 }
Akron773b1ef2021-08-03 17:37:20 +0200628 dat.loadLevel = float64(nonEmpty) / float64(all) * 100
629 return dat.loadLevel
Akrond66a9262021-08-03 17:09:09 +0200630}
631
Akron6247a5d2021-08-03 19:18:28 +0200632// WriteTo stores the double array data in an io.Writer.
633func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
634
635 // Store magical header
636 all, err := w.Write([]byte(MAGIC))
637 if err != nil {
638 log.Error().Msg("Unable to write data")
639 }
640
641 // Get sigma as a list
642 sigmalist := make([]rune, len(dat.sigma)+16)
643 max := 0
644 for sym, num := range dat.sigma {
645 sigmalist[num] = sym
646 if num > max {
647 max = num
648 }
649 }
650
651 sigmalist = sigmalist[:max+1]
652
653 buf := make([]byte, 0, 12)
654 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200655 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
656 bo.PutUint16(buf[4:6], uint16(dat.unknown))
657 bo.PutUint16(buf[6:8], uint16(dat.identity))
658 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200659 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
660 more, err := w.Write(buf[0:12])
661 if err != nil {
662 log.Error().Msg("Unable to write data")
663 }
664
665 all += more
666
667 wbuf := bytes.NewBuffer(nil)
668 wbufWrap := bufio.NewWriter(wbuf)
669
670 // Write sigma
671 for _, sym := range sigmalist {
672 more, err = wbufWrap.WriteRune(sym)
673 if err != nil {
674 log.Error().Msg("Unable to write data")
675 }
676 all += more
677 }
678 wbufWrap.Flush()
679 more, err = w.Write(wbuf.Bytes())
680 if err != nil {
681 log.Error().Msg("Unable to write data")
682 }
683 all += more
684
685 // Test marker - could be checksum
686 more, err = w.Write([]byte("T"))
687 if err != nil {
688 log.Error().Msg("Unable to write data")
689 }
690 all += more
691
692 wbuf.Reset()
693
694 for _, d := range dat.array {
Akron3fdfec62021-08-04 11:40:10 +0200695 bo.PutUint32(buf[0:4], d)
Akron6247a5d2021-08-03 19:18:28 +0200696 more, err := w.Write(buf[0:4])
697 if err != nil {
698 log.Error().Msg("Unable to write data")
699 }
700 all += more
701 }
702
703 return int64(all), err
704}
705
Akron740f3d72021-08-03 12:12:34 +0200706// Match an input string against the double array
707// FSA.
708//
709// Based on Mizobuchi et al (2000), p. 129,
710// with additional support for IDENTITY, UNKNOWN
711// and EPSILON transitions.
Akron64ffd9a2021-08-03 19:55:21 +0200712func (dat *DaTokenizer) Match(input string) bool {
Akron465a0992021-08-03 11:28:48 +0200713 var a int
Akron3fdfec62021-08-04 11:40:10 +0200714 var tu uint32
Akron465a0992021-08-03 11:28:48 +0200715 var ok bool
716
Akron3fdfec62021-08-04 11:40:10 +0200717 t := uint32(1) // Initial state
Akron740f3d72021-08-03 12:12:34 +0200718 chars := []rune(input)
719 i := 0
720
Akron49d27ee2021-08-03 11:58:13 +0200721 for i < len(chars) {
Akron64ffd9a2021-08-03 19:55:21 +0200722 a, ok = dat.sigma[chars[i]]
Akron730a79c2021-08-03 11:05:29 +0200723
Akron740f3d72021-08-03 12:12:34 +0200724 // Support identity symbol if character is not in sigma
Akron64ffd9a2021-08-03 19:55:21 +0200725 if !ok && dat.identity != -1 {
Akron740f3d72021-08-03 12:12:34 +0200726 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200727 fmt.Println("IDENTITY symbol", string(chars[i]), "->", dat.identity)
Akron740f3d72021-08-03 12:12:34 +0200728 }
Akron64ffd9a2021-08-03 19:55:21 +0200729 a = dat.identity
Akron740f3d72021-08-03 12:12:34 +0200730 } else if DEBUG {
Akron49d27ee2021-08-03 11:58:13 +0200731 fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
Akron730a79c2021-08-03 11:05:29 +0200732 }
Akron465a0992021-08-03 11:28:48 +0200733 tu = t
Akron730a79c2021-08-03 11:05:29 +0200734 CHECK:
Akron3fdfec62021-08-04 11:40:10 +0200735 t = dat.getBase(tu) + uint32(a)
Akron730a79c2021-08-03 11:05:29 +0200736
Akron740f3d72021-08-03 12:12:34 +0200737 // Check if the transition is valid according to the double array
Akron64ffd9a2021-08-03 19:55:21 +0200738 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +0200739
740 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200741 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron730a79c2021-08-03 11:05:29 +0200742 }
Akron740f3d72021-08-03 12:12:34 +0200743
Akron64ffd9a2021-08-03 19:55:21 +0200744 if !ok && a == dat.identity {
Akron740f3d72021-08-03 12:12:34 +0200745 // Try again with unknown symbol, in case identity failed
746 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200747 fmt.Println("UNKNOWN symbol", string(chars[i]), "->", dat.unknown)
Akron740f3d72021-08-03 12:12:34 +0200748 }
Akron64ffd9a2021-08-03 19:55:21 +0200749 a = dat.unknown
Akron740f3d72021-08-03 12:12:34 +0200750
Akron64ffd9a2021-08-03 19:55:21 +0200751 } else if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +0200752 // Try again with epsilon symbol, in case everything else failed
753 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200754 fmt.Println("EPSILON symbol", string(chars[i]), "->", dat.epsilon)
Akron740f3d72021-08-03 12:12:34 +0200755 }
Akron64ffd9a2021-08-03 19:55:21 +0200756 a = dat.epsilon
Akron740f3d72021-08-03 12:12:34 +0200757 } else {
758 break
759 }
760 goto CHECK
Akron3fdfec62021-08-04 11:40:10 +0200761 } else if dat.isSeparate(t) {
Akron730a79c2021-08-03 11:05:29 +0200762 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +0200763 t = dat.getBase(t)
Akron8ef408b2021-08-02 22:11:04 +0200764 }
Akron740f3d72021-08-03 12:12:34 +0200765
766 // Transition is fine
Akron64ffd9a2021-08-03 19:55:21 +0200767 if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +0200768 // Character consumed
Akron49d27ee2021-08-03 11:58:13 +0200769 i++
770 }
Akron740f3d72021-08-03 12:12:34 +0200771 // TODO:
772 // Prevent endless epsilon loops!
Akron8ef408b2021-08-02 22:11:04 +0200773 }
774
Akron740f3d72021-08-03 12:12:34 +0200775 if i != len(chars) {
776 if DEBUG {
777 fmt.Println("Not at the end")
778 }
Akron8ef408b2021-08-02 22:11:04 +0200779 return false
780 }
781
Akron465a0992021-08-03 11:28:48 +0200782FINALCHECK:
Akron740f3d72021-08-03 12:12:34 +0200783
784 // Automaton is in a final state
Akron3fdfec62021-08-04 11:40:10 +0200785 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
Akron8ef408b2021-08-02 22:11:04 +0200786 return true
787 }
Akron465a0992021-08-03 11:28:48 +0200788
Akron740f3d72021-08-03 12:12:34 +0200789 // Check epsilon transitions until a final state is reached
Akron465a0992021-08-03 11:28:48 +0200790 tu = t
Akron3fdfec62021-08-04 11:40:10 +0200791 t = dat.getBase(tu) + uint32(dat.epsilon)
Akron465a0992021-08-03 11:28:48 +0200792
Akron740f3d72021-08-03 12:12:34 +0200793 // Epsilon transition failed
Akron64ffd9a2021-08-03 19:55:21 +0200794 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +0200795 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200796 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron740f3d72021-08-03 12:12:34 +0200797 }
Akron465a0992021-08-03 11:28:48 +0200798 return false
Akron740f3d72021-08-03 12:12:34 +0200799
Akron3fdfec62021-08-04 11:40:10 +0200800 } else if dat.isSeparate(t) {
Akron465a0992021-08-03 11:28:48 +0200801 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +0200802 t = dat.getBase(t)
Akron465a0992021-08-03 11:28:48 +0200803 }
Akron740f3d72021-08-03 12:12:34 +0200804
Akron465a0992021-08-03 11:28:48 +0200805 goto FINALCHECK
Akron8ef408b2021-08-02 22:11:04 +0200806}