blob: 39596b9d9f0c672db40f78529172334ff36b2ad9 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akron83e75a22021-08-04 13:14:06 +02009// The maximum number of states is 107.3741.823 (30bit),
10// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
13
Akron740f3d72021-08-03 12:12:34 +020014// TODO:
15// - replace maxSize with the check value
16// - Strip first state and make everything start with 0!
Akron740f3d72021-08-03 12:12:34 +020017
Akron8ef408b2021-08-02 22:11:04 +020018import (
19 "bufio"
Akron6247a5d2021-08-03 19:18:28 +020020 "bytes"
Akron8ef408b2021-08-02 22:11:04 +020021 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020022 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020023 "fmt"
24 "io"
25 "os"
Akronc9d84a62021-08-03 15:56:03 +020026 "sort"
Akron8ef408b2021-08-02 22:11:04 +020027 "strconv"
28 "strings"
29 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020030
31 "github.com/rs/zerolog/log"
Akron8ef408b2021-08-02 22:11:04 +020032)
33
34const (
Akron3fdfec62021-08-04 11:40:10 +020035 PROPS = 1
36 SIGMA = 2
37 STATES = 3
38 NONE = 4
39 NEWLINE = '\u000a'
40 DEBUG = false
41 MAGIC = "DATOK"
42 VERSION = uint16(1)
43 leadingBit uint32 = 1 << 31
44 restBit uint32 = ^uint32(0) &^ (1 << 31)
Akron8ef408b2021-08-02 22:11:04 +020045)
46
Akron6247a5d2021-08-03 19:18:28 +020047var bo binary.ByteOrder = binary.LittleEndian
48
Akron8ef408b2021-08-02 22:11:04 +020049type mapping struct {
50 source int
Akron3fdfec62021-08-04 11:40:10 +020051 target uint32
Akron8ef408b2021-08-02 22:11:04 +020052}
53
54type edge struct {
Akron83e75a22021-08-04 13:14:06 +020055 inSym int
56 outSym int
57 end int
58 nontoken bool
59 tokenend bool
Akron8ef408b2021-08-02 22:11:04 +020060}
61
62type Tokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020063 // sigma map[rune]int
Akron740f3d72021-08-03 12:12:34 +020064 sigmaRev map[int]rune
65 arcCount int
66 stateCount int
67 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020068 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020069
70 // Special symbols in sigma
71 epsilon int
72 unknown int
73 identity int
74 final int
Akron8ef408b2021-08-02 22:11:04 +020075}
76
Akronf2120ca2021-08-03 16:26:41 +020077type DaTokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020078 // sigmaRev map[int]rune
Akron03a3c612021-08-04 11:51:27 +020079 sigma map[rune]int
80 maxSize int
81 loadFactor float64
82 array []uint32
Akronc17f1ca2021-08-03 19:47:27 +020083
84 // Special symbols in sigma
85 epsilon int
86 unknown int
87 identity int
88 final int
Akronf2120ca2021-08-03 16:26:41 +020089}
90
Akron64ffd9a2021-08-03 19:55:21 +020091func LoadFomaFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +020092 f, err := os.Open(file)
93 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020094 log.Error().Err(err)
95 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020096 }
97 defer f.Close()
98
99 gz, err := gzip.NewReader(f)
100 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200101 log.Error().Err(err)
102 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200103 }
104 defer gz.Close()
105
Akron3fdfec62021-08-04 11:40:10 +0200106 return ParseFoma(gz)
Akron8ef408b2021-08-02 22:11:04 +0200107}
108
Akron3fdfec62021-08-04 11:40:10 +0200109func ParseFoma(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200110 r := bufio.NewReader(ior)
111
112 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200113 sigmaRev: make(map[int]rune),
Akronc17f1ca2021-08-03 19:47:27 +0200114 epsilon: -1,
115 unknown: -1,
116 identity: -1,
117 final: -1,
Akron8ef408b2021-08-02 22:11:04 +0200118 }
119
Akron740f3d72021-08-03 12:12:34 +0200120 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200121
122 mode := 0
123 var elem []string
124 var elemint [5]int
125
126 for {
127 line, err := r.ReadString('\n')
128 if err != nil {
129 if err == io.EOF {
130 break
131 }
Akron740f3d72021-08-03 12:12:34 +0200132 log.Error().Err(err)
133 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200134 }
135 if strings.HasPrefix(line, "##foma-net") {
136 continue
137 }
138 if strings.HasPrefix(line, "##props##") {
139 mode = PROPS
140 continue
141 }
142 if strings.HasPrefix(line, "##states##") {
143 mode = STATES
144
145 // Adds a final transition symbol to sigma
146 // written as '#' in Mizobuchi et al (2000)
Akron740f3d72021-08-03 12:12:34 +0200147 tok.sigmaCount++
Akronc17f1ca2021-08-03 19:47:27 +0200148 tok.final = tok.sigmaCount
Akron8ef408b2021-08-02 22:11:04 +0200149 continue
150 }
151 if strings.HasPrefix(line, "##sigma##") {
152 mode = SIGMA
153 continue
154 }
155 if strings.HasPrefix(line, "##end##") {
156 mode = NONE
157 continue
158 }
159
160 switch mode {
161 case PROPS:
162 {
163 elem = strings.Split(line, " ")
164 /*
165 fmt.Println("arity: " + elem[0])
166 fmt.Println("arccount: " + elem[1])
167 fmt.Println("statecount: " + elem[2])
168 fmt.Println("linecount: " + elem[3])
169 fmt.Println("finalcount: " + elem[4])
170 fmt.Println("pathcount: " + elem[5])
171 fmt.Println("is_deterministic: " + elem[6])
172 fmt.Println("is_pruned: " + elem[7])
173 fmt.Println("is_minimized: " + elem[8])
174 fmt.Println("is_epsilon_free: " + elem[9])
175 fmt.Println("is_loop_free: " + elem[10])
176 fmt.Println("extras: " + elem[11])
177 fmt.Println("name: " + elem[12])
178 */
179 if elem[6] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200180 log.Error().Msg("The FST needs to be deterministic")
181 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200182 }
183 if elem[9] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200184 log.Error().Msg("The FST needs to be epsilon free")
185 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200186 }
187
188 elemint[0], err = strconv.Atoi(elem[1])
189 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200190 log.Error().Msg("Can't read arccount")
191 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200192 }
Akron740f3d72021-08-03 12:12:34 +0200193 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200194
195 // States start at 1 in Mizobuchi et al (2000),
196 // as the state 0 is associated with a fail.
197 // Initialize states and transitions
198 elemint[0], err = strconv.Atoi(elem[2])
199 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200200 log.Error().Msg("Can't read statecount")
201 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200202 }
Akron740f3d72021-08-03 12:12:34 +0200203 tok.stateCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200204 tok.transitions = make([]map[int]*edge, elemint[0]+1)
205 continue
206 }
207 case STATES:
208 {
209 elem = strings.Split(line[0:len(line)-1], " ")
210 if elem[0] == "-1" {
211 continue
212 }
213 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200214 if err != nil {
215 break
216 }
Akron8ef408b2021-08-02 22:11:04 +0200217
218 if len(elem) > 1 {
219 elemint[1], err = strconv.Atoi(elem[1])
220 if err != nil {
221 break
222 }
223 if len(elem) > 2 {
224 elemint[2], err = strconv.Atoi(elem[2])
225 if err != nil {
226 break
227 }
228 if len(elem) > 3 {
229 elemint[3], err = strconv.Atoi(elem[3])
230 if err != nil {
231 break
232 }
233 if len(elem) > 4 {
234 elemint[4], err = strconv.Atoi(elem[4])
235 if err != nil {
236 break
237 }
238 }
239 }
240 }
241 }
242
243 switch len(elem) {
244 case 5:
245 {
Akron740f3d72021-08-03 12:12:34 +0200246 state = elemint[0]
247 inSym = elemint[1]
248 outSym = elemint[2]
249 end = elemint[3]
250 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200251 }
252 case 4:
253 {
254 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200255 state = elemint[0]
256 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200257 } else {
Akron740f3d72021-08-03 12:12:34 +0200258 state = elemint[0]
259 inSym = elemint[1]
260 end = elemint[2]
261 final = elemint[3]
262 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200263 }
264 }
265 case 3:
266 {
Akron740f3d72021-08-03 12:12:34 +0200267 inSym = elemint[0]
268 outSym = elemint[1]
269 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200270 }
271 case 2:
272 {
Akron740f3d72021-08-03 12:12:34 +0200273 inSym = elemint[0]
274 end = elemint[1]
275 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200276 }
277 }
278
Akron8ef408b2021-08-02 22:11:04 +0200279 // While the states in foma start with 0, the states in the
280 // Mizobuchi FSA start with one - so we increase every state by 1.
281
Akron83e75a22021-08-04 13:14:06 +0200282 nontoken := false
283 tokenend := false
284
Akron740f3d72021-08-03 12:12:34 +0200285 if inSym != outSym {
286
Akron83e75a22021-08-04 13:14:06 +0200287 if tok.sigmaRev[outSym] == NEWLINE {
288 tokenend = true
289 } else if outSym == tok.epsilon {
290 nontoken = true
291 } else {
Akron740f3d72021-08-03 12:12:34 +0200292 log.Error().Msg(
293 "Unsupported transition: " +
294 strconv.Itoa(state) +
295 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200296 " (" +
Akron740f3d72021-08-03 12:12:34 +0200297 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200298 ":" +
Akron740f3d72021-08-03 12:12:34 +0200299 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200300 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200301 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200302 ":" +
Akron740f3d72021-08-03 12:12:34 +0200303 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200304 ")")
Akron740f3d72021-08-03 12:12:34 +0200305 os.Exit(1)
Akron75ebe7f2021-08-03 10:34:10 +0200306 }
Akron83e75a22021-08-04 13:14:06 +0200307
308 /*
309 // Allow any epsilon to become a newline
310 if !(inSym == tok.epsilon && tok.sigmaRev[outSym] == NEWLINE) &&
311
312 // Allow any whitespace to be ignored
313 !(inSym != tok.epsilon && outSym == tok.epsilon) &&
314
315 // Allow any whitespace to become a new line
316 !(tok.sigmaRev[outSym] == NEWLINE) {
317
318 }
319 */
320 } else if inSym == tok.epsilon {
321 panic("Epsilon transitions not allowed")
Akron8ef408b2021-08-02 22:11:04 +0200322 }
323
Akron740f3d72021-08-03 12:12:34 +0200324 // This collects all edges until arrstate changes
325
Akron8ef408b2021-08-02 22:11:04 +0200326 // TODO:
327 // if arrin == EPSILON && arrout == TOKENEND, mark state as newline
328 // if the next transition is the same, remove TOKENEND and add SENTENCEEND
329 // This requires to remove the transition alltogether and marks the state instead.
330
331 // TODO:
332 // if arrout == EPSILON, mark the transition as NOTOKEN
333
334 targetObj := &edge{
Akron83e75a22021-08-04 13:14:06 +0200335 inSym: inSym,
336 outSym: outSym,
337 end: end + 1,
338 tokenend: tokenend,
339 nontoken: nontoken,
Akron8ef408b2021-08-02 22:11:04 +0200340 }
341
Akron740f3d72021-08-03 12:12:34 +0200342 // Initialize outgoing states
343 if tok.transitions[state+1] == nil {
344 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200345 }
346
Akron740f3d72021-08-03 12:12:34 +0200347 // Ignore transitions with invalid symbols
348 if inSym >= 0 {
349 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200350 }
Akron8ef408b2021-08-02 22:11:04 +0200351
Akron740f3d72021-08-03 12:12:34 +0200352 // Add final transition
353 if final == 1 {
Akronc17f1ca2021-08-03 19:47:27 +0200354 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200355 }
356
Akron740f3d72021-08-03 12:12:34 +0200357 if DEBUG {
358 fmt.Println("Add",
359 state+1, "->", end+1,
360 "(",
361 inSym,
362 ":",
363 outSym,
364 ") (",
365 string(tok.sigmaRev[inSym]),
366 ":",
367 string(tok.sigmaRev[outSym]),
368 ")")
369 }
Akron75ebe7f2021-08-03 10:34:10 +0200370
Akron8ef408b2021-08-02 22:11:04 +0200371 continue
372 }
373 case SIGMA:
374 {
375 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
376
377 // Turn string into sigma id
378 number, err := strconv.Atoi(elem[0])
379
380 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200381 log.Error().Err(err)
382 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200383 }
384
Akron740f3d72021-08-03 12:12:34 +0200385 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200386
387 var symbol rune
388
389 // Read rune
390 if utf8.RuneCountInString(elem[1]) == 1 {
391 symbol = []rune(elem[1])[0]
392
393 // Probably a MCS
394 } else if utf8.RuneCountInString(elem[1]) > 1 {
395 switch elem[1] {
396 case "@_EPSILON_SYMBOL_@":
397 {
Akronc17f1ca2021-08-03 19:47:27 +0200398 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200399 continue
400 }
401 case "@_UNKNOWN_SYMBOL_@":
402 {
Akronc17f1ca2021-08-03 19:47:27 +0200403 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200404 continue
405 }
406
407 case "@_IDENTITY_SYMBOL_@":
408 {
Akronc17f1ca2021-08-03 19:47:27 +0200409 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200410 continue
411 }
412 default:
Akron740f3d72021-08-03 12:12:34 +0200413 {
414 log.Error().Msg("MCS not supported: " + line)
415 os.Exit(1)
416 }
Akron8ef408b2021-08-02 22:11:04 +0200417 }
418
Akron740f3d72021-08-03 12:12:34 +0200419 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200420 line, err = r.ReadString('\n')
421 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200422 log.Error().Err(err)
423 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200424 }
425 if len(line) != 1 {
Akron740f3d72021-08-03 12:12:34 +0200426 log.Error().Msg("MCS not supported:" + line)
427 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200428 }
Akron740f3d72021-08-03 12:12:34 +0200429 symbol = rune(NEWLINE)
Akron8ef408b2021-08-02 22:11:04 +0200430 }
431
Akron740f3d72021-08-03 12:12:34 +0200432 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200433 }
434 }
435 }
436
437 return tok
438}
439
Akron64ffd9a2021-08-03 19:55:21 +0200440// Set alphabet A to the list of all symbols
441// outgoing from s
442func (tok *Tokenizer) get_set(s int, A *[]int) {
443 for a := range tok.transitions[s] {
444 *A = append(*A, a)
445 }
446
447 // Not required, but simplifies bug hunting
448 sort.Ints(*A)
449}
450
Akron8ef408b2021-08-02 22:11:04 +0200451// Implementation of Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200452func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
453
454 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +0200455 sigma: make(map[rune]int),
456 loadFactor: -1,
457 final: tok.final,
458 unknown: tok.unknown,
459 identity: tok.identity,
460 epsilon: tok.epsilon,
Akronf2120ca2021-08-03 16:26:41 +0200461 }
462
463 for num, sym := range tok.sigmaRev {
464 dat.sigma[sym] = num
465 }
Akron8ef408b2021-08-02 22:11:04 +0200466
467 mark := 0
468 size := 0
469
470 // Create a mapping from s to t
Akron740f3d72021-08-03 12:12:34 +0200471 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200472
473 table[size] = &mapping{source: 1, target: 1}
474 size++
475
Akron740f3d72021-08-03 12:12:34 +0200476 // Allocate space for the outgoing symbol range
477 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200478
479 for mark < size {
480 s := table[mark].source // This is a state in Ms
481 t := table[mark].target // This is a state in Mt
482 mark++
Akron740f3d72021-08-03 12:12:34 +0200483
484 // Following the paper, here the state t can be remembered
485 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200486 A = A[:0]
487 tok.get_set(s, &A)
488
Akron740f3d72021-08-03 12:12:34 +0200489 // Set base to the first free slot in the double array
Akronf2120ca2021-08-03 16:26:41 +0200490 dat.setBase(t, dat.xCheck(A))
Akron8ef408b2021-08-02 22:11:04 +0200491
Akron773b1ef2021-08-03 17:37:20 +0200492 // TODO:
493 // Sort the outgoing transitions based onm the
494 // outdegree of .end
495
Akron740f3d72021-08-03 12:12:34 +0200496 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200497 for _, a := range A {
498
Akronc17f1ca2021-08-03 19:47:27 +0200499 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200500
Akron740f3d72021-08-03 12:12:34 +0200501 // Aka g(s, a)
502 s1 := tok.transitions[s][a].end
Akron8ef408b2021-08-02 22:11:04 +0200503
Akron740f3d72021-08-03 12:12:34 +0200504 // Store the transition
Akron3fdfec62021-08-04 11:40:10 +0200505 t1 := dat.getBase(t) + uint32(a)
Akronf2120ca2021-08-03 16:26:41 +0200506 dat.setCheck(t1, t)
Akron8ef408b2021-08-02 22:11:04 +0200507
Akron83e75a22021-08-04 13:14:06 +0200508 // Mark the state as being the target of a nontoken transition
509 if tok.transitions[s][a].nontoken {
510 dat.setNonToken(t, true)
511 }
512
Akron740f3d72021-08-03 12:12:34 +0200513 // Check for representative states
Akron8ef408b2021-08-02 22:11:04 +0200514 r := in_table(s1, table, size)
Akron740f3d72021-08-03 12:12:34 +0200515
Akron8ef408b2021-08-02 22:11:04 +0200516 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200517 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200518 table[size] = &mapping{source: s1, target: t1}
519 size++
520 } else {
Akron740f3d72021-08-03 12:12:34 +0200521 // Overwrite with the representative state
Akron3fdfec62021-08-04 11:40:10 +0200522 dat.setBase(t1, r)
523 dat.setSeparate(t1, true)
Akron8ef408b2021-08-02 22:11:04 +0200524 }
525 } else {
Akron740f3d72021-08-03 12:12:34 +0200526 // Store a final transition
Akron3fdfec62021-08-04 11:40:10 +0200527 dat.setCheck(dat.getBase(t)+uint32(dat.final), t)
Akron8ef408b2021-08-02 22:11:04 +0200528 }
529 }
530 }
531
532 // Following Mizobuchi et al (2000) the size of the
533 // FSA should be stored in check(1).
Akron3fdfec62021-08-04 11:40:10 +0200534 dat.setSize(dat.maxSize + 1)
Akronf2120ca2021-08-03 16:26:41 +0200535 dat.array = dat.array[:dat.maxSize+1]
536 return dat
Akron8ef408b2021-08-02 22:11:04 +0200537}
538
Akron8ef408b2021-08-02 22:11:04 +0200539// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200540// exists and return this as a representative.
541// Currently iterates through the whole table
542// in a bruteforce manner.
Akron3fdfec62021-08-04 11:40:10 +0200543func in_table(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200544 for x := 0; x < size; x++ {
545 if table[x].source == s {
546 return table[x].target
547 }
548 }
549 return 0
550}
551
Akron64ffd9a2021-08-03 19:55:21 +0200552// Resize double array when necessary
553func (dat *DaTokenizer) resize(l int) {
554 // TODO:
555 // This is a bit too aggressive atm and should be calmed down.
556 if len(dat.array) <= l {
Akron3fdfec62021-08-04 11:40:10 +0200557 dat.array = append(dat.array, make([]uint32, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200558 }
Akron64ffd9a2021-08-03 19:55:21 +0200559}
Akronc9d84a62021-08-03 15:56:03 +0200560
Akron64ffd9a2021-08-03 19:55:21 +0200561// Set base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200562func (dat *DaTokenizer) setBase(p uint32, v uint32) {
563 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200564 dat.resize(l)
565 if dat.maxSize < l {
566 dat.maxSize = l
567 }
568 dat.array[p*2] = v
569}
570
Akron3fdfec62021-08-04 11:40:10 +0200571// Returns true if a state is separate pointing to a representative
572func (dat *DaTokenizer) isSeparate(p uint32) bool {
573 return dat.array[p*2]&leadingBit != 0
574}
575
576// Mark a state as separate pointing to a representative
577func (dat *DaTokenizer) setSeparate(p uint32, sep bool) {
578 if sep {
579 dat.array[p*2] |= leadingBit
580 } else {
581 dat.array[p*2] &= restBit
582 }
583}
584
Akron83e75a22021-08-04 13:14:06 +0200585// Returns true if a state is the target of a nontoken transition
586func (dat *DaTokenizer) isNonToken(p uint32) bool {
587 return dat.array[p*2+1]&leadingBit != 0
588}
589
590// Mark a state as being the target of a nontoken transition
591func (dat *DaTokenizer) setNonToken(p uint32, sep bool) {
592 if sep {
593 dat.array[p*2+1] |= leadingBit
594 } else {
595 dat.array[p*2+1] &= restBit
596 }
597}
598
Akron64ffd9a2021-08-03 19:55:21 +0200599// Get base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200600func (dat *DaTokenizer) getBase(p uint32) uint32 {
601 if int(p*2) >= len(dat.array) {
Akron64ffd9a2021-08-03 19:55:21 +0200602 return 0
603 }
Akron3fdfec62021-08-04 11:40:10 +0200604 return dat.array[p*2] & restBit
Akron64ffd9a2021-08-03 19:55:21 +0200605}
606
607// Set check value in double array
Akron3fdfec62021-08-04 11:40:10 +0200608func (dat *DaTokenizer) setCheck(p uint32, v uint32) {
609 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200610 dat.resize(l)
611 if dat.maxSize < l {
612 dat.maxSize = l
613 }
614 dat.array[(p*2)+1] = v
615}
616
617// Get check value in double array
Akron3fdfec62021-08-04 11:40:10 +0200618func (dat *DaTokenizer) getCheck(p uint32) uint32 {
619 if int((p*2)+1) >= len(dat.array) {
Akron64ffd9a2021-08-03 19:55:21 +0200620 return 0
621 }
Akron3fdfec62021-08-04 11:40:10 +0200622 return dat.array[(p*2)+1] & restBit
Akron64ffd9a2021-08-03 19:55:21 +0200623}
624
625// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200626func (dat *DaTokenizer) setSize(v int) {
627 dat.setCheck(1, uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200628}
629
630// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200631func (dat *DaTokenizer) GetSize() int {
632 return int(dat.getCheck(1))
Akron8ef408b2021-08-02 22:11:04 +0200633}
634
635// Based on Mizobuchi et al (2000), p. 124
636// This iterates for every state through the complete double array
637// structure until it finds a gap that fits all outgoing transitions
638// of the state. This is extremely slow, but is only necessary in the
639// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200640func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200641
642 // Start at the first entry of the double array list
Akron3fdfec62021-08-04 11:40:10 +0200643 base := uint32(1)
Akron8ef408b2021-08-02 22:11:04 +0200644
Akron8ef408b2021-08-02 22:11:04 +0200645OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200646
647 // Resize the array if necessary
Akron3fdfec62021-08-04 11:40:10 +0200648 dat.resize((int(base) + dat.final) * 2)
Akron8ef408b2021-08-02 22:11:04 +0200649 for _, a := range symbols {
Akron3fdfec62021-08-04 11:40:10 +0200650 if dat.getCheck(base+uint32(a)) != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200651 base++
652 goto OVERLAP
653 }
654 }
Akron8ef408b2021-08-02 22:11:04 +0200655 return base
656}
657
Akron03a3c612021-08-04 11:51:27 +0200658// LoadFactor as defined in Kanda et al (2018),
659// i.e. the proportion of non-empty elements to all elements.
660func (dat *DaTokenizer) LoadFactor() float64 {
Akrond66a9262021-08-03 17:09:09 +0200661
Akron03a3c612021-08-04 11:51:27 +0200662 // Cache the loadfactor
663 if dat.loadFactor >= 0 {
664 return dat.loadFactor
Akron773b1ef2021-08-03 17:37:20 +0200665 }
Akrond66a9262021-08-03 17:09:09 +0200666 nonEmpty := 0
667 all := len(dat.array) / 2
668 for x := 1; x <= len(dat.array); x = x + 2 {
669 if dat.array[x] != 0 {
670 nonEmpty++
671 }
672 }
Akron03a3c612021-08-04 11:51:27 +0200673 dat.loadFactor = float64(nonEmpty) / float64(all) * 100
674 return dat.loadFactor
Akrond66a9262021-08-03 17:09:09 +0200675}
676
Akron6247a5d2021-08-03 19:18:28 +0200677// WriteTo stores the double array data in an io.Writer.
678func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
679
680 // Store magical header
681 all, err := w.Write([]byte(MAGIC))
682 if err != nil {
683 log.Error().Msg("Unable to write data")
684 }
685
686 // Get sigma as a list
687 sigmalist := make([]rune, len(dat.sigma)+16)
688 max := 0
689 for sym, num := range dat.sigma {
690 sigmalist[num] = sym
691 if num > max {
692 max = num
693 }
694 }
695
696 sigmalist = sigmalist[:max+1]
697
698 buf := make([]byte, 0, 12)
699 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200700 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
701 bo.PutUint16(buf[4:6], uint16(dat.unknown))
702 bo.PutUint16(buf[6:8], uint16(dat.identity))
703 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200704 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
705 more, err := w.Write(buf[0:12])
706 if err != nil {
707 log.Error().Msg("Unable to write data")
708 }
709
710 all += more
711
712 wbuf := bytes.NewBuffer(nil)
713 wbufWrap := bufio.NewWriter(wbuf)
714
715 // Write sigma
716 for _, sym := range sigmalist {
717 more, err = wbufWrap.WriteRune(sym)
718 if err != nil {
719 log.Error().Msg("Unable to write data")
720 }
721 all += more
722 }
723 wbufWrap.Flush()
724 more, err = w.Write(wbuf.Bytes())
725 if err != nil {
726 log.Error().Msg("Unable to write data")
727 }
728 all += more
729
730 // Test marker - could be checksum
731 more, err = w.Write([]byte("T"))
732 if err != nil {
733 log.Error().Msg("Unable to write data")
734 }
735 all += more
736
737 wbuf.Reset()
738
739 for _, d := range dat.array {
Akron3fdfec62021-08-04 11:40:10 +0200740 bo.PutUint32(buf[0:4], d)
Akron6247a5d2021-08-03 19:18:28 +0200741 more, err := w.Write(buf[0:4])
742 if err != nil {
743 log.Error().Msg("Unable to write data")
744 }
745 all += more
746 }
747
748 return int64(all), err
749}
750
Akron740f3d72021-08-03 12:12:34 +0200751// Match an input string against the double array
752// FSA.
753//
754// Based on Mizobuchi et al (2000), p. 129,
755// with additional support for IDENTITY, UNKNOWN
756// and EPSILON transitions.
Akron64ffd9a2021-08-03 19:55:21 +0200757func (dat *DaTokenizer) Match(input string) bool {
Akron465a0992021-08-03 11:28:48 +0200758 var a int
Akron3fdfec62021-08-04 11:40:10 +0200759 var tu uint32
Akron465a0992021-08-03 11:28:48 +0200760 var ok bool
761
Akron3fdfec62021-08-04 11:40:10 +0200762 t := uint32(1) // Initial state
Akron740f3d72021-08-03 12:12:34 +0200763 chars := []rune(input)
764 i := 0
765
Akron49d27ee2021-08-03 11:58:13 +0200766 for i < len(chars) {
Akron64ffd9a2021-08-03 19:55:21 +0200767 a, ok = dat.sigma[chars[i]]
Akron730a79c2021-08-03 11:05:29 +0200768
Akron740f3d72021-08-03 12:12:34 +0200769 // Support identity symbol if character is not in sigma
Akron64ffd9a2021-08-03 19:55:21 +0200770 if !ok && dat.identity != -1 {
Akron740f3d72021-08-03 12:12:34 +0200771 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200772 fmt.Println("IDENTITY symbol", string(chars[i]), "->", dat.identity)
Akron740f3d72021-08-03 12:12:34 +0200773 }
Akron64ffd9a2021-08-03 19:55:21 +0200774 a = dat.identity
Akron740f3d72021-08-03 12:12:34 +0200775 } else if DEBUG {
Akron49d27ee2021-08-03 11:58:13 +0200776 fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
Akron730a79c2021-08-03 11:05:29 +0200777 }
Akron465a0992021-08-03 11:28:48 +0200778 tu = t
Akron730a79c2021-08-03 11:05:29 +0200779 CHECK:
Akron3fdfec62021-08-04 11:40:10 +0200780 t = dat.getBase(tu) + uint32(a)
Akron730a79c2021-08-03 11:05:29 +0200781
Akron740f3d72021-08-03 12:12:34 +0200782 // Check if the transition is valid according to the double array
Akron64ffd9a2021-08-03 19:55:21 +0200783 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +0200784
785 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200786 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron730a79c2021-08-03 11:05:29 +0200787 }
Akron740f3d72021-08-03 12:12:34 +0200788
Akron64ffd9a2021-08-03 19:55:21 +0200789 if !ok && a == dat.identity {
Akron740f3d72021-08-03 12:12:34 +0200790 // Try again with unknown symbol, in case identity failed
791 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200792 fmt.Println("UNKNOWN symbol", string(chars[i]), "->", dat.unknown)
Akron740f3d72021-08-03 12:12:34 +0200793 }
Akron64ffd9a2021-08-03 19:55:21 +0200794 a = dat.unknown
Akron740f3d72021-08-03 12:12:34 +0200795
Akron64ffd9a2021-08-03 19:55:21 +0200796 } else if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +0200797 // Try again with epsilon symbol, in case everything else failed
798 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200799 fmt.Println("EPSILON symbol", string(chars[i]), "->", dat.epsilon)
Akron740f3d72021-08-03 12:12:34 +0200800 }
Akron64ffd9a2021-08-03 19:55:21 +0200801 a = dat.epsilon
Akron740f3d72021-08-03 12:12:34 +0200802 } else {
803 break
804 }
805 goto CHECK
Akron3fdfec62021-08-04 11:40:10 +0200806 } else if dat.isSeparate(t) {
Akron730a79c2021-08-03 11:05:29 +0200807 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +0200808 t = dat.getBase(t)
Akron8ef408b2021-08-02 22:11:04 +0200809 }
Akron740f3d72021-08-03 12:12:34 +0200810
811 // Transition is fine
Akron64ffd9a2021-08-03 19:55:21 +0200812 if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +0200813 // Character consumed
Akron49d27ee2021-08-03 11:58:13 +0200814 i++
815 }
Akron83e75a22021-08-04 13:14:06 +0200816
Akron740f3d72021-08-03 12:12:34 +0200817 // TODO:
818 // Prevent endless epsilon loops!
Akron8ef408b2021-08-02 22:11:04 +0200819 }
820
Akron740f3d72021-08-03 12:12:34 +0200821 if i != len(chars) {
822 if DEBUG {
823 fmt.Println("Not at the end")
824 }
Akron8ef408b2021-08-02 22:11:04 +0200825 return false
826 }
827
Akron465a0992021-08-03 11:28:48 +0200828FINALCHECK:
Akron740f3d72021-08-03 12:12:34 +0200829
830 // Automaton is in a final state
Akron3fdfec62021-08-04 11:40:10 +0200831 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
Akron8ef408b2021-08-02 22:11:04 +0200832 return true
833 }
Akron465a0992021-08-03 11:28:48 +0200834
Akron740f3d72021-08-03 12:12:34 +0200835 // Check epsilon transitions until a final state is reached
Akron465a0992021-08-03 11:28:48 +0200836 tu = t
Akron3fdfec62021-08-04 11:40:10 +0200837 t = dat.getBase(tu) + uint32(dat.epsilon)
Akron465a0992021-08-03 11:28:48 +0200838
Akron740f3d72021-08-03 12:12:34 +0200839 // Epsilon transition failed
Akron64ffd9a2021-08-03 19:55:21 +0200840 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +0200841 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200842 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron740f3d72021-08-03 12:12:34 +0200843 }
Akron465a0992021-08-03 11:28:48 +0200844 return false
Akron740f3d72021-08-03 12:12:34 +0200845
Akron3fdfec62021-08-04 11:40:10 +0200846 } else if dat.isSeparate(t) {
Akron465a0992021-08-03 11:28:48 +0200847 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +0200848 t = dat.getBase(t)
Akron465a0992021-08-03 11:28:48 +0200849 }
Akron740f3d72021-08-03 12:12:34 +0200850
Akron465a0992021-08-03 11:28:48 +0200851 goto FINALCHECK
Akron8ef408b2021-08-02 22:11:04 +0200852}