blob: b4a3f3629eec51cb444c8cb0e5e9f0d4b3a1ce61 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akron83e75a22021-08-04 13:14:06 +02009// The maximum number of states is 107.3741.823 (30bit),
10// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
13
Akron740f3d72021-08-03 12:12:34 +020014// TODO:
15// - replace maxSize with the check value
16// - Strip first state and make everything start with 0!
Akron740f3d72021-08-03 12:12:34 +020017
Akron8ef408b2021-08-02 22:11:04 +020018import (
19 "bufio"
Akron6247a5d2021-08-03 19:18:28 +020020 "bytes"
Akron8ef408b2021-08-02 22:11:04 +020021 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020022 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020023 "fmt"
24 "io"
25 "os"
Akronc9d84a62021-08-03 15:56:03 +020026 "sort"
Akron8ef408b2021-08-02 22:11:04 +020027 "strconv"
28 "strings"
29 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020030
31 "github.com/rs/zerolog/log"
Akron8ef408b2021-08-02 22:11:04 +020032)
33
34const (
Akron2a4b9292021-08-04 15:35:22 +020035 PROPS = 1
36 SIGMA = 2
37 STATES = 3
38 NONE = 4
39 NEWLINE = '\u000a'
40 DEBUG = false
41 MAGIC = "DATOK"
42 VERSION = uint16(1)
43 firstBit uint32 = 1 << 31
44 secondBit uint32 = 1 << 30
45 restBit uint32 = ^uint32(0) &^ (firstBit | secondBit)
Akron8ef408b2021-08-02 22:11:04 +020046)
47
Akron6247a5d2021-08-03 19:18:28 +020048var bo binary.ByteOrder = binary.LittleEndian
49
Akron8ef408b2021-08-02 22:11:04 +020050type mapping struct {
51 source int
Akron3fdfec62021-08-04 11:40:10 +020052 target uint32
Akron8ef408b2021-08-02 22:11:04 +020053}
54
55type edge struct {
Akron83e75a22021-08-04 13:14:06 +020056 inSym int
57 outSym int
58 end int
59 nontoken bool
60 tokenend bool
Akron8ef408b2021-08-02 22:11:04 +020061}
62
63type Tokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020064 // sigma map[rune]int
Akron740f3d72021-08-03 12:12:34 +020065 sigmaRev map[int]rune
66 arcCount int
67 stateCount int
68 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020069 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020070
71 // Special symbols in sigma
72 epsilon int
73 unknown int
74 identity int
75 final int
Akron8ef408b2021-08-02 22:11:04 +020076}
77
Akronf2120ca2021-08-03 16:26:41 +020078type DaTokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020079 // sigmaRev map[int]rune
Akron03a3c612021-08-04 11:51:27 +020080 sigma map[rune]int
81 maxSize int
82 loadFactor float64
83 array []uint32
Akron3f8571a2021-08-05 11:18:10 +020084 // lastFilledBase uint32
Akronc17f1ca2021-08-03 19:47:27 +020085
86 // Special symbols in sigma
87 epsilon int
88 unknown int
89 identity int
90 final int
Akronf2120ca2021-08-03 16:26:41 +020091}
92
Akron64ffd9a2021-08-03 19:55:21 +020093func LoadFomaFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +020094 f, err := os.Open(file)
95 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020096 log.Error().Err(err)
97 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020098 }
99 defer f.Close()
100
101 gz, err := gzip.NewReader(f)
102 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200103 log.Error().Err(err)
104 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200105 }
106 defer gz.Close()
107
Akron3fdfec62021-08-04 11:40:10 +0200108 return ParseFoma(gz)
Akron8ef408b2021-08-02 22:11:04 +0200109}
110
Akron3fdfec62021-08-04 11:40:10 +0200111func ParseFoma(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200112 r := bufio.NewReader(ior)
113
114 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200115 sigmaRev: make(map[int]rune),
Akronc17f1ca2021-08-03 19:47:27 +0200116 epsilon: -1,
117 unknown: -1,
118 identity: -1,
119 final: -1,
Akron8ef408b2021-08-02 22:11:04 +0200120 }
121
Akron740f3d72021-08-03 12:12:34 +0200122 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200123
124 mode := 0
125 var elem []string
126 var elemint [5]int
127
128 for {
129 line, err := r.ReadString('\n')
130 if err != nil {
131 if err == io.EOF {
132 break
133 }
Akron740f3d72021-08-03 12:12:34 +0200134 log.Error().Err(err)
135 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200136 }
137 if strings.HasPrefix(line, "##foma-net") {
138 continue
139 }
140 if strings.HasPrefix(line, "##props##") {
141 mode = PROPS
142 continue
143 }
144 if strings.HasPrefix(line, "##states##") {
145 mode = STATES
146
147 // Adds a final transition symbol to sigma
148 // written as '#' in Mizobuchi et al (2000)
Akron740f3d72021-08-03 12:12:34 +0200149 tok.sigmaCount++
Akronc17f1ca2021-08-03 19:47:27 +0200150 tok.final = tok.sigmaCount
Akron8ef408b2021-08-02 22:11:04 +0200151 continue
152 }
153 if strings.HasPrefix(line, "##sigma##") {
154 mode = SIGMA
155 continue
156 }
157 if strings.HasPrefix(line, "##end##") {
158 mode = NONE
159 continue
160 }
161
162 switch mode {
163 case PROPS:
164 {
165 elem = strings.Split(line, " ")
166 /*
167 fmt.Println("arity: " + elem[0])
168 fmt.Println("arccount: " + elem[1])
169 fmt.Println("statecount: " + elem[2])
170 fmt.Println("linecount: " + elem[3])
171 fmt.Println("finalcount: " + elem[4])
172 fmt.Println("pathcount: " + elem[5])
173 fmt.Println("is_deterministic: " + elem[6])
174 fmt.Println("is_pruned: " + elem[7])
175 fmt.Println("is_minimized: " + elem[8])
176 fmt.Println("is_epsilon_free: " + elem[9])
177 fmt.Println("is_loop_free: " + elem[10])
178 fmt.Println("extras: " + elem[11])
179 fmt.Println("name: " + elem[12])
180 */
181 if elem[6] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200182 log.Error().Msg("The FST needs to be deterministic")
183 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200184 }
185 if elem[9] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200186 log.Error().Msg("The FST needs to be epsilon free")
187 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200188 }
189
190 elemint[0], err = strconv.Atoi(elem[1])
191 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200192 log.Error().Msg("Can't read arccount")
193 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200194 }
Akron740f3d72021-08-03 12:12:34 +0200195 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200196
197 // States start at 1 in Mizobuchi et al (2000),
198 // as the state 0 is associated with a fail.
199 // Initialize states and transitions
200 elemint[0], err = strconv.Atoi(elem[2])
201 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200202 log.Error().Msg("Can't read statecount")
203 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200204 }
Akron740f3d72021-08-03 12:12:34 +0200205 tok.stateCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200206 tok.transitions = make([]map[int]*edge, elemint[0]+1)
207 continue
208 }
209 case STATES:
210 {
211 elem = strings.Split(line[0:len(line)-1], " ")
212 if elem[0] == "-1" {
213 continue
214 }
215 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200216 if err != nil {
217 break
218 }
Akron8ef408b2021-08-02 22:11:04 +0200219
220 if len(elem) > 1 {
221 elemint[1], err = strconv.Atoi(elem[1])
222 if err != nil {
223 break
224 }
225 if len(elem) > 2 {
226 elemint[2], err = strconv.Atoi(elem[2])
227 if err != nil {
228 break
229 }
230 if len(elem) > 3 {
231 elemint[3], err = strconv.Atoi(elem[3])
232 if err != nil {
233 break
234 }
235 if len(elem) > 4 {
236 elemint[4], err = strconv.Atoi(elem[4])
237 if err != nil {
238 break
239 }
240 }
241 }
242 }
243 }
244
245 switch len(elem) {
246 case 5:
247 {
Akron740f3d72021-08-03 12:12:34 +0200248 state = elemint[0]
249 inSym = elemint[1]
250 outSym = elemint[2]
251 end = elemint[3]
252 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200253 }
254 case 4:
255 {
256 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200257 state = elemint[0]
258 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200259 } else {
Akron740f3d72021-08-03 12:12:34 +0200260 state = elemint[0]
261 inSym = elemint[1]
262 end = elemint[2]
263 final = elemint[3]
264 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200265 }
266 }
267 case 3:
268 {
Akron740f3d72021-08-03 12:12:34 +0200269 inSym = elemint[0]
270 outSym = elemint[1]
271 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200272 }
273 case 2:
274 {
Akron740f3d72021-08-03 12:12:34 +0200275 inSym = elemint[0]
276 end = elemint[1]
277 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200278 }
279 }
280
Akron8ef408b2021-08-02 22:11:04 +0200281 // While the states in foma start with 0, the states in the
282 // Mizobuchi FSA start with one - so we increase every state by 1.
283
Akron83e75a22021-08-04 13:14:06 +0200284 nontoken := false
285 tokenend := false
286
Akron524c5432021-08-05 14:14:27 +0200287 // ID needs to be > 1
288 inSym++
289 outSym++
290
Akron740f3d72021-08-03 12:12:34 +0200291 if inSym != outSym {
292
Akron83e75a22021-08-04 13:14:06 +0200293 if tok.sigmaRev[outSym] == NEWLINE {
294 tokenend = true
295 } else if outSym == tok.epsilon {
296 nontoken = true
297 } else {
Akron740f3d72021-08-03 12:12:34 +0200298 log.Error().Msg(
299 "Unsupported transition: " +
300 strconv.Itoa(state) +
301 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200302 " (" +
Akron740f3d72021-08-03 12:12:34 +0200303 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200304 ":" +
Akron740f3d72021-08-03 12:12:34 +0200305 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200306 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200307 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200308 ":" +
Akron740f3d72021-08-03 12:12:34 +0200309 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200310 ")")
Akron740f3d72021-08-03 12:12:34 +0200311 os.Exit(1)
Akron75ebe7f2021-08-03 10:34:10 +0200312 }
Akron83e75a22021-08-04 13:14:06 +0200313
Akron83e75a22021-08-04 13:14:06 +0200314 } else if inSym == tok.epsilon {
Akron068874c2021-08-04 15:19:56 +0200315 log.Error().Msg("Epsilon transitions not supported")
316 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200317 }
318
Akron740f3d72021-08-03 12:12:34 +0200319 // This collects all edges until arrstate changes
320
Akron8ef408b2021-08-02 22:11:04 +0200321 // TODO:
322 // if arrin == EPSILON && arrout == TOKENEND, mark state as newline
323 // if the next transition is the same, remove TOKENEND and add SENTENCEEND
324 // This requires to remove the transition alltogether and marks the state instead.
325
326 // TODO:
327 // if arrout == EPSILON, mark the transition as NOTOKEN
328
329 targetObj := &edge{
Akron83e75a22021-08-04 13:14:06 +0200330 inSym: inSym,
331 outSym: outSym,
332 end: end + 1,
333 tokenend: tokenend,
334 nontoken: nontoken,
Akron8ef408b2021-08-02 22:11:04 +0200335 }
336
Akron740f3d72021-08-03 12:12:34 +0200337 // Initialize outgoing states
338 if tok.transitions[state+1] == nil {
339 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200340 }
341
Akron740f3d72021-08-03 12:12:34 +0200342 // Ignore transitions with invalid symbols
343 if inSym >= 0 {
344 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200345 }
Akron8ef408b2021-08-02 22:11:04 +0200346
Akron740f3d72021-08-03 12:12:34 +0200347 // Add final transition
348 if final == 1 {
Akronc17f1ca2021-08-03 19:47:27 +0200349 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200350 }
351
Akron740f3d72021-08-03 12:12:34 +0200352 if DEBUG {
353 fmt.Println("Add",
354 state+1, "->", end+1,
355 "(",
356 inSym,
357 ":",
358 outSym,
359 ") (",
360 string(tok.sigmaRev[inSym]),
361 ":",
362 string(tok.sigmaRev[outSym]),
Akron524c5432021-08-05 14:14:27 +0200363 ")",
364 ";",
365 "TE:", tokenend,
366 "NT:", nontoken)
Akron740f3d72021-08-03 12:12:34 +0200367 }
Akron75ebe7f2021-08-03 10:34:10 +0200368
Akron8ef408b2021-08-02 22:11:04 +0200369 continue
370 }
371 case SIGMA:
372 {
373 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
374
375 // Turn string into sigma id
376 number, err := strconv.Atoi(elem[0])
377
Akron524c5432021-08-05 14:14:27 +0200378 // ID needs to be > 1
379 number++
380
Akron8ef408b2021-08-02 22:11:04 +0200381 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200382 log.Error().Err(err)
383 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200384 }
385
Akron740f3d72021-08-03 12:12:34 +0200386 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200387
388 var symbol rune
389
390 // Read rune
391 if utf8.RuneCountInString(elem[1]) == 1 {
392 symbol = []rune(elem[1])[0]
393
394 // Probably a MCS
395 } else if utf8.RuneCountInString(elem[1]) > 1 {
396 switch elem[1] {
397 case "@_EPSILON_SYMBOL_@":
398 {
Akronc17f1ca2021-08-03 19:47:27 +0200399 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200400 continue
401 }
402 case "@_UNKNOWN_SYMBOL_@":
403 {
Akronc17f1ca2021-08-03 19:47:27 +0200404 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200405 continue
406 }
407
408 case "@_IDENTITY_SYMBOL_@":
409 {
Akronc17f1ca2021-08-03 19:47:27 +0200410 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200411 continue
412 }
413 default:
Akron740f3d72021-08-03 12:12:34 +0200414 {
415 log.Error().Msg("MCS not supported: " + line)
416 os.Exit(1)
417 }
Akron8ef408b2021-08-02 22:11:04 +0200418 }
419
Akron740f3d72021-08-03 12:12:34 +0200420 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200421 line, err = r.ReadString('\n')
422 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200423 log.Error().Err(err)
424 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200425 }
426 if len(line) != 1 {
Akron740f3d72021-08-03 12:12:34 +0200427 log.Error().Msg("MCS not supported:" + line)
428 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200429 }
Akron740f3d72021-08-03 12:12:34 +0200430 symbol = rune(NEWLINE)
Akron8ef408b2021-08-02 22:11:04 +0200431 }
432
Akron740f3d72021-08-03 12:12:34 +0200433 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200434 }
435 }
436 }
437
438 return tok
439}
440
Akron64ffd9a2021-08-03 19:55:21 +0200441// Set alphabet A to the list of all symbols
442// outgoing from s
443func (tok *Tokenizer) get_set(s int, A *[]int) {
444 for a := range tok.transitions[s] {
445 *A = append(*A, a)
446 }
447
448 // Not required, but simplifies bug hunting
449 sort.Ints(*A)
450}
451
Akron8ef408b2021-08-02 22:11:04 +0200452// Implementation of Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200453func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
454
455 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +0200456 sigma: make(map[rune]int),
457 loadFactor: -1,
458 final: tok.final,
459 unknown: tok.unknown,
460 identity: tok.identity,
461 epsilon: tok.epsilon,
Akron3f8571a2021-08-05 11:18:10 +0200462 // lastFilledBase: 1,
Akronf2120ca2021-08-03 16:26:41 +0200463 }
464
465 for num, sym := range tok.sigmaRev {
466 dat.sigma[sym] = num
467 }
Akron8ef408b2021-08-02 22:11:04 +0200468
469 mark := 0
470 size := 0
471
472 // Create a mapping from s to t
Akron740f3d72021-08-03 12:12:34 +0200473 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200474
475 table[size] = &mapping{source: 1, target: 1}
476 size++
477
Akron740f3d72021-08-03 12:12:34 +0200478 // Allocate space for the outgoing symbol range
479 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200480
481 for mark < size {
482 s := table[mark].source // This is a state in Ms
483 t := table[mark].target // This is a state in Mt
484 mark++
Akron740f3d72021-08-03 12:12:34 +0200485
486 // Following the paper, here the state t can be remembered
487 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200488 A = A[:0]
489 tok.get_set(s, &A)
490
Akron740f3d72021-08-03 12:12:34 +0200491 // Set base to the first free slot in the double array
Akronf2120ca2021-08-03 16:26:41 +0200492 dat.setBase(t, dat.xCheck(A))
Akron8ef408b2021-08-02 22:11:04 +0200493
Akron773b1ef2021-08-03 17:37:20 +0200494 // TODO:
Akron068874c2021-08-04 15:19:56 +0200495 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200496 // outdegree of .end
497
Akron740f3d72021-08-03 12:12:34 +0200498 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200499 for _, a := range A {
500
Akronc17f1ca2021-08-03 19:47:27 +0200501 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200502
Akron740f3d72021-08-03 12:12:34 +0200503 // Aka g(s, a)
504 s1 := tok.transitions[s][a].end
Akron8ef408b2021-08-02 22:11:04 +0200505
Akron740f3d72021-08-03 12:12:34 +0200506 // Store the transition
Akron3fdfec62021-08-04 11:40:10 +0200507 t1 := dat.getBase(t) + uint32(a)
Akronf2120ca2021-08-03 16:26:41 +0200508 dat.setCheck(t1, t)
Akron8ef408b2021-08-02 22:11:04 +0200509
Akron524c5432021-08-05 14:14:27 +0200510 if DEBUG {
511 fmt.Println("Translate transition",
512 s, "->", s1, "(", a, ")", "to", t, "->", t1)
513 }
514
Akron83e75a22021-08-04 13:14:06 +0200515 // Mark the state as being the target of a nontoken transition
516 if tok.transitions[s][a].nontoken {
Akron068874c2021-08-04 15:19:56 +0200517 dat.setNonToken(t1, true)
Akron524c5432021-08-05 14:14:27 +0200518 if DEBUG {
519 fmt.Println("Set", t1, "to nontoken")
520 }
Akron83e75a22021-08-04 13:14:06 +0200521 }
522
Akron84d68e62021-08-04 17:06:52 +0200523 // Mark the state as being the target of a tokenend transition
524 if tok.transitions[s][a].tokenend {
525 dat.setTokenEnd(t1, true)
Akron524c5432021-08-05 14:14:27 +0200526 if DEBUG {
527 fmt.Println("Set", t1, "to tokenend")
528 }
Akron84d68e62021-08-04 17:06:52 +0200529 }
530
Akron740f3d72021-08-03 12:12:34 +0200531 // Check for representative states
Akron8ef408b2021-08-02 22:11:04 +0200532 r := in_table(s1, table, size)
Akron740f3d72021-08-03 12:12:34 +0200533
Akron8ef408b2021-08-02 22:11:04 +0200534 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200535 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200536 table[size] = &mapping{source: s1, target: t1}
537 size++
538 } else {
Akron740f3d72021-08-03 12:12:34 +0200539 // Overwrite with the representative state
Akron3fdfec62021-08-04 11:40:10 +0200540 dat.setBase(t1, r)
541 dat.setSeparate(t1, true)
Akron8ef408b2021-08-02 22:11:04 +0200542 }
543 } else {
Akron740f3d72021-08-03 12:12:34 +0200544 // Store a final transition
Akron3fdfec62021-08-04 11:40:10 +0200545 dat.setCheck(dat.getBase(t)+uint32(dat.final), t)
Akron8ef408b2021-08-02 22:11:04 +0200546 }
547 }
548 }
549
550 // Following Mizobuchi et al (2000) the size of the
551 // FSA should be stored in check(1).
Akron3fdfec62021-08-04 11:40:10 +0200552 dat.setSize(dat.maxSize + 1)
Akronf2120ca2021-08-03 16:26:41 +0200553 dat.array = dat.array[:dat.maxSize+1]
554 return dat
Akron8ef408b2021-08-02 22:11:04 +0200555}
556
Akron8ef408b2021-08-02 22:11:04 +0200557// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200558// exists and return this as a representative.
559// Currently iterates through the whole table
560// in a bruteforce manner.
Akron3fdfec62021-08-04 11:40:10 +0200561func in_table(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200562 for x := 0; x < size; x++ {
563 if table[x].source == s {
564 return table[x].target
565 }
566 }
567 return 0
568}
569
Akron64ffd9a2021-08-03 19:55:21 +0200570// Resize double array when necessary
571func (dat *DaTokenizer) resize(l int) {
572 // TODO:
573 // This is a bit too aggressive atm and should be calmed down.
574 if len(dat.array) <= l {
Akron3fdfec62021-08-04 11:40:10 +0200575 dat.array = append(dat.array, make([]uint32, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200576 }
Akron64ffd9a2021-08-03 19:55:21 +0200577}
Akronc9d84a62021-08-03 15:56:03 +0200578
Akron64ffd9a2021-08-03 19:55:21 +0200579// Set base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200580func (dat *DaTokenizer) setBase(p uint32, v uint32) {
581 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200582 dat.resize(l)
583 if dat.maxSize < l {
584 dat.maxSize = l
585 }
586 dat.array[p*2] = v
587}
588
Akron3fdfec62021-08-04 11:40:10 +0200589// Returns true if a state is separate pointing to a representative
590func (dat *DaTokenizer) isSeparate(p uint32) bool {
Akron2a4b9292021-08-04 15:35:22 +0200591 return dat.array[p*2]&firstBit != 0
Akron3fdfec62021-08-04 11:40:10 +0200592}
593
594// Mark a state as separate pointing to a representative
595func (dat *DaTokenizer) setSeparate(p uint32, sep bool) {
596 if sep {
Akron2a4b9292021-08-04 15:35:22 +0200597 dat.array[p*2] |= firstBit
Akron3fdfec62021-08-04 11:40:10 +0200598 } else {
Akron2a4b9292021-08-04 15:35:22 +0200599 dat.array[p*2] &= (restBit | secondBit)
Akron3fdfec62021-08-04 11:40:10 +0200600 }
601}
602
Akron83e75a22021-08-04 13:14:06 +0200603// Returns true if a state is the target of a nontoken transition
604func (dat *DaTokenizer) isNonToken(p uint32) bool {
Akron2a4b9292021-08-04 15:35:22 +0200605 return dat.array[p*2+1]&firstBit != 0
Akron83e75a22021-08-04 13:14:06 +0200606}
607
608// Mark a state as being the target of a nontoken transition
609func (dat *DaTokenizer) setNonToken(p uint32, sep bool) {
610 if sep {
Akron2a4b9292021-08-04 15:35:22 +0200611 dat.array[p*2+1] |= firstBit
Akron83e75a22021-08-04 13:14:06 +0200612 } else {
Akron2a4b9292021-08-04 15:35:22 +0200613 dat.array[p*2+1] &= (restBit | secondBit)
Akron83e75a22021-08-04 13:14:06 +0200614 }
615}
616
Akron84d68e62021-08-04 17:06:52 +0200617// Returns true if a state is the target of a tokenend transition
618func (dat *DaTokenizer) isTokenEnd(p uint32) bool {
619 return dat.array[p*2+1]&secondBit != 0
620}
621
622// Mark a state as being the target of a tokenend transition
623func (dat *DaTokenizer) setTokenEnd(p uint32, sep bool) {
624 if sep {
625 dat.array[p*2+1] |= secondBit
626 } else {
627 dat.array[p*2+1] &= (restBit | firstBit)
628 }
629}
630
Akron64ffd9a2021-08-03 19:55:21 +0200631// Get base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200632func (dat *DaTokenizer) getBase(p uint32) uint32 {
633 if int(p*2) >= len(dat.array) {
Akron64ffd9a2021-08-03 19:55:21 +0200634 return 0
635 }
Akron3fdfec62021-08-04 11:40:10 +0200636 return dat.array[p*2] & restBit
Akron64ffd9a2021-08-03 19:55:21 +0200637}
638
639// Set check value in double array
Akron3fdfec62021-08-04 11:40:10 +0200640func (dat *DaTokenizer) setCheck(p uint32, v uint32) {
641 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200642 dat.resize(l)
643 if dat.maxSize < l {
644 dat.maxSize = l
645 }
646 dat.array[(p*2)+1] = v
647}
648
649// Get check value in double array
Akron3fdfec62021-08-04 11:40:10 +0200650func (dat *DaTokenizer) getCheck(p uint32) uint32 {
651 if int((p*2)+1) >= len(dat.array) {
Akron64ffd9a2021-08-03 19:55:21 +0200652 return 0
653 }
Akron3fdfec62021-08-04 11:40:10 +0200654 return dat.array[(p*2)+1] & restBit
Akron64ffd9a2021-08-03 19:55:21 +0200655}
656
657// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200658func (dat *DaTokenizer) setSize(v int) {
659 dat.setCheck(1, uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200660}
661
662// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200663func (dat *DaTokenizer) GetSize() int {
664 return int(dat.getCheck(1))
Akron8ef408b2021-08-02 22:11:04 +0200665}
666
667// Based on Mizobuchi et al (2000), p. 124
668// This iterates for every state through the complete double array
669// structure until it finds a gap that fits all outgoing transitions
670// of the state. This is extremely slow, but is only necessary in the
671// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200672func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200673
674 // Start at the first entry of the double array list
Akron3f8571a2021-08-05 11:18:10 +0200675 base := uint32(1) // dat.lastFilledBase
676 // skip := false
Akron8ef408b2021-08-02 22:11:04 +0200677OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200678
Akron3f8571a2021-08-05 11:18:10 +0200679 /*
680 if !skip {
681 if dat.getCheck(base) != 0 {
682 dat.lastFilledBase = base
683 } else {
684 skip = true
685 }
686 }
687 */
688
Akron740f3d72021-08-03 12:12:34 +0200689 // Resize the array if necessary
Akron3fdfec62021-08-04 11:40:10 +0200690 dat.resize((int(base) + dat.final) * 2)
Akron8ef408b2021-08-02 22:11:04 +0200691 for _, a := range symbols {
Akron3fdfec62021-08-04 11:40:10 +0200692 if dat.getCheck(base+uint32(a)) != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200693 base++
694 goto OVERLAP
695 }
696 }
Akron8ef408b2021-08-02 22:11:04 +0200697 return base
698}
699
Akron03a3c612021-08-04 11:51:27 +0200700// LoadFactor as defined in Kanda et al (2018),
701// i.e. the proportion of non-empty elements to all elements.
702func (dat *DaTokenizer) LoadFactor() float64 {
Akrond66a9262021-08-03 17:09:09 +0200703
Akron03a3c612021-08-04 11:51:27 +0200704 // Cache the loadfactor
Akron3f8571a2021-08-05 11:18:10 +0200705 if dat.loadFactor > 0 {
Akron03a3c612021-08-04 11:51:27 +0200706 return dat.loadFactor
Akron773b1ef2021-08-03 17:37:20 +0200707 }
Akrond66a9262021-08-03 17:09:09 +0200708 nonEmpty := 0
709 all := len(dat.array) / 2
710 for x := 1; x <= len(dat.array); x = x + 2 {
711 if dat.array[x] != 0 {
712 nonEmpty++
713 }
714 }
Akron03a3c612021-08-04 11:51:27 +0200715 dat.loadFactor = float64(nonEmpty) / float64(all) * 100
716 return dat.loadFactor
Akrond66a9262021-08-03 17:09:09 +0200717}
718
Akron6247a5d2021-08-03 19:18:28 +0200719// WriteTo stores the double array data in an io.Writer.
720func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
721
722 // Store magical header
723 all, err := w.Write([]byte(MAGIC))
724 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200725 log.Error().Err(err)
726 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200727 }
728
729 // Get sigma as a list
730 sigmalist := make([]rune, len(dat.sigma)+16)
731 max := 0
732 for sym, num := range dat.sigma {
733 sigmalist[num] = sym
734 if num > max {
735 max = num
736 }
737 }
738
739 sigmalist = sigmalist[:max+1]
740
Akron3f8571a2021-08-05 11:18:10 +0200741 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200742 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200743 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
744 bo.PutUint16(buf[4:6], uint16(dat.unknown))
745 bo.PutUint16(buf[6:8], uint16(dat.identity))
746 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200747 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akron3f8571a2021-08-05 11:18:10 +0200748 bo.PutUint32(buf[12:16], uint32(len(dat.array)))
749 more, err := w.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200750 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200751 log.Error().Err(err)
752 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200753 }
754
755 all += more
756
757 wbuf := bytes.NewBuffer(nil)
758 wbufWrap := bufio.NewWriter(wbuf)
759
760 // Write sigma
761 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200762
Akron6247a5d2021-08-03 19:18:28 +0200763 more, err = wbufWrap.WriteRune(sym)
764 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200765 log.Error().Err(err)
766 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200767 }
768 all += more
769 }
770 wbufWrap.Flush()
771 more, err = w.Write(wbuf.Bytes())
772 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200773 log.Error().Err(err)
774 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200775 }
776 all += more
777
778 // Test marker - could be checksum
779 more, err = w.Write([]byte("T"))
780 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200781 log.Error().Err(err)
782 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200783 }
784 all += more
785
786 wbuf.Reset()
787
788 for _, d := range dat.array {
Akron3fdfec62021-08-04 11:40:10 +0200789 bo.PutUint32(buf[0:4], d)
Akron6247a5d2021-08-03 19:18:28 +0200790 more, err := w.Write(buf[0:4])
791 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200792 log.Error().Err(err)
793 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200794 }
795 all += more
796 }
797
798 return int64(all), err
799}
800
Akron3f8571a2021-08-05 11:18:10 +0200801func LoadDatokFile(file string) *DaTokenizer {
802 f, err := os.Open(file)
803 if err != nil {
804 log.Error().Err(err)
805 os.Exit(0)
806 }
807 defer f.Close()
808
809 gz, err := gzip.NewReader(f)
810 if err != nil {
811 log.Error().Err(err)
812 os.Exit(0)
813 }
814 defer gz.Close()
815
816 return ParseDatok(gz)
817}
818
819func ParseDatok(ior io.Reader) *DaTokenizer {
820
821 dat := &DaTokenizer{
822 sigma: make(map[rune]int),
823 epsilon: 0,
824 unknown: 0,
825 identity: 0,
826 final: 0,
827 loadFactor: 0,
828 }
829
830 r := bufio.NewReader(ior)
831
832 all := 0
833
834 buf := make([]byte, 1024)
835 buf = buf[0:len(MAGIC)]
836
837 more, err := r.Read(buf)
838
839 if err != nil {
840 log.Error().Err(err)
841 return nil
842 }
843
844 all += more
845
846 if string(MAGIC) != string(buf) {
847 log.Error().Msg("Not a datok file")
848 return nil
849 }
850
851 more, err = r.Read(buf[0:16])
852 if err != nil {
853 log.Error().Err(err)
854 return nil
855 }
856
857 all += more
858
859 // version := bo.Uint16(buf[0:2])
860 dat.epsilon = int(bo.Uint16(buf[2:4]))
861 dat.unknown = int(bo.Uint16(buf[4:6]))
862 dat.identity = int(bo.Uint16(buf[6:8]))
863 dat.final = int(bo.Uint16(buf[8:10]))
864
865 sigmaCount := int(bo.Uint16(buf[10:12]))
866 arraySize := int(bo.Uint32(buf[12:16]))
867
868 for x := 0; x < sigmaCount; x++ {
869 sym, more, err := r.ReadRune()
870 if err == nil && sym != 0 {
871 dat.sigma[sym] = x
872 }
873 all += more
874 }
875
876 more, err = r.Read(buf[0:1])
877
878 if err != nil {
879 log.Error().Err(err)
880 return nil
881 }
882
883 all += more
884
885 if string("T") != string(buf[0:1]) {
886 log.Error().Msg("Not a datok file")
887 return nil
888 }
889
890 // Read based on length
891 dat.array = make([]uint32, arraySize)
892
893 for x := 0; x < arraySize; x++ {
894 more, err = r.Read(buf[0:4])
895 if err != nil {
896 log.Error().Err(err)
897 return nil
898 }
899 all += more
900 dat.array[x] = bo.Uint32(buf[0:4])
901 }
902
903 return dat
904}
905
Akron740f3d72021-08-03 12:12:34 +0200906// Match an input string against the double array
907// FSA.
908//
909// Based on Mizobuchi et al (2000), p. 129,
910// with additional support for IDENTITY, UNKNOWN
911// and EPSILON transitions.
Akron64ffd9a2021-08-03 19:55:21 +0200912func (dat *DaTokenizer) Match(input string) bool {
Akron465a0992021-08-03 11:28:48 +0200913 var a int
Akron3fdfec62021-08-04 11:40:10 +0200914 var tu uint32
Akron465a0992021-08-03 11:28:48 +0200915 var ok bool
916
Akron3fdfec62021-08-04 11:40:10 +0200917 t := uint32(1) // Initial state
Akron740f3d72021-08-03 12:12:34 +0200918 chars := []rune(input)
919 i := 0
920
Akron49d27ee2021-08-03 11:58:13 +0200921 for i < len(chars) {
Akron64ffd9a2021-08-03 19:55:21 +0200922 a, ok = dat.sigma[chars[i]]
Akron730a79c2021-08-03 11:05:29 +0200923
Akron740f3d72021-08-03 12:12:34 +0200924 // Support identity symbol if character is not in sigma
Akron64ffd9a2021-08-03 19:55:21 +0200925 if !ok && dat.identity != -1 {
Akron740f3d72021-08-03 12:12:34 +0200926 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200927 fmt.Println("IDENTITY symbol", string(chars[i]), "->", dat.identity)
Akron740f3d72021-08-03 12:12:34 +0200928 }
Akron64ffd9a2021-08-03 19:55:21 +0200929 a = dat.identity
Akron740f3d72021-08-03 12:12:34 +0200930 } else if DEBUG {
Akron49d27ee2021-08-03 11:58:13 +0200931 fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
Akron730a79c2021-08-03 11:05:29 +0200932 }
Akron465a0992021-08-03 11:28:48 +0200933 tu = t
Akron730a79c2021-08-03 11:05:29 +0200934 CHECK:
Akron3fdfec62021-08-04 11:40:10 +0200935 t = dat.getBase(tu) + uint32(a)
Akron730a79c2021-08-03 11:05:29 +0200936
Akron740f3d72021-08-03 12:12:34 +0200937 // Check if the transition is valid according to the double array
Akron64ffd9a2021-08-03 19:55:21 +0200938 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +0200939
940 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200941 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron730a79c2021-08-03 11:05:29 +0200942 }
Akron740f3d72021-08-03 12:12:34 +0200943
Akron64ffd9a2021-08-03 19:55:21 +0200944 if !ok && a == dat.identity {
Akron740f3d72021-08-03 12:12:34 +0200945 // Try again with unknown symbol, in case identity failed
946 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200947 fmt.Println("UNKNOWN symbol", string(chars[i]), "->", dat.unknown)
Akron740f3d72021-08-03 12:12:34 +0200948 }
Akron64ffd9a2021-08-03 19:55:21 +0200949 a = dat.unknown
Akron740f3d72021-08-03 12:12:34 +0200950
Akron64ffd9a2021-08-03 19:55:21 +0200951 } else if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +0200952 // Try again with epsilon symbol, in case everything else failed
953 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200954 fmt.Println("EPSILON symbol", string(chars[i]), "->", dat.epsilon)
Akron740f3d72021-08-03 12:12:34 +0200955 }
Akron64ffd9a2021-08-03 19:55:21 +0200956 a = dat.epsilon
Akron740f3d72021-08-03 12:12:34 +0200957 } else {
958 break
959 }
960 goto CHECK
Akron3fdfec62021-08-04 11:40:10 +0200961 } else if dat.isSeparate(t) {
Akron730a79c2021-08-03 11:05:29 +0200962 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +0200963 t = dat.getBase(t)
Akron8ef408b2021-08-02 22:11:04 +0200964 }
Akron740f3d72021-08-03 12:12:34 +0200965
966 // Transition is fine
Akron64ffd9a2021-08-03 19:55:21 +0200967 if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +0200968 // Character consumed
Akron49d27ee2021-08-03 11:58:13 +0200969 i++
970 }
Akron83e75a22021-08-04 13:14:06 +0200971
Akron740f3d72021-08-03 12:12:34 +0200972 // TODO:
973 // Prevent endless epsilon loops!
Akron8ef408b2021-08-02 22:11:04 +0200974 }
975
Akron740f3d72021-08-03 12:12:34 +0200976 if i != len(chars) {
977 if DEBUG {
978 fmt.Println("Not at the end")
979 }
Akron8ef408b2021-08-02 22:11:04 +0200980 return false
981 }
982
Akron465a0992021-08-03 11:28:48 +0200983FINALCHECK:
Akron740f3d72021-08-03 12:12:34 +0200984
985 // Automaton is in a final state
Akron3fdfec62021-08-04 11:40:10 +0200986 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
Akron8ef408b2021-08-02 22:11:04 +0200987 return true
988 }
Akron465a0992021-08-03 11:28:48 +0200989
Akron740f3d72021-08-03 12:12:34 +0200990 // Check epsilon transitions until a final state is reached
Akron465a0992021-08-03 11:28:48 +0200991 tu = t
Akron3fdfec62021-08-04 11:40:10 +0200992 t = dat.getBase(tu) + uint32(dat.epsilon)
Akron465a0992021-08-03 11:28:48 +0200993
Akron740f3d72021-08-03 12:12:34 +0200994 // Epsilon transition failed
Akron64ffd9a2021-08-03 19:55:21 +0200995 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +0200996 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200997 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron740f3d72021-08-03 12:12:34 +0200998 }
Akron465a0992021-08-03 11:28:48 +0200999 return false
Akron740f3d72021-08-03 12:12:34 +02001000
Akron3fdfec62021-08-04 11:40:10 +02001001 } else if dat.isSeparate(t) {
Akron465a0992021-08-03 11:28:48 +02001002 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +02001003 t = dat.getBase(t)
Akron465a0992021-08-03 11:28:48 +02001004 }
Akron740f3d72021-08-03 12:12:34 +02001005
Akron465a0992021-08-03 11:28:48 +02001006 goto FINALCHECK
Akron8ef408b2021-08-02 22:11:04 +02001007}
Akron068874c2021-08-04 15:19:56 +02001008
Akron84d68e62021-08-04 17:06:52 +02001009// Transduce an input string against the double array
Akron068874c2021-08-04 15:19:56 +02001010// FSA.
1011//
1012// Based on Match with additional support
Akron84d68e62021-08-04 17:06:52 +02001013// for NONTOKEN and TOKENEND handling
Akron3f8571a2021-08-05 11:18:10 +02001014func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron068874c2021-08-04 15:19:56 +02001015 var a int
1016 var tu uint32
Akron84d68e62021-08-04 17:06:52 +02001017 var ok, nontoken, tokenend bool
Akron068874c2021-08-04 15:19:56 +02001018
Akron3f8571a2021-08-05 11:18:10 +02001019 reader := bufio.NewReader(r)
1020 writer := bufio.NewWriter(w)
1021 defer writer.Flush()
Akron068874c2021-08-04 15:19:56 +02001022
Akron3f8571a2021-08-05 11:18:10 +02001023 t := uint32(1) // Initial state
Akron3f8571a2021-08-05 11:18:10 +02001024 skip := false
1025
1026 var char rune
1027 var err error
1028 eof := false
1029
1030 for {
1031
1032 if !skip {
1033 char, _, err = reader.ReadRune()
1034 if err != nil {
1035 eof = true
1036 break
1037 }
1038 }
1039 skip = false
1040
1041 a, ok = dat.sigma[char]
Akron068874c2021-08-04 15:19:56 +02001042
1043 // Support identity symbol if character is not in sigma
1044 if !ok && dat.identity != -1 {
1045 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001046 fmt.Println("IDENTITY symbol", string(char), "->", dat.identity)
Akron068874c2021-08-04 15:19:56 +02001047 }
1048 a = dat.identity
1049 } else if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001050 fmt.Println("Sigma transition is okay for [", string(char), "]")
Akron068874c2021-08-04 15:19:56 +02001051 }
1052 tu = t
1053 CHECK:
1054 nontoken = false
Akron84d68e62021-08-04 17:06:52 +02001055 tokenend = false
1056
Akron068874c2021-08-04 15:19:56 +02001057 t = dat.getBase(tu) + uint32(a)
1058
Akron524c5432021-08-05 14:14:27 +02001059 if DEBUG {
1060 fmt.Println("Check", tu, "-", a, "(", string(char), ")", "->", t)
1061 }
1062
Akron068874c2021-08-04 15:19:56 +02001063 // Check if the transition is valid according to the double array
1064 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
1065
1066 if DEBUG {
1067 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
1068 }
1069
1070 if !ok && a == dat.identity {
1071 // Try again with unknown symbol, in case identity failed
1072 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001073 fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +02001074 }
1075 a = dat.unknown
1076
1077 } else if a != dat.epsilon {
1078 // Try again with epsilon symbol, in case everything else failed
1079 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001080 fmt.Println("EPSILON symbol", string(char), "->", dat.epsilon)
Akron068874c2021-08-04 15:19:56 +02001081 }
1082 a = dat.epsilon
1083 } else {
1084 break
1085 }
1086 goto CHECK
1087 } else if dat.isSeparate(t) {
1088 // Move to representative state
1089 nontoken = dat.isNonToken(t)
Akron84d68e62021-08-04 17:06:52 +02001090 tokenend = dat.isTokenEnd(t)
Akron068874c2021-08-04 15:19:56 +02001091
1092 t = dat.getBase(t)
Akron84d68e62021-08-04 17:06:52 +02001093
Akron524c5432021-08-05 14:14:27 +02001094 if DEBUG {
1095 fmt.Println("Representative pointing to", t)
1096 }
1097
Akron068874c2021-08-04 15:19:56 +02001098 } else {
1099 nontoken = dat.isNonToken(t)
Akron84d68e62021-08-04 17:06:52 +02001100 tokenend = dat.isTokenEnd(t)
Akron068874c2021-08-04 15:19:56 +02001101 }
1102
1103 // Transition is fine
Akron3f8571a2021-08-05 11:18:10 +02001104 if a == dat.epsilon {
1105 skip = true
Akron068874c2021-08-04 15:19:56 +02001106 // Character consumed
Akron3f8571a2021-08-05 11:18:10 +02001107 } else if !nontoken {
1108 writer.WriteRune(char)
1109 }
Akron068874c2021-08-04 15:19:56 +02001110
Akron524c5432021-08-05 14:14:27 +02001111 if DEBUG {
1112 fmt.Println(" --> ok!")
1113 }
1114
Akron3f8571a2021-08-05 11:18:10 +02001115 /*
1116 if nontoken {
1117 writer.WriteRune(("<|>")
Akron068874c2021-08-04 15:19:56 +02001118 }
Akron3f8571a2021-08-05 11:18:10 +02001119 */
Akron068874c2021-08-04 15:19:56 +02001120
Akron84d68e62021-08-04 17:06:52 +02001121 if tokenend {
Akron3f8571a2021-08-05 11:18:10 +02001122 writer.WriteRune('\n')
Akron84d68e62021-08-04 17:06:52 +02001123 }
1124
Akron068874c2021-08-04 15:19:56 +02001125 // TODO:
1126 // Prevent endless epsilon loops!
1127 }
1128
Akron3f8571a2021-08-05 11:18:10 +02001129 if !eof {
Akron068874c2021-08-04 15:19:56 +02001130 if DEBUG {
1131 fmt.Println("Not at the end")
1132 }
1133 return false
1134 }
1135
1136FINALCHECK:
1137
1138 // Automaton is in a final state
1139 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
Akron3f8571a2021-08-05 11:18:10 +02001140 /*
1141 if dat.isNonToken(t) {
1142 fmt.Print("<|>")
1143 }
1144 */
Akron84d68e62021-08-04 17:06:52 +02001145 if dat.isTokenEnd(t) {
Akron3f8571a2021-08-05 11:18:10 +02001146 writer.WriteRune('\n')
Akron84d68e62021-08-04 17:06:52 +02001147 }
1148
1149 // There may be a new line at the end, from an epsilon, so we go on!
Akron068874c2021-08-04 15:19:56 +02001150 return true
1151 }
1152
1153 // Check epsilon transitions until a final state is reached
1154 tu = t
1155 t = dat.getBase(tu) + uint32(dat.epsilon)
1156
1157 // Epsilon transition failed
1158 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
1159 if DEBUG {
1160 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
1161 }
1162 return false
1163
1164 } else if dat.isSeparate(t) {
Akron3f8571a2021-08-05 11:18:10 +02001165 // nontoken = dat.isNonToken(t)
1166 tokenend = dat.isTokenEnd(t)
1167
Akron068874c2021-08-04 15:19:56 +02001168 // Move to representative state
1169 t = dat.getBase(t)
1170 } else {
Akron3f8571a2021-08-05 11:18:10 +02001171 tokenend = dat.isTokenEnd(t)
1172 // nontoken = dat.isNonToken(t)
Akron068874c2021-08-04 15:19:56 +02001173 }
1174
Akron3f8571a2021-08-05 11:18:10 +02001175 /*
1176 if nontoken {
1177 fmt.Print("<|>")
1178 }
1179 */
1180
1181 if tokenend {
1182 writer.WriteRune('\n')
Akron068874c2021-08-04 15:19:56 +02001183 }
1184
1185 goto FINALCHECK
1186}