blob: 2cdac27ddb428bea56e9c3bfa4460ab168f8ac4e [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akron83e75a22021-08-04 13:14:06 +02009// The maximum number of states is 107.3741.823 (30bit),
10// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
13
Akron740f3d72021-08-03 12:12:34 +020014// TODO:
15// - replace maxSize with the check value
16// - Strip first state and make everything start with 0!
Akron3a063ef2021-08-05 19:36:35 +020017// - Add checksum to serialization.
Akron740f3d72021-08-03 12:12:34 +020018
Akron8ef408b2021-08-02 22:11:04 +020019import (
20 "bufio"
21 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020022 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020023 "fmt"
24 "io"
25 "os"
Akronc9d84a62021-08-03 15:56:03 +020026 "sort"
Akron8ef408b2021-08-02 22:11:04 +020027 "strconv"
28 "strings"
29 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020030
31 "github.com/rs/zerolog/log"
Akron8ef408b2021-08-02 22:11:04 +020032)
33
34const (
Akron2a4b9292021-08-04 15:35:22 +020035 PROPS = 1
36 SIGMA = 2
37 STATES = 3
38 NONE = 4
39 NEWLINE = '\u000a'
40 DEBUG = false
41 MAGIC = "DATOK"
42 VERSION = uint16(1)
43 firstBit uint32 = 1 << 31
44 secondBit uint32 = 1 << 30
45 restBit uint32 = ^uint32(0) &^ (firstBit | secondBit)
Akron8ef408b2021-08-02 22:11:04 +020046)
47
Akron6247a5d2021-08-03 19:18:28 +020048var bo binary.ByteOrder = binary.LittleEndian
49
Akron8ef408b2021-08-02 22:11:04 +020050type mapping struct {
51 source int
Akron3fdfec62021-08-04 11:40:10 +020052 target uint32
Akron8ef408b2021-08-02 22:11:04 +020053}
54
55type edge struct {
Akron83e75a22021-08-04 13:14:06 +020056 inSym int
57 outSym int
58 end int
59 nontoken bool
60 tokenend bool
Akron8ef408b2021-08-02 22:11:04 +020061}
62
63type Tokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020064 // sigma map[rune]int
Akron740f3d72021-08-03 12:12:34 +020065 sigmaRev map[int]rune
66 arcCount int
67 stateCount int
68 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020069 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020070
71 // Special symbols in sigma
72 epsilon int
73 unknown int
74 identity int
75 final int
Akron8ef408b2021-08-02 22:11:04 +020076}
77
Akronf2120ca2021-08-03 16:26:41 +020078type DaTokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020079 // sigmaRev map[int]rune
Akron03a3c612021-08-04 11:51:27 +020080 sigma map[rune]int
81 maxSize int
82 loadFactor float64
83 array []uint32
Akron3f8571a2021-08-05 11:18:10 +020084 // lastFilledBase uint32
Akronc17f1ca2021-08-03 19:47:27 +020085
86 // Special symbols in sigma
87 epsilon int
88 unknown int
89 identity int
90 final int
Akronf2120ca2021-08-03 16:26:41 +020091}
92
Akron64ffd9a2021-08-03 19:55:21 +020093func LoadFomaFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +020094 f, err := os.Open(file)
95 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020096 log.Error().Err(err)
97 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020098 }
99 defer f.Close()
100
101 gz, err := gzip.NewReader(f)
102 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200103 log.Error().Err(err)
104 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200105 }
106 defer gz.Close()
107
Akron3fdfec62021-08-04 11:40:10 +0200108 return ParseFoma(gz)
Akron8ef408b2021-08-02 22:11:04 +0200109}
110
Akron3fdfec62021-08-04 11:40:10 +0200111func ParseFoma(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200112 r := bufio.NewReader(ior)
113
114 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200115 sigmaRev: make(map[int]rune),
Akronc17f1ca2021-08-03 19:47:27 +0200116 epsilon: -1,
117 unknown: -1,
118 identity: -1,
119 final: -1,
Akron8ef408b2021-08-02 22:11:04 +0200120 }
121
Akron740f3d72021-08-03 12:12:34 +0200122 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200123
124 mode := 0
125 var elem []string
126 var elemint [5]int
127
128 for {
129 line, err := r.ReadString('\n')
130 if err != nil {
131 if err == io.EOF {
132 break
133 }
Akron740f3d72021-08-03 12:12:34 +0200134 log.Error().Err(err)
135 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200136 }
137 if strings.HasPrefix(line, "##foma-net") {
138 continue
139 }
140 if strings.HasPrefix(line, "##props##") {
141 mode = PROPS
142 continue
143 }
144 if strings.HasPrefix(line, "##states##") {
145 mode = STATES
146
147 // Adds a final transition symbol to sigma
148 // written as '#' in Mizobuchi et al (2000)
Akron740f3d72021-08-03 12:12:34 +0200149 tok.sigmaCount++
Akronc17f1ca2021-08-03 19:47:27 +0200150 tok.final = tok.sigmaCount
Akron8ef408b2021-08-02 22:11:04 +0200151 continue
152 }
153 if strings.HasPrefix(line, "##sigma##") {
154 mode = SIGMA
155 continue
156 }
157 if strings.HasPrefix(line, "##end##") {
158 mode = NONE
159 continue
160 }
161
162 switch mode {
163 case PROPS:
164 {
165 elem = strings.Split(line, " ")
166 /*
167 fmt.Println("arity: " + elem[0])
168 fmt.Println("arccount: " + elem[1])
169 fmt.Println("statecount: " + elem[2])
170 fmt.Println("linecount: " + elem[3])
171 fmt.Println("finalcount: " + elem[4])
172 fmt.Println("pathcount: " + elem[5])
173 fmt.Println("is_deterministic: " + elem[6])
174 fmt.Println("is_pruned: " + elem[7])
175 fmt.Println("is_minimized: " + elem[8])
176 fmt.Println("is_epsilon_free: " + elem[9])
177 fmt.Println("is_loop_free: " + elem[10])
178 fmt.Println("extras: " + elem[11])
179 fmt.Println("name: " + elem[12])
180 */
181 if elem[6] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200182 log.Error().Msg("The FST needs to be deterministic")
183 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200184 }
185 if elem[9] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200186 log.Error().Msg("The FST needs to be epsilon free")
187 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200188 }
189
190 elemint[0], err = strconv.Atoi(elem[1])
191 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200192 log.Error().Msg("Can't read arccount")
193 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200194 }
Akron740f3d72021-08-03 12:12:34 +0200195 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200196
197 // States start at 1 in Mizobuchi et al (2000),
198 // as the state 0 is associated with a fail.
199 // Initialize states and transitions
200 elemint[0], err = strconv.Atoi(elem[2])
201 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200202 log.Error().Msg("Can't read statecount")
203 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200204 }
Akron740f3d72021-08-03 12:12:34 +0200205 tok.stateCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200206 tok.transitions = make([]map[int]*edge, elemint[0]+1)
207 continue
208 }
209 case STATES:
210 {
211 elem = strings.Split(line[0:len(line)-1], " ")
212 if elem[0] == "-1" {
213 continue
214 }
215 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200216 if err != nil {
217 break
218 }
Akron8ef408b2021-08-02 22:11:04 +0200219
220 if len(elem) > 1 {
221 elemint[1], err = strconv.Atoi(elem[1])
222 if err != nil {
223 break
224 }
225 if len(elem) > 2 {
226 elemint[2], err = strconv.Atoi(elem[2])
227 if err != nil {
228 break
229 }
230 if len(elem) > 3 {
231 elemint[3], err = strconv.Atoi(elem[3])
232 if err != nil {
233 break
234 }
235 if len(elem) > 4 {
236 elemint[4], err = strconv.Atoi(elem[4])
237 if err != nil {
238 break
239 }
240 }
241 }
242 }
243 }
244
245 switch len(elem) {
246 case 5:
247 {
Akron740f3d72021-08-03 12:12:34 +0200248 state = elemint[0]
249 inSym = elemint[1]
250 outSym = elemint[2]
251 end = elemint[3]
252 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200253 }
254 case 4:
255 {
256 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200257 state = elemint[0]
258 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200259 } else {
Akron740f3d72021-08-03 12:12:34 +0200260 state = elemint[0]
261 inSym = elemint[1]
262 end = elemint[2]
263 final = elemint[3]
264 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200265 }
266 }
267 case 3:
268 {
Akron740f3d72021-08-03 12:12:34 +0200269 inSym = elemint[0]
270 outSym = elemint[1]
271 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200272 }
273 case 2:
274 {
Akron740f3d72021-08-03 12:12:34 +0200275 inSym = elemint[0]
276 end = elemint[1]
277 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200278 }
279 }
280
Akron8ef408b2021-08-02 22:11:04 +0200281 // While the states in foma start with 0, the states in the
282 // Mizobuchi FSA start with one - so we increase every state by 1.
283
Akron83e75a22021-08-04 13:14:06 +0200284 nontoken := false
285 tokenend := false
286
Akron524c5432021-08-05 14:14:27 +0200287 // ID needs to be > 1
288 inSym++
289 outSym++
290
Akron740f3d72021-08-03 12:12:34 +0200291 if inSym != outSym {
292
Akron83e75a22021-08-04 13:14:06 +0200293 if tok.sigmaRev[outSym] == NEWLINE {
294 tokenend = true
295 } else if outSym == tok.epsilon {
296 nontoken = true
297 } else {
Akron740f3d72021-08-03 12:12:34 +0200298 log.Error().Msg(
299 "Unsupported transition: " +
300 strconv.Itoa(state) +
301 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200302 " (" +
Akron740f3d72021-08-03 12:12:34 +0200303 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200304 ":" +
Akron740f3d72021-08-03 12:12:34 +0200305 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200306 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200307 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200308 ":" +
Akron740f3d72021-08-03 12:12:34 +0200309 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200310 ")")
Akron740f3d72021-08-03 12:12:34 +0200311 os.Exit(1)
Akron75ebe7f2021-08-03 10:34:10 +0200312 }
Akron83e75a22021-08-04 13:14:06 +0200313
Akron83e75a22021-08-04 13:14:06 +0200314 } else if inSym == tok.epsilon {
Akron068874c2021-08-04 15:19:56 +0200315 log.Error().Msg("Epsilon transitions not supported")
316 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200317 }
318
Akron740f3d72021-08-03 12:12:34 +0200319 // This collects all edges until arrstate changes
320
Akron8ef408b2021-08-02 22:11:04 +0200321 // TODO:
322 // if arrin == EPSILON && arrout == TOKENEND, mark state as newline
323 // if the next transition is the same, remove TOKENEND and add SENTENCEEND
324 // This requires to remove the transition alltogether and marks the state instead.
325
326 // TODO:
327 // if arrout == EPSILON, mark the transition as NOTOKEN
328
329 targetObj := &edge{
Akron83e75a22021-08-04 13:14:06 +0200330 inSym: inSym,
331 outSym: outSym,
332 end: end + 1,
333 tokenend: tokenend,
334 nontoken: nontoken,
Akron8ef408b2021-08-02 22:11:04 +0200335 }
336
Akron740f3d72021-08-03 12:12:34 +0200337 // Initialize outgoing states
338 if tok.transitions[state+1] == nil {
339 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200340 }
341
Akron740f3d72021-08-03 12:12:34 +0200342 // Ignore transitions with invalid symbols
343 if inSym >= 0 {
344 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200345 }
Akron8ef408b2021-08-02 22:11:04 +0200346
Akron740f3d72021-08-03 12:12:34 +0200347 // Add final transition
348 if final == 1 {
Akronc17f1ca2021-08-03 19:47:27 +0200349 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200350 }
351
Akron740f3d72021-08-03 12:12:34 +0200352 if DEBUG {
353 fmt.Println("Add",
354 state+1, "->", end+1,
355 "(",
356 inSym,
357 ":",
358 outSym,
359 ") (",
360 string(tok.sigmaRev[inSym]),
361 ":",
362 string(tok.sigmaRev[outSym]),
Akron524c5432021-08-05 14:14:27 +0200363 ")",
364 ";",
365 "TE:", tokenend,
366 "NT:", nontoken)
Akron740f3d72021-08-03 12:12:34 +0200367 }
Akron75ebe7f2021-08-03 10:34:10 +0200368
Akron8ef408b2021-08-02 22:11:04 +0200369 continue
370 }
371 case SIGMA:
372 {
373 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
374
375 // Turn string into sigma id
376 number, err := strconv.Atoi(elem[0])
377
Akron524c5432021-08-05 14:14:27 +0200378 // ID needs to be > 1
379 number++
380
Akron8ef408b2021-08-02 22:11:04 +0200381 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200382 log.Error().Err(err)
383 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200384 }
385
Akron740f3d72021-08-03 12:12:34 +0200386 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200387
388 var symbol rune
389
390 // Read rune
391 if utf8.RuneCountInString(elem[1]) == 1 {
392 symbol = []rune(elem[1])[0]
393
394 // Probably a MCS
395 } else if utf8.RuneCountInString(elem[1]) > 1 {
396 switch elem[1] {
397 case "@_EPSILON_SYMBOL_@":
398 {
Akronc17f1ca2021-08-03 19:47:27 +0200399 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200400 continue
401 }
402 case "@_UNKNOWN_SYMBOL_@":
403 {
Akronc17f1ca2021-08-03 19:47:27 +0200404 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200405 continue
406 }
407
408 case "@_IDENTITY_SYMBOL_@":
409 {
Akronc17f1ca2021-08-03 19:47:27 +0200410 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200411 continue
412 }
413 default:
Akron740f3d72021-08-03 12:12:34 +0200414 {
415 log.Error().Msg("MCS not supported: " + line)
416 os.Exit(1)
417 }
Akron8ef408b2021-08-02 22:11:04 +0200418 }
419
Akron740f3d72021-08-03 12:12:34 +0200420 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200421 line, err = r.ReadString('\n')
422 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200423 log.Error().Err(err)
424 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200425 }
426 if len(line) != 1 {
Akron740f3d72021-08-03 12:12:34 +0200427 log.Error().Msg("MCS not supported:" + line)
428 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200429 }
Akron740f3d72021-08-03 12:12:34 +0200430 symbol = rune(NEWLINE)
Akron8ef408b2021-08-02 22:11:04 +0200431 }
432
Akron740f3d72021-08-03 12:12:34 +0200433 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200434 }
435 }
436 }
437
438 return tok
439}
440
Akron64ffd9a2021-08-03 19:55:21 +0200441// Set alphabet A to the list of all symbols
442// outgoing from s
443func (tok *Tokenizer) get_set(s int, A *[]int) {
444 for a := range tok.transitions[s] {
445 *A = append(*A, a)
446 }
447
448 // Not required, but simplifies bug hunting
449 sort.Ints(*A)
450}
451
Akron8ef408b2021-08-02 22:11:04 +0200452// Implementation of Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200453func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
454
455 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +0200456 sigma: make(map[rune]int),
457 loadFactor: -1,
458 final: tok.final,
459 unknown: tok.unknown,
460 identity: tok.identity,
461 epsilon: tok.epsilon,
Akron3f8571a2021-08-05 11:18:10 +0200462 // lastFilledBase: 1,
Akronf2120ca2021-08-03 16:26:41 +0200463 }
464
465 for num, sym := range tok.sigmaRev {
466 dat.sigma[sym] = num
467 }
Akron8ef408b2021-08-02 22:11:04 +0200468
469 mark := 0
470 size := 0
471
472 // Create a mapping from s to t
Akron740f3d72021-08-03 12:12:34 +0200473 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200474
475 table[size] = &mapping{source: 1, target: 1}
476 size++
477
Akron740f3d72021-08-03 12:12:34 +0200478 // Allocate space for the outgoing symbol range
479 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200480
481 for mark < size {
482 s := table[mark].source // This is a state in Ms
483 t := table[mark].target // This is a state in Mt
484 mark++
Akron740f3d72021-08-03 12:12:34 +0200485
486 // Following the paper, here the state t can be remembered
487 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200488 A = A[:0]
489 tok.get_set(s, &A)
490
Akron740f3d72021-08-03 12:12:34 +0200491 // Set base to the first free slot in the double array
Akronf2120ca2021-08-03 16:26:41 +0200492 dat.setBase(t, dat.xCheck(A))
Akron8ef408b2021-08-02 22:11:04 +0200493
Akron773b1ef2021-08-03 17:37:20 +0200494 // TODO:
Akron068874c2021-08-04 15:19:56 +0200495 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200496 // outdegree of .end
497
Akron740f3d72021-08-03 12:12:34 +0200498 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200499 for _, a := range A {
500
Akronc17f1ca2021-08-03 19:47:27 +0200501 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200502
Akron740f3d72021-08-03 12:12:34 +0200503 // Aka g(s, a)
504 s1 := tok.transitions[s][a].end
Akron8ef408b2021-08-02 22:11:04 +0200505
Akron740f3d72021-08-03 12:12:34 +0200506 // Store the transition
Akron3fdfec62021-08-04 11:40:10 +0200507 t1 := dat.getBase(t) + uint32(a)
Akronf2120ca2021-08-03 16:26:41 +0200508 dat.setCheck(t1, t)
Akron8ef408b2021-08-02 22:11:04 +0200509
Akron524c5432021-08-05 14:14:27 +0200510 if DEBUG {
511 fmt.Println("Translate transition",
512 s, "->", s1, "(", a, ")", "to", t, "->", t1)
513 }
514
Akron83e75a22021-08-04 13:14:06 +0200515 // Mark the state as being the target of a nontoken transition
516 if tok.transitions[s][a].nontoken {
Akron068874c2021-08-04 15:19:56 +0200517 dat.setNonToken(t1, true)
Akron524c5432021-08-05 14:14:27 +0200518 if DEBUG {
519 fmt.Println("Set", t1, "to nontoken")
520 }
Akron83e75a22021-08-04 13:14:06 +0200521 }
522
Akron84d68e62021-08-04 17:06:52 +0200523 // Mark the state as being the target of a tokenend transition
524 if tok.transitions[s][a].tokenend {
525 dat.setTokenEnd(t1, true)
Akron524c5432021-08-05 14:14:27 +0200526 if DEBUG {
527 fmt.Println("Set", t1, "to tokenend")
528 }
Akron84d68e62021-08-04 17:06:52 +0200529 }
530
Akron740f3d72021-08-03 12:12:34 +0200531 // Check for representative states
Akron8ef408b2021-08-02 22:11:04 +0200532 r := in_table(s1, table, size)
Akron740f3d72021-08-03 12:12:34 +0200533
Akron8ef408b2021-08-02 22:11:04 +0200534 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200535 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200536 table[size] = &mapping{source: s1, target: t1}
537 size++
538 } else {
Akron740f3d72021-08-03 12:12:34 +0200539 // Overwrite with the representative state
Akron3fdfec62021-08-04 11:40:10 +0200540 dat.setBase(t1, r)
541 dat.setSeparate(t1, true)
Akron8ef408b2021-08-02 22:11:04 +0200542 }
543 } else {
Akron740f3d72021-08-03 12:12:34 +0200544 // Store a final transition
Akron3fdfec62021-08-04 11:40:10 +0200545 dat.setCheck(dat.getBase(t)+uint32(dat.final), t)
Akron8ef408b2021-08-02 22:11:04 +0200546 }
547 }
548 }
549
550 // Following Mizobuchi et al (2000) the size of the
551 // FSA should be stored in check(1).
Akron3fdfec62021-08-04 11:40:10 +0200552 dat.setSize(dat.maxSize + 1)
Akronf2120ca2021-08-03 16:26:41 +0200553 dat.array = dat.array[:dat.maxSize+1]
554 return dat
Akron8ef408b2021-08-02 22:11:04 +0200555}
556
Akron8ef408b2021-08-02 22:11:04 +0200557// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200558// exists and return this as a representative.
559// Currently iterates through the whole table
560// in a bruteforce manner.
Akron3fdfec62021-08-04 11:40:10 +0200561func in_table(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200562 for x := 0; x < size; x++ {
563 if table[x].source == s {
564 return table[x].target
565 }
566 }
567 return 0
568}
569
Akron64ffd9a2021-08-03 19:55:21 +0200570// Resize double array when necessary
571func (dat *DaTokenizer) resize(l int) {
572 // TODO:
573 // This is a bit too aggressive atm and should be calmed down.
574 if len(dat.array) <= l {
Akron3fdfec62021-08-04 11:40:10 +0200575 dat.array = append(dat.array, make([]uint32, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200576 }
Akron64ffd9a2021-08-03 19:55:21 +0200577}
Akronc9d84a62021-08-03 15:56:03 +0200578
Akron64ffd9a2021-08-03 19:55:21 +0200579// Set base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200580func (dat *DaTokenizer) setBase(p uint32, v uint32) {
581 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200582 dat.resize(l)
583 if dat.maxSize < l {
584 dat.maxSize = l
585 }
586 dat.array[p*2] = v
587}
588
Akron3fdfec62021-08-04 11:40:10 +0200589// Returns true if a state is separate pointing to a representative
590func (dat *DaTokenizer) isSeparate(p uint32) bool {
Akron2a4b9292021-08-04 15:35:22 +0200591 return dat.array[p*2]&firstBit != 0
Akron3fdfec62021-08-04 11:40:10 +0200592}
593
594// Mark a state as separate pointing to a representative
595func (dat *DaTokenizer) setSeparate(p uint32, sep bool) {
596 if sep {
Akron2a4b9292021-08-04 15:35:22 +0200597 dat.array[p*2] |= firstBit
Akron3fdfec62021-08-04 11:40:10 +0200598 } else {
Akron2a4b9292021-08-04 15:35:22 +0200599 dat.array[p*2] &= (restBit | secondBit)
Akron3fdfec62021-08-04 11:40:10 +0200600 }
601}
602
Akron83e75a22021-08-04 13:14:06 +0200603// Returns true if a state is the target of a nontoken transition
604func (dat *DaTokenizer) isNonToken(p uint32) bool {
Akron2a4b9292021-08-04 15:35:22 +0200605 return dat.array[p*2+1]&firstBit != 0
Akron83e75a22021-08-04 13:14:06 +0200606}
607
608// Mark a state as being the target of a nontoken transition
609func (dat *DaTokenizer) setNonToken(p uint32, sep bool) {
610 if sep {
Akron2a4b9292021-08-04 15:35:22 +0200611 dat.array[p*2+1] |= firstBit
Akron83e75a22021-08-04 13:14:06 +0200612 } else {
Akron2a4b9292021-08-04 15:35:22 +0200613 dat.array[p*2+1] &= (restBit | secondBit)
Akron83e75a22021-08-04 13:14:06 +0200614 }
615}
616
Akron84d68e62021-08-04 17:06:52 +0200617// Returns true if a state is the target of a tokenend transition
618func (dat *DaTokenizer) isTokenEnd(p uint32) bool {
619 return dat.array[p*2+1]&secondBit != 0
620}
621
622// Mark a state as being the target of a tokenend transition
623func (dat *DaTokenizer) setTokenEnd(p uint32, sep bool) {
624 if sep {
625 dat.array[p*2+1] |= secondBit
626 } else {
627 dat.array[p*2+1] &= (restBit | firstBit)
628 }
629}
630
Akron64ffd9a2021-08-03 19:55:21 +0200631// Get base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200632func (dat *DaTokenizer) getBase(p uint32) uint32 {
633 if int(p*2) >= len(dat.array) {
Akron64ffd9a2021-08-03 19:55:21 +0200634 return 0
635 }
Akron3fdfec62021-08-04 11:40:10 +0200636 return dat.array[p*2] & restBit
Akron64ffd9a2021-08-03 19:55:21 +0200637}
638
639// Set check value in double array
Akron3fdfec62021-08-04 11:40:10 +0200640func (dat *DaTokenizer) setCheck(p uint32, v uint32) {
641 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200642 dat.resize(l)
643 if dat.maxSize < l {
644 dat.maxSize = l
645 }
646 dat.array[(p*2)+1] = v
647}
648
649// Get check value in double array
Akron3fdfec62021-08-04 11:40:10 +0200650func (dat *DaTokenizer) getCheck(p uint32) uint32 {
651 if int((p*2)+1) >= len(dat.array) {
Akron64ffd9a2021-08-03 19:55:21 +0200652 return 0
653 }
Akron3fdfec62021-08-04 11:40:10 +0200654 return dat.array[(p*2)+1] & restBit
Akron64ffd9a2021-08-03 19:55:21 +0200655}
656
657// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200658func (dat *DaTokenizer) setSize(v int) {
659 dat.setCheck(1, uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200660}
661
662// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200663func (dat *DaTokenizer) GetSize() int {
664 return int(dat.getCheck(1))
Akron8ef408b2021-08-02 22:11:04 +0200665}
666
667// Based on Mizobuchi et al (2000), p. 124
668// This iterates for every state through the complete double array
669// structure until it finds a gap that fits all outgoing transitions
670// of the state. This is extremely slow, but is only necessary in the
671// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200672func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200673
674 // Start at the first entry of the double array list
Akron3f8571a2021-08-05 11:18:10 +0200675 base := uint32(1) // dat.lastFilledBase
676 // skip := false
Akron8ef408b2021-08-02 22:11:04 +0200677OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200678
Akron3f8571a2021-08-05 11:18:10 +0200679 /*
680 if !skip {
681 if dat.getCheck(base) != 0 {
682 dat.lastFilledBase = base
683 } else {
684 skip = true
685 }
686 }
687 */
688
Akron740f3d72021-08-03 12:12:34 +0200689 // Resize the array if necessary
Akron3fdfec62021-08-04 11:40:10 +0200690 dat.resize((int(base) + dat.final) * 2)
Akron8ef408b2021-08-02 22:11:04 +0200691 for _, a := range symbols {
Akron3fdfec62021-08-04 11:40:10 +0200692 if dat.getCheck(base+uint32(a)) != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200693 base++
694 goto OVERLAP
695 }
696 }
Akron8ef408b2021-08-02 22:11:04 +0200697 return base
698}
699
Akron03a3c612021-08-04 11:51:27 +0200700// LoadFactor as defined in Kanda et al (2018),
701// i.e. the proportion of non-empty elements to all elements.
702func (dat *DaTokenizer) LoadFactor() float64 {
Akrond66a9262021-08-03 17:09:09 +0200703
Akron03a3c612021-08-04 11:51:27 +0200704 // Cache the loadfactor
Akron3f8571a2021-08-05 11:18:10 +0200705 if dat.loadFactor > 0 {
Akron03a3c612021-08-04 11:51:27 +0200706 return dat.loadFactor
Akron773b1ef2021-08-03 17:37:20 +0200707 }
Akrond66a9262021-08-03 17:09:09 +0200708 nonEmpty := 0
709 all := len(dat.array) / 2
710 for x := 1; x <= len(dat.array); x = x + 2 {
711 if dat.array[x] != 0 {
712 nonEmpty++
713 }
714 }
Akron03a3c612021-08-04 11:51:27 +0200715 dat.loadFactor = float64(nonEmpty) / float64(all) * 100
716 return dat.loadFactor
Akrond66a9262021-08-03 17:09:09 +0200717}
718
Akron6247a5d2021-08-03 19:18:28 +0200719// WriteTo stores the double array data in an io.Writer.
Akron3a063ef2021-08-05 19:36:35 +0200720func (dat *DaTokenizer) Save(file string) (n int64, err error) {
721 f, err := os.Create(file)
722 if err != nil {
723 log.Error().Err(err)
724 return 0, nil
725 }
726 defer f.Close()
727 gz := gzip.NewWriter(f)
728 defer gz.Close()
729 n, err = dat.WriteTo(gz)
730 if err != nil {
731 log.Error().Err(err)
732 return n, err
733 }
734 gz.Flush()
735 return n, nil
736}
737
738// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200739func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
740
Akron3a063ef2021-08-05 19:36:35 +0200741 wb := bufio.NewWriter(w)
742 defer wb.Flush()
743
Akron6247a5d2021-08-03 19:18:28 +0200744 // Store magical header
Akron3a063ef2021-08-05 19:36:35 +0200745 all, err := wb.Write([]byte(MAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200746 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200747 log.Error().Err(err)
748 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200749 }
750
751 // Get sigma as a list
752 sigmalist := make([]rune, len(dat.sigma)+16)
753 max := 0
754 for sym, num := range dat.sigma {
755 sigmalist[num] = sym
756 if num > max {
757 max = num
758 }
759 }
760
761 sigmalist = sigmalist[:max+1]
762
Akron3f8571a2021-08-05 11:18:10 +0200763 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200764 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200765 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
766 bo.PutUint16(buf[4:6], uint16(dat.unknown))
767 bo.PutUint16(buf[6:8], uint16(dat.identity))
768 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200769 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akron3f8571a2021-08-05 11:18:10 +0200770 bo.PutUint32(buf[12:16], uint32(len(dat.array)))
Akron3a063ef2021-08-05 19:36:35 +0200771 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200772 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200773 log.Error().Err(err)
774 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200775 }
776
777 all += more
778
Akron3a063ef2021-08-05 19:36:35 +0200779 // wbuf := bytes.NewBuffer(nil)
780 // wbufWrap := bufio.NewWriter(wbuf)
Akron6247a5d2021-08-03 19:18:28 +0200781
782 // Write sigma
783 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200784
Akron3a063ef2021-08-05 19:36:35 +0200785 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200786 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200787 log.Error().Err(err)
788 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200789 }
790 all += more
791 }
Akron3a063ef2021-08-05 19:36:35 +0200792 // wbufWrap.Flush()
793 // more, err = w.Write(wbuf.Bytes())
Akron6247a5d2021-08-03 19:18:28 +0200794 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200795 log.Error().Err(err)
796 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200797 }
Akron3a063ef2021-08-05 19:36:35 +0200798 // all += more
Akron6247a5d2021-08-03 19:18:28 +0200799
800 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200801 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200802 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200803 log.Error().Err(err)
804 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200805 }
806 all += more
807
Akron3a063ef2021-08-05 19:36:35 +0200808 // wbuf.Reset()
Akron6247a5d2021-08-03 19:18:28 +0200809
Akron3a063ef2021-08-05 19:36:35 +0200810 for x := 0; x < len(dat.array); x++ {
811 // for _, d := range dat.array {
812 bo.PutUint32(buf[0:4], dat.array[x])
813 more, err := wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200814 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200815 log.Error().Err(err)
816 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200817 }
Akron3a063ef2021-08-05 19:36:35 +0200818 if more != 4 {
819 log.Error().Msg("Can not write uint32")
820 return int64(all), err
821 }
Akron6247a5d2021-08-03 19:18:28 +0200822 all += more
823 }
824
Akron3a063ef2021-08-05 19:36:35 +0200825 // wbufWrap.Flush()
826
Akron6247a5d2021-08-03 19:18:28 +0200827 return int64(all), err
828}
829
Akron3f8571a2021-08-05 11:18:10 +0200830func LoadDatokFile(file string) *DaTokenizer {
831 f, err := os.Open(file)
832 if err != nil {
833 log.Error().Err(err)
834 os.Exit(0)
835 }
836 defer f.Close()
837
838 gz, err := gzip.NewReader(f)
839 if err != nil {
840 log.Error().Err(err)
841 os.Exit(0)
842 }
843 defer gz.Close()
844
Akron3a063ef2021-08-05 19:36:35 +0200845 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +0200846 return ParseDatok(gz)
847}
848
849func ParseDatok(ior io.Reader) *DaTokenizer {
850
851 dat := &DaTokenizer{
852 sigma: make(map[rune]int),
853 epsilon: 0,
854 unknown: 0,
855 identity: 0,
856 final: 0,
857 loadFactor: 0,
858 }
859
860 r := bufio.NewReader(ior)
861
862 all := 0
863
864 buf := make([]byte, 1024)
865 buf = buf[0:len(MAGIC)]
866
867 more, err := r.Read(buf)
868
869 if err != nil {
870 log.Error().Err(err)
871 return nil
872 }
873
874 all += more
875
876 if string(MAGIC) != string(buf) {
877 log.Error().Msg("Not a datok file")
878 return nil
879 }
880
Akron3a063ef2021-08-05 19:36:35 +0200881 more, err = io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +0200882 if err != nil {
883 log.Error().Err(err)
884 return nil
885 }
886
887 all += more
888
Akron3a063ef2021-08-05 19:36:35 +0200889 version := bo.Uint16(buf[0:2])
890
891 if version != VERSION {
892 log.Error().Msg("Version not compatible")
893 return nil
894 }
895
Akron3f8571a2021-08-05 11:18:10 +0200896 dat.epsilon = int(bo.Uint16(buf[2:4]))
897 dat.unknown = int(bo.Uint16(buf[4:6]))
898 dat.identity = int(bo.Uint16(buf[6:8]))
899 dat.final = int(bo.Uint16(buf[8:10]))
900
901 sigmaCount := int(bo.Uint16(buf[10:12]))
902 arraySize := int(bo.Uint32(buf[12:16]))
903
Akron3a063ef2021-08-05 19:36:35 +0200904 // Shouldn't be relevant though
905 dat.maxSize = arraySize - 1
906
Akron3f8571a2021-08-05 11:18:10 +0200907 for x := 0; x < sigmaCount; x++ {
908 sym, more, err := r.ReadRune()
909 if err == nil && sym != 0 {
910 dat.sigma[sym] = x
911 }
912 all += more
913 }
914
Akron3a063ef2021-08-05 19:36:35 +0200915 more, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +0200916
917 if err != nil {
918 log.Error().Err(err)
919 return nil
920 }
921
922 all += more
923
924 if string("T") != string(buf[0:1]) {
925 log.Error().Msg("Not a datok file")
926 return nil
927 }
928
929 // Read based on length
930 dat.array = make([]uint32, arraySize)
931
932 for x := 0; x < arraySize; x++ {
Akron3a063ef2021-08-05 19:36:35 +0200933 more, err = io.ReadFull(r, buf[0:4])
Akron3f8571a2021-08-05 11:18:10 +0200934 if err != nil {
Akron3a063ef2021-08-05 19:36:35 +0200935 if err == io.EOF {
936 fmt.Println(arraySize, x)
937 break
938 }
Akron3f8571a2021-08-05 11:18:10 +0200939 log.Error().Err(err)
940 return nil
941 }
942 all += more
943 dat.array[x] = bo.Uint32(buf[0:4])
944 }
945
946 return dat
947}
948
Akron740f3d72021-08-03 12:12:34 +0200949// Match an input string against the double array
950// FSA.
951//
952// Based on Mizobuchi et al (2000), p. 129,
953// with additional support for IDENTITY, UNKNOWN
954// and EPSILON transitions.
Akron64ffd9a2021-08-03 19:55:21 +0200955func (dat *DaTokenizer) Match(input string) bool {
Akron465a0992021-08-03 11:28:48 +0200956 var a int
Akron3fdfec62021-08-04 11:40:10 +0200957 var tu uint32
Akron465a0992021-08-03 11:28:48 +0200958 var ok bool
959
Akron3fdfec62021-08-04 11:40:10 +0200960 t := uint32(1) // Initial state
Akron740f3d72021-08-03 12:12:34 +0200961 chars := []rune(input)
962 i := 0
963
Akron49d27ee2021-08-03 11:58:13 +0200964 for i < len(chars) {
Akron64ffd9a2021-08-03 19:55:21 +0200965 a, ok = dat.sigma[chars[i]]
Akron730a79c2021-08-03 11:05:29 +0200966
Akron740f3d72021-08-03 12:12:34 +0200967 // Support identity symbol if character is not in sigma
Akron64ffd9a2021-08-03 19:55:21 +0200968 if !ok && dat.identity != -1 {
Akron740f3d72021-08-03 12:12:34 +0200969 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200970 fmt.Println("IDENTITY symbol", string(chars[i]), "->", dat.identity)
Akron740f3d72021-08-03 12:12:34 +0200971 }
Akron64ffd9a2021-08-03 19:55:21 +0200972 a = dat.identity
Akron740f3d72021-08-03 12:12:34 +0200973 } else if DEBUG {
Akron49d27ee2021-08-03 11:58:13 +0200974 fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
Akron730a79c2021-08-03 11:05:29 +0200975 }
Akron465a0992021-08-03 11:28:48 +0200976 tu = t
Akron730a79c2021-08-03 11:05:29 +0200977 CHECK:
Akron3fdfec62021-08-04 11:40:10 +0200978 t = dat.getBase(tu) + uint32(a)
Akron730a79c2021-08-03 11:05:29 +0200979
Akron740f3d72021-08-03 12:12:34 +0200980 // Check if the transition is valid according to the double array
Akron64ffd9a2021-08-03 19:55:21 +0200981 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +0200982
983 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200984 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron730a79c2021-08-03 11:05:29 +0200985 }
Akron740f3d72021-08-03 12:12:34 +0200986
Akron64ffd9a2021-08-03 19:55:21 +0200987 if !ok && a == dat.identity {
Akron740f3d72021-08-03 12:12:34 +0200988 // Try again with unknown symbol, in case identity failed
989 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200990 fmt.Println("UNKNOWN symbol", string(chars[i]), "->", dat.unknown)
Akron740f3d72021-08-03 12:12:34 +0200991 }
Akron64ffd9a2021-08-03 19:55:21 +0200992 a = dat.unknown
Akron740f3d72021-08-03 12:12:34 +0200993
Akron64ffd9a2021-08-03 19:55:21 +0200994 } else if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +0200995 // Try again with epsilon symbol, in case everything else failed
996 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200997 fmt.Println("EPSILON symbol", string(chars[i]), "->", dat.epsilon)
Akron740f3d72021-08-03 12:12:34 +0200998 }
Akron64ffd9a2021-08-03 19:55:21 +0200999 a = dat.epsilon
Akron740f3d72021-08-03 12:12:34 +02001000 } else {
1001 break
1002 }
1003 goto CHECK
Akron3fdfec62021-08-04 11:40:10 +02001004 } else if dat.isSeparate(t) {
Akron730a79c2021-08-03 11:05:29 +02001005 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +02001006 t = dat.getBase(t)
Akron8ef408b2021-08-02 22:11:04 +02001007 }
Akron740f3d72021-08-03 12:12:34 +02001008
1009 // Transition is fine
Akron64ffd9a2021-08-03 19:55:21 +02001010 if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +02001011 // Character consumed
Akron49d27ee2021-08-03 11:58:13 +02001012 i++
1013 }
Akron83e75a22021-08-04 13:14:06 +02001014
Akron740f3d72021-08-03 12:12:34 +02001015 // TODO:
1016 // Prevent endless epsilon loops!
Akron8ef408b2021-08-02 22:11:04 +02001017 }
1018
Akron740f3d72021-08-03 12:12:34 +02001019 if i != len(chars) {
1020 if DEBUG {
1021 fmt.Println("Not at the end")
1022 }
Akron8ef408b2021-08-02 22:11:04 +02001023 return false
1024 }
1025
Akron465a0992021-08-03 11:28:48 +02001026FINALCHECK:
Akron740f3d72021-08-03 12:12:34 +02001027
1028 // Automaton is in a final state
Akron3fdfec62021-08-04 11:40:10 +02001029 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
Akron8ef408b2021-08-02 22:11:04 +02001030 return true
1031 }
Akron465a0992021-08-03 11:28:48 +02001032
Akron740f3d72021-08-03 12:12:34 +02001033 // Check epsilon transitions until a final state is reached
Akron465a0992021-08-03 11:28:48 +02001034 tu = t
Akron3fdfec62021-08-04 11:40:10 +02001035 t = dat.getBase(tu) + uint32(dat.epsilon)
Akron465a0992021-08-03 11:28:48 +02001036
Akron740f3d72021-08-03 12:12:34 +02001037 // Epsilon transition failed
Akron64ffd9a2021-08-03 19:55:21 +02001038 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +02001039 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +02001040 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron740f3d72021-08-03 12:12:34 +02001041 }
Akron465a0992021-08-03 11:28:48 +02001042 return false
Akron740f3d72021-08-03 12:12:34 +02001043
Akron3fdfec62021-08-04 11:40:10 +02001044 } else if dat.isSeparate(t) {
Akron465a0992021-08-03 11:28:48 +02001045 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +02001046 t = dat.getBase(t)
Akron465a0992021-08-03 11:28:48 +02001047 }
Akron740f3d72021-08-03 12:12:34 +02001048
Akron465a0992021-08-03 11:28:48 +02001049 goto FINALCHECK
Akron8ef408b2021-08-02 22:11:04 +02001050}
Akron068874c2021-08-04 15:19:56 +02001051
Akron84d68e62021-08-04 17:06:52 +02001052// Transduce an input string against the double array
Akron068874c2021-08-04 15:19:56 +02001053// FSA.
1054//
1055// Based on Match with additional support
Akron84d68e62021-08-04 17:06:52 +02001056// for NONTOKEN and TOKENEND handling
Akron3f8571a2021-08-05 11:18:10 +02001057func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron068874c2021-08-04 15:19:56 +02001058 var a int
1059 var tu uint32
Akron84d68e62021-08-04 17:06:52 +02001060 var ok, nontoken, tokenend bool
Akron068874c2021-08-04 15:19:56 +02001061
Akron3f8571a2021-08-05 11:18:10 +02001062 reader := bufio.NewReader(r)
1063 writer := bufio.NewWriter(w)
1064 defer writer.Flush()
Akron068874c2021-08-04 15:19:56 +02001065
Akron3f8571a2021-08-05 11:18:10 +02001066 t := uint32(1) // Initial state
Akron3f8571a2021-08-05 11:18:10 +02001067 skip := false
1068
1069 var char rune
1070 var err error
1071 eof := false
1072
1073 for {
1074
1075 if !skip {
1076 char, _, err = reader.ReadRune()
1077 if err != nil {
1078 eof = true
1079 break
1080 }
1081 }
1082 skip = false
1083
1084 a, ok = dat.sigma[char]
Akron068874c2021-08-04 15:19:56 +02001085
1086 // Support identity symbol if character is not in sigma
1087 if !ok && dat.identity != -1 {
1088 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001089 fmt.Println("IDENTITY symbol", string(char), "->", dat.identity)
Akron068874c2021-08-04 15:19:56 +02001090 }
1091 a = dat.identity
1092 } else if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001093 fmt.Println("Sigma transition is okay for [", string(char), "]")
Akron068874c2021-08-04 15:19:56 +02001094 }
1095 tu = t
1096 CHECK:
1097 nontoken = false
Akron84d68e62021-08-04 17:06:52 +02001098 tokenend = false
1099
Akron068874c2021-08-04 15:19:56 +02001100 t = dat.getBase(tu) + uint32(a)
1101
Akron524c5432021-08-05 14:14:27 +02001102 if DEBUG {
1103 fmt.Println("Check", tu, "-", a, "(", string(char), ")", "->", t)
1104 }
1105
Akron068874c2021-08-04 15:19:56 +02001106 // Check if the transition is valid according to the double array
1107 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
1108
1109 if DEBUG {
1110 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
1111 }
1112
1113 if !ok && a == dat.identity {
1114 // Try again with unknown symbol, in case identity failed
1115 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001116 fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +02001117 }
1118 a = dat.unknown
1119
1120 } else if a != dat.epsilon {
1121 // Try again with epsilon symbol, in case everything else failed
1122 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001123 fmt.Println("EPSILON symbol", string(char), "->", dat.epsilon)
Akron068874c2021-08-04 15:19:56 +02001124 }
1125 a = dat.epsilon
1126 } else {
1127 break
1128 }
1129 goto CHECK
1130 } else if dat.isSeparate(t) {
1131 // Move to representative state
1132 nontoken = dat.isNonToken(t)
Akron84d68e62021-08-04 17:06:52 +02001133 tokenend = dat.isTokenEnd(t)
Akron068874c2021-08-04 15:19:56 +02001134
1135 t = dat.getBase(t)
Akron84d68e62021-08-04 17:06:52 +02001136
Akron524c5432021-08-05 14:14:27 +02001137 if DEBUG {
1138 fmt.Println("Representative pointing to", t)
1139 }
1140
Akron068874c2021-08-04 15:19:56 +02001141 } else {
1142 nontoken = dat.isNonToken(t)
Akron84d68e62021-08-04 17:06:52 +02001143 tokenend = dat.isTokenEnd(t)
Akron068874c2021-08-04 15:19:56 +02001144 }
1145
1146 // Transition is fine
Akron3f8571a2021-08-05 11:18:10 +02001147 if a == dat.epsilon {
1148 skip = true
Akron068874c2021-08-04 15:19:56 +02001149 // Character consumed
Akron3f8571a2021-08-05 11:18:10 +02001150 } else if !nontoken {
1151 writer.WriteRune(char)
1152 }
Akron068874c2021-08-04 15:19:56 +02001153
Akron524c5432021-08-05 14:14:27 +02001154 if DEBUG {
1155 fmt.Println(" --> ok!")
1156 }
1157
Akron3f8571a2021-08-05 11:18:10 +02001158 /*
1159 if nontoken {
1160 writer.WriteRune(("<|>")
Akron068874c2021-08-04 15:19:56 +02001161 }
Akron3f8571a2021-08-05 11:18:10 +02001162 */
Akron068874c2021-08-04 15:19:56 +02001163
Akron84d68e62021-08-04 17:06:52 +02001164 if tokenend {
Akron3f8571a2021-08-05 11:18:10 +02001165 writer.WriteRune('\n')
Akron84d68e62021-08-04 17:06:52 +02001166 }
1167
Akron068874c2021-08-04 15:19:56 +02001168 // TODO:
1169 // Prevent endless epsilon loops!
1170 }
1171
Akron3f8571a2021-08-05 11:18:10 +02001172 if !eof {
Akron068874c2021-08-04 15:19:56 +02001173 if DEBUG {
1174 fmt.Println("Not at the end")
1175 }
1176 return false
1177 }
1178
1179FINALCHECK:
1180
1181 // Automaton is in a final state
1182 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
Akron3f8571a2021-08-05 11:18:10 +02001183 /*
1184 if dat.isNonToken(t) {
1185 fmt.Print("<|>")
1186 }
1187 */
Akron84d68e62021-08-04 17:06:52 +02001188 if dat.isTokenEnd(t) {
Akron3f8571a2021-08-05 11:18:10 +02001189 writer.WriteRune('\n')
Akron84d68e62021-08-04 17:06:52 +02001190 }
1191
1192 // There may be a new line at the end, from an epsilon, so we go on!
Akron068874c2021-08-04 15:19:56 +02001193 return true
1194 }
1195
1196 // Check epsilon transitions until a final state is reached
1197 tu = t
1198 t = dat.getBase(tu) + uint32(dat.epsilon)
1199
1200 // Epsilon transition failed
1201 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
1202 if DEBUG {
1203 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
1204 }
1205 return false
1206
1207 } else if dat.isSeparate(t) {
Akron3f8571a2021-08-05 11:18:10 +02001208 // nontoken = dat.isNonToken(t)
1209 tokenend = dat.isTokenEnd(t)
1210
Akron068874c2021-08-04 15:19:56 +02001211 // Move to representative state
1212 t = dat.getBase(t)
1213 } else {
Akron3f8571a2021-08-05 11:18:10 +02001214 tokenend = dat.isTokenEnd(t)
1215 // nontoken = dat.isNonToken(t)
Akron068874c2021-08-04 15:19:56 +02001216 }
1217
Akron3f8571a2021-08-05 11:18:10 +02001218 /*
1219 if nontoken {
1220 fmt.Print("<|>")
1221 }
1222 */
1223
1224 if tokenend {
1225 writer.WriteRune('\n')
Akron068874c2021-08-04 15:19:56 +02001226 }
1227
1228 goto FINALCHECK
1229}