blob: 0c763c865fa8044ef0f69a8f78eb80c179052ef8 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
13
Akron740f3d72021-08-03 12:12:34 +020014// TODO:
15// - replace maxSize with the check value
16// - Strip first state and make everything start with 0!
Akron3a063ef2021-08-05 19:36:35 +020017// - Add checksum to serialization.
Akron740f3d72021-08-03 12:12:34 +020018
Akron8ef408b2021-08-02 22:11:04 +020019import (
20 "bufio"
21 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020022 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020023 "fmt"
24 "io"
25 "os"
Akronc9d84a62021-08-03 15:56:03 +020026 "sort"
Akron8ef408b2021-08-02 22:11:04 +020027 "strconv"
28 "strings"
29 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020030
31 "github.com/rs/zerolog/log"
Akron8ef408b2021-08-02 22:11:04 +020032)
33
34const (
Akron2a4b9292021-08-04 15:35:22 +020035 PROPS = 1
36 SIGMA = 2
37 STATES = 3
38 NONE = 4
39 NEWLINE = '\u000a'
40 DEBUG = false
41 MAGIC = "DATOK"
42 VERSION = uint16(1)
43 firstBit uint32 = 1 << 31
44 secondBit uint32 = 1 << 30
45 restBit uint32 = ^uint32(0) &^ (firstBit | secondBit)
Akron8ef408b2021-08-02 22:11:04 +020046)
47
Akron6247a5d2021-08-03 19:18:28 +020048var bo binary.ByteOrder = binary.LittleEndian
49
Akron8ef408b2021-08-02 22:11:04 +020050type mapping struct {
51 source int
Akron3fdfec62021-08-04 11:40:10 +020052 target uint32
Akron8ef408b2021-08-02 22:11:04 +020053}
54
55type edge struct {
Akron83e75a22021-08-04 13:14:06 +020056 inSym int
57 outSym int
58 end int
59 nontoken bool
60 tokenend bool
Akron8ef408b2021-08-02 22:11:04 +020061}
62
63type Tokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020064 // sigma map[rune]int
Akron740f3d72021-08-03 12:12:34 +020065 sigmaRev map[int]rune
66 arcCount int
67 stateCount int
68 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020069 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020070
71 // Special symbols in sigma
72 epsilon int
73 unknown int
74 identity int
75 final int
Akron8ef408b2021-08-02 22:11:04 +020076}
77
Akronf2120ca2021-08-03 16:26:41 +020078type DaTokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020079 // sigmaRev map[int]rune
Akron03a3c612021-08-04 11:51:27 +020080 sigma map[rune]int
81 maxSize int
82 loadFactor float64
83 array []uint32
Akron3f8571a2021-08-05 11:18:10 +020084 // lastFilledBase uint32
Akronc17f1ca2021-08-03 19:47:27 +020085
86 // Special symbols in sigma
87 epsilon int
88 unknown int
89 identity int
90 final int
Akronf2120ca2021-08-03 16:26:41 +020091}
92
Akron64ffd9a2021-08-03 19:55:21 +020093func LoadFomaFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +020094 f, err := os.Open(file)
95 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020096 log.Error().Err(err)
97 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020098 }
99 defer f.Close()
100
101 gz, err := gzip.NewReader(f)
102 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200103 log.Error().Err(err)
104 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200105 }
106 defer gz.Close()
107
Akron3fdfec62021-08-04 11:40:10 +0200108 return ParseFoma(gz)
Akron8ef408b2021-08-02 22:11:04 +0200109}
110
Akron3fdfec62021-08-04 11:40:10 +0200111func ParseFoma(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200112 r := bufio.NewReader(ior)
113
114 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200115 sigmaRev: make(map[int]rune),
Akronc17f1ca2021-08-03 19:47:27 +0200116 epsilon: -1,
117 unknown: -1,
118 identity: -1,
119 final: -1,
Akron8ef408b2021-08-02 22:11:04 +0200120 }
121
Akron3610f102021-08-08 14:13:25 +0200122 checkmap := make(map[string]bool)
123
Akron740f3d72021-08-03 12:12:34 +0200124 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200125
126 mode := 0
127 var elem []string
128 var elemint [5]int
129
130 for {
131 line, err := r.ReadString('\n')
132 if err != nil {
133 if err == io.EOF {
134 break
135 }
Akron740f3d72021-08-03 12:12:34 +0200136 log.Error().Err(err)
137 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200138 }
139 if strings.HasPrefix(line, "##foma-net") {
140 continue
141 }
142 if strings.HasPrefix(line, "##props##") {
143 mode = PROPS
144 continue
145 }
146 if strings.HasPrefix(line, "##states##") {
147 mode = STATES
148
149 // Adds a final transition symbol to sigma
150 // written as '#' in Mizobuchi et al (2000)
Akron740f3d72021-08-03 12:12:34 +0200151 tok.sigmaCount++
Akronc17f1ca2021-08-03 19:47:27 +0200152 tok.final = tok.sigmaCount
Akron8ef408b2021-08-02 22:11:04 +0200153 continue
154 }
155 if strings.HasPrefix(line, "##sigma##") {
156 mode = SIGMA
157 continue
158 }
159 if strings.HasPrefix(line, "##end##") {
160 mode = NONE
161 continue
162 }
163
164 switch mode {
165 case PROPS:
166 {
167 elem = strings.Split(line, " ")
168 /*
169 fmt.Println("arity: " + elem[0])
170 fmt.Println("arccount: " + elem[1])
171 fmt.Println("statecount: " + elem[2])
172 fmt.Println("linecount: " + elem[3])
173 fmt.Println("finalcount: " + elem[4])
174 fmt.Println("pathcount: " + elem[5])
175 fmt.Println("is_deterministic: " + elem[6])
176 fmt.Println("is_pruned: " + elem[7])
177 fmt.Println("is_minimized: " + elem[8])
178 fmt.Println("is_epsilon_free: " + elem[9])
179 fmt.Println("is_loop_free: " + elem[10])
180 fmt.Println("extras: " + elem[11])
181 fmt.Println("name: " + elem[12])
182 */
183 if elem[6] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200184 log.Error().Msg("The FST needs to be deterministic")
185 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200186 }
187 if elem[9] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200188 log.Error().Msg("The FST needs to be epsilon free")
189 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200190 }
191
192 elemint[0], err = strconv.Atoi(elem[1])
193 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200194 log.Error().Msg("Can't read arccount")
195 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200196 }
Akron740f3d72021-08-03 12:12:34 +0200197 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200198
199 // States start at 1 in Mizobuchi et al (2000),
200 // as the state 0 is associated with a fail.
201 // Initialize states and transitions
202 elemint[0], err = strconv.Atoi(elem[2])
203 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200204 log.Error().Msg("Can't read statecount")
205 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200206 }
Akron740f3d72021-08-03 12:12:34 +0200207 tok.stateCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200208 tok.transitions = make([]map[int]*edge, elemint[0]+1)
209 continue
210 }
211 case STATES:
212 {
213 elem = strings.Split(line[0:len(line)-1], " ")
214 if elem[0] == "-1" {
Akron3610f102021-08-08 14:13:25 +0200215 if DEBUG {
216 fmt.Println("Skip", elem)
217 }
Akron8ef408b2021-08-02 22:11:04 +0200218 continue
219 }
220 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200221 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200222 fmt.Println("Unable to translate", elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200223 break
224 }
Akron8ef408b2021-08-02 22:11:04 +0200225
226 if len(elem) > 1 {
227 elemint[1], err = strconv.Atoi(elem[1])
228 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200229 fmt.Println("Unable to translate", elem[1])
Akron8ef408b2021-08-02 22:11:04 +0200230 break
231 }
232 if len(elem) > 2 {
233 elemint[2], err = strconv.Atoi(elem[2])
234 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200235 fmt.Println("Unable to translate", elem[2])
Akron8ef408b2021-08-02 22:11:04 +0200236 break
237 }
238 if len(elem) > 3 {
239 elemint[3], err = strconv.Atoi(elem[3])
240 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200241 fmt.Println("Unable to translate", elem[3])
Akron8ef408b2021-08-02 22:11:04 +0200242 break
243 }
244 if len(elem) > 4 {
245 elemint[4], err = strconv.Atoi(elem[4])
246 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200247 fmt.Println("Unable to translate", elem[4])
Akron8ef408b2021-08-02 22:11:04 +0200248 break
249 }
250 }
251 }
252 }
253 }
254
255 switch len(elem) {
256 case 5:
257 {
Akron740f3d72021-08-03 12:12:34 +0200258 state = elemint[0]
259 inSym = elemint[1]
260 outSym = elemint[2]
261 end = elemint[3]
262 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200263 }
264 case 4:
265 {
266 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200267 state = elemint[0]
268 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200269 } else {
Akron740f3d72021-08-03 12:12:34 +0200270 state = elemint[0]
271 inSym = elemint[1]
272 end = elemint[2]
273 final = elemint[3]
274 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200275 }
276 }
277 case 3:
278 {
Akron740f3d72021-08-03 12:12:34 +0200279 inSym = elemint[0]
280 outSym = elemint[1]
281 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200282 }
283 case 2:
284 {
Akron740f3d72021-08-03 12:12:34 +0200285 inSym = elemint[0]
286 end = elemint[1]
287 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200288 }
289 }
290
Akron8ef408b2021-08-02 22:11:04 +0200291 // While the states in foma start with 0, the states in the
292 // Mizobuchi FSA start with one - so we increase every state by 1.
293
Akron83e75a22021-08-04 13:14:06 +0200294 nontoken := false
295 tokenend := false
296
Akron524c5432021-08-05 14:14:27 +0200297 // ID needs to be > 1
298 inSym++
299 outSym++
300
Akron740f3d72021-08-03 12:12:34 +0200301 if inSym != outSym {
302
Akron83e75a22021-08-04 13:14:06 +0200303 if tok.sigmaRev[outSym] == NEWLINE {
304 tokenend = true
305 } else if outSym == tok.epsilon {
306 nontoken = true
307 } else {
Akron740f3d72021-08-03 12:12:34 +0200308 log.Error().Msg(
309 "Unsupported transition: " +
310 strconv.Itoa(state) +
311 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200312 " (" +
Akron740f3d72021-08-03 12:12:34 +0200313 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200314 ":" +
Akron740f3d72021-08-03 12:12:34 +0200315 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200316 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200317 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200318 ":" +
Akron740f3d72021-08-03 12:12:34 +0200319 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200320 ")")
Akron740f3d72021-08-03 12:12:34 +0200321 os.Exit(1)
Akron75ebe7f2021-08-03 10:34:10 +0200322 }
Akron83e75a22021-08-04 13:14:06 +0200323
Akron83e75a22021-08-04 13:14:06 +0200324 } else if inSym == tok.epsilon {
Akron068874c2021-08-04 15:19:56 +0200325 log.Error().Msg("Epsilon transitions not supported")
326 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200327 }
328
Akron740f3d72021-08-03 12:12:34 +0200329 // This collects all edges until arrstate changes
330
Akron8ef408b2021-08-02 22:11:04 +0200331 // TODO:
332 // if arrin == EPSILON && arrout == TOKENEND, mark state as newline
333 // if the next transition is the same, remove TOKENEND and add SENTENCEEND
334 // This requires to remove the transition alltogether and marks the state instead.
335
336 // TODO:
337 // if arrout == EPSILON, mark the transition as NOTOKEN
338
339 targetObj := &edge{
Akron83e75a22021-08-04 13:14:06 +0200340 inSym: inSym,
341 outSym: outSym,
342 end: end + 1,
343 tokenend: tokenend,
344 nontoken: nontoken,
Akron8ef408b2021-08-02 22:11:04 +0200345 }
346
Akron740f3d72021-08-03 12:12:34 +0200347 // Initialize outgoing states
348 if tok.transitions[state+1] == nil {
349 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200350 }
351
Akron740f3d72021-08-03 12:12:34 +0200352 // Ignore transitions with invalid symbols
353 if inSym >= 0 {
354 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200355 }
Akron8ef408b2021-08-02 22:11:04 +0200356
Akron740f3d72021-08-03 12:12:34 +0200357 // Add final transition
358 if final == 1 {
Akron3610f102021-08-08 14:13:25 +0200359 // TODO: maybe this is irrelevant for tokenizers
Akronc17f1ca2021-08-03 19:47:27 +0200360 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200361 }
362
Akron3610f102021-08-08 14:13:25 +0200363 test := fmt.Sprint(state+1) + ":" + fmt.Sprint(inSym)
364 if checkmap[test] {
365 fmt.Println("Path already defined!", test)
366 os.Exit(0)
367 } else {
368 checkmap[test] = true
369 }
370
Akronb4bbb472021-08-09 11:49:38 +0200371 if DEBUG {
Akron740f3d72021-08-03 12:12:34 +0200372 fmt.Println("Add",
373 state+1, "->", end+1,
374 "(",
375 inSym,
376 ":",
377 outSym,
378 ") (",
379 string(tok.sigmaRev[inSym]),
380 ":",
381 string(tok.sigmaRev[outSym]),
Akron524c5432021-08-05 14:14:27 +0200382 ")",
383 ";",
384 "TE:", tokenend,
Akron3610f102021-08-08 14:13:25 +0200385 "NT:", nontoken,
386 "FIN:", final)
Akron740f3d72021-08-03 12:12:34 +0200387 }
Akron75ebe7f2021-08-03 10:34:10 +0200388
Akron8ef408b2021-08-02 22:11:04 +0200389 continue
390 }
391 case SIGMA:
392 {
393 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
394
395 // Turn string into sigma id
396 number, err := strconv.Atoi(elem[0])
397
Akron524c5432021-08-05 14:14:27 +0200398 // ID needs to be > 1
399 number++
400
Akron8ef408b2021-08-02 22:11:04 +0200401 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200402 log.Error().Err(err)
403 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200404 }
405
Akron740f3d72021-08-03 12:12:34 +0200406 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200407
408 var symbol rune
409
410 // Read rune
411 if utf8.RuneCountInString(elem[1]) == 1 {
412 symbol = []rune(elem[1])[0]
413
414 // Probably a MCS
415 } else if utf8.RuneCountInString(elem[1]) > 1 {
416 switch elem[1] {
417 case "@_EPSILON_SYMBOL_@":
418 {
Akronc17f1ca2021-08-03 19:47:27 +0200419 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200420 continue
421 }
422 case "@_UNKNOWN_SYMBOL_@":
423 {
Akronc17f1ca2021-08-03 19:47:27 +0200424 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200425 continue
426 }
427
428 case "@_IDENTITY_SYMBOL_@":
429 {
Akronc17f1ca2021-08-03 19:47:27 +0200430 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200431 continue
432 }
433 default:
Akron740f3d72021-08-03 12:12:34 +0200434 {
435 log.Error().Msg("MCS not supported: " + line)
436 os.Exit(1)
437 }
Akron8ef408b2021-08-02 22:11:04 +0200438 }
439
Akron740f3d72021-08-03 12:12:34 +0200440 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200441 line, err = r.ReadString('\n')
442 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200443 log.Error().Err(err)
444 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200445 }
446 if len(line) != 1 {
Akron740f3d72021-08-03 12:12:34 +0200447 log.Error().Msg("MCS not supported:" + line)
448 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200449 }
Akron740f3d72021-08-03 12:12:34 +0200450 symbol = rune(NEWLINE)
Akron8ef408b2021-08-02 22:11:04 +0200451 }
452
Akron740f3d72021-08-03 12:12:34 +0200453 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200454 }
455 }
456 }
457
458 return tok
459}
460
Akron64ffd9a2021-08-03 19:55:21 +0200461// Set alphabet A to the list of all symbols
462// outgoing from s
463func (tok *Tokenizer) get_set(s int, A *[]int) {
464 for a := range tok.transitions[s] {
465 *A = append(*A, a)
466 }
467
468 // Not required, but simplifies bug hunting
469 sort.Ints(*A)
470}
471
Akron8ef408b2021-08-02 22:11:04 +0200472// Implementation of Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200473func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
474
475 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +0200476 sigma: make(map[rune]int),
477 loadFactor: -1,
478 final: tok.final,
479 unknown: tok.unknown,
480 identity: tok.identity,
481 epsilon: tok.epsilon,
Akron3f8571a2021-08-05 11:18:10 +0200482 // lastFilledBase: 1,
Akronf2120ca2021-08-03 16:26:41 +0200483 }
484
485 for num, sym := range tok.sigmaRev {
486 dat.sigma[sym] = num
487 }
Akron8ef408b2021-08-02 22:11:04 +0200488
489 mark := 0
490 size := 0
491
492 // Create a mapping from s to t
Akron740f3d72021-08-03 12:12:34 +0200493 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200494
495 table[size] = &mapping{source: 1, target: 1}
496 size++
497
Akron740f3d72021-08-03 12:12:34 +0200498 // Allocate space for the outgoing symbol range
499 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200500
501 for mark < size {
502 s := table[mark].source // This is a state in Ms
503 t := table[mark].target // This is a state in Mt
504 mark++
Akron740f3d72021-08-03 12:12:34 +0200505
Akron3610f102021-08-08 14:13:25 +0200506 if t == 6288 {
507 fmt.Println("1 State", t, "was", s)
508 }
509
Akron740f3d72021-08-03 12:12:34 +0200510 // Following the paper, here the state t can be remembered
511 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200512 A = A[:0]
513 tok.get_set(s, &A)
514
Akron740f3d72021-08-03 12:12:34 +0200515 // Set base to the first free slot in the double array
Akronf2120ca2021-08-03 16:26:41 +0200516 dat.setBase(t, dat.xCheck(A))
Akron8ef408b2021-08-02 22:11:04 +0200517
Akron773b1ef2021-08-03 17:37:20 +0200518 // TODO:
Akron068874c2021-08-04 15:19:56 +0200519 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200520 // outdegree of .end
521
Akron740f3d72021-08-03 12:12:34 +0200522 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200523 for _, a := range A {
524
Akronc17f1ca2021-08-03 19:47:27 +0200525 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200526
Akron740f3d72021-08-03 12:12:34 +0200527 // Aka g(s, a)
528 s1 := tok.transitions[s][a].end
Akron8ef408b2021-08-02 22:11:04 +0200529
Akron740f3d72021-08-03 12:12:34 +0200530 // Store the transition
Akron3fdfec62021-08-04 11:40:10 +0200531 t1 := dat.getBase(t) + uint32(a)
Akronf2120ca2021-08-03 16:26:41 +0200532 dat.setCheck(t1, t)
Akron8ef408b2021-08-02 22:11:04 +0200533
Akron3610f102021-08-08 14:13:25 +0200534 if DEBUG || t1 == 6288 {
Akron524c5432021-08-05 14:14:27 +0200535 fmt.Println("Translate transition",
536 s, "->", s1, "(", a, ")", "to", t, "->", t1)
537 }
538
Akron83e75a22021-08-04 13:14:06 +0200539 // Mark the state as being the target of a nontoken transition
540 if tok.transitions[s][a].nontoken {
Akron068874c2021-08-04 15:19:56 +0200541 dat.setNonToken(t1, true)
Akron524c5432021-08-05 14:14:27 +0200542 if DEBUG {
543 fmt.Println("Set", t1, "to nontoken")
544 }
Akron83e75a22021-08-04 13:14:06 +0200545 }
546
Akron84d68e62021-08-04 17:06:52 +0200547 // Mark the state as being the target of a tokenend transition
548 if tok.transitions[s][a].tokenend {
549 dat.setTokenEnd(t1, true)
Akron524c5432021-08-05 14:14:27 +0200550 if DEBUG {
551 fmt.Println("Set", t1, "to tokenend")
552 }
Akron84d68e62021-08-04 17:06:52 +0200553 }
554
Akron740f3d72021-08-03 12:12:34 +0200555 // Check for representative states
Akron8ef408b2021-08-02 22:11:04 +0200556 r := in_table(s1, table, size)
Akron740f3d72021-08-03 12:12:34 +0200557
Akron8ef408b2021-08-02 22:11:04 +0200558 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200559 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200560 table[size] = &mapping{source: s1, target: t1}
561 size++
562 } else {
Akron740f3d72021-08-03 12:12:34 +0200563 // Overwrite with the representative state
Akron3fdfec62021-08-04 11:40:10 +0200564 dat.setBase(t1, r)
565 dat.setSeparate(t1, true)
Akron8ef408b2021-08-02 22:11:04 +0200566 }
567 } else {
Akron740f3d72021-08-03 12:12:34 +0200568 // Store a final transition
Akron3fdfec62021-08-04 11:40:10 +0200569 dat.setCheck(dat.getBase(t)+uint32(dat.final), t)
Akron8ef408b2021-08-02 22:11:04 +0200570 }
571 }
572 }
573
574 // Following Mizobuchi et al (2000) the size of the
575 // FSA should be stored in check(1).
Akron3fdfec62021-08-04 11:40:10 +0200576 dat.setSize(dat.maxSize + 1)
Akronf2120ca2021-08-03 16:26:41 +0200577 dat.array = dat.array[:dat.maxSize+1]
578 return dat
Akron8ef408b2021-08-02 22:11:04 +0200579}
580
Akron8ef408b2021-08-02 22:11:04 +0200581// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200582// exists and return this as a representative.
583// Currently iterates through the whole table
584// in a bruteforce manner.
Akron3fdfec62021-08-04 11:40:10 +0200585func in_table(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200586 for x := 0; x < size; x++ {
587 if table[x].source == s {
588 return table[x].target
589 }
590 }
591 return 0
592}
593
Akron64ffd9a2021-08-03 19:55:21 +0200594// Resize double array when necessary
595func (dat *DaTokenizer) resize(l int) {
596 // TODO:
597 // This is a bit too aggressive atm and should be calmed down.
598 if len(dat.array) <= l {
Akron3fdfec62021-08-04 11:40:10 +0200599 dat.array = append(dat.array, make([]uint32, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200600 }
Akron64ffd9a2021-08-03 19:55:21 +0200601}
Akronc9d84a62021-08-03 15:56:03 +0200602
Akron64ffd9a2021-08-03 19:55:21 +0200603// Set base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200604func (dat *DaTokenizer) setBase(p uint32, v uint32) {
605 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200606 dat.resize(l)
607 if dat.maxSize < l {
608 dat.maxSize = l
609 }
610 dat.array[p*2] = v
611}
612
Akron3fdfec62021-08-04 11:40:10 +0200613// Returns true if a state is separate pointing to a representative
614func (dat *DaTokenizer) isSeparate(p uint32) bool {
Akron2a4b9292021-08-04 15:35:22 +0200615 return dat.array[p*2]&firstBit != 0
Akron3fdfec62021-08-04 11:40:10 +0200616}
617
618// Mark a state as separate pointing to a representative
619func (dat *DaTokenizer) setSeparate(p uint32, sep bool) {
620 if sep {
Akron2a4b9292021-08-04 15:35:22 +0200621 dat.array[p*2] |= firstBit
Akron3fdfec62021-08-04 11:40:10 +0200622 } else {
Akron2a4b9292021-08-04 15:35:22 +0200623 dat.array[p*2] &= (restBit | secondBit)
Akron3fdfec62021-08-04 11:40:10 +0200624 }
625}
626
Akron83e75a22021-08-04 13:14:06 +0200627// Returns true if a state is the target of a nontoken transition
628func (dat *DaTokenizer) isNonToken(p uint32) bool {
Akron2a4b9292021-08-04 15:35:22 +0200629 return dat.array[p*2+1]&firstBit != 0
Akron83e75a22021-08-04 13:14:06 +0200630}
631
632// Mark a state as being the target of a nontoken transition
633func (dat *DaTokenizer) setNonToken(p uint32, sep bool) {
634 if sep {
Akron2a4b9292021-08-04 15:35:22 +0200635 dat.array[p*2+1] |= firstBit
Akron83e75a22021-08-04 13:14:06 +0200636 } else {
Akron2a4b9292021-08-04 15:35:22 +0200637 dat.array[p*2+1] &= (restBit | secondBit)
Akron83e75a22021-08-04 13:14:06 +0200638 }
639}
640
Akron84d68e62021-08-04 17:06:52 +0200641// Returns true if a state is the target of a tokenend transition
642func (dat *DaTokenizer) isTokenEnd(p uint32) bool {
643 return dat.array[p*2+1]&secondBit != 0
644}
645
646// Mark a state as being the target of a tokenend transition
647func (dat *DaTokenizer) setTokenEnd(p uint32, sep bool) {
648 if sep {
649 dat.array[p*2+1] |= secondBit
650 } else {
651 dat.array[p*2+1] &= (restBit | firstBit)
652 }
653}
654
Akron64ffd9a2021-08-03 19:55:21 +0200655// Get base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200656func (dat *DaTokenizer) getBase(p uint32) uint32 {
657 if int(p*2) >= len(dat.array) {
Akron64ffd9a2021-08-03 19:55:21 +0200658 return 0
659 }
Akron3fdfec62021-08-04 11:40:10 +0200660 return dat.array[p*2] & restBit
Akron64ffd9a2021-08-03 19:55:21 +0200661}
662
663// Set check value in double array
Akron3fdfec62021-08-04 11:40:10 +0200664func (dat *DaTokenizer) setCheck(p uint32, v uint32) {
665 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200666 dat.resize(l)
667 if dat.maxSize < l {
668 dat.maxSize = l
669 }
670 dat.array[(p*2)+1] = v
671}
672
673// Get check value in double array
Akron3fdfec62021-08-04 11:40:10 +0200674func (dat *DaTokenizer) getCheck(p uint32) uint32 {
675 if int((p*2)+1) >= len(dat.array) {
Akron64ffd9a2021-08-03 19:55:21 +0200676 return 0
677 }
Akron3fdfec62021-08-04 11:40:10 +0200678 return dat.array[(p*2)+1] & restBit
Akron64ffd9a2021-08-03 19:55:21 +0200679}
680
681// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200682func (dat *DaTokenizer) setSize(v int) {
683 dat.setCheck(1, uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200684}
685
686// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200687func (dat *DaTokenizer) GetSize() int {
688 return int(dat.getCheck(1))
Akron8ef408b2021-08-02 22:11:04 +0200689}
690
691// Based on Mizobuchi et al (2000), p. 124
692// This iterates for every state through the complete double array
693// structure until it finds a gap that fits all outgoing transitions
694// of the state. This is extremely slow, but is only necessary in the
695// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200696func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200697
698 // Start at the first entry of the double array list
Akron3f8571a2021-08-05 11:18:10 +0200699 base := uint32(1) // dat.lastFilledBase
700 // skip := false
Akron8ef408b2021-08-02 22:11:04 +0200701OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200702
Akron3f8571a2021-08-05 11:18:10 +0200703 /*
704 if !skip {
705 if dat.getCheck(base) != 0 {
706 dat.lastFilledBase = base
707 } else {
708 skip = true
709 }
710 }
711 */
712
Akron740f3d72021-08-03 12:12:34 +0200713 // Resize the array if necessary
Akron3fdfec62021-08-04 11:40:10 +0200714 dat.resize((int(base) + dat.final) * 2)
Akron8ef408b2021-08-02 22:11:04 +0200715 for _, a := range symbols {
Akron3fdfec62021-08-04 11:40:10 +0200716 if dat.getCheck(base+uint32(a)) != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200717 base++
718 goto OVERLAP
719 }
720 }
Akron8ef408b2021-08-02 22:11:04 +0200721 return base
722}
723
Akron3610f102021-08-08 14:13:25 +0200724// List all outgoing transitions for a state
725// for testing purposes
726func (dat *DaTokenizer) outgoing(t uint32) []int {
727
728 valid := make([]int, 0, len(dat.sigma))
729
730 for _, a := range dat.sigma {
731 t1 := dat.getBase(t) + uint32(a)
732 if t1 <= dat.getCheck(1) && dat.getCheck(t1) == t {
733 valid = append(valid, a)
734 }
735 }
736
737 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
738 t1 := dat.getBase(t) + uint32(a)
739 if t1 <= dat.getCheck(1) && dat.getCheck(t1) == t {
740 valid = append(valid, -1*a)
741 }
742 }
743
744 sort.Ints(valid)
745
746 return valid
747}
748
Akron03a3c612021-08-04 11:51:27 +0200749// LoadFactor as defined in Kanda et al (2018),
750// i.e. the proportion of non-empty elements to all elements.
751func (dat *DaTokenizer) LoadFactor() float64 {
Akrond66a9262021-08-03 17:09:09 +0200752
Akron03a3c612021-08-04 11:51:27 +0200753 // Cache the loadfactor
Akron3f8571a2021-08-05 11:18:10 +0200754 if dat.loadFactor > 0 {
Akron03a3c612021-08-04 11:51:27 +0200755 return dat.loadFactor
Akron773b1ef2021-08-03 17:37:20 +0200756 }
Akrond66a9262021-08-03 17:09:09 +0200757 nonEmpty := 0
758 all := len(dat.array) / 2
759 for x := 1; x <= len(dat.array); x = x + 2 {
760 if dat.array[x] != 0 {
761 nonEmpty++
762 }
763 }
Akron03a3c612021-08-04 11:51:27 +0200764 dat.loadFactor = float64(nonEmpty) / float64(all) * 100
765 return dat.loadFactor
Akrond66a9262021-08-03 17:09:09 +0200766}
767
Akron6247a5d2021-08-03 19:18:28 +0200768// WriteTo stores the double array data in an io.Writer.
Akron3a063ef2021-08-05 19:36:35 +0200769func (dat *DaTokenizer) Save(file string) (n int64, err error) {
770 f, err := os.Create(file)
771 if err != nil {
772 log.Error().Err(err)
773 return 0, nil
774 }
775 defer f.Close()
776 gz := gzip.NewWriter(f)
777 defer gz.Close()
778 n, err = dat.WriteTo(gz)
779 if err != nil {
780 log.Error().Err(err)
781 return n, err
782 }
783 gz.Flush()
784 return n, nil
785}
786
787// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200788func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
789
Akron3a063ef2021-08-05 19:36:35 +0200790 wb := bufio.NewWriter(w)
791 defer wb.Flush()
792
Akron6247a5d2021-08-03 19:18:28 +0200793 // Store magical header
Akron3a063ef2021-08-05 19:36:35 +0200794 all, err := wb.Write([]byte(MAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200795 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200796 log.Error().Err(err)
797 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200798 }
799
800 // Get sigma as a list
801 sigmalist := make([]rune, len(dat.sigma)+16)
802 max := 0
803 for sym, num := range dat.sigma {
804 sigmalist[num] = sym
805 if num > max {
806 max = num
807 }
808 }
809
810 sigmalist = sigmalist[:max+1]
811
Akron3f8571a2021-08-05 11:18:10 +0200812 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200813 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200814 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
815 bo.PutUint16(buf[4:6], uint16(dat.unknown))
816 bo.PutUint16(buf[6:8], uint16(dat.identity))
817 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200818 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akron3f8571a2021-08-05 11:18:10 +0200819 bo.PutUint32(buf[12:16], uint32(len(dat.array)))
Akron3a063ef2021-08-05 19:36:35 +0200820 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200821 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200822 log.Error().Err(err)
823 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200824 }
825
826 all += more
827
Akron3a063ef2021-08-05 19:36:35 +0200828 // wbuf := bytes.NewBuffer(nil)
829 // wbufWrap := bufio.NewWriter(wbuf)
Akron6247a5d2021-08-03 19:18:28 +0200830
831 // Write sigma
832 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200833
Akron3a063ef2021-08-05 19:36:35 +0200834 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200835 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200836 log.Error().Err(err)
837 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200838 }
839 all += more
840 }
Akron3a063ef2021-08-05 19:36:35 +0200841 // wbufWrap.Flush()
842 // more, err = w.Write(wbuf.Bytes())
Akron6247a5d2021-08-03 19:18:28 +0200843 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200844 log.Error().Err(err)
845 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200846 }
Akron3a063ef2021-08-05 19:36:35 +0200847 // all += more
Akron6247a5d2021-08-03 19:18:28 +0200848
849 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200850 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200851 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200852 log.Error().Err(err)
853 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200854 }
855 all += more
856
Akron3a063ef2021-08-05 19:36:35 +0200857 // wbuf.Reset()
Akron6247a5d2021-08-03 19:18:28 +0200858
Akron3a063ef2021-08-05 19:36:35 +0200859 for x := 0; x < len(dat.array); x++ {
860 // for _, d := range dat.array {
861 bo.PutUint32(buf[0:4], dat.array[x])
862 more, err := wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200863 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200864 log.Error().Err(err)
865 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200866 }
Akron3a063ef2021-08-05 19:36:35 +0200867 if more != 4 {
868 log.Error().Msg("Can not write uint32")
869 return int64(all), err
870 }
Akron6247a5d2021-08-03 19:18:28 +0200871 all += more
872 }
873
Akron3a063ef2021-08-05 19:36:35 +0200874 // wbufWrap.Flush()
875
Akron6247a5d2021-08-03 19:18:28 +0200876 return int64(all), err
877}
878
Akron3f8571a2021-08-05 11:18:10 +0200879func LoadDatokFile(file string) *DaTokenizer {
880 f, err := os.Open(file)
881 if err != nil {
882 log.Error().Err(err)
883 os.Exit(0)
884 }
885 defer f.Close()
886
887 gz, err := gzip.NewReader(f)
888 if err != nil {
889 log.Error().Err(err)
890 os.Exit(0)
891 }
892 defer gz.Close()
893
Akron3a063ef2021-08-05 19:36:35 +0200894 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +0200895 return ParseDatok(gz)
896}
897
898func ParseDatok(ior io.Reader) *DaTokenizer {
899
900 dat := &DaTokenizer{
901 sigma: make(map[rune]int),
902 epsilon: 0,
903 unknown: 0,
904 identity: 0,
905 final: 0,
906 loadFactor: 0,
907 }
908
909 r := bufio.NewReader(ior)
910
911 all := 0
912
913 buf := make([]byte, 1024)
914 buf = buf[0:len(MAGIC)]
915
916 more, err := r.Read(buf)
917
918 if err != nil {
919 log.Error().Err(err)
920 return nil
921 }
922
923 all += more
924
925 if string(MAGIC) != string(buf) {
926 log.Error().Msg("Not a datok file")
927 return nil
928 }
929
Akron3a063ef2021-08-05 19:36:35 +0200930 more, err = io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +0200931 if err != nil {
932 log.Error().Err(err)
933 return nil
934 }
935
936 all += more
937
Akron3a063ef2021-08-05 19:36:35 +0200938 version := bo.Uint16(buf[0:2])
939
940 if version != VERSION {
941 log.Error().Msg("Version not compatible")
942 return nil
943 }
944
Akron3f8571a2021-08-05 11:18:10 +0200945 dat.epsilon = int(bo.Uint16(buf[2:4]))
946 dat.unknown = int(bo.Uint16(buf[4:6]))
947 dat.identity = int(bo.Uint16(buf[6:8]))
948 dat.final = int(bo.Uint16(buf[8:10]))
949
950 sigmaCount := int(bo.Uint16(buf[10:12]))
951 arraySize := int(bo.Uint32(buf[12:16]))
952
Akron3a063ef2021-08-05 19:36:35 +0200953 // Shouldn't be relevant though
954 dat.maxSize = arraySize - 1
955
Akron3f8571a2021-08-05 11:18:10 +0200956 for x := 0; x < sigmaCount; x++ {
957 sym, more, err := r.ReadRune()
958 if err == nil && sym != 0 {
959 dat.sigma[sym] = x
960 }
961 all += more
962 }
963
Akron3a063ef2021-08-05 19:36:35 +0200964 more, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +0200965
966 if err != nil {
967 log.Error().Err(err)
968 return nil
969 }
970
971 all += more
972
973 if string("T") != string(buf[0:1]) {
974 log.Error().Msg("Not a datok file")
975 return nil
976 }
977
978 // Read based on length
979 dat.array = make([]uint32, arraySize)
980
981 for x := 0; x < arraySize; x++ {
Akron3a063ef2021-08-05 19:36:35 +0200982 more, err = io.ReadFull(r, buf[0:4])
Akron3f8571a2021-08-05 11:18:10 +0200983 if err != nil {
Akron3a063ef2021-08-05 19:36:35 +0200984 if err == io.EOF {
985 fmt.Println(arraySize, x)
986 break
987 }
Akron3f8571a2021-08-05 11:18:10 +0200988 log.Error().Err(err)
989 return nil
990 }
991 all += more
992 dat.array[x] = bo.Uint32(buf[0:4])
993 }
994
995 return dat
996}
997
Akron740f3d72021-08-03 12:12:34 +0200998// Match an input string against the double array
999// FSA.
1000//
1001// Based on Mizobuchi et al (2000), p. 129,
1002// with additional support for IDENTITY, UNKNOWN
1003// and EPSILON transitions.
Akron64ffd9a2021-08-03 19:55:21 +02001004func (dat *DaTokenizer) Match(input string) bool {
Akron465a0992021-08-03 11:28:48 +02001005 var a int
Akron3fdfec62021-08-04 11:40:10 +02001006 var tu uint32
Akron465a0992021-08-03 11:28:48 +02001007 var ok bool
1008
Akron3fdfec62021-08-04 11:40:10 +02001009 t := uint32(1) // Initial state
Akron740f3d72021-08-03 12:12:34 +02001010 chars := []rune(input)
1011 i := 0
1012
Akron49d27ee2021-08-03 11:58:13 +02001013 for i < len(chars) {
Akron64ffd9a2021-08-03 19:55:21 +02001014 a, ok = dat.sigma[chars[i]]
Akron730a79c2021-08-03 11:05:29 +02001015
Akron740f3d72021-08-03 12:12:34 +02001016 // Support identity symbol if character is not in sigma
Akron64ffd9a2021-08-03 19:55:21 +02001017 if !ok && dat.identity != -1 {
Akron740f3d72021-08-03 12:12:34 +02001018 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +02001019 fmt.Println("IDENTITY symbol", string(chars[i]), "->", dat.identity)
Akron740f3d72021-08-03 12:12:34 +02001020 }
Akron64ffd9a2021-08-03 19:55:21 +02001021 a = dat.identity
Akron740f3d72021-08-03 12:12:34 +02001022 } else if DEBUG {
Akron49d27ee2021-08-03 11:58:13 +02001023 fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
Akron730a79c2021-08-03 11:05:29 +02001024 }
Akron465a0992021-08-03 11:28:48 +02001025 tu = t
Akron730a79c2021-08-03 11:05:29 +02001026 CHECK:
Akron3fdfec62021-08-04 11:40:10 +02001027 t = dat.getBase(tu) + uint32(a)
Akron730a79c2021-08-03 11:05:29 +02001028
Akron740f3d72021-08-03 12:12:34 +02001029 // Check if the transition is valid according to the double array
Akron64ffd9a2021-08-03 19:55:21 +02001030 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +02001031
1032 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +02001033 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron730a79c2021-08-03 11:05:29 +02001034 }
Akron740f3d72021-08-03 12:12:34 +02001035
Akron64ffd9a2021-08-03 19:55:21 +02001036 if !ok && a == dat.identity {
Akron740f3d72021-08-03 12:12:34 +02001037 // Try again with unknown symbol, in case identity failed
1038 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +02001039 fmt.Println("UNKNOWN symbol", string(chars[i]), "->", dat.unknown)
Akron740f3d72021-08-03 12:12:34 +02001040 }
Akron64ffd9a2021-08-03 19:55:21 +02001041 a = dat.unknown
Akron740f3d72021-08-03 12:12:34 +02001042
Akron64ffd9a2021-08-03 19:55:21 +02001043 } else if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +02001044 // Try again with epsilon symbol, in case everything else failed
1045 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +02001046 fmt.Println("EPSILON symbol", string(chars[i]), "->", dat.epsilon)
Akron740f3d72021-08-03 12:12:34 +02001047 }
Akron64ffd9a2021-08-03 19:55:21 +02001048 a = dat.epsilon
Akron740f3d72021-08-03 12:12:34 +02001049 } else {
1050 break
1051 }
1052 goto CHECK
Akron3fdfec62021-08-04 11:40:10 +02001053 } else if dat.isSeparate(t) {
Akron730a79c2021-08-03 11:05:29 +02001054 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +02001055 t = dat.getBase(t)
Akron8ef408b2021-08-02 22:11:04 +02001056 }
Akron740f3d72021-08-03 12:12:34 +02001057
1058 // Transition is fine
Akron64ffd9a2021-08-03 19:55:21 +02001059 if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +02001060 // Character consumed
Akron49d27ee2021-08-03 11:58:13 +02001061 i++
1062 }
Akron83e75a22021-08-04 13:14:06 +02001063
Akron740f3d72021-08-03 12:12:34 +02001064 // TODO:
1065 // Prevent endless epsilon loops!
Akron8ef408b2021-08-02 22:11:04 +02001066 }
1067
Akron740f3d72021-08-03 12:12:34 +02001068 if i != len(chars) {
1069 if DEBUG {
1070 fmt.Println("Not at the end")
1071 }
Akron8ef408b2021-08-02 22:11:04 +02001072 return false
1073 }
1074
Akron465a0992021-08-03 11:28:48 +02001075FINALCHECK:
Akron740f3d72021-08-03 12:12:34 +02001076
1077 // Automaton is in a final state
Akron3fdfec62021-08-04 11:40:10 +02001078 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
Akron8ef408b2021-08-02 22:11:04 +02001079 return true
1080 }
Akron465a0992021-08-03 11:28:48 +02001081
Akron740f3d72021-08-03 12:12:34 +02001082 // Check epsilon transitions until a final state is reached
Akron465a0992021-08-03 11:28:48 +02001083 tu = t
Akron3fdfec62021-08-04 11:40:10 +02001084 t = dat.getBase(tu) + uint32(dat.epsilon)
Akron465a0992021-08-03 11:28:48 +02001085
Akron740f3d72021-08-03 12:12:34 +02001086 // Epsilon transition failed
Akron64ffd9a2021-08-03 19:55:21 +02001087 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +02001088 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +02001089 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron740f3d72021-08-03 12:12:34 +02001090 }
Akron465a0992021-08-03 11:28:48 +02001091 return false
Akron740f3d72021-08-03 12:12:34 +02001092
Akron3fdfec62021-08-04 11:40:10 +02001093 } else if dat.isSeparate(t) {
Akron465a0992021-08-03 11:28:48 +02001094 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +02001095 t = dat.getBase(t)
Akron465a0992021-08-03 11:28:48 +02001096 }
Akron740f3d72021-08-03 12:12:34 +02001097
Akron465a0992021-08-03 11:28:48 +02001098 goto FINALCHECK
Akron8ef408b2021-08-02 22:11:04 +02001099}
Akron068874c2021-08-04 15:19:56 +02001100
Akron3610f102021-08-08 14:13:25 +02001101func showBuffer(buffer []rune, buffo int, buffi int) string {
1102 out := make([]rune, 0, 1024)
1103 for x := 0; x < len(buffer); x++ {
1104 if buffi == x {
1105 out = append(out, '^')
1106 }
1107 if buffo == x {
1108 out = append(out, '[', buffer[x], ']')
1109 } else {
1110 out = append(out, buffer[x])
1111 }
1112 }
1113 return string(out)
1114}
1115
Akron84d68e62021-08-04 17:06:52 +02001116// Transduce an input string against the double array
Akron3610f102021-08-08 14:13:25 +02001117// FSA. The rules are always greedy. If the automaton fails,
1118// it takes the last possible token ending branch.
Akron068874c2021-08-04 15:19:56 +02001119//
1120// Based on Match with additional support
Akron84d68e62021-08-04 17:06:52 +02001121// for NONTOKEN and TOKENEND handling
Akron3f8571a2021-08-05 11:18:10 +02001122func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron068874c2021-08-04 15:19:56 +02001123 var a int
Akronb4bbb472021-08-09 11:49:38 +02001124 var t0 uint32
Akron84d68e62021-08-04 17:06:52 +02001125 var ok, nontoken, tokenend bool
Akron068874c2021-08-04 15:19:56 +02001126
Akron3610f102021-08-08 14:13:25 +02001127 // Remember the last position of a possible tokenend,
1128 // in case the automaton fails.
1129 epsilonState := uint32(0)
1130 epsilonOffset := 0
1131
1132 // Implement a low level buffer for full control,
1133 // however - it is probably better to introduce
1134 // this on a higher level with a io.Reader interface
1135 // The buffer stores a single word and may have white
1136 // space at the end (but not at the beginning).
1137 //
1138 // This is the only backtracking requirement because of
1139 // epsilon transitions, to support tokenizations like:
1140 // "this is an example|.| And it works." vs
1141 // "this is an example.com| application."
1142 buffer := make([]rune, 1024)
1143 buffo := 0 // Buffer offset
1144 buffi := 0 // Buffer length
1145
Akron3f8571a2021-08-05 11:18:10 +02001146 reader := bufio.NewReader(r)
1147 writer := bufio.NewWriter(w)
1148 defer writer.Flush()
Akron068874c2021-08-04 15:19:56 +02001149
Akron3f8571a2021-08-05 11:18:10 +02001150 t := uint32(1) // Initial state
Akron3f8571a2021-08-05 11:18:10 +02001151
1152 var char rune
1153 var err error
1154 eof := false
1155
Akron3610f102021-08-08 14:13:25 +02001156 // TODO:
1157 // Write all characters first into a buffer
1158 // and flush when necessary
1159 // TODO:
1160 // Create an epsilon stack
Akron3f8571a2021-08-05 11:18:10 +02001161 for {
1162
Akronb4bbb472021-08-09 11:49:38 +02001163 // Get from reader if buffer is empty
Akron3610f102021-08-08 14:13:25 +02001164 if buffo >= buffi {
Akron3f8571a2021-08-05 11:18:10 +02001165 char, _, err = reader.ReadRune()
1166 if err != nil {
1167 eof = true
1168 break
1169 }
Akron3610f102021-08-08 14:13:25 +02001170 buffer[buffi] = char
1171 buffi++
Akron3f8571a2021-08-05 11:18:10 +02001172 }
Akron3610f102021-08-08 14:13:25 +02001173
Akron3610f102021-08-08 14:13:25 +02001174 char = buffer[buffo]
Akron3610f102021-08-08 14:13:25 +02001175
1176 if DEBUG {
1177 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
1178 }
Akron3f8571a2021-08-05 11:18:10 +02001179
1180 a, ok = dat.sigma[char]
Akron068874c2021-08-04 15:19:56 +02001181
1182 // Support identity symbol if character is not in sigma
1183 if !ok && dat.identity != -1 {
1184 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001185 fmt.Println("IDENTITY symbol", string(char), "->", dat.identity)
Akron068874c2021-08-04 15:19:56 +02001186 }
1187 a = dat.identity
Akron068874c2021-08-04 15:19:56 +02001188 }
Akron3610f102021-08-08 14:13:25 +02001189
Akronb4bbb472021-08-09 11:49:38 +02001190 t0 = t
1191
1192 if dat.getCheck(dat.getBase(t0)+uint32(dat.epsilon)) == t0 {
Akron3610f102021-08-08 14:13:25 +02001193 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001194 fmt.Println("Remember for epsilon tu:charcount", t0, buffo)
Akron3610f102021-08-08 14:13:25 +02001195 }
Akronb4bbb472021-08-09 11:49:38 +02001196 epsilonState = t0
Akron3610f102021-08-08 14:13:25 +02001197 epsilonOffset = buffo
1198 }
1199
Akron068874c2021-08-04 15:19:56 +02001200 CHECK:
1201 nontoken = false
Akron84d68e62021-08-04 17:06:52 +02001202 tokenend = false
1203
Akronb4bbb472021-08-09 11:49:38 +02001204 t = dat.getBase(t0) + uint32(a)
Akron068874c2021-08-04 15:19:56 +02001205
Akron524c5432021-08-05 14:14:27 +02001206 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001207 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
1208 fmt.Println("Valid:", dat.outgoing(t0))
Akron524c5432021-08-05 14:14:27 +02001209 }
1210
Akron068874c2021-08-04 15:19:56 +02001211 // Check if the transition is valid according to the double array
Akronb4bbb472021-08-09 11:49:38 +02001212 if t > dat.getCheck(1) || dat.getCheck(t) != t0 {
Akron068874c2021-08-04 15:19:56 +02001213
1214 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001215 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", t0)
Akron068874c2021-08-04 15:19:56 +02001216 }
1217
1218 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +02001219
Akron068874c2021-08-04 15:19:56 +02001220 // Try again with unknown symbol, in case identity failed
1221 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001222 fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +02001223 }
1224 a = dat.unknown
1225
1226 } else if a != dat.epsilon {
Akronb4bbb472021-08-09 11:49:38 +02001227
Akron068874c2021-08-04 15:19:56 +02001228 // Try again with epsilon symbol, in case everything else failed
1229 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001230 fmt.Println("EPSILON symbol", string(char), "->", dat.epsilon)
Akron068874c2021-08-04 15:19:56 +02001231 }
Akronb4bbb472021-08-09 11:49:38 +02001232 t0 = epsilonState
Akron068874c2021-08-04 15:19:56 +02001233 a = dat.epsilon
Akron3610f102021-08-08 14:13:25 +02001234 epsilonState = 0 // reset
1235 buffo = epsilonOffset
1236 if DEBUG {
1237 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
1238 }
Akronb4bbb472021-08-09 11:49:38 +02001239
Akron068874c2021-08-04 15:19:56 +02001240 } else {
1241 break
1242 }
Akron068874c2021-08-04 15:19:56 +02001243
Akronb4bbb472021-08-09 11:49:38 +02001244 goto CHECK
1245
1246 }
1247
1248 // Move to representative state
1249 nontoken = dat.isNonToken(t)
1250 tokenend = dat.isTokenEnd(t)
1251
1252 if dat.isSeparate(t) {
Akron068874c2021-08-04 15:19:56 +02001253 t = dat.getBase(t)
Akron84d68e62021-08-04 17:06:52 +02001254
Akron524c5432021-08-05 14:14:27 +02001255 if DEBUG {
1256 fmt.Println("Representative pointing to", t)
1257 }
Akron068874c2021-08-04 15:19:56 +02001258 }
1259
1260 // Transition is fine
Akronb4bbb472021-08-09 11:49:38 +02001261 if a != dat.epsilon {
1262
Akron068874c2021-08-04 15:19:56 +02001263 // Character consumed
Akron3610f102021-08-08 14:13:25 +02001264 buffo++
Akronb4bbb472021-08-09 11:49:38 +02001265 if nontoken {
1266
Akron3610f102021-08-08 14:13:25 +02001267 if DEBUG {
1268 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
1269 }
Akronb4bbb472021-08-09 11:49:38 +02001270
Akron3610f102021-08-08 14:13:25 +02001271 // Maybe remove the first character, if buffo == 0?
1272 if buffo == 1 {
Akronb4bbb472021-08-09 11:49:38 +02001273
1274 // TODO: Better as a ring buffer
Akron3610f102021-08-08 14:13:25 +02001275 for x, i := range buffer[buffo:buffi] {
1276 buffer[x] = i
1277 }
1278 // writer.WriteRune('\n')
1279 buffi -= buffo
1280 epsilonOffset -= buffo
1281 buffo = 0
1282 }
1283 }
Akron3f8571a2021-08-05 11:18:10 +02001284 }
Akron068874c2021-08-04 15:19:56 +02001285
Akron524c5432021-08-05 14:14:27 +02001286 if DEBUG {
1287 fmt.Println(" --> ok!")
1288 }
1289
Akron84d68e62021-08-04 17:06:52 +02001290 if tokenend {
Akron3610f102021-08-08 14:13:25 +02001291 data := []byte(string(buffer[:buffo]))
1292 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001293 fmt.Println("-> Flush buffer:", string(data), showBuffer(buffer, buffo, buffi))
Akron3610f102021-08-08 14:13:25 +02001294 }
1295 writer.Write(data)
Akron3f8571a2021-08-05 11:18:10 +02001296 writer.WriteRune('\n')
Akron3610f102021-08-08 14:13:25 +02001297
1298 // Better as a ring buffer
1299 for x, i := range buffer[buffo:buffi] {
1300 buffer[x] = i
1301 }
Akronb4bbb472021-08-09 11:49:38 +02001302
Akron3610f102021-08-08 14:13:25 +02001303 buffi -= buffo
1304 epsilonOffset -= buffo
1305 buffo = 0
1306 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001307 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
Akron3610f102021-08-08 14:13:25 +02001308 }
Akron84d68e62021-08-04 17:06:52 +02001309 }
1310
Akron068874c2021-08-04 15:19:56 +02001311 // TODO:
1312 // Prevent endless epsilon loops!
1313 }
1314
Akron3f8571a2021-08-05 11:18:10 +02001315 if !eof {
Akron068874c2021-08-04 15:19:56 +02001316 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001317 fmt.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +02001318 }
1319 return false
1320 }
1321
1322FINALCHECK:
1323
1324 // Automaton is in a final state
1325 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
Akronb4bbb472021-08-09 11:49:38 +02001326
1327 if buffi > 0 {
1328 data := []byte(string(buffer[:buffi]))
1329 if DEBUG {
1330 fmt.Println("-> Flush buffer:", string(data))
Akron3f8571a2021-08-05 11:18:10 +02001331 }
Akronb4bbb472021-08-09 11:49:38 +02001332 writer.Write(data)
1333 // states are irrelevant here
1334 }
1335
Akron84d68e62021-08-04 17:06:52 +02001336 if dat.isTokenEnd(t) {
Akron3f8571a2021-08-05 11:18:10 +02001337 writer.WriteRune('\n')
Akron84d68e62021-08-04 17:06:52 +02001338 }
1339
1340 // There may be a new line at the end, from an epsilon, so we go on!
Akron068874c2021-08-04 15:19:56 +02001341 return true
1342 }
1343
1344 // Check epsilon transitions until a final state is reached
Akronb4bbb472021-08-09 11:49:38 +02001345 t0 = t
1346 t = dat.getBase(t0) + uint32(dat.epsilon)
Akron068874c2021-08-04 15:19:56 +02001347
1348 // Epsilon transition failed
Akronb4bbb472021-08-09 11:49:38 +02001349 if t > dat.getCheck(1) || dat.getCheck(t) != t0 {
Akron068874c2021-08-04 15:19:56 +02001350 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001351 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", t0)
Akron068874c2021-08-04 15:19:56 +02001352 }
1353 return false
Akron068874c2021-08-04 15:19:56 +02001354 }
1355
Akronb4bbb472021-08-09 11:49:38 +02001356 // nontoken = dat.isNonToken(t)
1357 tokenend = dat.isTokenEnd(t)
1358
1359 if dat.isSeparate(t) {
1360 // Move to representative state
1361 t = dat.getBase(t)
1362 }
Akron3f8571a2021-08-05 11:18:10 +02001363
1364 if tokenend {
Akron3610f102021-08-08 14:13:25 +02001365 if buffi > 0 {
1366 data := []byte(string(buffer[:buffi]))
1367 if DEBUG {
1368 fmt.Println("-> Flush buffer:", string(data))
1369 }
1370 writer.Write(data)
1371 buffi = 0
1372 buffo = 0
1373 epsilonState = 0
1374 }
Akron3f8571a2021-08-05 11:18:10 +02001375 writer.WriteRune('\n')
Akron068874c2021-08-04 15:19:56 +02001376 }
1377
1378 goto FINALCHECK
1379}