blob: 826ba870e181f00d981bf55fe8747765ef1378fc [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akron740f3d72021-08-03 12:12:34 +02009// TODO:
10// - replace maxSize with the check value
11// - Strip first state and make everything start with 0!
Akron740f3d72021-08-03 12:12:34 +020012
Akron8ef408b2021-08-02 22:11:04 +020013import (
14 "bufio"
Akron6247a5d2021-08-03 19:18:28 +020015 "bytes"
Akron8ef408b2021-08-02 22:11:04 +020016 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020017 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020018 "fmt"
19 "io"
20 "os"
Akronc9d84a62021-08-03 15:56:03 +020021 "sort"
Akron8ef408b2021-08-02 22:11:04 +020022 "strconv"
23 "strings"
24 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020025
26 "github.com/rs/zerolog/log"
Akron8ef408b2021-08-02 22:11:04 +020027)
28
29const (
Akron75ebe7f2021-08-03 10:34:10 +020030 PROPS = 1
31 SIGMA = 2
32 STATES = 3
33 NONE = 4
34 NEWLINE = '\u000a'
Akron740f3d72021-08-03 12:12:34 +020035 DEBUG = false
Akron6247a5d2021-08-03 19:18:28 +020036 MAGIC = "DATOK"
37 VERSION = uint16(1)
Akron8ef408b2021-08-02 22:11:04 +020038)
39
Akron6247a5d2021-08-03 19:18:28 +020040var bo binary.ByteOrder = binary.LittleEndian
41
Akron8ef408b2021-08-02 22:11:04 +020042type mapping struct {
43 source int
44 target int
45}
46
47type edge struct {
Akron740f3d72021-08-03 12:12:34 +020048 inSym int
49 outSym int
50 end int
Akron8ef408b2021-08-02 22:11:04 +020051}
52
53type Tokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020054 // sigma map[rune]int
Akron740f3d72021-08-03 12:12:34 +020055 sigmaRev map[int]rune
56 arcCount int
57 stateCount int
58 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020059 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020060
61 // Special symbols in sigma
62 epsilon int
63 unknown int
64 identity int
65 final int
Akron8ef408b2021-08-02 22:11:04 +020066}
67
Akronf2120ca2021-08-03 16:26:41 +020068type DaTokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020069 // sigmaRev map[int]rune
Akron773b1ef2021-08-03 17:37:20 +020070 sigma map[rune]int
71 maxSize int
72 loadLevel float64
73 array []int
Akronc17f1ca2021-08-03 19:47:27 +020074
75 // Special symbols in sigma
76 epsilon int
77 unknown int
78 identity int
79 final int
Akronf2120ca2021-08-03 16:26:41 +020080}
81
Akronc17f1ca2021-08-03 19:47:27 +020082func ParseFoma(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +020083 f, err := os.Open(file)
84 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020085 log.Error().Err(err)
86 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020087 }
88 defer f.Close()
89
90 gz, err := gzip.NewReader(f)
91 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020092 log.Error().Err(err)
93 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020094 }
95 defer gz.Close()
96
Akron740f3d72021-08-03 12:12:34 +020097 return Parse(gz)
Akron8ef408b2021-08-02 22:11:04 +020098}
99
Akron740f3d72021-08-03 12:12:34 +0200100func Parse(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200101 r := bufio.NewReader(ior)
102
103 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200104 sigmaRev: make(map[int]rune),
Akronc17f1ca2021-08-03 19:47:27 +0200105 epsilon: -1,
106 unknown: -1,
107 identity: -1,
108 final: -1,
Akron8ef408b2021-08-02 22:11:04 +0200109 }
110
Akron740f3d72021-08-03 12:12:34 +0200111 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200112
113 mode := 0
114 var elem []string
115 var elemint [5]int
116
117 for {
118 line, err := r.ReadString('\n')
119 if err != nil {
120 if err == io.EOF {
121 break
122 }
Akron740f3d72021-08-03 12:12:34 +0200123 log.Error().Err(err)
124 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200125 }
126 if strings.HasPrefix(line, "##foma-net") {
127 continue
128 }
129 if strings.HasPrefix(line, "##props##") {
130 mode = PROPS
131 continue
132 }
133 if strings.HasPrefix(line, "##states##") {
134 mode = STATES
135
136 // Adds a final transition symbol to sigma
137 // written as '#' in Mizobuchi et al (2000)
Akron740f3d72021-08-03 12:12:34 +0200138 tok.sigmaCount++
Akronc17f1ca2021-08-03 19:47:27 +0200139 tok.final = tok.sigmaCount
Akron8ef408b2021-08-02 22:11:04 +0200140 continue
141 }
142 if strings.HasPrefix(line, "##sigma##") {
143 mode = SIGMA
144 continue
145 }
146 if strings.HasPrefix(line, "##end##") {
147 mode = NONE
148 continue
149 }
150
151 switch mode {
152 case PROPS:
153 {
154 elem = strings.Split(line, " ")
155 /*
156 fmt.Println("arity: " + elem[0])
157 fmt.Println("arccount: " + elem[1])
158 fmt.Println("statecount: " + elem[2])
159 fmt.Println("linecount: " + elem[3])
160 fmt.Println("finalcount: " + elem[4])
161 fmt.Println("pathcount: " + elem[5])
162 fmt.Println("is_deterministic: " + elem[6])
163 fmt.Println("is_pruned: " + elem[7])
164 fmt.Println("is_minimized: " + elem[8])
165 fmt.Println("is_epsilon_free: " + elem[9])
166 fmt.Println("is_loop_free: " + elem[10])
167 fmt.Println("extras: " + elem[11])
168 fmt.Println("name: " + elem[12])
169 */
170 if elem[6] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200171 log.Error().Msg("The FST needs to be deterministic")
172 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200173 }
174 if elem[9] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200175 log.Error().Msg("The FST needs to be epsilon free")
176 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200177 }
178
179 elemint[0], err = strconv.Atoi(elem[1])
180 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200181 log.Error().Msg("Can't read arccount")
182 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200183 }
Akron740f3d72021-08-03 12:12:34 +0200184 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200185
186 // States start at 1 in Mizobuchi et al (2000),
187 // as the state 0 is associated with a fail.
188 // Initialize states and transitions
189 elemint[0], err = strconv.Atoi(elem[2])
190 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200191 log.Error().Msg("Can't read statecount")
192 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200193 }
Akron740f3d72021-08-03 12:12:34 +0200194 tok.stateCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200195 tok.transitions = make([]map[int]*edge, elemint[0]+1)
196 continue
197 }
198 case STATES:
199 {
200 elem = strings.Split(line[0:len(line)-1], " ")
201 if elem[0] == "-1" {
202 continue
203 }
204 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200205 if err != nil {
206 break
207 }
Akron8ef408b2021-08-02 22:11:04 +0200208
209 if len(elem) > 1 {
210 elemint[1], err = strconv.Atoi(elem[1])
211 if err != nil {
212 break
213 }
214 if len(elem) > 2 {
215 elemint[2], err = strconv.Atoi(elem[2])
216 if err != nil {
217 break
218 }
219 if len(elem) > 3 {
220 elemint[3], err = strconv.Atoi(elem[3])
221 if err != nil {
222 break
223 }
224 if len(elem) > 4 {
225 elemint[4], err = strconv.Atoi(elem[4])
226 if err != nil {
227 break
228 }
229 }
230 }
231 }
232 }
233
234 switch len(elem) {
235 case 5:
236 {
Akron740f3d72021-08-03 12:12:34 +0200237 state = elemint[0]
238 inSym = elemint[1]
239 outSym = elemint[2]
240 end = elemint[3]
241 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200242 }
243 case 4:
244 {
245 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200246 state = elemint[0]
247 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200248 } else {
Akron740f3d72021-08-03 12:12:34 +0200249 state = elemint[0]
250 inSym = elemint[1]
251 end = elemint[2]
252 final = elemint[3]
253 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200254 }
255 }
256 case 3:
257 {
Akron740f3d72021-08-03 12:12:34 +0200258 inSym = elemint[0]
259 outSym = elemint[1]
260 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200261 }
262 case 2:
263 {
Akron740f3d72021-08-03 12:12:34 +0200264 inSym = elemint[0]
265 end = elemint[1]
266 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200267 }
268 }
269
Akron8ef408b2021-08-02 22:11:04 +0200270 // While the states in foma start with 0, the states in the
271 // Mizobuchi FSA start with one - so we increase every state by 1.
272
Akron740f3d72021-08-03 12:12:34 +0200273 if inSym != outSym {
274
275 // Allow any epsilon to become a newline
Akronc17f1ca2021-08-03 19:47:27 +0200276 if !(inSym == tok.epsilon && tok.sigmaRev[outSym] == NEWLINE) &&
Akron740f3d72021-08-03 12:12:34 +0200277
278 // Allow any whitespace to be ignored
Akronc17f1ca2021-08-03 19:47:27 +0200279 !(inSym != tok.epsilon && outSym == tok.epsilon) &&
Akron740f3d72021-08-03 12:12:34 +0200280
281 // Allow any whitespace to become a new line
282 !(tok.sigmaRev[outSym] == NEWLINE) {
283
284 log.Error().Msg(
285 "Unsupported transition: " +
286 strconv.Itoa(state) +
287 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200288 " (" +
Akron740f3d72021-08-03 12:12:34 +0200289 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200290 ":" +
Akron740f3d72021-08-03 12:12:34 +0200291 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200292 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200293 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200294 ":" +
Akron740f3d72021-08-03 12:12:34 +0200295 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200296 ")")
Akron740f3d72021-08-03 12:12:34 +0200297 os.Exit(1)
Akron75ebe7f2021-08-03 10:34:10 +0200298 }
Akron8ef408b2021-08-02 22:11:04 +0200299 }
300
Akron740f3d72021-08-03 12:12:34 +0200301 // This collects all edges until arrstate changes
302
Akron8ef408b2021-08-02 22:11:04 +0200303 // TODO:
304 // if arrin == EPSILON && arrout == TOKENEND, mark state as newline
305 // if the next transition is the same, remove TOKENEND and add SENTENCEEND
306 // This requires to remove the transition alltogether and marks the state instead.
307
308 // TODO:
309 // if arrout == EPSILON, mark the transition as NOTOKEN
310
311 targetObj := &edge{
Akron740f3d72021-08-03 12:12:34 +0200312 inSym: inSym,
313 outSym: outSym,
314 end: end + 1,
Akron8ef408b2021-08-02 22:11:04 +0200315 }
316
Akron740f3d72021-08-03 12:12:34 +0200317 // Initialize outgoing states
318 if tok.transitions[state+1] == nil {
319 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200320 }
321
Akron740f3d72021-08-03 12:12:34 +0200322 // Ignore transitions with invalid symbols
323 if inSym >= 0 {
324 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200325 }
Akron8ef408b2021-08-02 22:11:04 +0200326
Akron740f3d72021-08-03 12:12:34 +0200327 // Add final transition
328 if final == 1 {
Akronc17f1ca2021-08-03 19:47:27 +0200329 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200330 }
331
Akron740f3d72021-08-03 12:12:34 +0200332 if DEBUG {
333 fmt.Println("Add",
334 state+1, "->", end+1,
335 "(",
336 inSym,
337 ":",
338 outSym,
339 ") (",
340 string(tok.sigmaRev[inSym]),
341 ":",
342 string(tok.sigmaRev[outSym]),
343 ")")
344 }
Akron75ebe7f2021-08-03 10:34:10 +0200345
Akron8ef408b2021-08-02 22:11:04 +0200346 continue
347 }
348 case SIGMA:
349 {
350 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
351
352 // Turn string into sigma id
353 number, err := strconv.Atoi(elem[0])
354
355 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200356 log.Error().Err(err)
357 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200358 }
359
Akron740f3d72021-08-03 12:12:34 +0200360 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200361
362 var symbol rune
363
364 // Read rune
365 if utf8.RuneCountInString(elem[1]) == 1 {
366 symbol = []rune(elem[1])[0]
367
368 // Probably a MCS
369 } else if utf8.RuneCountInString(elem[1]) > 1 {
370 switch elem[1] {
371 case "@_EPSILON_SYMBOL_@":
372 {
Akronc17f1ca2021-08-03 19:47:27 +0200373 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200374 continue
375 }
376 case "@_UNKNOWN_SYMBOL_@":
377 {
Akronc17f1ca2021-08-03 19:47:27 +0200378 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200379 continue
380 }
381
382 case "@_IDENTITY_SYMBOL_@":
383 {
Akronc17f1ca2021-08-03 19:47:27 +0200384 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200385 continue
386 }
387 default:
Akron740f3d72021-08-03 12:12:34 +0200388 {
389 log.Error().Msg("MCS not supported: " + line)
390 os.Exit(1)
391 }
Akron8ef408b2021-08-02 22:11:04 +0200392 }
393
Akron740f3d72021-08-03 12:12:34 +0200394 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200395 line, err = r.ReadString('\n')
396 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200397 log.Error().Err(err)
398 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200399 }
400 if len(line) != 1 {
Akron740f3d72021-08-03 12:12:34 +0200401 log.Error().Msg("MCS not supported:" + line)
402 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200403 }
Akron740f3d72021-08-03 12:12:34 +0200404 symbol = rune(NEWLINE)
Akron8ef408b2021-08-02 22:11:04 +0200405 }
406
Akron740f3d72021-08-03 12:12:34 +0200407 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200408 }
409 }
410 }
411
412 return tok
413}
414
415// Implementation of Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200416func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
417
418 dat := &DaTokenizer{
Akron6247a5d2021-08-03 19:18:28 +0200419 sigma: make(map[rune]int),
420 loadLevel: -1,
Akronc17f1ca2021-08-03 19:47:27 +0200421 final: tok.final,
422 unknown: tok.unknown,
423 identity: tok.identity,
424 epsilon: tok.epsilon,
Akronf2120ca2021-08-03 16:26:41 +0200425 }
426
427 for num, sym := range tok.sigmaRev {
428 dat.sigma[sym] = num
429 }
Akron8ef408b2021-08-02 22:11:04 +0200430
431 mark := 0
432 size := 0
433
434 // Create a mapping from s to t
Akron740f3d72021-08-03 12:12:34 +0200435 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200436
437 table[size] = &mapping{source: 1, target: 1}
438 size++
439
Akron740f3d72021-08-03 12:12:34 +0200440 // Allocate space for the outgoing symbol range
441 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200442
443 for mark < size {
444 s := table[mark].source // This is a state in Ms
445 t := table[mark].target // This is a state in Mt
446 mark++
Akron740f3d72021-08-03 12:12:34 +0200447
448 // Following the paper, here the state t can be remembered
449 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200450 A = A[:0]
451 tok.get_set(s, &A)
452
Akron740f3d72021-08-03 12:12:34 +0200453 // Set base to the first free slot in the double array
Akronf2120ca2021-08-03 16:26:41 +0200454 dat.setBase(t, dat.xCheck(A))
Akron8ef408b2021-08-02 22:11:04 +0200455
Akron773b1ef2021-08-03 17:37:20 +0200456 // TODO:
457 // Sort the outgoing transitions based onm the
458 // outdegree of .end
459
Akron740f3d72021-08-03 12:12:34 +0200460 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200461 for _, a := range A {
462
Akronc17f1ca2021-08-03 19:47:27 +0200463 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200464
Akron740f3d72021-08-03 12:12:34 +0200465 // Aka g(s, a)
466 s1 := tok.transitions[s][a].end
Akron8ef408b2021-08-02 22:11:04 +0200467
Akron740f3d72021-08-03 12:12:34 +0200468 // Store the transition
Akronf2120ca2021-08-03 16:26:41 +0200469 t1 := dat.getBase(t) + a
470 dat.setCheck(t1, t)
Akron8ef408b2021-08-02 22:11:04 +0200471
Akron740f3d72021-08-03 12:12:34 +0200472 // Check for representative states
Akron8ef408b2021-08-02 22:11:04 +0200473 r := in_table(s1, table, size)
Akron740f3d72021-08-03 12:12:34 +0200474
Akron8ef408b2021-08-02 22:11:04 +0200475 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200476 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200477 table[size] = &mapping{source: s1, target: t1}
478 size++
479 } else {
Akron740f3d72021-08-03 12:12:34 +0200480 // Overwrite with the representative state
Akronf2120ca2021-08-03 16:26:41 +0200481 dat.setBase(t1, -1*r)
Akron8ef408b2021-08-02 22:11:04 +0200482 }
483 } else {
Akron740f3d72021-08-03 12:12:34 +0200484 // Store a final transition
Akronc17f1ca2021-08-03 19:47:27 +0200485 dat.setCheck(dat.getBase(t)+dat.final, t)
Akron8ef408b2021-08-02 22:11:04 +0200486 }
487 }
488 }
489
490 // Following Mizobuchi et al (2000) the size of the
491 // FSA should be stored in check(1).
Akronf2120ca2021-08-03 16:26:41 +0200492 dat.setCheck(1, dat.maxSize+1)
493 dat.array = dat.array[:dat.maxSize+1]
494 return dat
Akron8ef408b2021-08-02 22:11:04 +0200495}
496
Akron740f3d72021-08-03 12:12:34 +0200497// Resize double array when necessary
Akronf2120ca2021-08-03 16:26:41 +0200498func (tok *DaTokenizer) resize(l int) {
Akron740f3d72021-08-03 12:12:34 +0200499 // TODO:
500 // This is a bit too aggressive atm and should be calmed down.
Akron75ebe7f2021-08-03 10:34:10 +0200501 if len(tok.array) <= l {
Akron8ef408b2021-08-02 22:11:04 +0200502 tok.array = append(tok.array, make([]int, l)...)
503 }
504}
505
Akron740f3d72021-08-03 12:12:34 +0200506// Set base value in double array
Akronf2120ca2021-08-03 16:26:41 +0200507func (tok *DaTokenizer) setBase(p int, v int) {
Akron8ef408b2021-08-02 22:11:04 +0200508 l := p*2 + 1
509 tok.resize(l)
Akron740f3d72021-08-03 12:12:34 +0200510 if tok.maxSize < l {
511 tok.maxSize = l
Akron8ef408b2021-08-02 22:11:04 +0200512 }
513 tok.array[p*2] = v
514}
515
Akron740f3d72021-08-03 12:12:34 +0200516// Get base value in double array
Akronf2120ca2021-08-03 16:26:41 +0200517func (tok *DaTokenizer) getBase(p int) int {
Akron8ef408b2021-08-02 22:11:04 +0200518 if p*2 >= len(tok.array) {
519 return 0
520 }
521 return tok.array[p*2]
522}
523
Akron740f3d72021-08-03 12:12:34 +0200524// Set check value in double array
Akronf2120ca2021-08-03 16:26:41 +0200525func (tok *DaTokenizer) setCheck(p int, v int) {
Akron8ef408b2021-08-02 22:11:04 +0200526 l := p*2 + 1
527 tok.resize(l)
Akron740f3d72021-08-03 12:12:34 +0200528 if tok.maxSize < l {
529 tok.maxSize = l
Akron8ef408b2021-08-02 22:11:04 +0200530 }
531 tok.array[(p*2)+1] = v
532}
533
Akron740f3d72021-08-03 12:12:34 +0200534// Get check value in double array
Akronf2120ca2021-08-03 16:26:41 +0200535func (tok *DaTokenizer) getCheck(p int) int {
Akron8ef408b2021-08-02 22:11:04 +0200536 if (p*2)+1 >= len(tok.array) {
537 return 0
538 }
539 return tok.array[(p*2)+1]
540}
541
Akron740f3d72021-08-03 12:12:34 +0200542// Set size of double array
Akronf2120ca2021-08-03 16:26:41 +0200543func (tok *DaTokenizer) setSize(p, v int) {
Akron740f3d72021-08-03 12:12:34 +0200544 tok.setCheck(1, v)
545}
546
547// Get size of double array
Akron773b1ef2021-08-03 17:37:20 +0200548func (tok *DaTokenizer) GetSize(p int) int {
Akron740f3d72021-08-03 12:12:34 +0200549 return tok.getCheck(1)
550}
551
Akron8ef408b2021-08-02 22:11:04 +0200552// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200553// exists and return this as a representative.
554// Currently iterates through the whole table
555// in a bruteforce manner.
Akron8ef408b2021-08-02 22:11:04 +0200556func in_table(s int, table []*mapping, size int) int {
557 for x := 0; x < size; x++ {
558 if table[x].source == s {
559 return table[x].target
560 }
561 }
562 return 0
563}
564
565// Set alphabet A to the list of all symbols
566// outgoing from s
567func (tok *Tokenizer) get_set(s int, A *[]int) {
Akron75ebe7f2021-08-03 10:34:10 +0200568 for a := range tok.transitions[s] {
Akron8ef408b2021-08-02 22:11:04 +0200569 *A = append(*A, a)
570 }
Akronc9d84a62021-08-03 15:56:03 +0200571
572 // Not required, but simplifies bug hunting
573 sort.Ints(*A)
Akron8ef408b2021-08-02 22:11:04 +0200574}
575
576// Based on Mizobuchi et al (2000), p. 124
577// This iterates for every state through the complete double array
578// structure until it finds a gap that fits all outgoing transitions
579// of the state. This is extremely slow, but is only necessary in the
580// construction phase of the tokenizer.
Akronf2120ca2021-08-03 16:26:41 +0200581func (dat *DaTokenizer) xCheck(symbols []int) int {
Akron740f3d72021-08-03 12:12:34 +0200582
583 // Start at the first entry of the double array list
Akron8ef408b2021-08-02 22:11:04 +0200584 base := 1
585
Akron8ef408b2021-08-02 22:11:04 +0200586OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200587
588 // Resize the array if necessary
Akronc17f1ca2021-08-03 19:47:27 +0200589 dat.resize((base + dat.final) * 2)
Akron8ef408b2021-08-02 22:11:04 +0200590 for _, a := range symbols {
Akronf2120ca2021-08-03 16:26:41 +0200591 if dat.getCheck(base+a) != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200592 base++
593 goto OVERLAP
594 }
595 }
Akron8ef408b2021-08-02 22:11:04 +0200596 return base
597}
598
Akron773b1ef2021-08-03 17:37:20 +0200599func (dat *DaTokenizer) LoadLevel() float64 {
Akrond66a9262021-08-03 17:09:09 +0200600
Akron773b1ef2021-08-03 17:37:20 +0200601 if dat.loadLevel >= 0 {
602 return dat.loadLevel
603 }
Akrond66a9262021-08-03 17:09:09 +0200604 nonEmpty := 0
605 all := len(dat.array) / 2
606 for x := 1; x <= len(dat.array); x = x + 2 {
607 if dat.array[x] != 0 {
608 nonEmpty++
609 }
610 }
Akron773b1ef2021-08-03 17:37:20 +0200611 dat.loadLevel = float64(nonEmpty) / float64(all) * 100
612 return dat.loadLevel
Akrond66a9262021-08-03 17:09:09 +0200613}
614
Akron6247a5d2021-08-03 19:18:28 +0200615// WriteTo stores the double array data in an io.Writer.
616func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
617
618 // Store magical header
619 all, err := w.Write([]byte(MAGIC))
620 if err != nil {
621 log.Error().Msg("Unable to write data")
622 }
623
624 // Get sigma as a list
625 sigmalist := make([]rune, len(dat.sigma)+16)
626 max := 0
627 for sym, num := range dat.sigma {
628 sigmalist[num] = sym
629 if num > max {
630 max = num
631 }
632 }
633
634 sigmalist = sigmalist[:max+1]
635
636 buf := make([]byte, 0, 12)
637 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200638 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
639 bo.PutUint16(buf[4:6], uint16(dat.unknown))
640 bo.PutUint16(buf[6:8], uint16(dat.identity))
641 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200642 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
643 more, err := w.Write(buf[0:12])
644 if err != nil {
645 log.Error().Msg("Unable to write data")
646 }
647
648 all += more
649
650 wbuf := bytes.NewBuffer(nil)
651 wbufWrap := bufio.NewWriter(wbuf)
652
653 // Write sigma
654 for _, sym := range sigmalist {
655 more, err = wbufWrap.WriteRune(sym)
656 if err != nil {
657 log.Error().Msg("Unable to write data")
658 }
659 all += more
660 }
661 wbufWrap.Flush()
662 more, err = w.Write(wbuf.Bytes())
663 if err != nil {
664 log.Error().Msg("Unable to write data")
665 }
666 all += more
667
668 // Test marker - could be checksum
669 more, err = w.Write([]byte("T"))
670 if err != nil {
671 log.Error().Msg("Unable to write data")
672 }
673 all += more
674
675 wbuf.Reset()
676
677 for _, d := range dat.array {
678 bo.PutUint32(buf[0:4], uint32(d))
679 more, err := w.Write(buf[0:4])
680 if err != nil {
681 log.Error().Msg("Unable to write data")
682 }
683 all += more
684 }
685
686 return int64(all), err
687}
688
Akron740f3d72021-08-03 12:12:34 +0200689// Match an input string against the double array
690// FSA.
691//
692// Based on Mizobuchi et al (2000), p. 129,
693// with additional support for IDENTITY, UNKNOWN
694// and EPSILON transitions.
Akronf2120ca2021-08-03 16:26:41 +0200695func (tok *DaTokenizer) Match(input string) bool {
Akron465a0992021-08-03 11:28:48 +0200696 var a int
697 var tu int
698 var ok bool
699
Akron740f3d72021-08-03 12:12:34 +0200700 t := 1 // Initial state
701 chars := []rune(input)
702 i := 0
703
Akron49d27ee2021-08-03 11:58:13 +0200704 for i < len(chars) {
Akron465a0992021-08-03 11:28:48 +0200705 a, ok = tok.sigma[chars[i]]
Akron730a79c2021-08-03 11:05:29 +0200706
Akron740f3d72021-08-03 12:12:34 +0200707 // Support identity symbol if character is not in sigma
Akronc17f1ca2021-08-03 19:47:27 +0200708 if !ok && tok.identity != -1 {
Akron740f3d72021-08-03 12:12:34 +0200709 if DEBUG {
Akronc17f1ca2021-08-03 19:47:27 +0200710 fmt.Println("IDENTITY symbol", string(chars[i]), "->", tok.identity)
Akron740f3d72021-08-03 12:12:34 +0200711 }
Akronc17f1ca2021-08-03 19:47:27 +0200712 a = tok.identity
Akron740f3d72021-08-03 12:12:34 +0200713 } else if DEBUG {
Akron49d27ee2021-08-03 11:58:13 +0200714 fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
Akron730a79c2021-08-03 11:05:29 +0200715 }
Akron465a0992021-08-03 11:28:48 +0200716 tu = t
Akron730a79c2021-08-03 11:05:29 +0200717 CHECK:
Akron740f3d72021-08-03 12:12:34 +0200718 t = tok.getBase(tu) + a
Akron730a79c2021-08-03 11:05:29 +0200719
Akron740f3d72021-08-03 12:12:34 +0200720 // Check if the transition is valid according to the double array
721 if t > tok.getCheck(1) || tok.getCheck(t) != tu {
722
723 if DEBUG {
724 fmt.Println("Match is not fine!", t, "and", tok.getCheck(t), "vs", tu)
Akron730a79c2021-08-03 11:05:29 +0200725 }
Akron740f3d72021-08-03 12:12:34 +0200726
Akronc17f1ca2021-08-03 19:47:27 +0200727 if !ok && a == tok.identity {
Akron740f3d72021-08-03 12:12:34 +0200728 // Try again with unknown symbol, in case identity failed
729 if DEBUG {
Akronc17f1ca2021-08-03 19:47:27 +0200730 fmt.Println("UNKNOWN symbol", string(chars[i]), "->", tok.unknown)
Akron740f3d72021-08-03 12:12:34 +0200731 }
Akronc17f1ca2021-08-03 19:47:27 +0200732 a = tok.unknown
Akron740f3d72021-08-03 12:12:34 +0200733
Akronc17f1ca2021-08-03 19:47:27 +0200734 } else if a != tok.epsilon {
Akron740f3d72021-08-03 12:12:34 +0200735 // Try again with epsilon symbol, in case everything else failed
736 if DEBUG {
Akronc17f1ca2021-08-03 19:47:27 +0200737 fmt.Println("EPSILON symbol", string(chars[i]), "->", tok.epsilon)
Akron740f3d72021-08-03 12:12:34 +0200738 }
Akronc17f1ca2021-08-03 19:47:27 +0200739 a = tok.epsilon
Akron740f3d72021-08-03 12:12:34 +0200740 } else {
741 break
742 }
743 goto CHECK
744 } else if tok.getBase(t) < 0 {
Akron730a79c2021-08-03 11:05:29 +0200745 // Move to representative state
Akron740f3d72021-08-03 12:12:34 +0200746 t = -1 * tok.getBase(t)
Akron8ef408b2021-08-02 22:11:04 +0200747 }
Akron740f3d72021-08-03 12:12:34 +0200748
749 // Transition is fine
Akronc17f1ca2021-08-03 19:47:27 +0200750 if a != tok.epsilon {
Akron740f3d72021-08-03 12:12:34 +0200751 // Character consumed
Akron49d27ee2021-08-03 11:58:13 +0200752 i++
753 }
Akron740f3d72021-08-03 12:12:34 +0200754 // TODO:
755 // Prevent endless epsilon loops!
Akron8ef408b2021-08-02 22:11:04 +0200756 }
757
Akron740f3d72021-08-03 12:12:34 +0200758 if i != len(chars) {
759 if DEBUG {
760 fmt.Println("Not at the end")
761 }
Akron8ef408b2021-08-02 22:11:04 +0200762 return false
763 }
764
Akron465a0992021-08-03 11:28:48 +0200765FINALCHECK:
Akron740f3d72021-08-03 12:12:34 +0200766
767 // Automaton is in a final state
Akronc17f1ca2021-08-03 19:47:27 +0200768 if tok.getCheck(tok.getBase(t)+tok.final) == t {
Akron8ef408b2021-08-02 22:11:04 +0200769 return true
770 }
Akron465a0992021-08-03 11:28:48 +0200771
Akron740f3d72021-08-03 12:12:34 +0200772 // Check epsilon transitions until a final state is reached
Akron465a0992021-08-03 11:28:48 +0200773 tu = t
Akronc17f1ca2021-08-03 19:47:27 +0200774 t = tok.getBase(tu) + tok.epsilon
Akron465a0992021-08-03 11:28:48 +0200775
Akron740f3d72021-08-03 12:12:34 +0200776 // Epsilon transition failed
777 if t > tok.getCheck(1) || tok.getCheck(t) != tu {
778 if DEBUG {
779 fmt.Println("Match is not fine!", t, "and", tok.getCheck(t), "vs", tu)
780 }
Akron465a0992021-08-03 11:28:48 +0200781 return false
Akron740f3d72021-08-03 12:12:34 +0200782
783 } else if tok.getBase(t) < 0 {
Akron465a0992021-08-03 11:28:48 +0200784 // Move to representative state
Akron740f3d72021-08-03 12:12:34 +0200785 t = -1 * tok.getBase(t)
Akron465a0992021-08-03 11:28:48 +0200786 }
Akron740f3d72021-08-03 12:12:34 +0200787
Akron465a0992021-08-03 11:28:48 +0200788 goto FINALCHECK
Akron8ef408b2021-08-02 22:11:04 +0200789}