blob: e05dc568f8fa824de94c6a63115b453f70e7f3d8 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akron83e75a22021-08-04 13:14:06 +02009// The maximum number of states is 107.3741.823 (30bit),
10// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
13
Akron740f3d72021-08-03 12:12:34 +020014// TODO:
15// - replace maxSize with the check value
16// - Strip first state and make everything start with 0!
Akron740f3d72021-08-03 12:12:34 +020017
Akron8ef408b2021-08-02 22:11:04 +020018import (
19 "bufio"
Akron6247a5d2021-08-03 19:18:28 +020020 "bytes"
Akron8ef408b2021-08-02 22:11:04 +020021 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020022 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020023 "fmt"
24 "io"
25 "os"
Akronc9d84a62021-08-03 15:56:03 +020026 "sort"
Akron8ef408b2021-08-02 22:11:04 +020027 "strconv"
28 "strings"
29 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020030
31 "github.com/rs/zerolog/log"
Akron8ef408b2021-08-02 22:11:04 +020032)
33
34const (
Akron2a4b9292021-08-04 15:35:22 +020035 PROPS = 1
36 SIGMA = 2
37 STATES = 3
38 NONE = 4
39 NEWLINE = '\u000a'
40 DEBUG = false
41 MAGIC = "DATOK"
42 VERSION = uint16(1)
43 firstBit uint32 = 1 << 31
44 secondBit uint32 = 1 << 30
45 restBit uint32 = ^uint32(0) &^ (firstBit | secondBit)
Akron8ef408b2021-08-02 22:11:04 +020046)
47
Akron6247a5d2021-08-03 19:18:28 +020048var bo binary.ByteOrder = binary.LittleEndian
49
Akron8ef408b2021-08-02 22:11:04 +020050type mapping struct {
51 source int
Akron3fdfec62021-08-04 11:40:10 +020052 target uint32
Akron8ef408b2021-08-02 22:11:04 +020053}
54
55type edge struct {
Akron83e75a22021-08-04 13:14:06 +020056 inSym int
57 outSym int
58 end int
59 nontoken bool
60 tokenend bool
Akron8ef408b2021-08-02 22:11:04 +020061}
62
63type Tokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020064 // sigma map[rune]int
Akron740f3d72021-08-03 12:12:34 +020065 sigmaRev map[int]rune
66 arcCount int
67 stateCount int
68 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020069 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020070
71 // Special symbols in sigma
72 epsilon int
73 unknown int
74 identity int
75 final int
Akron8ef408b2021-08-02 22:11:04 +020076}
77
Akronf2120ca2021-08-03 16:26:41 +020078type DaTokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020079 // sigmaRev map[int]rune
Akron03a3c612021-08-04 11:51:27 +020080 sigma map[rune]int
81 maxSize int
82 loadFactor float64
83 array []uint32
Akronc17f1ca2021-08-03 19:47:27 +020084
85 // Special symbols in sigma
86 epsilon int
87 unknown int
88 identity int
89 final int
Akronf2120ca2021-08-03 16:26:41 +020090}
91
Akron64ffd9a2021-08-03 19:55:21 +020092func LoadFomaFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +020093 f, err := os.Open(file)
94 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020095 log.Error().Err(err)
96 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020097 }
98 defer f.Close()
99
100 gz, err := gzip.NewReader(f)
101 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200102 log.Error().Err(err)
103 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200104 }
105 defer gz.Close()
106
Akron3fdfec62021-08-04 11:40:10 +0200107 return ParseFoma(gz)
Akron8ef408b2021-08-02 22:11:04 +0200108}
109
Akron3fdfec62021-08-04 11:40:10 +0200110func ParseFoma(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200111 r := bufio.NewReader(ior)
112
113 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200114 sigmaRev: make(map[int]rune),
Akronc17f1ca2021-08-03 19:47:27 +0200115 epsilon: -1,
116 unknown: -1,
117 identity: -1,
118 final: -1,
Akron8ef408b2021-08-02 22:11:04 +0200119 }
120
Akron740f3d72021-08-03 12:12:34 +0200121 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200122
123 mode := 0
124 var elem []string
125 var elemint [5]int
126
127 for {
128 line, err := r.ReadString('\n')
129 if err != nil {
130 if err == io.EOF {
131 break
132 }
Akron740f3d72021-08-03 12:12:34 +0200133 log.Error().Err(err)
134 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200135 }
136 if strings.HasPrefix(line, "##foma-net") {
137 continue
138 }
139 if strings.HasPrefix(line, "##props##") {
140 mode = PROPS
141 continue
142 }
143 if strings.HasPrefix(line, "##states##") {
144 mode = STATES
145
146 // Adds a final transition symbol to sigma
147 // written as '#' in Mizobuchi et al (2000)
Akron740f3d72021-08-03 12:12:34 +0200148 tok.sigmaCount++
Akronc17f1ca2021-08-03 19:47:27 +0200149 tok.final = tok.sigmaCount
Akron8ef408b2021-08-02 22:11:04 +0200150 continue
151 }
152 if strings.HasPrefix(line, "##sigma##") {
153 mode = SIGMA
154 continue
155 }
156 if strings.HasPrefix(line, "##end##") {
157 mode = NONE
158 continue
159 }
160
161 switch mode {
162 case PROPS:
163 {
164 elem = strings.Split(line, " ")
165 /*
166 fmt.Println("arity: " + elem[0])
167 fmt.Println("arccount: " + elem[1])
168 fmt.Println("statecount: " + elem[2])
169 fmt.Println("linecount: " + elem[3])
170 fmt.Println("finalcount: " + elem[4])
171 fmt.Println("pathcount: " + elem[5])
172 fmt.Println("is_deterministic: " + elem[6])
173 fmt.Println("is_pruned: " + elem[7])
174 fmt.Println("is_minimized: " + elem[8])
175 fmt.Println("is_epsilon_free: " + elem[9])
176 fmt.Println("is_loop_free: " + elem[10])
177 fmt.Println("extras: " + elem[11])
178 fmt.Println("name: " + elem[12])
179 */
180 if elem[6] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200181 log.Error().Msg("The FST needs to be deterministic")
182 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200183 }
184 if elem[9] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200185 log.Error().Msg("The FST needs to be epsilon free")
186 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200187 }
188
189 elemint[0], err = strconv.Atoi(elem[1])
190 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200191 log.Error().Msg("Can't read arccount")
192 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200193 }
Akron740f3d72021-08-03 12:12:34 +0200194 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200195
196 // States start at 1 in Mizobuchi et al (2000),
197 // as the state 0 is associated with a fail.
198 // Initialize states and transitions
199 elemint[0], err = strconv.Atoi(elem[2])
200 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200201 log.Error().Msg("Can't read statecount")
202 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200203 }
Akron740f3d72021-08-03 12:12:34 +0200204 tok.stateCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200205 tok.transitions = make([]map[int]*edge, elemint[0]+1)
206 continue
207 }
208 case STATES:
209 {
210 elem = strings.Split(line[0:len(line)-1], " ")
211 if elem[0] == "-1" {
212 continue
213 }
214 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200215 if err != nil {
216 break
217 }
Akron8ef408b2021-08-02 22:11:04 +0200218
219 if len(elem) > 1 {
220 elemint[1], err = strconv.Atoi(elem[1])
221 if err != nil {
222 break
223 }
224 if len(elem) > 2 {
225 elemint[2], err = strconv.Atoi(elem[2])
226 if err != nil {
227 break
228 }
229 if len(elem) > 3 {
230 elemint[3], err = strconv.Atoi(elem[3])
231 if err != nil {
232 break
233 }
234 if len(elem) > 4 {
235 elemint[4], err = strconv.Atoi(elem[4])
236 if err != nil {
237 break
238 }
239 }
240 }
241 }
242 }
243
244 switch len(elem) {
245 case 5:
246 {
Akron740f3d72021-08-03 12:12:34 +0200247 state = elemint[0]
248 inSym = elemint[1]
249 outSym = elemint[2]
250 end = elemint[3]
251 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200252 }
253 case 4:
254 {
255 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200256 state = elemint[0]
257 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200258 } else {
Akron740f3d72021-08-03 12:12:34 +0200259 state = elemint[0]
260 inSym = elemint[1]
261 end = elemint[2]
262 final = elemint[3]
263 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200264 }
265 }
266 case 3:
267 {
Akron740f3d72021-08-03 12:12:34 +0200268 inSym = elemint[0]
269 outSym = elemint[1]
270 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200271 }
272 case 2:
273 {
Akron740f3d72021-08-03 12:12:34 +0200274 inSym = elemint[0]
275 end = elemint[1]
276 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200277 }
278 }
279
Akron8ef408b2021-08-02 22:11:04 +0200280 // While the states in foma start with 0, the states in the
281 // Mizobuchi FSA start with one - so we increase every state by 1.
282
Akron83e75a22021-08-04 13:14:06 +0200283 nontoken := false
284 tokenend := false
285
Akron740f3d72021-08-03 12:12:34 +0200286 if inSym != outSym {
287
Akron83e75a22021-08-04 13:14:06 +0200288 if tok.sigmaRev[outSym] == NEWLINE {
289 tokenend = true
290 } else if outSym == tok.epsilon {
291 nontoken = true
292 } else {
Akron740f3d72021-08-03 12:12:34 +0200293 log.Error().Msg(
294 "Unsupported transition: " +
295 strconv.Itoa(state) +
296 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200297 " (" +
Akron740f3d72021-08-03 12:12:34 +0200298 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200299 ":" +
Akron740f3d72021-08-03 12:12:34 +0200300 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200301 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200302 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200303 ":" +
Akron740f3d72021-08-03 12:12:34 +0200304 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200305 ")")
Akron740f3d72021-08-03 12:12:34 +0200306 os.Exit(1)
Akron75ebe7f2021-08-03 10:34:10 +0200307 }
Akron83e75a22021-08-04 13:14:06 +0200308
Akron83e75a22021-08-04 13:14:06 +0200309 } else if inSym == tok.epsilon {
Akron068874c2021-08-04 15:19:56 +0200310 log.Error().Msg("Epsilon transitions not supported")
311 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200312 }
313
Akron740f3d72021-08-03 12:12:34 +0200314 // This collects all edges until arrstate changes
315
Akron8ef408b2021-08-02 22:11:04 +0200316 // TODO:
317 // if arrin == EPSILON && arrout == TOKENEND, mark state as newline
318 // if the next transition is the same, remove TOKENEND and add SENTENCEEND
319 // This requires to remove the transition alltogether and marks the state instead.
320
321 // TODO:
322 // if arrout == EPSILON, mark the transition as NOTOKEN
323
324 targetObj := &edge{
Akron83e75a22021-08-04 13:14:06 +0200325 inSym: inSym,
326 outSym: outSym,
327 end: end + 1,
328 tokenend: tokenend,
329 nontoken: nontoken,
Akron8ef408b2021-08-02 22:11:04 +0200330 }
331
Akron740f3d72021-08-03 12:12:34 +0200332 // Initialize outgoing states
333 if tok.transitions[state+1] == nil {
334 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200335 }
336
Akron740f3d72021-08-03 12:12:34 +0200337 // Ignore transitions with invalid symbols
338 if inSym >= 0 {
339 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200340 }
Akron8ef408b2021-08-02 22:11:04 +0200341
Akron740f3d72021-08-03 12:12:34 +0200342 // Add final transition
343 if final == 1 {
Akronc17f1ca2021-08-03 19:47:27 +0200344 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200345 }
346
Akron740f3d72021-08-03 12:12:34 +0200347 if DEBUG {
348 fmt.Println("Add",
349 state+1, "->", end+1,
350 "(",
351 inSym,
352 ":",
353 outSym,
354 ") (",
355 string(tok.sigmaRev[inSym]),
356 ":",
357 string(tok.sigmaRev[outSym]),
358 ")")
359 }
Akron75ebe7f2021-08-03 10:34:10 +0200360
Akron8ef408b2021-08-02 22:11:04 +0200361 continue
362 }
363 case SIGMA:
364 {
365 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
366
367 // Turn string into sigma id
368 number, err := strconv.Atoi(elem[0])
369
370 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200371 log.Error().Err(err)
372 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200373 }
374
Akron740f3d72021-08-03 12:12:34 +0200375 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200376
377 var symbol rune
378
379 // Read rune
380 if utf8.RuneCountInString(elem[1]) == 1 {
381 symbol = []rune(elem[1])[0]
382
383 // Probably a MCS
384 } else if utf8.RuneCountInString(elem[1]) > 1 {
385 switch elem[1] {
386 case "@_EPSILON_SYMBOL_@":
387 {
Akronc17f1ca2021-08-03 19:47:27 +0200388 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200389 continue
390 }
391 case "@_UNKNOWN_SYMBOL_@":
392 {
Akronc17f1ca2021-08-03 19:47:27 +0200393 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200394 continue
395 }
396
397 case "@_IDENTITY_SYMBOL_@":
398 {
Akronc17f1ca2021-08-03 19:47:27 +0200399 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200400 continue
401 }
402 default:
Akron740f3d72021-08-03 12:12:34 +0200403 {
404 log.Error().Msg("MCS not supported: " + line)
405 os.Exit(1)
406 }
Akron8ef408b2021-08-02 22:11:04 +0200407 }
408
Akron740f3d72021-08-03 12:12:34 +0200409 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200410 line, err = r.ReadString('\n')
411 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200412 log.Error().Err(err)
413 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200414 }
415 if len(line) != 1 {
Akron740f3d72021-08-03 12:12:34 +0200416 log.Error().Msg("MCS not supported:" + line)
417 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200418 }
Akron740f3d72021-08-03 12:12:34 +0200419 symbol = rune(NEWLINE)
Akron8ef408b2021-08-02 22:11:04 +0200420 }
421
Akron740f3d72021-08-03 12:12:34 +0200422 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200423 }
424 }
425 }
426
427 return tok
428}
429
Akron64ffd9a2021-08-03 19:55:21 +0200430// Set alphabet A to the list of all symbols
431// outgoing from s
432func (tok *Tokenizer) get_set(s int, A *[]int) {
433 for a := range tok.transitions[s] {
434 *A = append(*A, a)
435 }
436
437 // Not required, but simplifies bug hunting
438 sort.Ints(*A)
439}
440
Akron8ef408b2021-08-02 22:11:04 +0200441// Implementation of Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200442func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
443
444 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +0200445 sigma: make(map[rune]int),
446 loadFactor: -1,
447 final: tok.final,
448 unknown: tok.unknown,
449 identity: tok.identity,
450 epsilon: tok.epsilon,
Akronf2120ca2021-08-03 16:26:41 +0200451 }
452
453 for num, sym := range tok.sigmaRev {
454 dat.sigma[sym] = num
455 }
Akron8ef408b2021-08-02 22:11:04 +0200456
457 mark := 0
458 size := 0
459
460 // Create a mapping from s to t
Akron740f3d72021-08-03 12:12:34 +0200461 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200462
463 table[size] = &mapping{source: 1, target: 1}
464 size++
465
Akron740f3d72021-08-03 12:12:34 +0200466 // Allocate space for the outgoing symbol range
467 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200468
469 for mark < size {
470 s := table[mark].source // This is a state in Ms
471 t := table[mark].target // This is a state in Mt
472 mark++
Akron740f3d72021-08-03 12:12:34 +0200473
474 // Following the paper, here the state t can be remembered
475 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200476 A = A[:0]
477 tok.get_set(s, &A)
478
Akron740f3d72021-08-03 12:12:34 +0200479 // Set base to the first free slot in the double array
Akronf2120ca2021-08-03 16:26:41 +0200480 dat.setBase(t, dat.xCheck(A))
Akron8ef408b2021-08-02 22:11:04 +0200481
Akron773b1ef2021-08-03 17:37:20 +0200482 // TODO:
Akron068874c2021-08-04 15:19:56 +0200483 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200484 // outdegree of .end
485
Akron740f3d72021-08-03 12:12:34 +0200486 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200487 for _, a := range A {
488
Akronc17f1ca2021-08-03 19:47:27 +0200489 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200490
Akron740f3d72021-08-03 12:12:34 +0200491 // Aka g(s, a)
492 s1 := tok.transitions[s][a].end
Akron8ef408b2021-08-02 22:11:04 +0200493
Akron740f3d72021-08-03 12:12:34 +0200494 // Store the transition
Akron3fdfec62021-08-04 11:40:10 +0200495 t1 := dat.getBase(t) + uint32(a)
Akronf2120ca2021-08-03 16:26:41 +0200496 dat.setCheck(t1, t)
Akron8ef408b2021-08-02 22:11:04 +0200497
Akron83e75a22021-08-04 13:14:06 +0200498 // Mark the state as being the target of a nontoken transition
499 if tok.transitions[s][a].nontoken {
Akron068874c2021-08-04 15:19:56 +0200500 dat.setNonToken(t1, true)
Akron83e75a22021-08-04 13:14:06 +0200501 }
502
Akron84d68e62021-08-04 17:06:52 +0200503 // Mark the state as being the target of a tokenend transition
504 if tok.transitions[s][a].tokenend {
505 dat.setTokenEnd(t1, true)
506 }
507
Akron740f3d72021-08-03 12:12:34 +0200508 // Check for representative states
Akron8ef408b2021-08-02 22:11:04 +0200509 r := in_table(s1, table, size)
Akron740f3d72021-08-03 12:12:34 +0200510
Akron8ef408b2021-08-02 22:11:04 +0200511 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200512 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200513 table[size] = &mapping{source: s1, target: t1}
514 size++
515 } else {
Akron740f3d72021-08-03 12:12:34 +0200516 // Overwrite with the representative state
Akron3fdfec62021-08-04 11:40:10 +0200517 dat.setBase(t1, r)
518 dat.setSeparate(t1, true)
Akron8ef408b2021-08-02 22:11:04 +0200519 }
520 } else {
Akron740f3d72021-08-03 12:12:34 +0200521 // Store a final transition
Akron3fdfec62021-08-04 11:40:10 +0200522 dat.setCheck(dat.getBase(t)+uint32(dat.final), t)
Akron8ef408b2021-08-02 22:11:04 +0200523 }
524 }
525 }
526
527 // Following Mizobuchi et al (2000) the size of the
528 // FSA should be stored in check(1).
Akron3fdfec62021-08-04 11:40:10 +0200529 dat.setSize(dat.maxSize + 1)
Akronf2120ca2021-08-03 16:26:41 +0200530 dat.array = dat.array[:dat.maxSize+1]
531 return dat
Akron8ef408b2021-08-02 22:11:04 +0200532}
533
Akron8ef408b2021-08-02 22:11:04 +0200534// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200535// exists and return this as a representative.
536// Currently iterates through the whole table
537// in a bruteforce manner.
Akron3fdfec62021-08-04 11:40:10 +0200538func in_table(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200539 for x := 0; x < size; x++ {
540 if table[x].source == s {
541 return table[x].target
542 }
543 }
544 return 0
545}
546
Akron64ffd9a2021-08-03 19:55:21 +0200547// Resize double array when necessary
548func (dat *DaTokenizer) resize(l int) {
549 // TODO:
550 // This is a bit too aggressive atm and should be calmed down.
551 if len(dat.array) <= l {
Akron3fdfec62021-08-04 11:40:10 +0200552 dat.array = append(dat.array, make([]uint32, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200553 }
Akron64ffd9a2021-08-03 19:55:21 +0200554}
Akronc9d84a62021-08-03 15:56:03 +0200555
Akron64ffd9a2021-08-03 19:55:21 +0200556// Set base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200557func (dat *DaTokenizer) setBase(p uint32, v uint32) {
558 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200559 dat.resize(l)
560 if dat.maxSize < l {
561 dat.maxSize = l
562 }
563 dat.array[p*2] = v
564}
565
Akron3fdfec62021-08-04 11:40:10 +0200566// Returns true if a state is separate pointing to a representative
567func (dat *DaTokenizer) isSeparate(p uint32) bool {
Akron2a4b9292021-08-04 15:35:22 +0200568 return dat.array[p*2]&firstBit != 0
Akron3fdfec62021-08-04 11:40:10 +0200569}
570
571// Mark a state as separate pointing to a representative
572func (dat *DaTokenizer) setSeparate(p uint32, sep bool) {
573 if sep {
Akron2a4b9292021-08-04 15:35:22 +0200574 dat.array[p*2] |= firstBit
Akron3fdfec62021-08-04 11:40:10 +0200575 } else {
Akron2a4b9292021-08-04 15:35:22 +0200576 dat.array[p*2] &= (restBit | secondBit)
Akron3fdfec62021-08-04 11:40:10 +0200577 }
578}
579
Akron83e75a22021-08-04 13:14:06 +0200580// Returns true if a state is the target of a nontoken transition
581func (dat *DaTokenizer) isNonToken(p uint32) bool {
Akron2a4b9292021-08-04 15:35:22 +0200582 return dat.array[p*2+1]&firstBit != 0
Akron83e75a22021-08-04 13:14:06 +0200583}
584
585// Mark a state as being the target of a nontoken transition
586func (dat *DaTokenizer) setNonToken(p uint32, sep bool) {
587 if sep {
Akron2a4b9292021-08-04 15:35:22 +0200588 dat.array[p*2+1] |= firstBit
Akron83e75a22021-08-04 13:14:06 +0200589 } else {
Akron2a4b9292021-08-04 15:35:22 +0200590 dat.array[p*2+1] &= (restBit | secondBit)
Akron83e75a22021-08-04 13:14:06 +0200591 }
592}
593
Akron84d68e62021-08-04 17:06:52 +0200594// Returns true if a state is the target of a tokenend transition
595func (dat *DaTokenizer) isTokenEnd(p uint32) bool {
596 return dat.array[p*2+1]&secondBit != 0
597}
598
599// Mark a state as being the target of a tokenend transition
600func (dat *DaTokenizer) setTokenEnd(p uint32, sep bool) {
601 if sep {
602 dat.array[p*2+1] |= secondBit
603 } else {
604 dat.array[p*2+1] &= (restBit | firstBit)
605 }
606}
607
Akron64ffd9a2021-08-03 19:55:21 +0200608// Get base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200609func (dat *DaTokenizer) getBase(p uint32) uint32 {
610 if int(p*2) >= len(dat.array) {
Akron64ffd9a2021-08-03 19:55:21 +0200611 return 0
612 }
Akron3fdfec62021-08-04 11:40:10 +0200613 return dat.array[p*2] & restBit
Akron64ffd9a2021-08-03 19:55:21 +0200614}
615
616// Set check value in double array
Akron3fdfec62021-08-04 11:40:10 +0200617func (dat *DaTokenizer) setCheck(p uint32, v uint32) {
618 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200619 dat.resize(l)
620 if dat.maxSize < l {
621 dat.maxSize = l
622 }
623 dat.array[(p*2)+1] = v
624}
625
626// Get check value in double array
Akron3fdfec62021-08-04 11:40:10 +0200627func (dat *DaTokenizer) getCheck(p uint32) uint32 {
628 if int((p*2)+1) >= len(dat.array) {
Akron64ffd9a2021-08-03 19:55:21 +0200629 return 0
630 }
Akron3fdfec62021-08-04 11:40:10 +0200631 return dat.array[(p*2)+1] & restBit
Akron64ffd9a2021-08-03 19:55:21 +0200632}
633
634// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200635func (dat *DaTokenizer) setSize(v int) {
636 dat.setCheck(1, uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200637}
638
639// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200640func (dat *DaTokenizer) GetSize() int {
641 return int(dat.getCheck(1))
Akron8ef408b2021-08-02 22:11:04 +0200642}
643
644// Based on Mizobuchi et al (2000), p. 124
645// This iterates for every state through the complete double array
646// structure until it finds a gap that fits all outgoing transitions
647// of the state. This is extremely slow, but is only necessary in the
648// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200649func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200650
651 // Start at the first entry of the double array list
Akron3fdfec62021-08-04 11:40:10 +0200652 base := uint32(1)
Akron8ef408b2021-08-02 22:11:04 +0200653
Akron8ef408b2021-08-02 22:11:04 +0200654OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200655
656 // Resize the array if necessary
Akron3fdfec62021-08-04 11:40:10 +0200657 dat.resize((int(base) + dat.final) * 2)
Akron8ef408b2021-08-02 22:11:04 +0200658 for _, a := range symbols {
Akron3fdfec62021-08-04 11:40:10 +0200659 if dat.getCheck(base+uint32(a)) != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200660 base++
661 goto OVERLAP
662 }
663 }
Akron8ef408b2021-08-02 22:11:04 +0200664 return base
665}
666
Akron03a3c612021-08-04 11:51:27 +0200667// LoadFactor as defined in Kanda et al (2018),
668// i.e. the proportion of non-empty elements to all elements.
669func (dat *DaTokenizer) LoadFactor() float64 {
Akrond66a9262021-08-03 17:09:09 +0200670
Akron03a3c612021-08-04 11:51:27 +0200671 // Cache the loadfactor
672 if dat.loadFactor >= 0 {
673 return dat.loadFactor
Akron773b1ef2021-08-03 17:37:20 +0200674 }
Akrond66a9262021-08-03 17:09:09 +0200675 nonEmpty := 0
676 all := len(dat.array) / 2
677 for x := 1; x <= len(dat.array); x = x + 2 {
678 if dat.array[x] != 0 {
679 nonEmpty++
680 }
681 }
Akron03a3c612021-08-04 11:51:27 +0200682 dat.loadFactor = float64(nonEmpty) / float64(all) * 100
683 return dat.loadFactor
Akrond66a9262021-08-03 17:09:09 +0200684}
685
Akron6247a5d2021-08-03 19:18:28 +0200686// WriteTo stores the double array data in an io.Writer.
687func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
688
689 // Store magical header
690 all, err := w.Write([]byte(MAGIC))
691 if err != nil {
692 log.Error().Msg("Unable to write data")
693 }
694
695 // Get sigma as a list
696 sigmalist := make([]rune, len(dat.sigma)+16)
697 max := 0
698 for sym, num := range dat.sigma {
699 sigmalist[num] = sym
700 if num > max {
701 max = num
702 }
703 }
704
705 sigmalist = sigmalist[:max+1]
706
707 buf := make([]byte, 0, 12)
708 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200709 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
710 bo.PutUint16(buf[4:6], uint16(dat.unknown))
711 bo.PutUint16(buf[6:8], uint16(dat.identity))
712 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200713 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
714 more, err := w.Write(buf[0:12])
715 if err != nil {
716 log.Error().Msg("Unable to write data")
717 }
718
719 all += more
720
721 wbuf := bytes.NewBuffer(nil)
722 wbufWrap := bufio.NewWriter(wbuf)
723
724 // Write sigma
725 for _, sym := range sigmalist {
726 more, err = wbufWrap.WriteRune(sym)
727 if err != nil {
728 log.Error().Msg("Unable to write data")
729 }
730 all += more
731 }
732 wbufWrap.Flush()
733 more, err = w.Write(wbuf.Bytes())
734 if err != nil {
735 log.Error().Msg("Unable to write data")
736 }
737 all += more
738
739 // Test marker - could be checksum
740 more, err = w.Write([]byte("T"))
741 if err != nil {
742 log.Error().Msg("Unable to write data")
743 }
744 all += more
745
746 wbuf.Reset()
747
748 for _, d := range dat.array {
Akron3fdfec62021-08-04 11:40:10 +0200749 bo.PutUint32(buf[0:4], d)
Akron6247a5d2021-08-03 19:18:28 +0200750 more, err := w.Write(buf[0:4])
751 if err != nil {
752 log.Error().Msg("Unable to write data")
753 }
754 all += more
755 }
756
757 return int64(all), err
758}
759
Akron740f3d72021-08-03 12:12:34 +0200760// Match an input string against the double array
761// FSA.
762//
763// Based on Mizobuchi et al (2000), p. 129,
764// with additional support for IDENTITY, UNKNOWN
765// and EPSILON transitions.
Akron64ffd9a2021-08-03 19:55:21 +0200766func (dat *DaTokenizer) Match(input string) bool {
Akron465a0992021-08-03 11:28:48 +0200767 var a int
Akron3fdfec62021-08-04 11:40:10 +0200768 var tu uint32
Akron465a0992021-08-03 11:28:48 +0200769 var ok bool
770
Akron3fdfec62021-08-04 11:40:10 +0200771 t := uint32(1) // Initial state
Akron740f3d72021-08-03 12:12:34 +0200772 chars := []rune(input)
773 i := 0
774
Akron49d27ee2021-08-03 11:58:13 +0200775 for i < len(chars) {
Akron64ffd9a2021-08-03 19:55:21 +0200776 a, ok = dat.sigma[chars[i]]
Akron730a79c2021-08-03 11:05:29 +0200777
Akron740f3d72021-08-03 12:12:34 +0200778 // Support identity symbol if character is not in sigma
Akron64ffd9a2021-08-03 19:55:21 +0200779 if !ok && dat.identity != -1 {
Akron740f3d72021-08-03 12:12:34 +0200780 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200781 fmt.Println("IDENTITY symbol", string(chars[i]), "->", dat.identity)
Akron740f3d72021-08-03 12:12:34 +0200782 }
Akron64ffd9a2021-08-03 19:55:21 +0200783 a = dat.identity
Akron740f3d72021-08-03 12:12:34 +0200784 } else if DEBUG {
Akron49d27ee2021-08-03 11:58:13 +0200785 fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
Akron730a79c2021-08-03 11:05:29 +0200786 }
Akron465a0992021-08-03 11:28:48 +0200787 tu = t
Akron730a79c2021-08-03 11:05:29 +0200788 CHECK:
Akron3fdfec62021-08-04 11:40:10 +0200789 t = dat.getBase(tu) + uint32(a)
Akron730a79c2021-08-03 11:05:29 +0200790
Akron740f3d72021-08-03 12:12:34 +0200791 // Check if the transition is valid according to the double array
Akron64ffd9a2021-08-03 19:55:21 +0200792 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +0200793
794 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200795 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron730a79c2021-08-03 11:05:29 +0200796 }
Akron740f3d72021-08-03 12:12:34 +0200797
Akron64ffd9a2021-08-03 19:55:21 +0200798 if !ok && a == dat.identity {
Akron740f3d72021-08-03 12:12:34 +0200799 // Try again with unknown symbol, in case identity failed
800 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200801 fmt.Println("UNKNOWN symbol", string(chars[i]), "->", dat.unknown)
Akron740f3d72021-08-03 12:12:34 +0200802 }
Akron64ffd9a2021-08-03 19:55:21 +0200803 a = dat.unknown
Akron740f3d72021-08-03 12:12:34 +0200804
Akron64ffd9a2021-08-03 19:55:21 +0200805 } else if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +0200806 // Try again with epsilon symbol, in case everything else failed
807 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200808 fmt.Println("EPSILON symbol", string(chars[i]), "->", dat.epsilon)
Akron740f3d72021-08-03 12:12:34 +0200809 }
Akron64ffd9a2021-08-03 19:55:21 +0200810 a = dat.epsilon
Akron740f3d72021-08-03 12:12:34 +0200811 } else {
812 break
813 }
814 goto CHECK
Akron3fdfec62021-08-04 11:40:10 +0200815 } else if dat.isSeparate(t) {
Akron730a79c2021-08-03 11:05:29 +0200816 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +0200817 t = dat.getBase(t)
Akron8ef408b2021-08-02 22:11:04 +0200818 }
Akron740f3d72021-08-03 12:12:34 +0200819
820 // Transition is fine
Akron64ffd9a2021-08-03 19:55:21 +0200821 if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +0200822 // Character consumed
Akron49d27ee2021-08-03 11:58:13 +0200823 i++
824 }
Akron83e75a22021-08-04 13:14:06 +0200825
Akron740f3d72021-08-03 12:12:34 +0200826 // TODO:
827 // Prevent endless epsilon loops!
Akron8ef408b2021-08-02 22:11:04 +0200828 }
829
Akron740f3d72021-08-03 12:12:34 +0200830 if i != len(chars) {
831 if DEBUG {
832 fmt.Println("Not at the end")
833 }
Akron8ef408b2021-08-02 22:11:04 +0200834 return false
835 }
836
Akron465a0992021-08-03 11:28:48 +0200837FINALCHECK:
Akron740f3d72021-08-03 12:12:34 +0200838
839 // Automaton is in a final state
Akron3fdfec62021-08-04 11:40:10 +0200840 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
Akron8ef408b2021-08-02 22:11:04 +0200841 return true
842 }
Akron465a0992021-08-03 11:28:48 +0200843
Akron740f3d72021-08-03 12:12:34 +0200844 // Check epsilon transitions until a final state is reached
Akron465a0992021-08-03 11:28:48 +0200845 tu = t
Akron3fdfec62021-08-04 11:40:10 +0200846 t = dat.getBase(tu) + uint32(dat.epsilon)
Akron465a0992021-08-03 11:28:48 +0200847
Akron740f3d72021-08-03 12:12:34 +0200848 // Epsilon transition failed
Akron64ffd9a2021-08-03 19:55:21 +0200849 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +0200850 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200851 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron740f3d72021-08-03 12:12:34 +0200852 }
Akron465a0992021-08-03 11:28:48 +0200853 return false
Akron740f3d72021-08-03 12:12:34 +0200854
Akron3fdfec62021-08-04 11:40:10 +0200855 } else if dat.isSeparate(t) {
Akron465a0992021-08-03 11:28:48 +0200856 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +0200857 t = dat.getBase(t)
Akron465a0992021-08-03 11:28:48 +0200858 }
Akron740f3d72021-08-03 12:12:34 +0200859
Akron465a0992021-08-03 11:28:48 +0200860 goto FINALCHECK
Akron8ef408b2021-08-02 22:11:04 +0200861}
Akron068874c2021-08-04 15:19:56 +0200862
Akron84d68e62021-08-04 17:06:52 +0200863// Transduce an input string against the double array
Akron068874c2021-08-04 15:19:56 +0200864// FSA.
865//
866// Based on Match with additional support
Akron84d68e62021-08-04 17:06:52 +0200867// for NONTOKEN and TOKENEND handling
Akron068874c2021-08-04 15:19:56 +0200868func (dat *DaTokenizer) Transduce(input string) bool {
869 var a int
870 var tu uint32
Akron84d68e62021-08-04 17:06:52 +0200871 var ok, nontoken, tokenend bool
Akron068874c2021-08-04 15:19:56 +0200872
873 t := uint32(1) // Initial state
874 chars := []rune(input)
875 i := 0
876
877 for i < len(chars) {
878 a, ok = dat.sigma[chars[i]]
879
880 // Support identity symbol if character is not in sigma
881 if !ok && dat.identity != -1 {
882 if DEBUG {
883 fmt.Println("IDENTITY symbol", string(chars[i]), "->", dat.identity)
884 }
885 a = dat.identity
886 } else if DEBUG {
887 fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
888 }
889 tu = t
890 CHECK:
891 nontoken = false
Akron84d68e62021-08-04 17:06:52 +0200892 tokenend = false
893
Akron068874c2021-08-04 15:19:56 +0200894 t = dat.getBase(tu) + uint32(a)
895
896 // Check if the transition is valid according to the double array
897 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
898
899 if DEBUG {
900 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
901 }
902
903 if !ok && a == dat.identity {
904 // Try again with unknown symbol, in case identity failed
905 if DEBUG {
906 fmt.Println("UNKNOWN symbol", string(chars[i]), "->", dat.unknown)
907 }
908 a = dat.unknown
909
910 } else if a != dat.epsilon {
911 // Try again with epsilon symbol, in case everything else failed
912 if DEBUG {
913 fmt.Println("EPSILON symbol", string(chars[i]), "->", dat.epsilon)
914 }
915 a = dat.epsilon
916 } else {
917 break
918 }
919 goto CHECK
920 } else if dat.isSeparate(t) {
921 // Move to representative state
922 nontoken = dat.isNonToken(t)
Akron84d68e62021-08-04 17:06:52 +0200923 tokenend = dat.isTokenEnd(t)
Akron068874c2021-08-04 15:19:56 +0200924
925 t = dat.getBase(t)
Akron84d68e62021-08-04 17:06:52 +0200926
Akron068874c2021-08-04 15:19:56 +0200927 } else {
928 nontoken = dat.isNonToken(t)
Akron84d68e62021-08-04 17:06:52 +0200929 tokenend = dat.isTokenEnd(t)
Akron068874c2021-08-04 15:19:56 +0200930 }
931
932 // Transition is fine
933 if a != dat.epsilon {
934 // Character consumed
935
936 if !nontoken {
937 fmt.Print("[", string(chars[i]), "]")
938 }
939 i++
940 }
941
942 if nontoken {
943 fmt.Print("<|>")
944 }
945
Akron84d68e62021-08-04 17:06:52 +0200946 if tokenend {
947 fmt.Print("< !!! >")
948 }
949
Akron068874c2021-08-04 15:19:56 +0200950 // TODO:
951 // Prevent endless epsilon loops!
952 }
953
954 if i != len(chars) {
955 if DEBUG {
956 fmt.Println("Not at the end")
957 }
958 return false
959 }
960
961FINALCHECK:
962
963 // Automaton is in a final state
964 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
965 if dat.isNonToken(t) {
966 fmt.Print("<|>")
967 }
Akron84d68e62021-08-04 17:06:52 +0200968 if dat.isTokenEnd(t) {
969 fmt.Print("< !!! >")
970 }
971
972 // There may be a new line at the end, from an epsilon, so we go on!
Akron068874c2021-08-04 15:19:56 +0200973 return true
974 }
975
976 // Check epsilon transitions until a final state is reached
977 tu = t
978 t = dat.getBase(tu) + uint32(dat.epsilon)
979
980 // Epsilon transition failed
981 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
982 if DEBUG {
983 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
984 }
985 return false
986
987 } else if dat.isSeparate(t) {
988 nontoken = dat.isNonToken(t)
989 // Move to representative state
990 t = dat.getBase(t)
991 } else {
992 nontoken = dat.isNonToken(t)
993 }
994
995 if nontoken {
996 fmt.Print("<|>")
997 }
998
999 goto FINALCHECK
1000}