blob: f7339ab5c37a1e3a46c7292641dfc4248f15bec5 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akron83e75a22021-08-04 13:14:06 +02009// The maximum number of states is 107.3741.823 (30bit),
10// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
13
Akron740f3d72021-08-03 12:12:34 +020014// TODO:
15// - replace maxSize with the check value
16// - Strip first state and make everything start with 0!
Akron740f3d72021-08-03 12:12:34 +020017
Akron8ef408b2021-08-02 22:11:04 +020018import (
19 "bufio"
Akron6247a5d2021-08-03 19:18:28 +020020 "bytes"
Akron8ef408b2021-08-02 22:11:04 +020021 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020022 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020023 "fmt"
24 "io"
25 "os"
Akronc9d84a62021-08-03 15:56:03 +020026 "sort"
Akron8ef408b2021-08-02 22:11:04 +020027 "strconv"
28 "strings"
29 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020030
31 "github.com/rs/zerolog/log"
Akron8ef408b2021-08-02 22:11:04 +020032)
33
34const (
Akron3fdfec62021-08-04 11:40:10 +020035 PROPS = 1
36 SIGMA = 2
37 STATES = 3
38 NONE = 4
39 NEWLINE = '\u000a'
40 DEBUG = false
41 MAGIC = "DATOK"
42 VERSION = uint16(1)
43 leadingBit uint32 = 1 << 31
44 restBit uint32 = ^uint32(0) &^ (1 << 31)
Akron8ef408b2021-08-02 22:11:04 +020045)
46
Akron6247a5d2021-08-03 19:18:28 +020047var bo binary.ByteOrder = binary.LittleEndian
48
Akron8ef408b2021-08-02 22:11:04 +020049type mapping struct {
50 source int
Akron3fdfec62021-08-04 11:40:10 +020051 target uint32
Akron8ef408b2021-08-02 22:11:04 +020052}
53
54type edge struct {
Akron83e75a22021-08-04 13:14:06 +020055 inSym int
56 outSym int
57 end int
58 nontoken bool
59 tokenend bool
Akron8ef408b2021-08-02 22:11:04 +020060}
61
62type Tokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020063 // sigma map[rune]int
Akron740f3d72021-08-03 12:12:34 +020064 sigmaRev map[int]rune
65 arcCount int
66 stateCount int
67 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020068 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020069
70 // Special symbols in sigma
71 epsilon int
72 unknown int
73 identity int
74 final int
Akron8ef408b2021-08-02 22:11:04 +020075}
76
Akronf2120ca2021-08-03 16:26:41 +020077type DaTokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020078 // sigmaRev map[int]rune
Akron03a3c612021-08-04 11:51:27 +020079 sigma map[rune]int
80 maxSize int
81 loadFactor float64
82 array []uint32
Akronc17f1ca2021-08-03 19:47:27 +020083
84 // Special symbols in sigma
85 epsilon int
86 unknown int
87 identity int
88 final int
Akronf2120ca2021-08-03 16:26:41 +020089}
90
Akron64ffd9a2021-08-03 19:55:21 +020091func LoadFomaFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +020092 f, err := os.Open(file)
93 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020094 log.Error().Err(err)
95 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020096 }
97 defer f.Close()
98
99 gz, err := gzip.NewReader(f)
100 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200101 log.Error().Err(err)
102 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200103 }
104 defer gz.Close()
105
Akron3fdfec62021-08-04 11:40:10 +0200106 return ParseFoma(gz)
Akron8ef408b2021-08-02 22:11:04 +0200107}
108
Akron3fdfec62021-08-04 11:40:10 +0200109func ParseFoma(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200110 r := bufio.NewReader(ior)
111
112 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200113 sigmaRev: make(map[int]rune),
Akronc17f1ca2021-08-03 19:47:27 +0200114 epsilon: -1,
115 unknown: -1,
116 identity: -1,
117 final: -1,
Akron8ef408b2021-08-02 22:11:04 +0200118 }
119
Akron740f3d72021-08-03 12:12:34 +0200120 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200121
122 mode := 0
123 var elem []string
124 var elemint [5]int
125
126 for {
127 line, err := r.ReadString('\n')
128 if err != nil {
129 if err == io.EOF {
130 break
131 }
Akron740f3d72021-08-03 12:12:34 +0200132 log.Error().Err(err)
133 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200134 }
135 if strings.HasPrefix(line, "##foma-net") {
136 continue
137 }
138 if strings.HasPrefix(line, "##props##") {
139 mode = PROPS
140 continue
141 }
142 if strings.HasPrefix(line, "##states##") {
143 mode = STATES
144
145 // Adds a final transition symbol to sigma
146 // written as '#' in Mizobuchi et al (2000)
Akron740f3d72021-08-03 12:12:34 +0200147 tok.sigmaCount++
Akronc17f1ca2021-08-03 19:47:27 +0200148 tok.final = tok.sigmaCount
Akron8ef408b2021-08-02 22:11:04 +0200149 continue
150 }
151 if strings.HasPrefix(line, "##sigma##") {
152 mode = SIGMA
153 continue
154 }
155 if strings.HasPrefix(line, "##end##") {
156 mode = NONE
157 continue
158 }
159
160 switch mode {
161 case PROPS:
162 {
163 elem = strings.Split(line, " ")
164 /*
165 fmt.Println("arity: " + elem[0])
166 fmt.Println("arccount: " + elem[1])
167 fmt.Println("statecount: " + elem[2])
168 fmt.Println("linecount: " + elem[3])
169 fmt.Println("finalcount: " + elem[4])
170 fmt.Println("pathcount: " + elem[5])
171 fmt.Println("is_deterministic: " + elem[6])
172 fmt.Println("is_pruned: " + elem[7])
173 fmt.Println("is_minimized: " + elem[8])
174 fmt.Println("is_epsilon_free: " + elem[9])
175 fmt.Println("is_loop_free: " + elem[10])
176 fmt.Println("extras: " + elem[11])
177 fmt.Println("name: " + elem[12])
178 */
179 if elem[6] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200180 log.Error().Msg("The FST needs to be deterministic")
181 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200182 }
183 if elem[9] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200184 log.Error().Msg("The FST needs to be epsilon free")
185 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200186 }
187
188 elemint[0], err = strconv.Atoi(elem[1])
189 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200190 log.Error().Msg("Can't read arccount")
191 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200192 }
Akron740f3d72021-08-03 12:12:34 +0200193 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200194
195 // States start at 1 in Mizobuchi et al (2000),
196 // as the state 0 is associated with a fail.
197 // Initialize states and transitions
198 elemint[0], err = strconv.Atoi(elem[2])
199 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200200 log.Error().Msg("Can't read statecount")
201 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200202 }
Akron740f3d72021-08-03 12:12:34 +0200203 tok.stateCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200204 tok.transitions = make([]map[int]*edge, elemint[0]+1)
205 continue
206 }
207 case STATES:
208 {
209 elem = strings.Split(line[0:len(line)-1], " ")
210 if elem[0] == "-1" {
211 continue
212 }
213 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200214 if err != nil {
215 break
216 }
Akron8ef408b2021-08-02 22:11:04 +0200217
218 if len(elem) > 1 {
219 elemint[1], err = strconv.Atoi(elem[1])
220 if err != nil {
221 break
222 }
223 if len(elem) > 2 {
224 elemint[2], err = strconv.Atoi(elem[2])
225 if err != nil {
226 break
227 }
228 if len(elem) > 3 {
229 elemint[3], err = strconv.Atoi(elem[3])
230 if err != nil {
231 break
232 }
233 if len(elem) > 4 {
234 elemint[4], err = strconv.Atoi(elem[4])
235 if err != nil {
236 break
237 }
238 }
239 }
240 }
241 }
242
243 switch len(elem) {
244 case 5:
245 {
Akron740f3d72021-08-03 12:12:34 +0200246 state = elemint[0]
247 inSym = elemint[1]
248 outSym = elemint[2]
249 end = elemint[3]
250 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200251 }
252 case 4:
253 {
254 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200255 state = elemint[0]
256 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200257 } else {
Akron740f3d72021-08-03 12:12:34 +0200258 state = elemint[0]
259 inSym = elemint[1]
260 end = elemint[2]
261 final = elemint[3]
262 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200263 }
264 }
265 case 3:
266 {
Akron740f3d72021-08-03 12:12:34 +0200267 inSym = elemint[0]
268 outSym = elemint[1]
269 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200270 }
271 case 2:
272 {
Akron740f3d72021-08-03 12:12:34 +0200273 inSym = elemint[0]
274 end = elemint[1]
275 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200276 }
277 }
278
Akron8ef408b2021-08-02 22:11:04 +0200279 // While the states in foma start with 0, the states in the
280 // Mizobuchi FSA start with one - so we increase every state by 1.
281
Akron83e75a22021-08-04 13:14:06 +0200282 nontoken := false
283 tokenend := false
284
Akron740f3d72021-08-03 12:12:34 +0200285 if inSym != outSym {
286
Akron83e75a22021-08-04 13:14:06 +0200287 if tok.sigmaRev[outSym] == NEWLINE {
288 tokenend = true
289 } else if outSym == tok.epsilon {
290 nontoken = true
291 } else {
Akron740f3d72021-08-03 12:12:34 +0200292 log.Error().Msg(
293 "Unsupported transition: " +
294 strconv.Itoa(state) +
295 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200296 " (" +
Akron740f3d72021-08-03 12:12:34 +0200297 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200298 ":" +
Akron740f3d72021-08-03 12:12:34 +0200299 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200300 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200301 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200302 ":" +
Akron740f3d72021-08-03 12:12:34 +0200303 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200304 ")")
Akron740f3d72021-08-03 12:12:34 +0200305 os.Exit(1)
Akron75ebe7f2021-08-03 10:34:10 +0200306 }
Akron83e75a22021-08-04 13:14:06 +0200307
Akron83e75a22021-08-04 13:14:06 +0200308 } else if inSym == tok.epsilon {
Akron068874c2021-08-04 15:19:56 +0200309 log.Error().Msg("Epsilon transitions not supported")
310 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200311 }
312
Akron740f3d72021-08-03 12:12:34 +0200313 // This collects all edges until arrstate changes
314
Akron8ef408b2021-08-02 22:11:04 +0200315 // TODO:
316 // if arrin == EPSILON && arrout == TOKENEND, mark state as newline
317 // if the next transition is the same, remove TOKENEND and add SENTENCEEND
318 // This requires to remove the transition alltogether and marks the state instead.
319
320 // TODO:
321 // if arrout == EPSILON, mark the transition as NOTOKEN
322
323 targetObj := &edge{
Akron83e75a22021-08-04 13:14:06 +0200324 inSym: inSym,
325 outSym: outSym,
326 end: end + 1,
327 tokenend: tokenend,
328 nontoken: nontoken,
Akron8ef408b2021-08-02 22:11:04 +0200329 }
330
Akron740f3d72021-08-03 12:12:34 +0200331 // Initialize outgoing states
332 if tok.transitions[state+1] == nil {
333 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200334 }
335
Akron740f3d72021-08-03 12:12:34 +0200336 // Ignore transitions with invalid symbols
337 if inSym >= 0 {
338 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200339 }
Akron8ef408b2021-08-02 22:11:04 +0200340
Akron740f3d72021-08-03 12:12:34 +0200341 // Add final transition
342 if final == 1 {
Akronc17f1ca2021-08-03 19:47:27 +0200343 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200344 }
345
Akron740f3d72021-08-03 12:12:34 +0200346 if DEBUG {
347 fmt.Println("Add",
348 state+1, "->", end+1,
349 "(",
350 inSym,
351 ":",
352 outSym,
353 ") (",
354 string(tok.sigmaRev[inSym]),
355 ":",
356 string(tok.sigmaRev[outSym]),
357 ")")
358 }
Akron75ebe7f2021-08-03 10:34:10 +0200359
Akron8ef408b2021-08-02 22:11:04 +0200360 continue
361 }
362 case SIGMA:
363 {
364 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
365
366 // Turn string into sigma id
367 number, err := strconv.Atoi(elem[0])
368
369 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200370 log.Error().Err(err)
371 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200372 }
373
Akron740f3d72021-08-03 12:12:34 +0200374 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200375
376 var symbol rune
377
378 // Read rune
379 if utf8.RuneCountInString(elem[1]) == 1 {
380 symbol = []rune(elem[1])[0]
381
382 // Probably a MCS
383 } else if utf8.RuneCountInString(elem[1]) > 1 {
384 switch elem[1] {
385 case "@_EPSILON_SYMBOL_@":
386 {
Akronc17f1ca2021-08-03 19:47:27 +0200387 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200388 continue
389 }
390 case "@_UNKNOWN_SYMBOL_@":
391 {
Akronc17f1ca2021-08-03 19:47:27 +0200392 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200393 continue
394 }
395
396 case "@_IDENTITY_SYMBOL_@":
397 {
Akronc17f1ca2021-08-03 19:47:27 +0200398 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200399 continue
400 }
401 default:
Akron740f3d72021-08-03 12:12:34 +0200402 {
403 log.Error().Msg("MCS not supported: " + line)
404 os.Exit(1)
405 }
Akron8ef408b2021-08-02 22:11:04 +0200406 }
407
Akron740f3d72021-08-03 12:12:34 +0200408 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200409 line, err = r.ReadString('\n')
410 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200411 log.Error().Err(err)
412 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200413 }
414 if len(line) != 1 {
Akron740f3d72021-08-03 12:12:34 +0200415 log.Error().Msg("MCS not supported:" + line)
416 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200417 }
Akron740f3d72021-08-03 12:12:34 +0200418 symbol = rune(NEWLINE)
Akron8ef408b2021-08-02 22:11:04 +0200419 }
420
Akron740f3d72021-08-03 12:12:34 +0200421 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200422 }
423 }
424 }
425
426 return tok
427}
428
Akron64ffd9a2021-08-03 19:55:21 +0200429// Set alphabet A to the list of all symbols
430// outgoing from s
431func (tok *Tokenizer) get_set(s int, A *[]int) {
432 for a := range tok.transitions[s] {
433 *A = append(*A, a)
434 }
435
436 // Not required, but simplifies bug hunting
437 sort.Ints(*A)
438}
439
Akron8ef408b2021-08-02 22:11:04 +0200440// Implementation of Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200441func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
442
443 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +0200444 sigma: make(map[rune]int),
445 loadFactor: -1,
446 final: tok.final,
447 unknown: tok.unknown,
448 identity: tok.identity,
449 epsilon: tok.epsilon,
Akronf2120ca2021-08-03 16:26:41 +0200450 }
451
452 for num, sym := range tok.sigmaRev {
453 dat.sigma[sym] = num
454 }
Akron8ef408b2021-08-02 22:11:04 +0200455
456 mark := 0
457 size := 0
458
459 // Create a mapping from s to t
Akron740f3d72021-08-03 12:12:34 +0200460 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200461
462 table[size] = &mapping{source: 1, target: 1}
463 size++
464
Akron740f3d72021-08-03 12:12:34 +0200465 // Allocate space for the outgoing symbol range
466 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200467
468 for mark < size {
469 s := table[mark].source // This is a state in Ms
470 t := table[mark].target // This is a state in Mt
471 mark++
Akron740f3d72021-08-03 12:12:34 +0200472
473 // Following the paper, here the state t can be remembered
474 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200475 A = A[:0]
476 tok.get_set(s, &A)
477
Akron740f3d72021-08-03 12:12:34 +0200478 // Set base to the first free slot in the double array
Akronf2120ca2021-08-03 16:26:41 +0200479 dat.setBase(t, dat.xCheck(A))
Akron8ef408b2021-08-02 22:11:04 +0200480
Akron773b1ef2021-08-03 17:37:20 +0200481 // TODO:
Akron068874c2021-08-04 15:19:56 +0200482 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200483 // outdegree of .end
484
Akron740f3d72021-08-03 12:12:34 +0200485 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200486 for _, a := range A {
487
Akronc17f1ca2021-08-03 19:47:27 +0200488 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200489
Akron740f3d72021-08-03 12:12:34 +0200490 // Aka g(s, a)
491 s1 := tok.transitions[s][a].end
Akron8ef408b2021-08-02 22:11:04 +0200492
Akron740f3d72021-08-03 12:12:34 +0200493 // Store the transition
Akron3fdfec62021-08-04 11:40:10 +0200494 t1 := dat.getBase(t) + uint32(a)
Akronf2120ca2021-08-03 16:26:41 +0200495 dat.setCheck(t1, t)
Akron8ef408b2021-08-02 22:11:04 +0200496
Akron83e75a22021-08-04 13:14:06 +0200497 // Mark the state as being the target of a nontoken transition
498 if tok.transitions[s][a].nontoken {
Akron068874c2021-08-04 15:19:56 +0200499 dat.setNonToken(t1, true)
Akron83e75a22021-08-04 13:14:06 +0200500 }
501
Akron740f3d72021-08-03 12:12:34 +0200502 // Check for representative states
Akron8ef408b2021-08-02 22:11:04 +0200503 r := in_table(s1, table, size)
Akron740f3d72021-08-03 12:12:34 +0200504
Akron8ef408b2021-08-02 22:11:04 +0200505 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200506 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200507 table[size] = &mapping{source: s1, target: t1}
508 size++
509 } else {
Akron740f3d72021-08-03 12:12:34 +0200510 // Overwrite with the representative state
Akron3fdfec62021-08-04 11:40:10 +0200511 dat.setBase(t1, r)
512 dat.setSeparate(t1, true)
Akron8ef408b2021-08-02 22:11:04 +0200513 }
514 } else {
Akron740f3d72021-08-03 12:12:34 +0200515 // Store a final transition
Akron3fdfec62021-08-04 11:40:10 +0200516 dat.setCheck(dat.getBase(t)+uint32(dat.final), t)
Akron8ef408b2021-08-02 22:11:04 +0200517 }
518 }
519 }
520
521 // Following Mizobuchi et al (2000) the size of the
522 // FSA should be stored in check(1).
Akron3fdfec62021-08-04 11:40:10 +0200523 dat.setSize(dat.maxSize + 1)
Akronf2120ca2021-08-03 16:26:41 +0200524 dat.array = dat.array[:dat.maxSize+1]
525 return dat
Akron8ef408b2021-08-02 22:11:04 +0200526}
527
Akron8ef408b2021-08-02 22:11:04 +0200528// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200529// exists and return this as a representative.
530// Currently iterates through the whole table
531// in a bruteforce manner.
Akron3fdfec62021-08-04 11:40:10 +0200532func in_table(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200533 for x := 0; x < size; x++ {
534 if table[x].source == s {
535 return table[x].target
536 }
537 }
538 return 0
539}
540
Akron64ffd9a2021-08-03 19:55:21 +0200541// Resize double array when necessary
542func (dat *DaTokenizer) resize(l int) {
543 // TODO:
544 // This is a bit too aggressive atm and should be calmed down.
545 if len(dat.array) <= l {
Akron3fdfec62021-08-04 11:40:10 +0200546 dat.array = append(dat.array, make([]uint32, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200547 }
Akron64ffd9a2021-08-03 19:55:21 +0200548}
Akronc9d84a62021-08-03 15:56:03 +0200549
Akron64ffd9a2021-08-03 19:55:21 +0200550// Set base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200551func (dat *DaTokenizer) setBase(p uint32, v uint32) {
552 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200553 dat.resize(l)
554 if dat.maxSize < l {
555 dat.maxSize = l
556 }
557 dat.array[p*2] = v
558}
559
Akron3fdfec62021-08-04 11:40:10 +0200560// Returns true if a state is separate pointing to a representative
561func (dat *DaTokenizer) isSeparate(p uint32) bool {
562 return dat.array[p*2]&leadingBit != 0
563}
564
565// Mark a state as separate pointing to a representative
566func (dat *DaTokenizer) setSeparate(p uint32, sep bool) {
567 if sep {
568 dat.array[p*2] |= leadingBit
569 } else {
570 dat.array[p*2] &= restBit
571 }
572}
573
Akron83e75a22021-08-04 13:14:06 +0200574// Returns true if a state is the target of a nontoken transition
575func (dat *DaTokenizer) isNonToken(p uint32) bool {
576 return dat.array[p*2+1]&leadingBit != 0
577}
578
579// Mark a state as being the target of a nontoken transition
580func (dat *DaTokenizer) setNonToken(p uint32, sep bool) {
581 if sep {
582 dat.array[p*2+1] |= leadingBit
583 } else {
584 dat.array[p*2+1] &= restBit
585 }
586}
587
Akron64ffd9a2021-08-03 19:55:21 +0200588// Get base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200589func (dat *DaTokenizer) getBase(p uint32) uint32 {
590 if int(p*2) >= len(dat.array) {
Akron64ffd9a2021-08-03 19:55:21 +0200591 return 0
592 }
Akron3fdfec62021-08-04 11:40:10 +0200593 return dat.array[p*2] & restBit
Akron64ffd9a2021-08-03 19:55:21 +0200594}
595
596// Set check value in double array
Akron3fdfec62021-08-04 11:40:10 +0200597func (dat *DaTokenizer) setCheck(p uint32, v uint32) {
598 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200599 dat.resize(l)
600 if dat.maxSize < l {
601 dat.maxSize = l
602 }
603 dat.array[(p*2)+1] = v
604}
605
606// Get check value in double array
Akron3fdfec62021-08-04 11:40:10 +0200607func (dat *DaTokenizer) getCheck(p uint32) uint32 {
608 if int((p*2)+1) >= len(dat.array) {
Akron64ffd9a2021-08-03 19:55:21 +0200609 return 0
610 }
Akron3fdfec62021-08-04 11:40:10 +0200611 return dat.array[(p*2)+1] & restBit
Akron64ffd9a2021-08-03 19:55:21 +0200612}
613
614// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200615func (dat *DaTokenizer) setSize(v int) {
616 dat.setCheck(1, uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200617}
618
619// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200620func (dat *DaTokenizer) GetSize() int {
621 return int(dat.getCheck(1))
Akron8ef408b2021-08-02 22:11:04 +0200622}
623
624// Based on Mizobuchi et al (2000), p. 124
625// This iterates for every state through the complete double array
626// structure until it finds a gap that fits all outgoing transitions
627// of the state. This is extremely slow, but is only necessary in the
628// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200629func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200630
631 // Start at the first entry of the double array list
Akron3fdfec62021-08-04 11:40:10 +0200632 base := uint32(1)
Akron8ef408b2021-08-02 22:11:04 +0200633
Akron8ef408b2021-08-02 22:11:04 +0200634OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200635
636 // Resize the array if necessary
Akron3fdfec62021-08-04 11:40:10 +0200637 dat.resize((int(base) + dat.final) * 2)
Akron8ef408b2021-08-02 22:11:04 +0200638 for _, a := range symbols {
Akron3fdfec62021-08-04 11:40:10 +0200639 if dat.getCheck(base+uint32(a)) != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200640 base++
641 goto OVERLAP
642 }
643 }
Akron8ef408b2021-08-02 22:11:04 +0200644 return base
645}
646
Akron03a3c612021-08-04 11:51:27 +0200647// LoadFactor as defined in Kanda et al (2018),
648// i.e. the proportion of non-empty elements to all elements.
649func (dat *DaTokenizer) LoadFactor() float64 {
Akrond66a9262021-08-03 17:09:09 +0200650
Akron03a3c612021-08-04 11:51:27 +0200651 // Cache the loadfactor
652 if dat.loadFactor >= 0 {
653 return dat.loadFactor
Akron773b1ef2021-08-03 17:37:20 +0200654 }
Akrond66a9262021-08-03 17:09:09 +0200655 nonEmpty := 0
656 all := len(dat.array) / 2
657 for x := 1; x <= len(dat.array); x = x + 2 {
658 if dat.array[x] != 0 {
659 nonEmpty++
660 }
661 }
Akron03a3c612021-08-04 11:51:27 +0200662 dat.loadFactor = float64(nonEmpty) / float64(all) * 100
663 return dat.loadFactor
Akrond66a9262021-08-03 17:09:09 +0200664}
665
Akron6247a5d2021-08-03 19:18:28 +0200666// WriteTo stores the double array data in an io.Writer.
667func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
668
669 // Store magical header
670 all, err := w.Write([]byte(MAGIC))
671 if err != nil {
672 log.Error().Msg("Unable to write data")
673 }
674
675 // Get sigma as a list
676 sigmalist := make([]rune, len(dat.sigma)+16)
677 max := 0
678 for sym, num := range dat.sigma {
679 sigmalist[num] = sym
680 if num > max {
681 max = num
682 }
683 }
684
685 sigmalist = sigmalist[:max+1]
686
687 buf := make([]byte, 0, 12)
688 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200689 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
690 bo.PutUint16(buf[4:6], uint16(dat.unknown))
691 bo.PutUint16(buf[6:8], uint16(dat.identity))
692 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200693 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
694 more, err := w.Write(buf[0:12])
695 if err != nil {
696 log.Error().Msg("Unable to write data")
697 }
698
699 all += more
700
701 wbuf := bytes.NewBuffer(nil)
702 wbufWrap := bufio.NewWriter(wbuf)
703
704 // Write sigma
705 for _, sym := range sigmalist {
706 more, err = wbufWrap.WriteRune(sym)
707 if err != nil {
708 log.Error().Msg("Unable to write data")
709 }
710 all += more
711 }
712 wbufWrap.Flush()
713 more, err = w.Write(wbuf.Bytes())
714 if err != nil {
715 log.Error().Msg("Unable to write data")
716 }
717 all += more
718
719 // Test marker - could be checksum
720 more, err = w.Write([]byte("T"))
721 if err != nil {
722 log.Error().Msg("Unable to write data")
723 }
724 all += more
725
726 wbuf.Reset()
727
728 for _, d := range dat.array {
Akron3fdfec62021-08-04 11:40:10 +0200729 bo.PutUint32(buf[0:4], d)
Akron6247a5d2021-08-03 19:18:28 +0200730 more, err := w.Write(buf[0:4])
731 if err != nil {
732 log.Error().Msg("Unable to write data")
733 }
734 all += more
735 }
736
737 return int64(all), err
738}
739
Akron740f3d72021-08-03 12:12:34 +0200740// Match an input string against the double array
741// FSA.
742//
743// Based on Mizobuchi et al (2000), p. 129,
744// with additional support for IDENTITY, UNKNOWN
745// and EPSILON transitions.
Akron64ffd9a2021-08-03 19:55:21 +0200746func (dat *DaTokenizer) Match(input string) bool {
Akron465a0992021-08-03 11:28:48 +0200747 var a int
Akron3fdfec62021-08-04 11:40:10 +0200748 var tu uint32
Akron465a0992021-08-03 11:28:48 +0200749 var ok bool
750
Akron3fdfec62021-08-04 11:40:10 +0200751 t := uint32(1) // Initial state
Akron740f3d72021-08-03 12:12:34 +0200752 chars := []rune(input)
753 i := 0
754
Akron49d27ee2021-08-03 11:58:13 +0200755 for i < len(chars) {
Akron64ffd9a2021-08-03 19:55:21 +0200756 a, ok = dat.sigma[chars[i]]
Akron730a79c2021-08-03 11:05:29 +0200757
Akron740f3d72021-08-03 12:12:34 +0200758 // Support identity symbol if character is not in sigma
Akron64ffd9a2021-08-03 19:55:21 +0200759 if !ok && dat.identity != -1 {
Akron740f3d72021-08-03 12:12:34 +0200760 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200761 fmt.Println("IDENTITY symbol", string(chars[i]), "->", dat.identity)
Akron740f3d72021-08-03 12:12:34 +0200762 }
Akron64ffd9a2021-08-03 19:55:21 +0200763 a = dat.identity
Akron740f3d72021-08-03 12:12:34 +0200764 } else if DEBUG {
Akron49d27ee2021-08-03 11:58:13 +0200765 fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
Akron730a79c2021-08-03 11:05:29 +0200766 }
Akron465a0992021-08-03 11:28:48 +0200767 tu = t
Akron730a79c2021-08-03 11:05:29 +0200768 CHECK:
Akron3fdfec62021-08-04 11:40:10 +0200769 t = dat.getBase(tu) + uint32(a)
Akron730a79c2021-08-03 11:05:29 +0200770
Akron740f3d72021-08-03 12:12:34 +0200771 // Check if the transition is valid according to the double array
Akron64ffd9a2021-08-03 19:55:21 +0200772 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +0200773
774 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200775 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron730a79c2021-08-03 11:05:29 +0200776 }
Akron740f3d72021-08-03 12:12:34 +0200777
Akron64ffd9a2021-08-03 19:55:21 +0200778 if !ok && a == dat.identity {
Akron740f3d72021-08-03 12:12:34 +0200779 // Try again with unknown symbol, in case identity failed
780 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200781 fmt.Println("UNKNOWN symbol", string(chars[i]), "->", dat.unknown)
Akron740f3d72021-08-03 12:12:34 +0200782 }
Akron64ffd9a2021-08-03 19:55:21 +0200783 a = dat.unknown
Akron740f3d72021-08-03 12:12:34 +0200784
Akron64ffd9a2021-08-03 19:55:21 +0200785 } else if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +0200786 // Try again with epsilon symbol, in case everything else failed
787 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200788 fmt.Println("EPSILON symbol", string(chars[i]), "->", dat.epsilon)
Akron740f3d72021-08-03 12:12:34 +0200789 }
Akron64ffd9a2021-08-03 19:55:21 +0200790 a = dat.epsilon
Akron740f3d72021-08-03 12:12:34 +0200791 } else {
792 break
793 }
794 goto CHECK
Akron3fdfec62021-08-04 11:40:10 +0200795 } else if dat.isSeparate(t) {
Akron730a79c2021-08-03 11:05:29 +0200796 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +0200797 t = dat.getBase(t)
Akron8ef408b2021-08-02 22:11:04 +0200798 }
Akron740f3d72021-08-03 12:12:34 +0200799
800 // Transition is fine
Akron64ffd9a2021-08-03 19:55:21 +0200801 if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +0200802 // Character consumed
Akron49d27ee2021-08-03 11:58:13 +0200803 i++
804 }
Akron83e75a22021-08-04 13:14:06 +0200805
Akron740f3d72021-08-03 12:12:34 +0200806 // TODO:
807 // Prevent endless epsilon loops!
Akron8ef408b2021-08-02 22:11:04 +0200808 }
809
Akron740f3d72021-08-03 12:12:34 +0200810 if i != len(chars) {
811 if DEBUG {
812 fmt.Println("Not at the end")
813 }
Akron8ef408b2021-08-02 22:11:04 +0200814 return false
815 }
816
Akron465a0992021-08-03 11:28:48 +0200817FINALCHECK:
Akron740f3d72021-08-03 12:12:34 +0200818
819 // Automaton is in a final state
Akron3fdfec62021-08-04 11:40:10 +0200820 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
Akron8ef408b2021-08-02 22:11:04 +0200821 return true
822 }
Akron465a0992021-08-03 11:28:48 +0200823
Akron740f3d72021-08-03 12:12:34 +0200824 // Check epsilon transitions until a final state is reached
Akron465a0992021-08-03 11:28:48 +0200825 tu = t
Akron3fdfec62021-08-04 11:40:10 +0200826 t = dat.getBase(tu) + uint32(dat.epsilon)
Akron465a0992021-08-03 11:28:48 +0200827
Akron740f3d72021-08-03 12:12:34 +0200828 // Epsilon transition failed
Akron64ffd9a2021-08-03 19:55:21 +0200829 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +0200830 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200831 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron740f3d72021-08-03 12:12:34 +0200832 }
Akron465a0992021-08-03 11:28:48 +0200833 return false
Akron740f3d72021-08-03 12:12:34 +0200834
Akron3fdfec62021-08-04 11:40:10 +0200835 } else if dat.isSeparate(t) {
Akron465a0992021-08-03 11:28:48 +0200836 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +0200837 t = dat.getBase(t)
Akron465a0992021-08-03 11:28:48 +0200838 }
Akron740f3d72021-08-03 12:12:34 +0200839
Akron465a0992021-08-03 11:28:48 +0200840 goto FINALCHECK
Akron8ef408b2021-08-02 22:11:04 +0200841}
Akron068874c2021-08-04 15:19:56 +0200842
843// Match an input string against the double array
844// FSA.
845//
846// Based on Match with additional support
847// for NONTOKEN handling
848func (dat *DaTokenizer) Transduce(input string) bool {
849 var a int
850 var tu uint32
851 var ok, nontoken bool
852
853 t := uint32(1) // Initial state
854 chars := []rune(input)
855 i := 0
856
857 for i < len(chars) {
858 a, ok = dat.sigma[chars[i]]
859
860 // Support identity symbol if character is not in sigma
861 if !ok && dat.identity != -1 {
862 if DEBUG {
863 fmt.Println("IDENTITY symbol", string(chars[i]), "->", dat.identity)
864 }
865 a = dat.identity
866 } else if DEBUG {
867 fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
868 }
869 tu = t
870 CHECK:
871 nontoken = false
872 t = dat.getBase(tu) + uint32(a)
873
874 // Check if the transition is valid according to the double array
875 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
876
877 if DEBUG {
878 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
879 }
880
881 if !ok && a == dat.identity {
882 // Try again with unknown symbol, in case identity failed
883 if DEBUG {
884 fmt.Println("UNKNOWN symbol", string(chars[i]), "->", dat.unknown)
885 }
886 a = dat.unknown
887
888 } else if a != dat.epsilon {
889 // Try again with epsilon symbol, in case everything else failed
890 if DEBUG {
891 fmt.Println("EPSILON symbol", string(chars[i]), "->", dat.epsilon)
892 }
893 a = dat.epsilon
894 } else {
895 break
896 }
897 goto CHECK
898 } else if dat.isSeparate(t) {
899 // Move to representative state
900 nontoken = dat.isNonToken(t)
901
902 t = dat.getBase(t)
903 } else {
904 nontoken = dat.isNonToken(t)
905 }
906
907 // Transition is fine
908 if a != dat.epsilon {
909 // Character consumed
910
911 if !nontoken {
912 fmt.Print("[", string(chars[i]), "]")
913 }
914 i++
915 }
916
917 if nontoken {
918 fmt.Print("<|>")
919 }
920
921 // TODO:
922 // Prevent endless epsilon loops!
923 }
924
925 if i != len(chars) {
926 if DEBUG {
927 fmt.Println("Not at the end")
928 }
929 return false
930 }
931
932FINALCHECK:
933
934 // Automaton is in a final state
935 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
936 if dat.isNonToken(t) {
937 fmt.Print("<|>")
938 }
939 return true
940 }
941
942 // Check epsilon transitions until a final state is reached
943 tu = t
944 t = dat.getBase(tu) + uint32(dat.epsilon)
945
946 // Epsilon transition failed
947 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
948 if DEBUG {
949 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
950 }
951 return false
952
953 } else if dat.isSeparate(t) {
954 nontoken = dat.isNonToken(t)
955 // Move to representative state
956 t = dat.getBase(t)
957 } else {
958 nontoken = dat.isNonToken(t)
959 }
960
961 if nontoken {
962 fmt.Print("<|>")
963 }
964
965 goto FINALCHECK
966}