blob: a872981ff74243d2ddae909054316a0122aecc70 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
// The maximum number of states is 1.073.741.823 (30bit),
// with a loadfactor of ~70, this means roughly 70 million
// states in the FSA, which is sufficient for the current
// job.
13
Akron740f3d72021-08-03 12:12:34 +020014// TODO:
15// - replace maxSize with the check value
16// - Strip first state and make everything start with 0!
Akron740f3d72021-08-03 12:12:34 +020017
Akron8ef408b2021-08-02 22:11:04 +020018import (
19 "bufio"
Akron6247a5d2021-08-03 19:18:28 +020020 "bytes"
Akron8ef408b2021-08-02 22:11:04 +020021 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020022 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020023 "fmt"
24 "io"
25 "os"
Akronc9d84a62021-08-03 15:56:03 +020026 "sort"
Akron8ef408b2021-08-02 22:11:04 +020027 "strconv"
28 "strings"
29 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020030
31 "github.com/rs/zerolog/log"
Akron8ef408b2021-08-02 22:11:04 +020032)
33
// Modes of the foma parser, i.e. the section of the
// file that is currently read.
const (
	PROPS = iota + 1
	SIGMA
	STATES
	NONE
)

// General settings and the header values of the
// serialized double array.
const (
	NEWLINE = '\u000a'
	DEBUG   = false
	MAGIC   = "DATOK"
	VERSION = uint16(1)
)

// Bit masks for the cells of the double array:
// The two highest bits are reserved for flags,
// the remaining 30 bits carry the value.
const (
	firstBit  uint32 = 1 << 31
	secondBit uint32 = 1 << 30
	restBit   uint32 = secondBit - 1
)

// Byte order used for the serialization of the double array
var bo binary.ByteOrder = binary.LittleEndian
49
// mapping associates a state of the source automaton
// with the state that represents it in the double array.
type mapping struct {
	source int    // State in the source automaton
	target uint32 // State in the double array
}
54
// edge is a single transition of the parsed automaton.
type edge struct {
	// Numbers of the input and the output symbol in sigma
	inSym, outSym int

	// Target state of the transition
	end int

	// Markers for transitions that differ in input and output:
	// nontoken is set when the output is epsilon,
	// tokenend is set when the output is a newline.
	nontoken, tokenend bool
}
62
63type Tokenizer struct {
Akronf2120ca2021-08-03 16:26:41 +020064 // sigma map[rune]int
Akron740f3d72021-08-03 12:12:34 +020065 sigmaRev map[int]rune
66 arcCount int
67 stateCount int
68 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020069 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020070
71 // Special symbols in sigma
72 epsilon int
73 unknown int
74 identity int
75 final int
Akron8ef408b2021-08-02 22:11:04 +020076}
77
// DaTokenizer is the double array representation
// of the automaton.
type DaTokenizer struct {
	// Maps the symbols of sigma to their numbers
	// (the opposite direction, sigmaRev map[int]rune,
	// is only kept in the Tokenizer)
	sigma map[rune]int

	// Highest index of the array that was written so far
	maxSize int

	// Cached load factor, negative as long as it is not calculated
	loadFactor float64

	// The double array: The base value of a state p is stored
	// at p*2, the check value at p*2+1
	array []uint32

	// Special symbols in sigma
	epsilon, unknown, identity, final int
}
91
Akron64ffd9a2021-08-03 19:55:21 +020092func LoadFomaFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +020093 f, err := os.Open(file)
94 if err != nil {
Akron740f3d72021-08-03 12:12:34 +020095 log.Error().Err(err)
96 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +020097 }
98 defer f.Close()
99
100 gz, err := gzip.NewReader(f)
101 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200102 log.Error().Err(err)
103 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200104 }
105 defer gz.Close()
106
Akron3fdfec62021-08-04 11:40:10 +0200107 return ParseFoma(gz)
Akron8ef408b2021-08-02 22:11:04 +0200108}
109
Akron3fdfec62021-08-04 11:40:10 +0200110func ParseFoma(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200111 r := bufio.NewReader(ior)
112
113 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200114 sigmaRev: make(map[int]rune),
Akronc17f1ca2021-08-03 19:47:27 +0200115 epsilon: -1,
116 unknown: -1,
117 identity: -1,
118 final: -1,
Akron8ef408b2021-08-02 22:11:04 +0200119 }
120
Akron740f3d72021-08-03 12:12:34 +0200121 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200122
123 mode := 0
124 var elem []string
125 var elemint [5]int
126
127 for {
128 line, err := r.ReadString('\n')
129 if err != nil {
130 if err == io.EOF {
131 break
132 }
Akron740f3d72021-08-03 12:12:34 +0200133 log.Error().Err(err)
134 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200135 }
136 if strings.HasPrefix(line, "##foma-net") {
137 continue
138 }
139 if strings.HasPrefix(line, "##props##") {
140 mode = PROPS
141 continue
142 }
143 if strings.HasPrefix(line, "##states##") {
144 mode = STATES
145
146 // Adds a final transition symbol to sigma
147 // written as '#' in Mizobuchi et al (2000)
Akron740f3d72021-08-03 12:12:34 +0200148 tok.sigmaCount++
Akronc17f1ca2021-08-03 19:47:27 +0200149 tok.final = tok.sigmaCount
Akron8ef408b2021-08-02 22:11:04 +0200150 continue
151 }
152 if strings.HasPrefix(line, "##sigma##") {
153 mode = SIGMA
154 continue
155 }
156 if strings.HasPrefix(line, "##end##") {
157 mode = NONE
158 continue
159 }
160
161 switch mode {
162 case PROPS:
163 {
164 elem = strings.Split(line, " ")
165 /*
166 fmt.Println("arity: " + elem[0])
167 fmt.Println("arccount: " + elem[1])
168 fmt.Println("statecount: " + elem[2])
169 fmt.Println("linecount: " + elem[3])
170 fmt.Println("finalcount: " + elem[4])
171 fmt.Println("pathcount: " + elem[5])
172 fmt.Println("is_deterministic: " + elem[6])
173 fmt.Println("is_pruned: " + elem[7])
174 fmt.Println("is_minimized: " + elem[8])
175 fmt.Println("is_epsilon_free: " + elem[9])
176 fmt.Println("is_loop_free: " + elem[10])
177 fmt.Println("extras: " + elem[11])
178 fmt.Println("name: " + elem[12])
179 */
180 if elem[6] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200181 log.Error().Msg("The FST needs to be deterministic")
182 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200183 }
184 if elem[9] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200185 log.Error().Msg("The FST needs to be epsilon free")
186 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200187 }
188
189 elemint[0], err = strconv.Atoi(elem[1])
190 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200191 log.Error().Msg("Can't read arccount")
192 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200193 }
Akron740f3d72021-08-03 12:12:34 +0200194 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200195
196 // States start at 1 in Mizobuchi et al (2000),
197 // as the state 0 is associated with a fail.
198 // Initialize states and transitions
199 elemint[0], err = strconv.Atoi(elem[2])
200 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200201 log.Error().Msg("Can't read statecount")
202 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200203 }
Akron740f3d72021-08-03 12:12:34 +0200204 tok.stateCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200205 tok.transitions = make([]map[int]*edge, elemint[0]+1)
206 continue
207 }
208 case STATES:
209 {
210 elem = strings.Split(line[0:len(line)-1], " ")
211 if elem[0] == "-1" {
212 continue
213 }
214 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200215 if err != nil {
216 break
217 }
Akron8ef408b2021-08-02 22:11:04 +0200218
219 if len(elem) > 1 {
220 elemint[1], err = strconv.Atoi(elem[1])
221 if err != nil {
222 break
223 }
224 if len(elem) > 2 {
225 elemint[2], err = strconv.Atoi(elem[2])
226 if err != nil {
227 break
228 }
229 if len(elem) > 3 {
230 elemint[3], err = strconv.Atoi(elem[3])
231 if err != nil {
232 break
233 }
234 if len(elem) > 4 {
235 elemint[4], err = strconv.Atoi(elem[4])
236 if err != nil {
237 break
238 }
239 }
240 }
241 }
242 }
243
244 switch len(elem) {
245 case 5:
246 {
Akron740f3d72021-08-03 12:12:34 +0200247 state = elemint[0]
248 inSym = elemint[1]
249 outSym = elemint[2]
250 end = elemint[3]
251 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200252 }
253 case 4:
254 {
255 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200256 state = elemint[0]
257 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200258 } else {
Akron740f3d72021-08-03 12:12:34 +0200259 state = elemint[0]
260 inSym = elemint[1]
261 end = elemint[2]
262 final = elemint[3]
263 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200264 }
265 }
266 case 3:
267 {
Akron740f3d72021-08-03 12:12:34 +0200268 inSym = elemint[0]
269 outSym = elemint[1]
270 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200271 }
272 case 2:
273 {
Akron740f3d72021-08-03 12:12:34 +0200274 inSym = elemint[0]
275 end = elemint[1]
276 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200277 }
278 }
279
Akron8ef408b2021-08-02 22:11:04 +0200280 // While the states in foma start with 0, the states in the
281 // Mizobuchi FSA start with one - so we increase every state by 1.
282
Akron83e75a22021-08-04 13:14:06 +0200283 nontoken := false
284 tokenend := false
285
Akron740f3d72021-08-03 12:12:34 +0200286 if inSym != outSym {
287
Akron83e75a22021-08-04 13:14:06 +0200288 if tok.sigmaRev[outSym] == NEWLINE {
289 tokenend = true
290 } else if outSym == tok.epsilon {
291 nontoken = true
292 } else {
Akron740f3d72021-08-03 12:12:34 +0200293 log.Error().Msg(
294 "Unsupported transition: " +
295 strconv.Itoa(state) +
296 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200297 " (" +
Akron740f3d72021-08-03 12:12:34 +0200298 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200299 ":" +
Akron740f3d72021-08-03 12:12:34 +0200300 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200301 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200302 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200303 ":" +
Akron740f3d72021-08-03 12:12:34 +0200304 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200305 ")")
Akron740f3d72021-08-03 12:12:34 +0200306 os.Exit(1)
Akron75ebe7f2021-08-03 10:34:10 +0200307 }
Akron83e75a22021-08-04 13:14:06 +0200308
Akron83e75a22021-08-04 13:14:06 +0200309 } else if inSym == tok.epsilon {
Akron068874c2021-08-04 15:19:56 +0200310 log.Error().Msg("Epsilon transitions not supported")
311 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200312 }
313
Akron740f3d72021-08-03 12:12:34 +0200314 // This collects all edges until arrstate changes
315
Akron8ef408b2021-08-02 22:11:04 +0200316 // TODO:
317 // if arrin == EPSILON && arrout == TOKENEND, mark state as newline
318 // if the next transition is the same, remove TOKENEND and add SENTENCEEND
319 // This requires to remove the transition alltogether and marks the state instead.
320
321 // TODO:
322 // if arrout == EPSILON, mark the transition as NOTOKEN
323
324 targetObj := &edge{
Akron83e75a22021-08-04 13:14:06 +0200325 inSym: inSym,
326 outSym: outSym,
327 end: end + 1,
328 tokenend: tokenend,
329 nontoken: nontoken,
Akron8ef408b2021-08-02 22:11:04 +0200330 }
331
Akron740f3d72021-08-03 12:12:34 +0200332 // Initialize outgoing states
333 if tok.transitions[state+1] == nil {
334 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200335 }
336
Akron740f3d72021-08-03 12:12:34 +0200337 // Ignore transitions with invalid symbols
338 if inSym >= 0 {
339 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200340 }
Akron8ef408b2021-08-02 22:11:04 +0200341
Akron740f3d72021-08-03 12:12:34 +0200342 // Add final transition
343 if final == 1 {
Akronc17f1ca2021-08-03 19:47:27 +0200344 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200345 }
346
Akron740f3d72021-08-03 12:12:34 +0200347 if DEBUG {
348 fmt.Println("Add",
349 state+1, "->", end+1,
350 "(",
351 inSym,
352 ":",
353 outSym,
354 ") (",
355 string(tok.sigmaRev[inSym]),
356 ":",
357 string(tok.sigmaRev[outSym]),
358 ")")
359 }
Akron75ebe7f2021-08-03 10:34:10 +0200360
Akron8ef408b2021-08-02 22:11:04 +0200361 continue
362 }
363 case SIGMA:
364 {
365 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
366
367 // Turn string into sigma id
368 number, err := strconv.Atoi(elem[0])
369
370 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200371 log.Error().Err(err)
372 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200373 }
374
Akron740f3d72021-08-03 12:12:34 +0200375 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200376
377 var symbol rune
378
379 // Read rune
380 if utf8.RuneCountInString(elem[1]) == 1 {
381 symbol = []rune(elem[1])[0]
382
383 // Probably a MCS
384 } else if utf8.RuneCountInString(elem[1]) > 1 {
385 switch elem[1] {
386 case "@_EPSILON_SYMBOL_@":
387 {
Akronc17f1ca2021-08-03 19:47:27 +0200388 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200389 continue
390 }
391 case "@_UNKNOWN_SYMBOL_@":
392 {
Akronc17f1ca2021-08-03 19:47:27 +0200393 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200394 continue
395 }
396
397 case "@_IDENTITY_SYMBOL_@":
398 {
Akronc17f1ca2021-08-03 19:47:27 +0200399 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200400 continue
401 }
402 default:
Akron740f3d72021-08-03 12:12:34 +0200403 {
404 log.Error().Msg("MCS not supported: " + line)
405 os.Exit(1)
406 }
Akron8ef408b2021-08-02 22:11:04 +0200407 }
408
Akron740f3d72021-08-03 12:12:34 +0200409 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200410 line, err = r.ReadString('\n')
411 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200412 log.Error().Err(err)
413 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200414 }
415 if len(line) != 1 {
Akron740f3d72021-08-03 12:12:34 +0200416 log.Error().Msg("MCS not supported:" + line)
417 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200418 }
Akron740f3d72021-08-03 12:12:34 +0200419 symbol = rune(NEWLINE)
Akron8ef408b2021-08-02 22:11:04 +0200420 }
421
Akron740f3d72021-08-03 12:12:34 +0200422 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200423 }
424 }
425 }
426
427 return tok
428}
429
Akron64ffd9a2021-08-03 19:55:21 +0200430// Set alphabet A to the list of all symbols
431// outgoing from s
432func (tok *Tokenizer) get_set(s int, A *[]int) {
433 for a := range tok.transitions[s] {
434 *A = append(*A, a)
435 }
436
437 // Not required, but simplifies bug hunting
438 sort.Ints(*A)
439}
440
Akron8ef408b2021-08-02 22:11:04 +0200441// Implementation of Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200442func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
443
444 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +0200445 sigma: make(map[rune]int),
446 loadFactor: -1,
447 final: tok.final,
448 unknown: tok.unknown,
449 identity: tok.identity,
450 epsilon: tok.epsilon,
Akronf2120ca2021-08-03 16:26:41 +0200451 }
452
453 for num, sym := range tok.sigmaRev {
454 dat.sigma[sym] = num
455 }
Akron8ef408b2021-08-02 22:11:04 +0200456
457 mark := 0
458 size := 0
459
460 // Create a mapping from s to t
Akron740f3d72021-08-03 12:12:34 +0200461 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200462
463 table[size] = &mapping{source: 1, target: 1}
464 size++
465
Akron740f3d72021-08-03 12:12:34 +0200466 // Allocate space for the outgoing symbol range
467 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200468
469 for mark < size {
470 s := table[mark].source // This is a state in Ms
471 t := table[mark].target // This is a state in Mt
472 mark++
Akron740f3d72021-08-03 12:12:34 +0200473
474 // Following the paper, here the state t can be remembered
475 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200476 A = A[:0]
477 tok.get_set(s, &A)
478
Akron740f3d72021-08-03 12:12:34 +0200479 // Set base to the first free slot in the double array
Akronf2120ca2021-08-03 16:26:41 +0200480 dat.setBase(t, dat.xCheck(A))
Akron8ef408b2021-08-02 22:11:04 +0200481
Akron773b1ef2021-08-03 17:37:20 +0200482 // TODO:
Akron068874c2021-08-04 15:19:56 +0200483 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200484 // outdegree of .end
485
Akron740f3d72021-08-03 12:12:34 +0200486 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200487 for _, a := range A {
488
Akronc17f1ca2021-08-03 19:47:27 +0200489 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200490
Akron740f3d72021-08-03 12:12:34 +0200491 // Aka g(s, a)
492 s1 := tok.transitions[s][a].end
Akron8ef408b2021-08-02 22:11:04 +0200493
Akron740f3d72021-08-03 12:12:34 +0200494 // Store the transition
Akron3fdfec62021-08-04 11:40:10 +0200495 t1 := dat.getBase(t) + uint32(a)
Akronf2120ca2021-08-03 16:26:41 +0200496 dat.setCheck(t1, t)
Akron8ef408b2021-08-02 22:11:04 +0200497
Akron83e75a22021-08-04 13:14:06 +0200498 // Mark the state as being the target of a nontoken transition
499 if tok.transitions[s][a].nontoken {
Akron068874c2021-08-04 15:19:56 +0200500 dat.setNonToken(t1, true)
Akron83e75a22021-08-04 13:14:06 +0200501 }
502
Akron740f3d72021-08-03 12:12:34 +0200503 // Check for representative states
Akron8ef408b2021-08-02 22:11:04 +0200504 r := in_table(s1, table, size)
Akron740f3d72021-08-03 12:12:34 +0200505
Akron8ef408b2021-08-02 22:11:04 +0200506 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200507 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200508 table[size] = &mapping{source: s1, target: t1}
509 size++
510 } else {
Akron740f3d72021-08-03 12:12:34 +0200511 // Overwrite with the representative state
Akron3fdfec62021-08-04 11:40:10 +0200512 dat.setBase(t1, r)
513 dat.setSeparate(t1, true)
Akron8ef408b2021-08-02 22:11:04 +0200514 }
515 } else {
Akron740f3d72021-08-03 12:12:34 +0200516 // Store a final transition
Akron3fdfec62021-08-04 11:40:10 +0200517 dat.setCheck(dat.getBase(t)+uint32(dat.final), t)
Akron8ef408b2021-08-02 22:11:04 +0200518 }
519 }
520 }
521
522 // Following Mizobuchi et al (2000) the size of the
523 // FSA should be stored in check(1).
Akron3fdfec62021-08-04 11:40:10 +0200524 dat.setSize(dat.maxSize + 1)
Akronf2120ca2021-08-03 16:26:41 +0200525 dat.array = dat.array[:dat.maxSize+1]
526 return dat
Akron8ef408b2021-08-02 22:11:04 +0200527}
528
Akron8ef408b2021-08-02 22:11:04 +0200529// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200530// exists and return this as a representative.
531// Currently iterates through the whole table
532// in a bruteforce manner.
Akron3fdfec62021-08-04 11:40:10 +0200533func in_table(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200534 for x := 0; x < size; x++ {
535 if table[x].source == s {
536 return table[x].target
537 }
538 }
539 return 0
540}
541
Akron64ffd9a2021-08-03 19:55:21 +0200542// Resize double array when necessary
543func (dat *DaTokenizer) resize(l int) {
544 // TODO:
545 // This is a bit too aggressive atm and should be calmed down.
546 if len(dat.array) <= l {
Akron3fdfec62021-08-04 11:40:10 +0200547 dat.array = append(dat.array, make([]uint32, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200548 }
Akron64ffd9a2021-08-03 19:55:21 +0200549}
Akronc9d84a62021-08-03 15:56:03 +0200550
Akron64ffd9a2021-08-03 19:55:21 +0200551// Set base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200552func (dat *DaTokenizer) setBase(p uint32, v uint32) {
553 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200554 dat.resize(l)
555 if dat.maxSize < l {
556 dat.maxSize = l
557 }
558 dat.array[p*2] = v
559}
560
Akron3fdfec62021-08-04 11:40:10 +0200561// Returns true if a state is separate pointing to a representative
562func (dat *DaTokenizer) isSeparate(p uint32) bool {
Akron2a4b9292021-08-04 15:35:22 +0200563 return dat.array[p*2]&firstBit != 0
Akron3fdfec62021-08-04 11:40:10 +0200564}
565
566// Mark a state as separate pointing to a representative
567func (dat *DaTokenizer) setSeparate(p uint32, sep bool) {
568 if sep {
Akron2a4b9292021-08-04 15:35:22 +0200569 dat.array[p*2] |= firstBit
Akron3fdfec62021-08-04 11:40:10 +0200570 } else {
Akron2a4b9292021-08-04 15:35:22 +0200571 dat.array[p*2] &= (restBit | secondBit)
Akron3fdfec62021-08-04 11:40:10 +0200572 }
573}
574
Akron83e75a22021-08-04 13:14:06 +0200575// Returns true if a state is the target of a nontoken transition
576func (dat *DaTokenizer) isNonToken(p uint32) bool {
Akron2a4b9292021-08-04 15:35:22 +0200577 return dat.array[p*2+1]&firstBit != 0
Akron83e75a22021-08-04 13:14:06 +0200578}
579
580// Mark a state as being the target of a nontoken transition
581func (dat *DaTokenizer) setNonToken(p uint32, sep bool) {
582 if sep {
Akron2a4b9292021-08-04 15:35:22 +0200583 dat.array[p*2+1] |= firstBit
Akron83e75a22021-08-04 13:14:06 +0200584 } else {
Akron2a4b9292021-08-04 15:35:22 +0200585 dat.array[p*2+1] &= (restBit | secondBit)
Akron83e75a22021-08-04 13:14:06 +0200586 }
587}
588
Akron64ffd9a2021-08-03 19:55:21 +0200589// Get base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200590func (dat *DaTokenizer) getBase(p uint32) uint32 {
591 if int(p*2) >= len(dat.array) {
Akron64ffd9a2021-08-03 19:55:21 +0200592 return 0
593 }
Akron3fdfec62021-08-04 11:40:10 +0200594 return dat.array[p*2] & restBit
Akron64ffd9a2021-08-03 19:55:21 +0200595}
596
597// Set check value in double array
Akron3fdfec62021-08-04 11:40:10 +0200598func (dat *DaTokenizer) setCheck(p uint32, v uint32) {
599 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200600 dat.resize(l)
601 if dat.maxSize < l {
602 dat.maxSize = l
603 }
604 dat.array[(p*2)+1] = v
605}
606
607// Get check value in double array
Akron3fdfec62021-08-04 11:40:10 +0200608func (dat *DaTokenizer) getCheck(p uint32) uint32 {
609 if int((p*2)+1) >= len(dat.array) {
Akron64ffd9a2021-08-03 19:55:21 +0200610 return 0
611 }
Akron3fdfec62021-08-04 11:40:10 +0200612 return dat.array[(p*2)+1] & restBit
Akron64ffd9a2021-08-03 19:55:21 +0200613}
614
615// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200616func (dat *DaTokenizer) setSize(v int) {
617 dat.setCheck(1, uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200618}
619
620// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200621func (dat *DaTokenizer) GetSize() int {
622 return int(dat.getCheck(1))
Akron8ef408b2021-08-02 22:11:04 +0200623}
624
625// Based on Mizobuchi et al (2000), p. 124
626// This iterates for every state through the complete double array
627// structure until it finds a gap that fits all outgoing transitions
628// of the state. This is extremely slow, but is only necessary in the
629// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200630func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200631
632 // Start at the first entry of the double array list
Akron3fdfec62021-08-04 11:40:10 +0200633 base := uint32(1)
Akron8ef408b2021-08-02 22:11:04 +0200634
Akron8ef408b2021-08-02 22:11:04 +0200635OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200636
637 // Resize the array if necessary
Akron3fdfec62021-08-04 11:40:10 +0200638 dat.resize((int(base) + dat.final) * 2)
Akron8ef408b2021-08-02 22:11:04 +0200639 for _, a := range symbols {
Akron3fdfec62021-08-04 11:40:10 +0200640 if dat.getCheck(base+uint32(a)) != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200641 base++
642 goto OVERLAP
643 }
644 }
Akron8ef408b2021-08-02 22:11:04 +0200645 return base
646}
647
Akron03a3c612021-08-04 11:51:27 +0200648// LoadFactor as defined in Kanda et al (2018),
649// i.e. the proportion of non-empty elements to all elements.
650func (dat *DaTokenizer) LoadFactor() float64 {
Akrond66a9262021-08-03 17:09:09 +0200651
Akron03a3c612021-08-04 11:51:27 +0200652 // Cache the loadfactor
653 if dat.loadFactor >= 0 {
654 return dat.loadFactor
Akron773b1ef2021-08-03 17:37:20 +0200655 }
Akrond66a9262021-08-03 17:09:09 +0200656 nonEmpty := 0
657 all := len(dat.array) / 2
658 for x := 1; x <= len(dat.array); x = x + 2 {
659 if dat.array[x] != 0 {
660 nonEmpty++
661 }
662 }
Akron03a3c612021-08-04 11:51:27 +0200663 dat.loadFactor = float64(nonEmpty) / float64(all) * 100
664 return dat.loadFactor
Akrond66a9262021-08-03 17:09:09 +0200665}
666
Akron6247a5d2021-08-03 19:18:28 +0200667// WriteTo stores the double array data in an io.Writer.
668func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
669
670 // Store magical header
671 all, err := w.Write([]byte(MAGIC))
672 if err != nil {
673 log.Error().Msg("Unable to write data")
674 }
675
676 // Get sigma as a list
677 sigmalist := make([]rune, len(dat.sigma)+16)
678 max := 0
679 for sym, num := range dat.sigma {
680 sigmalist[num] = sym
681 if num > max {
682 max = num
683 }
684 }
685
686 sigmalist = sigmalist[:max+1]
687
688 buf := make([]byte, 0, 12)
689 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200690 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
691 bo.PutUint16(buf[4:6], uint16(dat.unknown))
692 bo.PutUint16(buf[6:8], uint16(dat.identity))
693 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200694 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
695 more, err := w.Write(buf[0:12])
696 if err != nil {
697 log.Error().Msg("Unable to write data")
698 }
699
700 all += more
701
702 wbuf := bytes.NewBuffer(nil)
703 wbufWrap := bufio.NewWriter(wbuf)
704
705 // Write sigma
706 for _, sym := range sigmalist {
707 more, err = wbufWrap.WriteRune(sym)
708 if err != nil {
709 log.Error().Msg("Unable to write data")
710 }
711 all += more
712 }
713 wbufWrap.Flush()
714 more, err = w.Write(wbuf.Bytes())
715 if err != nil {
716 log.Error().Msg("Unable to write data")
717 }
718 all += more
719
720 // Test marker - could be checksum
721 more, err = w.Write([]byte("T"))
722 if err != nil {
723 log.Error().Msg("Unable to write data")
724 }
725 all += more
726
727 wbuf.Reset()
728
729 for _, d := range dat.array {
Akron3fdfec62021-08-04 11:40:10 +0200730 bo.PutUint32(buf[0:4], d)
Akron6247a5d2021-08-03 19:18:28 +0200731 more, err := w.Write(buf[0:4])
732 if err != nil {
733 log.Error().Msg("Unable to write data")
734 }
735 all += more
736 }
737
738 return int64(all), err
739}
740
Akron740f3d72021-08-03 12:12:34 +0200741// Match an input string against the double array
742// FSA.
743//
744// Based on Mizobuchi et al (2000), p. 129,
745// with additional support for IDENTITY, UNKNOWN
746// and EPSILON transitions.
Akron64ffd9a2021-08-03 19:55:21 +0200747func (dat *DaTokenizer) Match(input string) bool {
Akron465a0992021-08-03 11:28:48 +0200748 var a int
Akron3fdfec62021-08-04 11:40:10 +0200749 var tu uint32
Akron465a0992021-08-03 11:28:48 +0200750 var ok bool
751
Akron3fdfec62021-08-04 11:40:10 +0200752 t := uint32(1) // Initial state
Akron740f3d72021-08-03 12:12:34 +0200753 chars := []rune(input)
754 i := 0
755
Akron49d27ee2021-08-03 11:58:13 +0200756 for i < len(chars) {
Akron64ffd9a2021-08-03 19:55:21 +0200757 a, ok = dat.sigma[chars[i]]
Akron730a79c2021-08-03 11:05:29 +0200758
Akron740f3d72021-08-03 12:12:34 +0200759 // Support identity symbol if character is not in sigma
Akron64ffd9a2021-08-03 19:55:21 +0200760 if !ok && dat.identity != -1 {
Akron740f3d72021-08-03 12:12:34 +0200761 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200762 fmt.Println("IDENTITY symbol", string(chars[i]), "->", dat.identity)
Akron740f3d72021-08-03 12:12:34 +0200763 }
Akron64ffd9a2021-08-03 19:55:21 +0200764 a = dat.identity
Akron740f3d72021-08-03 12:12:34 +0200765 } else if DEBUG {
Akron49d27ee2021-08-03 11:58:13 +0200766 fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
Akron730a79c2021-08-03 11:05:29 +0200767 }
Akron465a0992021-08-03 11:28:48 +0200768 tu = t
Akron730a79c2021-08-03 11:05:29 +0200769 CHECK:
Akron3fdfec62021-08-04 11:40:10 +0200770 t = dat.getBase(tu) + uint32(a)
Akron730a79c2021-08-03 11:05:29 +0200771
Akron740f3d72021-08-03 12:12:34 +0200772 // Check if the transition is valid according to the double array
Akron64ffd9a2021-08-03 19:55:21 +0200773 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +0200774
775 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200776 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron730a79c2021-08-03 11:05:29 +0200777 }
Akron740f3d72021-08-03 12:12:34 +0200778
Akron64ffd9a2021-08-03 19:55:21 +0200779 if !ok && a == dat.identity {
Akron740f3d72021-08-03 12:12:34 +0200780 // Try again with unknown symbol, in case identity failed
781 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200782 fmt.Println("UNKNOWN symbol", string(chars[i]), "->", dat.unknown)
Akron740f3d72021-08-03 12:12:34 +0200783 }
Akron64ffd9a2021-08-03 19:55:21 +0200784 a = dat.unknown
Akron740f3d72021-08-03 12:12:34 +0200785
Akron64ffd9a2021-08-03 19:55:21 +0200786 } else if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +0200787 // Try again with epsilon symbol, in case everything else failed
788 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200789 fmt.Println("EPSILON symbol", string(chars[i]), "->", dat.epsilon)
Akron740f3d72021-08-03 12:12:34 +0200790 }
Akron64ffd9a2021-08-03 19:55:21 +0200791 a = dat.epsilon
Akron740f3d72021-08-03 12:12:34 +0200792 } else {
793 break
794 }
795 goto CHECK
Akron3fdfec62021-08-04 11:40:10 +0200796 } else if dat.isSeparate(t) {
Akron730a79c2021-08-03 11:05:29 +0200797 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +0200798 t = dat.getBase(t)
Akron8ef408b2021-08-02 22:11:04 +0200799 }
Akron740f3d72021-08-03 12:12:34 +0200800
801 // Transition is fine
Akron64ffd9a2021-08-03 19:55:21 +0200802 if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +0200803 // Character consumed
Akron49d27ee2021-08-03 11:58:13 +0200804 i++
805 }
Akron83e75a22021-08-04 13:14:06 +0200806
Akron740f3d72021-08-03 12:12:34 +0200807 // TODO:
808 // Prevent endless epsilon loops!
Akron8ef408b2021-08-02 22:11:04 +0200809 }
810
Akron740f3d72021-08-03 12:12:34 +0200811 if i != len(chars) {
812 if DEBUG {
813 fmt.Println("Not at the end")
814 }
Akron8ef408b2021-08-02 22:11:04 +0200815 return false
816 }
817
Akron465a0992021-08-03 11:28:48 +0200818FINALCHECK:
Akron740f3d72021-08-03 12:12:34 +0200819
820 // Automaton is in a final state
Akron3fdfec62021-08-04 11:40:10 +0200821 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
Akron8ef408b2021-08-02 22:11:04 +0200822 return true
823 }
Akron465a0992021-08-03 11:28:48 +0200824
Akron740f3d72021-08-03 12:12:34 +0200825 // Check epsilon transitions until a final state is reached
Akron465a0992021-08-03 11:28:48 +0200826 tu = t
Akron3fdfec62021-08-04 11:40:10 +0200827 t = dat.getBase(tu) + uint32(dat.epsilon)
Akron465a0992021-08-03 11:28:48 +0200828
Akron740f3d72021-08-03 12:12:34 +0200829 // Epsilon transition failed
Akron64ffd9a2021-08-03 19:55:21 +0200830 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +0200831 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +0200832 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron740f3d72021-08-03 12:12:34 +0200833 }
Akron465a0992021-08-03 11:28:48 +0200834 return false
Akron740f3d72021-08-03 12:12:34 +0200835
Akron3fdfec62021-08-04 11:40:10 +0200836 } else if dat.isSeparate(t) {
Akron465a0992021-08-03 11:28:48 +0200837 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +0200838 t = dat.getBase(t)
Akron465a0992021-08-03 11:28:48 +0200839 }
Akron740f3d72021-08-03 12:12:34 +0200840
Akron465a0992021-08-03 11:28:48 +0200841 goto FINALCHECK
Akron8ef408b2021-08-02 22:11:04 +0200842}
Akron068874c2021-08-04 15:19:56 +0200843
844// Match an input string against the double array
845// FSA.
846//
847// Based on Match with additional support
848// for NONTOKEN handling
849func (dat *DaTokenizer) Transduce(input string) bool {
850 var a int
851 var tu uint32
852 var ok, nontoken bool
853
854 t := uint32(1) // Initial state
855 chars := []rune(input)
856 i := 0
857
858 for i < len(chars) {
859 a, ok = dat.sigma[chars[i]]
860
861 // Support identity symbol if character is not in sigma
862 if !ok && dat.identity != -1 {
863 if DEBUG {
864 fmt.Println("IDENTITY symbol", string(chars[i]), "->", dat.identity)
865 }
866 a = dat.identity
867 } else if DEBUG {
868 fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
869 }
870 tu = t
871 CHECK:
872 nontoken = false
873 t = dat.getBase(tu) + uint32(a)
874
875 // Check if the transition is valid according to the double array
876 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
877
878 if DEBUG {
879 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
880 }
881
882 if !ok && a == dat.identity {
883 // Try again with unknown symbol, in case identity failed
884 if DEBUG {
885 fmt.Println("UNKNOWN symbol", string(chars[i]), "->", dat.unknown)
886 }
887 a = dat.unknown
888
889 } else if a != dat.epsilon {
890 // Try again with epsilon symbol, in case everything else failed
891 if DEBUG {
892 fmt.Println("EPSILON symbol", string(chars[i]), "->", dat.epsilon)
893 }
894 a = dat.epsilon
895 } else {
896 break
897 }
898 goto CHECK
899 } else if dat.isSeparate(t) {
900 // Move to representative state
901 nontoken = dat.isNonToken(t)
902
903 t = dat.getBase(t)
904 } else {
905 nontoken = dat.isNonToken(t)
906 }
907
908 // Transition is fine
909 if a != dat.epsilon {
910 // Character consumed
911
912 if !nontoken {
913 fmt.Print("[", string(chars[i]), "]")
914 }
915 i++
916 }
917
918 if nontoken {
919 fmt.Print("<|>")
920 }
921
922 // TODO:
923 // Prevent endless epsilon loops!
924 }
925
926 if i != len(chars) {
927 if DEBUG {
928 fmt.Println("Not at the end")
929 }
930 return false
931 }
932
933FINALCHECK:
934
935 // Automaton is in a final state
936 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
937 if dat.isNonToken(t) {
938 fmt.Print("<|>")
939 }
940 return true
941 }
942
943 // Check epsilon transitions until a final state is reached
944 tu = t
945 t = dat.getBase(tu) + uint32(dat.epsilon)
946
947 // Epsilon transition failed
948 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
949 if DEBUG {
950 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
951 }
952 return false
953
954 } else if dat.isSeparate(t) {
955 nontoken = dat.isNonToken(t)
956 // Move to representative state
957 t = dat.getBase(t)
958 } else {
959 nontoken = dat.isNonToken(t)
960 }
961
962 if nontoken {
963 fmt.Print("<|>")
964 }
965
966 goto FINALCHECK
967}