blob: 919c61124e2e0e35b7650ee590095024e9726a86 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
Akron03c92fe2021-08-09 14:07:57 +020013//
14// Serialization is little endian.
Akron83e75a22021-08-04 13:14:06 +020015
Akron740f3d72021-08-03 12:12:34 +020016// TODO:
17// - replace maxSize with the check value
18// - Strip first state and make everything start with 0!
Akron3a063ef2021-08-05 19:36:35 +020019// - Add checksum to serialization.
Akron03c92fe2021-08-09 14:07:57 +020020// - Mark epsilon transitions in bytes
21// - Introduce methods on BC array entries instead of
22// jumping into the entries all the time!
23// - Instead of memoizing the loadFactor, better remember
24// the number of set transitions
Akron740f3d72021-08-03 12:12:34 +020025
Akron8ef408b2021-08-02 22:11:04 +020026import (
27 "bufio"
28 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020029 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020030 "fmt"
31 "io"
32 "os"
Akronc9d84a62021-08-03 15:56:03 +020033 "sort"
Akron8ef408b2021-08-02 22:11:04 +020034 "strconv"
35 "strings"
36 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020037
38 "github.com/rs/zerolog/log"
Akron8ef408b2021-08-02 22:11:04 +020039)
40
41const (
Akron2a4b9292021-08-04 15:35:22 +020042 PROPS = 1
43 SIGMA = 2
44 STATES = 3
45 NONE = 4
Akron2a4b9292021-08-04 15:35:22 +020046 DEBUG = false
47 MAGIC = "DATOK"
48 VERSION = uint16(1)
Akron03c92fe2021-08-09 14:07:57 +020049 FIRSTBIT uint32 = 1 << 31
50 SECONDBIT uint32 = 1 << 30
51 RESTBIT uint32 = ^uint32(0) &^ (FIRSTBIT | SECONDBIT)
Akron8ef408b2021-08-02 22:11:04 +020052)
53
Akron03c92fe2021-08-09 14:07:57 +020054// Serialization is always little endian
Akron6247a5d2021-08-03 19:18:28 +020055var bo binary.ByteOrder = binary.LittleEndian
56
Akron8ef408b2021-08-02 22:11:04 +020057type mapping struct {
58 source int
Akron3fdfec62021-08-04 11:40:10 +020059 target uint32
Akron8ef408b2021-08-02 22:11:04 +020060}
61
62type edge struct {
Akron83e75a22021-08-04 13:14:06 +020063 inSym int
64 outSym int
65 end int
66 nontoken bool
67 tokenend bool
Akron8ef408b2021-08-02 22:11:04 +020068}
69
Akron03c92fe2021-08-09 14:07:57 +020070// Tokenizer is the intermediate representation
71// of the tokenizer.
Akron8ef408b2021-08-02 22:11:04 +020072type Tokenizer struct {
Akron740f3d72021-08-03 12:12:34 +020073 sigmaRev map[int]rune
74 arcCount int
75 stateCount int
76 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020077 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020078
79 // Special symbols in sigma
80 epsilon int
81 unknown int
82 identity int
83 final int
Akron03c92fe2021-08-09 14:07:57 +020084 tokenend int
Akron8ef408b2021-08-02 22:11:04 +020085}
86
Akron03c92fe2021-08-09 14:07:57 +020087// DaTokenizer represents a tokenizer implemented as a
88// Double Array FSA.
Akronf2120ca2021-08-03 16:26:41 +020089type DaTokenizer struct {
Akron03a3c612021-08-04 11:51:27 +020090 sigma map[rune]int
91 maxSize int
92 loadFactor float64
93 array []uint32
Akron3f8571a2021-08-05 11:18:10 +020094 // lastFilledBase uint32
Akronc17f1ca2021-08-03 19:47:27 +020095
96 // Special symbols in sigma
97 epsilon int
98 unknown int
99 identity int
100 final int
Akron03c92fe2021-08-09 14:07:57 +0200101 tokenend int
Akronf2120ca2021-08-03 16:26:41 +0200102}
103
Akron03c92fe2021-08-09 14:07:57 +0200104// ParseFoma reads the FST from a foma file
105// and creates an internal representation,
106// in case it follows the tokenizer's convention.
Akron64ffd9a2021-08-03 19:55:21 +0200107func LoadFomaFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200108 f, err := os.Open(file)
109 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200110 log.Error().Err(err)
111 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200112 }
113 defer f.Close()
114
115 gz, err := gzip.NewReader(f)
116 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200117 log.Error().Err(err)
118 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200119 }
120 defer gz.Close()
121
Akron3fdfec62021-08-04 11:40:10 +0200122 return ParseFoma(gz)
Akron8ef408b2021-08-02 22:11:04 +0200123}
124
Akron03c92fe2021-08-09 14:07:57 +0200125// ParseFoma reads the FST from a foma file reader
126// and creates an internal representation,
127// in case it follows the tokenizer's convention.
Akron3fdfec62021-08-04 11:40:10 +0200128func ParseFoma(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200129 r := bufio.NewReader(ior)
130
131 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200132 sigmaRev: make(map[int]rune),
Akronc17f1ca2021-08-03 19:47:27 +0200133 epsilon: -1,
134 unknown: -1,
135 identity: -1,
136 final: -1,
Akron03c92fe2021-08-09 14:07:57 +0200137 tokenend: -1,
Akron8ef408b2021-08-02 22:11:04 +0200138 }
139
Akron740f3d72021-08-03 12:12:34 +0200140 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200141
142 mode := 0
143 var elem []string
144 var elemint [5]int
145
Akron03c92fe2021-08-09 14:07:57 +0200146 // Iterate over all lines of the file.
147 // This is mainly based on foma2js,
148 // licensed under the Apache License, version 2,
149 // and written by Mans Hulden.
Akron8ef408b2021-08-02 22:11:04 +0200150 for {
151 line, err := r.ReadString('\n')
152 if err != nil {
153 if err == io.EOF {
154 break
155 }
Akron740f3d72021-08-03 12:12:34 +0200156 log.Error().Err(err)
157 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200158 }
159 if strings.HasPrefix(line, "##foma-net") {
160 continue
161 }
162 if strings.HasPrefix(line, "##props##") {
163 mode = PROPS
164 continue
165 }
166 if strings.HasPrefix(line, "##states##") {
167 mode = STATES
168
169 // Adds a final transition symbol to sigma
170 // written as '#' in Mizobuchi et al (2000)
Akron740f3d72021-08-03 12:12:34 +0200171 tok.sigmaCount++
Akronc17f1ca2021-08-03 19:47:27 +0200172 tok.final = tok.sigmaCount
Akron8ef408b2021-08-02 22:11:04 +0200173 continue
174 }
175 if strings.HasPrefix(line, "##sigma##") {
176 mode = SIGMA
177 continue
178 }
179 if strings.HasPrefix(line, "##end##") {
180 mode = NONE
181 continue
182 }
183
184 switch mode {
185 case PROPS:
186 {
187 elem = strings.Split(line, " ")
188 /*
189 fmt.Println("arity: " + elem[0])
190 fmt.Println("arccount: " + elem[1])
191 fmt.Println("statecount: " + elem[2])
192 fmt.Println("linecount: " + elem[3])
193 fmt.Println("finalcount: " + elem[4])
194 fmt.Println("pathcount: " + elem[5])
195 fmt.Println("is_deterministic: " + elem[6])
196 fmt.Println("is_pruned: " + elem[7])
197 fmt.Println("is_minimized: " + elem[8])
198 fmt.Println("is_epsilon_free: " + elem[9])
199 fmt.Println("is_loop_free: " + elem[10])
200 fmt.Println("extras: " + elem[11])
201 fmt.Println("name: " + elem[12])
202 */
203 if elem[6] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200204 log.Error().Msg("The FST needs to be deterministic")
205 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200206 }
207 if elem[9] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200208 log.Error().Msg("The FST needs to be epsilon free")
209 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200210 }
211
212 elemint[0], err = strconv.Atoi(elem[1])
213 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200214 log.Error().Msg("Can't read arccount")
215 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200216 }
Akron740f3d72021-08-03 12:12:34 +0200217 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200218
219 // States start at 1 in Mizobuchi et al (2000),
220 // as the state 0 is associated with a fail.
221 // Initialize states and transitions
222 elemint[0], err = strconv.Atoi(elem[2])
223 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200224 log.Error().Msg("Can't read statecount")
225 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200226 }
Akron740f3d72021-08-03 12:12:34 +0200227 tok.stateCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200228 tok.transitions = make([]map[int]*edge, elemint[0]+1)
229 continue
230 }
231 case STATES:
232 {
233 elem = strings.Split(line[0:len(line)-1], " ")
234 if elem[0] == "-1" {
Akron3610f102021-08-08 14:13:25 +0200235 if DEBUG {
236 fmt.Println("Skip", elem)
237 }
Akron8ef408b2021-08-02 22:11:04 +0200238 continue
239 }
240 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200241 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200242 fmt.Println("Unable to translate", elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200243 break
244 }
Akron8ef408b2021-08-02 22:11:04 +0200245
246 if len(elem) > 1 {
247 elemint[1], err = strconv.Atoi(elem[1])
248 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200249 fmt.Println("Unable to translate", elem[1])
Akron8ef408b2021-08-02 22:11:04 +0200250 break
251 }
252 if len(elem) > 2 {
253 elemint[2], err = strconv.Atoi(elem[2])
254 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200255 fmt.Println("Unable to translate", elem[2])
Akron8ef408b2021-08-02 22:11:04 +0200256 break
257 }
258 if len(elem) > 3 {
259 elemint[3], err = strconv.Atoi(elem[3])
260 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200261 fmt.Println("Unable to translate", elem[3])
Akron8ef408b2021-08-02 22:11:04 +0200262 break
263 }
264 if len(elem) > 4 {
265 elemint[4], err = strconv.Atoi(elem[4])
266 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200267 fmt.Println("Unable to translate", elem[4])
Akron8ef408b2021-08-02 22:11:04 +0200268 break
269 }
270 }
271 }
272 }
273 }
274
275 switch len(elem) {
276 case 5:
277 {
Akron740f3d72021-08-03 12:12:34 +0200278 state = elemint[0]
279 inSym = elemint[1]
280 outSym = elemint[2]
281 end = elemint[3]
282 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200283 }
284 case 4:
285 {
286 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200287 state = elemint[0]
288 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200289 } else {
Akron740f3d72021-08-03 12:12:34 +0200290 state = elemint[0]
291 inSym = elemint[1]
292 end = elemint[2]
293 final = elemint[3]
294 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200295 }
296 }
297 case 3:
298 {
Akron740f3d72021-08-03 12:12:34 +0200299 inSym = elemint[0]
300 outSym = elemint[1]
301 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200302 }
303 case 2:
304 {
Akron740f3d72021-08-03 12:12:34 +0200305 inSym = elemint[0]
306 end = elemint[1]
307 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200308 }
309 }
310
Akron8ef408b2021-08-02 22:11:04 +0200311 // While the states in foma start with 0, the states in the
312 // Mizobuchi FSA start with one - so we increase every state by 1.
313
Akron83e75a22021-08-04 13:14:06 +0200314 nontoken := false
315 tokenend := false
316
Akron524c5432021-08-05 14:14:27 +0200317 // ID needs to be > 1
318 inSym++
319 outSym++
320
Akron740f3d72021-08-03 12:12:34 +0200321 if inSym != outSym {
322
Akron03c92fe2021-08-09 14:07:57 +0200323 if outSym == tok.tokenend {
Akron83e75a22021-08-04 13:14:06 +0200324 tokenend = true
325 } else if outSym == tok.epsilon {
326 nontoken = true
327 } else {
Akron740f3d72021-08-03 12:12:34 +0200328 log.Error().Msg(
329 "Unsupported transition: " +
330 strconv.Itoa(state) +
331 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200332 " (" +
Akron740f3d72021-08-03 12:12:34 +0200333 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200334 ":" +
Akron740f3d72021-08-03 12:12:34 +0200335 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200336 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200337 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200338 ":" +
Akron740f3d72021-08-03 12:12:34 +0200339 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200340 ")")
Akron740f3d72021-08-03 12:12:34 +0200341 os.Exit(1)
Akron75ebe7f2021-08-03 10:34:10 +0200342 }
Akron83e75a22021-08-04 13:14:06 +0200343
Akron83e75a22021-08-04 13:14:06 +0200344 } else if inSym == tok.epsilon {
Akron068874c2021-08-04 15:19:56 +0200345 log.Error().Msg("Epsilon transitions not supported")
346 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200347 }
348
Akron03c92fe2021-08-09 14:07:57 +0200349 // Create an edge based on the collected information
Akron8ef408b2021-08-02 22:11:04 +0200350 targetObj := &edge{
Akron83e75a22021-08-04 13:14:06 +0200351 inSym: inSym,
352 outSym: outSym,
353 end: end + 1,
354 tokenend: tokenend,
355 nontoken: nontoken,
Akron8ef408b2021-08-02 22:11:04 +0200356 }
357
Akron740f3d72021-08-03 12:12:34 +0200358 // Initialize outgoing states
359 if tok.transitions[state+1] == nil {
360 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200361 }
362
Akron740f3d72021-08-03 12:12:34 +0200363 // Ignore transitions with invalid symbols
364 if inSym >= 0 {
365 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200366 }
Akron8ef408b2021-08-02 22:11:04 +0200367
Akron740f3d72021-08-03 12:12:34 +0200368 // Add final transition
369 if final == 1 {
Akron03c92fe2021-08-09 14:07:57 +0200370 // TODO:
371 // Maybe this is less relevant for tokenizers
Akronc17f1ca2021-08-03 19:47:27 +0200372 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200373 }
374
Akronb4bbb472021-08-09 11:49:38 +0200375 if DEBUG {
Akron740f3d72021-08-03 12:12:34 +0200376 fmt.Println("Add",
377 state+1, "->", end+1,
378 "(",
379 inSym,
380 ":",
381 outSym,
382 ") (",
383 string(tok.sigmaRev[inSym]),
384 ":",
385 string(tok.sigmaRev[outSym]),
Akron524c5432021-08-05 14:14:27 +0200386 ")",
387 ";",
388 "TE:", tokenend,
Akron3610f102021-08-08 14:13:25 +0200389 "NT:", nontoken,
390 "FIN:", final)
Akron740f3d72021-08-03 12:12:34 +0200391 }
Akron75ebe7f2021-08-03 10:34:10 +0200392
Akron8ef408b2021-08-02 22:11:04 +0200393 continue
394 }
395 case SIGMA:
396 {
397 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
398
399 // Turn string into sigma id
400 number, err := strconv.Atoi(elem[0])
401
Akron524c5432021-08-05 14:14:27 +0200402 // ID needs to be > 1
403 number++
404
Akron8ef408b2021-08-02 22:11:04 +0200405 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200406 log.Error().Err(err)
407 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200408 }
409
Akron740f3d72021-08-03 12:12:34 +0200410 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200411
412 var symbol rune
413
414 // Read rune
415 if utf8.RuneCountInString(elem[1]) == 1 {
416 symbol = []rune(elem[1])[0]
417
418 // Probably a MCS
419 } else if utf8.RuneCountInString(elem[1]) > 1 {
420 switch elem[1] {
421 case "@_EPSILON_SYMBOL_@":
422 {
Akronc17f1ca2021-08-03 19:47:27 +0200423 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200424 continue
425 }
426 case "@_UNKNOWN_SYMBOL_@":
427 {
Akronc17f1ca2021-08-03 19:47:27 +0200428 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200429 continue
430 }
431
432 case "@_IDENTITY_SYMBOL_@":
433 {
Akronc17f1ca2021-08-03 19:47:27 +0200434 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200435 continue
436 }
Akron03c92fe2021-08-09 14:07:57 +0200437
438 case "@_TOKEN_SYMBOL_@":
439 {
440 tok.tokenend = number
441 continue
442 }
Akron8ef408b2021-08-02 22:11:04 +0200443 default:
Akron740f3d72021-08-03 12:12:34 +0200444 {
445 log.Error().Msg("MCS not supported: " + line)
446 os.Exit(1)
447 }
Akron8ef408b2021-08-02 22:11:04 +0200448 }
449
Akron740f3d72021-08-03 12:12:34 +0200450 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200451 line, err = r.ReadString('\n')
452 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200453 log.Error().Err(err)
454 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200455 }
456 if len(line) != 1 {
Akron740f3d72021-08-03 12:12:34 +0200457 log.Error().Msg("MCS not supported:" + line)
458 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200459 }
Akron03c92fe2021-08-09 14:07:57 +0200460 symbol = rune('\n')
Akron8ef408b2021-08-02 22:11:04 +0200461 }
462
Akron740f3d72021-08-03 12:12:34 +0200463 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200464 }
465 }
466 }
467
468 return tok
469}
470
Akron64ffd9a2021-08-03 19:55:21 +0200471// Set alphabet A to the list of all symbols
472// outgoing from s
473func (tok *Tokenizer) get_set(s int, A *[]int) {
474 for a := range tok.transitions[s] {
475 *A = append(*A, a)
476 }
477
478 // Not required, but simplifies bug hunting
479 sort.Ints(*A)
480}
481
Akron8ef408b2021-08-02 22:11:04 +0200482// Implementation of Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200483func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
484
485 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +0200486 sigma: make(map[rune]int),
487 loadFactor: -1,
488 final: tok.final,
489 unknown: tok.unknown,
490 identity: tok.identity,
491 epsilon: tok.epsilon,
Akron03c92fe2021-08-09 14:07:57 +0200492 tokenend: tok.tokenend,
Akron3f8571a2021-08-05 11:18:10 +0200493 // lastFilledBase: 1,
Akronf2120ca2021-08-03 16:26:41 +0200494 }
495
496 for num, sym := range tok.sigmaRev {
497 dat.sigma[sym] = num
498 }
Akron8ef408b2021-08-02 22:11:04 +0200499
500 mark := 0
501 size := 0
502
503 // Create a mapping from s to t
Akron740f3d72021-08-03 12:12:34 +0200504 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200505
506 table[size] = &mapping{source: 1, target: 1}
507 size++
508
Akron740f3d72021-08-03 12:12:34 +0200509 // Allocate space for the outgoing symbol range
510 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200511
512 for mark < size {
513 s := table[mark].source // This is a state in Ms
514 t := table[mark].target // This is a state in Mt
515 mark++
Akron740f3d72021-08-03 12:12:34 +0200516
Akron3610f102021-08-08 14:13:25 +0200517 if t == 6288 {
518 fmt.Println("1 State", t, "was", s)
519 }
520
Akron740f3d72021-08-03 12:12:34 +0200521 // Following the paper, here the state t can be remembered
522 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200523 A = A[:0]
524 tok.get_set(s, &A)
525
Akron740f3d72021-08-03 12:12:34 +0200526 // Set base to the first free slot in the double array
Akronf2120ca2021-08-03 16:26:41 +0200527 dat.setBase(t, dat.xCheck(A))
Akron8ef408b2021-08-02 22:11:04 +0200528
Akron773b1ef2021-08-03 17:37:20 +0200529 // TODO:
Akron068874c2021-08-04 15:19:56 +0200530 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200531 // outdegree of .end
532
Akron740f3d72021-08-03 12:12:34 +0200533 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200534 for _, a := range A {
535
Akronc17f1ca2021-08-03 19:47:27 +0200536 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200537
Akron740f3d72021-08-03 12:12:34 +0200538 // Aka g(s, a)
539 s1 := tok.transitions[s][a].end
Akron8ef408b2021-08-02 22:11:04 +0200540
Akron740f3d72021-08-03 12:12:34 +0200541 // Store the transition
Akron3fdfec62021-08-04 11:40:10 +0200542 t1 := dat.getBase(t) + uint32(a)
Akronf2120ca2021-08-03 16:26:41 +0200543 dat.setCheck(t1, t)
Akron8ef408b2021-08-02 22:11:04 +0200544
Akron3610f102021-08-08 14:13:25 +0200545 if DEBUG || t1 == 6288 {
Akron524c5432021-08-05 14:14:27 +0200546 fmt.Println("Translate transition",
547 s, "->", s1, "(", a, ")", "to", t, "->", t1)
548 }
549
Akron83e75a22021-08-04 13:14:06 +0200550 // Mark the state as being the target of a nontoken transition
551 if tok.transitions[s][a].nontoken {
Akron068874c2021-08-04 15:19:56 +0200552 dat.setNonToken(t1, true)
Akron524c5432021-08-05 14:14:27 +0200553 if DEBUG {
554 fmt.Println("Set", t1, "to nontoken")
555 }
Akron83e75a22021-08-04 13:14:06 +0200556 }
557
Akron84d68e62021-08-04 17:06:52 +0200558 // Mark the state as being the target of a tokenend transition
559 if tok.transitions[s][a].tokenend {
560 dat.setTokenEnd(t1, true)
Akron524c5432021-08-05 14:14:27 +0200561 if DEBUG {
562 fmt.Println("Set", t1, "to tokenend")
563 }
Akron84d68e62021-08-04 17:06:52 +0200564 }
565
Akron740f3d72021-08-03 12:12:34 +0200566 // Check for representative states
Akron8ef408b2021-08-02 22:11:04 +0200567 r := in_table(s1, table, size)
Akron740f3d72021-08-03 12:12:34 +0200568
Akron8ef408b2021-08-02 22:11:04 +0200569 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200570 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200571 table[size] = &mapping{source: s1, target: t1}
572 size++
573 } else {
Akron740f3d72021-08-03 12:12:34 +0200574 // Overwrite with the representative state
Akron3fdfec62021-08-04 11:40:10 +0200575 dat.setBase(t1, r)
576 dat.setSeparate(t1, true)
Akron8ef408b2021-08-02 22:11:04 +0200577 }
578 } else {
Akron740f3d72021-08-03 12:12:34 +0200579 // Store a final transition
Akron3fdfec62021-08-04 11:40:10 +0200580 dat.setCheck(dat.getBase(t)+uint32(dat.final), t)
Akron8ef408b2021-08-02 22:11:04 +0200581 }
582 }
583 }
584
585 // Following Mizobuchi et al (2000) the size of the
586 // FSA should be stored in check(1).
Akron3fdfec62021-08-04 11:40:10 +0200587 dat.setSize(dat.maxSize + 1)
Akronf2120ca2021-08-03 16:26:41 +0200588 dat.array = dat.array[:dat.maxSize+1]
589 return dat
Akron8ef408b2021-08-02 22:11:04 +0200590}
591
Akron8ef408b2021-08-02 22:11:04 +0200592// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200593// exists and return this as a representative.
594// Currently iterates through the whole table
595// in a bruteforce manner.
Akron3fdfec62021-08-04 11:40:10 +0200596func in_table(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200597 for x := 0; x < size; x++ {
598 if table[x].source == s {
599 return table[x].target
600 }
601 }
602 return 0
603}
604
Akron64ffd9a2021-08-03 19:55:21 +0200605// Resize double array when necessary
606func (dat *DaTokenizer) resize(l int) {
607 // TODO:
608 // This is a bit too aggressive atm and should be calmed down.
609 if len(dat.array) <= l {
Akron3fdfec62021-08-04 11:40:10 +0200610 dat.array = append(dat.array, make([]uint32, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200611 }
Akron64ffd9a2021-08-03 19:55:21 +0200612}
Akronc9d84a62021-08-03 15:56:03 +0200613
Akron64ffd9a2021-08-03 19:55:21 +0200614// Set base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200615func (dat *DaTokenizer) setBase(p uint32, v uint32) {
616 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200617 dat.resize(l)
618 if dat.maxSize < l {
619 dat.maxSize = l
620 }
621 dat.array[p*2] = v
622}
623
Akron3fdfec62021-08-04 11:40:10 +0200624// Returns true if a state is separate pointing to a representative
625func (dat *DaTokenizer) isSeparate(p uint32) bool {
Akron03c92fe2021-08-09 14:07:57 +0200626 return dat.array[p*2]&FIRSTBIT != 0
Akron3fdfec62021-08-04 11:40:10 +0200627}
628
629// Mark a state as separate pointing to a representative
630func (dat *DaTokenizer) setSeparate(p uint32, sep bool) {
631 if sep {
Akron03c92fe2021-08-09 14:07:57 +0200632 dat.array[p*2] |= FIRSTBIT
Akron3fdfec62021-08-04 11:40:10 +0200633 } else {
Akron03c92fe2021-08-09 14:07:57 +0200634 dat.array[p*2] &= (RESTBIT | SECONDBIT)
Akron3fdfec62021-08-04 11:40:10 +0200635 }
636}
637
Akron83e75a22021-08-04 13:14:06 +0200638// Returns true if a state is the target of a nontoken transition
639func (dat *DaTokenizer) isNonToken(p uint32) bool {
Akron03c92fe2021-08-09 14:07:57 +0200640 return dat.array[p*2+1]&FIRSTBIT != 0
Akron83e75a22021-08-04 13:14:06 +0200641}
642
643// Mark a state as being the target of a nontoken transition
644func (dat *DaTokenizer) setNonToken(p uint32, sep bool) {
645 if sep {
Akron03c92fe2021-08-09 14:07:57 +0200646 dat.array[p*2+1] |= FIRSTBIT
Akron83e75a22021-08-04 13:14:06 +0200647 } else {
Akron03c92fe2021-08-09 14:07:57 +0200648 dat.array[p*2+1] &= (RESTBIT | SECONDBIT)
Akron83e75a22021-08-04 13:14:06 +0200649 }
650}
651
Akron84d68e62021-08-04 17:06:52 +0200652// Returns true if a state is the target of a tokenend transition
653func (dat *DaTokenizer) isTokenEnd(p uint32) bool {
Akron03c92fe2021-08-09 14:07:57 +0200654 return dat.array[p*2+1]&SECONDBIT != 0
Akron84d68e62021-08-04 17:06:52 +0200655}
656
657// Mark a state as being the target of a tokenend transition
658func (dat *DaTokenizer) setTokenEnd(p uint32, sep bool) {
659 if sep {
Akron03c92fe2021-08-09 14:07:57 +0200660 dat.array[p*2+1] |= SECONDBIT
Akron84d68e62021-08-04 17:06:52 +0200661 } else {
Akron03c92fe2021-08-09 14:07:57 +0200662 dat.array[p*2+1] &= (RESTBIT | FIRSTBIT)
Akron84d68e62021-08-04 17:06:52 +0200663 }
664}
665
Akron64ffd9a2021-08-03 19:55:21 +0200666// Get base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200667func (dat *DaTokenizer) getBase(p uint32) uint32 {
668 if int(p*2) >= len(dat.array) {
Akron64ffd9a2021-08-03 19:55:21 +0200669 return 0
670 }
Akron03c92fe2021-08-09 14:07:57 +0200671 return dat.array[p*2] & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200672}
673
674// Set check value in double array
Akron3fdfec62021-08-04 11:40:10 +0200675func (dat *DaTokenizer) setCheck(p uint32, v uint32) {
676 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200677 dat.resize(l)
678 if dat.maxSize < l {
679 dat.maxSize = l
680 }
681 dat.array[(p*2)+1] = v
682}
683
684// Get check value in double array
Akron3fdfec62021-08-04 11:40:10 +0200685func (dat *DaTokenizer) getCheck(p uint32) uint32 {
686 if int((p*2)+1) >= len(dat.array) {
Akron64ffd9a2021-08-03 19:55:21 +0200687 return 0
688 }
Akron03c92fe2021-08-09 14:07:57 +0200689 return dat.array[(p*2)+1] & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200690}
691
692// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200693func (dat *DaTokenizer) setSize(v int) {
694 dat.setCheck(1, uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200695}
696
697// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200698func (dat *DaTokenizer) GetSize() int {
699 return int(dat.getCheck(1))
Akron8ef408b2021-08-02 22:11:04 +0200700}
701
702// Based on Mizobuchi et al (2000), p. 124
703// This iterates for every state through the complete double array
704// structure until it finds a gap that fits all outgoing transitions
705// of the state. This is extremely slow, but is only necessary in the
706// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200707func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200708
709 // Start at the first entry of the double array list
Akron3f8571a2021-08-05 11:18:10 +0200710 base := uint32(1) // dat.lastFilledBase
711 // skip := false
Akron8ef408b2021-08-02 22:11:04 +0200712OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200713
Akron3f8571a2021-08-05 11:18:10 +0200714 /*
715 if !skip {
716 if dat.getCheck(base) != 0 {
717 dat.lastFilledBase = base
718 } else {
719 skip = true
720 }
721 }
722 */
723
Akron740f3d72021-08-03 12:12:34 +0200724 // Resize the array if necessary
Akron3fdfec62021-08-04 11:40:10 +0200725 dat.resize((int(base) + dat.final) * 2)
Akron8ef408b2021-08-02 22:11:04 +0200726 for _, a := range symbols {
Akron3fdfec62021-08-04 11:40:10 +0200727 if dat.getCheck(base+uint32(a)) != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200728 base++
729 goto OVERLAP
730 }
731 }
Akron8ef408b2021-08-02 22:11:04 +0200732 return base
733}
734
Akron3610f102021-08-08 14:13:25 +0200735// List all outgoing transitions for a state
736// for testing purposes
737func (dat *DaTokenizer) outgoing(t uint32) []int {
738
739 valid := make([]int, 0, len(dat.sigma))
740
741 for _, a := range dat.sigma {
742 t1 := dat.getBase(t) + uint32(a)
743 if t1 <= dat.getCheck(1) && dat.getCheck(t1) == t {
744 valid = append(valid, a)
745 }
746 }
747
748 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
749 t1 := dat.getBase(t) + uint32(a)
750 if t1 <= dat.getCheck(1) && dat.getCheck(t1) == t {
751 valid = append(valid, -1*a)
752 }
753 }
754
755 sort.Ints(valid)
756
757 return valid
758}
759
Akron03a3c612021-08-04 11:51:27 +0200760// LoadFactor as defined in Kanda et al (2018),
761// i.e. the proportion of non-empty elements to all elements.
762func (dat *DaTokenizer) LoadFactor() float64 {
Akrond66a9262021-08-03 17:09:09 +0200763
Akron03a3c612021-08-04 11:51:27 +0200764 // Cache the loadfactor
Akron3f8571a2021-08-05 11:18:10 +0200765 if dat.loadFactor > 0 {
Akron03a3c612021-08-04 11:51:27 +0200766 return dat.loadFactor
Akron773b1ef2021-08-03 17:37:20 +0200767 }
Akrond66a9262021-08-03 17:09:09 +0200768 nonEmpty := 0
769 all := len(dat.array) / 2
770 for x := 1; x <= len(dat.array); x = x + 2 {
771 if dat.array[x] != 0 {
772 nonEmpty++
773 }
774 }
Akron03a3c612021-08-04 11:51:27 +0200775 dat.loadFactor = float64(nonEmpty) / float64(all) * 100
776 return dat.loadFactor
Akrond66a9262021-08-03 17:09:09 +0200777}
778
Akron6247a5d2021-08-03 19:18:28 +0200779// WriteTo stores the double array data in an io.Writer.
Akron3a063ef2021-08-05 19:36:35 +0200780func (dat *DaTokenizer) Save(file string) (n int64, err error) {
781 f, err := os.Create(file)
782 if err != nil {
783 log.Error().Err(err)
784 return 0, nil
785 }
786 defer f.Close()
787 gz := gzip.NewWriter(f)
788 defer gz.Close()
789 n, err = dat.WriteTo(gz)
790 if err != nil {
791 log.Error().Err(err)
792 return n, err
793 }
794 gz.Flush()
795 return n, nil
796}
797
798// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200799func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
800
Akron3a063ef2021-08-05 19:36:35 +0200801 wb := bufio.NewWriter(w)
802 defer wb.Flush()
803
Akron6247a5d2021-08-03 19:18:28 +0200804 // Store magical header
Akron3a063ef2021-08-05 19:36:35 +0200805 all, err := wb.Write([]byte(MAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200806 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200807 log.Error().Err(err)
808 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200809 }
810
811 // Get sigma as a list
812 sigmalist := make([]rune, len(dat.sigma)+16)
813 max := 0
814 for sym, num := range dat.sigma {
815 sigmalist[num] = sym
816 if num > max {
817 max = num
818 }
819 }
820
821 sigmalist = sigmalist[:max+1]
822
Akron3f8571a2021-08-05 11:18:10 +0200823 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200824 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200825 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
826 bo.PutUint16(buf[4:6], uint16(dat.unknown))
827 bo.PutUint16(buf[6:8], uint16(dat.identity))
828 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200829 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akron3f8571a2021-08-05 11:18:10 +0200830 bo.PutUint32(buf[12:16], uint32(len(dat.array)))
Akron3a063ef2021-08-05 19:36:35 +0200831 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200832 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200833 log.Error().Err(err)
834 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200835 }
836
837 all += more
838
Akron3a063ef2021-08-05 19:36:35 +0200839 // wbuf := bytes.NewBuffer(nil)
840 // wbufWrap := bufio.NewWriter(wbuf)
Akron6247a5d2021-08-03 19:18:28 +0200841
842 // Write sigma
843 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200844
Akron3a063ef2021-08-05 19:36:35 +0200845 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200846 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200847 log.Error().Err(err)
848 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200849 }
850 all += more
851 }
Akron3a063ef2021-08-05 19:36:35 +0200852 // wbufWrap.Flush()
853 // more, err = w.Write(wbuf.Bytes())
Akron6247a5d2021-08-03 19:18:28 +0200854 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200855 log.Error().Err(err)
856 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200857 }
Akron3a063ef2021-08-05 19:36:35 +0200858 // all += more
Akron6247a5d2021-08-03 19:18:28 +0200859
860 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200861 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200862 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200863 log.Error().Err(err)
864 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200865 }
866 all += more
867
Akron3a063ef2021-08-05 19:36:35 +0200868 // wbuf.Reset()
Akron6247a5d2021-08-03 19:18:28 +0200869
Akron3a063ef2021-08-05 19:36:35 +0200870 for x := 0; x < len(dat.array); x++ {
871 // for _, d := range dat.array {
872 bo.PutUint32(buf[0:4], dat.array[x])
873 more, err := wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200874 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200875 log.Error().Err(err)
876 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200877 }
Akron3a063ef2021-08-05 19:36:35 +0200878 if more != 4 {
879 log.Error().Msg("Can not write uint32")
880 return int64(all), err
881 }
Akron6247a5d2021-08-03 19:18:28 +0200882 all += more
883 }
884
Akron3a063ef2021-08-05 19:36:35 +0200885 // wbufWrap.Flush()
886
Akron6247a5d2021-08-03 19:18:28 +0200887 return int64(all), err
888}
889
Akron3f8571a2021-08-05 11:18:10 +0200890func LoadDatokFile(file string) *DaTokenizer {
891 f, err := os.Open(file)
892 if err != nil {
893 log.Error().Err(err)
894 os.Exit(0)
895 }
896 defer f.Close()
897
898 gz, err := gzip.NewReader(f)
899 if err != nil {
900 log.Error().Err(err)
901 os.Exit(0)
902 }
903 defer gz.Close()
904
Akron3a063ef2021-08-05 19:36:35 +0200905 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +0200906 return ParseDatok(gz)
907}
908
909func ParseDatok(ior io.Reader) *DaTokenizer {
910
911 dat := &DaTokenizer{
912 sigma: make(map[rune]int),
913 epsilon: 0,
914 unknown: 0,
915 identity: 0,
916 final: 0,
917 loadFactor: 0,
918 }
919
920 r := bufio.NewReader(ior)
921
922 all := 0
923
924 buf := make([]byte, 1024)
925 buf = buf[0:len(MAGIC)]
926
927 more, err := r.Read(buf)
928
929 if err != nil {
930 log.Error().Err(err)
931 return nil
932 }
933
934 all += more
935
936 if string(MAGIC) != string(buf) {
937 log.Error().Msg("Not a datok file")
938 return nil
939 }
940
Akron3a063ef2021-08-05 19:36:35 +0200941 more, err = io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +0200942 if err != nil {
943 log.Error().Err(err)
944 return nil
945 }
946
947 all += more
948
Akron3a063ef2021-08-05 19:36:35 +0200949 version := bo.Uint16(buf[0:2])
950
951 if version != VERSION {
952 log.Error().Msg("Version not compatible")
953 return nil
954 }
955
Akron3f8571a2021-08-05 11:18:10 +0200956 dat.epsilon = int(bo.Uint16(buf[2:4]))
957 dat.unknown = int(bo.Uint16(buf[4:6]))
958 dat.identity = int(bo.Uint16(buf[6:8]))
959 dat.final = int(bo.Uint16(buf[8:10]))
960
961 sigmaCount := int(bo.Uint16(buf[10:12]))
962 arraySize := int(bo.Uint32(buf[12:16]))
963
Akron3a063ef2021-08-05 19:36:35 +0200964 // Shouldn't be relevant though
965 dat.maxSize = arraySize - 1
966
Akron3f8571a2021-08-05 11:18:10 +0200967 for x := 0; x < sigmaCount; x++ {
968 sym, more, err := r.ReadRune()
969 if err == nil && sym != 0 {
970 dat.sigma[sym] = x
971 }
972 all += more
973 }
974
Akron3a063ef2021-08-05 19:36:35 +0200975 more, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +0200976
977 if err != nil {
978 log.Error().Err(err)
979 return nil
980 }
981
982 all += more
983
984 if string("T") != string(buf[0:1]) {
985 log.Error().Msg("Not a datok file")
986 return nil
987 }
988
989 // Read based on length
990 dat.array = make([]uint32, arraySize)
991
992 for x := 0; x < arraySize; x++ {
Akron3a063ef2021-08-05 19:36:35 +0200993 more, err = io.ReadFull(r, buf[0:4])
Akron3f8571a2021-08-05 11:18:10 +0200994 if err != nil {
Akron3a063ef2021-08-05 19:36:35 +0200995 if err == io.EOF {
996 fmt.Println(arraySize, x)
997 break
998 }
Akron3f8571a2021-08-05 11:18:10 +0200999 log.Error().Err(err)
1000 return nil
1001 }
1002 all += more
1003 dat.array[x] = bo.Uint32(buf[0:4])
1004 }
1005
1006 return dat
1007}
1008
Akron740f3d72021-08-03 12:12:34 +02001009// Match an input string against the double array
1010// FSA.
1011//
1012// Based on Mizobuchi et al (2000), p. 129,
1013// with additional support for IDENTITY, UNKNOWN
1014// and EPSILON transitions.
Akron64ffd9a2021-08-03 19:55:21 +02001015func (dat *DaTokenizer) Match(input string) bool {
Akron465a0992021-08-03 11:28:48 +02001016 var a int
Akron3fdfec62021-08-04 11:40:10 +02001017 var tu uint32
Akron465a0992021-08-03 11:28:48 +02001018 var ok bool
1019
Akron3fdfec62021-08-04 11:40:10 +02001020 t := uint32(1) // Initial state
Akron740f3d72021-08-03 12:12:34 +02001021 chars := []rune(input)
1022 i := 0
1023
Akron49d27ee2021-08-03 11:58:13 +02001024 for i < len(chars) {
Akron64ffd9a2021-08-03 19:55:21 +02001025 a, ok = dat.sigma[chars[i]]
Akron730a79c2021-08-03 11:05:29 +02001026
Akron740f3d72021-08-03 12:12:34 +02001027 // Support identity symbol if character is not in sigma
Akron64ffd9a2021-08-03 19:55:21 +02001028 if !ok && dat.identity != -1 {
Akron740f3d72021-08-03 12:12:34 +02001029 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +02001030 fmt.Println("IDENTITY symbol", string(chars[i]), "->", dat.identity)
Akron740f3d72021-08-03 12:12:34 +02001031 }
Akron64ffd9a2021-08-03 19:55:21 +02001032 a = dat.identity
Akron740f3d72021-08-03 12:12:34 +02001033 } else if DEBUG {
Akron49d27ee2021-08-03 11:58:13 +02001034 fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
Akron730a79c2021-08-03 11:05:29 +02001035 }
Akron465a0992021-08-03 11:28:48 +02001036 tu = t
Akron730a79c2021-08-03 11:05:29 +02001037 CHECK:
Akron3fdfec62021-08-04 11:40:10 +02001038 t = dat.getBase(tu) + uint32(a)
Akron730a79c2021-08-03 11:05:29 +02001039
Akron740f3d72021-08-03 12:12:34 +02001040 // Check if the transition is valid according to the double array
Akron64ffd9a2021-08-03 19:55:21 +02001041 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +02001042
1043 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +02001044 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron730a79c2021-08-03 11:05:29 +02001045 }
Akron740f3d72021-08-03 12:12:34 +02001046
Akron64ffd9a2021-08-03 19:55:21 +02001047 if !ok && a == dat.identity {
Akron740f3d72021-08-03 12:12:34 +02001048 // Try again with unknown symbol, in case identity failed
1049 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +02001050 fmt.Println("UNKNOWN symbol", string(chars[i]), "->", dat.unknown)
Akron740f3d72021-08-03 12:12:34 +02001051 }
Akron64ffd9a2021-08-03 19:55:21 +02001052 a = dat.unknown
Akron740f3d72021-08-03 12:12:34 +02001053
Akron64ffd9a2021-08-03 19:55:21 +02001054 } else if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +02001055 // Try again with epsilon symbol, in case everything else failed
1056 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +02001057 fmt.Println("EPSILON symbol", string(chars[i]), "->", dat.epsilon)
Akron740f3d72021-08-03 12:12:34 +02001058 }
Akron64ffd9a2021-08-03 19:55:21 +02001059 a = dat.epsilon
Akron740f3d72021-08-03 12:12:34 +02001060 } else {
1061 break
1062 }
1063 goto CHECK
Akron3fdfec62021-08-04 11:40:10 +02001064 } else if dat.isSeparate(t) {
Akron730a79c2021-08-03 11:05:29 +02001065 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +02001066 t = dat.getBase(t)
Akron8ef408b2021-08-02 22:11:04 +02001067 }
Akron740f3d72021-08-03 12:12:34 +02001068
1069 // Transition is fine
Akron64ffd9a2021-08-03 19:55:21 +02001070 if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +02001071 // Character consumed
Akron49d27ee2021-08-03 11:58:13 +02001072 i++
1073 }
Akron83e75a22021-08-04 13:14:06 +02001074
Akron740f3d72021-08-03 12:12:34 +02001075 // TODO:
1076 // Prevent endless epsilon loops!
Akron8ef408b2021-08-02 22:11:04 +02001077 }
1078
Akron740f3d72021-08-03 12:12:34 +02001079 if i != len(chars) {
1080 if DEBUG {
1081 fmt.Println("Not at the end")
1082 }
Akron8ef408b2021-08-02 22:11:04 +02001083 return false
1084 }
1085
Akron465a0992021-08-03 11:28:48 +02001086FINALCHECK:
Akron740f3d72021-08-03 12:12:34 +02001087
1088 // Automaton is in a final state
Akron3fdfec62021-08-04 11:40:10 +02001089 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
Akron8ef408b2021-08-02 22:11:04 +02001090 return true
1091 }
Akron465a0992021-08-03 11:28:48 +02001092
Akron740f3d72021-08-03 12:12:34 +02001093 // Check epsilon transitions until a final state is reached
Akron465a0992021-08-03 11:28:48 +02001094 tu = t
Akron3fdfec62021-08-04 11:40:10 +02001095 t = dat.getBase(tu) + uint32(dat.epsilon)
Akron465a0992021-08-03 11:28:48 +02001096
Akron740f3d72021-08-03 12:12:34 +02001097 // Epsilon transition failed
Akron64ffd9a2021-08-03 19:55:21 +02001098 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +02001099 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +02001100 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron740f3d72021-08-03 12:12:34 +02001101 }
Akron465a0992021-08-03 11:28:48 +02001102 return false
Akron740f3d72021-08-03 12:12:34 +02001103
Akron3fdfec62021-08-04 11:40:10 +02001104 } else if dat.isSeparate(t) {
Akron465a0992021-08-03 11:28:48 +02001105 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +02001106 t = dat.getBase(t)
Akron465a0992021-08-03 11:28:48 +02001107 }
Akron740f3d72021-08-03 12:12:34 +02001108
Akron465a0992021-08-03 11:28:48 +02001109 goto FINALCHECK
Akron8ef408b2021-08-02 22:11:04 +02001110}
Akron068874c2021-08-04 15:19:56 +02001111
Akron3610f102021-08-08 14:13:25 +02001112func showBuffer(buffer []rune, buffo int, buffi int) string {
1113 out := make([]rune, 0, 1024)
1114 for x := 0; x < len(buffer); x++ {
1115 if buffi == x {
1116 out = append(out, '^')
1117 }
1118 if buffo == x {
1119 out = append(out, '[', buffer[x], ']')
1120 } else {
1121 out = append(out, buffer[x])
1122 }
1123 }
1124 return string(out)
1125}
1126
Akron84d68e62021-08-04 17:06:52 +02001127// Transduce an input string against the double array
Akron3610f102021-08-08 14:13:25 +02001128// FSA. The rules are always greedy. If the automaton fails,
1129// it takes the last possible token ending branch.
Akron068874c2021-08-04 15:19:56 +02001130//
1131// Based on Match with additional support
Akron84d68e62021-08-04 17:06:52 +02001132// for NONTOKEN and TOKENEND handling
Akron3f8571a2021-08-05 11:18:10 +02001133func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron068874c2021-08-04 15:19:56 +02001134 var a int
Akronb4bbb472021-08-09 11:49:38 +02001135 var t0 uint32
Akron84d68e62021-08-04 17:06:52 +02001136 var ok, nontoken, tokenend bool
Akron068874c2021-08-04 15:19:56 +02001137
Akron3610f102021-08-08 14:13:25 +02001138 // Remember the last position of a possible tokenend,
1139 // in case the automaton fails.
1140 epsilonState := uint32(0)
1141 epsilonOffset := 0
1142
1143 // Implement a low level buffer for full control,
1144 // however - it is probably better to introduce
1145 // this on a higher level with a io.Reader interface
1146 // The buffer stores a single word and may have white
1147 // space at the end (but not at the beginning).
1148 //
1149 // This is the only backtracking requirement because of
1150 // epsilon transitions, to support tokenizations like:
1151 // "this is an example|.| And it works." vs
1152 // "this is an example.com| application."
1153 buffer := make([]rune, 1024)
1154 buffo := 0 // Buffer offset
1155 buffi := 0 // Buffer length
1156
Akron3f8571a2021-08-05 11:18:10 +02001157 reader := bufio.NewReader(r)
1158 writer := bufio.NewWriter(w)
1159 defer writer.Flush()
Akron068874c2021-08-04 15:19:56 +02001160
Akron3f8571a2021-08-05 11:18:10 +02001161 t := uint32(1) // Initial state
Akron3f8571a2021-08-05 11:18:10 +02001162
1163 var char rune
1164 var err error
1165 eof := false
1166
Akron3610f102021-08-08 14:13:25 +02001167 // TODO:
1168 // Write all characters first into a buffer
1169 // and flush when necessary
1170 // TODO:
1171 // Create an epsilon stack
Akron3f8571a2021-08-05 11:18:10 +02001172 for {
1173
Akronb4bbb472021-08-09 11:49:38 +02001174 // Get from reader if buffer is empty
Akron3610f102021-08-08 14:13:25 +02001175 if buffo >= buffi {
Akron3f8571a2021-08-05 11:18:10 +02001176 char, _, err = reader.ReadRune()
1177 if err != nil {
1178 eof = true
1179 break
1180 }
Akron3610f102021-08-08 14:13:25 +02001181 buffer[buffi] = char
1182 buffi++
Akron3f8571a2021-08-05 11:18:10 +02001183 }
Akron3610f102021-08-08 14:13:25 +02001184
Akron3610f102021-08-08 14:13:25 +02001185 char = buffer[buffo]
Akron3610f102021-08-08 14:13:25 +02001186
1187 if DEBUG {
1188 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
1189 }
Akron3f8571a2021-08-05 11:18:10 +02001190
1191 a, ok = dat.sigma[char]
Akron068874c2021-08-04 15:19:56 +02001192
1193 // Support identity symbol if character is not in sigma
1194 if !ok && dat.identity != -1 {
1195 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001196 fmt.Println("IDENTITY symbol", string(char), "->", dat.identity)
Akron068874c2021-08-04 15:19:56 +02001197 }
1198 a = dat.identity
Akron068874c2021-08-04 15:19:56 +02001199 }
Akron3610f102021-08-08 14:13:25 +02001200
Akronb4bbb472021-08-09 11:49:38 +02001201 t0 = t
1202
1203 if dat.getCheck(dat.getBase(t0)+uint32(dat.epsilon)) == t0 {
Akron03c92fe2021-08-09 14:07:57 +02001204 // Remember state for backtracking to last tokenend state
Akronb4bbb472021-08-09 11:49:38 +02001205 epsilonState = t0
Akron3610f102021-08-08 14:13:25 +02001206 epsilonOffset = buffo
1207 }
1208
Akron068874c2021-08-04 15:19:56 +02001209 CHECK:
1210 nontoken = false
Akron84d68e62021-08-04 17:06:52 +02001211 tokenend = false
1212
Akronb4bbb472021-08-09 11:49:38 +02001213 t = dat.getBase(t0) + uint32(a)
Akron068874c2021-08-04 15:19:56 +02001214
Akron524c5432021-08-05 14:14:27 +02001215 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001216 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
1217 fmt.Println("Valid:", dat.outgoing(t0))
Akron524c5432021-08-05 14:14:27 +02001218 }
1219
Akron068874c2021-08-04 15:19:56 +02001220 // Check if the transition is valid according to the double array
Akronb4bbb472021-08-09 11:49:38 +02001221 if t > dat.getCheck(1) || dat.getCheck(t) != t0 {
Akron068874c2021-08-04 15:19:56 +02001222
1223 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001224 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", t0)
Akron068874c2021-08-04 15:19:56 +02001225 }
1226
1227 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +02001228
Akron068874c2021-08-04 15:19:56 +02001229 // Try again with unknown symbol, in case identity failed
1230 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001231 fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +02001232 }
1233 a = dat.unknown
1234
1235 } else if a != dat.epsilon {
Akronb4bbb472021-08-09 11:49:38 +02001236
Akron068874c2021-08-04 15:19:56 +02001237 // Try again with epsilon symbol, in case everything else failed
1238 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001239 fmt.Println("EPSILON symbol", string(char), "->", dat.epsilon)
Akron068874c2021-08-04 15:19:56 +02001240 }
Akronb4bbb472021-08-09 11:49:38 +02001241 t0 = epsilonState
Akron068874c2021-08-04 15:19:56 +02001242 a = dat.epsilon
Akron3610f102021-08-08 14:13:25 +02001243 epsilonState = 0 // reset
1244 buffo = epsilonOffset
1245 if DEBUG {
1246 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
1247 }
Akronb4bbb472021-08-09 11:49:38 +02001248
Akron068874c2021-08-04 15:19:56 +02001249 } else {
1250 break
1251 }
Akron068874c2021-08-04 15:19:56 +02001252
Akronb4bbb472021-08-09 11:49:38 +02001253 goto CHECK
1254
1255 }
1256
1257 // Move to representative state
1258 nontoken = dat.isNonToken(t)
1259 tokenend = dat.isTokenEnd(t)
1260
1261 if dat.isSeparate(t) {
Akron068874c2021-08-04 15:19:56 +02001262 t = dat.getBase(t)
Akron84d68e62021-08-04 17:06:52 +02001263
Akron524c5432021-08-05 14:14:27 +02001264 if DEBUG {
1265 fmt.Println("Representative pointing to", t)
1266 }
Akron068874c2021-08-04 15:19:56 +02001267 }
1268
1269 // Transition is fine
Akronb4bbb472021-08-09 11:49:38 +02001270 if a != dat.epsilon {
1271
Akron068874c2021-08-04 15:19:56 +02001272 // Character consumed
Akron3610f102021-08-08 14:13:25 +02001273 buffo++
Akronb4bbb472021-08-09 11:49:38 +02001274 if nontoken {
1275
Akron3610f102021-08-08 14:13:25 +02001276 if DEBUG {
1277 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
1278 }
Akronb4bbb472021-08-09 11:49:38 +02001279
Akron3610f102021-08-08 14:13:25 +02001280 // Maybe remove the first character, if buffo == 0?
1281 if buffo == 1 {
Akronb4bbb472021-08-09 11:49:38 +02001282
1283 // TODO: Better as a ring buffer
Akron3610f102021-08-08 14:13:25 +02001284 for x, i := range buffer[buffo:buffi] {
1285 buffer[x] = i
1286 }
1287 // writer.WriteRune('\n')
1288 buffi -= buffo
1289 epsilonOffset -= buffo
1290 buffo = 0
1291 }
1292 }
Akron3f8571a2021-08-05 11:18:10 +02001293 }
Akron068874c2021-08-04 15:19:56 +02001294
Akron524c5432021-08-05 14:14:27 +02001295 if DEBUG {
1296 fmt.Println(" --> ok!")
1297 }
1298
Akron84d68e62021-08-04 17:06:52 +02001299 if tokenend {
Akron3610f102021-08-08 14:13:25 +02001300 data := []byte(string(buffer[:buffo]))
1301 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001302 fmt.Println("-> Flush buffer:", string(data), showBuffer(buffer, buffo, buffi))
Akron3610f102021-08-08 14:13:25 +02001303 }
1304 writer.Write(data)
Akron3f8571a2021-08-05 11:18:10 +02001305 writer.WriteRune('\n')
Akron3610f102021-08-08 14:13:25 +02001306
1307 // Better as a ring buffer
1308 for x, i := range buffer[buffo:buffi] {
1309 buffer[x] = i
1310 }
Akronb4bbb472021-08-09 11:49:38 +02001311
Akron3610f102021-08-08 14:13:25 +02001312 buffi -= buffo
1313 epsilonOffset -= buffo
1314 buffo = 0
1315 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001316 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
Akron3610f102021-08-08 14:13:25 +02001317 }
Akron84d68e62021-08-04 17:06:52 +02001318 }
1319
Akron068874c2021-08-04 15:19:56 +02001320 // TODO:
1321 // Prevent endless epsilon loops!
1322 }
1323
Akron3f8571a2021-08-05 11:18:10 +02001324 if !eof {
Akron068874c2021-08-04 15:19:56 +02001325 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001326 fmt.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +02001327 }
1328 return false
1329 }
1330
1331FINALCHECK:
1332
1333 // Automaton is in a final state
1334 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
Akronb4bbb472021-08-09 11:49:38 +02001335
1336 if buffi > 0 {
1337 data := []byte(string(buffer[:buffi]))
1338 if DEBUG {
1339 fmt.Println("-> Flush buffer:", string(data))
Akron3f8571a2021-08-05 11:18:10 +02001340 }
Akronb4bbb472021-08-09 11:49:38 +02001341 writer.Write(data)
1342 // states are irrelevant here
1343 }
1344
Akron84d68e62021-08-04 17:06:52 +02001345 if dat.isTokenEnd(t) {
Akron3f8571a2021-08-05 11:18:10 +02001346 writer.WriteRune('\n')
Akron84d68e62021-08-04 17:06:52 +02001347 }
1348
1349 // There may be a new line at the end, from an epsilon, so we go on!
Akron068874c2021-08-04 15:19:56 +02001350 return true
1351 }
1352
1353 // Check epsilon transitions until a final state is reached
Akronb4bbb472021-08-09 11:49:38 +02001354 t0 = t
1355 t = dat.getBase(t0) + uint32(dat.epsilon)
Akron068874c2021-08-04 15:19:56 +02001356
1357 // Epsilon transition failed
Akronb4bbb472021-08-09 11:49:38 +02001358 if t > dat.getCheck(1) || dat.getCheck(t) != t0 {
Akron068874c2021-08-04 15:19:56 +02001359 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001360 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", t0)
Akron068874c2021-08-04 15:19:56 +02001361 }
1362 return false
Akron068874c2021-08-04 15:19:56 +02001363 }
1364
Akronb4bbb472021-08-09 11:49:38 +02001365 // nontoken = dat.isNonToken(t)
1366 tokenend = dat.isTokenEnd(t)
1367
1368 if dat.isSeparate(t) {
1369 // Move to representative state
1370 t = dat.getBase(t)
1371 }
Akron3f8571a2021-08-05 11:18:10 +02001372
1373 if tokenend {
Akron3610f102021-08-08 14:13:25 +02001374 if buffi > 0 {
1375 data := []byte(string(buffer[:buffi]))
1376 if DEBUG {
1377 fmt.Println("-> Flush buffer:", string(data))
1378 }
1379 writer.Write(data)
1380 buffi = 0
1381 buffo = 0
1382 epsilonState = 0
1383 }
Akron3f8571a2021-08-05 11:18:10 +02001384 writer.WriteRune('\n')
Akron068874c2021-08-04 15:19:56 +02001385 }
1386
1387 goto FINALCHECK
1388}