blob: b89bccd5c31fe3a959fd5ddee5b9f66909a14652 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
Akron03c92fe2021-08-09 14:07:57 +020013//
14// Serialization is little endian.
Akron83e75a22021-08-04 13:14:06 +020015
Akron740f3d72021-08-03 12:12:34 +020016// TODO:
17// - replace maxSize with the check value
18// - Strip first state and make everything start with 0!
Akron3a063ef2021-08-05 19:36:35 +020019// - Add checksum to serialization.
Akron03c92fe2021-08-09 14:07:57 +020020// - Mark epsilon transitions in bytes
21// - Introduce methods on BC array entries instead of
22// jumping into the entries all the time!
23// - Instead of memoizing the loadFactor, better remember
24// the number of set transitions
Akron740f3d72021-08-03 12:12:34 +020025
Akron8ef408b2021-08-02 22:11:04 +020026import (
27 "bufio"
28 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020029 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020030 "fmt"
31 "io"
32 "os"
Akronc9d84a62021-08-03 15:56:03 +020033 "sort"
Akron8ef408b2021-08-02 22:11:04 +020034 "strconv"
35 "strings"
36 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020037
38 "github.com/rs/zerolog/log"
Akron8ef408b2021-08-02 22:11:04 +020039)
40
41const (
Akron2a4b9292021-08-04 15:35:22 +020042 PROPS = 1
43 SIGMA = 2
44 STATES = 3
45 NONE = 4
Akron2a4b9292021-08-04 15:35:22 +020046 DEBUG = false
47 MAGIC = "DATOK"
48 VERSION = uint16(1)
Akron03c92fe2021-08-09 14:07:57 +020049 FIRSTBIT uint32 = 1 << 31
50 SECONDBIT uint32 = 1 << 30
51 RESTBIT uint32 = ^uint32(0) &^ (FIRSTBIT | SECONDBIT)
Akron8ef408b2021-08-02 22:11:04 +020052)
53
Akron03c92fe2021-08-09 14:07:57 +020054// Serialization is always little endian
Akron6247a5d2021-08-03 19:18:28 +020055var bo binary.ByteOrder = binary.LittleEndian
56
Akron8ef408b2021-08-02 22:11:04 +020057type mapping struct {
58 source int
Akron3fdfec62021-08-04 11:40:10 +020059 target uint32
Akron8ef408b2021-08-02 22:11:04 +020060}
61
62type edge struct {
Akron83e75a22021-08-04 13:14:06 +020063 inSym int
64 outSym int
65 end int
66 nontoken bool
67 tokenend bool
Akron8ef408b2021-08-02 22:11:04 +020068}
69
Akron03c92fe2021-08-09 14:07:57 +020070// Tokenizer is the intermediate representation
71// of the tokenizer.
Akron8ef408b2021-08-02 22:11:04 +020072type Tokenizer struct {
Akron740f3d72021-08-03 12:12:34 +020073 sigmaRev map[int]rune
74 arcCount int
Akron740f3d72021-08-03 12:12:34 +020075 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020076 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020077
78 // Special symbols in sigma
79 epsilon int
80 unknown int
81 identity int
82 final int
Akron03c92fe2021-08-09 14:07:57 +020083 tokenend int
Akron8ef408b2021-08-02 22:11:04 +020084}
85
Akron03c92fe2021-08-09 14:07:57 +020086// DaTokenizer represents a tokenizer implemented as a
87// Double Array FSA.
Akronf2120ca2021-08-03 16:26:41 +020088type DaTokenizer struct {
Akron03a3c612021-08-04 11:51:27 +020089 sigma map[rune]int
90 maxSize int
91 loadFactor float64
92 array []uint32
Akron3f8571a2021-08-05 11:18:10 +020093 // lastFilledBase uint32
Akronc17f1ca2021-08-03 19:47:27 +020094
95 // Special symbols in sigma
96 epsilon int
97 unknown int
98 identity int
99 final int
Akron03c92fe2021-08-09 14:07:57 +0200100 tokenend int
Akronf2120ca2021-08-03 16:26:41 +0200101}
102
Akron03c92fe2021-08-09 14:07:57 +0200103// ParseFoma reads the FST from a foma file
104// and creates an internal representation,
105// in case it follows the tokenizer's convention.
Akron64ffd9a2021-08-03 19:55:21 +0200106func LoadFomaFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200107 f, err := os.Open(file)
108 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200109 log.Error().Err(err)
110 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200111 }
112 defer f.Close()
113
114 gz, err := gzip.NewReader(f)
115 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200116 log.Error().Err(err)
117 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200118 }
119 defer gz.Close()
120
Akron3fdfec62021-08-04 11:40:10 +0200121 return ParseFoma(gz)
Akron8ef408b2021-08-02 22:11:04 +0200122}
123
Akron03c92fe2021-08-09 14:07:57 +0200124// ParseFoma reads the FST from a foma file reader
125// and creates an internal representation,
126// in case it follows the tokenizer's convention.
Akron3fdfec62021-08-04 11:40:10 +0200127func ParseFoma(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200128 r := bufio.NewReader(ior)
129
130 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200131 sigmaRev: make(map[int]rune),
Akronc17f1ca2021-08-03 19:47:27 +0200132 epsilon: -1,
133 unknown: -1,
134 identity: -1,
135 final: -1,
Akron03c92fe2021-08-09 14:07:57 +0200136 tokenend: -1,
Akron8ef408b2021-08-02 22:11:04 +0200137 }
138
Akron740f3d72021-08-03 12:12:34 +0200139 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200140
141 mode := 0
142 var elem []string
143 var elemint [5]int
144
Akron03c92fe2021-08-09 14:07:57 +0200145 // Iterate over all lines of the file.
146 // This is mainly based on foma2js,
147 // licensed under the Apache License, version 2,
148 // and written by Mans Hulden.
Akron8ef408b2021-08-02 22:11:04 +0200149 for {
150 line, err := r.ReadString('\n')
151 if err != nil {
152 if err == io.EOF {
153 break
154 }
Akron740f3d72021-08-03 12:12:34 +0200155 log.Error().Err(err)
156 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200157 }
Akron8ef408b2021-08-02 22:11:04 +0200158
Akron439f4ec2021-08-09 15:45:38 +0200159 // Read parser mode for the following lines
160 if strings.HasPrefix(line, "##") {
161 if strings.HasPrefix(line, "##props##") {
162 mode = PROPS
163
164 } else if strings.HasPrefix(line, "##states##") {
165 mode = STATES
166
167 // Adds a final transition symbol to sigma
168 // written as '#' in Mizobuchi et al (2000)
169 tok.sigmaCount++
170 tok.final = tok.sigmaCount
171
172 } else if strings.HasPrefix(line, "##sigma##") {
173
174 mode = SIGMA
175
176 } else if strings.HasPrefix(line, "##end##") {
177
178 mode = NONE
179
180 } else if !strings.HasPrefix(line, "##foma-net") {
181 log.Error().Msg("Unknown input line")
182 break
183 }
Akron8ef408b2021-08-02 22:11:04 +0200184 continue
185 }
186
Akron439f4ec2021-08-09 15:45:38 +0200187 // Based on the current parser mode, interpret the lines
Akron8ef408b2021-08-02 22:11:04 +0200188 switch mode {
189 case PROPS:
190 {
191 elem = strings.Split(line, " ")
192 /*
193 fmt.Println("arity: " + elem[0])
194 fmt.Println("arccount: " + elem[1])
195 fmt.Println("statecount: " + elem[2])
196 fmt.Println("linecount: " + elem[3])
197 fmt.Println("finalcount: " + elem[4])
198 fmt.Println("pathcount: " + elem[5])
199 fmt.Println("is_deterministic: " + elem[6])
200 fmt.Println("is_pruned: " + elem[7])
201 fmt.Println("is_minimized: " + elem[8])
202 fmt.Println("is_epsilon_free: " + elem[9])
203 fmt.Println("is_loop_free: " + elem[10])
204 fmt.Println("extras: " + elem[11])
205 fmt.Println("name: " + elem[12])
206 */
207 if elem[6] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200208 log.Error().Msg("The FST needs to be deterministic")
209 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200210 }
Akron439f4ec2021-08-09 15:45:38 +0200211
Akron8ef408b2021-08-02 22:11:04 +0200212 if elem[9] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200213 log.Error().Msg("The FST needs to be epsilon free")
214 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200215 }
216
217 elemint[0], err = strconv.Atoi(elem[1])
218 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200219 log.Error().Msg("Can't read arccount")
220 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200221 }
Akron740f3d72021-08-03 12:12:34 +0200222 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200223
Akron8ef408b2021-08-02 22:11:04 +0200224 elemint[0], err = strconv.Atoi(elem[2])
225 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200226 log.Error().Msg("Can't read statecount")
227 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200228 }
Akron439f4ec2021-08-09 15:45:38 +0200229
230 // States start at 1 in Mizobuchi et al (2000),
231 // as the state 0 is associated with a fail.
232 // Initialize states and transitions
Akron8ef408b2021-08-02 22:11:04 +0200233 tok.transitions = make([]map[int]*edge, elemint[0]+1)
234 continue
235 }
236 case STATES:
237 {
238 elem = strings.Split(line[0:len(line)-1], " ")
239 if elem[0] == "-1" {
Akron3610f102021-08-08 14:13:25 +0200240 if DEBUG {
241 fmt.Println("Skip", elem)
242 }
Akron8ef408b2021-08-02 22:11:04 +0200243 continue
244 }
245 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200246 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200247 fmt.Println("Unable to translate", elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200248 break
249 }
Akron8ef408b2021-08-02 22:11:04 +0200250
251 if len(elem) > 1 {
252 elemint[1], err = strconv.Atoi(elem[1])
253 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200254 fmt.Println("Unable to translate", elem[1])
Akron8ef408b2021-08-02 22:11:04 +0200255 break
256 }
257 if len(elem) > 2 {
258 elemint[2], err = strconv.Atoi(elem[2])
259 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200260 fmt.Println("Unable to translate", elem[2])
Akron8ef408b2021-08-02 22:11:04 +0200261 break
262 }
263 if len(elem) > 3 {
264 elemint[3], err = strconv.Atoi(elem[3])
265 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200266 fmt.Println("Unable to translate", elem[3])
Akron8ef408b2021-08-02 22:11:04 +0200267 break
268 }
269 if len(elem) > 4 {
270 elemint[4], err = strconv.Atoi(elem[4])
271 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200272 fmt.Println("Unable to translate", elem[4])
Akron8ef408b2021-08-02 22:11:04 +0200273 break
274 }
275 }
276 }
277 }
278 }
279
280 switch len(elem) {
281 case 5:
282 {
Akron740f3d72021-08-03 12:12:34 +0200283 state = elemint[0]
284 inSym = elemint[1]
285 outSym = elemint[2]
286 end = elemint[3]
287 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200288 }
289 case 4:
290 {
291 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200292 state = elemint[0]
293 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200294 } else {
Akron740f3d72021-08-03 12:12:34 +0200295 state = elemint[0]
296 inSym = elemint[1]
297 end = elemint[2]
298 final = elemint[3]
299 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200300 }
301 }
302 case 3:
303 {
Akron740f3d72021-08-03 12:12:34 +0200304 inSym = elemint[0]
305 outSym = elemint[1]
306 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200307 }
308 case 2:
309 {
Akron740f3d72021-08-03 12:12:34 +0200310 inSym = elemint[0]
311 end = elemint[1]
312 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200313 }
314 }
315
Akron83e75a22021-08-04 13:14:06 +0200316 nontoken := false
317 tokenend := false
318
Akron439f4ec2021-08-09 15:45:38 +0200319 // While the states in foma start with 0, the states in the
320 // Mizobuchi FSA start with one - so we increase every state by 1.
321 // We also increase sigma by 1, so there are no 0 transitions.
Akron524c5432021-08-05 14:14:27 +0200322 inSym++
323 outSym++
324
Akron439f4ec2021-08-09 15:45:38 +0200325 // Only a limited list of transitions are allowed
Akron740f3d72021-08-03 12:12:34 +0200326 if inSym != outSym {
Akron03c92fe2021-08-09 14:07:57 +0200327 if outSym == tok.tokenend {
Akron83e75a22021-08-04 13:14:06 +0200328 tokenend = true
329 } else if outSym == tok.epsilon {
330 nontoken = true
331 } else {
Akron740f3d72021-08-03 12:12:34 +0200332 log.Error().Msg(
333 "Unsupported transition: " +
334 strconv.Itoa(state) +
335 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200336 " (" +
Akron740f3d72021-08-03 12:12:34 +0200337 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200338 ":" +
Akron740f3d72021-08-03 12:12:34 +0200339 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200340 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200341 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200342 ":" +
Akron740f3d72021-08-03 12:12:34 +0200343 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200344 ")")
Akron740f3d72021-08-03 12:12:34 +0200345 os.Exit(1)
Akron75ebe7f2021-08-03 10:34:10 +0200346 }
Akron83e75a22021-08-04 13:14:06 +0200347
Akron83e75a22021-08-04 13:14:06 +0200348 } else if inSym == tok.epsilon {
Akron439f4ec2021-08-09 15:45:38 +0200349 log.Error().Msg("General epsilon transitions are not supported")
Akron068874c2021-08-04 15:19:56 +0200350 os.Exit(1)
Akron8ef408b2021-08-02 22:11:04 +0200351 }
352
Akron03c92fe2021-08-09 14:07:57 +0200353 // Create an edge based on the collected information
Akron8ef408b2021-08-02 22:11:04 +0200354 targetObj := &edge{
Akron83e75a22021-08-04 13:14:06 +0200355 inSym: inSym,
356 outSym: outSym,
357 end: end + 1,
358 tokenend: tokenend,
359 nontoken: nontoken,
Akron8ef408b2021-08-02 22:11:04 +0200360 }
361
Akron740f3d72021-08-03 12:12:34 +0200362 // Initialize outgoing states
363 if tok.transitions[state+1] == nil {
364 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200365 }
366
Akron740f3d72021-08-03 12:12:34 +0200367 // Ignore transitions with invalid symbols
368 if inSym >= 0 {
369 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200370 }
Akron8ef408b2021-08-02 22:11:04 +0200371
Akron740f3d72021-08-03 12:12:34 +0200372 // Add final transition
373 if final == 1 {
Akron03c92fe2021-08-09 14:07:57 +0200374 // TODO:
375 // Maybe this is less relevant for tokenizers
Akronc17f1ca2021-08-03 19:47:27 +0200376 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200377 }
378
Akronb4bbb472021-08-09 11:49:38 +0200379 if DEBUG {
Akron740f3d72021-08-03 12:12:34 +0200380 fmt.Println("Add",
381 state+1, "->", end+1,
382 "(",
383 inSym,
384 ":",
385 outSym,
386 ") (",
387 string(tok.sigmaRev[inSym]),
388 ":",
389 string(tok.sigmaRev[outSym]),
Akron524c5432021-08-05 14:14:27 +0200390 ")",
391 ";",
392 "TE:", tokenend,
Akron3610f102021-08-08 14:13:25 +0200393 "NT:", nontoken,
394 "FIN:", final)
Akron740f3d72021-08-03 12:12:34 +0200395 }
Akron75ebe7f2021-08-03 10:34:10 +0200396
Akron8ef408b2021-08-02 22:11:04 +0200397 continue
398 }
399 case SIGMA:
400 {
401 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
402
403 // Turn string into sigma id
404 number, err := strconv.Atoi(elem[0])
405
Akron524c5432021-08-05 14:14:27 +0200406 // ID needs to be > 1
407 number++
408
Akron8ef408b2021-08-02 22:11:04 +0200409 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200410 log.Error().Err(err)
411 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200412 }
413
Akron740f3d72021-08-03 12:12:34 +0200414 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200415
416 var symbol rune
417
418 // Read rune
419 if utf8.RuneCountInString(elem[1]) == 1 {
420 symbol = []rune(elem[1])[0]
421
Akron8ef408b2021-08-02 22:11:04 +0200422 } else if utf8.RuneCountInString(elem[1]) > 1 {
Akron439f4ec2021-08-09 15:45:38 +0200423
424 // Probably a MCS
Akron8ef408b2021-08-02 22:11:04 +0200425 switch elem[1] {
426 case "@_EPSILON_SYMBOL_@":
427 {
Akronc17f1ca2021-08-03 19:47:27 +0200428 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200429 }
430 case "@_UNKNOWN_SYMBOL_@":
431 {
Akronc17f1ca2021-08-03 19:47:27 +0200432 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200433 }
434
435 case "@_IDENTITY_SYMBOL_@":
436 {
Akronc17f1ca2021-08-03 19:47:27 +0200437 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200438 }
Akron03c92fe2021-08-09 14:07:57 +0200439
440 case "@_TOKEN_SYMBOL_@":
441 {
442 tok.tokenend = number
Akron03c92fe2021-08-09 14:07:57 +0200443 }
Akron8ef408b2021-08-02 22:11:04 +0200444 default:
Akron740f3d72021-08-03 12:12:34 +0200445 {
446 log.Error().Msg("MCS not supported: " + line)
447 os.Exit(1)
448 }
Akron8ef408b2021-08-02 22:11:04 +0200449 }
Akron439f4ec2021-08-09 15:45:38 +0200450 continue
Akron8ef408b2021-08-02 22:11:04 +0200451
Akron740f3d72021-08-03 12:12:34 +0200452 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200453 line, err = r.ReadString('\n')
454 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200455 log.Error().Err(err)
456 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200457 }
458 if len(line) != 1 {
Akron740f3d72021-08-03 12:12:34 +0200459 log.Error().Msg("MCS not supported:" + line)
460 os.Exit(0)
Akron8ef408b2021-08-02 22:11:04 +0200461 }
Akron03c92fe2021-08-09 14:07:57 +0200462 symbol = rune('\n')
Akron8ef408b2021-08-02 22:11:04 +0200463 }
464
Akron740f3d72021-08-03 12:12:34 +0200465 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200466 }
467 }
468 }
469
470 return tok
471}
472
Akron64ffd9a2021-08-03 19:55:21 +0200473// Set alphabet A to the list of all symbols
474// outgoing from s
Akron439f4ec2021-08-09 15:45:38 +0200475func (tok *Tokenizer) getSet(s int, A *[]int) {
Akron64ffd9a2021-08-03 19:55:21 +0200476 for a := range tok.transitions[s] {
477 *A = append(*A, a)
478 }
479
480 // Not required, but simplifies bug hunting
Akron439f4ec2021-08-09 15:45:38 +0200481 // sort.Ints(*A)
Akron64ffd9a2021-08-03 19:55:21 +0200482}
483
Akron439f4ec2021-08-09 15:45:38 +0200484// ToDoubleArray turns the intermediate tokenizer representation
485// into a double array representation.
486//
487// This is based on Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200488func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
489
490 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +0200491 sigma: make(map[rune]int),
492 loadFactor: -1,
493 final: tok.final,
494 unknown: tok.unknown,
495 identity: tok.identity,
496 epsilon: tok.epsilon,
Akron03c92fe2021-08-09 14:07:57 +0200497 tokenend: tok.tokenend,
Akron3f8571a2021-08-05 11:18:10 +0200498 // lastFilledBase: 1,
Akronf2120ca2021-08-03 16:26:41 +0200499 }
500
501 for num, sym := range tok.sigmaRev {
502 dat.sigma[sym] = num
503 }
Akron8ef408b2021-08-02 22:11:04 +0200504
505 mark := 0
506 size := 0
507
Akron439f4ec2021-08-09 15:45:38 +0200508 // Create a mapping from s (in Ms aka Intermediate FSA)
509 // to t (in Mt aka Double Array FSA)
Akron740f3d72021-08-03 12:12:34 +0200510 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200511
Akron439f4ec2021-08-09 15:45:38 +0200512 // Initialize with the start state
Akron8ef408b2021-08-02 22:11:04 +0200513 table[size] = &mapping{source: 1, target: 1}
514 size++
515
Akron740f3d72021-08-03 12:12:34 +0200516 // Allocate space for the outgoing symbol range
517 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200518
519 for mark < size {
520 s := table[mark].source // This is a state in Ms
521 t := table[mark].target // This is a state in Mt
522 mark++
Akron740f3d72021-08-03 12:12:34 +0200523
524 // Following the paper, here the state t can be remembered
525 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200526 A = A[:0]
Akron439f4ec2021-08-09 15:45:38 +0200527 tok.getSet(s, &A)
Akron8ef408b2021-08-02 22:11:04 +0200528
Akron740f3d72021-08-03 12:12:34 +0200529 // Set base to the first free slot in the double array
Akronf2120ca2021-08-03 16:26:41 +0200530 dat.setBase(t, dat.xCheck(A))
Akron8ef408b2021-08-02 22:11:04 +0200531
Akron773b1ef2021-08-03 17:37:20 +0200532 // TODO:
Akron068874c2021-08-04 15:19:56 +0200533 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200534 // outdegree of .end
535
Akron740f3d72021-08-03 12:12:34 +0200536 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200537 for _, a := range A {
538
Akronc17f1ca2021-08-03 19:47:27 +0200539 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200540
Akron740f3d72021-08-03 12:12:34 +0200541 // Aka g(s, a)
542 s1 := tok.transitions[s][a].end
Akron8ef408b2021-08-02 22:11:04 +0200543
Akron740f3d72021-08-03 12:12:34 +0200544 // Store the transition
Akron3fdfec62021-08-04 11:40:10 +0200545 t1 := dat.getBase(t) + uint32(a)
Akronf2120ca2021-08-03 16:26:41 +0200546 dat.setCheck(t1, t)
Akron8ef408b2021-08-02 22:11:04 +0200547
Akron439f4ec2021-08-09 15:45:38 +0200548 if DEBUG {
Akron524c5432021-08-05 14:14:27 +0200549 fmt.Println("Translate transition",
550 s, "->", s1, "(", a, ")", "to", t, "->", t1)
551 }
552
Akron83e75a22021-08-04 13:14:06 +0200553 // Mark the state as being the target of a nontoken transition
554 if tok.transitions[s][a].nontoken {
Akron068874c2021-08-04 15:19:56 +0200555 dat.setNonToken(t1, true)
Akron524c5432021-08-05 14:14:27 +0200556 if DEBUG {
557 fmt.Println("Set", t1, "to nontoken")
558 }
Akron83e75a22021-08-04 13:14:06 +0200559 }
560
Akron84d68e62021-08-04 17:06:52 +0200561 // Mark the state as being the target of a tokenend transition
562 if tok.transitions[s][a].tokenend {
563 dat.setTokenEnd(t1, true)
Akron524c5432021-08-05 14:14:27 +0200564 if DEBUG {
565 fmt.Println("Set", t1, "to tokenend")
566 }
Akron84d68e62021-08-04 17:06:52 +0200567 }
568
Akron740f3d72021-08-03 12:12:34 +0200569 // Check for representative states
Akron439f4ec2021-08-09 15:45:38 +0200570 r := stateAlreadyInTable(s1, table, size)
Akron740f3d72021-08-03 12:12:34 +0200571
Akron439f4ec2021-08-09 15:45:38 +0200572 // No representative found
Akron8ef408b2021-08-02 22:11:04 +0200573 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200574 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200575 table[size] = &mapping{source: s1, target: t1}
576 size++
577 } else {
Akron740f3d72021-08-03 12:12:34 +0200578 // Overwrite with the representative state
Akron3fdfec62021-08-04 11:40:10 +0200579 dat.setBase(t1, r)
580 dat.setSeparate(t1, true)
Akron8ef408b2021-08-02 22:11:04 +0200581 }
582 } else {
Akron740f3d72021-08-03 12:12:34 +0200583 // Store a final transition
Akron3fdfec62021-08-04 11:40:10 +0200584 dat.setCheck(dat.getBase(t)+uint32(dat.final), t)
Akron8ef408b2021-08-02 22:11:04 +0200585 }
586 }
587 }
588
589 // Following Mizobuchi et al (2000) the size of the
590 // FSA should be stored in check(1).
Akron3fdfec62021-08-04 11:40:10 +0200591 dat.setSize(dat.maxSize + 1)
Akronf2120ca2021-08-03 16:26:41 +0200592 dat.array = dat.array[:dat.maxSize+1]
593 return dat
Akron8ef408b2021-08-02 22:11:04 +0200594}
595
Akron8ef408b2021-08-02 22:11:04 +0200596// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200597// exists and return this as a representative.
598// Currently iterates through the whole table
599// in a bruteforce manner.
Akron439f4ec2021-08-09 15:45:38 +0200600func stateAlreadyInTable(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200601 for x := 0; x < size; x++ {
602 if table[x].source == s {
603 return table[x].target
604 }
605 }
606 return 0
607}
608
Akron64ffd9a2021-08-03 19:55:21 +0200609// Resize double array when necessary
610func (dat *DaTokenizer) resize(l int) {
611 // TODO:
612 // This is a bit too aggressive atm and should be calmed down.
613 if len(dat.array) <= l {
Akron3fdfec62021-08-04 11:40:10 +0200614 dat.array = append(dat.array, make([]uint32, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200615 }
Akron64ffd9a2021-08-03 19:55:21 +0200616}
Akronc9d84a62021-08-03 15:56:03 +0200617
Akron64ffd9a2021-08-03 19:55:21 +0200618// Set base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200619func (dat *DaTokenizer) setBase(p uint32, v uint32) {
620 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200621 if dat.maxSize < l {
Akron439f4ec2021-08-09 15:45:38 +0200622 dat.resize(l)
Akron64ffd9a2021-08-03 19:55:21 +0200623 dat.maxSize = l
624 }
Akron439f4ec2021-08-09 15:45:38 +0200625 dat.array[l-1] = v
626}
627
628// Get base value in double array
629func (dat *DaTokenizer) getBase(p uint32) uint32 {
630 if int(p*2) > dat.maxSize {
631 return 0
632 }
633 return dat.array[p*2] & RESTBIT
634}
635
636// Set check value in double array
637func (dat *DaTokenizer) setCheck(p uint32, v uint32) {
638 l := int(p*2 + 1)
639 if dat.maxSize < l {
640 dat.resize(l)
641 dat.maxSize = l
642 }
643 dat.array[l] = v
644}
645
646// Get check value in double array
647func (dat *DaTokenizer) getCheck(p uint32) uint32 {
648 if int((p*2)+1) > dat.maxSize {
649 return 0
650 }
651 return dat.array[(p*2)+1] & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200652}
653
Akron3fdfec62021-08-04 11:40:10 +0200654// Returns true if a state is separate pointing to a representative
655func (dat *DaTokenizer) isSeparate(p uint32) bool {
Akron03c92fe2021-08-09 14:07:57 +0200656 return dat.array[p*2]&FIRSTBIT != 0
Akron3fdfec62021-08-04 11:40:10 +0200657}
658
659// Mark a state as separate pointing to a representative
660func (dat *DaTokenizer) setSeparate(p uint32, sep bool) {
661 if sep {
Akron03c92fe2021-08-09 14:07:57 +0200662 dat.array[p*2] |= FIRSTBIT
Akron3fdfec62021-08-04 11:40:10 +0200663 } else {
Akron03c92fe2021-08-09 14:07:57 +0200664 dat.array[p*2] &= (RESTBIT | SECONDBIT)
Akron3fdfec62021-08-04 11:40:10 +0200665 }
666}
667
Akron83e75a22021-08-04 13:14:06 +0200668// Returns true if a state is the target of a nontoken transition
669func (dat *DaTokenizer) isNonToken(p uint32) bool {
Akron03c92fe2021-08-09 14:07:57 +0200670 return dat.array[p*2+1]&FIRSTBIT != 0
Akron83e75a22021-08-04 13:14:06 +0200671}
672
673// Mark a state as being the target of a nontoken transition
674func (dat *DaTokenizer) setNonToken(p uint32, sep bool) {
675 if sep {
Akron03c92fe2021-08-09 14:07:57 +0200676 dat.array[p*2+1] |= FIRSTBIT
Akron83e75a22021-08-04 13:14:06 +0200677 } else {
Akron03c92fe2021-08-09 14:07:57 +0200678 dat.array[p*2+1] &= (RESTBIT | SECONDBIT)
Akron83e75a22021-08-04 13:14:06 +0200679 }
680}
681
Akron84d68e62021-08-04 17:06:52 +0200682// Returns true if a state is the target of a tokenend transition
683func (dat *DaTokenizer) isTokenEnd(p uint32) bool {
Akron03c92fe2021-08-09 14:07:57 +0200684 return dat.array[p*2+1]&SECONDBIT != 0
Akron84d68e62021-08-04 17:06:52 +0200685}
686
687// Mark a state as being the target of a tokenend transition
688func (dat *DaTokenizer) setTokenEnd(p uint32, sep bool) {
689 if sep {
Akron03c92fe2021-08-09 14:07:57 +0200690 dat.array[p*2+1] |= SECONDBIT
Akron84d68e62021-08-04 17:06:52 +0200691 } else {
Akron03c92fe2021-08-09 14:07:57 +0200692 dat.array[p*2+1] &= (RESTBIT | FIRSTBIT)
Akron84d68e62021-08-04 17:06:52 +0200693 }
694}
695
Akron64ffd9a2021-08-03 19:55:21 +0200696// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200697func (dat *DaTokenizer) setSize(v int) {
698 dat.setCheck(1, uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200699}
700
701// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200702func (dat *DaTokenizer) GetSize() int {
703 return int(dat.getCheck(1))
Akron8ef408b2021-08-02 22:11:04 +0200704}
705
706// Based on Mizobuchi et al (2000), p. 124
707// This iterates for every state through the complete double array
708// structure until it finds a gap that fits all outgoing transitions
709// of the state. This is extremely slow, but is only necessary in the
710// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200711func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200712
713 // Start at the first entry of the double array list
Akron3f8571a2021-08-05 11:18:10 +0200714 base := uint32(1) // dat.lastFilledBase
715 // skip := false
Akron8ef408b2021-08-02 22:11:04 +0200716OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200717
Akron3f8571a2021-08-05 11:18:10 +0200718 /*
719 if !skip {
720 if dat.getCheck(base) != 0 {
721 dat.lastFilledBase = base
722 } else {
723 skip = true
724 }
725 }
726 */
727
Akron740f3d72021-08-03 12:12:34 +0200728 // Resize the array if necessary
Akron3fdfec62021-08-04 11:40:10 +0200729 dat.resize((int(base) + dat.final) * 2)
Akron8ef408b2021-08-02 22:11:04 +0200730 for _, a := range symbols {
Akron3fdfec62021-08-04 11:40:10 +0200731 if dat.getCheck(base+uint32(a)) != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200732 base++
733 goto OVERLAP
734 }
735 }
Akron8ef408b2021-08-02 22:11:04 +0200736 return base
737}
738
Akron3610f102021-08-08 14:13:25 +0200739// List all outgoing transitions for a state
740// for testing purposes
741func (dat *DaTokenizer) outgoing(t uint32) []int {
742
743 valid := make([]int, 0, len(dat.sigma))
744
745 for _, a := range dat.sigma {
746 t1 := dat.getBase(t) + uint32(a)
747 if t1 <= dat.getCheck(1) && dat.getCheck(t1) == t {
748 valid = append(valid, a)
749 }
750 }
751
752 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
753 t1 := dat.getBase(t) + uint32(a)
754 if t1 <= dat.getCheck(1) && dat.getCheck(t1) == t {
755 valid = append(valid, -1*a)
756 }
757 }
758
759 sort.Ints(valid)
760
761 return valid
762}
763
Akron03a3c612021-08-04 11:51:27 +0200764// LoadFactor as defined in Kanda et al (2018),
765// i.e. the proportion of non-empty elements to all elements.
766func (dat *DaTokenizer) LoadFactor() float64 {
Akrond66a9262021-08-03 17:09:09 +0200767
Akron03a3c612021-08-04 11:51:27 +0200768 // Cache the loadfactor
Akron3f8571a2021-08-05 11:18:10 +0200769 if dat.loadFactor > 0 {
Akron03a3c612021-08-04 11:51:27 +0200770 return dat.loadFactor
Akron773b1ef2021-08-03 17:37:20 +0200771 }
Akrond66a9262021-08-03 17:09:09 +0200772 nonEmpty := 0
773 all := len(dat.array) / 2
774 for x := 1; x <= len(dat.array); x = x + 2 {
775 if dat.array[x] != 0 {
776 nonEmpty++
777 }
778 }
Akron03a3c612021-08-04 11:51:27 +0200779 dat.loadFactor = float64(nonEmpty) / float64(all) * 100
780 return dat.loadFactor
Akrond66a9262021-08-03 17:09:09 +0200781}
782
Akron439f4ec2021-08-09 15:45:38 +0200783// Save stores the double array data in a file
Akron3a063ef2021-08-05 19:36:35 +0200784func (dat *DaTokenizer) Save(file string) (n int64, err error) {
785 f, err := os.Create(file)
786 if err != nil {
787 log.Error().Err(err)
Akron439f4ec2021-08-09 15:45:38 +0200788 return 0, err
Akron3a063ef2021-08-05 19:36:35 +0200789 }
790 defer f.Close()
791 gz := gzip.NewWriter(f)
792 defer gz.Close()
793 n, err = dat.WriteTo(gz)
794 if err != nil {
795 log.Error().Err(err)
796 return n, err
797 }
798 gz.Flush()
799 return n, nil
800}
801
802// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200803func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
804
Akron3a063ef2021-08-05 19:36:35 +0200805 wb := bufio.NewWriter(w)
806 defer wb.Flush()
807
Akron6247a5d2021-08-03 19:18:28 +0200808 // Store magical header
Akron3a063ef2021-08-05 19:36:35 +0200809 all, err := wb.Write([]byte(MAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200810 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200811 log.Error().Err(err)
812 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200813 }
814
815 // Get sigma as a list
816 sigmalist := make([]rune, len(dat.sigma)+16)
817 max := 0
818 for sym, num := range dat.sigma {
819 sigmalist[num] = sym
820 if num > max {
821 max = num
822 }
823 }
824
825 sigmalist = sigmalist[:max+1]
826
Akron3f8571a2021-08-05 11:18:10 +0200827 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200828 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200829 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
830 bo.PutUint16(buf[4:6], uint16(dat.unknown))
831 bo.PutUint16(buf[6:8], uint16(dat.identity))
832 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200833 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akron3f8571a2021-08-05 11:18:10 +0200834 bo.PutUint32(buf[12:16], uint32(len(dat.array)))
Akron3a063ef2021-08-05 19:36:35 +0200835 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200836 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200837 log.Error().Err(err)
838 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200839 }
840
841 all += more
842
Akron6247a5d2021-08-03 19:18:28 +0200843 // Write sigma
844 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200845
Akron3a063ef2021-08-05 19:36:35 +0200846 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200847 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200848 log.Error().Err(err)
849 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200850 }
851 all += more
852 }
Akron439f4ec2021-08-09 15:45:38 +0200853
Akron6247a5d2021-08-03 19:18:28 +0200854 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200855 log.Error().Err(err)
856 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200857 }
Akron6247a5d2021-08-03 19:18:28 +0200858
859 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200860 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200861 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200862 log.Error().Err(err)
863 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200864 }
865 all += more
866
Akron3a063ef2021-08-05 19:36:35 +0200867 for x := 0; x < len(dat.array); x++ {
868 // for _, d := range dat.array {
869 bo.PutUint32(buf[0:4], dat.array[x])
870 more, err := wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200871 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200872 log.Error().Err(err)
873 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200874 }
Akron439f4ec2021-08-09 15:45:38 +0200875 all += more
Akron3a063ef2021-08-05 19:36:35 +0200876 if more != 4 {
877 log.Error().Msg("Can not write uint32")
878 return int64(all), err
879 }
Akron6247a5d2021-08-03 19:18:28 +0200880 }
881
882 return int64(all), err
883}
884
Akron439f4ec2021-08-09 15:45:38 +0200885// LoadDatokFile reads a double array represented tokenizer
886// from a file.
Akron3f8571a2021-08-05 11:18:10 +0200887func LoadDatokFile(file string) *DaTokenizer {
888 f, err := os.Open(file)
889 if err != nil {
890 log.Error().Err(err)
891 os.Exit(0)
892 }
893 defer f.Close()
894
895 gz, err := gzip.NewReader(f)
896 if err != nil {
897 log.Error().Err(err)
898 os.Exit(0)
899 }
900 defer gz.Close()
901
Akron3a063ef2021-08-05 19:36:35 +0200902 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +0200903 return ParseDatok(gz)
904}
905
Akron439f4ec2021-08-09 15:45:38 +0200906// LoadDatokFile reads a double array represented tokenizer
907// from an io.Reader
Akron3f8571a2021-08-05 11:18:10 +0200908func ParseDatok(ior io.Reader) *DaTokenizer {
909
Akron439f4ec2021-08-09 15:45:38 +0200910 // Initialize tokenizer with default values
Akron3f8571a2021-08-05 11:18:10 +0200911 dat := &DaTokenizer{
912 sigma: make(map[rune]int),
913 epsilon: 0,
914 unknown: 0,
915 identity: 0,
916 final: 0,
917 loadFactor: 0,
918 }
919
920 r := bufio.NewReader(ior)
921
Akron3f8571a2021-08-05 11:18:10 +0200922 buf := make([]byte, 1024)
923 buf = buf[0:len(MAGIC)]
924
Akron439f4ec2021-08-09 15:45:38 +0200925 _, err := r.Read(buf)
Akron3f8571a2021-08-05 11:18:10 +0200926
927 if err != nil {
928 log.Error().Err(err)
929 return nil
930 }
931
Akron3f8571a2021-08-05 11:18:10 +0200932 if string(MAGIC) != string(buf) {
933 log.Error().Msg("Not a datok file")
934 return nil
935 }
936
Akron439f4ec2021-08-09 15:45:38 +0200937 more, err := io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +0200938 if err != nil {
939 log.Error().Err(err)
940 return nil
941 }
942
Akron439f4ec2021-08-09 15:45:38 +0200943 if more != 16 {
944 log.Error().Msg("Read bytes do not fit")
945 return nil
946 }
Akron3f8571a2021-08-05 11:18:10 +0200947
Akron3a063ef2021-08-05 19:36:35 +0200948 version := bo.Uint16(buf[0:2])
949
950 if version != VERSION {
951 log.Error().Msg("Version not compatible")
952 return nil
953 }
954
Akron3f8571a2021-08-05 11:18:10 +0200955 dat.epsilon = int(bo.Uint16(buf[2:4]))
956 dat.unknown = int(bo.Uint16(buf[4:6]))
957 dat.identity = int(bo.Uint16(buf[6:8]))
958 dat.final = int(bo.Uint16(buf[8:10]))
959
960 sigmaCount := int(bo.Uint16(buf[10:12]))
961 arraySize := int(bo.Uint32(buf[12:16]))
962
Akron3a063ef2021-08-05 19:36:35 +0200963 // Shouldn't be relevant though
964 dat.maxSize = arraySize - 1
965
Akron3f8571a2021-08-05 11:18:10 +0200966 for x := 0; x < sigmaCount; x++ {
Akron439f4ec2021-08-09 15:45:38 +0200967 sym, _, err := r.ReadRune()
Akron3f8571a2021-08-05 11:18:10 +0200968 if err == nil && sym != 0 {
969 dat.sigma[sym] = x
970 }
Akron3f8571a2021-08-05 11:18:10 +0200971 }
972
Akron439f4ec2021-08-09 15:45:38 +0200973 _, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +0200974
975 if err != nil {
976 log.Error().Err(err)
977 return nil
978 }
979
Akron3f8571a2021-08-05 11:18:10 +0200980 if string("T") != string(buf[0:1]) {
981 log.Error().Msg("Not a datok file")
982 return nil
983 }
984
985 // Read based on length
986 dat.array = make([]uint32, arraySize)
987
988 for x := 0; x < arraySize; x++ {
Akron3a063ef2021-08-05 19:36:35 +0200989 more, err = io.ReadFull(r, buf[0:4])
Akron3f8571a2021-08-05 11:18:10 +0200990 if err != nil {
Akron3a063ef2021-08-05 19:36:35 +0200991 if err == io.EOF {
992 fmt.Println(arraySize, x)
993 break
994 }
Akron3f8571a2021-08-05 11:18:10 +0200995 log.Error().Err(err)
996 return nil
997 }
Akron439f4ec2021-08-09 15:45:38 +0200998 if more != 4 {
999 log.Error().Msg("Not enough bytes read")
1000 return nil
1001 }
1002
Akron3f8571a2021-08-05 11:18:10 +02001003 dat.array[x] = bo.Uint32(buf[0:4])
1004 }
1005
1006 return dat
1007}
1008
Akron740f3d72021-08-03 12:12:34 +02001009// Match an input string against the double array
1010// FSA.
1011//
1012// Based on Mizobuchi et al (2000), p. 129,
1013// with additional support for IDENTITY, UNKNOWN
1014// and EPSILON transitions.
Akron64ffd9a2021-08-03 19:55:21 +02001015func (dat *DaTokenizer) Match(input string) bool {
Akron465a0992021-08-03 11:28:48 +02001016 var a int
Akron3fdfec62021-08-04 11:40:10 +02001017 var tu uint32
Akron465a0992021-08-03 11:28:48 +02001018 var ok bool
1019
Akron3fdfec62021-08-04 11:40:10 +02001020 t := uint32(1) // Initial state
Akron740f3d72021-08-03 12:12:34 +02001021 chars := []rune(input)
1022 i := 0
1023
Akron49d27ee2021-08-03 11:58:13 +02001024 for i < len(chars) {
Akron64ffd9a2021-08-03 19:55:21 +02001025 a, ok = dat.sigma[chars[i]]
Akron730a79c2021-08-03 11:05:29 +02001026
Akron740f3d72021-08-03 12:12:34 +02001027 // Support identity symbol if character is not in sigma
Akron64ffd9a2021-08-03 19:55:21 +02001028 if !ok && dat.identity != -1 {
Akron740f3d72021-08-03 12:12:34 +02001029 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +02001030 fmt.Println("IDENTITY symbol", string(chars[i]), "->", dat.identity)
Akron740f3d72021-08-03 12:12:34 +02001031 }
Akron64ffd9a2021-08-03 19:55:21 +02001032 a = dat.identity
Akron740f3d72021-08-03 12:12:34 +02001033 } else if DEBUG {
Akron49d27ee2021-08-03 11:58:13 +02001034 fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
Akron730a79c2021-08-03 11:05:29 +02001035 }
Akron465a0992021-08-03 11:28:48 +02001036 tu = t
Akron730a79c2021-08-03 11:05:29 +02001037 CHECK:
Akron3fdfec62021-08-04 11:40:10 +02001038 t = dat.getBase(tu) + uint32(a)
Akron730a79c2021-08-03 11:05:29 +02001039
Akron740f3d72021-08-03 12:12:34 +02001040 // Check if the transition is valid according to the double array
Akron64ffd9a2021-08-03 19:55:21 +02001041 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +02001042
1043 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +02001044 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron730a79c2021-08-03 11:05:29 +02001045 }
Akron740f3d72021-08-03 12:12:34 +02001046
Akron64ffd9a2021-08-03 19:55:21 +02001047 if !ok && a == dat.identity {
Akron740f3d72021-08-03 12:12:34 +02001048 // Try again with unknown symbol, in case identity failed
1049 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +02001050 fmt.Println("UNKNOWN symbol", string(chars[i]), "->", dat.unknown)
Akron740f3d72021-08-03 12:12:34 +02001051 }
Akron64ffd9a2021-08-03 19:55:21 +02001052 a = dat.unknown
Akron740f3d72021-08-03 12:12:34 +02001053
Akron64ffd9a2021-08-03 19:55:21 +02001054 } else if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +02001055 // Try again with epsilon symbol, in case everything else failed
1056 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +02001057 fmt.Println("EPSILON symbol", string(chars[i]), "->", dat.epsilon)
Akron740f3d72021-08-03 12:12:34 +02001058 }
Akron64ffd9a2021-08-03 19:55:21 +02001059 a = dat.epsilon
Akron740f3d72021-08-03 12:12:34 +02001060 } else {
1061 break
1062 }
1063 goto CHECK
Akron3fdfec62021-08-04 11:40:10 +02001064 } else if dat.isSeparate(t) {
Akron730a79c2021-08-03 11:05:29 +02001065 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +02001066 t = dat.getBase(t)
Akron8ef408b2021-08-02 22:11:04 +02001067 }
Akron740f3d72021-08-03 12:12:34 +02001068
1069 // Transition is fine
Akron64ffd9a2021-08-03 19:55:21 +02001070 if a != dat.epsilon {
Akron740f3d72021-08-03 12:12:34 +02001071 // Character consumed
Akron49d27ee2021-08-03 11:58:13 +02001072 i++
1073 }
Akron83e75a22021-08-04 13:14:06 +02001074
Akron740f3d72021-08-03 12:12:34 +02001075 // TODO:
1076 // Prevent endless epsilon loops!
Akron8ef408b2021-08-02 22:11:04 +02001077 }
1078
Akron740f3d72021-08-03 12:12:34 +02001079 if i != len(chars) {
1080 if DEBUG {
1081 fmt.Println("Not at the end")
1082 }
Akron8ef408b2021-08-02 22:11:04 +02001083 return false
1084 }
1085
Akron465a0992021-08-03 11:28:48 +02001086FINALCHECK:
Akron740f3d72021-08-03 12:12:34 +02001087
1088 // Automaton is in a final state
Akron3fdfec62021-08-04 11:40:10 +02001089 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
Akron8ef408b2021-08-02 22:11:04 +02001090 return true
1091 }
Akron465a0992021-08-03 11:28:48 +02001092
Akron740f3d72021-08-03 12:12:34 +02001093 // Check epsilon transitions until a final state is reached
Akron465a0992021-08-03 11:28:48 +02001094 tu = t
Akron3fdfec62021-08-04 11:40:10 +02001095 t = dat.getBase(tu) + uint32(dat.epsilon)
Akron465a0992021-08-03 11:28:48 +02001096
Akron740f3d72021-08-03 12:12:34 +02001097 // Epsilon transition failed
Akron64ffd9a2021-08-03 19:55:21 +02001098 if t > dat.getCheck(1) || dat.getCheck(t) != tu {
Akron740f3d72021-08-03 12:12:34 +02001099 if DEBUG {
Akron64ffd9a2021-08-03 19:55:21 +02001100 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", tu)
Akron740f3d72021-08-03 12:12:34 +02001101 }
Akron465a0992021-08-03 11:28:48 +02001102 return false
Akron740f3d72021-08-03 12:12:34 +02001103
Akron3fdfec62021-08-04 11:40:10 +02001104 } else if dat.isSeparate(t) {
Akron465a0992021-08-03 11:28:48 +02001105 // Move to representative state
Akron3fdfec62021-08-04 11:40:10 +02001106 t = dat.getBase(t)
Akron465a0992021-08-03 11:28:48 +02001107 }
Akron740f3d72021-08-03 12:12:34 +02001108
Akron465a0992021-08-03 11:28:48 +02001109 goto FINALCHECK
Akron8ef408b2021-08-02 22:11:04 +02001110}
Akron068874c2021-08-04 15:19:56 +02001111
Akron439f4ec2021-08-09 15:45:38 +02001112// Show the current state of the buffer,
1113// for testing puroses
Akron3610f102021-08-08 14:13:25 +02001114func showBuffer(buffer []rune, buffo int, buffi int) string {
1115 out := make([]rune, 0, 1024)
1116 for x := 0; x < len(buffer); x++ {
1117 if buffi == x {
1118 out = append(out, '^')
1119 }
1120 if buffo == x {
1121 out = append(out, '[', buffer[x], ']')
1122 } else {
1123 out = append(out, buffer[x])
1124 }
1125 }
1126 return string(out)
1127}
1128
Akron84d68e62021-08-04 17:06:52 +02001129// Transduce an input string against the double array
Akron3610f102021-08-08 14:13:25 +02001130// FSA. The rules are always greedy. If the automaton fails,
1131// it takes the last possible token ending branch.
Akron068874c2021-08-04 15:19:56 +02001132//
1133// Based on Match with additional support
Akron84d68e62021-08-04 17:06:52 +02001134// for NONTOKEN and TOKENEND handling
Akron3f8571a2021-08-05 11:18:10 +02001135func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron068874c2021-08-04 15:19:56 +02001136 var a int
Akronb4bbb472021-08-09 11:49:38 +02001137 var t0 uint32
Akron84d68e62021-08-04 17:06:52 +02001138 var ok, nontoken, tokenend bool
Akron068874c2021-08-04 15:19:56 +02001139
Akron3610f102021-08-08 14:13:25 +02001140 // Remember the last position of a possible tokenend,
1141 // in case the automaton fails.
1142 epsilonState := uint32(0)
1143 epsilonOffset := 0
1144
1145 // Implement a low level buffer for full control,
1146 // however - it is probably better to introduce
1147 // this on a higher level with a io.Reader interface
1148 // The buffer stores a single word and may have white
1149 // space at the end (but not at the beginning).
1150 //
1151 // This is the only backtracking requirement because of
1152 // epsilon transitions, to support tokenizations like:
1153 // "this is an example|.| And it works." vs
1154 // "this is an example.com| application."
1155 buffer := make([]rune, 1024)
1156 buffo := 0 // Buffer offset
1157 buffi := 0 // Buffer length
1158
Akron3f8571a2021-08-05 11:18:10 +02001159 reader := bufio.NewReader(r)
1160 writer := bufio.NewWriter(w)
1161 defer writer.Flush()
Akron068874c2021-08-04 15:19:56 +02001162
Akron3f8571a2021-08-05 11:18:10 +02001163 t := uint32(1) // Initial state
Akron3f8571a2021-08-05 11:18:10 +02001164
1165 var char rune
1166 var err error
1167 eof := false
1168
1169 for {
1170
Akronb4bbb472021-08-09 11:49:38 +02001171 // Get from reader if buffer is empty
Akron3610f102021-08-08 14:13:25 +02001172 if buffo >= buffi {
Akron3f8571a2021-08-05 11:18:10 +02001173 char, _, err = reader.ReadRune()
Akron439f4ec2021-08-09 15:45:38 +02001174
1175 // No more runes to read
Akron3f8571a2021-08-05 11:18:10 +02001176 if err != nil {
1177 eof = true
1178 break
1179 }
Akron3610f102021-08-08 14:13:25 +02001180 buffer[buffi] = char
1181 buffi++
Akron3f8571a2021-08-05 11:18:10 +02001182 }
Akron3610f102021-08-08 14:13:25 +02001183
Akron3610f102021-08-08 14:13:25 +02001184 char = buffer[buffo]
Akron3610f102021-08-08 14:13:25 +02001185
1186 if DEBUG {
1187 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
1188 }
Akron3f8571a2021-08-05 11:18:10 +02001189
1190 a, ok = dat.sigma[char]
Akron068874c2021-08-04 15:19:56 +02001191
Akron439f4ec2021-08-09 15:45:38 +02001192 // Use identity symbol if character is not in sigma
Akron068874c2021-08-04 15:19:56 +02001193 if !ok && dat.identity != -1 {
Akron068874c2021-08-04 15:19:56 +02001194 a = dat.identity
Akron068874c2021-08-04 15:19:56 +02001195 }
Akron3610f102021-08-08 14:13:25 +02001196
Akronb4bbb472021-08-09 11:49:38 +02001197 t0 = t
1198
Akron439f4ec2021-08-09 15:45:38 +02001199 // Check for epsilon transitions and remember
Akronb4bbb472021-08-09 11:49:38 +02001200 if dat.getCheck(dat.getBase(t0)+uint32(dat.epsilon)) == t0 {
Akron03c92fe2021-08-09 14:07:57 +02001201 // Remember state for backtracking to last tokenend state
Akronb4bbb472021-08-09 11:49:38 +02001202 epsilonState = t0
Akron3610f102021-08-08 14:13:25 +02001203 epsilonOffset = buffo
1204 }
1205
Akron068874c2021-08-04 15:19:56 +02001206 CHECK:
1207 nontoken = false
Akron84d68e62021-08-04 17:06:52 +02001208 tokenend = false
1209
Akronb4bbb472021-08-09 11:49:38 +02001210 t = dat.getBase(t0) + uint32(a)
Akron068874c2021-08-04 15:19:56 +02001211
Akron524c5432021-08-05 14:14:27 +02001212 if DEBUG {
Akron439f4ec2021-08-09 15:45:38 +02001213 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t, dat.outgoing(t0))
Akron524c5432021-08-05 14:14:27 +02001214 }
1215
Akron068874c2021-08-04 15:19:56 +02001216 // Check if the transition is valid according to the double array
Akronb4bbb472021-08-09 11:49:38 +02001217 if t > dat.getCheck(1) || dat.getCheck(t) != t0 {
Akron068874c2021-08-04 15:19:56 +02001218
1219 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001220 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", t0)
Akron068874c2021-08-04 15:19:56 +02001221 }
1222
1223 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +02001224
Akron068874c2021-08-04 15:19:56 +02001225 // Try again with unknown symbol, in case identity failed
1226 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001227 fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +02001228 }
1229 a = dat.unknown
1230
1231 } else if a != dat.epsilon {
Akronb4bbb472021-08-09 11:49:38 +02001232
Akron068874c2021-08-04 15:19:56 +02001233 // Try again with epsilon symbol, in case everything else failed
Akronb4bbb472021-08-09 11:49:38 +02001234 t0 = epsilonState
Akron3610f102021-08-08 14:13:25 +02001235 epsilonState = 0 // reset
1236 buffo = epsilonOffset
Akron439f4ec2021-08-09 15:45:38 +02001237 a = dat.epsilon
1238
Akron3610f102021-08-08 14:13:25 +02001239 if DEBUG {
1240 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
1241 }
Akronb4bbb472021-08-09 11:49:38 +02001242
Akron068874c2021-08-04 15:19:56 +02001243 } else {
1244 break
1245 }
Akron068874c2021-08-04 15:19:56 +02001246
Akronb4bbb472021-08-09 11:49:38 +02001247 goto CHECK
1248
1249 }
1250
1251 // Move to representative state
1252 nontoken = dat.isNonToken(t)
1253 tokenend = dat.isTokenEnd(t)
1254
Akron439f4ec2021-08-09 15:45:38 +02001255 // Check for representative states
Akronb4bbb472021-08-09 11:49:38 +02001256 if dat.isSeparate(t) {
Akron068874c2021-08-04 15:19:56 +02001257 t = dat.getBase(t)
Akron84d68e62021-08-04 17:06:52 +02001258
Akron524c5432021-08-05 14:14:27 +02001259 if DEBUG {
1260 fmt.Println("Representative pointing to", t)
1261 }
Akron068874c2021-08-04 15:19:56 +02001262 }
1263
Akron439f4ec2021-08-09 15:45:38 +02001264 rewindBuffer := false
1265
1266 // Transition consumes a character
Akronb4bbb472021-08-09 11:49:38 +02001267 if a != dat.epsilon {
1268
Akron3610f102021-08-08 14:13:25 +02001269 buffo++
Akronb4bbb472021-08-09 11:49:38 +02001270
Akron439f4ec2021-08-09 15:45:38 +02001271 // Transition does not produce a character
1272 if nontoken && buffo == 1 {
Akron3610f102021-08-08 14:13:25 +02001273 if DEBUG {
1274 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
1275 }
Akron439f4ec2021-08-09 15:45:38 +02001276 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001277 }
Akron3f8571a2021-08-05 11:18:10 +02001278 }
Akron068874c2021-08-04 15:19:56 +02001279
Akron439f4ec2021-08-09 15:45:38 +02001280 if tokenend { // Transition marks the end of a token
Akron524c5432021-08-05 14:14:27 +02001281
Akron3610f102021-08-08 14:13:25 +02001282 data := []byte(string(buffer[:buffo]))
1283 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001284 fmt.Println("-> Flush buffer:", string(data), showBuffer(buffer, buffo, buffi))
Akron3610f102021-08-08 14:13:25 +02001285 }
1286 writer.Write(data)
Akron3f8571a2021-08-05 11:18:10 +02001287 writer.WriteRune('\n')
Akron439f4ec2021-08-09 15:45:38 +02001288 rewindBuffer = true
1289 }
Akron3610f102021-08-08 14:13:25 +02001290
Akron439f4ec2021-08-09 15:45:38 +02001291 // Rewind the buffer
1292 if rewindBuffer {
1293
1294 // TODO: Better as a ring buffer
Akron3610f102021-08-08 14:13:25 +02001295 for x, i := range buffer[buffo:buffi] {
1296 buffer[x] = i
1297 }
Akronb4bbb472021-08-09 11:49:38 +02001298
Akron3610f102021-08-08 14:13:25 +02001299 buffi -= buffo
1300 epsilonOffset -= buffo
1301 buffo = 0
1302 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001303 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
Akron3610f102021-08-08 14:13:25 +02001304 }
Akron84d68e62021-08-04 17:06:52 +02001305 }
1306
Akron068874c2021-08-04 15:19:56 +02001307 // TODO:
1308 // Prevent endless epsilon loops!
1309 }
1310
Akron439f4ec2021-08-09 15:45:38 +02001311 // Input reader is not yet finished
Akron3f8571a2021-08-05 11:18:10 +02001312 if !eof {
Akron068874c2021-08-04 15:19:56 +02001313 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001314 fmt.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +02001315 }
1316 return false
1317 }
1318
1319FINALCHECK:
1320
1321 // Automaton is in a final state
1322 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
Akronb4bbb472021-08-09 11:49:38 +02001323
1324 if buffi > 0 {
1325 data := []byte(string(buffer[:buffi]))
1326 if DEBUG {
1327 fmt.Println("-> Flush buffer:", string(data))
Akron3f8571a2021-08-05 11:18:10 +02001328 }
Akronb4bbb472021-08-09 11:49:38 +02001329 writer.Write(data)
Akrondf0a3ef2021-08-09 15:53:45 +02001330 if dat.isTokenEnd(t) {
1331 writer.WriteRune('\n')
1332 }
Akron84d68e62021-08-04 17:06:52 +02001333 }
1334
1335 // There may be a new line at the end, from an epsilon, so we go on!
Akron068874c2021-08-04 15:19:56 +02001336 return true
1337 }
1338
1339 // Check epsilon transitions until a final state is reached
Akronb4bbb472021-08-09 11:49:38 +02001340 t0 = t
1341 t = dat.getBase(t0) + uint32(dat.epsilon)
Akron068874c2021-08-04 15:19:56 +02001342
1343 // Epsilon transition failed
Akronb4bbb472021-08-09 11:49:38 +02001344 if t > dat.getCheck(1) || dat.getCheck(t) != t0 {
Akron068874c2021-08-04 15:19:56 +02001345 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001346 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", t0)
Akron068874c2021-08-04 15:19:56 +02001347 }
1348 return false
Akron068874c2021-08-04 15:19:56 +02001349 }
1350
Akronb4bbb472021-08-09 11:49:38 +02001351 if dat.isSeparate(t) {
1352 // Move to representative state
1353 t = dat.getBase(t)
1354 }
Akron3f8571a2021-08-05 11:18:10 +02001355
Akron439f4ec2021-08-09 15:45:38 +02001356 if dat.isTokenEnd(t) {
Akron3610f102021-08-08 14:13:25 +02001357 if buffi > 0 {
1358 data := []byte(string(buffer[:buffi]))
1359 if DEBUG {
1360 fmt.Println("-> Flush buffer:", string(data))
1361 }
1362 writer.Write(data)
1363 buffi = 0
1364 buffo = 0
1365 epsilonState = 0
1366 }
Akron3f8571a2021-08-05 11:18:10 +02001367 writer.WriteRune('\n')
Akron068874c2021-08-04 15:19:56 +02001368 }
1369
1370 goto FINALCHECK
1371}