blob: 1c75a336db16fa99cd856d2743bda2e9d972fef6 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
Akron03c92fe2021-08-09 14:07:57 +020013//
14// Serialization is little endian.
Akron83e75a22021-08-04 13:14:06 +020015
Akron740f3d72021-08-03 12:12:34 +020016// TODO:
Akron8e1d69b2021-08-12 17:38:49 +020017// - Turn sigma into an array instead of using a map.
Akron740f3d72021-08-03 12:12:34 +020018// - replace maxSize with the check value
Akron3a063ef2021-08-05 19:36:35 +020019// - Add checksum to serialization.
Akron03c92fe2021-08-09 14:07:57 +020020// - Introduce methods on BC array entries instead of
21// jumping into the entries all the time!
22// - Instead of memoizing the loadFactor, better remember
23// the number of set transitions
Akron4db3ecf2021-08-11 18:49:03 +020024// - Replace table with a map
Akron8e1d69b2021-08-12 17:38:49 +020025// - Mark epsilon transitions in bytes
Akron740f3d72021-08-03 12:12:34 +020026
Akron8ef408b2021-08-02 22:11:04 +020027import (
28 "bufio"
29 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020030 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020031 "fmt"
32 "io"
33 "os"
Akronc9d84a62021-08-03 15:56:03 +020034 "sort"
Akron8ef408b2021-08-02 22:11:04 +020035 "strconv"
36 "strings"
37 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020038
Akron527c10c2021-08-13 01:45:18 +020039 "log"
Akron8ef408b2021-08-02 22:11:04 +020040)
41
42const (
Akron2a4b9292021-08-04 15:35:22 +020043 PROPS = 1
44 SIGMA = 2
45 STATES = 3
46 NONE = 4
Akron2a4b9292021-08-04 15:35:22 +020047 DEBUG = false
48 MAGIC = "DATOK"
49 VERSION = uint16(1)
Akron03c92fe2021-08-09 14:07:57 +020050 FIRSTBIT uint32 = 1 << 31
51 SECONDBIT uint32 = 1 << 30
52 RESTBIT uint32 = ^uint32(0) &^ (FIRSTBIT | SECONDBIT)
Akron8ef408b2021-08-02 22:11:04 +020053)
54
Akron03c92fe2021-08-09 14:07:57 +020055// Serialization is always little endian
Akron6247a5d2021-08-03 19:18:28 +020056var bo binary.ByteOrder = binary.LittleEndian
57
Akron8ef408b2021-08-02 22:11:04 +020058type mapping struct {
59 source int
Akron3fdfec62021-08-04 11:40:10 +020060 target uint32
Akron8ef408b2021-08-02 22:11:04 +020061}
62
63type edge struct {
Akron83e75a22021-08-04 13:14:06 +020064 inSym int
65 outSym int
66 end int
67 nontoken bool
68 tokenend bool
Akron8ef408b2021-08-02 22:11:04 +020069}
70
Akron03c92fe2021-08-09 14:07:57 +020071// Tokenizer is the intermediate representation
72// of the tokenizer.
Akron8ef408b2021-08-02 22:11:04 +020073type Tokenizer struct {
Akron740f3d72021-08-03 12:12:34 +020074 sigmaRev map[int]rune
75 arcCount int
Akron740f3d72021-08-03 12:12:34 +020076 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020077 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020078
79 // Special symbols in sigma
80 epsilon int
81 unknown int
82 identity int
83 final int
Akron03c92fe2021-08-09 14:07:57 +020084 tokenend int
Akron8ef408b2021-08-02 22:11:04 +020085}
86
Akron03c92fe2021-08-09 14:07:57 +020087// DaTokenizer represents a tokenizer implemented as a
88// Double Array FSA.
Akronf2120ca2021-08-03 16:26:41 +020089type DaTokenizer struct {
Akron03a3c612021-08-04 11:51:27 +020090 sigma map[rune]int
91 maxSize int
92 loadFactor float64
93 array []uint32
Akron3f8571a2021-08-05 11:18:10 +020094 // lastFilledBase uint32
Akronc17f1ca2021-08-03 19:47:27 +020095
96 // Special symbols in sigma
97 epsilon int
98 unknown int
99 identity int
100 final int
Akron03c92fe2021-08-09 14:07:57 +0200101 tokenend int
Akronf2120ca2021-08-03 16:26:41 +0200102}
103
Akron03c92fe2021-08-09 14:07:57 +0200104// ParseFoma reads the FST from a foma file
105// and creates an internal representation,
106// in case it follows the tokenizer's convention.
Akron64ffd9a2021-08-03 19:55:21 +0200107func LoadFomaFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200108 f, err := os.Open(file)
109 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200110 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200111 return nil
Akron8ef408b2021-08-02 22:11:04 +0200112 }
113 defer f.Close()
114
115 gz, err := gzip.NewReader(f)
116 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200117 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200118 return nil
Akron8ef408b2021-08-02 22:11:04 +0200119 }
120 defer gz.Close()
121
Akron3fdfec62021-08-04 11:40:10 +0200122 return ParseFoma(gz)
Akron8ef408b2021-08-02 22:11:04 +0200123}
124
Akron03c92fe2021-08-09 14:07:57 +0200125// ParseFoma reads the FST from a foma file reader
126// and creates an internal representation,
127// in case it follows the tokenizer's convention.
Akron3fdfec62021-08-04 11:40:10 +0200128func ParseFoma(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200129 r := bufio.NewReader(ior)
130
131 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200132 sigmaRev: make(map[int]rune),
Akronc17f1ca2021-08-03 19:47:27 +0200133 epsilon: -1,
134 unknown: -1,
135 identity: -1,
136 final: -1,
Akron03c92fe2021-08-09 14:07:57 +0200137 tokenend: -1,
Akron8ef408b2021-08-02 22:11:04 +0200138 }
139
Akron740f3d72021-08-03 12:12:34 +0200140 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200141
142 mode := 0
143 var elem []string
144 var elemint [5]int
145
Akron03c92fe2021-08-09 14:07:57 +0200146 // Iterate over all lines of the file.
147 // This is mainly based on foma2js,
148 // licensed under the Apache License, version 2,
149 // and written by Mans Hulden.
Akron8ef408b2021-08-02 22:11:04 +0200150 for {
151 line, err := r.ReadString('\n')
152 if err != nil {
153 if err == io.EOF {
154 break
155 }
Akron527c10c2021-08-13 01:45:18 +0200156 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200157 return nil
Akron8ef408b2021-08-02 22:11:04 +0200158 }
Akron8ef408b2021-08-02 22:11:04 +0200159
Akron439f4ec2021-08-09 15:45:38 +0200160 // Read parser mode for the following lines
161 if strings.HasPrefix(line, "##") {
162 if strings.HasPrefix(line, "##props##") {
163 mode = PROPS
164
165 } else if strings.HasPrefix(line, "##states##") {
166 mode = STATES
167
168 // Adds a final transition symbol to sigma
169 // written as '#' in Mizobuchi et al (2000)
170 tok.sigmaCount++
171 tok.final = tok.sigmaCount
172
173 } else if strings.HasPrefix(line, "##sigma##") {
174
175 mode = SIGMA
176
177 } else if strings.HasPrefix(line, "##end##") {
178
179 mode = NONE
180
181 } else if !strings.HasPrefix(line, "##foma-net") {
Akron527c10c2021-08-13 01:45:18 +0200182 log.Print("Unknown input line")
Akron439f4ec2021-08-09 15:45:38 +0200183 break
184 }
Akron8ef408b2021-08-02 22:11:04 +0200185 continue
186 }
187
Akron439f4ec2021-08-09 15:45:38 +0200188 // Based on the current parser mode, interpret the lines
Akron8ef408b2021-08-02 22:11:04 +0200189 switch mode {
190 case PROPS:
191 {
192 elem = strings.Split(line, " ")
193 /*
194 fmt.Println("arity: " + elem[0])
195 fmt.Println("arccount: " + elem[1])
196 fmt.Println("statecount: " + elem[2])
197 fmt.Println("linecount: " + elem[3])
198 fmt.Println("finalcount: " + elem[4])
199 fmt.Println("pathcount: " + elem[5])
200 fmt.Println("is_deterministic: " + elem[6])
201 fmt.Println("is_pruned: " + elem[7])
202 fmt.Println("is_minimized: " + elem[8])
203 fmt.Println("is_epsilon_free: " + elem[9])
204 fmt.Println("is_loop_free: " + elem[10])
205 fmt.Println("extras: " + elem[11])
206 fmt.Println("name: " + elem[12])
207 */
208 if elem[6] != "1" {
Akron527c10c2021-08-13 01:45:18 +0200209 log.Print("The FST needs to be deterministic")
Akron4db3ecf2021-08-11 18:49:03 +0200210 return nil
Akron8ef408b2021-08-02 22:11:04 +0200211 }
Akron439f4ec2021-08-09 15:45:38 +0200212
Akron8ef408b2021-08-02 22:11:04 +0200213 if elem[9] != "1" {
Akron527c10c2021-08-13 01:45:18 +0200214 log.Print("The FST needs to be epsilon free")
Akron4db3ecf2021-08-11 18:49:03 +0200215 return nil
Akron8ef408b2021-08-02 22:11:04 +0200216 }
217
218 elemint[0], err = strconv.Atoi(elem[1])
219 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200220 log.Print("Can't read arccount")
Akron4db3ecf2021-08-11 18:49:03 +0200221 return nil
Akron8ef408b2021-08-02 22:11:04 +0200222 }
Akron740f3d72021-08-03 12:12:34 +0200223 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200224
Akron8ef408b2021-08-02 22:11:04 +0200225 elemint[0], err = strconv.Atoi(elem[2])
226 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200227 log.Print("Can't read statecount")
Akron4db3ecf2021-08-11 18:49:03 +0200228 return nil
Akron8ef408b2021-08-02 22:11:04 +0200229 }
Akron439f4ec2021-08-09 15:45:38 +0200230
231 // States start at 1 in Mizobuchi et al (2000),
232 // as the state 0 is associated with a fail.
233 // Initialize states and transitions
Akron8ef408b2021-08-02 22:11:04 +0200234 tok.transitions = make([]map[int]*edge, elemint[0]+1)
235 continue
236 }
237 case STATES:
238 {
239 elem = strings.Split(line[0:len(line)-1], " ")
240 if elem[0] == "-1" {
Akron3610f102021-08-08 14:13:25 +0200241 if DEBUG {
242 fmt.Println("Skip", elem)
243 }
Akron8ef408b2021-08-02 22:11:04 +0200244 continue
245 }
246 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200247 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200248 fmt.Println("Unable to translate", elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200249 break
250 }
Akron8ef408b2021-08-02 22:11:04 +0200251
252 if len(elem) > 1 {
253 elemint[1], err = strconv.Atoi(elem[1])
254 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200255 fmt.Println("Unable to translate", elem[1])
Akron8ef408b2021-08-02 22:11:04 +0200256 break
257 }
258 if len(elem) > 2 {
259 elemint[2], err = strconv.Atoi(elem[2])
260 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200261 fmt.Println("Unable to translate", elem[2])
Akron8ef408b2021-08-02 22:11:04 +0200262 break
263 }
264 if len(elem) > 3 {
265 elemint[3], err = strconv.Atoi(elem[3])
266 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200267 fmt.Println("Unable to translate", elem[3])
Akron8ef408b2021-08-02 22:11:04 +0200268 break
269 }
270 if len(elem) > 4 {
271 elemint[4], err = strconv.Atoi(elem[4])
272 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200273 fmt.Println("Unable to translate", elem[4])
Akron8ef408b2021-08-02 22:11:04 +0200274 break
275 }
276 }
277 }
278 }
279 }
280
281 switch len(elem) {
282 case 5:
283 {
Akron740f3d72021-08-03 12:12:34 +0200284 state = elemint[0]
285 inSym = elemint[1]
286 outSym = elemint[2]
287 end = elemint[3]
288 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200289 }
290 case 4:
291 {
292 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200293 state = elemint[0]
294 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200295 } else {
Akron740f3d72021-08-03 12:12:34 +0200296 state = elemint[0]
297 inSym = elemint[1]
298 end = elemint[2]
299 final = elemint[3]
300 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200301 }
302 }
303 case 3:
304 {
Akron740f3d72021-08-03 12:12:34 +0200305 inSym = elemint[0]
306 outSym = elemint[1]
307 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200308 }
309 case 2:
310 {
Akron740f3d72021-08-03 12:12:34 +0200311 inSym = elemint[0]
312 end = elemint[1]
313 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200314 }
315 }
316
Akron83e75a22021-08-04 13:14:06 +0200317 nontoken := false
318 tokenend := false
319
Akron439f4ec2021-08-09 15:45:38 +0200320 // While the states in foma start with 0, the states in the
321 // Mizobuchi FSA start with one - so we increase every state by 1.
322 // We also increase sigma by 1, so there are no 0 transitions.
Akron524c5432021-08-05 14:14:27 +0200323 inSym++
324 outSym++
325
Akron439f4ec2021-08-09 15:45:38 +0200326 // Only a limited list of transitions are allowed
Akron740f3d72021-08-03 12:12:34 +0200327 if inSym != outSym {
Akron01912fc2021-08-12 11:41:58 +0200328 if outSym == tok.tokenend && inSym == tok.epsilon {
Akron83e75a22021-08-04 13:14:06 +0200329 tokenend = true
330 } else if outSym == tok.epsilon {
331 nontoken = true
332 } else {
Akron527c10c2021-08-13 01:45:18 +0200333 log.Println(
Akron740f3d72021-08-03 12:12:34 +0200334 "Unsupported transition: " +
335 strconv.Itoa(state) +
336 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200337 " (" +
Akron740f3d72021-08-03 12:12:34 +0200338 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200339 ":" +
Akron740f3d72021-08-03 12:12:34 +0200340 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200341 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200342 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200343 ":" +
Akron740f3d72021-08-03 12:12:34 +0200344 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200345 ")")
Akron4db3ecf2021-08-11 18:49:03 +0200346 return nil
Akron75ebe7f2021-08-03 10:34:10 +0200347 }
Akron83e75a22021-08-04 13:14:06 +0200348
Akron83e75a22021-08-04 13:14:06 +0200349 } else if inSym == tok.epsilon {
Akron527c10c2021-08-13 01:45:18 +0200350 log.Println("General epsilon transitions are not supported")
Akron4db3ecf2021-08-11 18:49:03 +0200351 return nil
Akron8ef408b2021-08-02 22:11:04 +0200352 }
353
Akron03c92fe2021-08-09 14:07:57 +0200354 // Create an edge based on the collected information
Akron8ef408b2021-08-02 22:11:04 +0200355 targetObj := &edge{
Akron83e75a22021-08-04 13:14:06 +0200356 inSym: inSym,
357 outSym: outSym,
358 end: end + 1,
359 tokenend: tokenend,
360 nontoken: nontoken,
Akron8ef408b2021-08-02 22:11:04 +0200361 }
362
Akron740f3d72021-08-03 12:12:34 +0200363 // Initialize outgoing states
364 if tok.transitions[state+1] == nil {
365 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200366 }
367
Akron740f3d72021-08-03 12:12:34 +0200368 // Ignore transitions with invalid symbols
369 if inSym >= 0 {
370 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200371 }
Akron8ef408b2021-08-02 22:11:04 +0200372
Akron740f3d72021-08-03 12:12:34 +0200373 // Add final transition
374 if final == 1 {
Akron03c92fe2021-08-09 14:07:57 +0200375 // TODO:
376 // Maybe this is less relevant for tokenizers
Akronc17f1ca2021-08-03 19:47:27 +0200377 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200378 }
379
Akronb4bbb472021-08-09 11:49:38 +0200380 if DEBUG {
Akron740f3d72021-08-03 12:12:34 +0200381 fmt.Println("Add",
382 state+1, "->", end+1,
383 "(",
384 inSym,
385 ":",
386 outSym,
387 ") (",
388 string(tok.sigmaRev[inSym]),
389 ":",
390 string(tok.sigmaRev[outSym]),
Akron524c5432021-08-05 14:14:27 +0200391 ")",
392 ";",
393 "TE:", tokenend,
Akron3610f102021-08-08 14:13:25 +0200394 "NT:", nontoken,
395 "FIN:", final)
Akron740f3d72021-08-03 12:12:34 +0200396 }
Akron75ebe7f2021-08-03 10:34:10 +0200397
Akron8ef408b2021-08-02 22:11:04 +0200398 continue
399 }
400 case SIGMA:
401 {
402 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
403
404 // Turn string into sigma id
405 number, err := strconv.Atoi(elem[0])
406
Akron524c5432021-08-05 14:14:27 +0200407 // ID needs to be > 1
408 number++
409
Akron8ef408b2021-08-02 22:11:04 +0200410 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200411 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200412 return nil
Akron8ef408b2021-08-02 22:11:04 +0200413 }
414
Akron740f3d72021-08-03 12:12:34 +0200415 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200416
417 var symbol rune
418
419 // Read rune
420 if utf8.RuneCountInString(elem[1]) == 1 {
421 symbol = []rune(elem[1])[0]
422
Akron8ef408b2021-08-02 22:11:04 +0200423 } else if utf8.RuneCountInString(elem[1]) > 1 {
Akron439f4ec2021-08-09 15:45:38 +0200424
425 // Probably a MCS
Akron8ef408b2021-08-02 22:11:04 +0200426 switch elem[1] {
427 case "@_EPSILON_SYMBOL_@":
428 {
Akronc17f1ca2021-08-03 19:47:27 +0200429 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200430 }
431 case "@_UNKNOWN_SYMBOL_@":
432 {
Akronc17f1ca2021-08-03 19:47:27 +0200433 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200434 }
435
436 case "@_IDENTITY_SYMBOL_@":
437 {
Akronc17f1ca2021-08-03 19:47:27 +0200438 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200439 }
Akron03c92fe2021-08-09 14:07:57 +0200440
441 case "@_TOKEN_SYMBOL_@":
442 {
443 tok.tokenend = number
Akron03c92fe2021-08-09 14:07:57 +0200444 }
Akron8ef408b2021-08-02 22:11:04 +0200445 default:
Akron740f3d72021-08-03 12:12:34 +0200446 {
Akron527c10c2021-08-13 01:45:18 +0200447 log.Println("MCS not supported: " + line)
Akron4db3ecf2021-08-11 18:49:03 +0200448 return nil
Akron740f3d72021-08-03 12:12:34 +0200449 }
Akron8ef408b2021-08-02 22:11:04 +0200450 }
Akron439f4ec2021-08-09 15:45:38 +0200451 continue
Akron8ef408b2021-08-02 22:11:04 +0200452
Akron740f3d72021-08-03 12:12:34 +0200453 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200454 line, err = r.ReadString('\n')
455 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200456 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200457 return nil
Akron8ef408b2021-08-02 22:11:04 +0200458 }
459 if len(line) != 1 {
Akron527c10c2021-08-13 01:45:18 +0200460 log.Println("MCS not supported:" + line)
Akron4db3ecf2021-08-11 18:49:03 +0200461 return nil
Akron8ef408b2021-08-02 22:11:04 +0200462 }
Akron03c92fe2021-08-09 14:07:57 +0200463 symbol = rune('\n')
Akron8ef408b2021-08-02 22:11:04 +0200464 }
465
Akron740f3d72021-08-03 12:12:34 +0200466 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200467 }
468 }
469 }
470
471 return tok
472}
473
Akron64ffd9a2021-08-03 19:55:21 +0200474// Set alphabet A to the list of all symbols
475// outgoing from s
Akron439f4ec2021-08-09 15:45:38 +0200476func (tok *Tokenizer) getSet(s int, A *[]int) {
Akron64ffd9a2021-08-03 19:55:21 +0200477 for a := range tok.transitions[s] {
478 *A = append(*A, a)
479 }
480
481 // Not required, but simplifies bug hunting
Akron439f4ec2021-08-09 15:45:38 +0200482 // sort.Ints(*A)
Akron64ffd9a2021-08-03 19:55:21 +0200483}
484
Akron439f4ec2021-08-09 15:45:38 +0200485// ToDoubleArray turns the intermediate tokenizer representation
486// into a double array representation.
487//
488// This is based on Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200489func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
490
491 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +0200492 sigma: make(map[rune]int),
493 loadFactor: -1,
494 final: tok.final,
495 unknown: tok.unknown,
496 identity: tok.identity,
497 epsilon: tok.epsilon,
Akron03c92fe2021-08-09 14:07:57 +0200498 tokenend: tok.tokenend,
Akron3f8571a2021-08-05 11:18:10 +0200499 // lastFilledBase: 1,
Akronf2120ca2021-08-03 16:26:41 +0200500 }
501
502 for num, sym := range tok.sigmaRev {
503 dat.sigma[sym] = num
504 }
Akron8ef408b2021-08-02 22:11:04 +0200505
506 mark := 0
507 size := 0
508
Akron439f4ec2021-08-09 15:45:38 +0200509 // Create a mapping from s (in Ms aka Intermediate FSA)
510 // to t (in Mt aka Double Array FSA)
Akron740f3d72021-08-03 12:12:34 +0200511 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200512
Akron439f4ec2021-08-09 15:45:38 +0200513 // Initialize with the start state
Akron8ef408b2021-08-02 22:11:04 +0200514 table[size] = &mapping{source: 1, target: 1}
515 size++
516
Akron740f3d72021-08-03 12:12:34 +0200517 // Allocate space for the outgoing symbol range
518 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200519
520 for mark < size {
521 s := table[mark].source // This is a state in Ms
522 t := table[mark].target // This is a state in Mt
523 mark++
Akron740f3d72021-08-03 12:12:34 +0200524
525 // Following the paper, here the state t can be remembered
526 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200527 A = A[:0]
Akron439f4ec2021-08-09 15:45:38 +0200528 tok.getSet(s, &A)
Akron8ef408b2021-08-02 22:11:04 +0200529
Akron740f3d72021-08-03 12:12:34 +0200530 // Set base to the first free slot in the double array
Akronf2120ca2021-08-03 16:26:41 +0200531 dat.setBase(t, dat.xCheck(A))
Akron8ef408b2021-08-02 22:11:04 +0200532
Akron773b1ef2021-08-03 17:37:20 +0200533 // TODO:
Akron068874c2021-08-04 15:19:56 +0200534 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200535 // outdegree of .end
536
Akron740f3d72021-08-03 12:12:34 +0200537 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200538 for _, a := range A {
539
Akronc17f1ca2021-08-03 19:47:27 +0200540 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200541
Akron740f3d72021-08-03 12:12:34 +0200542 // Aka g(s, a)
543 s1 := tok.transitions[s][a].end
Akron8ef408b2021-08-02 22:11:04 +0200544
Akron740f3d72021-08-03 12:12:34 +0200545 // Store the transition
Akron3fdfec62021-08-04 11:40:10 +0200546 t1 := dat.getBase(t) + uint32(a)
Akronf2120ca2021-08-03 16:26:41 +0200547 dat.setCheck(t1, t)
Akron8ef408b2021-08-02 22:11:04 +0200548
Akron439f4ec2021-08-09 15:45:38 +0200549 if DEBUG {
Akron524c5432021-08-05 14:14:27 +0200550 fmt.Println("Translate transition",
551 s, "->", s1, "(", a, ")", "to", t, "->", t1)
552 }
553
Akron83e75a22021-08-04 13:14:06 +0200554 // Mark the state as being the target of a nontoken transition
555 if tok.transitions[s][a].nontoken {
Akron068874c2021-08-04 15:19:56 +0200556 dat.setNonToken(t1, true)
Akron524c5432021-08-05 14:14:27 +0200557 if DEBUG {
558 fmt.Println("Set", t1, "to nontoken")
559 }
Akron83e75a22021-08-04 13:14:06 +0200560 }
561
Akron84d68e62021-08-04 17:06:52 +0200562 // Mark the state as being the target of a tokenend transition
563 if tok.transitions[s][a].tokenend {
564 dat.setTokenEnd(t1, true)
Akron524c5432021-08-05 14:14:27 +0200565 if DEBUG {
566 fmt.Println("Set", t1, "to tokenend")
567 }
Akron84d68e62021-08-04 17:06:52 +0200568 }
569
Akron740f3d72021-08-03 12:12:34 +0200570 // Check for representative states
Akron439f4ec2021-08-09 15:45:38 +0200571 r := stateAlreadyInTable(s1, table, size)
Akron740f3d72021-08-03 12:12:34 +0200572
Akron439f4ec2021-08-09 15:45:38 +0200573 // No representative found
Akron8ef408b2021-08-02 22:11:04 +0200574 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200575 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200576 table[size] = &mapping{source: s1, target: t1}
577 size++
578 } else {
Akron740f3d72021-08-03 12:12:34 +0200579 // Overwrite with the representative state
Akron3fdfec62021-08-04 11:40:10 +0200580 dat.setBase(t1, r)
581 dat.setSeparate(t1, true)
Akron8ef408b2021-08-02 22:11:04 +0200582 }
583 } else {
Akron740f3d72021-08-03 12:12:34 +0200584 // Store a final transition
Akron3fdfec62021-08-04 11:40:10 +0200585 dat.setCheck(dat.getBase(t)+uint32(dat.final), t)
Akron8ef408b2021-08-02 22:11:04 +0200586 }
587 }
588 }
589
590 // Following Mizobuchi et al (2000) the size of the
591 // FSA should be stored in check(1).
Akron3fdfec62021-08-04 11:40:10 +0200592 dat.setSize(dat.maxSize + 1)
Akronf2120ca2021-08-03 16:26:41 +0200593 dat.array = dat.array[:dat.maxSize+1]
594 return dat
Akron8ef408b2021-08-02 22:11:04 +0200595}
596
Akron8ef408b2021-08-02 22:11:04 +0200597// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200598// exists and return this as a representative.
599// Currently iterates through the whole table
600// in a bruteforce manner.
Akron439f4ec2021-08-09 15:45:38 +0200601func stateAlreadyInTable(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200602 for x := 0; x < size; x++ {
603 if table[x].source == s {
604 return table[x].target
605 }
606 }
607 return 0
608}
609
Akron64ffd9a2021-08-03 19:55:21 +0200610// Resize double array when necessary
611func (dat *DaTokenizer) resize(l int) {
612 // TODO:
613 // This is a bit too aggressive atm and should be calmed down.
614 if len(dat.array) <= l {
Akron3fdfec62021-08-04 11:40:10 +0200615 dat.array = append(dat.array, make([]uint32, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200616 }
Akron64ffd9a2021-08-03 19:55:21 +0200617}
Akronc9d84a62021-08-03 15:56:03 +0200618
Akron64ffd9a2021-08-03 19:55:21 +0200619// Set base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200620func (dat *DaTokenizer) setBase(p uint32, v uint32) {
621 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200622 if dat.maxSize < l {
Akron439f4ec2021-08-09 15:45:38 +0200623 dat.resize(l)
Akron64ffd9a2021-08-03 19:55:21 +0200624 dat.maxSize = l
625 }
Akron439f4ec2021-08-09 15:45:38 +0200626 dat.array[l-1] = v
627}
628
629// Get base value in double array
630func (dat *DaTokenizer) getBase(p uint32) uint32 {
631 if int(p*2) > dat.maxSize {
632 return 0
633 }
634 return dat.array[p*2] & RESTBIT
635}
636
637// Set check value in double array
638func (dat *DaTokenizer) setCheck(p uint32, v uint32) {
639 l := int(p*2 + 1)
640 if dat.maxSize < l {
641 dat.resize(l)
642 dat.maxSize = l
643 }
644 dat.array[l] = v
645}
646
647// Get check value in double array
648func (dat *DaTokenizer) getCheck(p uint32) uint32 {
649 if int((p*2)+1) > dat.maxSize {
650 return 0
651 }
652 return dat.array[(p*2)+1] & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200653}
654
Akron3fdfec62021-08-04 11:40:10 +0200655// Returns true if a state is separate pointing to a representative
656func (dat *DaTokenizer) isSeparate(p uint32) bool {
Akron03c92fe2021-08-09 14:07:57 +0200657 return dat.array[p*2]&FIRSTBIT != 0
Akron3fdfec62021-08-04 11:40:10 +0200658}
659
660// Mark a state as separate pointing to a representative
661func (dat *DaTokenizer) setSeparate(p uint32, sep bool) {
662 if sep {
Akron03c92fe2021-08-09 14:07:57 +0200663 dat.array[p*2] |= FIRSTBIT
Akron3fdfec62021-08-04 11:40:10 +0200664 } else {
Akron03c92fe2021-08-09 14:07:57 +0200665 dat.array[p*2] &= (RESTBIT | SECONDBIT)
Akron3fdfec62021-08-04 11:40:10 +0200666 }
667}
668
Akron83e75a22021-08-04 13:14:06 +0200669// Returns true if a state is the target of a nontoken transition
670func (dat *DaTokenizer) isNonToken(p uint32) bool {
Akron03c92fe2021-08-09 14:07:57 +0200671 return dat.array[p*2+1]&FIRSTBIT != 0
Akron83e75a22021-08-04 13:14:06 +0200672}
673
674// Mark a state as being the target of a nontoken transition
675func (dat *DaTokenizer) setNonToken(p uint32, sep bool) {
676 if sep {
Akron03c92fe2021-08-09 14:07:57 +0200677 dat.array[p*2+1] |= FIRSTBIT
Akron83e75a22021-08-04 13:14:06 +0200678 } else {
Akron03c92fe2021-08-09 14:07:57 +0200679 dat.array[p*2+1] &= (RESTBIT | SECONDBIT)
Akron83e75a22021-08-04 13:14:06 +0200680 }
681}
682
Akron84d68e62021-08-04 17:06:52 +0200683// Returns true if a state is the target of a tokenend transition
684func (dat *DaTokenizer) isTokenEnd(p uint32) bool {
Akron03c92fe2021-08-09 14:07:57 +0200685 return dat.array[p*2+1]&SECONDBIT != 0
Akron84d68e62021-08-04 17:06:52 +0200686}
687
688// Mark a state as being the target of a tokenend transition
689func (dat *DaTokenizer) setTokenEnd(p uint32, sep bool) {
690 if sep {
Akron03c92fe2021-08-09 14:07:57 +0200691 dat.array[p*2+1] |= SECONDBIT
Akron84d68e62021-08-04 17:06:52 +0200692 } else {
Akron03c92fe2021-08-09 14:07:57 +0200693 dat.array[p*2+1] &= (RESTBIT | FIRSTBIT)
Akron84d68e62021-08-04 17:06:52 +0200694 }
695}
696
Akron64ffd9a2021-08-03 19:55:21 +0200697// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200698func (dat *DaTokenizer) setSize(v int) {
699 dat.setCheck(1, uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200700}
701
702// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200703func (dat *DaTokenizer) GetSize() int {
704 return int(dat.getCheck(1))
Akron8ef408b2021-08-02 22:11:04 +0200705}
706
707// Based on Mizobuchi et al (2000), p. 124
708// This iterates for every state through the complete double array
709// structure until it finds a gap that fits all outgoing transitions
710// of the state. This is extremely slow, but is only necessary in the
711// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200712func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200713
714 // Start at the first entry of the double array list
Akron3f8571a2021-08-05 11:18:10 +0200715 base := uint32(1) // dat.lastFilledBase
716 // skip := false
Akron8ef408b2021-08-02 22:11:04 +0200717OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200718
Akron3f8571a2021-08-05 11:18:10 +0200719 /*
720 if !skip {
721 if dat.getCheck(base) != 0 {
722 dat.lastFilledBase = base
723 } else {
724 skip = true
725 }
726 }
727 */
728
Akron740f3d72021-08-03 12:12:34 +0200729 // Resize the array if necessary
Akron3fdfec62021-08-04 11:40:10 +0200730 dat.resize((int(base) + dat.final) * 2)
Akron8ef408b2021-08-02 22:11:04 +0200731 for _, a := range symbols {
Akron3fdfec62021-08-04 11:40:10 +0200732 if dat.getCheck(base+uint32(a)) != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200733 base++
734 goto OVERLAP
735 }
736 }
Akron8ef408b2021-08-02 22:11:04 +0200737 return base
738}
739
Akron3610f102021-08-08 14:13:25 +0200740// List all outgoing transitions for a state
741// for testing purposes
742func (dat *DaTokenizer) outgoing(t uint32) []int {
743
744 valid := make([]int, 0, len(dat.sigma))
745
746 for _, a := range dat.sigma {
747 t1 := dat.getBase(t) + uint32(a)
748 if t1 <= dat.getCheck(1) && dat.getCheck(t1) == t {
749 valid = append(valid, a)
750 }
751 }
752
753 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
754 t1 := dat.getBase(t) + uint32(a)
755 if t1 <= dat.getCheck(1) && dat.getCheck(t1) == t {
756 valid = append(valid, -1*a)
757 }
758 }
759
760 sort.Ints(valid)
761
762 return valid
763}
764
Akron03a3c612021-08-04 11:51:27 +0200765// LoadFactor as defined in Kanda et al (2018),
766// i.e. the proportion of non-empty elements to all elements.
767func (dat *DaTokenizer) LoadFactor() float64 {
Akrond66a9262021-08-03 17:09:09 +0200768
Akron03a3c612021-08-04 11:51:27 +0200769 // Cache the loadfactor
Akron3f8571a2021-08-05 11:18:10 +0200770 if dat.loadFactor > 0 {
Akron03a3c612021-08-04 11:51:27 +0200771 return dat.loadFactor
Akron773b1ef2021-08-03 17:37:20 +0200772 }
Akrond66a9262021-08-03 17:09:09 +0200773 nonEmpty := 0
774 all := len(dat.array) / 2
775 for x := 1; x <= len(dat.array); x = x + 2 {
776 if dat.array[x] != 0 {
777 nonEmpty++
778 }
779 }
Akron03a3c612021-08-04 11:51:27 +0200780 dat.loadFactor = float64(nonEmpty) / float64(all) * 100
781 return dat.loadFactor
Akrond66a9262021-08-03 17:09:09 +0200782}
783
Akron439f4ec2021-08-09 15:45:38 +0200784// Save stores the double array data in a file
Akron3a063ef2021-08-05 19:36:35 +0200785func (dat *DaTokenizer) Save(file string) (n int64, err error) {
786 f, err := os.Create(file)
787 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200788 log.Println(err)
Akron439f4ec2021-08-09 15:45:38 +0200789 return 0, err
Akron3a063ef2021-08-05 19:36:35 +0200790 }
791 defer f.Close()
792 gz := gzip.NewWriter(f)
793 defer gz.Close()
794 n, err = dat.WriteTo(gz)
795 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200796 log.Println(err)
Akron3a063ef2021-08-05 19:36:35 +0200797 return n, err
798 }
799 gz.Flush()
800 return n, nil
801}
802
803// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200804func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
805
Akron3a063ef2021-08-05 19:36:35 +0200806 wb := bufio.NewWriter(w)
807 defer wb.Flush()
808
Akron6247a5d2021-08-03 19:18:28 +0200809 // Store magical header
Akron3a063ef2021-08-05 19:36:35 +0200810 all, err := wb.Write([]byte(MAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200811 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200812 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200813 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200814 }
815
816 // Get sigma as a list
817 sigmalist := make([]rune, len(dat.sigma)+16)
818 max := 0
819 for sym, num := range dat.sigma {
820 sigmalist[num] = sym
821 if num > max {
822 max = num
823 }
824 }
825
826 sigmalist = sigmalist[:max+1]
827
Akron3f8571a2021-08-05 11:18:10 +0200828 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200829 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200830 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
831 bo.PutUint16(buf[4:6], uint16(dat.unknown))
832 bo.PutUint16(buf[6:8], uint16(dat.identity))
833 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200834 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akron3f8571a2021-08-05 11:18:10 +0200835 bo.PutUint32(buf[12:16], uint32(len(dat.array)))
Akron3a063ef2021-08-05 19:36:35 +0200836 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200837 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200838 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200839 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200840 }
841
842 all += more
843
Akron6247a5d2021-08-03 19:18:28 +0200844 // Write sigma
845 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200846
Akron3a063ef2021-08-05 19:36:35 +0200847 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200848 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200849 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200850 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200851 }
852 all += more
853 }
Akron439f4ec2021-08-09 15:45:38 +0200854
Akron6247a5d2021-08-03 19:18:28 +0200855 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200856 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200857 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200858 }
Akron6247a5d2021-08-03 19:18:28 +0200859
860 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200861 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200862 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200863 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200864 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200865 }
866 all += more
867
Akron3a063ef2021-08-05 19:36:35 +0200868 for x := 0; x < len(dat.array); x++ {
869 // for _, d := range dat.array {
870 bo.PutUint32(buf[0:4], dat.array[x])
871 more, err := wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200872 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200873 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200874 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200875 }
Akron439f4ec2021-08-09 15:45:38 +0200876 all += more
Akron3a063ef2021-08-05 19:36:35 +0200877 if more != 4 {
Akron527c10c2021-08-13 01:45:18 +0200878 log.Println("Can not write uint32")
Akron3a063ef2021-08-05 19:36:35 +0200879 return int64(all), err
880 }
Akron6247a5d2021-08-03 19:18:28 +0200881 }
882
883 return int64(all), err
884}
885
Akron439f4ec2021-08-09 15:45:38 +0200886// LoadDatokFile reads a double array represented tokenizer
887// from a file.
Akron3f8571a2021-08-05 11:18:10 +0200888func LoadDatokFile(file string) *DaTokenizer {
889 f, err := os.Open(file)
890 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200891 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200892 return nil
Akron3f8571a2021-08-05 11:18:10 +0200893 }
894 defer f.Close()
895
896 gz, err := gzip.NewReader(f)
897 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200898 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200899 return nil
Akron3f8571a2021-08-05 11:18:10 +0200900 }
901 defer gz.Close()
902
Akron3a063ef2021-08-05 19:36:35 +0200903 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +0200904 return ParseDatok(gz)
905}
906
Akron439f4ec2021-08-09 15:45:38 +0200907// LoadDatokFile reads a double array represented tokenizer
908// from an io.Reader
Akron3f8571a2021-08-05 11:18:10 +0200909func ParseDatok(ior io.Reader) *DaTokenizer {
910
Akron439f4ec2021-08-09 15:45:38 +0200911 // Initialize tokenizer with default values
Akron3f8571a2021-08-05 11:18:10 +0200912 dat := &DaTokenizer{
913 sigma: make(map[rune]int),
914 epsilon: 0,
915 unknown: 0,
916 identity: 0,
917 final: 0,
918 loadFactor: 0,
919 }
920
921 r := bufio.NewReader(ior)
922
Akron3f8571a2021-08-05 11:18:10 +0200923 buf := make([]byte, 1024)
924 buf = buf[0:len(MAGIC)]
925
Akron439f4ec2021-08-09 15:45:38 +0200926 _, err := r.Read(buf)
Akron3f8571a2021-08-05 11:18:10 +0200927
928 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200929 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200930 return nil
931 }
932
Akron3f8571a2021-08-05 11:18:10 +0200933 if string(MAGIC) != string(buf) {
Akron527c10c2021-08-13 01:45:18 +0200934 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200935 return nil
936 }
937
Akron439f4ec2021-08-09 15:45:38 +0200938 more, err := io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +0200939 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200940 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200941 return nil
942 }
943
Akron439f4ec2021-08-09 15:45:38 +0200944 if more != 16 {
Akron527c10c2021-08-13 01:45:18 +0200945 log.Println("Read bytes do not fit")
Akron439f4ec2021-08-09 15:45:38 +0200946 return nil
947 }
Akron3f8571a2021-08-05 11:18:10 +0200948
Akron3a063ef2021-08-05 19:36:35 +0200949 version := bo.Uint16(buf[0:2])
950
951 if version != VERSION {
Akron527c10c2021-08-13 01:45:18 +0200952 log.Println("Version not compatible")
Akron3a063ef2021-08-05 19:36:35 +0200953 return nil
954 }
955
Akron3f8571a2021-08-05 11:18:10 +0200956 dat.epsilon = int(bo.Uint16(buf[2:4]))
957 dat.unknown = int(bo.Uint16(buf[4:6]))
958 dat.identity = int(bo.Uint16(buf[6:8]))
959 dat.final = int(bo.Uint16(buf[8:10]))
960
961 sigmaCount := int(bo.Uint16(buf[10:12]))
962 arraySize := int(bo.Uint32(buf[12:16]))
963
Akron3a063ef2021-08-05 19:36:35 +0200964 // Shouldn't be relevant though
965 dat.maxSize = arraySize - 1
966
Akron3f8571a2021-08-05 11:18:10 +0200967 for x := 0; x < sigmaCount; x++ {
Akron439f4ec2021-08-09 15:45:38 +0200968 sym, _, err := r.ReadRune()
Akron3f8571a2021-08-05 11:18:10 +0200969 if err == nil && sym != 0 {
970 dat.sigma[sym] = x
971 }
Akron3f8571a2021-08-05 11:18:10 +0200972 }
973
Akron439f4ec2021-08-09 15:45:38 +0200974 _, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +0200975
976 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200977 log.Print(err)
Akron3f8571a2021-08-05 11:18:10 +0200978 return nil
979 }
980
Akron3f8571a2021-08-05 11:18:10 +0200981 if string("T") != string(buf[0:1]) {
Akron527c10c2021-08-13 01:45:18 +0200982 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200983 return nil
984 }
985
986 // Read based on length
987 dat.array = make([]uint32, arraySize)
988
Akronbb4aac52021-08-13 00:52:27 +0200989 dataArray, err := io.ReadAll(r)
Akron439f4ec2021-08-09 15:45:38 +0200990
Akronbb4aac52021-08-13 00:52:27 +0200991 if err == io.EOF {
Akron527c10c2021-08-13 01:45:18 +0200992 log.Println(err)
Akronbb4aac52021-08-13 00:52:27 +0200993 return nil
994 }
995
996 if len(dataArray) < arraySize*4 {
Akron527c10c2021-08-13 01:45:18 +0200997 log.Println("Not enough bytes read")
Akronbb4aac52021-08-13 00:52:27 +0200998 return nil
999 }
1000
1001 for x := 0; x < arraySize; x++ {
1002 dat.array[x] = bo.Uint32(dataArray[x*4 : (x*4)+4])
Akron3f8571a2021-08-05 11:18:10 +02001003 }
1004
1005 return dat
1006}
1007
Akron439f4ec2021-08-09 15:45:38 +02001008// Show the current state of the buffer,
1009// for testing puroses
Akron3610f102021-08-08 14:13:25 +02001010func showBuffer(buffer []rune, buffo int, buffi int) string {
1011 out := make([]rune, 0, 1024)
1012 for x := 0; x < len(buffer); x++ {
1013 if buffi == x {
1014 out = append(out, '^')
1015 }
1016 if buffo == x {
1017 out = append(out, '[', buffer[x], ']')
1018 } else {
1019 out = append(out, buffer[x])
1020 }
1021 }
1022 return string(out)
1023}
1024
Akron84d68e62021-08-04 17:06:52 +02001025// Transduce an input string against the double array
Akron3610f102021-08-08 14:13:25 +02001026// FSA. The rules are always greedy. If the automaton fails,
1027// it takes the last possible token ending branch.
Akron068874c2021-08-04 15:19:56 +02001028//
Akron4db3ecf2021-08-11 18:49:03 +02001029// Based on Mizobuchi et al (2000), p. 129,
1030// with additional support for IDENTITY, UNKNOWN
1031// and EPSILON transitions and NONTOKEN and TOKENEND handling.
Akron3f8571a2021-08-05 11:18:10 +02001032func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron068874c2021-08-04 15:19:56 +02001033 var a int
Akronb4bbb472021-08-09 11:49:38 +02001034 var t0 uint32
Akronb7e1f132021-08-10 11:52:31 +02001035 t := uint32(1) // Initial state
1036 var ok, rewindBuffer bool
Akron068874c2021-08-04 15:19:56 +02001037
Akron3610f102021-08-08 14:13:25 +02001038 // Remember the last position of a possible tokenend,
1039 // in case the automaton fails.
1040 epsilonState := uint32(0)
1041 epsilonOffset := 0
1042
1043 // Implement a low level buffer for full control,
1044 // however - it is probably better to introduce
1045 // this on a higher level with a io.Reader interface
1046 // The buffer stores a single word and may have white
1047 // space at the end (but not at the beginning).
1048 //
1049 // This is the only backtracking requirement because of
1050 // epsilon transitions, to support tokenizations like:
1051 // "this is an example|.| And it works." vs
1052 // "this is an example.com| application."
Akronb7e1f132021-08-10 11:52:31 +02001053 //
1054 // TODO:
1055 // Store a translation buffer as well, so characters don't
1056 // have to be translated multiple times!
Akron3610f102021-08-08 14:13:25 +02001057 buffer := make([]rune, 1024)
1058 buffo := 0 // Buffer offset
1059 buffi := 0 // Buffer length
1060
Akron3f8571a2021-08-05 11:18:10 +02001061 reader := bufio.NewReader(r)
1062 writer := bufio.NewWriter(w)
1063 defer writer.Flush()
Akron068874c2021-08-04 15:19:56 +02001064
Akron3f8571a2021-08-05 11:18:10 +02001065 var char rune
1066 var err error
1067 eof := false
Akronb7e1f132021-08-10 11:52:31 +02001068 newchar := true
Akron3f8571a2021-08-05 11:18:10 +02001069
Akronc5d8d432021-08-10 16:48:44 +02001070PARSECHAR:
Akron3f8571a2021-08-05 11:18:10 +02001071 for {
1072
Akronb7e1f132021-08-10 11:52:31 +02001073 if newchar {
1074 // Get from reader if buffer is empty
1075 if buffo >= buffi {
Akron1594cb82021-08-11 11:14:56 +02001076 if eof {
1077 break
1078 }
Akronb7e1f132021-08-10 11:52:31 +02001079 char, _, err = reader.ReadRune()
Akron439f4ec2021-08-09 15:45:38 +02001080
Akronb7e1f132021-08-10 11:52:31 +02001081 // No more runes to read
1082 if err != nil {
1083 eof = true
1084 break
1085 }
1086 buffer[buffi] = char
1087 buffi++
Akron3f8571a2021-08-05 11:18:10 +02001088 }
Akronb7e1f132021-08-10 11:52:31 +02001089
1090 char = buffer[buffo]
1091
1092 if DEBUG {
1093 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
1094 }
1095
1096 // TODO: Better not repeatedly check for a!
1097 a, ok = dat.sigma[char]
1098
1099 // Use identity symbol if character is not in sigma
1100 if !ok && dat.identity != -1 {
1101 a = dat.identity
1102 }
1103
1104 t0 = t
1105
1106 // Check for epsilon transitions and remember
1107 if dat.getCheck(dat.getBase(t0)+uint32(dat.epsilon)) == t0 {
1108 // Remember state for backtracking to last tokenend state
1109 epsilonState = t0
1110 epsilonOffset = buffo
1111 }
Akron3f8571a2021-08-05 11:18:10 +02001112 }
Akron3610f102021-08-08 14:13:25 +02001113
Akronb7e1f132021-08-10 11:52:31 +02001114 // Checks a transition based on t0, a and buffo
Akronb4bbb472021-08-09 11:49:38 +02001115 t = dat.getBase(t0) + uint32(a)
Akron068874c2021-08-04 15:19:56 +02001116
Akron524c5432021-08-05 14:14:27 +02001117 if DEBUG {
Akronb7e1f132021-08-10 11:52:31 +02001118 // Char is only relevant if set
1119 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
1120 if false {
1121 fmt.Println(dat.outgoing(t0))
1122 }
Akron524c5432021-08-05 14:14:27 +02001123 }
1124
Akronb7e1f132021-08-10 11:52:31 +02001125 // Check if the transition is invalid according to the double array
Akronb4bbb472021-08-09 11:49:38 +02001126 if t > dat.getCheck(1) || dat.getCheck(t) != t0 {
Akron068874c2021-08-04 15:19:56 +02001127
1128 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001129 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", t0)
Akron068874c2021-08-04 15:19:56 +02001130 }
1131
1132 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +02001133
Akron068874c2021-08-04 15:19:56 +02001134 // Try again with unknown symbol, in case identity failed
Akronb7e1f132021-08-10 11:52:31 +02001135 // Char is only relevant when set
Akron068874c2021-08-04 15:19:56 +02001136 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001137 fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +02001138 }
1139 a = dat.unknown
1140
1141 } else if a != dat.epsilon {
Akronb4bbb472021-08-09 11:49:38 +02001142
Akron068874c2021-08-04 15:19:56 +02001143 // Try again with epsilon symbol, in case everything else failed
Akronb4bbb472021-08-09 11:49:38 +02001144 t0 = epsilonState
Akron3610f102021-08-08 14:13:25 +02001145 epsilonState = 0 // reset
1146 buffo = epsilonOffset
Akron439f4ec2021-08-09 15:45:38 +02001147 a = dat.epsilon
1148
Akron3610f102021-08-08 14:13:25 +02001149 if DEBUG {
1150 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
1151 }
Akronb4bbb472021-08-09 11:49:38 +02001152
Akron068874c2021-08-04 15:19:56 +02001153 } else {
1154 break
1155 }
Akron068874c2021-08-04 15:19:56 +02001156
Akronb7e1f132021-08-10 11:52:31 +02001157 newchar = false
1158 continue
Akronb4bbb472021-08-09 11:49:38 +02001159 }
1160
Akronb7e1f132021-08-10 11:52:31 +02001161 // Transition was successful
1162 rewindBuffer = false
Akron439f4ec2021-08-09 15:45:38 +02001163
1164 // Transition consumes a character
Akronb4bbb472021-08-09 11:49:38 +02001165 if a != dat.epsilon {
1166
Akron3610f102021-08-08 14:13:25 +02001167 buffo++
Akronb4bbb472021-08-09 11:49:38 +02001168
Akron439f4ec2021-08-09 15:45:38 +02001169 // Transition does not produce a character
Akronc5d8d432021-08-10 16:48:44 +02001170 if buffo == 1 && dat.isNonToken(t) {
Akron3610f102021-08-08 14:13:25 +02001171 if DEBUG {
1172 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
1173 }
Akron439f4ec2021-08-09 15:45:38 +02001174 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001175 }
Akron3f8571a2021-08-05 11:18:10 +02001176 }
Akron068874c2021-08-04 15:19:56 +02001177
Akronc5d8d432021-08-10 16:48:44 +02001178 // Transition marks the end of a token - so flush the buffer
Akronb7e1f132021-08-10 11:52:31 +02001179 if dat.isTokenEnd(t) {
Akron524c5432021-08-05 14:14:27 +02001180
Akronc5d8d432021-08-10 16:48:44 +02001181 if buffi > 0 {
Akronc5d8d432021-08-10 16:48:44 +02001182 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +02001183 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
Akronc5d8d432021-08-10 16:48:44 +02001184 }
Akron01912fc2021-08-12 11:41:58 +02001185 writer.WriteString(string(buffer[:buffo]))
Akronc5d8d432021-08-10 16:48:44 +02001186 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001187 }
Akron1594cb82021-08-11 11:14:56 +02001188 if DEBUG {
1189 fmt.Println("-> Newline")
1190 }
1191 writer.WriteRune('\n')
Akron439f4ec2021-08-09 15:45:38 +02001192 }
Akron3610f102021-08-08 14:13:25 +02001193
Akronc5d8d432021-08-10 16:48:44 +02001194 // Rewind the buffer if necessary
Akron439f4ec2021-08-09 15:45:38 +02001195 if rewindBuffer {
1196
1197 // TODO: Better as a ring buffer
Akron3610f102021-08-08 14:13:25 +02001198 for x, i := range buffer[buffo:buffi] {
1199 buffer[x] = i
1200 }
Akronb4bbb472021-08-09 11:49:38 +02001201
Akron3610f102021-08-08 14:13:25 +02001202 buffi -= buffo
1203 epsilonOffset -= buffo
1204 buffo = 0
1205 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001206 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
Akron3610f102021-08-08 14:13:25 +02001207 }
Akron84d68e62021-08-04 17:06:52 +02001208 }
1209
Akronb7e1f132021-08-10 11:52:31 +02001210 // Move to representative state
1211 if dat.isSeparate(t) {
1212 t = dat.getBase(t)
1213
1214 if DEBUG {
1215 fmt.Println("Representative pointing to", t)
1216 }
1217 }
1218
Akronc5d8d432021-08-10 16:48:44 +02001219 newchar = true
1220
Akron068874c2021-08-04 15:19:56 +02001221 // TODO:
1222 // Prevent endless epsilon loops!
1223 }
1224
Akron439f4ec2021-08-09 15:45:38 +02001225 // Input reader is not yet finished
Akron3f8571a2021-08-05 11:18:10 +02001226 if !eof {
Akron068874c2021-08-04 15:19:56 +02001227 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001228 fmt.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +02001229 }
1230 return false
1231 }
1232
Akronb7e1f132021-08-10 11:52:31 +02001233 if DEBUG {
1234 fmt.Println("Entering final check")
1235 }
1236
Akronc5d8d432021-08-10 16:48:44 +02001237 // Automaton is in a final state, so flush the buffer and return
Akron068874c2021-08-04 15:19:56 +02001238 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
Akronb4bbb472021-08-09 11:49:38 +02001239
1240 if buffi > 0 {
Akronb4bbb472021-08-09 11:49:38 +02001241 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +02001242 fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
Akron3f8571a2021-08-05 11:18:10 +02001243 }
Akron01912fc2021-08-12 11:41:58 +02001244 writer.WriteString(string(buffer[:buffi]))
Akron6e70dc82021-08-11 11:33:18 +02001245
Akrondf0a3ef2021-08-09 15:53:45 +02001246 if dat.isTokenEnd(t) {
1247 writer.WriteRune('\n')
Akronc5d8d432021-08-10 16:48:44 +02001248 if DEBUG {
1249 fmt.Println("-> Newline")
1250 }
Akrondf0a3ef2021-08-09 15:53:45 +02001251 }
Akron84d68e62021-08-04 17:06:52 +02001252 }
1253
Akron6e70dc82021-08-11 11:33:18 +02001254 // Add an additional sentence ending, if the file is over but no explicit
1255 // sentence split was reached. This may be controversial and therefore
1256 // optional via parameter.
1257 if !dat.isTokenEnd(t0) {
1258 writer.WriteRune('\n')
1259 if DEBUG {
1260 fmt.Println("-> Newline")
1261 }
1262 }
1263
Akron84d68e62021-08-04 17:06:52 +02001264 // There may be a new line at the end, from an epsilon, so we go on!
Akron068874c2021-08-04 15:19:56 +02001265 return true
1266 }
1267
Akronc5d8d432021-08-10 16:48:44 +02001268 // Check epsilon transitions until a final state is reached
1269 t0 = t
1270 t = dat.getBase(t0) + uint32(dat.epsilon)
Akron01912fc2021-08-12 11:41:58 +02001271 a = dat.epsilon
1272 newchar = false
Akronc5d8d432021-08-10 16:48:44 +02001273 if dat.getCheck(t) == t0 {
1274 // Remember state for backtracking to last tokenend state
Akronc5d8d432021-08-10 16:48:44 +02001275 goto PARSECHAR
1276 } else if epsilonState != 0 {
Akronb7e1f132021-08-10 11:52:31 +02001277 t0 = epsilonState
1278 epsilonState = 0 // reset
1279 buffo = epsilonOffset
Akron068874c2021-08-04 15:19:56 +02001280 if DEBUG {
Akronc5d8d432021-08-10 16:48:44 +02001281 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
Akron068874c2021-08-04 15:19:56 +02001282 }
Akronc5d8d432021-08-10 16:48:44 +02001283 goto PARSECHAR
Akron068874c2021-08-04 15:19:56 +02001284 }
Akronc5d8d432021-08-10 16:48:44 +02001285 return false
Akron068874c2021-08-04 15:19:56 +02001286}