blob: 805b8dfdcf4c5a08e0e34073666d32718819eee1 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
Akron03c92fe2021-08-09 14:07:57 +020013//
14// Serialization is little endian.
Akron83e75a22021-08-04 13:14:06 +020015
Akron740f3d72021-08-03 12:12:34 +020016// TODO:
Akrone61380b2021-08-16 10:10:46 +020017// - Turn sigma into an array instead of using a map
18// and improve the mapping beforehand so that ASCII
19// is mapped directly and only non-ASCII needs to be
20// looked up in a map or similar.
Akron740f3d72021-08-03 12:12:34 +020021// - replace maxSize with the check value
Akron3a063ef2021-08-05 19:36:35 +020022// - Add checksum to serialization.
Akron03c92fe2021-08-09 14:07:57 +020023// - Instead of memoizing the loadFactor, better remember
24// the number of set transitions
Akrone61380b2021-08-16 10:10:46 +020025// - Replace/Enhance table with a map
26// - Provide a bufio.Scanner compatible interface.
Akron8e1d69b2021-08-12 17:38:49 +020027// - Mark epsilon transitions in bytes
Akron740f3d72021-08-03 12:12:34 +020028
Akron8ef408b2021-08-02 22:11:04 +020029import (
30 "bufio"
31 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020032 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020033 "fmt"
34 "io"
35 "os"
Akronc9d84a62021-08-03 15:56:03 +020036 "sort"
Akron8ef408b2021-08-02 22:11:04 +020037 "strconv"
38 "strings"
39 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020040
Akron527c10c2021-08-13 01:45:18 +020041 "log"
Akron8ef408b2021-08-02 22:11:04 +020042)
43
44const (
Akron2a4b9292021-08-04 15:35:22 +020045 PROPS = 1
46 SIGMA = 2
47 STATES = 3
48 NONE = 4
Akron2a4b9292021-08-04 15:35:22 +020049 DEBUG = false
50 MAGIC = "DATOK"
51 VERSION = uint16(1)
Akron03c92fe2021-08-09 14:07:57 +020052 FIRSTBIT uint32 = 1 << 31
53 SECONDBIT uint32 = 1 << 30
54 RESTBIT uint32 = ^uint32(0) &^ (FIRSTBIT | SECONDBIT)
Akron8ef408b2021-08-02 22:11:04 +020055)
56
Akron03c92fe2021-08-09 14:07:57 +020057// Serialization is always little endian
Akron6247a5d2021-08-03 19:18:28 +020058var bo binary.ByteOrder = binary.LittleEndian
59
Akron8ef408b2021-08-02 22:11:04 +020060type mapping struct {
61 source int
Akron3fdfec62021-08-04 11:40:10 +020062 target uint32
Akron8ef408b2021-08-02 22:11:04 +020063}
64
65type edge struct {
Akron83e75a22021-08-04 13:14:06 +020066 inSym int
67 outSym int
68 end int
69 nontoken bool
70 tokenend bool
Akron8ef408b2021-08-02 22:11:04 +020071}
72
Akron03c92fe2021-08-09 14:07:57 +020073// Tokenizer is the intermediate representation
74// of the tokenizer.
Akron8ef408b2021-08-02 22:11:04 +020075type Tokenizer struct {
Akron740f3d72021-08-03 12:12:34 +020076 sigmaRev map[int]rune
Akron31f3c062021-08-27 10:15:13 +020077 sigmaMCS map[int]string
Akron740f3d72021-08-03 12:12:34 +020078 arcCount int
Akron740f3d72021-08-03 12:12:34 +020079 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020080 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020081
82 // Special symbols in sigma
83 epsilon int
84 unknown int
85 identity int
86 final int
Akron03c92fe2021-08-09 14:07:57 +020087 tokenend int
Akron8ef408b2021-08-02 22:11:04 +020088}
89
Akronf1a16502021-08-16 15:24:38 +020090type bc struct {
91 base uint32
92 check uint32
93}
94
Akron03c92fe2021-08-09 14:07:57 +020095// DaTokenizer represents a tokenizer implemented as a
96// Double Array FSA.
Akronf2120ca2021-08-03 16:26:41 +020097type DaTokenizer struct {
Akronea46e8a2021-08-17 00:36:31 +020098 sigma map[rune]int
99 sigmaASCII [256]int
Akron03a3c612021-08-04 11:51:27 +0200100 maxSize int
101 loadFactor float64
Akronf1a16502021-08-16 15:24:38 +0200102 array []bc
Akronc17f1ca2021-08-03 19:47:27 +0200103
104 // Special symbols in sigma
105 epsilon int
106 unknown int
107 identity int
108 final int
Akron03c92fe2021-08-09 14:07:57 +0200109 tokenend int
Akronf2120ca2021-08-03 16:26:41 +0200110}
111
Akron03c92fe2021-08-09 14:07:57 +0200112// ParseFoma reads the FST from a foma file
113// and creates an internal representation,
114// in case it follows the tokenizer's convention.
Akron64ffd9a2021-08-03 19:55:21 +0200115func LoadFomaFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200116 f, err := os.Open(file)
117 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200118 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200119 return nil
Akron8ef408b2021-08-02 22:11:04 +0200120 }
121 defer f.Close()
122
123 gz, err := gzip.NewReader(f)
124 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200125 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200126 return nil
Akron8ef408b2021-08-02 22:11:04 +0200127 }
128 defer gz.Close()
129
Akron3fdfec62021-08-04 11:40:10 +0200130 return ParseFoma(gz)
Akron8ef408b2021-08-02 22:11:04 +0200131}
132
Akron03c92fe2021-08-09 14:07:57 +0200133// ParseFoma reads the FST from a foma file reader
134// and creates an internal representation,
135// in case it follows the tokenizer's convention.
Akron3fdfec62021-08-04 11:40:10 +0200136func ParseFoma(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200137 r := bufio.NewReader(ior)
138
139 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200140 sigmaRev: make(map[int]rune),
Akron31f3c062021-08-27 10:15:13 +0200141 sigmaMCS: make(map[int]string),
Akronc17f1ca2021-08-03 19:47:27 +0200142 epsilon: -1,
143 unknown: -1,
144 identity: -1,
145 final: -1,
Akron03c92fe2021-08-09 14:07:57 +0200146 tokenend: -1,
Akron8ef408b2021-08-02 22:11:04 +0200147 }
148
Akron740f3d72021-08-03 12:12:34 +0200149 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200150
151 mode := 0
152 var elem []string
153 var elemint [5]int
154
Akron03c92fe2021-08-09 14:07:57 +0200155 // Iterate over all lines of the file.
156 // This is mainly based on foma2js,
157 // licensed under the Apache License, version 2,
158 // and written by Mans Hulden.
Akron8ef408b2021-08-02 22:11:04 +0200159 for {
160 line, err := r.ReadString('\n')
161 if err != nil {
162 if err == io.EOF {
163 break
164 }
Akron527c10c2021-08-13 01:45:18 +0200165 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200166 return nil
Akron8ef408b2021-08-02 22:11:04 +0200167 }
Akron8ef408b2021-08-02 22:11:04 +0200168
Akron439f4ec2021-08-09 15:45:38 +0200169 // Read parser mode for the following lines
170 if strings.HasPrefix(line, "##") {
171 if strings.HasPrefix(line, "##props##") {
172 mode = PROPS
173
174 } else if strings.HasPrefix(line, "##states##") {
175 mode = STATES
176
177 // Adds a final transition symbol to sigma
178 // written as '#' in Mizobuchi et al (2000)
179 tok.sigmaCount++
180 tok.final = tok.sigmaCount
181
182 } else if strings.HasPrefix(line, "##sigma##") {
183
184 mode = SIGMA
185
186 } else if strings.HasPrefix(line, "##end##") {
187
188 mode = NONE
189
190 } else if !strings.HasPrefix(line, "##foma-net") {
Akron527c10c2021-08-13 01:45:18 +0200191 log.Print("Unknown input line")
Akron439f4ec2021-08-09 15:45:38 +0200192 break
193 }
Akron8ef408b2021-08-02 22:11:04 +0200194 continue
195 }
196
Akron439f4ec2021-08-09 15:45:38 +0200197 // Based on the current parser mode, interpret the lines
Akron8ef408b2021-08-02 22:11:04 +0200198 switch mode {
199 case PROPS:
200 {
201 elem = strings.Split(line, " ")
202 /*
203 fmt.Println("arity: " + elem[0])
204 fmt.Println("arccount: " + elem[1])
205 fmt.Println("statecount: " + elem[2])
206 fmt.Println("linecount: " + elem[3])
207 fmt.Println("finalcount: " + elem[4])
208 fmt.Println("pathcount: " + elem[5])
209 fmt.Println("is_deterministic: " + elem[6])
210 fmt.Println("is_pruned: " + elem[7])
211 fmt.Println("is_minimized: " + elem[8])
212 fmt.Println("is_epsilon_free: " + elem[9])
213 fmt.Println("is_loop_free: " + elem[10])
214 fmt.Println("extras: " + elem[11])
215 fmt.Println("name: " + elem[12])
216 */
217 if elem[6] != "1" {
Akron527c10c2021-08-13 01:45:18 +0200218 log.Print("The FST needs to be deterministic")
Akron4db3ecf2021-08-11 18:49:03 +0200219 return nil
Akron8ef408b2021-08-02 22:11:04 +0200220 }
Akron439f4ec2021-08-09 15:45:38 +0200221
Akron8ef408b2021-08-02 22:11:04 +0200222 if elem[9] != "1" {
Akron527c10c2021-08-13 01:45:18 +0200223 log.Print("The FST needs to be epsilon free")
Akron4db3ecf2021-08-11 18:49:03 +0200224 return nil
Akron8ef408b2021-08-02 22:11:04 +0200225 }
226
227 elemint[0], err = strconv.Atoi(elem[1])
228 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200229 log.Print("Can't read arccount")
Akron4db3ecf2021-08-11 18:49:03 +0200230 return nil
Akron8ef408b2021-08-02 22:11:04 +0200231 }
Akron740f3d72021-08-03 12:12:34 +0200232 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200233
Akron8ef408b2021-08-02 22:11:04 +0200234 elemint[0], err = strconv.Atoi(elem[2])
235 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200236 log.Print("Can't read statecount")
Akron4db3ecf2021-08-11 18:49:03 +0200237 return nil
Akron8ef408b2021-08-02 22:11:04 +0200238 }
Akron439f4ec2021-08-09 15:45:38 +0200239
240 // States start at 1 in Mizobuchi et al (2000),
241 // as the state 0 is associated with a fail.
242 // Initialize states and transitions
Akron8ef408b2021-08-02 22:11:04 +0200243 tok.transitions = make([]map[int]*edge, elemint[0]+1)
244 continue
245 }
246 case STATES:
247 {
248 elem = strings.Split(line[0:len(line)-1], " ")
249 if elem[0] == "-1" {
Akron3610f102021-08-08 14:13:25 +0200250 if DEBUG {
251 fmt.Println("Skip", elem)
252 }
Akron8ef408b2021-08-02 22:11:04 +0200253 continue
254 }
255 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200256 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200257 fmt.Println("Unable to translate", elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200258 break
259 }
Akron8ef408b2021-08-02 22:11:04 +0200260
261 if len(elem) > 1 {
262 elemint[1], err = strconv.Atoi(elem[1])
263 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200264 fmt.Println("Unable to translate", elem[1])
Akron8ef408b2021-08-02 22:11:04 +0200265 break
266 }
267 if len(elem) > 2 {
268 elemint[2], err = strconv.Atoi(elem[2])
269 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200270 fmt.Println("Unable to translate", elem[2])
Akron8ef408b2021-08-02 22:11:04 +0200271 break
272 }
273 if len(elem) > 3 {
274 elemint[3], err = strconv.Atoi(elem[3])
275 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200276 fmt.Println("Unable to translate", elem[3])
Akron8ef408b2021-08-02 22:11:04 +0200277 break
278 }
279 if len(elem) > 4 {
280 elemint[4], err = strconv.Atoi(elem[4])
281 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200282 fmt.Println("Unable to translate", elem[4])
Akron8ef408b2021-08-02 22:11:04 +0200283 break
284 }
285 }
286 }
287 }
288 }
289
290 switch len(elem) {
291 case 5:
292 {
Akron740f3d72021-08-03 12:12:34 +0200293 state = elemint[0]
294 inSym = elemint[1]
295 outSym = elemint[2]
296 end = elemint[3]
297 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200298 }
299 case 4:
300 {
301 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200302 state = elemint[0]
303 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200304 } else {
Akron740f3d72021-08-03 12:12:34 +0200305 state = elemint[0]
306 inSym = elemint[1]
307 end = elemint[2]
308 final = elemint[3]
309 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200310 }
311 }
312 case 3:
313 {
Akron740f3d72021-08-03 12:12:34 +0200314 inSym = elemint[0]
315 outSym = elemint[1]
316 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200317 }
318 case 2:
319 {
Akron740f3d72021-08-03 12:12:34 +0200320 inSym = elemint[0]
321 end = elemint[1]
322 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200323 }
324 }
325
Akron83e75a22021-08-04 13:14:06 +0200326 nontoken := false
327 tokenend := false
328
Akron439f4ec2021-08-09 15:45:38 +0200329 // While the states in foma start with 0, the states in the
330 // Mizobuchi FSA start with one - so we increase every state by 1.
331 // We also increase sigma by 1, so there are no 0 transitions.
Akron524c5432021-08-05 14:14:27 +0200332 inSym++
333 outSym++
334
Akron439f4ec2021-08-09 15:45:38 +0200335 // Only a limited list of transitions are allowed
Akron740f3d72021-08-03 12:12:34 +0200336 if inSym != outSym {
Akron01912fc2021-08-12 11:41:58 +0200337 if outSym == tok.tokenend && inSym == tok.epsilon {
Akron83e75a22021-08-04 13:14:06 +0200338 tokenend = true
339 } else if outSym == tok.epsilon {
340 nontoken = true
341 } else {
Akron527c10c2021-08-13 01:45:18 +0200342 log.Println(
Akron740f3d72021-08-03 12:12:34 +0200343 "Unsupported transition: " +
344 strconv.Itoa(state) +
345 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200346 " (" +
Akron740f3d72021-08-03 12:12:34 +0200347 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200348 ":" +
Akron740f3d72021-08-03 12:12:34 +0200349 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200350 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200351 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200352 ":" +
Akron740f3d72021-08-03 12:12:34 +0200353 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200354 ")")
Akron4db3ecf2021-08-11 18:49:03 +0200355 return nil
Akron75ebe7f2021-08-03 10:34:10 +0200356 }
Akron83e75a22021-08-04 13:14:06 +0200357 } else if inSym == tok.epsilon {
Akron527c10c2021-08-13 01:45:18 +0200358 log.Println("General epsilon transitions are not supported")
Akron4db3ecf2021-08-11 18:49:03 +0200359 return nil
Akron31f3c062021-08-27 10:15:13 +0200360 } else if tok.sigmaMCS[inSym] != "" {
361 log.Fatalln("Non supported character", tok.sigmaMCS[inSym])
362 } else if tok.sigmaMCS[outSym] != "" {
363 log.Fatalln("Non supported character", tok.sigmaMCS[outSym])
Akron8ef408b2021-08-02 22:11:04 +0200364 }
365
Akron03c92fe2021-08-09 14:07:57 +0200366 // Create an edge based on the collected information
Akron8ef408b2021-08-02 22:11:04 +0200367 targetObj := &edge{
Akron83e75a22021-08-04 13:14:06 +0200368 inSym: inSym,
369 outSym: outSym,
370 end: end + 1,
371 tokenend: tokenend,
372 nontoken: nontoken,
Akron8ef408b2021-08-02 22:11:04 +0200373 }
374
Akron740f3d72021-08-03 12:12:34 +0200375 // Initialize outgoing states
376 if tok.transitions[state+1] == nil {
377 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200378 }
379
Akron740f3d72021-08-03 12:12:34 +0200380 // Ignore transitions with invalid symbols
381 if inSym >= 0 {
382 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200383 }
Akron8ef408b2021-08-02 22:11:04 +0200384
Akron740f3d72021-08-03 12:12:34 +0200385 // Add final transition
386 if final == 1 {
Akron03c92fe2021-08-09 14:07:57 +0200387 // TODO:
388 // Maybe this is less relevant for tokenizers
Akronc17f1ca2021-08-03 19:47:27 +0200389 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200390 }
391
Akronb4bbb472021-08-09 11:49:38 +0200392 if DEBUG {
Akron740f3d72021-08-03 12:12:34 +0200393 fmt.Println("Add",
394 state+1, "->", end+1,
395 "(",
396 inSym,
397 ":",
398 outSym,
399 ") (",
400 string(tok.sigmaRev[inSym]),
401 ":",
402 string(tok.sigmaRev[outSym]),
Akron524c5432021-08-05 14:14:27 +0200403 ")",
404 ";",
405 "TE:", tokenend,
Akron3610f102021-08-08 14:13:25 +0200406 "NT:", nontoken,
407 "FIN:", final)
Akron740f3d72021-08-03 12:12:34 +0200408 }
Akron75ebe7f2021-08-03 10:34:10 +0200409
Akron8ef408b2021-08-02 22:11:04 +0200410 continue
411 }
412 case SIGMA:
413 {
414 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
415
416 // Turn string into sigma id
417 number, err := strconv.Atoi(elem[0])
418
Akron524c5432021-08-05 14:14:27 +0200419 // ID needs to be > 1
420 number++
421
Akron8ef408b2021-08-02 22:11:04 +0200422 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200423 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200424 return nil
Akron8ef408b2021-08-02 22:11:04 +0200425 }
426
Akron740f3d72021-08-03 12:12:34 +0200427 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200428
429 var symbol rune
430
431 // Read rune
432 if utf8.RuneCountInString(elem[1]) == 1 {
433 symbol = []rune(elem[1])[0]
434
Akron8ef408b2021-08-02 22:11:04 +0200435 } else if utf8.RuneCountInString(elem[1]) > 1 {
Akron439f4ec2021-08-09 15:45:38 +0200436
437 // Probably a MCS
Akron8ef408b2021-08-02 22:11:04 +0200438 switch elem[1] {
439 case "@_EPSILON_SYMBOL_@":
440 {
Akronc17f1ca2021-08-03 19:47:27 +0200441 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200442 }
443 case "@_UNKNOWN_SYMBOL_@":
444 {
Akronc17f1ca2021-08-03 19:47:27 +0200445 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200446 }
447
448 case "@_IDENTITY_SYMBOL_@":
449 {
Akronc17f1ca2021-08-03 19:47:27 +0200450 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200451 }
Akron03c92fe2021-08-09 14:07:57 +0200452
453 case "@_TOKEN_SYMBOL_@":
454 {
455 tok.tokenend = number
Akron03c92fe2021-08-09 14:07:57 +0200456 }
Akron8ef408b2021-08-02 22:11:04 +0200457 default:
Akron740f3d72021-08-03 12:12:34 +0200458 {
Akron31f3c062021-08-27 10:15:13 +0200459 // MCS not supported
460 tok.sigmaMCS[number] = line
Akron740f3d72021-08-03 12:12:34 +0200461 }
Akron8ef408b2021-08-02 22:11:04 +0200462 }
Akron439f4ec2021-08-09 15:45:38 +0200463 continue
Akron8ef408b2021-08-02 22:11:04 +0200464
Akron740f3d72021-08-03 12:12:34 +0200465 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200466 line, err = r.ReadString('\n')
467 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200468 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200469 return nil
Akron8ef408b2021-08-02 22:11:04 +0200470 }
471 if len(line) != 1 {
Akron31f3c062021-08-27 10:15:13 +0200472 // MCS not supported
473 tok.sigmaMCS[number] = line
474 continue
Akron8ef408b2021-08-02 22:11:04 +0200475 }
Akron03c92fe2021-08-09 14:07:57 +0200476 symbol = rune('\n')
Akron8ef408b2021-08-02 22:11:04 +0200477 }
478
Akron740f3d72021-08-03 12:12:34 +0200479 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200480 }
481 }
482 }
Akron31f3c062021-08-27 10:15:13 +0200483 tok.sigmaMCS = nil
Akron8ef408b2021-08-02 22:11:04 +0200484 return tok
485}
486
Akron64ffd9a2021-08-03 19:55:21 +0200487// Set alphabet A to the list of all symbols
488// outgoing from s
Akron439f4ec2021-08-09 15:45:38 +0200489func (tok *Tokenizer) getSet(s int, A *[]int) {
Akron64ffd9a2021-08-03 19:55:21 +0200490 for a := range tok.transitions[s] {
491 *A = append(*A, a)
492 }
493
494 // Not required, but simplifies bug hunting
Akron439f4ec2021-08-09 15:45:38 +0200495 // sort.Ints(*A)
Akron64ffd9a2021-08-03 19:55:21 +0200496}
497
Akron439f4ec2021-08-09 15:45:38 +0200498// ToDoubleArray turns the intermediate tokenizer representation
499// into a double array representation.
500//
501// This is based on Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200502func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
503
504 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +0200505 sigma: make(map[rune]int),
506 loadFactor: -1,
507 final: tok.final,
508 unknown: tok.unknown,
509 identity: tok.identity,
510 epsilon: tok.epsilon,
Akron03c92fe2021-08-09 14:07:57 +0200511 tokenend: tok.tokenend,
Akronf2120ca2021-08-03 16:26:41 +0200512 }
513
Akronf1a16502021-08-16 15:24:38 +0200514 dat.resize(dat.final)
515
Akronf2120ca2021-08-03 16:26:41 +0200516 for num, sym := range tok.sigmaRev {
Akronea46e8a2021-08-17 00:36:31 +0200517 if int(sym) < 256 {
518 dat.sigmaASCII[int(sym)] = num
519 }
Akronf2120ca2021-08-03 16:26:41 +0200520 dat.sigma[sym] = num
521 }
Akron8ef408b2021-08-02 22:11:04 +0200522
523 mark := 0
524 size := 0
Akron6f1c16c2021-08-17 10:45:42 +0200525 var base uint32
Akronde18e902021-08-27 09:34:12 +0200526 var atrans *edge
527 var s, s1 int
528 var t, t1 uint32
Akron8ef408b2021-08-02 22:11:04 +0200529
Akron439f4ec2021-08-09 15:45:38 +0200530 // Create a mapping from s (in Ms aka Intermediate FSA)
531 // to t (in Mt aka Double Array FSA)
Akron740f3d72021-08-03 12:12:34 +0200532 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200533
Akron439f4ec2021-08-09 15:45:38 +0200534 // Initialize with the start state
Akron8ef408b2021-08-02 22:11:04 +0200535 table[size] = &mapping{source: 1, target: 1}
536 size++
537
Akron740f3d72021-08-03 12:12:34 +0200538 // Allocate space for the outgoing symbol range
539 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200540
Akronde18e902021-08-27 09:34:12 +0200541 // TODO:
542 // Table lookup for the moment
543 // only gives a minor performance benefit.
544 // should be rewritten and should preplace the
545 // table all together.
546 // tableLookup := make(map[int]uint32)
547 // tableLookup[1] = 1
548
Akron8ef408b2021-08-02 22:11:04 +0200549 for mark < size {
Akronde18e902021-08-27 09:34:12 +0200550 s = table[mark].source // This is a state in Ms
551 t = table[mark].target // This is a state in Mt
Akron8ef408b2021-08-02 22:11:04 +0200552 mark++
Akron740f3d72021-08-03 12:12:34 +0200553
554 // Following the paper, here the state t can be remembered
555 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200556 A = A[:0]
Akron439f4ec2021-08-09 15:45:38 +0200557 tok.getSet(s, &A)
Akron8ef408b2021-08-02 22:11:04 +0200558
Akron740f3d72021-08-03 12:12:34 +0200559 // Set base to the first free slot in the double array
Akron6f1c16c2021-08-17 10:45:42 +0200560 base = dat.xCheck(A)
561 dat.array[t].setBase(base)
Akron8ef408b2021-08-02 22:11:04 +0200562
Akron773b1ef2021-08-03 17:37:20 +0200563 // TODO:
Akron068874c2021-08-04 15:19:56 +0200564 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200565 // outdegree of .end
566
Akron740f3d72021-08-03 12:12:34 +0200567 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200568 for _, a := range A {
569
Akronc17f1ca2021-08-03 19:47:27 +0200570 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200571
Akronde18e902021-08-27 09:34:12 +0200572 atrans = tok.transitions[s][a]
573
Akron740f3d72021-08-03 12:12:34 +0200574 // Aka g(s, a)
Akronde18e902021-08-27 09:34:12 +0200575 s1 = atrans.end
Akron8ef408b2021-08-02 22:11:04 +0200576
Akron740f3d72021-08-03 12:12:34 +0200577 // Store the transition
Akronde18e902021-08-27 09:34:12 +0200578 t1 = base + uint32(a)
Akronf1a16502021-08-16 15:24:38 +0200579 dat.array[t1].setCheck(t)
580
581 // Set maxSize
582 if dat.maxSize < int(t1) {
583 dat.maxSize = int(t1)
584 }
Akron8ef408b2021-08-02 22:11:04 +0200585
Akron439f4ec2021-08-09 15:45:38 +0200586 if DEBUG {
Akron524c5432021-08-05 14:14:27 +0200587 fmt.Println("Translate transition",
588 s, "->", s1, "(", a, ")", "to", t, "->", t1)
589 }
590
Akron83e75a22021-08-04 13:14:06 +0200591 // Mark the state as being the target of a nontoken transition
Akronde18e902021-08-27 09:34:12 +0200592 if atrans.nontoken {
Akronf1a16502021-08-16 15:24:38 +0200593 dat.array[t1].setNonToken(true)
Akron524c5432021-08-05 14:14:27 +0200594 if DEBUG {
595 fmt.Println("Set", t1, "to nontoken")
596 }
Akron83e75a22021-08-04 13:14:06 +0200597 }
598
Akron84d68e62021-08-04 17:06:52 +0200599 // Mark the state as being the target of a tokenend transition
Akronde18e902021-08-27 09:34:12 +0200600 if atrans.tokenend {
Akronf1a16502021-08-16 15:24:38 +0200601 dat.array[t1].setTokenEnd(true)
Akron524c5432021-08-05 14:14:27 +0200602 if DEBUG {
603 fmt.Println("Set", t1, "to tokenend")
604 }
Akron84d68e62021-08-04 17:06:52 +0200605 }
606
Akron740f3d72021-08-03 12:12:34 +0200607 // Check for representative states
Akron439f4ec2021-08-09 15:45:38 +0200608 r := stateAlreadyInTable(s1, table, size)
Akronde18e902021-08-27 09:34:12 +0200609 // r := tableLookup[s1]
Akron740f3d72021-08-03 12:12:34 +0200610
Akron439f4ec2021-08-09 15:45:38 +0200611 // No representative found
Akron8ef408b2021-08-02 22:11:04 +0200612 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200613 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200614 table[size] = &mapping{source: s1, target: t1}
Akronde18e902021-08-27 09:34:12 +0200615 // tableLookup[s1] = t1
Akron8ef408b2021-08-02 22:11:04 +0200616 size++
617 } else {
Akron740f3d72021-08-03 12:12:34 +0200618 // Overwrite with the representative state
Akronf1a16502021-08-16 15:24:38 +0200619 dat.array[t1].setBase(r)
620 dat.array[t1].setSeparate(true)
Akron8ef408b2021-08-02 22:11:04 +0200621 }
622 } else {
Akron740f3d72021-08-03 12:12:34 +0200623 // Store a final transition
Akron6f1c16c2021-08-17 10:45:42 +0200624 dat.array[base+uint32(dat.final)].setCheck(t)
Akronf1a16502021-08-16 15:24:38 +0200625
Akronde18e902021-08-27 09:34:12 +0200626 if dat.maxSize < int(base)+dat.final {
627 dat.maxSize = int(base) + dat.final
Akronf1a16502021-08-16 15:24:38 +0200628 }
Akron8ef408b2021-08-02 22:11:04 +0200629 }
630 }
631 }
632
633 // Following Mizobuchi et al (2000) the size of the
634 // FSA should be stored in check(1).
Akronf1a16502021-08-16 15:24:38 +0200635 // We make the size a bit smaller so we never have to check for boundaries.
Akron3fdfec62021-08-04 11:40:10 +0200636 dat.setSize(dat.maxSize + 1)
Akronf2120ca2021-08-03 16:26:41 +0200637 dat.array = dat.array[:dat.maxSize+1]
638 return dat
Akron8ef408b2021-08-02 22:11:04 +0200639}
640
Akron8ef408b2021-08-02 22:11:04 +0200641// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200642// exists and return this as a representative.
643// Currently iterates through the whole table
644// in a bruteforce manner.
Akron439f4ec2021-08-09 15:45:38 +0200645func stateAlreadyInTable(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200646 for x := 0; x < size; x++ {
647 if table[x].source == s {
648 return table[x].target
649 }
650 }
651 return 0
652}
653
Akron64ffd9a2021-08-03 19:55:21 +0200654// Resize double array when necessary
655func (dat *DaTokenizer) resize(l int) {
656 // TODO:
657 // This is a bit too aggressive atm and should be calmed down.
658 if len(dat.array) <= l {
Akronf1a16502021-08-16 15:24:38 +0200659 dat.array = append(dat.array, make([]bc, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200660 }
Akron64ffd9a2021-08-03 19:55:21 +0200661}
Akronc9d84a62021-08-03 15:56:03 +0200662
Akron64ffd9a2021-08-03 19:55:21 +0200663// Set base value in double array
Akronf1a16502021-08-16 15:24:38 +0200664func (bc *bc) setBase(v uint32) {
665 bc.base = v
Akron439f4ec2021-08-09 15:45:38 +0200666}
667
668// Get base value in double array
Akronf1a16502021-08-16 15:24:38 +0200669func (bc *bc) getBase() uint32 {
670 return bc.base & RESTBIT
Akron439f4ec2021-08-09 15:45:38 +0200671}
672
673// Set check value in double array
Akronf1a16502021-08-16 15:24:38 +0200674func (bc *bc) setCheck(v uint32) {
675 bc.check = v
Akron439f4ec2021-08-09 15:45:38 +0200676}
677
678// Get check value in double array
Akronf1a16502021-08-16 15:24:38 +0200679func (bc *bc) getCheck() uint32 {
680 return bc.check & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200681}
682
Akron3fdfec62021-08-04 11:40:10 +0200683// Returns true if a state is separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200684func (bc *bc) isSeparate() bool {
685 return bc.base&FIRSTBIT != 0
Akron3fdfec62021-08-04 11:40:10 +0200686}
687
688// Mark a state as separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200689func (bc *bc) setSeparate(sep bool) {
Akron3fdfec62021-08-04 11:40:10 +0200690 if sep {
Akronf1a16502021-08-16 15:24:38 +0200691 bc.base |= FIRSTBIT
Akron3fdfec62021-08-04 11:40:10 +0200692 } else {
Akronf1a16502021-08-16 15:24:38 +0200693 bc.base &= (RESTBIT | SECONDBIT)
Akron3fdfec62021-08-04 11:40:10 +0200694 }
695}
696
Akron83e75a22021-08-04 13:14:06 +0200697// Returns true if a state is the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200698func (bc *bc) isNonToken() bool {
699 return bc.check&FIRSTBIT != 0
Akron83e75a22021-08-04 13:14:06 +0200700}
701
702// Mark a state as being the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200703func (bc *bc) setNonToken(sep bool) {
Akron83e75a22021-08-04 13:14:06 +0200704 if sep {
Akronf1a16502021-08-16 15:24:38 +0200705 bc.check |= FIRSTBIT
Akron83e75a22021-08-04 13:14:06 +0200706 } else {
Akronf1a16502021-08-16 15:24:38 +0200707 bc.check &= (RESTBIT | SECONDBIT)
Akron83e75a22021-08-04 13:14:06 +0200708 }
709}
710
Akron84d68e62021-08-04 17:06:52 +0200711// Returns true if a state is the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200712func (bc *bc) isTokenEnd() bool {
713 return bc.check&SECONDBIT != 0
Akron84d68e62021-08-04 17:06:52 +0200714}
715
716// Mark a state as being the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200717func (bc *bc) setTokenEnd(sep bool) {
Akron84d68e62021-08-04 17:06:52 +0200718 if sep {
Akronf1a16502021-08-16 15:24:38 +0200719 bc.check |= SECONDBIT
Akron84d68e62021-08-04 17:06:52 +0200720 } else {
Akronf1a16502021-08-16 15:24:38 +0200721 bc.check &= (RESTBIT | FIRSTBIT)
Akron84d68e62021-08-04 17:06:52 +0200722 }
723}
724
Akron64ffd9a2021-08-03 19:55:21 +0200725// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200726func (dat *DaTokenizer) setSize(v int) {
Akronf1a16502021-08-16 15:24:38 +0200727 dat.array[1].setCheck(uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200728}
729
730// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200731func (dat *DaTokenizer) GetSize() int {
Akronf1a16502021-08-16 15:24:38 +0200732 return int(dat.array[1].getCheck())
Akron8ef408b2021-08-02 22:11:04 +0200733}
734
735// Based on Mizobuchi et al (2000), p. 124
736// This iterates for every state through the complete double array
737// structure until it finds a gap that fits all outgoing transitions
738// of the state. This is extremely slow, but is only necessary in the
739// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200740func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200741
742 // Start at the first entry of the double array list
Akron6f1c16c2021-08-17 10:45:42 +0200743 base := uint32(1)
744
Akron8ef408b2021-08-02 22:11:04 +0200745OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200746 // Resize the array if necessary
Akronf1a16502021-08-16 15:24:38 +0200747 dat.resize(int(base) + dat.final)
Akron8ef408b2021-08-02 22:11:04 +0200748 for _, a := range symbols {
Akronf1a16502021-08-16 15:24:38 +0200749 if dat.array[int(base)+a].getCheck() != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200750 base++
751 goto OVERLAP
752 }
753 }
Akron8ef408b2021-08-02 22:11:04 +0200754 return base
755}
756
Akron3610f102021-08-08 14:13:25 +0200757// List all outgoing transitions for a state
758// for testing purposes
759func (dat *DaTokenizer) outgoing(t uint32) []int {
760
761 valid := make([]int, 0, len(dat.sigma))
762
763 for _, a := range dat.sigma {
Akronf1a16502021-08-16 15:24:38 +0200764 t1 := dat.array[t].getBase() + uint32(a)
765 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200766 valid = append(valid, a)
767 }
768 }
769
770 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
Akronf1a16502021-08-16 15:24:38 +0200771 t1 := dat.array[t].getBase() + uint32(a)
772 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200773 valid = append(valid, -1*a)
774 }
775 }
776
777 sort.Ints(valid)
778
779 return valid
780}
781
Akron03a3c612021-08-04 11:51:27 +0200782// LoadFactor as defined in Kanda et al (2018),
783// i.e. the proportion of non-empty elements to all elements.
784func (dat *DaTokenizer) LoadFactor() float64 {
Akrond66a9262021-08-03 17:09:09 +0200785
Akron03a3c612021-08-04 11:51:27 +0200786 // Cache the loadfactor
Akron3f8571a2021-08-05 11:18:10 +0200787 if dat.loadFactor > 0 {
Akron03a3c612021-08-04 11:51:27 +0200788 return dat.loadFactor
Akron773b1ef2021-08-03 17:37:20 +0200789 }
Akrond66a9262021-08-03 17:09:09 +0200790 nonEmpty := 0
791 all := len(dat.array) / 2
Akronf1a16502021-08-16 15:24:38 +0200792 for x := 1; x < len(dat.array); x++ {
793 if dat.array[x].getBase() != 0 {
Akrond66a9262021-08-03 17:09:09 +0200794 nonEmpty++
795 }
796 }
Akron03a3c612021-08-04 11:51:27 +0200797 dat.loadFactor = float64(nonEmpty) / float64(all) * 100
798 return dat.loadFactor
Akrond66a9262021-08-03 17:09:09 +0200799}
800
Akron439f4ec2021-08-09 15:45:38 +0200801// Save stores the double array data in a file
Akron3a063ef2021-08-05 19:36:35 +0200802func (dat *DaTokenizer) Save(file string) (n int64, err error) {
803 f, err := os.Create(file)
804 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200805 log.Println(err)
Akron439f4ec2021-08-09 15:45:38 +0200806 return 0, err
Akron3a063ef2021-08-05 19:36:35 +0200807 }
808 defer f.Close()
809 gz := gzip.NewWriter(f)
810 defer gz.Close()
811 n, err = dat.WriteTo(gz)
812 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200813 log.Println(err)
Akron3a063ef2021-08-05 19:36:35 +0200814 return n, err
815 }
816 gz.Flush()
817 return n, nil
818}
819
820// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200821func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
822
Akron3a063ef2021-08-05 19:36:35 +0200823 wb := bufio.NewWriter(w)
824 defer wb.Flush()
825
Akron6247a5d2021-08-03 19:18:28 +0200826 // Store magical header
Akron3a063ef2021-08-05 19:36:35 +0200827 all, err := wb.Write([]byte(MAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200828 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200829 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200830 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200831 }
832
833 // Get sigma as a list
834 sigmalist := make([]rune, len(dat.sigma)+16)
835 max := 0
836 for sym, num := range dat.sigma {
837 sigmalist[num] = sym
838 if num > max {
839 max = num
840 }
841 }
842
843 sigmalist = sigmalist[:max+1]
844
Akron3f8571a2021-08-05 11:18:10 +0200845 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200846 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200847 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
848 bo.PutUint16(buf[4:6], uint16(dat.unknown))
849 bo.PutUint16(buf[6:8], uint16(dat.identity))
850 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200851 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akronf1a16502021-08-16 15:24:38 +0200852 bo.PutUint32(buf[12:16], uint32(len(dat.array)*2)) // Legacy support
Akron3a063ef2021-08-05 19:36:35 +0200853 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200854 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200855 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200856 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200857 }
858
859 all += more
860
Akron6247a5d2021-08-03 19:18:28 +0200861 // Write sigma
862 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200863
Akron3a063ef2021-08-05 19:36:35 +0200864 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200865 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200866 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200867 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200868 }
869 all += more
870 }
Akron439f4ec2021-08-09 15:45:38 +0200871
Akron6247a5d2021-08-03 19:18:28 +0200872 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200873 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200874 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200875 }
Akron6247a5d2021-08-03 19:18:28 +0200876
877 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200878 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200879 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200880 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200881 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200882 }
883 all += more
884
Akronf1a16502021-08-16 15:24:38 +0200885 // for x := 0; x < len(dat.array); x++ {
886 for _, bc := range dat.array {
887 bo.PutUint32(buf[0:4], bc.base)
888 more, err = wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200889 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200890 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200891 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200892 }
Akron439f4ec2021-08-09 15:45:38 +0200893 all += more
Akron3a063ef2021-08-05 19:36:35 +0200894 if more != 4 {
Akronf1a16502021-08-16 15:24:38 +0200895 log.Println("Can not write base uint32")
896 return int64(all), err
897 }
898 bo.PutUint32(buf[0:4], bc.check)
899 more, err = wb.Write(buf[0:4])
900 if err != nil {
901 log.Println(err)
902 return int64(all), err
903 }
904 all += more
905 if more != 4 {
906 log.Println("Can not write check uint32")
Akron3a063ef2021-08-05 19:36:35 +0200907 return int64(all), err
908 }
Akron6247a5d2021-08-03 19:18:28 +0200909 }
910
911 return int64(all), err
912}
913
Akron439f4ec2021-08-09 15:45:38 +0200914// LoadDatokFile reads a double array represented tokenizer
915// from a file.
Akron3f8571a2021-08-05 11:18:10 +0200916func LoadDatokFile(file string) *DaTokenizer {
917 f, err := os.Open(file)
918 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200919 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200920 return nil
Akron3f8571a2021-08-05 11:18:10 +0200921 }
922 defer f.Close()
923
924 gz, err := gzip.NewReader(f)
925 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200926 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200927 return nil
Akron3f8571a2021-08-05 11:18:10 +0200928 }
929 defer gz.Close()
930
Akron3a063ef2021-08-05 19:36:35 +0200931 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +0200932 return ParseDatok(gz)
933}
934
Akron439f4ec2021-08-09 15:45:38 +0200935// LoadDatokFile reads a double array represented tokenizer
936// from an io.Reader
Akron3f8571a2021-08-05 11:18:10 +0200937func ParseDatok(ior io.Reader) *DaTokenizer {
938
Akron439f4ec2021-08-09 15:45:38 +0200939 // Initialize tokenizer with default values
Akron3f8571a2021-08-05 11:18:10 +0200940 dat := &DaTokenizer{
941 sigma: make(map[rune]int),
942 epsilon: 0,
943 unknown: 0,
944 identity: 0,
945 final: 0,
946 loadFactor: 0,
947 }
948
949 r := bufio.NewReader(ior)
950
Akron3f8571a2021-08-05 11:18:10 +0200951 buf := make([]byte, 1024)
952 buf = buf[0:len(MAGIC)]
953
Akron439f4ec2021-08-09 15:45:38 +0200954 _, err := r.Read(buf)
Akron3f8571a2021-08-05 11:18:10 +0200955
956 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200957 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200958 return nil
959 }
960
Akron3f8571a2021-08-05 11:18:10 +0200961 if string(MAGIC) != string(buf) {
Akron527c10c2021-08-13 01:45:18 +0200962 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200963 return nil
964 }
965
Akron439f4ec2021-08-09 15:45:38 +0200966 more, err := io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +0200967 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200968 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200969 return nil
970 }
971
Akron439f4ec2021-08-09 15:45:38 +0200972 if more != 16 {
Akron527c10c2021-08-13 01:45:18 +0200973 log.Println("Read bytes do not fit")
Akron439f4ec2021-08-09 15:45:38 +0200974 return nil
975 }
Akron3f8571a2021-08-05 11:18:10 +0200976
Akron3a063ef2021-08-05 19:36:35 +0200977 version := bo.Uint16(buf[0:2])
978
979 if version != VERSION {
Akron527c10c2021-08-13 01:45:18 +0200980 log.Println("Version not compatible")
Akron3a063ef2021-08-05 19:36:35 +0200981 return nil
982 }
983
Akron3f8571a2021-08-05 11:18:10 +0200984 dat.epsilon = int(bo.Uint16(buf[2:4]))
985 dat.unknown = int(bo.Uint16(buf[4:6]))
986 dat.identity = int(bo.Uint16(buf[6:8]))
987 dat.final = int(bo.Uint16(buf[8:10]))
988
989 sigmaCount := int(bo.Uint16(buf[10:12]))
Akronf1a16502021-08-16 15:24:38 +0200990 arraySize := int(bo.Uint32(buf[12:16])) / 2 // Legacy support
Akron3f8571a2021-08-05 11:18:10 +0200991
Akron3a063ef2021-08-05 19:36:35 +0200992 // Shouldn't be relevant though
993 dat.maxSize = arraySize - 1
994
Akron3f8571a2021-08-05 11:18:10 +0200995 for x := 0; x < sigmaCount; x++ {
Akron439f4ec2021-08-09 15:45:38 +0200996 sym, _, err := r.ReadRune()
Akron3f8571a2021-08-05 11:18:10 +0200997 if err == nil && sym != 0 {
Akronea46e8a2021-08-17 00:36:31 +0200998 if int(sym) < 256 {
999 dat.sigmaASCII[int(sym)] = x
1000 }
Akron3f8571a2021-08-05 11:18:10 +02001001 dat.sigma[sym] = x
1002 }
Akron3f8571a2021-08-05 11:18:10 +02001003 }
1004
Akron439f4ec2021-08-09 15:45:38 +02001005 _, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +02001006
1007 if err != nil {
Akron527c10c2021-08-13 01:45:18 +02001008 log.Print(err)
Akron3f8571a2021-08-05 11:18:10 +02001009 return nil
1010 }
1011
Akron3f8571a2021-08-05 11:18:10 +02001012 if string("T") != string(buf[0:1]) {
Akron527c10c2021-08-13 01:45:18 +02001013 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +02001014 return nil
1015 }
1016
1017 // Read based on length
Akronf1a16502021-08-16 15:24:38 +02001018 dat.array = make([]bc, arraySize)
Akron3f8571a2021-08-05 11:18:10 +02001019
Akronbb4aac52021-08-13 00:52:27 +02001020 dataArray, err := io.ReadAll(r)
Akron439f4ec2021-08-09 15:45:38 +02001021
Akronbb4aac52021-08-13 00:52:27 +02001022 if err == io.EOF {
Akron527c10c2021-08-13 01:45:18 +02001023 log.Println(err)
Akronbb4aac52021-08-13 00:52:27 +02001024 return nil
1025 }
1026
Akronf1a16502021-08-16 15:24:38 +02001027 if len(dataArray) < arraySize*8 {
Akron527c10c2021-08-13 01:45:18 +02001028 log.Println("Not enough bytes read")
Akronbb4aac52021-08-13 00:52:27 +02001029 return nil
1030 }
1031
1032 for x := 0; x < arraySize; x++ {
Akronf1a16502021-08-16 15:24:38 +02001033 dat.array[x].base = bo.Uint32(dataArray[x*8 : (x*8)+4])
1034 dat.array[x].check = bo.Uint32(dataArray[(x*8)+4 : (x*8)+8])
Akron3f8571a2021-08-05 11:18:10 +02001035 }
1036
1037 return dat
1038}
1039
Akron439f4ec2021-08-09 15:45:38 +02001040// Show the current state of the buffer,
1041// for testing puroses
Akron3610f102021-08-08 14:13:25 +02001042func showBuffer(buffer []rune, buffo int, buffi int) string {
1043 out := make([]rune, 0, 1024)
1044 for x := 0; x < len(buffer); x++ {
1045 if buffi == x {
1046 out = append(out, '^')
1047 }
1048 if buffo == x {
1049 out = append(out, '[', buffer[x], ']')
1050 } else {
1051 out = append(out, buffer[x])
1052 }
1053 }
1054 return string(out)
1055}
1056
Akron84d68e62021-08-04 17:06:52 +02001057// Transduce an input string against the double array
Akron3610f102021-08-08 14:13:25 +02001058// FSA. The rules are always greedy. If the automaton fails,
1059// it takes the last possible token ending branch.
Akron068874c2021-08-04 15:19:56 +02001060//
Akron4db3ecf2021-08-11 18:49:03 +02001061// Based on Mizobuchi et al (2000), p. 129,
1062// with additional support for IDENTITY, UNKNOWN
1063// and EPSILON transitions and NONTOKEN and TOKENEND handling.
Akron3f8571a2021-08-05 11:18:10 +02001064func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron068874c2021-08-04 15:19:56 +02001065 var a int
Akronb4bbb472021-08-09 11:49:38 +02001066 var t0 uint32
Akronb7e1f132021-08-10 11:52:31 +02001067 t := uint32(1) // Initial state
1068 var ok, rewindBuffer bool
Akron068874c2021-08-04 15:19:56 +02001069
Akron3610f102021-08-08 14:13:25 +02001070 // Remember the last position of a possible tokenend,
1071 // in case the automaton fails.
1072 epsilonState := uint32(0)
1073 epsilonOffset := 0
1074
1075 // Implement a low level buffer for full control,
1076 // however - it is probably better to introduce
1077 // this on a higher level with a io.Reader interface
1078 // The buffer stores a single word and may have white
1079 // space at the end (but not at the beginning).
1080 //
1081 // This is the only backtracking requirement because of
1082 // epsilon transitions, to support tokenizations like:
1083 // "this is an example|.| And it works." vs
1084 // "this is an example.com| application."
Akronb7e1f132021-08-10 11:52:31 +02001085 //
1086 // TODO:
1087 // Store a translation buffer as well, so characters don't
1088 // have to be translated multiple times!
Akron3610f102021-08-08 14:13:25 +02001089 buffer := make([]rune, 1024)
1090 buffo := 0 // Buffer offset
1091 buffi := 0 // Buffer length
1092
Akron3f8571a2021-08-05 11:18:10 +02001093 reader := bufio.NewReader(r)
1094 writer := bufio.NewWriter(w)
1095 defer writer.Flush()
Akron068874c2021-08-04 15:19:56 +02001096
Akron3f8571a2021-08-05 11:18:10 +02001097 var char rune
1098 var err error
1099 eof := false
Akronb7e1f132021-08-10 11:52:31 +02001100 newchar := true
Akron3f8571a2021-08-05 11:18:10 +02001101
Akronc5d8d432021-08-10 16:48:44 +02001102PARSECHAR:
Akron3f8571a2021-08-05 11:18:10 +02001103 for {
1104
Akronb7e1f132021-08-10 11:52:31 +02001105 if newchar {
1106 // Get from reader if buffer is empty
1107 if buffo >= buffi {
Akron1594cb82021-08-11 11:14:56 +02001108 if eof {
1109 break
1110 }
Akronb7e1f132021-08-10 11:52:31 +02001111 char, _, err = reader.ReadRune()
Akron439f4ec2021-08-09 15:45:38 +02001112
Akronb7e1f132021-08-10 11:52:31 +02001113 // No more runes to read
1114 if err != nil {
1115 eof = true
1116 break
1117 }
1118 buffer[buffi] = char
1119 buffi++
Akron3f8571a2021-08-05 11:18:10 +02001120 }
Akronb7e1f132021-08-10 11:52:31 +02001121
1122 char = buffer[buffo]
1123
1124 if DEBUG {
1125 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
1126 }
1127
Akron6f1c16c2021-08-17 10:45:42 +02001128 // TODO:
1129 // Better not repeatedly check for a!
1130 // Possibly keep a buffer with a.
Akronea46e8a2021-08-17 00:36:31 +02001131 if int(char) < 256 {
1132 a = dat.sigmaASCII[int(char)]
1133 } else {
1134 a, ok = dat.sigma[char]
1135 if !ok {
1136 a = 0
1137 }
1138 }
Akronb7e1f132021-08-10 11:52:31 +02001139
1140 // Use identity symbol if character is not in sigma
Akronea46e8a2021-08-17 00:36:31 +02001141 if a == 0 && dat.identity != -1 {
Akronb7e1f132021-08-10 11:52:31 +02001142 a = dat.identity
1143 }
1144
1145 t0 = t
1146
1147 // Check for epsilon transitions and remember
Akronf1a16502021-08-16 15:24:38 +02001148 if dat.array[dat.array[t0].getBase()+uint32(dat.epsilon)].getCheck() == t0 {
Akronb7e1f132021-08-10 11:52:31 +02001149 // Remember state for backtracking to last tokenend state
1150 epsilonState = t0
1151 epsilonOffset = buffo
1152 }
Akron3f8571a2021-08-05 11:18:10 +02001153 }
Akron3610f102021-08-08 14:13:25 +02001154
Akronb7e1f132021-08-10 11:52:31 +02001155 // Checks a transition based on t0, a and buffo
Akronf1a16502021-08-16 15:24:38 +02001156 t = dat.array[t0].getBase() + uint32(a)
1157 ta := dat.array[t]
Akron068874c2021-08-04 15:19:56 +02001158
Akron524c5432021-08-05 14:14:27 +02001159 if DEBUG {
Akronb7e1f132021-08-10 11:52:31 +02001160 // Char is only relevant if set
1161 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
1162 if false {
1163 fmt.Println(dat.outgoing(t0))
1164 }
Akron524c5432021-08-05 14:14:27 +02001165 }
1166
Akronb7e1f132021-08-10 11:52:31 +02001167 // Check if the transition is invalid according to the double array
Akronf1a16502021-08-16 15:24:38 +02001168 if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
Akron068874c2021-08-04 15:19:56 +02001169
1170 if DEBUG {
Akronf1a16502021-08-16 15:24:38 +02001171 fmt.Println("Match is not fine!", t, "and", ta.getCheck(), "vs", t0)
Akron068874c2021-08-04 15:19:56 +02001172 }
1173
1174 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +02001175
Akron068874c2021-08-04 15:19:56 +02001176 // Try again with unknown symbol, in case identity failed
Akronb7e1f132021-08-10 11:52:31 +02001177 // Char is only relevant when set
Akron068874c2021-08-04 15:19:56 +02001178 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001179 fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +02001180 }
1181 a = dat.unknown
1182
1183 } else if a != dat.epsilon {
Akronb4bbb472021-08-09 11:49:38 +02001184
Akron068874c2021-08-04 15:19:56 +02001185 // Try again with epsilon symbol, in case everything else failed
Akronb4bbb472021-08-09 11:49:38 +02001186 t0 = epsilonState
Akron3610f102021-08-08 14:13:25 +02001187 epsilonState = 0 // reset
1188 buffo = epsilonOffset
Akron439f4ec2021-08-09 15:45:38 +02001189 a = dat.epsilon
1190
Akron3610f102021-08-08 14:13:25 +02001191 if DEBUG {
1192 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
1193 }
Akronb4bbb472021-08-09 11:49:38 +02001194
Akron068874c2021-08-04 15:19:56 +02001195 } else {
1196 break
1197 }
Akron068874c2021-08-04 15:19:56 +02001198
Akronb7e1f132021-08-10 11:52:31 +02001199 newchar = false
1200 continue
Akronb4bbb472021-08-09 11:49:38 +02001201 }
1202
Akronb7e1f132021-08-10 11:52:31 +02001203 // Transition was successful
1204 rewindBuffer = false
Akron439f4ec2021-08-09 15:45:38 +02001205
1206 // Transition consumes a character
Akronb4bbb472021-08-09 11:49:38 +02001207 if a != dat.epsilon {
1208
Akron3610f102021-08-08 14:13:25 +02001209 buffo++
Akronb4bbb472021-08-09 11:49:38 +02001210
Akron439f4ec2021-08-09 15:45:38 +02001211 // Transition does not produce a character
Akronf1a16502021-08-16 15:24:38 +02001212 if buffo == 1 && ta.isNonToken() {
Akron3610f102021-08-08 14:13:25 +02001213 if DEBUG {
1214 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
1215 }
Akron439f4ec2021-08-09 15:45:38 +02001216 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001217 }
Akron3f8571a2021-08-05 11:18:10 +02001218 }
Akron068874c2021-08-04 15:19:56 +02001219
Akronc5d8d432021-08-10 16:48:44 +02001220 // Transition marks the end of a token - so flush the buffer
Akronf1a16502021-08-16 15:24:38 +02001221 if ta.isTokenEnd() {
Akron524c5432021-08-05 14:14:27 +02001222
Akronc5d8d432021-08-10 16:48:44 +02001223 if buffi > 0 {
Akronc5d8d432021-08-10 16:48:44 +02001224 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +02001225 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
Akronc5d8d432021-08-10 16:48:44 +02001226 }
Akron01912fc2021-08-12 11:41:58 +02001227 writer.WriteString(string(buffer[:buffo]))
Akronc5d8d432021-08-10 16:48:44 +02001228 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001229 }
Akron1594cb82021-08-11 11:14:56 +02001230 if DEBUG {
1231 fmt.Println("-> Newline")
1232 }
1233 writer.WriteRune('\n')
Akron439f4ec2021-08-09 15:45:38 +02001234 }
Akron3610f102021-08-08 14:13:25 +02001235
Akronc5d8d432021-08-10 16:48:44 +02001236 // Rewind the buffer if necessary
Akron439f4ec2021-08-09 15:45:38 +02001237 if rewindBuffer {
1238
1239 // TODO: Better as a ring buffer
Akron3610f102021-08-08 14:13:25 +02001240 for x, i := range buffer[buffo:buffi] {
1241 buffer[x] = i
1242 }
Akronb4bbb472021-08-09 11:49:38 +02001243
Akron3610f102021-08-08 14:13:25 +02001244 buffi -= buffo
1245 epsilonOffset -= buffo
1246 buffo = 0
1247 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001248 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
Akron3610f102021-08-08 14:13:25 +02001249 }
Akron84d68e62021-08-04 17:06:52 +02001250 }
1251
Akronb7e1f132021-08-10 11:52:31 +02001252 // Move to representative state
Akronf1a16502021-08-16 15:24:38 +02001253 if ta.isSeparate() {
1254 t = ta.getBase()
1255 ta = dat.array[t]
Akronb7e1f132021-08-10 11:52:31 +02001256
1257 if DEBUG {
1258 fmt.Println("Representative pointing to", t)
1259 }
1260 }
1261
Akronc5d8d432021-08-10 16:48:44 +02001262 newchar = true
1263
Akron068874c2021-08-04 15:19:56 +02001264 // TODO:
1265 // Prevent endless epsilon loops!
1266 }
1267
Akron439f4ec2021-08-09 15:45:38 +02001268 // Input reader is not yet finished
Akron3f8571a2021-08-05 11:18:10 +02001269 if !eof {
Akron068874c2021-08-04 15:19:56 +02001270 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001271 fmt.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +02001272 }
1273 return false
1274 }
1275
Akronb7e1f132021-08-10 11:52:31 +02001276 if DEBUG {
1277 fmt.Println("Entering final check")
1278 }
1279
Akronc5d8d432021-08-10 16:48:44 +02001280 // Automaton is in a final state, so flush the buffer and return
Akronf1a16502021-08-16 15:24:38 +02001281 x := dat.array[t].getBase() + uint32(dat.final)
1282
1283 if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
Akronb4bbb472021-08-09 11:49:38 +02001284
1285 if buffi > 0 {
Akronb4bbb472021-08-09 11:49:38 +02001286 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +02001287 fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
Akron3f8571a2021-08-05 11:18:10 +02001288 }
Akron01912fc2021-08-12 11:41:58 +02001289 writer.WriteString(string(buffer[:buffi]))
Akron6e70dc82021-08-11 11:33:18 +02001290
Akronf1a16502021-08-16 15:24:38 +02001291 if dat.array[t].isTokenEnd() {
Akrondf0a3ef2021-08-09 15:53:45 +02001292 writer.WriteRune('\n')
Akronc5d8d432021-08-10 16:48:44 +02001293 if DEBUG {
1294 fmt.Println("-> Newline")
1295 }
Akrondf0a3ef2021-08-09 15:53:45 +02001296 }
Akron84d68e62021-08-04 17:06:52 +02001297 }
1298
Akron6e70dc82021-08-11 11:33:18 +02001299 // Add an additional sentence ending, if the file is over but no explicit
1300 // sentence split was reached. This may be controversial and therefore
1301 // optional via parameter.
Akronf1a16502021-08-16 15:24:38 +02001302 if !dat.array[t0].isTokenEnd() {
Akron6e70dc82021-08-11 11:33:18 +02001303 writer.WriteRune('\n')
1304 if DEBUG {
1305 fmt.Println("-> Newline")
1306 }
1307 }
1308
Akrone61380b2021-08-16 10:10:46 +02001309 // TODO:
1310 // There may be a new line at the end, from an epsilon,
1311 // so we may need to go on!
Akron068874c2021-08-04 15:19:56 +02001312 return true
1313 }
1314
Akronc5d8d432021-08-10 16:48:44 +02001315 // Check epsilon transitions until a final state is reached
1316 t0 = t
Akronf1a16502021-08-16 15:24:38 +02001317 t = dat.array[t0].getBase() + uint32(dat.epsilon)
Akron01912fc2021-08-12 11:41:58 +02001318 a = dat.epsilon
1319 newchar = false
Akronf1a16502021-08-16 15:24:38 +02001320 if dat.array[t].getCheck() == t0 {
Akronc5d8d432021-08-10 16:48:44 +02001321 // Remember state for backtracking to last tokenend state
Akronc5d8d432021-08-10 16:48:44 +02001322 goto PARSECHAR
Akrone61380b2021-08-16 10:10:46 +02001323
Akronc5d8d432021-08-10 16:48:44 +02001324 } else if epsilonState != 0 {
Akronb7e1f132021-08-10 11:52:31 +02001325 t0 = epsilonState
1326 epsilonState = 0 // reset
1327 buffo = epsilonOffset
Akron068874c2021-08-04 15:19:56 +02001328 if DEBUG {
Akronc5d8d432021-08-10 16:48:44 +02001329 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
Akron068874c2021-08-04 15:19:56 +02001330 }
Akronc5d8d432021-08-10 16:48:44 +02001331 goto PARSECHAR
Akron068874c2021-08-04 15:19:56 +02001332 }
Akronc5d8d432021-08-10 16:48:44 +02001333 return false
Akron068874c2021-08-04 15:19:56 +02001334}