blob: 96243f1fdd4849abda80275626259cc12d93a4ce [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
Akron03c92fe2021-08-09 14:07:57 +020013//
14// Serialization is little endian.
Akron83e75a22021-08-04 13:14:06 +020015
Akron740f3d72021-08-03 12:12:34 +020016// TODO:
17// - replace maxSize with the check value
Akron3a063ef2021-08-05 19:36:35 +020018// - Add checksum to serialization.
Akrone61380b2021-08-16 10:10:46 +020019// - Replace/Enhance table with a map
20// - Provide a bufio.Scanner compatible interface.
Akron8e1d69b2021-08-12 17:38:49 +020021// - Mark epsilon transitions in bytes
Akron740f3d72021-08-03 12:12:34 +020022
Akron8ef408b2021-08-02 22:11:04 +020023import (
24 "bufio"
25 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020026 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020027 "fmt"
28 "io"
29 "os"
Akronc9d84a62021-08-03 15:56:03 +020030 "sort"
Akron8ef408b2021-08-02 22:11:04 +020031 "strconv"
32 "strings"
33 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020034
Akron527c10c2021-08-13 01:45:18 +020035 "log"
Akron8ef408b2021-08-02 22:11:04 +020036)
37
38const (
Akron2a4b9292021-08-04 15:35:22 +020039 PROPS = 1
40 SIGMA = 2
41 STATES = 3
42 NONE = 4
Akron2a4b9292021-08-04 15:35:22 +020043 DEBUG = false
44 MAGIC = "DATOK"
45 VERSION = uint16(1)
Akron03c92fe2021-08-09 14:07:57 +020046 FIRSTBIT uint32 = 1 << 31
47 SECONDBIT uint32 = 1 << 30
48 RESTBIT uint32 = ^uint32(0) &^ (FIRSTBIT | SECONDBIT)
Akron8ef408b2021-08-02 22:11:04 +020049)
50
Akron03c92fe2021-08-09 14:07:57 +020051// Serialization is always little endian
Akron6247a5d2021-08-03 19:18:28 +020052var bo binary.ByteOrder = binary.LittleEndian
53
Akron8ef408b2021-08-02 22:11:04 +020054type mapping struct {
55 source int
Akron3fdfec62021-08-04 11:40:10 +020056 target uint32
Akron8ef408b2021-08-02 22:11:04 +020057}
58
59type edge struct {
Akron83e75a22021-08-04 13:14:06 +020060 inSym int
61 outSym int
62 end int
63 nontoken bool
64 tokenend bool
Akron8ef408b2021-08-02 22:11:04 +020065}
66
Akron03c92fe2021-08-09 14:07:57 +020067// Tokenizer is the intermediate representation
68// of the tokenizer.
Akron8ef408b2021-08-02 22:11:04 +020069type Tokenizer struct {
Akron740f3d72021-08-03 12:12:34 +020070 sigmaRev map[int]rune
Akron31f3c062021-08-27 10:15:13 +020071 sigmaMCS map[int]string
Akron740f3d72021-08-03 12:12:34 +020072 arcCount int
Akron740f3d72021-08-03 12:12:34 +020073 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020074 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020075
76 // Special symbols in sigma
77 epsilon int
78 unknown int
79 identity int
80 final int
Akron03c92fe2021-08-09 14:07:57 +020081 tokenend int
Akron8ef408b2021-08-02 22:11:04 +020082}
83
Akronf1a16502021-08-16 15:24:38 +020084type bc struct {
85 base uint32
86 check uint32
87}
88
Akron03c92fe2021-08-09 14:07:57 +020089// DaTokenizer represents a tokenizer implemented as a
90// Double Array FSA.
Akronf2120ca2021-08-03 16:26:41 +020091type DaTokenizer struct {
Akronea46e8a2021-08-17 00:36:31 +020092 sigma map[rune]int
93 sigmaASCII [256]int
Akron03a3c612021-08-04 11:51:27 +020094 maxSize int
Akron4fa28b32021-08-27 10:55:41 +020095 transCount int
Akronf1a16502021-08-16 15:24:38 +020096 array []bc
Akronc17f1ca2021-08-03 19:47:27 +020097
98 // Special symbols in sigma
99 epsilon int
100 unknown int
101 identity int
102 final int
Akron03c92fe2021-08-09 14:07:57 +0200103 tokenend int
Akronf2120ca2021-08-03 16:26:41 +0200104}
105
Akron03c92fe2021-08-09 14:07:57 +0200106// ParseFoma reads the FST from a foma file
107// and creates an internal representation,
108// in case it follows the tokenizer's convention.
Akron64ffd9a2021-08-03 19:55:21 +0200109func LoadFomaFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200110 f, err := os.Open(file)
111 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200112 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200113 return nil
Akron8ef408b2021-08-02 22:11:04 +0200114 }
115 defer f.Close()
116
117 gz, err := gzip.NewReader(f)
118 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200119 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200120 return nil
Akron8ef408b2021-08-02 22:11:04 +0200121 }
122 defer gz.Close()
123
Akron3fdfec62021-08-04 11:40:10 +0200124 return ParseFoma(gz)
Akron8ef408b2021-08-02 22:11:04 +0200125}
126
Akron03c92fe2021-08-09 14:07:57 +0200127// ParseFoma reads the FST from a foma file reader
128// and creates an internal representation,
129// in case it follows the tokenizer's convention.
Akron3fdfec62021-08-04 11:40:10 +0200130func ParseFoma(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200131 r := bufio.NewReader(ior)
132
133 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200134 sigmaRev: make(map[int]rune),
Akron31f3c062021-08-27 10:15:13 +0200135 sigmaMCS: make(map[int]string),
Akronc17f1ca2021-08-03 19:47:27 +0200136 epsilon: -1,
137 unknown: -1,
138 identity: -1,
139 final: -1,
Akron03c92fe2021-08-09 14:07:57 +0200140 tokenend: -1,
Akron8ef408b2021-08-02 22:11:04 +0200141 }
142
Akron740f3d72021-08-03 12:12:34 +0200143 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200144
145 mode := 0
146 var elem []string
147 var elemint [5]int
148
Akron03c92fe2021-08-09 14:07:57 +0200149 // Iterate over all lines of the file.
150 // This is mainly based on foma2js,
151 // licensed under the Apache License, version 2,
152 // and written by Mans Hulden.
Akron8ef408b2021-08-02 22:11:04 +0200153 for {
154 line, err := r.ReadString('\n')
155 if err != nil {
156 if err == io.EOF {
157 break
158 }
Akron527c10c2021-08-13 01:45:18 +0200159 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200160 return nil
Akron8ef408b2021-08-02 22:11:04 +0200161 }
Akron8ef408b2021-08-02 22:11:04 +0200162
Akron439f4ec2021-08-09 15:45:38 +0200163 // Read parser mode for the following lines
164 if strings.HasPrefix(line, "##") {
165 if strings.HasPrefix(line, "##props##") {
166 mode = PROPS
167
168 } else if strings.HasPrefix(line, "##states##") {
169 mode = STATES
170
171 // Adds a final transition symbol to sigma
172 // written as '#' in Mizobuchi et al (2000)
173 tok.sigmaCount++
174 tok.final = tok.sigmaCount
175
176 } else if strings.HasPrefix(line, "##sigma##") {
177
178 mode = SIGMA
179
180 } else if strings.HasPrefix(line, "##end##") {
181
182 mode = NONE
183
184 } else if !strings.HasPrefix(line, "##foma-net") {
Akron527c10c2021-08-13 01:45:18 +0200185 log.Print("Unknown input line")
Akron439f4ec2021-08-09 15:45:38 +0200186 break
187 }
Akron8ef408b2021-08-02 22:11:04 +0200188 continue
189 }
190
Akron439f4ec2021-08-09 15:45:38 +0200191 // Based on the current parser mode, interpret the lines
Akron8ef408b2021-08-02 22:11:04 +0200192 switch mode {
193 case PROPS:
194 {
195 elem = strings.Split(line, " ")
196 /*
197 fmt.Println("arity: " + elem[0])
198 fmt.Println("arccount: " + elem[1])
199 fmt.Println("statecount: " + elem[2])
200 fmt.Println("linecount: " + elem[3])
201 fmt.Println("finalcount: " + elem[4])
202 fmt.Println("pathcount: " + elem[5])
203 fmt.Println("is_deterministic: " + elem[6])
204 fmt.Println("is_pruned: " + elem[7])
205 fmt.Println("is_minimized: " + elem[8])
206 fmt.Println("is_epsilon_free: " + elem[9])
207 fmt.Println("is_loop_free: " + elem[10])
208 fmt.Println("extras: " + elem[11])
209 fmt.Println("name: " + elem[12])
210 */
211 if elem[6] != "1" {
Akron527c10c2021-08-13 01:45:18 +0200212 log.Print("The FST needs to be deterministic")
Akron4db3ecf2021-08-11 18:49:03 +0200213 return nil
Akron8ef408b2021-08-02 22:11:04 +0200214 }
Akron439f4ec2021-08-09 15:45:38 +0200215
Akron8ef408b2021-08-02 22:11:04 +0200216 if elem[9] != "1" {
Akron527c10c2021-08-13 01:45:18 +0200217 log.Print("The FST needs to be epsilon free")
Akron4db3ecf2021-08-11 18:49:03 +0200218 return nil
Akron8ef408b2021-08-02 22:11:04 +0200219 }
220
221 elemint[0], err = strconv.Atoi(elem[1])
222 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200223 log.Print("Can't read arccount")
Akron4db3ecf2021-08-11 18:49:03 +0200224 return nil
Akron8ef408b2021-08-02 22:11:04 +0200225 }
Akron740f3d72021-08-03 12:12:34 +0200226 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200227
Akron8ef408b2021-08-02 22:11:04 +0200228 elemint[0], err = strconv.Atoi(elem[2])
229 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200230 log.Print("Can't read statecount")
Akron4db3ecf2021-08-11 18:49:03 +0200231 return nil
Akron8ef408b2021-08-02 22:11:04 +0200232 }
Akron439f4ec2021-08-09 15:45:38 +0200233
234 // States start at 1 in Mizobuchi et al (2000),
235 // as the state 0 is associated with a fail.
236 // Initialize states and transitions
Akron8ef408b2021-08-02 22:11:04 +0200237 tok.transitions = make([]map[int]*edge, elemint[0]+1)
238 continue
239 }
240 case STATES:
241 {
242 elem = strings.Split(line[0:len(line)-1], " ")
243 if elem[0] == "-1" {
Akron3610f102021-08-08 14:13:25 +0200244 if DEBUG {
245 fmt.Println("Skip", elem)
246 }
Akron8ef408b2021-08-02 22:11:04 +0200247 continue
248 }
249 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200250 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200251 fmt.Println("Unable to translate", elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200252 break
253 }
Akron8ef408b2021-08-02 22:11:04 +0200254
255 if len(elem) > 1 {
256 elemint[1], err = strconv.Atoi(elem[1])
257 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200258 fmt.Println("Unable to translate", elem[1])
Akron8ef408b2021-08-02 22:11:04 +0200259 break
260 }
261 if len(elem) > 2 {
262 elemint[2], err = strconv.Atoi(elem[2])
263 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200264 fmt.Println("Unable to translate", elem[2])
Akron8ef408b2021-08-02 22:11:04 +0200265 break
266 }
267 if len(elem) > 3 {
268 elemint[3], err = strconv.Atoi(elem[3])
269 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200270 fmt.Println("Unable to translate", elem[3])
Akron8ef408b2021-08-02 22:11:04 +0200271 break
272 }
273 if len(elem) > 4 {
274 elemint[4], err = strconv.Atoi(elem[4])
275 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200276 fmt.Println("Unable to translate", elem[4])
Akron8ef408b2021-08-02 22:11:04 +0200277 break
278 }
279 }
280 }
281 }
282 }
283
284 switch len(elem) {
285 case 5:
286 {
Akron740f3d72021-08-03 12:12:34 +0200287 state = elemint[0]
288 inSym = elemint[1]
289 outSym = elemint[2]
290 end = elemint[3]
291 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200292 }
293 case 4:
294 {
295 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200296 state = elemint[0]
297 final = elemint[3]
Akron0630be52021-08-28 09:06:16 +0200298
299 // Final state that has no outgoing edges
300 if final == 1 {
301
302 // Initialize outgoing states
303 if tok.transitions[state+1] == nil {
304 tok.transitions[state+1] = make(map[int]*edge)
305 }
306
307 // TODO:
308 // Maybe this is less relevant for tokenizers
309 tok.transitions[state+1][tok.final] = &edge{}
310 }
311 continue
Akron8ef408b2021-08-02 22:11:04 +0200312 } else {
Akron740f3d72021-08-03 12:12:34 +0200313 state = elemint[0]
314 inSym = elemint[1]
315 end = elemint[2]
316 final = elemint[3]
317 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200318 }
319 }
320 case 3:
321 {
Akron740f3d72021-08-03 12:12:34 +0200322 inSym = elemint[0]
323 outSym = elemint[1]
324 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200325 }
326 case 2:
327 {
Akron740f3d72021-08-03 12:12:34 +0200328 inSym = elemint[0]
329 end = elemint[1]
330 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200331 }
332 }
333
Akron83e75a22021-08-04 13:14:06 +0200334 nontoken := false
335 tokenend := false
336
Akron439f4ec2021-08-09 15:45:38 +0200337 // While the states in foma start with 0, the states in the
338 // Mizobuchi FSA start with one - so we increase every state by 1.
339 // We also increase sigma by 1, so there are no 0 transitions.
Akron524c5432021-08-05 14:14:27 +0200340 inSym++
341 outSym++
342
Akron439f4ec2021-08-09 15:45:38 +0200343 // Only a limited list of transitions are allowed
Akron740f3d72021-08-03 12:12:34 +0200344 if inSym != outSym {
Akron01912fc2021-08-12 11:41:58 +0200345 if outSym == tok.tokenend && inSym == tok.epsilon {
Akron83e75a22021-08-04 13:14:06 +0200346 tokenend = true
347 } else if outSym == tok.epsilon {
348 nontoken = true
349 } else {
Akron527c10c2021-08-13 01:45:18 +0200350 log.Println(
Akron740f3d72021-08-03 12:12:34 +0200351 "Unsupported transition: " +
352 strconv.Itoa(state) +
353 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200354 " (" +
Akron740f3d72021-08-03 12:12:34 +0200355 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200356 ":" +
Akron740f3d72021-08-03 12:12:34 +0200357 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200358 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200359 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200360 ":" +
Akron740f3d72021-08-03 12:12:34 +0200361 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200362 ")")
Akron4db3ecf2021-08-11 18:49:03 +0200363 return nil
Akron75ebe7f2021-08-03 10:34:10 +0200364 }
Akron92704eb2021-08-27 10:59:46 +0200365 } else if inSym == tok.tokenend {
366 // Ignore tokenend accepting arcs
367 continue
Akron83e75a22021-08-04 13:14:06 +0200368 } else if inSym == tok.epsilon {
Akron527c10c2021-08-13 01:45:18 +0200369 log.Println("General epsilon transitions are not supported")
Akron4db3ecf2021-08-11 18:49:03 +0200370 return nil
Akron31f3c062021-08-27 10:15:13 +0200371 } else if tok.sigmaMCS[inSym] != "" {
372 log.Fatalln("Non supported character", tok.sigmaMCS[inSym])
373 } else if tok.sigmaMCS[outSym] != "" {
374 log.Fatalln("Non supported character", tok.sigmaMCS[outSym])
Akron8ef408b2021-08-02 22:11:04 +0200375 }
376
Akron03c92fe2021-08-09 14:07:57 +0200377 // Create an edge based on the collected information
Akron8ef408b2021-08-02 22:11:04 +0200378 targetObj := &edge{
Akron83e75a22021-08-04 13:14:06 +0200379 inSym: inSym,
380 outSym: outSym,
381 end: end + 1,
382 tokenend: tokenend,
383 nontoken: nontoken,
Akron8ef408b2021-08-02 22:11:04 +0200384 }
385
Akron740f3d72021-08-03 12:12:34 +0200386 // Initialize outgoing states
387 if tok.transitions[state+1] == nil {
388 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200389 }
390
Akron740f3d72021-08-03 12:12:34 +0200391 // Ignore transitions with invalid symbols
392 if inSym >= 0 {
393 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200394 }
Akron8ef408b2021-08-02 22:11:04 +0200395
Akron740f3d72021-08-03 12:12:34 +0200396 // Add final transition
397 if final == 1 {
Akron03c92fe2021-08-09 14:07:57 +0200398 // TODO:
399 // Maybe this is less relevant for tokenizers
Akronc17f1ca2021-08-03 19:47:27 +0200400 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200401 }
402
Akronb4bbb472021-08-09 11:49:38 +0200403 if DEBUG {
Akron740f3d72021-08-03 12:12:34 +0200404 fmt.Println("Add",
405 state+1, "->", end+1,
406 "(",
407 inSym,
408 ":",
409 outSym,
410 ") (",
411 string(tok.sigmaRev[inSym]),
412 ":",
413 string(tok.sigmaRev[outSym]),
Akron524c5432021-08-05 14:14:27 +0200414 ")",
415 ";",
416 "TE:", tokenend,
Akron3610f102021-08-08 14:13:25 +0200417 "NT:", nontoken,
418 "FIN:", final)
Akron740f3d72021-08-03 12:12:34 +0200419 }
Akron75ebe7f2021-08-03 10:34:10 +0200420
Akron8ef408b2021-08-02 22:11:04 +0200421 continue
422 }
423 case SIGMA:
424 {
425 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
426
427 // Turn string into sigma id
428 number, err := strconv.Atoi(elem[0])
429
Akron524c5432021-08-05 14:14:27 +0200430 // ID needs to be > 1
431 number++
432
Akron8ef408b2021-08-02 22:11:04 +0200433 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200434 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200435 return nil
Akron8ef408b2021-08-02 22:11:04 +0200436 }
437
Akron740f3d72021-08-03 12:12:34 +0200438 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200439
440 var symbol rune
441
442 // Read rune
443 if utf8.RuneCountInString(elem[1]) == 1 {
444 symbol = []rune(elem[1])[0]
445
Akron8ef408b2021-08-02 22:11:04 +0200446 } else if utf8.RuneCountInString(elem[1]) > 1 {
Akron439f4ec2021-08-09 15:45:38 +0200447
448 // Probably a MCS
Akron8ef408b2021-08-02 22:11:04 +0200449 switch elem[1] {
450 case "@_EPSILON_SYMBOL_@":
451 {
Akronc17f1ca2021-08-03 19:47:27 +0200452 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200453 }
454 case "@_UNKNOWN_SYMBOL_@":
455 {
Akronc17f1ca2021-08-03 19:47:27 +0200456 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200457 }
458
459 case "@_IDENTITY_SYMBOL_@":
460 {
Akronc17f1ca2021-08-03 19:47:27 +0200461 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200462 }
Akron03c92fe2021-08-09 14:07:57 +0200463
464 case "@_TOKEN_SYMBOL_@":
465 {
466 tok.tokenend = number
Akron03c92fe2021-08-09 14:07:57 +0200467 }
Akron8ef408b2021-08-02 22:11:04 +0200468 default:
Akron740f3d72021-08-03 12:12:34 +0200469 {
Akron31f3c062021-08-27 10:15:13 +0200470 // MCS not supported
471 tok.sigmaMCS[number] = line
Akron740f3d72021-08-03 12:12:34 +0200472 }
Akron8ef408b2021-08-02 22:11:04 +0200473 }
Akron439f4ec2021-08-09 15:45:38 +0200474 continue
Akron8ef408b2021-08-02 22:11:04 +0200475
Akron740f3d72021-08-03 12:12:34 +0200476 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200477 line, err = r.ReadString('\n')
478 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200479 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200480 return nil
Akron8ef408b2021-08-02 22:11:04 +0200481 }
482 if len(line) != 1 {
Akron31f3c062021-08-27 10:15:13 +0200483 // MCS not supported
484 tok.sigmaMCS[number] = line
485 continue
Akron8ef408b2021-08-02 22:11:04 +0200486 }
Akron03c92fe2021-08-09 14:07:57 +0200487 symbol = rune('\n')
Akron8ef408b2021-08-02 22:11:04 +0200488 }
489
Akron740f3d72021-08-03 12:12:34 +0200490 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200491 }
492 }
493 }
Akron31f3c062021-08-27 10:15:13 +0200494 tok.sigmaMCS = nil
Akron8ef408b2021-08-02 22:11:04 +0200495 return tok
496}
497
Akron64ffd9a2021-08-03 19:55:21 +0200498// Set alphabet A to the list of all symbols
499// outgoing from s
Akron439f4ec2021-08-09 15:45:38 +0200500func (tok *Tokenizer) getSet(s int, A *[]int) {
Akron64ffd9a2021-08-03 19:55:21 +0200501 for a := range tok.transitions[s] {
502 *A = append(*A, a)
503 }
504
505 // Not required, but simplifies bug hunting
Akron439f4ec2021-08-09 15:45:38 +0200506 // sort.Ints(*A)
Akron64ffd9a2021-08-03 19:55:21 +0200507}
508
Akron439f4ec2021-08-09 15:45:38 +0200509// ToDoubleArray turns the intermediate tokenizer representation
510// into a double array representation.
511//
512// This is based on Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200513func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
514
515 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +0200516 sigma: make(map[rune]int),
Akron4fa28b32021-08-27 10:55:41 +0200517 transCount: -1,
Akron03a3c612021-08-04 11:51:27 +0200518 final: tok.final,
519 unknown: tok.unknown,
520 identity: tok.identity,
521 epsilon: tok.epsilon,
Akron03c92fe2021-08-09 14:07:57 +0200522 tokenend: tok.tokenend,
Akronf2120ca2021-08-03 16:26:41 +0200523 }
524
Akronf1a16502021-08-16 15:24:38 +0200525 dat.resize(dat.final)
526
Akronf2120ca2021-08-03 16:26:41 +0200527 for num, sym := range tok.sigmaRev {
Akronea46e8a2021-08-17 00:36:31 +0200528 if int(sym) < 256 {
529 dat.sigmaASCII[int(sym)] = num
530 }
Akronf2120ca2021-08-03 16:26:41 +0200531 dat.sigma[sym] = num
532 }
Akron8ef408b2021-08-02 22:11:04 +0200533
534 mark := 0
535 size := 0
Akron6f1c16c2021-08-17 10:45:42 +0200536 var base uint32
Akronde18e902021-08-27 09:34:12 +0200537 var atrans *edge
538 var s, s1 int
539 var t, t1 uint32
Akron8ef408b2021-08-02 22:11:04 +0200540
Akron439f4ec2021-08-09 15:45:38 +0200541 // Create a mapping from s (in Ms aka Intermediate FSA)
542 // to t (in Mt aka Double Array FSA)
Akron740f3d72021-08-03 12:12:34 +0200543 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200544
Akron439f4ec2021-08-09 15:45:38 +0200545 // Initialize with the start state
Akron8ef408b2021-08-02 22:11:04 +0200546 table[size] = &mapping{source: 1, target: 1}
547 size++
548
Akron740f3d72021-08-03 12:12:34 +0200549 // Allocate space for the outgoing symbol range
550 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200551
Akronde18e902021-08-27 09:34:12 +0200552 // TODO:
553 // Table lookup for the moment
554 // only gives a minor performance benefit.
555 // should be rewritten and should preplace the
556 // table all together.
557 // tableLookup := make(map[int]uint32)
558 // tableLookup[1] = 1
559
Akron8ef408b2021-08-02 22:11:04 +0200560 for mark < size {
Akronde18e902021-08-27 09:34:12 +0200561 s = table[mark].source // This is a state in Ms
562 t = table[mark].target // This is a state in Mt
Akron8ef408b2021-08-02 22:11:04 +0200563 mark++
Akron740f3d72021-08-03 12:12:34 +0200564
565 // Following the paper, here the state t can be remembered
566 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200567 A = A[:0]
Akron439f4ec2021-08-09 15:45:38 +0200568 tok.getSet(s, &A)
Akron8ef408b2021-08-02 22:11:04 +0200569
Akron740f3d72021-08-03 12:12:34 +0200570 // Set base to the first free slot in the double array
Akron6f1c16c2021-08-17 10:45:42 +0200571 base = dat.xCheck(A)
572 dat.array[t].setBase(base)
Akron8ef408b2021-08-02 22:11:04 +0200573
Akron773b1ef2021-08-03 17:37:20 +0200574 // TODO:
Akron068874c2021-08-04 15:19:56 +0200575 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200576 // outdegree of .end
577
Akron740f3d72021-08-03 12:12:34 +0200578 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200579 for _, a := range A {
580
Akronc17f1ca2021-08-03 19:47:27 +0200581 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200582
Akronde18e902021-08-27 09:34:12 +0200583 atrans = tok.transitions[s][a]
584
Akron740f3d72021-08-03 12:12:34 +0200585 // Aka g(s, a)
Akronde18e902021-08-27 09:34:12 +0200586 s1 = atrans.end
Akron8ef408b2021-08-02 22:11:04 +0200587
Akron740f3d72021-08-03 12:12:34 +0200588 // Store the transition
Akronde18e902021-08-27 09:34:12 +0200589 t1 = base + uint32(a)
Akronf1a16502021-08-16 15:24:38 +0200590 dat.array[t1].setCheck(t)
591
592 // Set maxSize
593 if dat.maxSize < int(t1) {
594 dat.maxSize = int(t1)
595 }
Akron8ef408b2021-08-02 22:11:04 +0200596
Akron439f4ec2021-08-09 15:45:38 +0200597 if DEBUG {
Akron524c5432021-08-05 14:14:27 +0200598 fmt.Println("Translate transition",
599 s, "->", s1, "(", a, ")", "to", t, "->", t1)
600 }
601
Akron83e75a22021-08-04 13:14:06 +0200602 // Mark the state as being the target of a nontoken transition
Akronde18e902021-08-27 09:34:12 +0200603 if atrans.nontoken {
Akronf1a16502021-08-16 15:24:38 +0200604 dat.array[t1].setNonToken(true)
Akron524c5432021-08-05 14:14:27 +0200605 if DEBUG {
606 fmt.Println("Set", t1, "to nontoken")
607 }
Akron83e75a22021-08-04 13:14:06 +0200608 }
609
Akron84d68e62021-08-04 17:06:52 +0200610 // Mark the state as being the target of a tokenend transition
Akronde18e902021-08-27 09:34:12 +0200611 if atrans.tokenend {
Akronf1a16502021-08-16 15:24:38 +0200612 dat.array[t1].setTokenEnd(true)
Akron524c5432021-08-05 14:14:27 +0200613 if DEBUG {
614 fmt.Println("Set", t1, "to tokenend")
615 }
Akron84d68e62021-08-04 17:06:52 +0200616 }
617
Akron740f3d72021-08-03 12:12:34 +0200618 // Check for representative states
Akron439f4ec2021-08-09 15:45:38 +0200619 r := stateAlreadyInTable(s1, table, size)
Akronde18e902021-08-27 09:34:12 +0200620 // r := tableLookup[s1]
Akron740f3d72021-08-03 12:12:34 +0200621
Akron439f4ec2021-08-09 15:45:38 +0200622 // No representative found
Akron8ef408b2021-08-02 22:11:04 +0200623 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200624 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200625 table[size] = &mapping{source: s1, target: t1}
Akronde18e902021-08-27 09:34:12 +0200626 // tableLookup[s1] = t1
Akron8ef408b2021-08-02 22:11:04 +0200627 size++
628 } else {
Akron740f3d72021-08-03 12:12:34 +0200629 // Overwrite with the representative state
Akronf1a16502021-08-16 15:24:38 +0200630 dat.array[t1].setBase(r)
631 dat.array[t1].setSeparate(true)
Akron8ef408b2021-08-02 22:11:04 +0200632 }
633 } else {
Akron740f3d72021-08-03 12:12:34 +0200634 // Store a final transition
Akron6f1c16c2021-08-17 10:45:42 +0200635 dat.array[base+uint32(dat.final)].setCheck(t)
Akronf1a16502021-08-16 15:24:38 +0200636
Akronde18e902021-08-27 09:34:12 +0200637 if dat.maxSize < int(base)+dat.final {
638 dat.maxSize = int(base) + dat.final
Akronf1a16502021-08-16 15:24:38 +0200639 }
Akron8ef408b2021-08-02 22:11:04 +0200640 }
641 }
642 }
643
644 // Following Mizobuchi et al (2000) the size of the
645 // FSA should be stored in check(1).
Akronf1a16502021-08-16 15:24:38 +0200646 // We make the size a bit smaller so we never have to check for boundaries.
Akron3fdfec62021-08-04 11:40:10 +0200647 dat.setSize(dat.maxSize + 1)
Akronf2120ca2021-08-03 16:26:41 +0200648 dat.array = dat.array[:dat.maxSize+1]
649 return dat
Akron8ef408b2021-08-02 22:11:04 +0200650}
651
Akron8ef408b2021-08-02 22:11:04 +0200652// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200653// exists and return this as a representative.
654// Currently iterates through the whole table
655// in a bruteforce manner.
Akron439f4ec2021-08-09 15:45:38 +0200656func stateAlreadyInTable(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200657 for x := 0; x < size; x++ {
658 if table[x].source == s {
659 return table[x].target
660 }
661 }
662 return 0
663}
664
Akron64ffd9a2021-08-03 19:55:21 +0200665// Resize double array when necessary
666func (dat *DaTokenizer) resize(l int) {
667 // TODO:
668 // This is a bit too aggressive atm and should be calmed down.
669 if len(dat.array) <= l {
Akronf1a16502021-08-16 15:24:38 +0200670 dat.array = append(dat.array, make([]bc, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200671 }
Akron64ffd9a2021-08-03 19:55:21 +0200672}
Akronc9d84a62021-08-03 15:56:03 +0200673
Akron64ffd9a2021-08-03 19:55:21 +0200674// Set base value in double array
Akronf1a16502021-08-16 15:24:38 +0200675func (bc *bc) setBase(v uint32) {
676 bc.base = v
Akron439f4ec2021-08-09 15:45:38 +0200677}
678
679// Get base value in double array
Akronf1a16502021-08-16 15:24:38 +0200680func (bc *bc) getBase() uint32 {
681 return bc.base & RESTBIT
Akron439f4ec2021-08-09 15:45:38 +0200682}
683
684// Set check value in double array
Akronf1a16502021-08-16 15:24:38 +0200685func (bc *bc) setCheck(v uint32) {
686 bc.check = v
Akron439f4ec2021-08-09 15:45:38 +0200687}
688
689// Get check value in double array
Akronf1a16502021-08-16 15:24:38 +0200690func (bc *bc) getCheck() uint32 {
691 return bc.check & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200692}
693
Akron3fdfec62021-08-04 11:40:10 +0200694// Returns true if a state is separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200695func (bc *bc) isSeparate() bool {
696 return bc.base&FIRSTBIT != 0
Akron3fdfec62021-08-04 11:40:10 +0200697}
698
699// Mark a state as separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200700func (bc *bc) setSeparate(sep bool) {
Akron3fdfec62021-08-04 11:40:10 +0200701 if sep {
Akronf1a16502021-08-16 15:24:38 +0200702 bc.base |= FIRSTBIT
Akron3fdfec62021-08-04 11:40:10 +0200703 } else {
Akronf1a16502021-08-16 15:24:38 +0200704 bc.base &= (RESTBIT | SECONDBIT)
Akron3fdfec62021-08-04 11:40:10 +0200705 }
706}
707
Akron83e75a22021-08-04 13:14:06 +0200708// Returns true if a state is the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200709func (bc *bc) isNonToken() bool {
710 return bc.check&FIRSTBIT != 0
Akron83e75a22021-08-04 13:14:06 +0200711}
712
713// Mark a state as being the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200714func (bc *bc) setNonToken(sep bool) {
Akron83e75a22021-08-04 13:14:06 +0200715 if sep {
Akronf1a16502021-08-16 15:24:38 +0200716 bc.check |= FIRSTBIT
Akron83e75a22021-08-04 13:14:06 +0200717 } else {
Akronf1a16502021-08-16 15:24:38 +0200718 bc.check &= (RESTBIT | SECONDBIT)
Akron83e75a22021-08-04 13:14:06 +0200719 }
720}
721
Akron84d68e62021-08-04 17:06:52 +0200722// Returns true if a state is the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200723func (bc *bc) isTokenEnd() bool {
724 return bc.check&SECONDBIT != 0
Akron84d68e62021-08-04 17:06:52 +0200725}
726
727// Mark a state as being the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200728func (bc *bc) setTokenEnd(sep bool) {
Akron84d68e62021-08-04 17:06:52 +0200729 if sep {
Akronf1a16502021-08-16 15:24:38 +0200730 bc.check |= SECONDBIT
Akron84d68e62021-08-04 17:06:52 +0200731 } else {
Akronf1a16502021-08-16 15:24:38 +0200732 bc.check &= (RESTBIT | FIRSTBIT)
Akron84d68e62021-08-04 17:06:52 +0200733 }
734}
735
Akron64ffd9a2021-08-03 19:55:21 +0200736// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200737func (dat *DaTokenizer) setSize(v int) {
Akronf1a16502021-08-16 15:24:38 +0200738 dat.array[1].setCheck(uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200739}
740
741// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200742func (dat *DaTokenizer) GetSize() int {
Akronf1a16502021-08-16 15:24:38 +0200743 return int(dat.array[1].getCheck())
Akron8ef408b2021-08-02 22:11:04 +0200744}
745
746// Based on Mizobuchi et al (2000), p. 124
747// This iterates for every state through the complete double array
748// structure until it finds a gap that fits all outgoing transitions
749// of the state. This is extremely slow, but is only necessary in the
750// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200751func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200752
753 // Start at the first entry of the double array list
Akron6f1c16c2021-08-17 10:45:42 +0200754 base := uint32(1)
755
Akron8ef408b2021-08-02 22:11:04 +0200756OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200757 // Resize the array if necessary
Akronf1a16502021-08-16 15:24:38 +0200758 dat.resize(int(base) + dat.final)
Akron8ef408b2021-08-02 22:11:04 +0200759 for _, a := range symbols {
Akronf1a16502021-08-16 15:24:38 +0200760 if dat.array[int(base)+a].getCheck() != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200761 base++
762 goto OVERLAP
763 }
764 }
Akron8ef408b2021-08-02 22:11:04 +0200765 return base
766}
767
Akron3610f102021-08-08 14:13:25 +0200768// List all outgoing transitions for a state
769// for testing purposes
770func (dat *DaTokenizer) outgoing(t uint32) []int {
771
772 valid := make([]int, 0, len(dat.sigma))
773
774 for _, a := range dat.sigma {
Akronf1a16502021-08-16 15:24:38 +0200775 t1 := dat.array[t].getBase() + uint32(a)
776 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200777 valid = append(valid, a)
778 }
779 }
780
781 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
Akronf1a16502021-08-16 15:24:38 +0200782 t1 := dat.array[t].getBase() + uint32(a)
783 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200784 valid = append(valid, -1*a)
785 }
786 }
787
788 sort.Ints(valid)
789
790 return valid
791}
792
Akron4fa28b32021-08-27 10:55:41 +0200793// TransCount as the number of transitions aka arcs in the
794// finite state automaton
795func (dat *DaTokenizer) TransCount() int {
796 // Cache the transCount
797 if dat.transCount > 0 {
798 return dat.transCount
799 }
800
801 dat.transCount = 0
802 for x := 1; x < len(dat.array); x++ {
803 if dat.array[x].getBase() != 0 {
804 dat.transCount++
805 }
806 }
807
808 return dat.transCount
809}
810
Akron03a3c612021-08-04 11:51:27 +0200811// LoadFactor as defined in Kanda et al (2018),
812// i.e. the proportion of non-empty elements to all elements.
813func (dat *DaTokenizer) LoadFactor() float64 {
Akron4fa28b32021-08-27 10:55:41 +0200814 return float64(dat.TransCount()) / float64(len(dat.array)) * 100
Akrond66a9262021-08-03 17:09:09 +0200815}
816
Akron439f4ec2021-08-09 15:45:38 +0200817// Save stores the double array data in a file
Akron3a063ef2021-08-05 19:36:35 +0200818func (dat *DaTokenizer) Save(file string) (n int64, err error) {
819 f, err := os.Create(file)
820 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200821 log.Println(err)
Akron439f4ec2021-08-09 15:45:38 +0200822 return 0, err
Akron3a063ef2021-08-05 19:36:35 +0200823 }
824 defer f.Close()
825 gz := gzip.NewWriter(f)
826 defer gz.Close()
827 n, err = dat.WriteTo(gz)
828 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200829 log.Println(err)
Akron3a063ef2021-08-05 19:36:35 +0200830 return n, err
831 }
832 gz.Flush()
833 return n, nil
834}
835
836// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200837func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
838
Akron3a063ef2021-08-05 19:36:35 +0200839 wb := bufio.NewWriter(w)
840 defer wb.Flush()
841
Akron6247a5d2021-08-03 19:18:28 +0200842 // Store magical header
Akron3a063ef2021-08-05 19:36:35 +0200843 all, err := wb.Write([]byte(MAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200844 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200845 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200846 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200847 }
848
849 // Get sigma as a list
850 sigmalist := make([]rune, len(dat.sigma)+16)
851 max := 0
852 for sym, num := range dat.sigma {
853 sigmalist[num] = sym
854 if num > max {
855 max = num
856 }
857 }
858
859 sigmalist = sigmalist[:max+1]
860
Akron3f8571a2021-08-05 11:18:10 +0200861 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200862 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200863 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
864 bo.PutUint16(buf[4:6], uint16(dat.unknown))
865 bo.PutUint16(buf[6:8], uint16(dat.identity))
866 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200867 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akronf1a16502021-08-16 15:24:38 +0200868 bo.PutUint32(buf[12:16], uint32(len(dat.array)*2)) // Legacy support
Akron3a063ef2021-08-05 19:36:35 +0200869 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200870 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200871 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200872 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200873 }
874
875 all += more
876
Akron6247a5d2021-08-03 19:18:28 +0200877 // Write sigma
878 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200879
Akron3a063ef2021-08-05 19:36:35 +0200880 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200881 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200882 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200883 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200884 }
885 all += more
886 }
Akron439f4ec2021-08-09 15:45:38 +0200887
Akron6247a5d2021-08-03 19:18:28 +0200888 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200889 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200890 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200891 }
Akron6247a5d2021-08-03 19:18:28 +0200892
893 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200894 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200895 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200896 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200897 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200898 }
899 all += more
900
Akronf1a16502021-08-16 15:24:38 +0200901 // for x := 0; x < len(dat.array); x++ {
902 for _, bc := range dat.array {
903 bo.PutUint32(buf[0:4], bc.base)
904 more, err = wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200905 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200906 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200907 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200908 }
Akron439f4ec2021-08-09 15:45:38 +0200909 all += more
Akron3a063ef2021-08-05 19:36:35 +0200910 if more != 4 {
Akronf1a16502021-08-16 15:24:38 +0200911 log.Println("Can not write base uint32")
912 return int64(all), err
913 }
914 bo.PutUint32(buf[0:4], bc.check)
915 more, err = wb.Write(buf[0:4])
916 if err != nil {
917 log.Println(err)
918 return int64(all), err
919 }
920 all += more
921 if more != 4 {
922 log.Println("Can not write check uint32")
Akron3a063ef2021-08-05 19:36:35 +0200923 return int64(all), err
924 }
Akron6247a5d2021-08-03 19:18:28 +0200925 }
926
927 return int64(all), err
928}
929
Akron439f4ec2021-08-09 15:45:38 +0200930// LoadDatokFile reads a double array represented tokenizer
931// from a file.
Akron3f8571a2021-08-05 11:18:10 +0200932func LoadDatokFile(file string) *DaTokenizer {
933 f, err := os.Open(file)
934 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200935 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200936 return nil
Akron3f8571a2021-08-05 11:18:10 +0200937 }
938 defer f.Close()
939
940 gz, err := gzip.NewReader(f)
941 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200942 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200943 return nil
Akron3f8571a2021-08-05 11:18:10 +0200944 }
945 defer gz.Close()
946
Akron3a063ef2021-08-05 19:36:35 +0200947 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +0200948 return ParseDatok(gz)
949}
950
Akron439f4ec2021-08-09 15:45:38 +0200951// LoadDatokFile reads a double array represented tokenizer
952// from an io.Reader
Akron3f8571a2021-08-05 11:18:10 +0200953func ParseDatok(ior io.Reader) *DaTokenizer {
954
Akron439f4ec2021-08-09 15:45:38 +0200955 // Initialize tokenizer with default values
Akron3f8571a2021-08-05 11:18:10 +0200956 dat := &DaTokenizer{
957 sigma: make(map[rune]int),
958 epsilon: 0,
959 unknown: 0,
960 identity: 0,
961 final: 0,
Akron4fa28b32021-08-27 10:55:41 +0200962 transCount: 0,
Akron3f8571a2021-08-05 11:18:10 +0200963 }
964
965 r := bufio.NewReader(ior)
966
Akron3f8571a2021-08-05 11:18:10 +0200967 buf := make([]byte, 1024)
968 buf = buf[0:len(MAGIC)]
969
Akron439f4ec2021-08-09 15:45:38 +0200970 _, err := r.Read(buf)
Akron3f8571a2021-08-05 11:18:10 +0200971
972 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200973 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200974 return nil
975 }
976
Akron3f8571a2021-08-05 11:18:10 +0200977 if string(MAGIC) != string(buf) {
Akron527c10c2021-08-13 01:45:18 +0200978 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200979 return nil
980 }
981
Akron439f4ec2021-08-09 15:45:38 +0200982 more, err := io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +0200983 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200984 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200985 return nil
986 }
987
Akron439f4ec2021-08-09 15:45:38 +0200988 if more != 16 {
Akron527c10c2021-08-13 01:45:18 +0200989 log.Println("Read bytes do not fit")
Akron439f4ec2021-08-09 15:45:38 +0200990 return nil
991 }
Akron3f8571a2021-08-05 11:18:10 +0200992
Akron3a063ef2021-08-05 19:36:35 +0200993 version := bo.Uint16(buf[0:2])
994
995 if version != VERSION {
Akron527c10c2021-08-13 01:45:18 +0200996 log.Println("Version not compatible")
Akron3a063ef2021-08-05 19:36:35 +0200997 return nil
998 }
999
Akron3f8571a2021-08-05 11:18:10 +02001000 dat.epsilon = int(bo.Uint16(buf[2:4]))
1001 dat.unknown = int(bo.Uint16(buf[4:6]))
1002 dat.identity = int(bo.Uint16(buf[6:8]))
1003 dat.final = int(bo.Uint16(buf[8:10]))
1004
1005 sigmaCount := int(bo.Uint16(buf[10:12]))
Akronf1a16502021-08-16 15:24:38 +02001006 arraySize := int(bo.Uint32(buf[12:16])) / 2 // Legacy support
Akron3f8571a2021-08-05 11:18:10 +02001007
Akron3a063ef2021-08-05 19:36:35 +02001008 // Shouldn't be relevant though
1009 dat.maxSize = arraySize - 1
1010
Akron3f8571a2021-08-05 11:18:10 +02001011 for x := 0; x < sigmaCount; x++ {
Akron439f4ec2021-08-09 15:45:38 +02001012 sym, _, err := r.ReadRune()
Akron3f8571a2021-08-05 11:18:10 +02001013 if err == nil && sym != 0 {
Akronea46e8a2021-08-17 00:36:31 +02001014 if int(sym) < 256 {
1015 dat.sigmaASCII[int(sym)] = x
1016 }
Akron3f8571a2021-08-05 11:18:10 +02001017 dat.sigma[sym] = x
1018 }
Akron3f8571a2021-08-05 11:18:10 +02001019 }
1020
Akron439f4ec2021-08-09 15:45:38 +02001021 _, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +02001022
1023 if err != nil {
Akron527c10c2021-08-13 01:45:18 +02001024 log.Print(err)
Akron3f8571a2021-08-05 11:18:10 +02001025 return nil
1026 }
1027
Akron3f8571a2021-08-05 11:18:10 +02001028 if string("T") != string(buf[0:1]) {
Akron527c10c2021-08-13 01:45:18 +02001029 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +02001030 return nil
1031 }
1032
1033 // Read based on length
Akronf1a16502021-08-16 15:24:38 +02001034 dat.array = make([]bc, arraySize)
Akron3f8571a2021-08-05 11:18:10 +02001035
Akronbb4aac52021-08-13 00:52:27 +02001036 dataArray, err := io.ReadAll(r)
Akron439f4ec2021-08-09 15:45:38 +02001037
Akronbb4aac52021-08-13 00:52:27 +02001038 if err == io.EOF {
Akron527c10c2021-08-13 01:45:18 +02001039 log.Println(err)
Akronbb4aac52021-08-13 00:52:27 +02001040 return nil
1041 }
1042
Akronf1a16502021-08-16 15:24:38 +02001043 if len(dataArray) < arraySize*8 {
Akron527c10c2021-08-13 01:45:18 +02001044 log.Println("Not enough bytes read")
Akronbb4aac52021-08-13 00:52:27 +02001045 return nil
1046 }
1047
1048 for x := 0; x < arraySize; x++ {
Akronf1a16502021-08-16 15:24:38 +02001049 dat.array[x].base = bo.Uint32(dataArray[x*8 : (x*8)+4])
1050 dat.array[x].check = bo.Uint32(dataArray[(x*8)+4 : (x*8)+8])
Akron3f8571a2021-08-05 11:18:10 +02001051 }
1052
1053 return dat
1054}
1055
Akron439f4ec2021-08-09 15:45:38 +02001056// Show the current state of the buffer,
1057// for testing puroses
Akron3610f102021-08-08 14:13:25 +02001058func showBuffer(buffer []rune, buffo int, buffi int) string {
1059 out := make([]rune, 0, 1024)
1060 for x := 0; x < len(buffer); x++ {
1061 if buffi == x {
1062 out = append(out, '^')
1063 }
1064 if buffo == x {
1065 out = append(out, '[', buffer[x], ']')
1066 } else {
1067 out = append(out, buffer[x])
1068 }
1069 }
1070 return string(out)
1071}
1072
Akron84d68e62021-08-04 17:06:52 +02001073// Transduce an input string against the double array
Akron3610f102021-08-08 14:13:25 +02001074// FSA. The rules are always greedy. If the automaton fails,
1075// it takes the last possible token ending branch.
Akron068874c2021-08-04 15:19:56 +02001076//
Akron4db3ecf2021-08-11 18:49:03 +02001077// Based on Mizobuchi et al (2000), p. 129,
1078// with additional support for IDENTITY, UNKNOWN
1079// and EPSILON transitions and NONTOKEN and TOKENEND handling.
Akron3f8571a2021-08-05 11:18:10 +02001080func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron068874c2021-08-04 15:19:56 +02001081 var a int
Akronb4bbb472021-08-09 11:49:38 +02001082 var t0 uint32
Akronb7e1f132021-08-10 11:52:31 +02001083 t := uint32(1) // Initial state
1084 var ok, rewindBuffer bool
Akron068874c2021-08-04 15:19:56 +02001085
Akron3610f102021-08-08 14:13:25 +02001086 // Remember the last position of a possible tokenend,
1087 // in case the automaton fails.
1088 epsilonState := uint32(0)
1089 epsilonOffset := 0
1090
1091 // Implement a low level buffer for full control,
1092 // however - it is probably better to introduce
1093 // this on a higher level with a io.Reader interface
1094 // The buffer stores a single word and may have white
1095 // space at the end (but not at the beginning).
1096 //
1097 // This is the only backtracking requirement because of
1098 // epsilon transitions, to support tokenizations like:
1099 // "this is an example|.| And it works." vs
1100 // "this is an example.com| application."
Akronb7e1f132021-08-10 11:52:31 +02001101 //
1102 // TODO:
1103 // Store a translation buffer as well, so characters don't
1104 // have to be translated multiple times!
Akron3610f102021-08-08 14:13:25 +02001105 buffer := make([]rune, 1024)
1106 buffo := 0 // Buffer offset
1107 buffi := 0 // Buffer length
1108
Akron3f8571a2021-08-05 11:18:10 +02001109 reader := bufio.NewReader(r)
1110 writer := bufio.NewWriter(w)
1111 defer writer.Flush()
Akron068874c2021-08-04 15:19:56 +02001112
Akron3f8571a2021-08-05 11:18:10 +02001113 var char rune
1114 var err error
1115 eof := false
Akronb7e1f132021-08-10 11:52:31 +02001116 newchar := true
Akron3f8571a2021-08-05 11:18:10 +02001117
Akronc5d8d432021-08-10 16:48:44 +02001118PARSECHAR:
Akron3f8571a2021-08-05 11:18:10 +02001119 for {
1120
Akronb7e1f132021-08-10 11:52:31 +02001121 if newchar {
1122 // Get from reader if buffer is empty
1123 if buffo >= buffi {
Akron1594cb82021-08-11 11:14:56 +02001124 if eof {
1125 break
1126 }
Akronb7e1f132021-08-10 11:52:31 +02001127 char, _, err = reader.ReadRune()
Akron439f4ec2021-08-09 15:45:38 +02001128
Akronb7e1f132021-08-10 11:52:31 +02001129 // No more runes to read
1130 if err != nil {
1131 eof = true
1132 break
1133 }
1134 buffer[buffi] = char
1135 buffi++
Akron3f8571a2021-08-05 11:18:10 +02001136 }
Akronb7e1f132021-08-10 11:52:31 +02001137
1138 char = buffer[buffo]
1139
1140 if DEBUG {
1141 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
1142 }
1143
Akron6f1c16c2021-08-17 10:45:42 +02001144 // TODO:
1145 // Better not repeatedly check for a!
1146 // Possibly keep a buffer with a.
Akronea46e8a2021-08-17 00:36:31 +02001147 if int(char) < 256 {
1148 a = dat.sigmaASCII[int(char)]
1149 } else {
1150 a, ok = dat.sigma[char]
1151 if !ok {
1152 a = 0
1153 }
1154 }
Akronb7e1f132021-08-10 11:52:31 +02001155
1156 // Use identity symbol if character is not in sigma
Akronea46e8a2021-08-17 00:36:31 +02001157 if a == 0 && dat.identity != -1 {
Akronb7e1f132021-08-10 11:52:31 +02001158 a = dat.identity
1159 }
1160
1161 t0 = t
1162
1163 // Check for epsilon transitions and remember
Akronf1a16502021-08-16 15:24:38 +02001164 if dat.array[dat.array[t0].getBase()+uint32(dat.epsilon)].getCheck() == t0 {
Akronb7e1f132021-08-10 11:52:31 +02001165 // Remember state for backtracking to last tokenend state
1166 epsilonState = t0
1167 epsilonOffset = buffo
1168 }
Akron3f8571a2021-08-05 11:18:10 +02001169 }
Akron3610f102021-08-08 14:13:25 +02001170
Akronb7e1f132021-08-10 11:52:31 +02001171 // Checks a transition based on t0, a and buffo
Akronf1a16502021-08-16 15:24:38 +02001172 t = dat.array[t0].getBase() + uint32(a)
1173 ta := dat.array[t]
Akron068874c2021-08-04 15:19:56 +02001174
Akron524c5432021-08-05 14:14:27 +02001175 if DEBUG {
Akronb7e1f132021-08-10 11:52:31 +02001176 // Char is only relevant if set
1177 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
1178 if false {
1179 fmt.Println(dat.outgoing(t0))
1180 }
Akron524c5432021-08-05 14:14:27 +02001181 }
1182
Akronb7e1f132021-08-10 11:52:31 +02001183 // Check if the transition is invalid according to the double array
Akronf1a16502021-08-16 15:24:38 +02001184 if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
Akron068874c2021-08-04 15:19:56 +02001185
1186 if DEBUG {
Akronf1a16502021-08-16 15:24:38 +02001187 fmt.Println("Match is not fine!", t, "and", ta.getCheck(), "vs", t0)
Akron068874c2021-08-04 15:19:56 +02001188 }
1189
1190 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +02001191
Akron068874c2021-08-04 15:19:56 +02001192 // Try again with unknown symbol, in case identity failed
Akronb7e1f132021-08-10 11:52:31 +02001193 // Char is only relevant when set
Akron068874c2021-08-04 15:19:56 +02001194 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001195 fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +02001196 }
1197 a = dat.unknown
1198
1199 } else if a != dat.epsilon {
Akronb4bbb472021-08-09 11:49:38 +02001200
Akron068874c2021-08-04 15:19:56 +02001201 // Try again with epsilon symbol, in case everything else failed
Akronb4bbb472021-08-09 11:49:38 +02001202 t0 = epsilonState
Akron3610f102021-08-08 14:13:25 +02001203 epsilonState = 0 // reset
1204 buffo = epsilonOffset
Akron439f4ec2021-08-09 15:45:38 +02001205 a = dat.epsilon
1206
Akron3610f102021-08-08 14:13:25 +02001207 if DEBUG {
1208 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
1209 }
Akronb4bbb472021-08-09 11:49:38 +02001210
Akron068874c2021-08-04 15:19:56 +02001211 } else {
1212 break
1213 }
Akron068874c2021-08-04 15:19:56 +02001214
Akronb7e1f132021-08-10 11:52:31 +02001215 newchar = false
1216 continue
Akronb4bbb472021-08-09 11:49:38 +02001217 }
1218
Akronb7e1f132021-08-10 11:52:31 +02001219 // Transition was successful
1220 rewindBuffer = false
Akron439f4ec2021-08-09 15:45:38 +02001221
1222 // Transition consumes a character
Akronb4bbb472021-08-09 11:49:38 +02001223 if a != dat.epsilon {
1224
Akron3610f102021-08-08 14:13:25 +02001225 buffo++
Akronb4bbb472021-08-09 11:49:38 +02001226
Akron439f4ec2021-08-09 15:45:38 +02001227 // Transition does not produce a character
Akronf1a16502021-08-16 15:24:38 +02001228 if buffo == 1 && ta.isNonToken() {
Akron3610f102021-08-08 14:13:25 +02001229 if DEBUG {
1230 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
1231 }
Akron439f4ec2021-08-09 15:45:38 +02001232 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001233 }
Akron3f8571a2021-08-05 11:18:10 +02001234 }
Akron068874c2021-08-04 15:19:56 +02001235
Akronc5d8d432021-08-10 16:48:44 +02001236 // Transition marks the end of a token - so flush the buffer
Akronf1a16502021-08-16 15:24:38 +02001237 if ta.isTokenEnd() {
Akron524c5432021-08-05 14:14:27 +02001238
Akronc5d8d432021-08-10 16:48:44 +02001239 if buffi > 0 {
Akronc5d8d432021-08-10 16:48:44 +02001240 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +02001241 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
Akronc5d8d432021-08-10 16:48:44 +02001242 }
Akron01912fc2021-08-12 11:41:58 +02001243 writer.WriteString(string(buffer[:buffo]))
Akronc5d8d432021-08-10 16:48:44 +02001244 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001245 }
Akron1594cb82021-08-11 11:14:56 +02001246 if DEBUG {
1247 fmt.Println("-> Newline")
1248 }
1249 writer.WriteRune('\n')
Akron439f4ec2021-08-09 15:45:38 +02001250 }
Akron3610f102021-08-08 14:13:25 +02001251
Akronc5d8d432021-08-10 16:48:44 +02001252 // Rewind the buffer if necessary
Akron439f4ec2021-08-09 15:45:38 +02001253 if rewindBuffer {
1254
1255 // TODO: Better as a ring buffer
Akron3610f102021-08-08 14:13:25 +02001256 for x, i := range buffer[buffo:buffi] {
1257 buffer[x] = i
1258 }
Akronb4bbb472021-08-09 11:49:38 +02001259
Akron3610f102021-08-08 14:13:25 +02001260 buffi -= buffo
1261 epsilonOffset -= buffo
1262 buffo = 0
1263 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001264 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
Akron3610f102021-08-08 14:13:25 +02001265 }
Akron84d68e62021-08-04 17:06:52 +02001266 }
1267
Akronb7e1f132021-08-10 11:52:31 +02001268 // Move to representative state
Akronf1a16502021-08-16 15:24:38 +02001269 if ta.isSeparate() {
1270 t = ta.getBase()
1271 ta = dat.array[t]
Akronb7e1f132021-08-10 11:52:31 +02001272
1273 if DEBUG {
1274 fmt.Println("Representative pointing to", t)
1275 }
1276 }
1277
Akronc5d8d432021-08-10 16:48:44 +02001278 newchar = true
1279
Akron068874c2021-08-04 15:19:56 +02001280 // TODO:
1281 // Prevent endless epsilon loops!
1282 }
1283
Akron439f4ec2021-08-09 15:45:38 +02001284 // Input reader is not yet finished
Akron3f8571a2021-08-05 11:18:10 +02001285 if !eof {
Akron068874c2021-08-04 15:19:56 +02001286 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001287 fmt.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +02001288 }
1289 return false
1290 }
1291
Akronb7e1f132021-08-10 11:52:31 +02001292 if DEBUG {
1293 fmt.Println("Entering final check")
1294 }
1295
Akronc5d8d432021-08-10 16:48:44 +02001296 // Automaton is in a final state, so flush the buffer and return
Akronf1a16502021-08-16 15:24:38 +02001297 x := dat.array[t].getBase() + uint32(dat.final)
1298
1299 if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
Akronb4bbb472021-08-09 11:49:38 +02001300
1301 if buffi > 0 {
Akronb4bbb472021-08-09 11:49:38 +02001302 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +02001303 fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
Akron3f8571a2021-08-05 11:18:10 +02001304 }
Akron01912fc2021-08-12 11:41:58 +02001305 writer.WriteString(string(buffer[:buffi]))
Akron6e70dc82021-08-11 11:33:18 +02001306
Akronf1a16502021-08-16 15:24:38 +02001307 if dat.array[t].isTokenEnd() {
Akrondf0a3ef2021-08-09 15:53:45 +02001308 writer.WriteRune('\n')
Akronc5d8d432021-08-10 16:48:44 +02001309 if DEBUG {
1310 fmt.Println("-> Newline")
1311 }
Akrondf0a3ef2021-08-09 15:53:45 +02001312 }
Akron84d68e62021-08-04 17:06:52 +02001313 }
1314
Akron6e70dc82021-08-11 11:33:18 +02001315 // Add an additional sentence ending, if the file is over but no explicit
1316 // sentence split was reached. This may be controversial and therefore
1317 // optional via parameter.
Akronf1a16502021-08-16 15:24:38 +02001318 if !dat.array[t0].isTokenEnd() {
Akron6e70dc82021-08-11 11:33:18 +02001319 writer.WriteRune('\n')
1320 if DEBUG {
1321 fmt.Println("-> Newline")
1322 }
1323 }
1324
Akrone61380b2021-08-16 10:10:46 +02001325 // TODO:
1326 // There may be a new line at the end, from an epsilon,
1327 // so we may need to go on!
Akron068874c2021-08-04 15:19:56 +02001328 return true
1329 }
1330
Akronc5d8d432021-08-10 16:48:44 +02001331 // Check epsilon transitions until a final state is reached
1332 t0 = t
Akronf1a16502021-08-16 15:24:38 +02001333 t = dat.array[t0].getBase() + uint32(dat.epsilon)
Akron01912fc2021-08-12 11:41:58 +02001334 a = dat.epsilon
1335 newchar = false
Akronf1a16502021-08-16 15:24:38 +02001336 if dat.array[t].getCheck() == t0 {
Akronc5d8d432021-08-10 16:48:44 +02001337 // Remember state for backtracking to last tokenend state
Akronc5d8d432021-08-10 16:48:44 +02001338 goto PARSECHAR
Akrone61380b2021-08-16 10:10:46 +02001339
Akronc5d8d432021-08-10 16:48:44 +02001340 } else if epsilonState != 0 {
Akronb7e1f132021-08-10 11:52:31 +02001341 t0 = epsilonState
1342 epsilonState = 0 // reset
1343 buffo = epsilonOffset
Akron068874c2021-08-04 15:19:56 +02001344 if DEBUG {
Akronc5d8d432021-08-10 16:48:44 +02001345 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
Akron068874c2021-08-04 15:19:56 +02001346 }
Akronc5d8d432021-08-10 16:48:44 +02001347 goto PARSECHAR
Akron068874c2021-08-04 15:19:56 +02001348 }
Akronc5d8d432021-08-10 16:48:44 +02001349 return false
Akron068874c2021-08-04 15:19:56 +02001350}