blob: 4f1e12d9935c3e8e5596c2e7caeaea6791efebb9 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
Akron03c92fe2021-08-09 14:07:57 +020013//
14// Serialization is little endian.
Akron83e75a22021-08-04 13:14:06 +020015
Akron740f3d72021-08-03 12:12:34 +020016// TODO:
Akrone61380b2021-08-16 10:10:46 +020017// - Turn sigma into an array instead of using a map
18// and improve the mapping beforehand so that ASCII
19// is mapped directly and only non-ASCII needs to be
20// looked up in a map or similar.
Akron740f3d72021-08-03 12:12:34 +020021// - replace maxSize with the check value
Akron3a063ef2021-08-05 19:36:35 +020022// - Add checksum to serialization.
Akron03c92fe2021-08-09 14:07:57 +020023// - Instead of memoizing the loadFactor, better remember
24// the number of set transitions
Akrone61380b2021-08-16 10:10:46 +020025// - Replace/Enhance table with a map
26// - Provide a bufio.Scanner compatible interface.
Akron8e1d69b2021-08-12 17:38:49 +020027// - Mark epsilon transitions in bytes
Akron740f3d72021-08-03 12:12:34 +020028
Akron8ef408b2021-08-02 22:11:04 +020029import (
30 "bufio"
31 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020032 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020033 "fmt"
34 "io"
35 "os"
Akronc9d84a62021-08-03 15:56:03 +020036 "sort"
Akron8ef408b2021-08-02 22:11:04 +020037 "strconv"
38 "strings"
39 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020040
Akron527c10c2021-08-13 01:45:18 +020041 "log"
Akron8ef408b2021-08-02 22:11:04 +020042)
43
44const (
Akron2a4b9292021-08-04 15:35:22 +020045 PROPS = 1
46 SIGMA = 2
47 STATES = 3
48 NONE = 4
Akron2a4b9292021-08-04 15:35:22 +020049 DEBUG = false
50 MAGIC = "DATOK"
51 VERSION = uint16(1)
Akron03c92fe2021-08-09 14:07:57 +020052 FIRSTBIT uint32 = 1 << 31
53 SECONDBIT uint32 = 1 << 30
54 RESTBIT uint32 = ^uint32(0) &^ (FIRSTBIT | SECONDBIT)
Akron8ef408b2021-08-02 22:11:04 +020055)
56
Akron03c92fe2021-08-09 14:07:57 +020057// Serialization is always little endian
Akron6247a5d2021-08-03 19:18:28 +020058var bo binary.ByteOrder = binary.LittleEndian
59
Akron8ef408b2021-08-02 22:11:04 +020060type mapping struct {
61 source int
Akron3fdfec62021-08-04 11:40:10 +020062 target uint32
Akron8ef408b2021-08-02 22:11:04 +020063}
64
65type edge struct {
Akron83e75a22021-08-04 13:14:06 +020066 inSym int
67 outSym int
68 end int
69 nontoken bool
70 tokenend bool
Akron8ef408b2021-08-02 22:11:04 +020071}
72
Akron03c92fe2021-08-09 14:07:57 +020073// Tokenizer is the intermediate representation
74// of the tokenizer.
Akron8ef408b2021-08-02 22:11:04 +020075type Tokenizer struct {
Akron740f3d72021-08-03 12:12:34 +020076 sigmaRev map[int]rune
77 arcCount int
Akron740f3d72021-08-03 12:12:34 +020078 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020079 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020080
81 // Special symbols in sigma
82 epsilon int
83 unknown int
84 identity int
85 final int
Akron03c92fe2021-08-09 14:07:57 +020086 tokenend int
Akron8ef408b2021-08-02 22:11:04 +020087}
88
Akronf1a16502021-08-16 15:24:38 +020089type bc struct {
90 base uint32
91 check uint32
92}
93
Akron03c92fe2021-08-09 14:07:57 +020094// DaTokenizer represents a tokenizer implemented as a
95// Double Array FSA.
Akronf2120ca2021-08-03 16:26:41 +020096type DaTokenizer struct {
Akronea46e8a2021-08-17 00:36:31 +020097 sigma map[rune]int
98 sigmaASCII [256]int
Akron03a3c612021-08-04 11:51:27 +020099 maxSize int
100 loadFactor float64
Akronf1a16502021-08-16 15:24:38 +0200101 array []bc
Akron3f8571a2021-08-05 11:18:10 +0200102 // lastFilledBase uint32
Akronc17f1ca2021-08-03 19:47:27 +0200103
104 // Special symbols in sigma
105 epsilon int
106 unknown int
107 identity int
108 final int
Akron03c92fe2021-08-09 14:07:57 +0200109 tokenend int
Akronf2120ca2021-08-03 16:26:41 +0200110}
111
Akron03c92fe2021-08-09 14:07:57 +0200112// ParseFoma reads the FST from a foma file
113// and creates an internal representation,
114// in case it follows the tokenizer's convention.
Akron64ffd9a2021-08-03 19:55:21 +0200115func LoadFomaFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200116 f, err := os.Open(file)
117 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200118 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200119 return nil
Akron8ef408b2021-08-02 22:11:04 +0200120 }
121 defer f.Close()
122
123 gz, err := gzip.NewReader(f)
124 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200125 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200126 return nil
Akron8ef408b2021-08-02 22:11:04 +0200127 }
128 defer gz.Close()
129
Akron3fdfec62021-08-04 11:40:10 +0200130 return ParseFoma(gz)
Akron8ef408b2021-08-02 22:11:04 +0200131}
132
Akron03c92fe2021-08-09 14:07:57 +0200133// ParseFoma reads the FST from a foma file reader
134// and creates an internal representation,
135// in case it follows the tokenizer's convention.
Akron3fdfec62021-08-04 11:40:10 +0200136func ParseFoma(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200137 r := bufio.NewReader(ior)
138
139 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200140 sigmaRev: make(map[int]rune),
Akronc17f1ca2021-08-03 19:47:27 +0200141 epsilon: -1,
142 unknown: -1,
143 identity: -1,
144 final: -1,
Akron03c92fe2021-08-09 14:07:57 +0200145 tokenend: -1,
Akron8ef408b2021-08-02 22:11:04 +0200146 }
147
Akron740f3d72021-08-03 12:12:34 +0200148 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200149
150 mode := 0
151 var elem []string
152 var elemint [5]int
153
Akron03c92fe2021-08-09 14:07:57 +0200154 // Iterate over all lines of the file.
155 // This is mainly based on foma2js,
156 // licensed under the Apache License, version 2,
157 // and written by Mans Hulden.
Akron8ef408b2021-08-02 22:11:04 +0200158 for {
159 line, err := r.ReadString('\n')
160 if err != nil {
161 if err == io.EOF {
162 break
163 }
Akron527c10c2021-08-13 01:45:18 +0200164 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200165 return nil
Akron8ef408b2021-08-02 22:11:04 +0200166 }
Akron8ef408b2021-08-02 22:11:04 +0200167
Akron439f4ec2021-08-09 15:45:38 +0200168 // Read parser mode for the following lines
169 if strings.HasPrefix(line, "##") {
170 if strings.HasPrefix(line, "##props##") {
171 mode = PROPS
172
173 } else if strings.HasPrefix(line, "##states##") {
174 mode = STATES
175
176 // Adds a final transition symbol to sigma
177 // written as '#' in Mizobuchi et al (2000)
178 tok.sigmaCount++
179 tok.final = tok.sigmaCount
180
181 } else if strings.HasPrefix(line, "##sigma##") {
182
183 mode = SIGMA
184
185 } else if strings.HasPrefix(line, "##end##") {
186
187 mode = NONE
188
189 } else if !strings.HasPrefix(line, "##foma-net") {
Akron527c10c2021-08-13 01:45:18 +0200190 log.Print("Unknown input line")
Akron439f4ec2021-08-09 15:45:38 +0200191 break
192 }
Akron8ef408b2021-08-02 22:11:04 +0200193 continue
194 }
195
Akron439f4ec2021-08-09 15:45:38 +0200196 // Based on the current parser mode, interpret the lines
Akron8ef408b2021-08-02 22:11:04 +0200197 switch mode {
198 case PROPS:
199 {
200 elem = strings.Split(line, " ")
201 /*
202 fmt.Println("arity: " + elem[0])
203 fmt.Println("arccount: " + elem[1])
204 fmt.Println("statecount: " + elem[2])
205 fmt.Println("linecount: " + elem[3])
206 fmt.Println("finalcount: " + elem[4])
207 fmt.Println("pathcount: " + elem[5])
208 fmt.Println("is_deterministic: " + elem[6])
209 fmt.Println("is_pruned: " + elem[7])
210 fmt.Println("is_minimized: " + elem[8])
211 fmt.Println("is_epsilon_free: " + elem[9])
212 fmt.Println("is_loop_free: " + elem[10])
213 fmt.Println("extras: " + elem[11])
214 fmt.Println("name: " + elem[12])
215 */
216 if elem[6] != "1" {
Akron527c10c2021-08-13 01:45:18 +0200217 log.Print("The FST needs to be deterministic")
Akron4db3ecf2021-08-11 18:49:03 +0200218 return nil
Akron8ef408b2021-08-02 22:11:04 +0200219 }
Akron439f4ec2021-08-09 15:45:38 +0200220
Akron8ef408b2021-08-02 22:11:04 +0200221 if elem[9] != "1" {
Akron527c10c2021-08-13 01:45:18 +0200222 log.Print("The FST needs to be epsilon free")
Akron4db3ecf2021-08-11 18:49:03 +0200223 return nil
Akron8ef408b2021-08-02 22:11:04 +0200224 }
225
226 elemint[0], err = strconv.Atoi(elem[1])
227 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200228 log.Print("Can't read arccount")
Akron4db3ecf2021-08-11 18:49:03 +0200229 return nil
Akron8ef408b2021-08-02 22:11:04 +0200230 }
Akron740f3d72021-08-03 12:12:34 +0200231 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200232
Akron8ef408b2021-08-02 22:11:04 +0200233 elemint[0], err = strconv.Atoi(elem[2])
234 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200235 log.Print("Can't read statecount")
Akron4db3ecf2021-08-11 18:49:03 +0200236 return nil
Akron8ef408b2021-08-02 22:11:04 +0200237 }
Akron439f4ec2021-08-09 15:45:38 +0200238
239 // States start at 1 in Mizobuchi et al (2000),
240 // as the state 0 is associated with a fail.
241 // Initialize states and transitions
Akron8ef408b2021-08-02 22:11:04 +0200242 tok.transitions = make([]map[int]*edge, elemint[0]+1)
243 continue
244 }
245 case STATES:
246 {
247 elem = strings.Split(line[0:len(line)-1], " ")
248 if elem[0] == "-1" {
Akron3610f102021-08-08 14:13:25 +0200249 if DEBUG {
250 fmt.Println("Skip", elem)
251 }
Akron8ef408b2021-08-02 22:11:04 +0200252 continue
253 }
254 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200255 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200256 fmt.Println("Unable to translate", elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200257 break
258 }
Akron8ef408b2021-08-02 22:11:04 +0200259
260 if len(elem) > 1 {
261 elemint[1], err = strconv.Atoi(elem[1])
262 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200263 fmt.Println("Unable to translate", elem[1])
Akron8ef408b2021-08-02 22:11:04 +0200264 break
265 }
266 if len(elem) > 2 {
267 elemint[2], err = strconv.Atoi(elem[2])
268 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200269 fmt.Println("Unable to translate", elem[2])
Akron8ef408b2021-08-02 22:11:04 +0200270 break
271 }
272 if len(elem) > 3 {
273 elemint[3], err = strconv.Atoi(elem[3])
274 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200275 fmt.Println("Unable to translate", elem[3])
Akron8ef408b2021-08-02 22:11:04 +0200276 break
277 }
278 if len(elem) > 4 {
279 elemint[4], err = strconv.Atoi(elem[4])
280 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200281 fmt.Println("Unable to translate", elem[4])
Akron8ef408b2021-08-02 22:11:04 +0200282 break
283 }
284 }
285 }
286 }
287 }
288
289 switch len(elem) {
290 case 5:
291 {
Akron740f3d72021-08-03 12:12:34 +0200292 state = elemint[0]
293 inSym = elemint[1]
294 outSym = elemint[2]
295 end = elemint[3]
296 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200297 }
298 case 4:
299 {
300 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200301 state = elemint[0]
302 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200303 } else {
Akron740f3d72021-08-03 12:12:34 +0200304 state = elemint[0]
305 inSym = elemint[1]
306 end = elemint[2]
307 final = elemint[3]
308 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200309 }
310 }
311 case 3:
312 {
Akron740f3d72021-08-03 12:12:34 +0200313 inSym = elemint[0]
314 outSym = elemint[1]
315 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200316 }
317 case 2:
318 {
Akron740f3d72021-08-03 12:12:34 +0200319 inSym = elemint[0]
320 end = elemint[1]
321 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200322 }
323 }
324
Akron83e75a22021-08-04 13:14:06 +0200325 nontoken := false
326 tokenend := false
327
Akron439f4ec2021-08-09 15:45:38 +0200328 // While the states in foma start with 0, the states in the
329 // Mizobuchi FSA start with one - so we increase every state by 1.
330 // We also increase sigma by 1, so there are no 0 transitions.
Akron524c5432021-08-05 14:14:27 +0200331 inSym++
332 outSym++
333
Akron439f4ec2021-08-09 15:45:38 +0200334 // Only a limited list of transitions are allowed
Akron740f3d72021-08-03 12:12:34 +0200335 if inSym != outSym {
Akron01912fc2021-08-12 11:41:58 +0200336 if outSym == tok.tokenend && inSym == tok.epsilon {
Akron83e75a22021-08-04 13:14:06 +0200337 tokenend = true
338 } else if outSym == tok.epsilon {
339 nontoken = true
340 } else {
Akron527c10c2021-08-13 01:45:18 +0200341 log.Println(
Akron740f3d72021-08-03 12:12:34 +0200342 "Unsupported transition: " +
343 strconv.Itoa(state) +
344 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200345 " (" +
Akron740f3d72021-08-03 12:12:34 +0200346 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200347 ":" +
Akron740f3d72021-08-03 12:12:34 +0200348 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200349 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200350 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200351 ":" +
Akron740f3d72021-08-03 12:12:34 +0200352 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200353 ")")
Akron4db3ecf2021-08-11 18:49:03 +0200354 return nil
Akron75ebe7f2021-08-03 10:34:10 +0200355 }
Akron83e75a22021-08-04 13:14:06 +0200356
Akron83e75a22021-08-04 13:14:06 +0200357 } else if inSym == tok.epsilon {
Akron527c10c2021-08-13 01:45:18 +0200358 log.Println("General epsilon transitions are not supported")
Akron4db3ecf2021-08-11 18:49:03 +0200359 return nil
Akron8ef408b2021-08-02 22:11:04 +0200360 }
361
Akron03c92fe2021-08-09 14:07:57 +0200362 // Create an edge based on the collected information
Akron8ef408b2021-08-02 22:11:04 +0200363 targetObj := &edge{
Akron83e75a22021-08-04 13:14:06 +0200364 inSym: inSym,
365 outSym: outSym,
366 end: end + 1,
367 tokenend: tokenend,
368 nontoken: nontoken,
Akron8ef408b2021-08-02 22:11:04 +0200369 }
370
Akron740f3d72021-08-03 12:12:34 +0200371 // Initialize outgoing states
372 if tok.transitions[state+1] == nil {
373 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200374 }
375
Akron740f3d72021-08-03 12:12:34 +0200376 // Ignore transitions with invalid symbols
377 if inSym >= 0 {
378 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200379 }
Akron8ef408b2021-08-02 22:11:04 +0200380
Akron740f3d72021-08-03 12:12:34 +0200381 // Add final transition
382 if final == 1 {
Akron03c92fe2021-08-09 14:07:57 +0200383 // TODO:
384 // Maybe this is less relevant for tokenizers
Akronc17f1ca2021-08-03 19:47:27 +0200385 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200386 }
387
Akronb4bbb472021-08-09 11:49:38 +0200388 if DEBUG {
Akron740f3d72021-08-03 12:12:34 +0200389 fmt.Println("Add",
390 state+1, "->", end+1,
391 "(",
392 inSym,
393 ":",
394 outSym,
395 ") (",
396 string(tok.sigmaRev[inSym]),
397 ":",
398 string(tok.sigmaRev[outSym]),
Akron524c5432021-08-05 14:14:27 +0200399 ")",
400 ";",
401 "TE:", tokenend,
Akron3610f102021-08-08 14:13:25 +0200402 "NT:", nontoken,
403 "FIN:", final)
Akron740f3d72021-08-03 12:12:34 +0200404 }
Akron75ebe7f2021-08-03 10:34:10 +0200405
Akron8ef408b2021-08-02 22:11:04 +0200406 continue
407 }
408 case SIGMA:
409 {
410 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
411
412 // Turn string into sigma id
413 number, err := strconv.Atoi(elem[0])
414
Akron524c5432021-08-05 14:14:27 +0200415 // ID needs to be > 1
416 number++
417
Akron8ef408b2021-08-02 22:11:04 +0200418 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200419 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200420 return nil
Akron8ef408b2021-08-02 22:11:04 +0200421 }
422
Akron740f3d72021-08-03 12:12:34 +0200423 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200424
425 var symbol rune
426
427 // Read rune
428 if utf8.RuneCountInString(elem[1]) == 1 {
429 symbol = []rune(elem[1])[0]
430
Akron8ef408b2021-08-02 22:11:04 +0200431 } else if utf8.RuneCountInString(elem[1]) > 1 {
Akron439f4ec2021-08-09 15:45:38 +0200432
433 // Probably a MCS
Akron8ef408b2021-08-02 22:11:04 +0200434 switch elem[1] {
435 case "@_EPSILON_SYMBOL_@":
436 {
Akronc17f1ca2021-08-03 19:47:27 +0200437 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200438 }
439 case "@_UNKNOWN_SYMBOL_@":
440 {
Akronc17f1ca2021-08-03 19:47:27 +0200441 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200442 }
443
444 case "@_IDENTITY_SYMBOL_@":
445 {
Akronc17f1ca2021-08-03 19:47:27 +0200446 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200447 }
Akron03c92fe2021-08-09 14:07:57 +0200448
449 case "@_TOKEN_SYMBOL_@":
450 {
451 tok.tokenend = number
Akron03c92fe2021-08-09 14:07:57 +0200452 }
Akron8ef408b2021-08-02 22:11:04 +0200453 default:
Akron740f3d72021-08-03 12:12:34 +0200454 {
Akron527c10c2021-08-13 01:45:18 +0200455 log.Println("MCS not supported: " + line)
Akron4db3ecf2021-08-11 18:49:03 +0200456 return nil
Akron740f3d72021-08-03 12:12:34 +0200457 }
Akron8ef408b2021-08-02 22:11:04 +0200458 }
Akron439f4ec2021-08-09 15:45:38 +0200459 continue
Akron8ef408b2021-08-02 22:11:04 +0200460
Akron740f3d72021-08-03 12:12:34 +0200461 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200462 line, err = r.ReadString('\n')
463 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200464 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200465 return nil
Akron8ef408b2021-08-02 22:11:04 +0200466 }
467 if len(line) != 1 {
Akron527c10c2021-08-13 01:45:18 +0200468 log.Println("MCS not supported:" + line)
Akron4db3ecf2021-08-11 18:49:03 +0200469 return nil
Akron8ef408b2021-08-02 22:11:04 +0200470 }
Akron03c92fe2021-08-09 14:07:57 +0200471 symbol = rune('\n')
Akron8ef408b2021-08-02 22:11:04 +0200472 }
473
Akron740f3d72021-08-03 12:12:34 +0200474 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200475 }
476 }
477 }
478
479 return tok
480}
481
Akron64ffd9a2021-08-03 19:55:21 +0200482// Set alphabet A to the list of all symbols
483// outgoing from s
Akron439f4ec2021-08-09 15:45:38 +0200484func (tok *Tokenizer) getSet(s int, A *[]int) {
Akron64ffd9a2021-08-03 19:55:21 +0200485 for a := range tok.transitions[s] {
486 *A = append(*A, a)
487 }
488
489 // Not required, but simplifies bug hunting
Akron439f4ec2021-08-09 15:45:38 +0200490 // sort.Ints(*A)
Akron64ffd9a2021-08-03 19:55:21 +0200491}
492
Akron439f4ec2021-08-09 15:45:38 +0200493// ToDoubleArray turns the intermediate tokenizer representation
494// into a double array representation.
495//
496// This is based on Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200497func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
498
499 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +0200500 sigma: make(map[rune]int),
501 loadFactor: -1,
502 final: tok.final,
503 unknown: tok.unknown,
504 identity: tok.identity,
505 epsilon: tok.epsilon,
Akron03c92fe2021-08-09 14:07:57 +0200506 tokenend: tok.tokenend,
Akron3f8571a2021-08-05 11:18:10 +0200507 // lastFilledBase: 1,
Akronf2120ca2021-08-03 16:26:41 +0200508 }
509
Akronf1a16502021-08-16 15:24:38 +0200510 dat.resize(dat.final)
511
Akronf2120ca2021-08-03 16:26:41 +0200512 for num, sym := range tok.sigmaRev {
Akronea46e8a2021-08-17 00:36:31 +0200513 if int(sym) < 256 {
514 dat.sigmaASCII[int(sym)] = num
515 }
Akronf2120ca2021-08-03 16:26:41 +0200516 dat.sigma[sym] = num
517 }
Akron8ef408b2021-08-02 22:11:04 +0200518
519 mark := 0
520 size := 0
521
Akron439f4ec2021-08-09 15:45:38 +0200522 // Create a mapping from s (in Ms aka Intermediate FSA)
523 // to t (in Mt aka Double Array FSA)
Akron740f3d72021-08-03 12:12:34 +0200524 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200525
Akron439f4ec2021-08-09 15:45:38 +0200526 // Initialize with the start state
Akron8ef408b2021-08-02 22:11:04 +0200527 table[size] = &mapping{source: 1, target: 1}
528 size++
529
Akron740f3d72021-08-03 12:12:34 +0200530 // Allocate space for the outgoing symbol range
531 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200532
533 for mark < size {
534 s := table[mark].source // This is a state in Ms
535 t := table[mark].target // This is a state in Mt
536 mark++
Akron740f3d72021-08-03 12:12:34 +0200537
538 // Following the paper, here the state t can be remembered
539 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200540 A = A[:0]
Akron439f4ec2021-08-09 15:45:38 +0200541 tok.getSet(s, &A)
Akron8ef408b2021-08-02 22:11:04 +0200542
Akron740f3d72021-08-03 12:12:34 +0200543 // Set base to the first free slot in the double array
Akronf1a16502021-08-16 15:24:38 +0200544 dat.array[t].setBase(dat.xCheck(A))
Akron8ef408b2021-08-02 22:11:04 +0200545
Akron773b1ef2021-08-03 17:37:20 +0200546 // TODO:
Akron068874c2021-08-04 15:19:56 +0200547 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200548 // outdegree of .end
549
Akron740f3d72021-08-03 12:12:34 +0200550 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200551 for _, a := range A {
552
Akronc17f1ca2021-08-03 19:47:27 +0200553 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200554
Akron740f3d72021-08-03 12:12:34 +0200555 // Aka g(s, a)
556 s1 := tok.transitions[s][a].end
Akron8ef408b2021-08-02 22:11:04 +0200557
Akron740f3d72021-08-03 12:12:34 +0200558 // Store the transition
Akronf1a16502021-08-16 15:24:38 +0200559 t1 := dat.array[t].getBase() + uint32(a)
560 dat.array[t1].setCheck(t)
561
562 // Set maxSize
563 if dat.maxSize < int(t1) {
564 dat.maxSize = int(t1)
565 }
Akron8ef408b2021-08-02 22:11:04 +0200566
Akron439f4ec2021-08-09 15:45:38 +0200567 if DEBUG {
Akron524c5432021-08-05 14:14:27 +0200568 fmt.Println("Translate transition",
569 s, "->", s1, "(", a, ")", "to", t, "->", t1)
570 }
571
Akron83e75a22021-08-04 13:14:06 +0200572 // Mark the state as being the target of a nontoken transition
573 if tok.transitions[s][a].nontoken {
Akronf1a16502021-08-16 15:24:38 +0200574 dat.array[t1].setNonToken(true)
Akron524c5432021-08-05 14:14:27 +0200575 if DEBUG {
576 fmt.Println("Set", t1, "to nontoken")
577 }
Akron83e75a22021-08-04 13:14:06 +0200578 }
579
Akron84d68e62021-08-04 17:06:52 +0200580 // Mark the state as being the target of a tokenend transition
581 if tok.transitions[s][a].tokenend {
Akronf1a16502021-08-16 15:24:38 +0200582 dat.array[t1].setTokenEnd(true)
Akron524c5432021-08-05 14:14:27 +0200583 if DEBUG {
584 fmt.Println("Set", t1, "to tokenend")
585 }
Akron84d68e62021-08-04 17:06:52 +0200586 }
587
Akron740f3d72021-08-03 12:12:34 +0200588 // Check for representative states
Akron439f4ec2021-08-09 15:45:38 +0200589 r := stateAlreadyInTable(s1, table, size)
Akron740f3d72021-08-03 12:12:34 +0200590
Akron439f4ec2021-08-09 15:45:38 +0200591 // No representative found
Akron8ef408b2021-08-02 22:11:04 +0200592 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200593 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200594 table[size] = &mapping{source: s1, target: t1}
595 size++
596 } else {
Akron740f3d72021-08-03 12:12:34 +0200597 // Overwrite with the representative state
Akronf1a16502021-08-16 15:24:38 +0200598 dat.array[t1].setBase(r)
599 dat.array[t1].setSeparate(true)
Akron8ef408b2021-08-02 22:11:04 +0200600 }
601 } else {
Akron740f3d72021-08-03 12:12:34 +0200602 // Store a final transition
Akronf1a16502021-08-16 15:24:38 +0200603 dat.array[dat.array[t].getBase()+uint32(dat.final)].setCheck(t)
604
605 if dat.maxSize < int(dat.array[t].getBase()+uint32(dat.final)) {
606 dat.maxSize = int(dat.array[t].getBase() + uint32(dat.final))
607 }
Akron8ef408b2021-08-02 22:11:04 +0200608 }
609 }
610 }
611
612 // Following Mizobuchi et al (2000) the size of the
613 // FSA should be stored in check(1).
Akronf1a16502021-08-16 15:24:38 +0200614 // We make the size a bit smaller so we never have to check for boundaries.
Akron3fdfec62021-08-04 11:40:10 +0200615 dat.setSize(dat.maxSize + 1)
Akronf2120ca2021-08-03 16:26:41 +0200616 dat.array = dat.array[:dat.maxSize+1]
617 return dat
Akron8ef408b2021-08-02 22:11:04 +0200618}
619
Akron8ef408b2021-08-02 22:11:04 +0200620// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200621// exists and return this as a representative.
622// Currently iterates through the whole table
623// in a bruteforce manner.
Akron439f4ec2021-08-09 15:45:38 +0200624func stateAlreadyInTable(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200625 for x := 0; x < size; x++ {
626 if table[x].source == s {
627 return table[x].target
628 }
629 }
630 return 0
631}
632
Akron64ffd9a2021-08-03 19:55:21 +0200633// Resize double array when necessary
634func (dat *DaTokenizer) resize(l int) {
635 // TODO:
636 // This is a bit too aggressive atm and should be calmed down.
637 if len(dat.array) <= l {
Akronf1a16502021-08-16 15:24:38 +0200638 dat.array = append(dat.array, make([]bc, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200639 }
Akron64ffd9a2021-08-03 19:55:21 +0200640}
Akronc9d84a62021-08-03 15:56:03 +0200641
Akron64ffd9a2021-08-03 19:55:21 +0200642// Set base value in double array
Akronf1a16502021-08-16 15:24:38 +0200643func (bc *bc) setBase(v uint32) {
644 bc.base = v
Akron439f4ec2021-08-09 15:45:38 +0200645}
646
647// Get base value in double array
Akronf1a16502021-08-16 15:24:38 +0200648func (bc *bc) getBase() uint32 {
649 return bc.base & RESTBIT
Akron439f4ec2021-08-09 15:45:38 +0200650}
651
652// Set check value in double array
Akronf1a16502021-08-16 15:24:38 +0200653func (bc *bc) setCheck(v uint32) {
654 bc.check = v
Akron439f4ec2021-08-09 15:45:38 +0200655}
656
657// Get check value in double array
Akronf1a16502021-08-16 15:24:38 +0200658func (bc *bc) getCheck() uint32 {
659 return bc.check & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200660}
661
Akron3fdfec62021-08-04 11:40:10 +0200662// Returns true if a state is separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200663func (bc *bc) isSeparate() bool {
664 return bc.base&FIRSTBIT != 0
Akron3fdfec62021-08-04 11:40:10 +0200665}
666
667// Mark a state as separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200668func (bc *bc) setSeparate(sep bool) {
Akron3fdfec62021-08-04 11:40:10 +0200669 if sep {
Akronf1a16502021-08-16 15:24:38 +0200670 bc.base |= FIRSTBIT
Akron3fdfec62021-08-04 11:40:10 +0200671 } else {
Akronf1a16502021-08-16 15:24:38 +0200672 bc.base &= (RESTBIT | SECONDBIT)
Akron3fdfec62021-08-04 11:40:10 +0200673 }
674}
675
Akron83e75a22021-08-04 13:14:06 +0200676// Returns true if a state is the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200677func (bc *bc) isNonToken() bool {
678 return bc.check&FIRSTBIT != 0
Akron83e75a22021-08-04 13:14:06 +0200679}
680
681// Mark a state as being the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200682func (bc *bc) setNonToken(sep bool) {
Akron83e75a22021-08-04 13:14:06 +0200683 if sep {
Akronf1a16502021-08-16 15:24:38 +0200684 bc.check |= FIRSTBIT
Akron83e75a22021-08-04 13:14:06 +0200685 } else {
Akronf1a16502021-08-16 15:24:38 +0200686 bc.check &= (RESTBIT | SECONDBIT)
Akron83e75a22021-08-04 13:14:06 +0200687 }
688}
689
Akron84d68e62021-08-04 17:06:52 +0200690// Returns true if a state is the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200691func (bc *bc) isTokenEnd() bool {
692 return bc.check&SECONDBIT != 0
Akron84d68e62021-08-04 17:06:52 +0200693}
694
695// Mark a state as being the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200696func (bc *bc) setTokenEnd(sep bool) {
Akron84d68e62021-08-04 17:06:52 +0200697 if sep {
Akronf1a16502021-08-16 15:24:38 +0200698 bc.check |= SECONDBIT
Akron84d68e62021-08-04 17:06:52 +0200699 } else {
Akronf1a16502021-08-16 15:24:38 +0200700 bc.check &= (RESTBIT | FIRSTBIT)
Akron84d68e62021-08-04 17:06:52 +0200701 }
702}
703
Akron64ffd9a2021-08-03 19:55:21 +0200704// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200705func (dat *DaTokenizer) setSize(v int) {
Akronf1a16502021-08-16 15:24:38 +0200706 dat.array[1].setCheck(uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200707}
708
709// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200710func (dat *DaTokenizer) GetSize() int {
Akronf1a16502021-08-16 15:24:38 +0200711 return int(dat.array[1].getCheck())
Akron8ef408b2021-08-02 22:11:04 +0200712}
713
714// Based on Mizobuchi et al (2000), p. 124
715// This iterates for every state through the complete double array
716// structure until it finds a gap that fits all outgoing transitions
717// of the state. This is extremely slow, but is only necessary in the
718// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200719func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200720
721 // Start at the first entry of the double array list
Akron3f8571a2021-08-05 11:18:10 +0200722 base := uint32(1) // dat.lastFilledBase
723 // skip := false
Akron8ef408b2021-08-02 22:11:04 +0200724OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200725
Akron3f8571a2021-08-05 11:18:10 +0200726 /*
727 if !skip {
728 if dat.getCheck(base) != 0 {
729 dat.lastFilledBase = base
730 } else {
731 skip = true
732 }
733 }
734 */
735
Akron740f3d72021-08-03 12:12:34 +0200736 // Resize the array if necessary
Akronf1a16502021-08-16 15:24:38 +0200737 dat.resize(int(base) + dat.final)
Akron8ef408b2021-08-02 22:11:04 +0200738 for _, a := range symbols {
Akronf1a16502021-08-16 15:24:38 +0200739 if dat.array[int(base)+a].getCheck() != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200740 base++
741 goto OVERLAP
742 }
743 }
Akron8ef408b2021-08-02 22:11:04 +0200744 return base
745}
746
Akron3610f102021-08-08 14:13:25 +0200747// List all outgoing transitions for a state
748// for testing purposes
749func (dat *DaTokenizer) outgoing(t uint32) []int {
750
751 valid := make([]int, 0, len(dat.sigma))
752
753 for _, a := range dat.sigma {
Akronf1a16502021-08-16 15:24:38 +0200754 t1 := dat.array[t].getBase() + uint32(a)
755 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200756 valid = append(valid, a)
757 }
758 }
759
760 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
Akronf1a16502021-08-16 15:24:38 +0200761 t1 := dat.array[t].getBase() + uint32(a)
762 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200763 valid = append(valid, -1*a)
764 }
765 }
766
767 sort.Ints(valid)
768
769 return valid
770}
771
Akron03a3c612021-08-04 11:51:27 +0200772// LoadFactor as defined in Kanda et al (2018),
773// i.e. the proportion of non-empty elements to all elements.
774func (dat *DaTokenizer) LoadFactor() float64 {
Akrond66a9262021-08-03 17:09:09 +0200775
Akron03a3c612021-08-04 11:51:27 +0200776 // Cache the loadfactor
Akron3f8571a2021-08-05 11:18:10 +0200777 if dat.loadFactor > 0 {
Akron03a3c612021-08-04 11:51:27 +0200778 return dat.loadFactor
Akron773b1ef2021-08-03 17:37:20 +0200779 }
Akrond66a9262021-08-03 17:09:09 +0200780 nonEmpty := 0
781 all := len(dat.array) / 2
Akronf1a16502021-08-16 15:24:38 +0200782 for x := 1; x < len(dat.array); x++ {
783 if dat.array[x].getBase() != 0 {
Akrond66a9262021-08-03 17:09:09 +0200784 nonEmpty++
785 }
786 }
Akron03a3c612021-08-04 11:51:27 +0200787 dat.loadFactor = float64(nonEmpty) / float64(all) * 100
788 return dat.loadFactor
Akrond66a9262021-08-03 17:09:09 +0200789}
790
Akron439f4ec2021-08-09 15:45:38 +0200791// Save stores the double array data in a file
Akron3a063ef2021-08-05 19:36:35 +0200792func (dat *DaTokenizer) Save(file string) (n int64, err error) {
793 f, err := os.Create(file)
794 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200795 log.Println(err)
Akron439f4ec2021-08-09 15:45:38 +0200796 return 0, err
Akron3a063ef2021-08-05 19:36:35 +0200797 }
798 defer f.Close()
799 gz := gzip.NewWriter(f)
800 defer gz.Close()
801 n, err = dat.WriteTo(gz)
802 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200803 log.Println(err)
Akron3a063ef2021-08-05 19:36:35 +0200804 return n, err
805 }
806 gz.Flush()
807 return n, nil
808}
809
810// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200811func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
812
Akron3a063ef2021-08-05 19:36:35 +0200813 wb := bufio.NewWriter(w)
814 defer wb.Flush()
815
Akron6247a5d2021-08-03 19:18:28 +0200816 // Store magical header
Akron3a063ef2021-08-05 19:36:35 +0200817 all, err := wb.Write([]byte(MAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200818 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200819 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200820 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200821 }
822
823 // Get sigma as a list
824 sigmalist := make([]rune, len(dat.sigma)+16)
825 max := 0
826 for sym, num := range dat.sigma {
827 sigmalist[num] = sym
828 if num > max {
829 max = num
830 }
831 }
832
833 sigmalist = sigmalist[:max+1]
834
Akron3f8571a2021-08-05 11:18:10 +0200835 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200836 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200837 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
838 bo.PutUint16(buf[4:6], uint16(dat.unknown))
839 bo.PutUint16(buf[6:8], uint16(dat.identity))
840 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200841 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akronf1a16502021-08-16 15:24:38 +0200842 bo.PutUint32(buf[12:16], uint32(len(dat.array)*2)) // Legacy support
Akron3a063ef2021-08-05 19:36:35 +0200843 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200844 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200845 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200846 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200847 }
848
849 all += more
850
Akron6247a5d2021-08-03 19:18:28 +0200851 // Write sigma
852 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200853
Akron3a063ef2021-08-05 19:36:35 +0200854 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200855 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200856 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200857 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200858 }
859 all += more
860 }
Akron439f4ec2021-08-09 15:45:38 +0200861
Akron6247a5d2021-08-03 19:18:28 +0200862 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200863 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200864 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200865 }
Akron6247a5d2021-08-03 19:18:28 +0200866
867 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200868 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200869 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200870 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200871 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200872 }
873 all += more
874
Akronf1a16502021-08-16 15:24:38 +0200875 // for x := 0; x < len(dat.array); x++ {
876 for _, bc := range dat.array {
877 bo.PutUint32(buf[0:4], bc.base)
878 more, err = wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200879 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200880 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200881 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200882 }
Akron439f4ec2021-08-09 15:45:38 +0200883 all += more
Akron3a063ef2021-08-05 19:36:35 +0200884 if more != 4 {
Akronf1a16502021-08-16 15:24:38 +0200885 log.Println("Can not write base uint32")
886 return int64(all), err
887 }
888 bo.PutUint32(buf[0:4], bc.check)
889 more, err = wb.Write(buf[0:4])
890 if err != nil {
891 log.Println(err)
892 return int64(all), err
893 }
894 all += more
895 if more != 4 {
896 log.Println("Can not write check uint32")
Akron3a063ef2021-08-05 19:36:35 +0200897 return int64(all), err
898 }
Akron6247a5d2021-08-03 19:18:28 +0200899 }
900
901 return int64(all), err
902}
903
Akron439f4ec2021-08-09 15:45:38 +0200904// LoadDatokFile reads a double array represented tokenizer
905// from a file.
Akron3f8571a2021-08-05 11:18:10 +0200906func LoadDatokFile(file string) *DaTokenizer {
907 f, err := os.Open(file)
908 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200909 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200910 return nil
Akron3f8571a2021-08-05 11:18:10 +0200911 }
912 defer f.Close()
913
914 gz, err := gzip.NewReader(f)
915 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200916 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200917 return nil
Akron3f8571a2021-08-05 11:18:10 +0200918 }
919 defer gz.Close()
920
Akron3a063ef2021-08-05 19:36:35 +0200921 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +0200922 return ParseDatok(gz)
923}
924
Akron439f4ec2021-08-09 15:45:38 +0200925// LoadDatokFile reads a double array represented tokenizer
926// from an io.Reader
Akron3f8571a2021-08-05 11:18:10 +0200927func ParseDatok(ior io.Reader) *DaTokenizer {
928
Akron439f4ec2021-08-09 15:45:38 +0200929 // Initialize tokenizer with default values
Akron3f8571a2021-08-05 11:18:10 +0200930 dat := &DaTokenizer{
931 sigma: make(map[rune]int),
932 epsilon: 0,
933 unknown: 0,
934 identity: 0,
935 final: 0,
936 loadFactor: 0,
937 }
938
939 r := bufio.NewReader(ior)
940
Akron3f8571a2021-08-05 11:18:10 +0200941 buf := make([]byte, 1024)
942 buf = buf[0:len(MAGIC)]
943
Akron439f4ec2021-08-09 15:45:38 +0200944 _, err := r.Read(buf)
Akron3f8571a2021-08-05 11:18:10 +0200945
946 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200947 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200948 return nil
949 }
950
Akron3f8571a2021-08-05 11:18:10 +0200951 if string(MAGIC) != string(buf) {
Akron527c10c2021-08-13 01:45:18 +0200952 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200953 return nil
954 }
955
Akron439f4ec2021-08-09 15:45:38 +0200956 more, err := io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +0200957 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200958 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200959 return nil
960 }
961
Akron439f4ec2021-08-09 15:45:38 +0200962 if more != 16 {
Akron527c10c2021-08-13 01:45:18 +0200963 log.Println("Read bytes do not fit")
Akron439f4ec2021-08-09 15:45:38 +0200964 return nil
965 }
Akron3f8571a2021-08-05 11:18:10 +0200966
Akron3a063ef2021-08-05 19:36:35 +0200967 version := bo.Uint16(buf[0:2])
968
969 if version != VERSION {
Akron527c10c2021-08-13 01:45:18 +0200970 log.Println("Version not compatible")
Akron3a063ef2021-08-05 19:36:35 +0200971 return nil
972 }
973
Akron3f8571a2021-08-05 11:18:10 +0200974 dat.epsilon = int(bo.Uint16(buf[2:4]))
975 dat.unknown = int(bo.Uint16(buf[4:6]))
976 dat.identity = int(bo.Uint16(buf[6:8]))
977 dat.final = int(bo.Uint16(buf[8:10]))
978
979 sigmaCount := int(bo.Uint16(buf[10:12]))
Akronf1a16502021-08-16 15:24:38 +0200980 arraySize := int(bo.Uint32(buf[12:16])) / 2 // Legacy support
Akron3f8571a2021-08-05 11:18:10 +0200981
Akron3a063ef2021-08-05 19:36:35 +0200982 // Shouldn't be relevant though
983 dat.maxSize = arraySize - 1
984
Akron3f8571a2021-08-05 11:18:10 +0200985 for x := 0; x < sigmaCount; x++ {
Akron439f4ec2021-08-09 15:45:38 +0200986 sym, _, err := r.ReadRune()
Akron3f8571a2021-08-05 11:18:10 +0200987 if err == nil && sym != 0 {
Akronea46e8a2021-08-17 00:36:31 +0200988 if int(sym) < 256 {
989 dat.sigmaASCII[int(sym)] = x
990 }
Akron3f8571a2021-08-05 11:18:10 +0200991 dat.sigma[sym] = x
992 }
Akron3f8571a2021-08-05 11:18:10 +0200993 }
994
Akron439f4ec2021-08-09 15:45:38 +0200995 _, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +0200996
997 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200998 log.Print(err)
Akron3f8571a2021-08-05 11:18:10 +0200999 return nil
1000 }
1001
Akron3f8571a2021-08-05 11:18:10 +02001002 if string("T") != string(buf[0:1]) {
Akron527c10c2021-08-13 01:45:18 +02001003 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +02001004 return nil
1005 }
1006
1007 // Read based on length
Akronf1a16502021-08-16 15:24:38 +02001008 dat.array = make([]bc, arraySize)
Akron3f8571a2021-08-05 11:18:10 +02001009
Akronbb4aac52021-08-13 00:52:27 +02001010 dataArray, err := io.ReadAll(r)
Akron439f4ec2021-08-09 15:45:38 +02001011
Akronbb4aac52021-08-13 00:52:27 +02001012 if err == io.EOF {
Akron527c10c2021-08-13 01:45:18 +02001013 log.Println(err)
Akronbb4aac52021-08-13 00:52:27 +02001014 return nil
1015 }
1016
Akronf1a16502021-08-16 15:24:38 +02001017 if len(dataArray) < arraySize*8 {
Akron527c10c2021-08-13 01:45:18 +02001018 log.Println("Not enough bytes read")
Akronbb4aac52021-08-13 00:52:27 +02001019 return nil
1020 }
1021
1022 for x := 0; x < arraySize; x++ {
Akronf1a16502021-08-16 15:24:38 +02001023 dat.array[x].base = bo.Uint32(dataArray[x*8 : (x*8)+4])
1024 dat.array[x].check = bo.Uint32(dataArray[(x*8)+4 : (x*8)+8])
Akron3f8571a2021-08-05 11:18:10 +02001025 }
1026
1027 return dat
1028}
1029
Akron439f4ec2021-08-09 15:45:38 +02001030// Show the current state of the buffer,
1031// for testing puroses
Akron3610f102021-08-08 14:13:25 +02001032func showBuffer(buffer []rune, buffo int, buffi int) string {
1033 out := make([]rune, 0, 1024)
1034 for x := 0; x < len(buffer); x++ {
1035 if buffi == x {
1036 out = append(out, '^')
1037 }
1038 if buffo == x {
1039 out = append(out, '[', buffer[x], ']')
1040 } else {
1041 out = append(out, buffer[x])
1042 }
1043 }
1044 return string(out)
1045}
1046
Akron84d68e62021-08-04 17:06:52 +02001047// Transduce an input string against the double array
Akron3610f102021-08-08 14:13:25 +02001048// FSA. The rules are always greedy. If the automaton fails,
1049// it takes the last possible token ending branch.
Akron068874c2021-08-04 15:19:56 +02001050//
Akron4db3ecf2021-08-11 18:49:03 +02001051// Based on Mizobuchi et al (2000), p. 129,
1052// with additional support for IDENTITY, UNKNOWN
1053// and EPSILON transitions and NONTOKEN and TOKENEND handling.
Akron3f8571a2021-08-05 11:18:10 +02001054func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron068874c2021-08-04 15:19:56 +02001055 var a int
Akronb4bbb472021-08-09 11:49:38 +02001056 var t0 uint32
Akronb7e1f132021-08-10 11:52:31 +02001057 t := uint32(1) // Initial state
1058 var ok, rewindBuffer bool
Akron068874c2021-08-04 15:19:56 +02001059
Akron3610f102021-08-08 14:13:25 +02001060 // Remember the last position of a possible tokenend,
1061 // in case the automaton fails.
1062 epsilonState := uint32(0)
1063 epsilonOffset := 0
1064
1065 // Implement a low level buffer for full control,
1066 // however - it is probably better to introduce
1067 // this on a higher level with a io.Reader interface
1068 // The buffer stores a single word and may have white
1069 // space at the end (but not at the beginning).
1070 //
1071 // This is the only backtracking requirement because of
1072 // epsilon transitions, to support tokenizations like:
1073 // "this is an example|.| And it works." vs
1074 // "this is an example.com| application."
Akronb7e1f132021-08-10 11:52:31 +02001075 //
1076 // TODO:
1077 // Store a translation buffer as well, so characters don't
1078 // have to be translated multiple times!
Akron3610f102021-08-08 14:13:25 +02001079 buffer := make([]rune, 1024)
1080 buffo := 0 // Buffer offset
1081 buffi := 0 // Buffer length
1082
Akron3f8571a2021-08-05 11:18:10 +02001083 reader := bufio.NewReader(r)
1084 writer := bufio.NewWriter(w)
1085 defer writer.Flush()
Akron068874c2021-08-04 15:19:56 +02001086
Akron3f8571a2021-08-05 11:18:10 +02001087 var char rune
1088 var err error
1089 eof := false
Akronb7e1f132021-08-10 11:52:31 +02001090 newchar := true
Akron3f8571a2021-08-05 11:18:10 +02001091
Akronc5d8d432021-08-10 16:48:44 +02001092PARSECHAR:
Akron3f8571a2021-08-05 11:18:10 +02001093 for {
1094
Akronb7e1f132021-08-10 11:52:31 +02001095 if newchar {
1096 // Get from reader if buffer is empty
1097 if buffo >= buffi {
Akron1594cb82021-08-11 11:14:56 +02001098 if eof {
1099 break
1100 }
Akronb7e1f132021-08-10 11:52:31 +02001101 char, _, err = reader.ReadRune()
Akron439f4ec2021-08-09 15:45:38 +02001102
Akronb7e1f132021-08-10 11:52:31 +02001103 // No more runes to read
1104 if err != nil {
1105 eof = true
1106 break
1107 }
1108 buffer[buffi] = char
1109 buffi++
Akron3f8571a2021-08-05 11:18:10 +02001110 }
Akronb7e1f132021-08-10 11:52:31 +02001111
1112 char = buffer[buffo]
1113
1114 if DEBUG {
1115 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
1116 }
1117
1118 // TODO: Better not repeatedly check for a!
Akronea46e8a2021-08-17 00:36:31 +02001119 if int(char) < 256 {
1120 a = dat.sigmaASCII[int(char)]
1121 } else {
1122 a, ok = dat.sigma[char]
1123 if !ok {
1124 a = 0
1125 }
1126 }
Akronb7e1f132021-08-10 11:52:31 +02001127
1128 // Use identity symbol if character is not in sigma
Akronea46e8a2021-08-17 00:36:31 +02001129 if a == 0 && dat.identity != -1 {
Akronb7e1f132021-08-10 11:52:31 +02001130 a = dat.identity
1131 }
1132
1133 t0 = t
1134
1135 // Check for epsilon transitions and remember
Akronf1a16502021-08-16 15:24:38 +02001136 if dat.array[dat.array[t0].getBase()+uint32(dat.epsilon)].getCheck() == t0 {
Akronb7e1f132021-08-10 11:52:31 +02001137 // Remember state for backtracking to last tokenend state
1138 epsilonState = t0
1139 epsilonOffset = buffo
1140 }
Akron3f8571a2021-08-05 11:18:10 +02001141 }
Akron3610f102021-08-08 14:13:25 +02001142
Akronb7e1f132021-08-10 11:52:31 +02001143 // Checks a transition based on t0, a and buffo
Akronf1a16502021-08-16 15:24:38 +02001144 t = dat.array[t0].getBase() + uint32(a)
1145 ta := dat.array[t]
Akron068874c2021-08-04 15:19:56 +02001146
Akron524c5432021-08-05 14:14:27 +02001147 if DEBUG {
Akronb7e1f132021-08-10 11:52:31 +02001148 // Char is only relevant if set
1149 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
1150 if false {
1151 fmt.Println(dat.outgoing(t0))
1152 }
Akron524c5432021-08-05 14:14:27 +02001153 }
1154
Akronb7e1f132021-08-10 11:52:31 +02001155 // Check if the transition is invalid according to the double array
Akronf1a16502021-08-16 15:24:38 +02001156 if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
Akron068874c2021-08-04 15:19:56 +02001157
1158 if DEBUG {
Akronf1a16502021-08-16 15:24:38 +02001159 fmt.Println("Match is not fine!", t, "and", ta.getCheck(), "vs", t0)
Akron068874c2021-08-04 15:19:56 +02001160 }
1161
1162 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +02001163
Akron068874c2021-08-04 15:19:56 +02001164 // Try again with unknown symbol, in case identity failed
Akronb7e1f132021-08-10 11:52:31 +02001165 // Char is only relevant when set
Akron068874c2021-08-04 15:19:56 +02001166 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001167 fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +02001168 }
1169 a = dat.unknown
1170
1171 } else if a != dat.epsilon {
Akronb4bbb472021-08-09 11:49:38 +02001172
Akron068874c2021-08-04 15:19:56 +02001173 // Try again with epsilon symbol, in case everything else failed
Akronb4bbb472021-08-09 11:49:38 +02001174 t0 = epsilonState
Akron3610f102021-08-08 14:13:25 +02001175 epsilonState = 0 // reset
1176 buffo = epsilonOffset
Akron439f4ec2021-08-09 15:45:38 +02001177 a = dat.epsilon
1178
Akron3610f102021-08-08 14:13:25 +02001179 if DEBUG {
1180 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
1181 }
Akronb4bbb472021-08-09 11:49:38 +02001182
Akron068874c2021-08-04 15:19:56 +02001183 } else {
1184 break
1185 }
Akron068874c2021-08-04 15:19:56 +02001186
Akronb7e1f132021-08-10 11:52:31 +02001187 newchar = false
1188 continue
Akronb4bbb472021-08-09 11:49:38 +02001189 }
1190
Akronb7e1f132021-08-10 11:52:31 +02001191 // Transition was successful
1192 rewindBuffer = false
Akron439f4ec2021-08-09 15:45:38 +02001193
1194 // Transition consumes a character
Akronb4bbb472021-08-09 11:49:38 +02001195 if a != dat.epsilon {
1196
Akron3610f102021-08-08 14:13:25 +02001197 buffo++
Akronb4bbb472021-08-09 11:49:38 +02001198
Akron439f4ec2021-08-09 15:45:38 +02001199 // Transition does not produce a character
Akronf1a16502021-08-16 15:24:38 +02001200 if buffo == 1 && ta.isNonToken() {
Akron3610f102021-08-08 14:13:25 +02001201 if DEBUG {
1202 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
1203 }
Akron439f4ec2021-08-09 15:45:38 +02001204 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001205 }
Akron3f8571a2021-08-05 11:18:10 +02001206 }
Akron068874c2021-08-04 15:19:56 +02001207
Akronc5d8d432021-08-10 16:48:44 +02001208 // Transition marks the end of a token - so flush the buffer
Akronf1a16502021-08-16 15:24:38 +02001209 if ta.isTokenEnd() {
Akron524c5432021-08-05 14:14:27 +02001210
Akronc5d8d432021-08-10 16:48:44 +02001211 if buffi > 0 {
Akronc5d8d432021-08-10 16:48:44 +02001212 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +02001213 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
Akronc5d8d432021-08-10 16:48:44 +02001214 }
Akron01912fc2021-08-12 11:41:58 +02001215 writer.WriteString(string(buffer[:buffo]))
Akronc5d8d432021-08-10 16:48:44 +02001216 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001217 }
Akron1594cb82021-08-11 11:14:56 +02001218 if DEBUG {
1219 fmt.Println("-> Newline")
1220 }
1221 writer.WriteRune('\n')
Akron439f4ec2021-08-09 15:45:38 +02001222 }
Akron3610f102021-08-08 14:13:25 +02001223
Akronc5d8d432021-08-10 16:48:44 +02001224 // Rewind the buffer if necessary
Akron439f4ec2021-08-09 15:45:38 +02001225 if rewindBuffer {
1226
1227 // TODO: Better as a ring buffer
Akron3610f102021-08-08 14:13:25 +02001228 for x, i := range buffer[buffo:buffi] {
1229 buffer[x] = i
1230 }
Akronb4bbb472021-08-09 11:49:38 +02001231
Akron3610f102021-08-08 14:13:25 +02001232 buffi -= buffo
1233 epsilonOffset -= buffo
1234 buffo = 0
1235 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001236 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
Akron3610f102021-08-08 14:13:25 +02001237 }
Akron84d68e62021-08-04 17:06:52 +02001238 }
1239
Akronb7e1f132021-08-10 11:52:31 +02001240 // Move to representative state
Akronf1a16502021-08-16 15:24:38 +02001241 if ta.isSeparate() {
1242 t = ta.getBase()
1243 ta = dat.array[t]
Akronb7e1f132021-08-10 11:52:31 +02001244
1245 if DEBUG {
1246 fmt.Println("Representative pointing to", t)
1247 }
1248 }
1249
Akronc5d8d432021-08-10 16:48:44 +02001250 newchar = true
1251
Akron068874c2021-08-04 15:19:56 +02001252 // TODO:
1253 // Prevent endless epsilon loops!
1254 }
1255
Akron439f4ec2021-08-09 15:45:38 +02001256 // Input reader is not yet finished
Akron3f8571a2021-08-05 11:18:10 +02001257 if !eof {
Akron068874c2021-08-04 15:19:56 +02001258 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001259 fmt.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +02001260 }
1261 return false
1262 }
1263
Akronb7e1f132021-08-10 11:52:31 +02001264 if DEBUG {
1265 fmt.Println("Entering final check")
1266 }
1267
Akronc5d8d432021-08-10 16:48:44 +02001268 // Automaton is in a final state, so flush the buffer and return
Akronf1a16502021-08-16 15:24:38 +02001269 x := dat.array[t].getBase() + uint32(dat.final)
1270
1271 if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
Akronb4bbb472021-08-09 11:49:38 +02001272
1273 if buffi > 0 {
Akronb4bbb472021-08-09 11:49:38 +02001274 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +02001275 fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
Akron3f8571a2021-08-05 11:18:10 +02001276 }
Akron01912fc2021-08-12 11:41:58 +02001277 writer.WriteString(string(buffer[:buffi]))
Akron6e70dc82021-08-11 11:33:18 +02001278
Akronf1a16502021-08-16 15:24:38 +02001279 if dat.array[t].isTokenEnd() {
Akrondf0a3ef2021-08-09 15:53:45 +02001280 writer.WriteRune('\n')
Akronc5d8d432021-08-10 16:48:44 +02001281 if DEBUG {
1282 fmt.Println("-> Newline")
1283 }
Akrondf0a3ef2021-08-09 15:53:45 +02001284 }
Akron84d68e62021-08-04 17:06:52 +02001285 }
1286
Akron6e70dc82021-08-11 11:33:18 +02001287 // Add an additional sentence ending, if the file is over but no explicit
1288 // sentence split was reached. This may be controversial and therefore
1289 // optional via parameter.
Akronf1a16502021-08-16 15:24:38 +02001290 if !dat.array[t0].isTokenEnd() {
Akron6e70dc82021-08-11 11:33:18 +02001291 writer.WriteRune('\n')
1292 if DEBUG {
1293 fmt.Println("-> Newline")
1294 }
1295 }
1296
Akrone61380b2021-08-16 10:10:46 +02001297 // TODO:
1298 // There may be a new line at the end, from an epsilon,
1299 // so we may need to go on!
Akron068874c2021-08-04 15:19:56 +02001300 return true
1301 }
1302
Akronc5d8d432021-08-10 16:48:44 +02001303 // Check epsilon transitions until a final state is reached
1304 t0 = t
Akronf1a16502021-08-16 15:24:38 +02001305 t = dat.array[t0].getBase() + uint32(dat.epsilon)
Akron01912fc2021-08-12 11:41:58 +02001306 a = dat.epsilon
1307 newchar = false
Akronf1a16502021-08-16 15:24:38 +02001308 if dat.array[t].getCheck() == t0 {
Akronc5d8d432021-08-10 16:48:44 +02001309 // Remember state for backtracking to last tokenend state
Akronc5d8d432021-08-10 16:48:44 +02001310 goto PARSECHAR
Akrone61380b2021-08-16 10:10:46 +02001311
Akronc5d8d432021-08-10 16:48:44 +02001312 } else if epsilonState != 0 {
Akronb7e1f132021-08-10 11:52:31 +02001313 t0 = epsilonState
1314 epsilonState = 0 // reset
1315 buffo = epsilonOffset
Akron068874c2021-08-04 15:19:56 +02001316 if DEBUG {
Akronc5d8d432021-08-10 16:48:44 +02001317 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
Akron068874c2021-08-04 15:19:56 +02001318 }
Akronc5d8d432021-08-10 16:48:44 +02001319 goto PARSECHAR
Akron068874c2021-08-04 15:19:56 +02001320 }
Akronc5d8d432021-08-10 16:48:44 +02001321 return false
Akron068874c2021-08-04 15:19:56 +02001322}