blob: 9f8079b661b75559756f8edacaa129c2d9d4e417 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
Akron03c92fe2021-08-09 14:07:57 +020013//
14// Serialization is little endian.
Akron83e75a22021-08-04 13:14:06 +020015
Akron740f3d72021-08-03 12:12:34 +020016// TODO:
Akrone61380b2021-08-16 10:10:46 +020017// - Turn sigma into an array instead of using a map
18// and improve the mapping beforehand so that ASCII
19// is mapped directly and only non-ASCII needs to be
20// looked up in a map or similar.
Akron740f3d72021-08-03 12:12:34 +020021// - replace maxSize with the check value
Akron3a063ef2021-08-05 19:36:35 +020022// - Add checksum to serialization.
Akron03c92fe2021-08-09 14:07:57 +020023// - Instead of memoizing the loadFactor, better remember
24// the number of set transitions
Akrone61380b2021-08-16 10:10:46 +020025// - Replace/Enhance table with a map
26// - Provide a bufio.Scanner compatible interface.
Akron8e1d69b2021-08-12 17:38:49 +020027// - Mark epsilon transitions in bytes
Akron740f3d72021-08-03 12:12:34 +020028
Akron8ef408b2021-08-02 22:11:04 +020029import (
30 "bufio"
31 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020032 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020033 "fmt"
34 "io"
35 "os"
Akronc9d84a62021-08-03 15:56:03 +020036 "sort"
Akron8ef408b2021-08-02 22:11:04 +020037 "strconv"
38 "strings"
39 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020040
Akron527c10c2021-08-13 01:45:18 +020041 "log"
Akron8ef408b2021-08-02 22:11:04 +020042)
43
// Parser modes used while reading the sections of a foma file.
const (
	PROPS = iota + 1
	SIGMA
	STATES
	NONE
)

// DEBUG enables verbose output on stdout while parsing and
// while building the double array.
const DEBUG = false

// Header values of the serialized double array.
const (
	MAGIC   = "DATOK"
	VERSION = uint16(1)
)

// Bit masks for the flags stored in the two most significant
// bits of the base and check values. RESTBIT masks the
// remaining 30 bits that carry the actual value.
const (
	FIRSTBIT  uint32 = 1 << 31
	SECONDBIT uint32 = 1 << 30
	RESTBIT   uint32 = ^(FIRSTBIT | SECONDBIT)
)
56
// bo is the byte order used for all serialized numbers.
// Serialization is always little endian.
var bo = binary.ByteOrder(binary.LittleEndian)
59
// mapping associates a state of the intermediate
// automaton with its position in the double array.
type mapping struct {
	// source is the state in the intermediate automaton.
	source int

	// target is the corresponding state in the double array.
	target uint32
}
64
// edge is a single transition of the intermediate automaton.
type edge struct {
	// Input symbol, output symbol and target state.
	inSym, outSym, end int

	// Flags set while parsing for transitions whose output is
	// epsilon (nontoken) or that map epsilon to the token
	// end symbol (tokenend).
	nontoken, tokenend bool
}
72
Akron03c92fe2021-08-09 14:07:57 +020073// Tokenizer is the intermediate representation
74// of the tokenizer.
Akron8ef408b2021-08-02 22:11:04 +020075type Tokenizer struct {
Akron740f3d72021-08-03 12:12:34 +020076 sigmaRev map[int]rune
77 arcCount int
Akron740f3d72021-08-03 12:12:34 +020078 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020079 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020080
81 // Special symbols in sigma
82 epsilon int
83 unknown int
84 identity int
85 final int
Akron03c92fe2021-08-09 14:07:57 +020086 tokenend int
Akron8ef408b2021-08-02 22:11:04 +020087}
88
// bc is a single cell of the double array, holding the
// base and the check value of a state. The two most
// significant bits of both values are used as flags.
type bc struct {
	base, check uint32
}
93
Akron03c92fe2021-08-09 14:07:57 +020094// DaTokenizer represents a tokenizer implemented as a
95// Double Array FSA.
Akronf2120ca2021-08-03 16:26:41 +020096type DaTokenizer struct {
Akronea46e8a2021-08-17 00:36:31 +020097 sigma map[rune]int
98 sigmaASCII [256]int
Akron03a3c612021-08-04 11:51:27 +020099 maxSize int
100 loadFactor float64
Akronf1a16502021-08-16 15:24:38 +0200101 array []bc
Akronc17f1ca2021-08-03 19:47:27 +0200102
103 // Special symbols in sigma
104 epsilon int
105 unknown int
106 identity int
107 final int
Akron03c92fe2021-08-09 14:07:57 +0200108 tokenend int
Akronf2120ca2021-08-03 16:26:41 +0200109}
110
Akron03c92fe2021-08-09 14:07:57 +0200111// ParseFoma reads the FST from a foma file
112// and creates an internal representation,
113// in case it follows the tokenizer's convention.
Akron64ffd9a2021-08-03 19:55:21 +0200114func LoadFomaFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200115 f, err := os.Open(file)
116 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200117 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200118 return nil
Akron8ef408b2021-08-02 22:11:04 +0200119 }
120 defer f.Close()
121
122 gz, err := gzip.NewReader(f)
123 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200124 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200125 return nil
Akron8ef408b2021-08-02 22:11:04 +0200126 }
127 defer gz.Close()
128
Akron3fdfec62021-08-04 11:40:10 +0200129 return ParseFoma(gz)
Akron8ef408b2021-08-02 22:11:04 +0200130}
131
Akron03c92fe2021-08-09 14:07:57 +0200132// ParseFoma reads the FST from a foma file reader
133// and creates an internal representation,
134// in case it follows the tokenizer's convention.
Akron3fdfec62021-08-04 11:40:10 +0200135func ParseFoma(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200136 r := bufio.NewReader(ior)
137
138 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200139 sigmaRev: make(map[int]rune),
Akronc17f1ca2021-08-03 19:47:27 +0200140 epsilon: -1,
141 unknown: -1,
142 identity: -1,
143 final: -1,
Akron03c92fe2021-08-09 14:07:57 +0200144 tokenend: -1,
Akron8ef408b2021-08-02 22:11:04 +0200145 }
146
Akron740f3d72021-08-03 12:12:34 +0200147 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200148
149 mode := 0
150 var elem []string
151 var elemint [5]int
152
Akron03c92fe2021-08-09 14:07:57 +0200153 // Iterate over all lines of the file.
154 // This is mainly based on foma2js,
155 // licensed under the Apache License, version 2,
156 // and written by Mans Hulden.
Akron8ef408b2021-08-02 22:11:04 +0200157 for {
158 line, err := r.ReadString('\n')
159 if err != nil {
160 if err == io.EOF {
161 break
162 }
Akron527c10c2021-08-13 01:45:18 +0200163 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200164 return nil
Akron8ef408b2021-08-02 22:11:04 +0200165 }
Akron8ef408b2021-08-02 22:11:04 +0200166
Akron439f4ec2021-08-09 15:45:38 +0200167 // Read parser mode for the following lines
168 if strings.HasPrefix(line, "##") {
169 if strings.HasPrefix(line, "##props##") {
170 mode = PROPS
171
172 } else if strings.HasPrefix(line, "##states##") {
173 mode = STATES
174
175 // Adds a final transition symbol to sigma
176 // written as '#' in Mizobuchi et al (2000)
177 tok.sigmaCount++
178 tok.final = tok.sigmaCount
179
180 } else if strings.HasPrefix(line, "##sigma##") {
181
182 mode = SIGMA
183
184 } else if strings.HasPrefix(line, "##end##") {
185
186 mode = NONE
187
188 } else if !strings.HasPrefix(line, "##foma-net") {
Akron527c10c2021-08-13 01:45:18 +0200189 log.Print("Unknown input line")
Akron439f4ec2021-08-09 15:45:38 +0200190 break
191 }
Akron8ef408b2021-08-02 22:11:04 +0200192 continue
193 }
194
Akron439f4ec2021-08-09 15:45:38 +0200195 // Based on the current parser mode, interpret the lines
Akron8ef408b2021-08-02 22:11:04 +0200196 switch mode {
197 case PROPS:
198 {
199 elem = strings.Split(line, " ")
200 /*
201 fmt.Println("arity: " + elem[0])
202 fmt.Println("arccount: " + elem[1])
203 fmt.Println("statecount: " + elem[2])
204 fmt.Println("linecount: " + elem[3])
205 fmt.Println("finalcount: " + elem[4])
206 fmt.Println("pathcount: " + elem[5])
207 fmt.Println("is_deterministic: " + elem[6])
208 fmt.Println("is_pruned: " + elem[7])
209 fmt.Println("is_minimized: " + elem[8])
210 fmt.Println("is_epsilon_free: " + elem[9])
211 fmt.Println("is_loop_free: " + elem[10])
212 fmt.Println("extras: " + elem[11])
213 fmt.Println("name: " + elem[12])
214 */
215 if elem[6] != "1" {
Akron527c10c2021-08-13 01:45:18 +0200216 log.Print("The FST needs to be deterministic")
Akron4db3ecf2021-08-11 18:49:03 +0200217 return nil
Akron8ef408b2021-08-02 22:11:04 +0200218 }
Akron439f4ec2021-08-09 15:45:38 +0200219
Akron8ef408b2021-08-02 22:11:04 +0200220 if elem[9] != "1" {
Akron527c10c2021-08-13 01:45:18 +0200221 log.Print("The FST needs to be epsilon free")
Akron4db3ecf2021-08-11 18:49:03 +0200222 return nil
Akron8ef408b2021-08-02 22:11:04 +0200223 }
224
225 elemint[0], err = strconv.Atoi(elem[1])
226 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200227 log.Print("Can't read arccount")
Akron4db3ecf2021-08-11 18:49:03 +0200228 return nil
Akron8ef408b2021-08-02 22:11:04 +0200229 }
Akron740f3d72021-08-03 12:12:34 +0200230 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200231
Akron8ef408b2021-08-02 22:11:04 +0200232 elemint[0], err = strconv.Atoi(elem[2])
233 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200234 log.Print("Can't read statecount")
Akron4db3ecf2021-08-11 18:49:03 +0200235 return nil
Akron8ef408b2021-08-02 22:11:04 +0200236 }
Akron439f4ec2021-08-09 15:45:38 +0200237
238 // States start at 1 in Mizobuchi et al (2000),
239 // as the state 0 is associated with a fail.
240 // Initialize states and transitions
Akron8ef408b2021-08-02 22:11:04 +0200241 tok.transitions = make([]map[int]*edge, elemint[0]+1)
242 continue
243 }
244 case STATES:
245 {
246 elem = strings.Split(line[0:len(line)-1], " ")
247 if elem[0] == "-1" {
Akron3610f102021-08-08 14:13:25 +0200248 if DEBUG {
249 fmt.Println("Skip", elem)
250 }
Akron8ef408b2021-08-02 22:11:04 +0200251 continue
252 }
253 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200254 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200255 fmt.Println("Unable to translate", elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200256 break
257 }
Akron8ef408b2021-08-02 22:11:04 +0200258
259 if len(elem) > 1 {
260 elemint[1], err = strconv.Atoi(elem[1])
261 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200262 fmt.Println("Unable to translate", elem[1])
Akron8ef408b2021-08-02 22:11:04 +0200263 break
264 }
265 if len(elem) > 2 {
266 elemint[2], err = strconv.Atoi(elem[2])
267 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200268 fmt.Println("Unable to translate", elem[2])
Akron8ef408b2021-08-02 22:11:04 +0200269 break
270 }
271 if len(elem) > 3 {
272 elemint[3], err = strconv.Atoi(elem[3])
273 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200274 fmt.Println("Unable to translate", elem[3])
Akron8ef408b2021-08-02 22:11:04 +0200275 break
276 }
277 if len(elem) > 4 {
278 elemint[4], err = strconv.Atoi(elem[4])
279 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200280 fmt.Println("Unable to translate", elem[4])
Akron8ef408b2021-08-02 22:11:04 +0200281 break
282 }
283 }
284 }
285 }
286 }
287
288 switch len(elem) {
289 case 5:
290 {
Akron740f3d72021-08-03 12:12:34 +0200291 state = elemint[0]
292 inSym = elemint[1]
293 outSym = elemint[2]
294 end = elemint[3]
295 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200296 }
297 case 4:
298 {
299 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200300 state = elemint[0]
301 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200302 } else {
Akron740f3d72021-08-03 12:12:34 +0200303 state = elemint[0]
304 inSym = elemint[1]
305 end = elemint[2]
306 final = elemint[3]
307 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200308 }
309 }
310 case 3:
311 {
Akron740f3d72021-08-03 12:12:34 +0200312 inSym = elemint[0]
313 outSym = elemint[1]
314 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200315 }
316 case 2:
317 {
Akron740f3d72021-08-03 12:12:34 +0200318 inSym = elemint[0]
319 end = elemint[1]
320 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200321 }
322 }
323
Akron83e75a22021-08-04 13:14:06 +0200324 nontoken := false
325 tokenend := false
326
Akron439f4ec2021-08-09 15:45:38 +0200327 // While the states in foma start with 0, the states in the
328 // Mizobuchi FSA start with one - so we increase every state by 1.
329 // We also increase sigma by 1, so there are no 0 transitions.
Akron524c5432021-08-05 14:14:27 +0200330 inSym++
331 outSym++
332
Akron439f4ec2021-08-09 15:45:38 +0200333 // Only a limited list of transitions are allowed
Akron740f3d72021-08-03 12:12:34 +0200334 if inSym != outSym {
Akron01912fc2021-08-12 11:41:58 +0200335 if outSym == tok.tokenend && inSym == tok.epsilon {
Akron83e75a22021-08-04 13:14:06 +0200336 tokenend = true
337 } else if outSym == tok.epsilon {
338 nontoken = true
339 } else {
Akron527c10c2021-08-13 01:45:18 +0200340 log.Println(
Akron740f3d72021-08-03 12:12:34 +0200341 "Unsupported transition: " +
342 strconv.Itoa(state) +
343 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200344 " (" +
Akron740f3d72021-08-03 12:12:34 +0200345 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200346 ":" +
Akron740f3d72021-08-03 12:12:34 +0200347 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200348 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200349 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200350 ":" +
Akron740f3d72021-08-03 12:12:34 +0200351 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200352 ")")
Akron4db3ecf2021-08-11 18:49:03 +0200353 return nil
Akron75ebe7f2021-08-03 10:34:10 +0200354 }
Akron83e75a22021-08-04 13:14:06 +0200355
Akron83e75a22021-08-04 13:14:06 +0200356 } else if inSym == tok.epsilon {
Akron527c10c2021-08-13 01:45:18 +0200357 log.Println("General epsilon transitions are not supported")
Akron4db3ecf2021-08-11 18:49:03 +0200358 return nil
Akron8ef408b2021-08-02 22:11:04 +0200359 }
360
Akron03c92fe2021-08-09 14:07:57 +0200361 // Create an edge based on the collected information
Akron8ef408b2021-08-02 22:11:04 +0200362 targetObj := &edge{
Akron83e75a22021-08-04 13:14:06 +0200363 inSym: inSym,
364 outSym: outSym,
365 end: end + 1,
366 tokenend: tokenend,
367 nontoken: nontoken,
Akron8ef408b2021-08-02 22:11:04 +0200368 }
369
Akron740f3d72021-08-03 12:12:34 +0200370 // Initialize outgoing states
371 if tok.transitions[state+1] == nil {
372 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200373 }
374
Akron740f3d72021-08-03 12:12:34 +0200375 // Ignore transitions with invalid symbols
376 if inSym >= 0 {
377 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200378 }
Akron8ef408b2021-08-02 22:11:04 +0200379
Akron740f3d72021-08-03 12:12:34 +0200380 // Add final transition
381 if final == 1 {
Akron03c92fe2021-08-09 14:07:57 +0200382 // TODO:
383 // Maybe this is less relevant for tokenizers
Akronc17f1ca2021-08-03 19:47:27 +0200384 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200385 }
386
Akronb4bbb472021-08-09 11:49:38 +0200387 if DEBUG {
Akron740f3d72021-08-03 12:12:34 +0200388 fmt.Println("Add",
389 state+1, "->", end+1,
390 "(",
391 inSym,
392 ":",
393 outSym,
394 ") (",
395 string(tok.sigmaRev[inSym]),
396 ":",
397 string(tok.sigmaRev[outSym]),
Akron524c5432021-08-05 14:14:27 +0200398 ")",
399 ";",
400 "TE:", tokenend,
Akron3610f102021-08-08 14:13:25 +0200401 "NT:", nontoken,
402 "FIN:", final)
Akron740f3d72021-08-03 12:12:34 +0200403 }
Akron75ebe7f2021-08-03 10:34:10 +0200404
Akron8ef408b2021-08-02 22:11:04 +0200405 continue
406 }
407 case SIGMA:
408 {
409 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
410
411 // Turn string into sigma id
412 number, err := strconv.Atoi(elem[0])
413
Akron524c5432021-08-05 14:14:27 +0200414 // ID needs to be > 1
415 number++
416
Akron8ef408b2021-08-02 22:11:04 +0200417 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200418 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200419 return nil
Akron8ef408b2021-08-02 22:11:04 +0200420 }
421
Akron740f3d72021-08-03 12:12:34 +0200422 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200423
424 var symbol rune
425
426 // Read rune
427 if utf8.RuneCountInString(elem[1]) == 1 {
428 symbol = []rune(elem[1])[0]
429
Akron8ef408b2021-08-02 22:11:04 +0200430 } else if utf8.RuneCountInString(elem[1]) > 1 {
Akron439f4ec2021-08-09 15:45:38 +0200431
432 // Probably a MCS
Akron8ef408b2021-08-02 22:11:04 +0200433 switch elem[1] {
434 case "@_EPSILON_SYMBOL_@":
435 {
Akronc17f1ca2021-08-03 19:47:27 +0200436 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200437 }
438 case "@_UNKNOWN_SYMBOL_@":
439 {
Akronc17f1ca2021-08-03 19:47:27 +0200440 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200441 }
442
443 case "@_IDENTITY_SYMBOL_@":
444 {
Akronc17f1ca2021-08-03 19:47:27 +0200445 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200446 }
Akron03c92fe2021-08-09 14:07:57 +0200447
448 case "@_TOKEN_SYMBOL_@":
449 {
450 tok.tokenend = number
Akron03c92fe2021-08-09 14:07:57 +0200451 }
Akron8ef408b2021-08-02 22:11:04 +0200452 default:
Akron740f3d72021-08-03 12:12:34 +0200453 {
Akron527c10c2021-08-13 01:45:18 +0200454 log.Println("MCS not supported: " + line)
Akron4db3ecf2021-08-11 18:49:03 +0200455 return nil
Akron740f3d72021-08-03 12:12:34 +0200456 }
Akron8ef408b2021-08-02 22:11:04 +0200457 }
Akron439f4ec2021-08-09 15:45:38 +0200458 continue
Akron8ef408b2021-08-02 22:11:04 +0200459
Akron740f3d72021-08-03 12:12:34 +0200460 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200461 line, err = r.ReadString('\n')
462 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200463 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200464 return nil
Akron8ef408b2021-08-02 22:11:04 +0200465 }
466 if len(line) != 1 {
Akron527c10c2021-08-13 01:45:18 +0200467 log.Println("MCS not supported:" + line)
Akron4db3ecf2021-08-11 18:49:03 +0200468 return nil
Akron8ef408b2021-08-02 22:11:04 +0200469 }
Akron03c92fe2021-08-09 14:07:57 +0200470 symbol = rune('\n')
Akron8ef408b2021-08-02 22:11:04 +0200471 }
472
Akron740f3d72021-08-03 12:12:34 +0200473 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200474 }
475 }
476 }
477
478 return tok
479}
480
Akron64ffd9a2021-08-03 19:55:21 +0200481// Set alphabet A to the list of all symbols
482// outgoing from s
Akron439f4ec2021-08-09 15:45:38 +0200483func (tok *Tokenizer) getSet(s int, A *[]int) {
Akron64ffd9a2021-08-03 19:55:21 +0200484 for a := range tok.transitions[s] {
485 *A = append(*A, a)
486 }
487
488 // Not required, but simplifies bug hunting
Akron439f4ec2021-08-09 15:45:38 +0200489 // sort.Ints(*A)
Akron64ffd9a2021-08-03 19:55:21 +0200490}
491
Akron439f4ec2021-08-09 15:45:38 +0200492// ToDoubleArray turns the intermediate tokenizer representation
493// into a double array representation.
494//
495// This is based on Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200496func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
497
498 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +0200499 sigma: make(map[rune]int),
500 loadFactor: -1,
501 final: tok.final,
502 unknown: tok.unknown,
503 identity: tok.identity,
504 epsilon: tok.epsilon,
Akron03c92fe2021-08-09 14:07:57 +0200505 tokenend: tok.tokenend,
Akronf2120ca2021-08-03 16:26:41 +0200506 }
507
Akronf1a16502021-08-16 15:24:38 +0200508 dat.resize(dat.final)
509
Akronf2120ca2021-08-03 16:26:41 +0200510 for num, sym := range tok.sigmaRev {
Akronea46e8a2021-08-17 00:36:31 +0200511 if int(sym) < 256 {
512 dat.sigmaASCII[int(sym)] = num
513 }
Akronf2120ca2021-08-03 16:26:41 +0200514 dat.sigma[sym] = num
515 }
Akron8ef408b2021-08-02 22:11:04 +0200516
517 mark := 0
518 size := 0
Akron6f1c16c2021-08-17 10:45:42 +0200519 var base uint32
Akronde18e902021-08-27 09:34:12 +0200520 var atrans *edge
521 var s, s1 int
522 var t, t1 uint32
Akron8ef408b2021-08-02 22:11:04 +0200523
Akron439f4ec2021-08-09 15:45:38 +0200524 // Create a mapping from s (in Ms aka Intermediate FSA)
525 // to t (in Mt aka Double Array FSA)
Akron740f3d72021-08-03 12:12:34 +0200526 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200527
Akron439f4ec2021-08-09 15:45:38 +0200528 // Initialize with the start state
Akron8ef408b2021-08-02 22:11:04 +0200529 table[size] = &mapping{source: 1, target: 1}
530 size++
531
Akron740f3d72021-08-03 12:12:34 +0200532 // Allocate space for the outgoing symbol range
533 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200534
Akronde18e902021-08-27 09:34:12 +0200535 // TODO:
536 // Table lookup for the moment
537 // only gives a minor performance benefit.
538 // should be rewritten and should preplace the
539 // table all together.
540 // tableLookup := make(map[int]uint32)
541 // tableLookup[1] = 1
542
Akron8ef408b2021-08-02 22:11:04 +0200543 for mark < size {
Akronde18e902021-08-27 09:34:12 +0200544 s = table[mark].source // This is a state in Ms
545 t = table[mark].target // This is a state in Mt
Akron8ef408b2021-08-02 22:11:04 +0200546 mark++
Akron740f3d72021-08-03 12:12:34 +0200547
548 // Following the paper, here the state t can be remembered
549 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200550 A = A[:0]
Akron439f4ec2021-08-09 15:45:38 +0200551 tok.getSet(s, &A)
Akron8ef408b2021-08-02 22:11:04 +0200552
Akron740f3d72021-08-03 12:12:34 +0200553 // Set base to the first free slot in the double array
Akron6f1c16c2021-08-17 10:45:42 +0200554 base = dat.xCheck(A)
555 dat.array[t].setBase(base)
Akron8ef408b2021-08-02 22:11:04 +0200556
Akron773b1ef2021-08-03 17:37:20 +0200557 // TODO:
Akron068874c2021-08-04 15:19:56 +0200558 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200559 // outdegree of .end
560
Akron740f3d72021-08-03 12:12:34 +0200561 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200562 for _, a := range A {
563
Akronc17f1ca2021-08-03 19:47:27 +0200564 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200565
Akronde18e902021-08-27 09:34:12 +0200566 atrans = tok.transitions[s][a]
567
Akron740f3d72021-08-03 12:12:34 +0200568 // Aka g(s, a)
Akronde18e902021-08-27 09:34:12 +0200569 s1 = atrans.end
Akron8ef408b2021-08-02 22:11:04 +0200570
Akron740f3d72021-08-03 12:12:34 +0200571 // Store the transition
Akronde18e902021-08-27 09:34:12 +0200572 t1 = base + uint32(a)
Akronf1a16502021-08-16 15:24:38 +0200573 dat.array[t1].setCheck(t)
574
575 // Set maxSize
576 if dat.maxSize < int(t1) {
577 dat.maxSize = int(t1)
578 }
Akron8ef408b2021-08-02 22:11:04 +0200579
Akron439f4ec2021-08-09 15:45:38 +0200580 if DEBUG {
Akron524c5432021-08-05 14:14:27 +0200581 fmt.Println("Translate transition",
582 s, "->", s1, "(", a, ")", "to", t, "->", t1)
583 }
584
Akron83e75a22021-08-04 13:14:06 +0200585 // Mark the state as being the target of a nontoken transition
Akronde18e902021-08-27 09:34:12 +0200586 if atrans.nontoken {
Akronf1a16502021-08-16 15:24:38 +0200587 dat.array[t1].setNonToken(true)
Akron524c5432021-08-05 14:14:27 +0200588 if DEBUG {
589 fmt.Println("Set", t1, "to nontoken")
590 }
Akron83e75a22021-08-04 13:14:06 +0200591 }
592
Akron84d68e62021-08-04 17:06:52 +0200593 // Mark the state as being the target of a tokenend transition
Akronde18e902021-08-27 09:34:12 +0200594 if atrans.tokenend {
Akronf1a16502021-08-16 15:24:38 +0200595 dat.array[t1].setTokenEnd(true)
Akron524c5432021-08-05 14:14:27 +0200596 if DEBUG {
597 fmt.Println("Set", t1, "to tokenend")
598 }
Akron84d68e62021-08-04 17:06:52 +0200599 }
600
Akron740f3d72021-08-03 12:12:34 +0200601 // Check for representative states
Akron439f4ec2021-08-09 15:45:38 +0200602 r := stateAlreadyInTable(s1, table, size)
Akronde18e902021-08-27 09:34:12 +0200603 // r := tableLookup[s1]
Akron740f3d72021-08-03 12:12:34 +0200604
Akron439f4ec2021-08-09 15:45:38 +0200605 // No representative found
Akron8ef408b2021-08-02 22:11:04 +0200606 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200607 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200608 table[size] = &mapping{source: s1, target: t1}
Akronde18e902021-08-27 09:34:12 +0200609 // tableLookup[s1] = t1
Akron8ef408b2021-08-02 22:11:04 +0200610 size++
611 } else {
Akron740f3d72021-08-03 12:12:34 +0200612 // Overwrite with the representative state
Akronf1a16502021-08-16 15:24:38 +0200613 dat.array[t1].setBase(r)
614 dat.array[t1].setSeparate(true)
Akron8ef408b2021-08-02 22:11:04 +0200615 }
616 } else {
Akron740f3d72021-08-03 12:12:34 +0200617 // Store a final transition
Akron6f1c16c2021-08-17 10:45:42 +0200618 dat.array[base+uint32(dat.final)].setCheck(t)
Akronf1a16502021-08-16 15:24:38 +0200619
Akronde18e902021-08-27 09:34:12 +0200620 if dat.maxSize < int(base)+dat.final {
621 dat.maxSize = int(base) + dat.final
Akronf1a16502021-08-16 15:24:38 +0200622 }
Akron8ef408b2021-08-02 22:11:04 +0200623 }
624 }
625 }
626
627 // Following Mizobuchi et al (2000) the size of the
628 // FSA should be stored in check(1).
Akronf1a16502021-08-16 15:24:38 +0200629 // We make the size a bit smaller so we never have to check for boundaries.
Akron3fdfec62021-08-04 11:40:10 +0200630 dat.setSize(dat.maxSize + 1)
Akronf2120ca2021-08-03 16:26:41 +0200631 dat.array = dat.array[:dat.maxSize+1]
632 return dat
Akron8ef408b2021-08-02 22:11:04 +0200633}
634
Akron8ef408b2021-08-02 22:11:04 +0200635// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200636// exists and return this as a representative.
637// Currently iterates through the whole table
638// in a bruteforce manner.
Akron439f4ec2021-08-09 15:45:38 +0200639func stateAlreadyInTable(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200640 for x := 0; x < size; x++ {
641 if table[x].source == s {
642 return table[x].target
643 }
644 }
645 return 0
646}
647
Akron64ffd9a2021-08-03 19:55:21 +0200648// Resize double array when necessary
649func (dat *DaTokenizer) resize(l int) {
650 // TODO:
651 // This is a bit too aggressive atm and should be calmed down.
652 if len(dat.array) <= l {
Akronf1a16502021-08-16 15:24:38 +0200653 dat.array = append(dat.array, make([]bc, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200654 }
Akron64ffd9a2021-08-03 19:55:21 +0200655}
Akronc9d84a62021-08-03 15:56:03 +0200656
Akron64ffd9a2021-08-03 19:55:21 +0200657// Set base value in double array
Akronf1a16502021-08-16 15:24:38 +0200658func (bc *bc) setBase(v uint32) {
659 bc.base = v
Akron439f4ec2021-08-09 15:45:38 +0200660}
661
662// Get base value in double array
Akronf1a16502021-08-16 15:24:38 +0200663func (bc *bc) getBase() uint32 {
664 return bc.base & RESTBIT
Akron439f4ec2021-08-09 15:45:38 +0200665}
666
667// Set check value in double array
Akronf1a16502021-08-16 15:24:38 +0200668func (bc *bc) setCheck(v uint32) {
669 bc.check = v
Akron439f4ec2021-08-09 15:45:38 +0200670}
671
672// Get check value in double array
Akronf1a16502021-08-16 15:24:38 +0200673func (bc *bc) getCheck() uint32 {
674 return bc.check & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200675}
676
Akron3fdfec62021-08-04 11:40:10 +0200677// Returns true if a state is separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200678func (bc *bc) isSeparate() bool {
679 return bc.base&FIRSTBIT != 0
Akron3fdfec62021-08-04 11:40:10 +0200680}
681
682// Mark a state as separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200683func (bc *bc) setSeparate(sep bool) {
Akron3fdfec62021-08-04 11:40:10 +0200684 if sep {
Akronf1a16502021-08-16 15:24:38 +0200685 bc.base |= FIRSTBIT
Akron3fdfec62021-08-04 11:40:10 +0200686 } else {
Akronf1a16502021-08-16 15:24:38 +0200687 bc.base &= (RESTBIT | SECONDBIT)
Akron3fdfec62021-08-04 11:40:10 +0200688 }
689}
690
Akron83e75a22021-08-04 13:14:06 +0200691// Returns true if a state is the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200692func (bc *bc) isNonToken() bool {
693 return bc.check&FIRSTBIT != 0
Akron83e75a22021-08-04 13:14:06 +0200694}
695
696// Mark a state as being the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200697func (bc *bc) setNonToken(sep bool) {
Akron83e75a22021-08-04 13:14:06 +0200698 if sep {
Akronf1a16502021-08-16 15:24:38 +0200699 bc.check |= FIRSTBIT
Akron83e75a22021-08-04 13:14:06 +0200700 } else {
Akronf1a16502021-08-16 15:24:38 +0200701 bc.check &= (RESTBIT | SECONDBIT)
Akron83e75a22021-08-04 13:14:06 +0200702 }
703}
704
Akron84d68e62021-08-04 17:06:52 +0200705// Returns true if a state is the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200706func (bc *bc) isTokenEnd() bool {
707 return bc.check&SECONDBIT != 0
Akron84d68e62021-08-04 17:06:52 +0200708}
709
710// Mark a state as being the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200711func (bc *bc) setTokenEnd(sep bool) {
Akron84d68e62021-08-04 17:06:52 +0200712 if sep {
Akronf1a16502021-08-16 15:24:38 +0200713 bc.check |= SECONDBIT
Akron84d68e62021-08-04 17:06:52 +0200714 } else {
Akronf1a16502021-08-16 15:24:38 +0200715 bc.check &= (RESTBIT | FIRSTBIT)
Akron84d68e62021-08-04 17:06:52 +0200716 }
717}
718
Akron64ffd9a2021-08-03 19:55:21 +0200719// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200720func (dat *DaTokenizer) setSize(v int) {
Akronf1a16502021-08-16 15:24:38 +0200721 dat.array[1].setCheck(uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200722}
723
724// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200725func (dat *DaTokenizer) GetSize() int {
Akronf1a16502021-08-16 15:24:38 +0200726 return int(dat.array[1].getCheck())
Akron8ef408b2021-08-02 22:11:04 +0200727}
728
729// Based on Mizobuchi et al (2000), p. 124
730// This iterates for every state through the complete double array
731// structure until it finds a gap that fits all outgoing transitions
732// of the state. This is extremely slow, but is only necessary in the
733// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200734func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200735
736 // Start at the first entry of the double array list
Akron6f1c16c2021-08-17 10:45:42 +0200737 base := uint32(1)
738
Akron8ef408b2021-08-02 22:11:04 +0200739OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200740 // Resize the array if necessary
Akronf1a16502021-08-16 15:24:38 +0200741 dat.resize(int(base) + dat.final)
Akron8ef408b2021-08-02 22:11:04 +0200742 for _, a := range symbols {
Akronf1a16502021-08-16 15:24:38 +0200743 if dat.array[int(base)+a].getCheck() != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200744 base++
745 goto OVERLAP
746 }
747 }
Akron8ef408b2021-08-02 22:11:04 +0200748 return base
749}
750
Akron3610f102021-08-08 14:13:25 +0200751// List all outgoing transitions for a state
752// for testing purposes
753func (dat *DaTokenizer) outgoing(t uint32) []int {
754
755 valid := make([]int, 0, len(dat.sigma))
756
757 for _, a := range dat.sigma {
Akronf1a16502021-08-16 15:24:38 +0200758 t1 := dat.array[t].getBase() + uint32(a)
759 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200760 valid = append(valid, a)
761 }
762 }
763
764 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
Akronf1a16502021-08-16 15:24:38 +0200765 t1 := dat.array[t].getBase() + uint32(a)
766 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200767 valid = append(valid, -1*a)
768 }
769 }
770
771 sort.Ints(valid)
772
773 return valid
774}
775
Akron03a3c612021-08-04 11:51:27 +0200776// LoadFactor as defined in Kanda et al (2018),
777// i.e. the proportion of non-empty elements to all elements.
778func (dat *DaTokenizer) LoadFactor() float64 {
Akrond66a9262021-08-03 17:09:09 +0200779
Akron03a3c612021-08-04 11:51:27 +0200780 // Cache the loadfactor
Akron3f8571a2021-08-05 11:18:10 +0200781 if dat.loadFactor > 0 {
Akron03a3c612021-08-04 11:51:27 +0200782 return dat.loadFactor
Akron773b1ef2021-08-03 17:37:20 +0200783 }
Akrond66a9262021-08-03 17:09:09 +0200784 nonEmpty := 0
785 all := len(dat.array) / 2
Akronf1a16502021-08-16 15:24:38 +0200786 for x := 1; x < len(dat.array); x++ {
787 if dat.array[x].getBase() != 0 {
Akrond66a9262021-08-03 17:09:09 +0200788 nonEmpty++
789 }
790 }
Akron03a3c612021-08-04 11:51:27 +0200791 dat.loadFactor = float64(nonEmpty) / float64(all) * 100
792 return dat.loadFactor
Akrond66a9262021-08-03 17:09:09 +0200793}
794
Akron439f4ec2021-08-09 15:45:38 +0200795// Save stores the double array data in a file
Akron3a063ef2021-08-05 19:36:35 +0200796func (dat *DaTokenizer) Save(file string) (n int64, err error) {
797 f, err := os.Create(file)
798 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200799 log.Println(err)
Akron439f4ec2021-08-09 15:45:38 +0200800 return 0, err
Akron3a063ef2021-08-05 19:36:35 +0200801 }
802 defer f.Close()
803 gz := gzip.NewWriter(f)
804 defer gz.Close()
805 n, err = dat.WriteTo(gz)
806 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200807 log.Println(err)
Akron3a063ef2021-08-05 19:36:35 +0200808 return n, err
809 }
810 gz.Flush()
811 return n, nil
812}
813
814// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200815func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
816
Akron3a063ef2021-08-05 19:36:35 +0200817 wb := bufio.NewWriter(w)
818 defer wb.Flush()
819
Akron6247a5d2021-08-03 19:18:28 +0200820 // Store magical header
Akron3a063ef2021-08-05 19:36:35 +0200821 all, err := wb.Write([]byte(MAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200822 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200823 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200824 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200825 }
826
827 // Get sigma as a list
828 sigmalist := make([]rune, len(dat.sigma)+16)
829 max := 0
830 for sym, num := range dat.sigma {
831 sigmalist[num] = sym
832 if num > max {
833 max = num
834 }
835 }
836
837 sigmalist = sigmalist[:max+1]
838
Akron3f8571a2021-08-05 11:18:10 +0200839 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200840 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200841 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
842 bo.PutUint16(buf[4:6], uint16(dat.unknown))
843 bo.PutUint16(buf[6:8], uint16(dat.identity))
844 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200845 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akronf1a16502021-08-16 15:24:38 +0200846 bo.PutUint32(buf[12:16], uint32(len(dat.array)*2)) // Legacy support
Akron3a063ef2021-08-05 19:36:35 +0200847 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200848 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200849 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200850 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200851 }
852
853 all += more
854
Akron6247a5d2021-08-03 19:18:28 +0200855 // Write sigma
856 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200857
Akron3a063ef2021-08-05 19:36:35 +0200858 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200859 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200860 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200861 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200862 }
863 all += more
864 }
Akron439f4ec2021-08-09 15:45:38 +0200865
Akron6247a5d2021-08-03 19:18:28 +0200866 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200867 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200868 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200869 }
Akron6247a5d2021-08-03 19:18:28 +0200870
871 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200872 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200873 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200874 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200875 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200876 }
877 all += more
878
Akronf1a16502021-08-16 15:24:38 +0200879 // for x := 0; x < len(dat.array); x++ {
880 for _, bc := range dat.array {
881 bo.PutUint32(buf[0:4], bc.base)
882 more, err = wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200883 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200884 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200885 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200886 }
Akron439f4ec2021-08-09 15:45:38 +0200887 all += more
Akron3a063ef2021-08-05 19:36:35 +0200888 if more != 4 {
Akronf1a16502021-08-16 15:24:38 +0200889 log.Println("Can not write base uint32")
890 return int64(all), err
891 }
892 bo.PutUint32(buf[0:4], bc.check)
893 more, err = wb.Write(buf[0:4])
894 if err != nil {
895 log.Println(err)
896 return int64(all), err
897 }
898 all += more
899 if more != 4 {
900 log.Println("Can not write check uint32")
Akron3a063ef2021-08-05 19:36:35 +0200901 return int64(all), err
902 }
Akron6247a5d2021-08-03 19:18:28 +0200903 }
904
905 return int64(all), err
906}
907
Akron439f4ec2021-08-09 15:45:38 +0200908// LoadDatokFile reads a double array represented tokenizer
909// from a file.
Akron3f8571a2021-08-05 11:18:10 +0200910func LoadDatokFile(file string) *DaTokenizer {
911 f, err := os.Open(file)
912 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200913 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200914 return nil
Akron3f8571a2021-08-05 11:18:10 +0200915 }
916 defer f.Close()
917
918 gz, err := gzip.NewReader(f)
919 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200920 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200921 return nil
Akron3f8571a2021-08-05 11:18:10 +0200922 }
923 defer gz.Close()
924
Akron3a063ef2021-08-05 19:36:35 +0200925 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +0200926 return ParseDatok(gz)
927}
928
Akron439f4ec2021-08-09 15:45:38 +0200929// LoadDatokFile reads a double array represented tokenizer
930// from an io.Reader
Akron3f8571a2021-08-05 11:18:10 +0200931func ParseDatok(ior io.Reader) *DaTokenizer {
932
Akron439f4ec2021-08-09 15:45:38 +0200933 // Initialize tokenizer with default values
Akron3f8571a2021-08-05 11:18:10 +0200934 dat := &DaTokenizer{
935 sigma: make(map[rune]int),
936 epsilon: 0,
937 unknown: 0,
938 identity: 0,
939 final: 0,
940 loadFactor: 0,
941 }
942
943 r := bufio.NewReader(ior)
944
Akron3f8571a2021-08-05 11:18:10 +0200945 buf := make([]byte, 1024)
946 buf = buf[0:len(MAGIC)]
947
Akron439f4ec2021-08-09 15:45:38 +0200948 _, err := r.Read(buf)
Akron3f8571a2021-08-05 11:18:10 +0200949
950 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200951 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200952 return nil
953 }
954
Akron3f8571a2021-08-05 11:18:10 +0200955 if string(MAGIC) != string(buf) {
Akron527c10c2021-08-13 01:45:18 +0200956 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200957 return nil
958 }
959
Akron439f4ec2021-08-09 15:45:38 +0200960 more, err := io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +0200961 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200962 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200963 return nil
964 }
965
Akron439f4ec2021-08-09 15:45:38 +0200966 if more != 16 {
Akron527c10c2021-08-13 01:45:18 +0200967 log.Println("Read bytes do not fit")
Akron439f4ec2021-08-09 15:45:38 +0200968 return nil
969 }
Akron3f8571a2021-08-05 11:18:10 +0200970
Akron3a063ef2021-08-05 19:36:35 +0200971 version := bo.Uint16(buf[0:2])
972
973 if version != VERSION {
Akron527c10c2021-08-13 01:45:18 +0200974 log.Println("Version not compatible")
Akron3a063ef2021-08-05 19:36:35 +0200975 return nil
976 }
977
Akron3f8571a2021-08-05 11:18:10 +0200978 dat.epsilon = int(bo.Uint16(buf[2:4]))
979 dat.unknown = int(bo.Uint16(buf[4:6]))
980 dat.identity = int(bo.Uint16(buf[6:8]))
981 dat.final = int(bo.Uint16(buf[8:10]))
982
983 sigmaCount := int(bo.Uint16(buf[10:12]))
Akronf1a16502021-08-16 15:24:38 +0200984 arraySize := int(bo.Uint32(buf[12:16])) / 2 // Legacy support
Akron3f8571a2021-08-05 11:18:10 +0200985
Akron3a063ef2021-08-05 19:36:35 +0200986 // Shouldn't be relevant though
987 dat.maxSize = arraySize - 1
988
Akron3f8571a2021-08-05 11:18:10 +0200989 for x := 0; x < sigmaCount; x++ {
Akron439f4ec2021-08-09 15:45:38 +0200990 sym, _, err := r.ReadRune()
Akron3f8571a2021-08-05 11:18:10 +0200991 if err == nil && sym != 0 {
Akronea46e8a2021-08-17 00:36:31 +0200992 if int(sym) < 256 {
993 dat.sigmaASCII[int(sym)] = x
994 }
Akron3f8571a2021-08-05 11:18:10 +0200995 dat.sigma[sym] = x
996 }
Akron3f8571a2021-08-05 11:18:10 +0200997 }
998
Akron439f4ec2021-08-09 15:45:38 +0200999 _, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +02001000
1001 if err != nil {
Akron527c10c2021-08-13 01:45:18 +02001002 log.Print(err)
Akron3f8571a2021-08-05 11:18:10 +02001003 return nil
1004 }
1005
Akron3f8571a2021-08-05 11:18:10 +02001006 if string("T") != string(buf[0:1]) {
Akron527c10c2021-08-13 01:45:18 +02001007 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +02001008 return nil
1009 }
1010
1011 // Read based on length
Akronf1a16502021-08-16 15:24:38 +02001012 dat.array = make([]bc, arraySize)
Akron3f8571a2021-08-05 11:18:10 +02001013
Akronbb4aac52021-08-13 00:52:27 +02001014 dataArray, err := io.ReadAll(r)
Akron439f4ec2021-08-09 15:45:38 +02001015
Akronbb4aac52021-08-13 00:52:27 +02001016 if err == io.EOF {
Akron527c10c2021-08-13 01:45:18 +02001017 log.Println(err)
Akronbb4aac52021-08-13 00:52:27 +02001018 return nil
1019 }
1020
Akronf1a16502021-08-16 15:24:38 +02001021 if len(dataArray) < arraySize*8 {
Akron527c10c2021-08-13 01:45:18 +02001022 log.Println("Not enough bytes read")
Akronbb4aac52021-08-13 00:52:27 +02001023 return nil
1024 }
1025
1026 for x := 0; x < arraySize; x++ {
Akronf1a16502021-08-16 15:24:38 +02001027 dat.array[x].base = bo.Uint32(dataArray[x*8 : (x*8)+4])
1028 dat.array[x].check = bo.Uint32(dataArray[(x*8)+4 : (x*8)+8])
Akron3f8571a2021-08-05 11:18:10 +02001029 }
1030
1031 return dat
1032}
1033
// showBuffer renders the buffer as a string, for testing purposes.
// The rune at the offset buffo is wrapped in square brackets and
// a caret is inserted in front of the rune at position buffi.
func showBuffer(buffer []rune, buffo int, buffi int) string {
	rendered := make([]rune, 0, 1024)
	for pos, r := range buffer {
		if pos == buffi {
			rendered = append(rendered, '^')
		}
		if pos != buffo {
			rendered = append(rendered, r)
			continue
		}
		rendered = append(rendered, '[', r, ']')
	}
	return string(rendered)
}
1050
Akron84d68e62021-08-04 17:06:52 +02001051// Transduce an input string against the double array
Akron3610f102021-08-08 14:13:25 +02001052// FSA. The rules are always greedy. If the automaton fails,
1053// it takes the last possible token ending branch.
Akron068874c2021-08-04 15:19:56 +02001054//
Akron4db3ecf2021-08-11 18:49:03 +02001055// Based on Mizobuchi et al (2000), p. 129,
1056// with additional support for IDENTITY, UNKNOWN
1057// and EPSILON transitions and NONTOKEN and TOKENEND handling.
Akron3f8571a2021-08-05 11:18:10 +02001058func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron068874c2021-08-04 15:19:56 +02001059 var a int
Akronb4bbb472021-08-09 11:49:38 +02001060 var t0 uint32
Akronb7e1f132021-08-10 11:52:31 +02001061 t := uint32(1) // Initial state
1062 var ok, rewindBuffer bool
Akron068874c2021-08-04 15:19:56 +02001063
Akron3610f102021-08-08 14:13:25 +02001064 // Remember the last position of a possible tokenend,
1065 // in case the automaton fails.
1066 epsilonState := uint32(0)
1067 epsilonOffset := 0
1068
1069 // Implement a low level buffer for full control,
1070 // however - it is probably better to introduce
1071 // this on a higher level with a io.Reader interface
1072 // The buffer stores a single word and may have white
1073 // space at the end (but not at the beginning).
1074 //
1075 // This is the only backtracking requirement because of
1076 // epsilon transitions, to support tokenizations like:
1077 // "this is an example|.| And it works." vs
1078 // "this is an example.com| application."
Akronb7e1f132021-08-10 11:52:31 +02001079 //
1080 // TODO:
1081 // Store a translation buffer as well, so characters don't
1082 // have to be translated multiple times!
Akron3610f102021-08-08 14:13:25 +02001083 buffer := make([]rune, 1024)
1084 buffo := 0 // Buffer offset
1085 buffi := 0 // Buffer length
1086
Akron3f8571a2021-08-05 11:18:10 +02001087 reader := bufio.NewReader(r)
1088 writer := bufio.NewWriter(w)
1089 defer writer.Flush()
Akron068874c2021-08-04 15:19:56 +02001090
Akron3f8571a2021-08-05 11:18:10 +02001091 var char rune
1092 var err error
1093 eof := false
Akronb7e1f132021-08-10 11:52:31 +02001094 newchar := true
Akron3f8571a2021-08-05 11:18:10 +02001095
Akronc5d8d432021-08-10 16:48:44 +02001096PARSECHAR:
Akron3f8571a2021-08-05 11:18:10 +02001097 for {
1098
Akronb7e1f132021-08-10 11:52:31 +02001099 if newchar {
1100 // Get from reader if buffer is empty
1101 if buffo >= buffi {
Akron1594cb82021-08-11 11:14:56 +02001102 if eof {
1103 break
1104 }
Akronb7e1f132021-08-10 11:52:31 +02001105 char, _, err = reader.ReadRune()
Akron439f4ec2021-08-09 15:45:38 +02001106
Akronb7e1f132021-08-10 11:52:31 +02001107 // No more runes to read
1108 if err != nil {
1109 eof = true
1110 break
1111 }
1112 buffer[buffi] = char
1113 buffi++
Akron3f8571a2021-08-05 11:18:10 +02001114 }
Akronb7e1f132021-08-10 11:52:31 +02001115
1116 char = buffer[buffo]
1117
1118 if DEBUG {
1119 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
1120 }
1121
Akron6f1c16c2021-08-17 10:45:42 +02001122 // TODO:
1123 // Better not repeatedly check for a!
1124 // Possibly keep a buffer with a.
Akronea46e8a2021-08-17 00:36:31 +02001125 if int(char) < 256 {
1126 a = dat.sigmaASCII[int(char)]
1127 } else {
1128 a, ok = dat.sigma[char]
1129 if !ok {
1130 a = 0
1131 }
1132 }
Akronb7e1f132021-08-10 11:52:31 +02001133
1134 // Use identity symbol if character is not in sigma
Akronea46e8a2021-08-17 00:36:31 +02001135 if a == 0 && dat.identity != -1 {
Akronb7e1f132021-08-10 11:52:31 +02001136 a = dat.identity
1137 }
1138
1139 t0 = t
1140
1141 // Check for epsilon transitions and remember
Akronf1a16502021-08-16 15:24:38 +02001142 if dat.array[dat.array[t0].getBase()+uint32(dat.epsilon)].getCheck() == t0 {
Akronb7e1f132021-08-10 11:52:31 +02001143 // Remember state for backtracking to last tokenend state
1144 epsilonState = t0
1145 epsilonOffset = buffo
1146 }
Akron3f8571a2021-08-05 11:18:10 +02001147 }
Akron3610f102021-08-08 14:13:25 +02001148
Akronb7e1f132021-08-10 11:52:31 +02001149 // Checks a transition based on t0, a and buffo
Akronf1a16502021-08-16 15:24:38 +02001150 t = dat.array[t0].getBase() + uint32(a)
1151 ta := dat.array[t]
Akron068874c2021-08-04 15:19:56 +02001152
Akron524c5432021-08-05 14:14:27 +02001153 if DEBUG {
Akronb7e1f132021-08-10 11:52:31 +02001154 // Char is only relevant if set
1155 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
1156 if false {
1157 fmt.Println(dat.outgoing(t0))
1158 }
Akron524c5432021-08-05 14:14:27 +02001159 }
1160
Akronb7e1f132021-08-10 11:52:31 +02001161 // Check if the transition is invalid according to the double array
Akronf1a16502021-08-16 15:24:38 +02001162 if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
Akron068874c2021-08-04 15:19:56 +02001163
1164 if DEBUG {
Akronf1a16502021-08-16 15:24:38 +02001165 fmt.Println("Match is not fine!", t, "and", ta.getCheck(), "vs", t0)
Akron068874c2021-08-04 15:19:56 +02001166 }
1167
1168 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +02001169
Akron068874c2021-08-04 15:19:56 +02001170 // Try again with unknown symbol, in case identity failed
Akronb7e1f132021-08-10 11:52:31 +02001171 // Char is only relevant when set
Akron068874c2021-08-04 15:19:56 +02001172 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001173 fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +02001174 }
1175 a = dat.unknown
1176
1177 } else if a != dat.epsilon {
Akronb4bbb472021-08-09 11:49:38 +02001178
Akron068874c2021-08-04 15:19:56 +02001179 // Try again with epsilon symbol, in case everything else failed
Akronb4bbb472021-08-09 11:49:38 +02001180 t0 = epsilonState
Akron3610f102021-08-08 14:13:25 +02001181 epsilonState = 0 // reset
1182 buffo = epsilonOffset
Akron439f4ec2021-08-09 15:45:38 +02001183 a = dat.epsilon
1184
Akron3610f102021-08-08 14:13:25 +02001185 if DEBUG {
1186 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
1187 }
Akronb4bbb472021-08-09 11:49:38 +02001188
Akron068874c2021-08-04 15:19:56 +02001189 } else {
1190 break
1191 }
Akron068874c2021-08-04 15:19:56 +02001192
Akronb7e1f132021-08-10 11:52:31 +02001193 newchar = false
1194 continue
Akronb4bbb472021-08-09 11:49:38 +02001195 }
1196
Akronb7e1f132021-08-10 11:52:31 +02001197 // Transition was successful
1198 rewindBuffer = false
Akron439f4ec2021-08-09 15:45:38 +02001199
1200 // Transition consumes a character
Akronb4bbb472021-08-09 11:49:38 +02001201 if a != dat.epsilon {
1202
Akron3610f102021-08-08 14:13:25 +02001203 buffo++
Akronb4bbb472021-08-09 11:49:38 +02001204
Akron439f4ec2021-08-09 15:45:38 +02001205 // Transition does not produce a character
Akronf1a16502021-08-16 15:24:38 +02001206 if buffo == 1 && ta.isNonToken() {
Akron3610f102021-08-08 14:13:25 +02001207 if DEBUG {
1208 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
1209 }
Akron439f4ec2021-08-09 15:45:38 +02001210 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001211 }
Akron3f8571a2021-08-05 11:18:10 +02001212 }
Akron068874c2021-08-04 15:19:56 +02001213
Akronc5d8d432021-08-10 16:48:44 +02001214 // Transition marks the end of a token - so flush the buffer
Akronf1a16502021-08-16 15:24:38 +02001215 if ta.isTokenEnd() {
Akron524c5432021-08-05 14:14:27 +02001216
Akronc5d8d432021-08-10 16:48:44 +02001217 if buffi > 0 {
Akronc5d8d432021-08-10 16:48:44 +02001218 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +02001219 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
Akronc5d8d432021-08-10 16:48:44 +02001220 }
Akron01912fc2021-08-12 11:41:58 +02001221 writer.WriteString(string(buffer[:buffo]))
Akronc5d8d432021-08-10 16:48:44 +02001222 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001223 }
Akron1594cb82021-08-11 11:14:56 +02001224 if DEBUG {
1225 fmt.Println("-> Newline")
1226 }
1227 writer.WriteRune('\n')
Akron439f4ec2021-08-09 15:45:38 +02001228 }
Akron3610f102021-08-08 14:13:25 +02001229
Akronc5d8d432021-08-10 16:48:44 +02001230 // Rewind the buffer if necessary
Akron439f4ec2021-08-09 15:45:38 +02001231 if rewindBuffer {
1232
1233 // TODO: Better as a ring buffer
Akron3610f102021-08-08 14:13:25 +02001234 for x, i := range buffer[buffo:buffi] {
1235 buffer[x] = i
1236 }
Akronb4bbb472021-08-09 11:49:38 +02001237
Akron3610f102021-08-08 14:13:25 +02001238 buffi -= buffo
1239 epsilonOffset -= buffo
1240 buffo = 0
1241 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001242 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
Akron3610f102021-08-08 14:13:25 +02001243 }
Akron84d68e62021-08-04 17:06:52 +02001244 }
1245
Akronb7e1f132021-08-10 11:52:31 +02001246 // Move to representative state
Akronf1a16502021-08-16 15:24:38 +02001247 if ta.isSeparate() {
1248 t = ta.getBase()
1249 ta = dat.array[t]
Akronb7e1f132021-08-10 11:52:31 +02001250
1251 if DEBUG {
1252 fmt.Println("Representative pointing to", t)
1253 }
1254 }
1255
Akronc5d8d432021-08-10 16:48:44 +02001256 newchar = true
1257
Akron068874c2021-08-04 15:19:56 +02001258 // TODO:
1259 // Prevent endless epsilon loops!
1260 }
1261
Akron439f4ec2021-08-09 15:45:38 +02001262 // Input reader is not yet finished
Akron3f8571a2021-08-05 11:18:10 +02001263 if !eof {
Akron068874c2021-08-04 15:19:56 +02001264 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001265 fmt.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +02001266 }
1267 return false
1268 }
1269
Akronb7e1f132021-08-10 11:52:31 +02001270 if DEBUG {
1271 fmt.Println("Entering final check")
1272 }
1273
Akronc5d8d432021-08-10 16:48:44 +02001274 // Automaton is in a final state, so flush the buffer and return
Akronf1a16502021-08-16 15:24:38 +02001275 x := dat.array[t].getBase() + uint32(dat.final)
1276
1277 if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
Akronb4bbb472021-08-09 11:49:38 +02001278
1279 if buffi > 0 {
Akronb4bbb472021-08-09 11:49:38 +02001280 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +02001281 fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
Akron3f8571a2021-08-05 11:18:10 +02001282 }
Akron01912fc2021-08-12 11:41:58 +02001283 writer.WriteString(string(buffer[:buffi]))
Akron6e70dc82021-08-11 11:33:18 +02001284
Akronf1a16502021-08-16 15:24:38 +02001285 if dat.array[t].isTokenEnd() {
Akrondf0a3ef2021-08-09 15:53:45 +02001286 writer.WriteRune('\n')
Akronc5d8d432021-08-10 16:48:44 +02001287 if DEBUG {
1288 fmt.Println("-> Newline")
1289 }
Akrondf0a3ef2021-08-09 15:53:45 +02001290 }
Akron84d68e62021-08-04 17:06:52 +02001291 }
1292
Akron6e70dc82021-08-11 11:33:18 +02001293 // Add an additional sentence ending, if the file is over but no explicit
1294 // sentence split was reached. This may be controversial and therefore
1295 // optional via parameter.
Akronf1a16502021-08-16 15:24:38 +02001296 if !dat.array[t0].isTokenEnd() {
Akron6e70dc82021-08-11 11:33:18 +02001297 writer.WriteRune('\n')
1298 if DEBUG {
1299 fmt.Println("-> Newline")
1300 }
1301 }
1302
Akrone61380b2021-08-16 10:10:46 +02001303 // TODO:
1304 // There may be a new line at the end, from an epsilon,
1305 // so we may need to go on!
Akron068874c2021-08-04 15:19:56 +02001306 return true
1307 }
1308
Akronc5d8d432021-08-10 16:48:44 +02001309 // Check epsilon transitions until a final state is reached
1310 t0 = t
Akronf1a16502021-08-16 15:24:38 +02001311 t = dat.array[t0].getBase() + uint32(dat.epsilon)
Akron01912fc2021-08-12 11:41:58 +02001312 a = dat.epsilon
1313 newchar = false
Akronf1a16502021-08-16 15:24:38 +02001314 if dat.array[t].getCheck() == t0 {
Akronc5d8d432021-08-10 16:48:44 +02001315 // Remember state for backtracking to last tokenend state
Akronc5d8d432021-08-10 16:48:44 +02001316 goto PARSECHAR
Akrone61380b2021-08-16 10:10:46 +02001317
Akronc5d8d432021-08-10 16:48:44 +02001318 } else if epsilonState != 0 {
Akronb7e1f132021-08-10 11:52:31 +02001319 t0 = epsilonState
1320 epsilonState = 0 // reset
1321 buffo = epsilonOffset
Akron068874c2021-08-04 15:19:56 +02001322 if DEBUG {
Akronc5d8d432021-08-10 16:48:44 +02001323 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
Akron068874c2021-08-04 15:19:56 +02001324 }
Akronc5d8d432021-08-10 16:48:44 +02001325 goto PARSECHAR
Akron068874c2021-08-04 15:19:56 +02001326 }
Akronc5d8d432021-08-10 16:48:44 +02001327 return false
Akron068874c2021-08-04 15:19:56 +02001328}