blob: 3e01396fe603935906469df26092dbdfea6a704c [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
Akron03c92fe2021-08-09 14:07:57 +020013//
14// Serialization is little endian.
Akron83e75a22021-08-04 13:14:06 +020015
Akron740f3d72021-08-03 12:12:34 +020016// TODO:
17// - replace maxSize with the check value
Akron7b1faa62021-09-02 16:10:21 +020018// - Check if final states can be optional.
19// - Introduce ELM (Morita et al. 2001) to speed
20// up construction. Can be ignored on serialization
21// to improve compression.
Akron3a063ef2021-08-05 19:36:35 +020022// - Add checksum to serialization.
Akrone61380b2021-08-16 10:10:46 +020023// - Replace/Enhance table with a map
24// - Provide a bufio.Scanner compatible interface.
Akron8e1d69b2021-08-12 17:38:49 +020025// - Mark epsilon transitions in bytes
Akron740f3d72021-08-03 12:12:34 +020026
Akron8ef408b2021-08-02 22:11:04 +020027import (
28 "bufio"
29 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020030 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020031 "fmt"
32 "io"
Akron679b4862021-09-02 16:59:26 +020033 "math"
Akron8ef408b2021-08-02 22:11:04 +020034 "os"
Akronc9d84a62021-08-03 15:56:03 +020035 "sort"
Akron8ef408b2021-08-02 22:11:04 +020036 "strconv"
37 "strings"
38 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020039
Akron527c10c2021-08-13 01:45:18 +020040 "log"
Akron8ef408b2021-08-02 22:11:04 +020041)
42
43const (
Akron2a4b9292021-08-04 15:35:22 +020044 PROPS = 1
45 SIGMA = 2
46 STATES = 3
47 NONE = 4
Akron2a4b9292021-08-04 15:35:22 +020048 DEBUG = false
49 MAGIC = "DATOK"
50 VERSION = uint16(1)
Akron03c92fe2021-08-09 14:07:57 +020051 FIRSTBIT uint32 = 1 << 31
52 SECONDBIT uint32 = 1 << 30
53 RESTBIT uint32 = ^uint32(0) &^ (FIRSTBIT | SECONDBIT)
Akron8ef408b2021-08-02 22:11:04 +020054)
55
Akron03c92fe2021-08-09 14:07:57 +020056// Serialization is always little endian
Akron6247a5d2021-08-03 19:18:28 +020057var bo binary.ByteOrder = binary.LittleEndian
58
Akron8ef408b2021-08-02 22:11:04 +020059type mapping struct {
60 source int
Akron3fdfec62021-08-04 11:40:10 +020061 target uint32
Akron8ef408b2021-08-02 22:11:04 +020062}
63
64type edge struct {
Akron83e75a22021-08-04 13:14:06 +020065 inSym int
66 outSym int
67 end int
68 nontoken bool
69 tokenend bool
Akron8ef408b2021-08-02 22:11:04 +020070}
71
Akron03c92fe2021-08-09 14:07:57 +020072// Tokenizer is the intermediate representation
73// of the tokenizer.
Akron8ef408b2021-08-02 22:11:04 +020074type Tokenizer struct {
Akron740f3d72021-08-03 12:12:34 +020075 sigmaRev map[int]rune
Akron31f3c062021-08-27 10:15:13 +020076 sigmaMCS map[int]string
Akron740f3d72021-08-03 12:12:34 +020077 arcCount int
Akron740f3d72021-08-03 12:12:34 +020078 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020079 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020080
81 // Special symbols in sigma
82 epsilon int
83 unknown int
84 identity int
85 final int
Akron03c92fe2021-08-09 14:07:57 +020086 tokenend int
Akron8ef408b2021-08-02 22:11:04 +020087}
88
Akronf1a16502021-08-16 15:24:38 +020089type bc struct {
90 base uint32
91 check uint32
92}
93
Akron03c92fe2021-08-09 14:07:57 +020094// DaTokenizer represents a tokenizer implemented as a
95// Double Array FSA.
Akronf2120ca2021-08-03 16:26:41 +020096type DaTokenizer struct {
Akronea46e8a2021-08-17 00:36:31 +020097 sigma map[rune]int
98 sigmaASCII [256]int
Akron03a3c612021-08-04 11:51:27 +020099 maxSize int
Akron4fa28b32021-08-27 10:55:41 +0200100 transCount int
Akronf1a16502021-08-16 15:24:38 +0200101 array []bc
Akronc17f1ca2021-08-03 19:47:27 +0200102
103 // Special symbols in sigma
104 epsilon int
105 unknown int
106 identity int
107 final int
Akron03c92fe2021-08-09 14:07:57 +0200108 tokenend int
Akronf2120ca2021-08-03 16:26:41 +0200109}
110
Akron03c92fe2021-08-09 14:07:57 +0200111// ParseFoma reads the FST from a foma file
112// and creates an internal representation,
113// in case it follows the tokenizer's convention.
Akron64ffd9a2021-08-03 19:55:21 +0200114func LoadFomaFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200115 f, err := os.Open(file)
116 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200117 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200118 return nil
Akron8ef408b2021-08-02 22:11:04 +0200119 }
120 defer f.Close()
121
122 gz, err := gzip.NewReader(f)
123 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200124 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200125 return nil
Akron8ef408b2021-08-02 22:11:04 +0200126 }
127 defer gz.Close()
128
Akron3fdfec62021-08-04 11:40:10 +0200129 return ParseFoma(gz)
Akron8ef408b2021-08-02 22:11:04 +0200130}
131
Akron03c92fe2021-08-09 14:07:57 +0200132// ParseFoma reads the FST from a foma file reader
133// and creates an internal representation,
134// in case it follows the tokenizer's convention.
Akron3fdfec62021-08-04 11:40:10 +0200135func ParseFoma(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200136 r := bufio.NewReader(ior)
137
138 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200139 sigmaRev: make(map[int]rune),
Akron31f3c062021-08-27 10:15:13 +0200140 sigmaMCS: make(map[int]string),
Akronc17f1ca2021-08-03 19:47:27 +0200141 epsilon: -1,
142 unknown: -1,
143 identity: -1,
144 final: -1,
Akron03c92fe2021-08-09 14:07:57 +0200145 tokenend: -1,
Akron8ef408b2021-08-02 22:11:04 +0200146 }
147
Akron740f3d72021-08-03 12:12:34 +0200148 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200149
150 mode := 0
151 var elem []string
152 var elemint [5]int
153
Akron03c92fe2021-08-09 14:07:57 +0200154 // Iterate over all lines of the file.
155 // This is mainly based on foma2js,
156 // licensed under the Apache License, version 2,
157 // and written by Mans Hulden.
Akron8ef408b2021-08-02 22:11:04 +0200158 for {
159 line, err := r.ReadString('\n')
160 if err != nil {
161 if err == io.EOF {
162 break
163 }
Akron527c10c2021-08-13 01:45:18 +0200164 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200165 return nil
Akron8ef408b2021-08-02 22:11:04 +0200166 }
Akron8ef408b2021-08-02 22:11:04 +0200167
Akron439f4ec2021-08-09 15:45:38 +0200168 // Read parser mode for the following lines
169 if strings.HasPrefix(line, "##") {
170 if strings.HasPrefix(line, "##props##") {
171 mode = PROPS
172
173 } else if strings.HasPrefix(line, "##states##") {
174 mode = STATES
175
176 // Adds a final transition symbol to sigma
177 // written as '#' in Mizobuchi et al (2000)
178 tok.sigmaCount++
179 tok.final = tok.sigmaCount
180
181 } else if strings.HasPrefix(line, "##sigma##") {
182
183 mode = SIGMA
184
185 } else if strings.HasPrefix(line, "##end##") {
186
187 mode = NONE
188
189 } else if !strings.HasPrefix(line, "##foma-net") {
Akron527c10c2021-08-13 01:45:18 +0200190 log.Print("Unknown input line")
Akron439f4ec2021-08-09 15:45:38 +0200191 break
192 }
Akron8ef408b2021-08-02 22:11:04 +0200193 continue
194 }
195
Akron439f4ec2021-08-09 15:45:38 +0200196 // Based on the current parser mode, interpret the lines
Akron8ef408b2021-08-02 22:11:04 +0200197 switch mode {
198 case PROPS:
199 {
200 elem = strings.Split(line, " ")
201 /*
202 fmt.Println("arity: " + elem[0])
203 fmt.Println("arccount: " + elem[1])
204 fmt.Println("statecount: " + elem[2])
205 fmt.Println("linecount: " + elem[3])
206 fmt.Println("finalcount: " + elem[4])
207 fmt.Println("pathcount: " + elem[5])
208 fmt.Println("is_deterministic: " + elem[6])
209 fmt.Println("is_pruned: " + elem[7])
210 fmt.Println("is_minimized: " + elem[8])
211 fmt.Println("is_epsilon_free: " + elem[9])
212 fmt.Println("is_loop_free: " + elem[10])
213 fmt.Println("extras: " + elem[11])
214 fmt.Println("name: " + elem[12])
215 */
216 if elem[6] != "1" {
Akron527c10c2021-08-13 01:45:18 +0200217 log.Print("The FST needs to be deterministic")
Akron4db3ecf2021-08-11 18:49:03 +0200218 return nil
Akron8ef408b2021-08-02 22:11:04 +0200219 }
Akron439f4ec2021-08-09 15:45:38 +0200220
Akron8ef408b2021-08-02 22:11:04 +0200221 if elem[9] != "1" {
Akron527c10c2021-08-13 01:45:18 +0200222 log.Print("The FST needs to be epsilon free")
Akron4db3ecf2021-08-11 18:49:03 +0200223 return nil
Akron8ef408b2021-08-02 22:11:04 +0200224 }
225
226 elemint[0], err = strconv.Atoi(elem[1])
227 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200228 log.Print("Can't read arccount")
Akron4db3ecf2021-08-11 18:49:03 +0200229 return nil
Akron8ef408b2021-08-02 22:11:04 +0200230 }
Akron740f3d72021-08-03 12:12:34 +0200231 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200232
Akron8ef408b2021-08-02 22:11:04 +0200233 elemint[0], err = strconv.Atoi(elem[2])
234 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200235 log.Print("Can't read statecount")
Akron4db3ecf2021-08-11 18:49:03 +0200236 return nil
Akron8ef408b2021-08-02 22:11:04 +0200237 }
Akron439f4ec2021-08-09 15:45:38 +0200238
239 // States start at 1 in Mizobuchi et al (2000),
240 // as the state 0 is associated with a fail.
241 // Initialize states and transitions
Akron8ef408b2021-08-02 22:11:04 +0200242 tok.transitions = make([]map[int]*edge, elemint[0]+1)
243 continue
244 }
245 case STATES:
246 {
247 elem = strings.Split(line[0:len(line)-1], " ")
248 if elem[0] == "-1" {
Akron3610f102021-08-08 14:13:25 +0200249 if DEBUG {
250 fmt.Println("Skip", elem)
251 }
Akron8ef408b2021-08-02 22:11:04 +0200252 continue
253 }
254 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200255 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200256 fmt.Println("Unable to translate", elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200257 break
258 }
Akron8ef408b2021-08-02 22:11:04 +0200259
260 if len(elem) > 1 {
261 elemint[1], err = strconv.Atoi(elem[1])
262 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200263 fmt.Println("Unable to translate", elem[1])
Akron8ef408b2021-08-02 22:11:04 +0200264 break
265 }
266 if len(elem) > 2 {
267 elemint[2], err = strconv.Atoi(elem[2])
268 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200269 fmt.Println("Unable to translate", elem[2])
Akron8ef408b2021-08-02 22:11:04 +0200270 break
271 }
272 if len(elem) > 3 {
273 elemint[3], err = strconv.Atoi(elem[3])
274 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200275 fmt.Println("Unable to translate", elem[3])
Akron8ef408b2021-08-02 22:11:04 +0200276 break
277 }
278 if len(elem) > 4 {
279 elemint[4], err = strconv.Atoi(elem[4])
280 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200281 fmt.Println("Unable to translate", elem[4])
Akron8ef408b2021-08-02 22:11:04 +0200282 break
283 }
284 }
285 }
286 }
287 }
288
289 switch len(elem) {
290 case 5:
291 {
Akron740f3d72021-08-03 12:12:34 +0200292 state = elemint[0]
293 inSym = elemint[1]
294 outSym = elemint[2]
295 end = elemint[3]
296 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200297 }
298 case 4:
299 {
300 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200301 state = elemint[0]
302 final = elemint[3]
Akron0630be52021-08-28 09:06:16 +0200303
304 // Final state that has no outgoing edges
305 if final == 1 {
306
307 // Initialize outgoing states
308 if tok.transitions[state+1] == nil {
309 tok.transitions[state+1] = make(map[int]*edge)
310 }
311
312 // TODO:
313 // Maybe this is less relevant for tokenizers
314 tok.transitions[state+1][tok.final] = &edge{}
315 }
316 continue
Akron8ef408b2021-08-02 22:11:04 +0200317 } else {
Akron740f3d72021-08-03 12:12:34 +0200318 state = elemint[0]
319 inSym = elemint[1]
320 end = elemint[2]
321 final = elemint[3]
322 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200323 }
324 }
325 case 3:
326 {
Akron740f3d72021-08-03 12:12:34 +0200327 inSym = elemint[0]
328 outSym = elemint[1]
329 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200330 }
331 case 2:
332 {
Akron740f3d72021-08-03 12:12:34 +0200333 inSym = elemint[0]
334 end = elemint[1]
335 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200336 }
337 }
338
Akron83e75a22021-08-04 13:14:06 +0200339 nontoken := false
340 tokenend := false
341
Akron439f4ec2021-08-09 15:45:38 +0200342 // While the states in foma start with 0, the states in the
343 // Mizobuchi FSA start with one - so we increase every state by 1.
344 // We also increase sigma by 1, so there are no 0 transitions.
Akron524c5432021-08-05 14:14:27 +0200345 inSym++
346 outSym++
347
Akron439f4ec2021-08-09 15:45:38 +0200348 // Only a limited list of transitions are allowed
Akron740f3d72021-08-03 12:12:34 +0200349 if inSym != outSym {
Akron01912fc2021-08-12 11:41:58 +0200350 if outSym == tok.tokenend && inSym == tok.epsilon {
Akron83e75a22021-08-04 13:14:06 +0200351 tokenend = true
352 } else if outSym == tok.epsilon {
353 nontoken = true
354 } else {
Akron527c10c2021-08-13 01:45:18 +0200355 log.Println(
Akron740f3d72021-08-03 12:12:34 +0200356 "Unsupported transition: " +
357 strconv.Itoa(state) +
358 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200359 " (" +
Akron740f3d72021-08-03 12:12:34 +0200360 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200361 ":" +
Akron740f3d72021-08-03 12:12:34 +0200362 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200363 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200364 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200365 ":" +
Akron740f3d72021-08-03 12:12:34 +0200366 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200367 ")")
Akron4db3ecf2021-08-11 18:49:03 +0200368 return nil
Akron75ebe7f2021-08-03 10:34:10 +0200369 }
Akron92704eb2021-08-27 10:59:46 +0200370 } else if inSym == tok.tokenend {
371 // Ignore tokenend accepting arcs
372 continue
Akron83e75a22021-08-04 13:14:06 +0200373 } else if inSym == tok.epsilon {
Akron527c10c2021-08-13 01:45:18 +0200374 log.Println("General epsilon transitions are not supported")
Akron4db3ecf2021-08-11 18:49:03 +0200375 return nil
Akron31f3c062021-08-27 10:15:13 +0200376 } else if tok.sigmaMCS[inSym] != "" {
Akron34dbe972021-08-29 17:44:34 +0200377 // log.Fatalln("Non supported character", tok.sigmaMCS[inSym])
378 // Ignore MCS transitions
379 continue
Akron8ef408b2021-08-02 22:11:04 +0200380 }
381
Akron03c92fe2021-08-09 14:07:57 +0200382 // Create an edge based on the collected information
Akron8ef408b2021-08-02 22:11:04 +0200383 targetObj := &edge{
Akron83e75a22021-08-04 13:14:06 +0200384 inSym: inSym,
385 outSym: outSym,
386 end: end + 1,
387 tokenend: tokenend,
388 nontoken: nontoken,
Akron8ef408b2021-08-02 22:11:04 +0200389 }
390
Akron740f3d72021-08-03 12:12:34 +0200391 // Initialize outgoing states
392 if tok.transitions[state+1] == nil {
393 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200394 }
395
Akron740f3d72021-08-03 12:12:34 +0200396 // Ignore transitions with invalid symbols
397 if inSym >= 0 {
398 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200399 }
Akron8ef408b2021-08-02 22:11:04 +0200400
Akron740f3d72021-08-03 12:12:34 +0200401 // Add final transition
402 if final == 1 {
Akron03c92fe2021-08-09 14:07:57 +0200403 // TODO:
404 // Maybe this is less relevant for tokenizers
Akronc17f1ca2021-08-03 19:47:27 +0200405 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200406 }
407
Akronb4bbb472021-08-09 11:49:38 +0200408 if DEBUG {
Akron740f3d72021-08-03 12:12:34 +0200409 fmt.Println("Add",
410 state+1, "->", end+1,
411 "(",
412 inSym,
413 ":",
414 outSym,
415 ") (",
416 string(tok.sigmaRev[inSym]),
417 ":",
418 string(tok.sigmaRev[outSym]),
Akron524c5432021-08-05 14:14:27 +0200419 ")",
420 ";",
421 "TE:", tokenend,
Akron3610f102021-08-08 14:13:25 +0200422 "NT:", nontoken,
423 "FIN:", final)
Akron740f3d72021-08-03 12:12:34 +0200424 }
Akron75ebe7f2021-08-03 10:34:10 +0200425
Akron8ef408b2021-08-02 22:11:04 +0200426 continue
427 }
428 case SIGMA:
429 {
430 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
431
432 // Turn string into sigma id
433 number, err := strconv.Atoi(elem[0])
434
Akron524c5432021-08-05 14:14:27 +0200435 // ID needs to be > 1
436 number++
437
Akron8ef408b2021-08-02 22:11:04 +0200438 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200439 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200440 return nil
Akron8ef408b2021-08-02 22:11:04 +0200441 }
442
Akron740f3d72021-08-03 12:12:34 +0200443 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200444
445 var symbol rune
446
447 // Read rune
448 if utf8.RuneCountInString(elem[1]) == 1 {
449 symbol = []rune(elem[1])[0]
450
Akron8ef408b2021-08-02 22:11:04 +0200451 } else if utf8.RuneCountInString(elem[1]) > 1 {
Akron439f4ec2021-08-09 15:45:38 +0200452
453 // Probably a MCS
Akron8ef408b2021-08-02 22:11:04 +0200454 switch elem[1] {
455 case "@_EPSILON_SYMBOL_@":
456 {
Akronc17f1ca2021-08-03 19:47:27 +0200457 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200458 }
459 case "@_UNKNOWN_SYMBOL_@":
460 {
Akronc17f1ca2021-08-03 19:47:27 +0200461 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200462 }
463
464 case "@_IDENTITY_SYMBOL_@":
465 {
Akronc17f1ca2021-08-03 19:47:27 +0200466 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200467 }
Akron03c92fe2021-08-09 14:07:57 +0200468
469 case "@_TOKEN_SYMBOL_@":
470 {
471 tok.tokenend = number
Akron03c92fe2021-08-09 14:07:57 +0200472 }
Akron8ef408b2021-08-02 22:11:04 +0200473 default:
Akron740f3d72021-08-03 12:12:34 +0200474 {
Akron31f3c062021-08-27 10:15:13 +0200475 // MCS not supported
476 tok.sigmaMCS[number] = line
Akron740f3d72021-08-03 12:12:34 +0200477 }
Akron8ef408b2021-08-02 22:11:04 +0200478 }
Akron439f4ec2021-08-09 15:45:38 +0200479 continue
Akron8ef408b2021-08-02 22:11:04 +0200480
Akron740f3d72021-08-03 12:12:34 +0200481 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200482 line, err = r.ReadString('\n')
483 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200484 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200485 return nil
Akron8ef408b2021-08-02 22:11:04 +0200486 }
487 if len(line) != 1 {
Akron31f3c062021-08-27 10:15:13 +0200488 // MCS not supported
489 tok.sigmaMCS[number] = line
490 continue
Akron8ef408b2021-08-02 22:11:04 +0200491 }
Akron03c92fe2021-08-09 14:07:57 +0200492 symbol = rune('\n')
Akron8ef408b2021-08-02 22:11:04 +0200493 }
494
Akron740f3d72021-08-03 12:12:34 +0200495 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200496 }
497 }
498 }
Akron31f3c062021-08-27 10:15:13 +0200499 tok.sigmaMCS = nil
Akron8ef408b2021-08-02 22:11:04 +0200500 return tok
501}
502
Akron64ffd9a2021-08-03 19:55:21 +0200503// Set alphabet A to the list of all symbols
504// outgoing from s
Akron439f4ec2021-08-09 15:45:38 +0200505func (tok *Tokenizer) getSet(s int, A *[]int) {
Akron64ffd9a2021-08-03 19:55:21 +0200506 for a := range tok.transitions[s] {
507 *A = append(*A, a)
508 }
509
510 // Not required, but simplifies bug hunting
Akron439f4ec2021-08-09 15:45:38 +0200511 // sort.Ints(*A)
Akron64ffd9a2021-08-03 19:55:21 +0200512}
513
Akron439f4ec2021-08-09 15:45:38 +0200514// ToDoubleArray turns the intermediate tokenizer representation
515// into a double array representation.
516//
517// This is based on Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200518func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
519
520 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +0200521 sigma: make(map[rune]int),
Akron4fa28b32021-08-27 10:55:41 +0200522 transCount: -1,
Akron03a3c612021-08-04 11:51:27 +0200523 final: tok.final,
524 unknown: tok.unknown,
525 identity: tok.identity,
526 epsilon: tok.epsilon,
Akron03c92fe2021-08-09 14:07:57 +0200527 tokenend: tok.tokenend,
Akronf2120ca2021-08-03 16:26:41 +0200528 }
529
Akronf1a16502021-08-16 15:24:38 +0200530 dat.resize(dat.final)
531
Akronf2120ca2021-08-03 16:26:41 +0200532 for num, sym := range tok.sigmaRev {
Akronea46e8a2021-08-17 00:36:31 +0200533 if int(sym) < 256 {
534 dat.sigmaASCII[int(sym)] = num
535 }
Akronf2120ca2021-08-03 16:26:41 +0200536 dat.sigma[sym] = num
537 }
Akron8ef408b2021-08-02 22:11:04 +0200538
539 mark := 0
540 size := 0
Akron6f1c16c2021-08-17 10:45:42 +0200541 var base uint32
Akronde18e902021-08-27 09:34:12 +0200542 var atrans *edge
543 var s, s1 int
544 var t, t1 uint32
Akron8ef408b2021-08-02 22:11:04 +0200545
Akron439f4ec2021-08-09 15:45:38 +0200546 // Create a mapping from s (in Ms aka Intermediate FSA)
547 // to t (in Mt aka Double Array FSA)
Akron740f3d72021-08-03 12:12:34 +0200548 table := make([]*mapping, tok.arcCount+1)
Akron7b1faa62021-09-02 16:10:21 +0200549 // tableQueue := make([]int, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200550
Akron439f4ec2021-08-09 15:45:38 +0200551 // Initialize with the start state
Akron8ef408b2021-08-02 22:11:04 +0200552 table[size] = &mapping{source: 1, target: 1}
Akron7b1faa62021-09-02 16:10:21 +0200553 // tableQueue[size] = 1
Akron8ef408b2021-08-02 22:11:04 +0200554 size++
555
Akron740f3d72021-08-03 12:12:34 +0200556 // Allocate space for the outgoing symbol range
557 A := make([]int, 0, tok.sigmaCount)
Akron7b1faa62021-09-02 16:10:21 +0200558 // tableLookup := make([]uint32, tok.arcCount+2) // make(map[int]uint32)
559 // tableLookup[1] = 1
Akron8ef408b2021-08-02 22:11:04 +0200560
Akron7b1faa62021-09-02 16:10:21 +0200561 // block_begin_pos := uint32(1)
Akronde18e902021-08-27 09:34:12 +0200562
Akron8ef408b2021-08-02 22:11:04 +0200563 for mark < size {
Akronde18e902021-08-27 09:34:12 +0200564 s = table[mark].source // This is a state in Ms
565 t = table[mark].target // This is a state in Mt
Akron7b1faa62021-09-02 16:10:21 +0200566 // s = tableQueue[mark]
567 // t = tableLookup[s]
Akron8ef408b2021-08-02 22:11:04 +0200568 mark++
Akron740f3d72021-08-03 12:12:34 +0200569
570 // Following the paper, here the state t can be remembered
571 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200572 A = A[:0]
Akron439f4ec2021-08-09 15:45:38 +0200573 tok.getSet(s, &A)
Akron8ef408b2021-08-02 22:11:04 +0200574
Akron740f3d72021-08-03 12:12:34 +0200575 // Set base to the first free slot in the double array
Akron6f1c16c2021-08-17 10:45:42 +0200576 base = dat.xCheck(A)
Akron679b4862021-09-02 16:59:26 +0200577 // base = dat.xCheckSkip(A)
Akron7b1faa62021-09-02 16:10:21 +0200578 // base = dat.xCheckNiu(A, &block_begin_pos)
Akron6f1c16c2021-08-17 10:45:42 +0200579 dat.array[t].setBase(base)
Akron8ef408b2021-08-02 22:11:04 +0200580
Akron773b1ef2021-08-03 17:37:20 +0200581 // TODO:
Akron068874c2021-08-04 15:19:56 +0200582 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200583 // outdegree of .end
584
Akron740f3d72021-08-03 12:12:34 +0200585 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200586 for _, a := range A {
587
Akronc17f1ca2021-08-03 19:47:27 +0200588 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200589
Akronde18e902021-08-27 09:34:12 +0200590 atrans = tok.transitions[s][a]
591
Akron740f3d72021-08-03 12:12:34 +0200592 // Aka g(s, a)
Akronde18e902021-08-27 09:34:12 +0200593 s1 = atrans.end
Akron8ef408b2021-08-02 22:11:04 +0200594
Akron740f3d72021-08-03 12:12:34 +0200595 // Store the transition
Akronde18e902021-08-27 09:34:12 +0200596 t1 = base + uint32(a)
Akronf1a16502021-08-16 15:24:38 +0200597 dat.array[t1].setCheck(t)
598
599 // Set maxSize
600 if dat.maxSize < int(t1) {
601 dat.maxSize = int(t1)
602 }
Akron8ef408b2021-08-02 22:11:04 +0200603
Akron439f4ec2021-08-09 15:45:38 +0200604 if DEBUG {
Akron524c5432021-08-05 14:14:27 +0200605 fmt.Println("Translate transition",
606 s, "->", s1, "(", a, ")", "to", t, "->", t1)
607 }
608
Akron83e75a22021-08-04 13:14:06 +0200609 // Mark the state as being the target of a nontoken transition
Akronde18e902021-08-27 09:34:12 +0200610 if atrans.nontoken {
Akronf1a16502021-08-16 15:24:38 +0200611 dat.array[t1].setNonToken(true)
Akron524c5432021-08-05 14:14:27 +0200612 if DEBUG {
613 fmt.Println("Set", t1, "to nontoken")
614 }
Akron83e75a22021-08-04 13:14:06 +0200615 }
616
Akron84d68e62021-08-04 17:06:52 +0200617 // Mark the state as being the target of a tokenend transition
Akronde18e902021-08-27 09:34:12 +0200618 if atrans.tokenend {
Akronf1a16502021-08-16 15:24:38 +0200619 dat.array[t1].setTokenEnd(true)
Akron524c5432021-08-05 14:14:27 +0200620 if DEBUG {
621 fmt.Println("Set", t1, "to tokenend")
622 }
Akron84d68e62021-08-04 17:06:52 +0200623 }
624
Akron740f3d72021-08-03 12:12:34 +0200625 // Check for representative states
Akron439f4ec2021-08-09 15:45:38 +0200626 r := stateAlreadyInTable(s1, table, size)
Akronde18e902021-08-27 09:34:12 +0200627 // r := tableLookup[s1]
Akron740f3d72021-08-03 12:12:34 +0200628
Akron439f4ec2021-08-09 15:45:38 +0200629 // No representative found
Akron8ef408b2021-08-02 22:11:04 +0200630 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200631 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200632 table[size] = &mapping{source: s1, target: t1}
Akron7b1faa62021-09-02 16:10:21 +0200633 // tableQueue[size] = s1
Akronde18e902021-08-27 09:34:12 +0200634 // tableLookup[s1] = t1
Akron8ef408b2021-08-02 22:11:04 +0200635 size++
636 } else {
Akron740f3d72021-08-03 12:12:34 +0200637 // Overwrite with the representative state
Akronf1a16502021-08-16 15:24:38 +0200638 dat.array[t1].setBase(r)
639 dat.array[t1].setSeparate(true)
Akron8ef408b2021-08-02 22:11:04 +0200640 }
641 } else {
Akron740f3d72021-08-03 12:12:34 +0200642 // Store a final transition
Akron6f1c16c2021-08-17 10:45:42 +0200643 dat.array[base+uint32(dat.final)].setCheck(t)
Akronf1a16502021-08-16 15:24:38 +0200644
Akronde18e902021-08-27 09:34:12 +0200645 if dat.maxSize < int(base)+dat.final {
646 dat.maxSize = int(base) + dat.final
Akronf1a16502021-08-16 15:24:38 +0200647 }
Akron8ef408b2021-08-02 22:11:04 +0200648 }
649 }
650 }
651
652 // Following Mizobuchi et al (2000) the size of the
653 // FSA should be stored in check(1).
Akronf1a16502021-08-16 15:24:38 +0200654 // We make the size a bit smaller so we never have to check for boundaries.
Akron3fdfec62021-08-04 11:40:10 +0200655 dat.setSize(dat.maxSize + 1)
Akronf2120ca2021-08-03 16:26:41 +0200656 dat.array = dat.array[:dat.maxSize+1]
657 return dat
Akron8ef408b2021-08-02 22:11:04 +0200658}
659
Akron8ef408b2021-08-02 22:11:04 +0200660// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200661// exists and return this as a representative.
662// Currently iterates through the whole table
663// in a bruteforce manner.
Akron439f4ec2021-08-09 15:45:38 +0200664func stateAlreadyInTable(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200665 for x := 0; x < size; x++ {
666 if table[x].source == s {
667 return table[x].target
668 }
669 }
670 return 0
671}
672
Akron64ffd9a2021-08-03 19:55:21 +0200673// Resize double array when necessary
674func (dat *DaTokenizer) resize(l int) {
675 // TODO:
676 // This is a bit too aggressive atm and should be calmed down.
677 if len(dat.array) <= l {
Akronf1a16502021-08-16 15:24:38 +0200678 dat.array = append(dat.array, make([]bc, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200679 }
Akron64ffd9a2021-08-03 19:55:21 +0200680}
Akronc9d84a62021-08-03 15:56:03 +0200681
Akron64ffd9a2021-08-03 19:55:21 +0200682// Set base value in double array
Akronf1a16502021-08-16 15:24:38 +0200683func (bc *bc) setBase(v uint32) {
684 bc.base = v
Akron439f4ec2021-08-09 15:45:38 +0200685}
686
687// Get base value in double array
Akronf1a16502021-08-16 15:24:38 +0200688func (bc *bc) getBase() uint32 {
689 return bc.base & RESTBIT
Akron439f4ec2021-08-09 15:45:38 +0200690}
691
692// Set check value in double array
Akronf1a16502021-08-16 15:24:38 +0200693func (bc *bc) setCheck(v uint32) {
694 bc.check = v
Akron439f4ec2021-08-09 15:45:38 +0200695}
696
697// Get check value in double array
Akronf1a16502021-08-16 15:24:38 +0200698func (bc *bc) getCheck() uint32 {
699 return bc.check & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200700}
701
Akron3fdfec62021-08-04 11:40:10 +0200702// Returns true if a state is separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200703func (bc *bc) isSeparate() bool {
704 return bc.base&FIRSTBIT != 0
Akron3fdfec62021-08-04 11:40:10 +0200705}
706
707// Mark a state as separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200708func (bc *bc) setSeparate(sep bool) {
Akron3fdfec62021-08-04 11:40:10 +0200709 if sep {
Akronf1a16502021-08-16 15:24:38 +0200710 bc.base |= FIRSTBIT
Akron3fdfec62021-08-04 11:40:10 +0200711 } else {
Akronf1a16502021-08-16 15:24:38 +0200712 bc.base &= (RESTBIT | SECONDBIT)
Akron3fdfec62021-08-04 11:40:10 +0200713 }
714}
715
Akron83e75a22021-08-04 13:14:06 +0200716// Returns true if a state is the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200717func (bc *bc) isNonToken() bool {
718 return bc.check&FIRSTBIT != 0
Akron83e75a22021-08-04 13:14:06 +0200719}
720
721// Mark a state as being the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200722func (bc *bc) setNonToken(sep bool) {
Akron83e75a22021-08-04 13:14:06 +0200723 if sep {
Akronf1a16502021-08-16 15:24:38 +0200724 bc.check |= FIRSTBIT
Akron83e75a22021-08-04 13:14:06 +0200725 } else {
Akronf1a16502021-08-16 15:24:38 +0200726 bc.check &= (RESTBIT | SECONDBIT)
Akron83e75a22021-08-04 13:14:06 +0200727 }
728}
729
Akron84d68e62021-08-04 17:06:52 +0200730// Returns true if a state is the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200731func (bc *bc) isTokenEnd() bool {
732 return bc.check&SECONDBIT != 0
Akron84d68e62021-08-04 17:06:52 +0200733}
734
735// Mark a state as being the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200736func (bc *bc) setTokenEnd(sep bool) {
Akron84d68e62021-08-04 17:06:52 +0200737 if sep {
Akronf1a16502021-08-16 15:24:38 +0200738 bc.check |= SECONDBIT
Akron84d68e62021-08-04 17:06:52 +0200739 } else {
Akronf1a16502021-08-16 15:24:38 +0200740 bc.check &= (RESTBIT | FIRSTBIT)
Akron84d68e62021-08-04 17:06:52 +0200741 }
742}
743
Akron64ffd9a2021-08-03 19:55:21 +0200744// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200745func (dat *DaTokenizer) setSize(v int) {
Akronf1a16502021-08-16 15:24:38 +0200746 dat.array[1].setCheck(uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200747}
748
749// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200750func (dat *DaTokenizer) GetSize() int {
Akronf1a16502021-08-16 15:24:38 +0200751 return int(dat.array[1].getCheck())
Akron8ef408b2021-08-02 22:11:04 +0200752}
753
754// Based on Mizobuchi et al (2000), p. 124
755// This iterates for every state through the complete double array
756// structure until it finds a gap that fits all outgoing transitions
757// of the state. This is extremely slow, but is only necessary in the
758// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200759func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200760
761 // Start at the first entry of the double array list
Akron6f1c16c2021-08-17 10:45:42 +0200762 base := uint32(1)
763
Akron8ef408b2021-08-02 22:11:04 +0200764OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200765 // Resize the array if necessary
Akronf1a16502021-08-16 15:24:38 +0200766 dat.resize(int(base) + dat.final)
Akron8ef408b2021-08-02 22:11:04 +0200767 for _, a := range symbols {
Akronf1a16502021-08-16 15:24:38 +0200768 if dat.array[int(base)+a].getCheck() != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200769 base++
770 goto OVERLAP
771 }
772 }
Akron8ef408b2021-08-02 22:11:04 +0200773 return base
774}
775
Akron679b4862021-09-02 16:59:26 +0200776// This is an implementation of xCheck with the skip-improvement
777// proposed by Morita et al. (2001)
778func (dat *DaTokenizer) xCheckSkip(symbols []int) uint32 {
779
780 // Start at the first entry of the double array list
781 base := uint32(math.Abs(float64(dat.maxSize-1) * .9))
782
783OVERLAP:
784 // Resize the array if necessary
785 dat.resize(int(base) + dat.final)
786 for _, a := range symbols {
787 if dat.array[int(base)+a].getCheck() != 0 {
788 base++
789 goto OVERLAP
790 }
791 }
792 return base
793}
794
Akron7b1faa62021-09-02 16:10:21 +0200795// This is an implementation of xCheck wit an improvement
796// proposed by Niu et al. (2013)
797func (dat *DaTokenizer) xCheckNiu(symbols []int, block_begin_pos *uint32) uint32 {
798
799 // Start at the first entry of the double array list
800 base := uint32(1)
801
802 if len(symbols) > 3 {
803 sort.Ints(symbols)
804 if *block_begin_pos > uint32(symbols[0]) {
805 dat.resize(int(*block_begin_pos) + dat.final)
806 *block_begin_pos += uint32(symbols[len(symbols)-1] + 1)
807 return *block_begin_pos - uint32(symbols[0])
808 }
809 }
810
811OVERLAP:
812 // Resize the array if necessary
813 dat.resize(int(base) + dat.final)
814 for _, a := range symbols {
815 if dat.array[int(base)+a].getCheck() != 0 {
816 base++
817 goto OVERLAP
818 }
819 }
820 return base
821}
822
Akron3610f102021-08-08 14:13:25 +0200823// List all outgoing transitions for a state
824// for testing purposes
825func (dat *DaTokenizer) outgoing(t uint32) []int {
826
827 valid := make([]int, 0, len(dat.sigma))
828
829 for _, a := range dat.sigma {
Akronf1a16502021-08-16 15:24:38 +0200830 t1 := dat.array[t].getBase() + uint32(a)
831 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200832 valid = append(valid, a)
833 }
834 }
835
836 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
Akronf1a16502021-08-16 15:24:38 +0200837 t1 := dat.array[t].getBase() + uint32(a)
838 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200839 valid = append(valid, -1*a)
840 }
841 }
842
843 sort.Ints(valid)
844
845 return valid
846}
847
Akron4fa28b32021-08-27 10:55:41 +0200848// TransCount as the number of transitions aka arcs in the
849// finite state automaton
850func (dat *DaTokenizer) TransCount() int {
851 // Cache the transCount
852 if dat.transCount > 0 {
853 return dat.transCount
854 }
855
856 dat.transCount = 0
857 for x := 1; x < len(dat.array); x++ {
858 if dat.array[x].getBase() != 0 {
859 dat.transCount++
860 }
861 }
862
863 return dat.transCount
864}
865
Akron03a3c612021-08-04 11:51:27 +0200866// LoadFactor as defined in Kanda et al (2018),
867// i.e. the proportion of non-empty elements to all elements.
868func (dat *DaTokenizer) LoadFactor() float64 {
Akron4fa28b32021-08-27 10:55:41 +0200869 return float64(dat.TransCount()) / float64(len(dat.array)) * 100
Akrond66a9262021-08-03 17:09:09 +0200870}
871
Akron439f4ec2021-08-09 15:45:38 +0200872// Save stores the double array data in a file
Akron3a063ef2021-08-05 19:36:35 +0200873func (dat *DaTokenizer) Save(file string) (n int64, err error) {
874 f, err := os.Create(file)
875 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200876 log.Println(err)
Akron439f4ec2021-08-09 15:45:38 +0200877 return 0, err
Akron3a063ef2021-08-05 19:36:35 +0200878 }
879 defer f.Close()
880 gz := gzip.NewWriter(f)
881 defer gz.Close()
882 n, err = dat.WriteTo(gz)
883 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200884 log.Println(err)
Akron3a063ef2021-08-05 19:36:35 +0200885 return n, err
886 }
887 gz.Flush()
888 return n, nil
889}
890
891// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200892func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
893
Akron3a063ef2021-08-05 19:36:35 +0200894 wb := bufio.NewWriter(w)
895 defer wb.Flush()
896
Akron6247a5d2021-08-03 19:18:28 +0200897 // Store magical header
Akron3a063ef2021-08-05 19:36:35 +0200898 all, err := wb.Write([]byte(MAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200899 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200900 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200901 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200902 }
903
904 // Get sigma as a list
905 sigmalist := make([]rune, len(dat.sigma)+16)
906 max := 0
907 for sym, num := range dat.sigma {
908 sigmalist[num] = sym
909 if num > max {
910 max = num
911 }
912 }
913
914 sigmalist = sigmalist[:max+1]
915
Akron3f8571a2021-08-05 11:18:10 +0200916 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200917 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200918 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
919 bo.PutUint16(buf[4:6], uint16(dat.unknown))
920 bo.PutUint16(buf[6:8], uint16(dat.identity))
921 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200922 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akronf1a16502021-08-16 15:24:38 +0200923 bo.PutUint32(buf[12:16], uint32(len(dat.array)*2)) // Legacy support
Akron3a063ef2021-08-05 19:36:35 +0200924 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200925 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200926 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200927 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200928 }
929
930 all += more
931
Akron6247a5d2021-08-03 19:18:28 +0200932 // Write sigma
933 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200934
Akron3a063ef2021-08-05 19:36:35 +0200935 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200936 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200937 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200938 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200939 }
940 all += more
941 }
Akron439f4ec2021-08-09 15:45:38 +0200942
Akron6247a5d2021-08-03 19:18:28 +0200943 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200944 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200945 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200946 }
Akron6247a5d2021-08-03 19:18:28 +0200947
948 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200949 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200950 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200951 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200952 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200953 }
954 all += more
955
Akronf1a16502021-08-16 15:24:38 +0200956 // for x := 0; x < len(dat.array); x++ {
957 for _, bc := range dat.array {
958 bo.PutUint32(buf[0:4], bc.base)
959 more, err = wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200960 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200961 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200962 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200963 }
Akron439f4ec2021-08-09 15:45:38 +0200964 all += more
Akron3a063ef2021-08-05 19:36:35 +0200965 if more != 4 {
Akronf1a16502021-08-16 15:24:38 +0200966 log.Println("Can not write base uint32")
967 return int64(all), err
968 }
969 bo.PutUint32(buf[0:4], bc.check)
970 more, err = wb.Write(buf[0:4])
971 if err != nil {
972 log.Println(err)
973 return int64(all), err
974 }
975 all += more
976 if more != 4 {
977 log.Println("Can not write check uint32")
Akron3a063ef2021-08-05 19:36:35 +0200978 return int64(all), err
979 }
Akron6247a5d2021-08-03 19:18:28 +0200980 }
981
982 return int64(all), err
983}
984
Akron439f4ec2021-08-09 15:45:38 +0200985// LoadDatokFile reads a double array represented tokenizer
986// from a file.
Akron3f8571a2021-08-05 11:18:10 +0200987func LoadDatokFile(file string) *DaTokenizer {
988 f, err := os.Open(file)
989 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200990 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200991 return nil
Akron3f8571a2021-08-05 11:18:10 +0200992 }
993 defer f.Close()
994
995 gz, err := gzip.NewReader(f)
996 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200997 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200998 return nil
Akron3f8571a2021-08-05 11:18:10 +0200999 }
1000 defer gz.Close()
1001
Akron3a063ef2021-08-05 19:36:35 +02001002 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +02001003 return ParseDatok(gz)
1004}
1005
Akron439f4ec2021-08-09 15:45:38 +02001006// LoadDatokFile reads a double array represented tokenizer
1007// from an io.Reader
Akron3f8571a2021-08-05 11:18:10 +02001008func ParseDatok(ior io.Reader) *DaTokenizer {
1009
Akron439f4ec2021-08-09 15:45:38 +02001010 // Initialize tokenizer with default values
Akron3f8571a2021-08-05 11:18:10 +02001011 dat := &DaTokenizer{
1012 sigma: make(map[rune]int),
1013 epsilon: 0,
1014 unknown: 0,
1015 identity: 0,
1016 final: 0,
Akron4fa28b32021-08-27 10:55:41 +02001017 transCount: 0,
Akron3f8571a2021-08-05 11:18:10 +02001018 }
1019
1020 r := bufio.NewReader(ior)
1021
Akron3f8571a2021-08-05 11:18:10 +02001022 buf := make([]byte, 1024)
1023 buf = buf[0:len(MAGIC)]
1024
Akron439f4ec2021-08-09 15:45:38 +02001025 _, err := r.Read(buf)
Akron3f8571a2021-08-05 11:18:10 +02001026
1027 if err != nil {
Akron527c10c2021-08-13 01:45:18 +02001028 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +02001029 return nil
1030 }
1031
Akron3f8571a2021-08-05 11:18:10 +02001032 if string(MAGIC) != string(buf) {
Akron527c10c2021-08-13 01:45:18 +02001033 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +02001034 return nil
1035 }
1036
Akron439f4ec2021-08-09 15:45:38 +02001037 more, err := io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +02001038 if err != nil {
Akron527c10c2021-08-13 01:45:18 +02001039 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +02001040 return nil
1041 }
1042
Akron439f4ec2021-08-09 15:45:38 +02001043 if more != 16 {
Akron527c10c2021-08-13 01:45:18 +02001044 log.Println("Read bytes do not fit")
Akron439f4ec2021-08-09 15:45:38 +02001045 return nil
1046 }
Akron3f8571a2021-08-05 11:18:10 +02001047
Akron3a063ef2021-08-05 19:36:35 +02001048 version := bo.Uint16(buf[0:2])
1049
1050 if version != VERSION {
Akron527c10c2021-08-13 01:45:18 +02001051 log.Println("Version not compatible")
Akron3a063ef2021-08-05 19:36:35 +02001052 return nil
1053 }
1054
Akron3f8571a2021-08-05 11:18:10 +02001055 dat.epsilon = int(bo.Uint16(buf[2:4]))
1056 dat.unknown = int(bo.Uint16(buf[4:6]))
1057 dat.identity = int(bo.Uint16(buf[6:8]))
1058 dat.final = int(bo.Uint16(buf[8:10]))
1059
1060 sigmaCount := int(bo.Uint16(buf[10:12]))
Akronf1a16502021-08-16 15:24:38 +02001061 arraySize := int(bo.Uint32(buf[12:16])) / 2 // Legacy support
Akron3f8571a2021-08-05 11:18:10 +02001062
Akron3a063ef2021-08-05 19:36:35 +02001063 // Shouldn't be relevant though
1064 dat.maxSize = arraySize - 1
1065
Akron3f8571a2021-08-05 11:18:10 +02001066 for x := 0; x < sigmaCount; x++ {
Akron439f4ec2021-08-09 15:45:38 +02001067 sym, _, err := r.ReadRune()
Akron3f8571a2021-08-05 11:18:10 +02001068 if err == nil && sym != 0 {
Akronea46e8a2021-08-17 00:36:31 +02001069 if int(sym) < 256 {
1070 dat.sigmaASCII[int(sym)] = x
1071 }
Akron3f8571a2021-08-05 11:18:10 +02001072 dat.sigma[sym] = x
1073 }
Akron3f8571a2021-08-05 11:18:10 +02001074 }
1075
Akron439f4ec2021-08-09 15:45:38 +02001076 _, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +02001077
1078 if err != nil {
Akron527c10c2021-08-13 01:45:18 +02001079 log.Print(err)
Akron3f8571a2021-08-05 11:18:10 +02001080 return nil
1081 }
1082
Akron3f8571a2021-08-05 11:18:10 +02001083 if string("T") != string(buf[0:1]) {
Akron527c10c2021-08-13 01:45:18 +02001084 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +02001085 return nil
1086 }
1087
1088 // Read based on length
Akronf1a16502021-08-16 15:24:38 +02001089 dat.array = make([]bc, arraySize)
Akron3f8571a2021-08-05 11:18:10 +02001090
Akronbb4aac52021-08-13 00:52:27 +02001091 dataArray, err := io.ReadAll(r)
Akron439f4ec2021-08-09 15:45:38 +02001092
Akronbb4aac52021-08-13 00:52:27 +02001093 if err == io.EOF {
Akron527c10c2021-08-13 01:45:18 +02001094 log.Println(err)
Akronbb4aac52021-08-13 00:52:27 +02001095 return nil
1096 }
1097
Akronf1a16502021-08-16 15:24:38 +02001098 if len(dataArray) < arraySize*8 {
Akron527c10c2021-08-13 01:45:18 +02001099 log.Println("Not enough bytes read")
Akronbb4aac52021-08-13 00:52:27 +02001100 return nil
1101 }
1102
1103 for x := 0; x < arraySize; x++ {
Akronf1a16502021-08-16 15:24:38 +02001104 dat.array[x].base = bo.Uint32(dataArray[x*8 : (x*8)+4])
1105 dat.array[x].check = bo.Uint32(dataArray[(x*8)+4 : (x*8)+8])
Akron3f8571a2021-08-05 11:18:10 +02001106 }
1107
1108 return dat
1109}
1110
Akron439f4ec2021-08-09 15:45:38 +02001111// Show the current state of the buffer,
1112// for testing puroses
Akron3610f102021-08-08 14:13:25 +02001113func showBuffer(buffer []rune, buffo int, buffi int) string {
1114 out := make([]rune, 0, 1024)
1115 for x := 0; x < len(buffer); x++ {
1116 if buffi == x {
1117 out = append(out, '^')
1118 }
1119 if buffo == x {
1120 out = append(out, '[', buffer[x], ']')
1121 } else {
1122 out = append(out, buffer[x])
1123 }
1124 }
1125 return string(out)
1126}
1127
Akron84d68e62021-08-04 17:06:52 +02001128// Transduce an input string against the double array
Akron3610f102021-08-08 14:13:25 +02001129// FSA. The rules are always greedy. If the automaton fails,
1130// it takes the last possible token ending branch.
Akron068874c2021-08-04 15:19:56 +02001131//
Akron4db3ecf2021-08-11 18:49:03 +02001132// Based on Mizobuchi et al (2000), p. 129,
1133// with additional support for IDENTITY, UNKNOWN
1134// and EPSILON transitions and NONTOKEN and TOKENEND handling.
Akron3f8571a2021-08-05 11:18:10 +02001135func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron068874c2021-08-04 15:19:56 +02001136 var a int
Akronb4bbb472021-08-09 11:49:38 +02001137 var t0 uint32
Akronb7e1f132021-08-10 11:52:31 +02001138 t := uint32(1) // Initial state
1139 var ok, rewindBuffer bool
Akron068874c2021-08-04 15:19:56 +02001140
Akron3610f102021-08-08 14:13:25 +02001141 // Remember the last position of a possible tokenend,
1142 // in case the automaton fails.
1143 epsilonState := uint32(0)
1144 epsilonOffset := 0
1145
1146 // Implement a low level buffer for full control,
1147 // however - it is probably better to introduce
1148 // this on a higher level with a io.Reader interface
1149 // The buffer stores a single word and may have white
1150 // space at the end (but not at the beginning).
1151 //
1152 // This is the only backtracking requirement because of
1153 // epsilon transitions, to support tokenizations like:
1154 // "this is an example|.| And it works." vs
1155 // "this is an example.com| application."
Akronb7e1f132021-08-10 11:52:31 +02001156 //
1157 // TODO:
1158 // Store a translation buffer as well, so characters don't
1159 // have to be translated multiple times!
Akron3610f102021-08-08 14:13:25 +02001160 buffer := make([]rune, 1024)
1161 buffo := 0 // Buffer offset
1162 buffi := 0 // Buffer length
1163
Akron3f8571a2021-08-05 11:18:10 +02001164 reader := bufio.NewReader(r)
1165 writer := bufio.NewWriter(w)
1166 defer writer.Flush()
Akron068874c2021-08-04 15:19:56 +02001167
Akron3f8571a2021-08-05 11:18:10 +02001168 var char rune
1169 var err error
1170 eof := false
Akronb7e1f132021-08-10 11:52:31 +02001171 newchar := true
Akron3f8571a2021-08-05 11:18:10 +02001172
Akronc5d8d432021-08-10 16:48:44 +02001173PARSECHAR:
Akron3f8571a2021-08-05 11:18:10 +02001174 for {
1175
Akronb7e1f132021-08-10 11:52:31 +02001176 if newchar {
1177 // Get from reader if buffer is empty
1178 if buffo >= buffi {
Akron1594cb82021-08-11 11:14:56 +02001179 if eof {
1180 break
1181 }
Akronb7e1f132021-08-10 11:52:31 +02001182 char, _, err = reader.ReadRune()
Akron439f4ec2021-08-09 15:45:38 +02001183
Akronb7e1f132021-08-10 11:52:31 +02001184 // No more runes to read
1185 if err != nil {
1186 eof = true
1187 break
1188 }
1189 buffer[buffi] = char
1190 buffi++
Akron3f8571a2021-08-05 11:18:10 +02001191 }
Akronb7e1f132021-08-10 11:52:31 +02001192
1193 char = buffer[buffo]
1194
1195 if DEBUG {
1196 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
1197 }
1198
Akron6f1c16c2021-08-17 10:45:42 +02001199 // TODO:
1200 // Better not repeatedly check for a!
1201 // Possibly keep a buffer with a.
Akronea46e8a2021-08-17 00:36:31 +02001202 if int(char) < 256 {
1203 a = dat.sigmaASCII[int(char)]
1204 } else {
1205 a, ok = dat.sigma[char]
1206 if !ok {
1207 a = 0
1208 }
1209 }
Akronb7e1f132021-08-10 11:52:31 +02001210
1211 // Use identity symbol if character is not in sigma
Akronea46e8a2021-08-17 00:36:31 +02001212 if a == 0 && dat.identity != -1 {
Akronb7e1f132021-08-10 11:52:31 +02001213 a = dat.identity
1214 }
1215
1216 t0 = t
1217
1218 // Check for epsilon transitions and remember
Akronf1a16502021-08-16 15:24:38 +02001219 if dat.array[dat.array[t0].getBase()+uint32(dat.epsilon)].getCheck() == t0 {
Akronb7e1f132021-08-10 11:52:31 +02001220 // Remember state for backtracking to last tokenend state
1221 epsilonState = t0
1222 epsilonOffset = buffo
1223 }
Akron3f8571a2021-08-05 11:18:10 +02001224 }
Akron3610f102021-08-08 14:13:25 +02001225
Akronb7e1f132021-08-10 11:52:31 +02001226 // Checks a transition based on t0, a and buffo
Akronf1a16502021-08-16 15:24:38 +02001227 t = dat.array[t0].getBase() + uint32(a)
1228 ta := dat.array[t]
Akron068874c2021-08-04 15:19:56 +02001229
Akron524c5432021-08-05 14:14:27 +02001230 if DEBUG {
Akronb7e1f132021-08-10 11:52:31 +02001231 // Char is only relevant if set
1232 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
1233 if false {
1234 fmt.Println(dat.outgoing(t0))
1235 }
Akron524c5432021-08-05 14:14:27 +02001236 }
1237
Akronb7e1f132021-08-10 11:52:31 +02001238 // Check if the transition is invalid according to the double array
Akronf1a16502021-08-16 15:24:38 +02001239 if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
Akron068874c2021-08-04 15:19:56 +02001240
1241 if DEBUG {
Akronf1a16502021-08-16 15:24:38 +02001242 fmt.Println("Match is not fine!", t, "and", ta.getCheck(), "vs", t0)
Akron068874c2021-08-04 15:19:56 +02001243 }
1244
1245 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +02001246
Akron068874c2021-08-04 15:19:56 +02001247 // Try again with unknown symbol, in case identity failed
Akronb7e1f132021-08-10 11:52:31 +02001248 // Char is only relevant when set
Akron068874c2021-08-04 15:19:56 +02001249 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001250 fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +02001251 }
1252 a = dat.unknown
1253
1254 } else if a != dat.epsilon {
Akronb4bbb472021-08-09 11:49:38 +02001255
Akron068874c2021-08-04 15:19:56 +02001256 // Try again with epsilon symbol, in case everything else failed
Akronb4bbb472021-08-09 11:49:38 +02001257 t0 = epsilonState
Akron3610f102021-08-08 14:13:25 +02001258 epsilonState = 0 // reset
1259 buffo = epsilonOffset
Akron439f4ec2021-08-09 15:45:38 +02001260 a = dat.epsilon
1261
Akron3610f102021-08-08 14:13:25 +02001262 if DEBUG {
1263 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
1264 }
Akronb4bbb472021-08-09 11:49:38 +02001265
Akron068874c2021-08-04 15:19:56 +02001266 } else {
1267 break
1268 }
Akron068874c2021-08-04 15:19:56 +02001269
Akronb7e1f132021-08-10 11:52:31 +02001270 newchar = false
1271 continue
Akronb4bbb472021-08-09 11:49:38 +02001272 }
1273
Akronb7e1f132021-08-10 11:52:31 +02001274 // Transition was successful
1275 rewindBuffer = false
Akron439f4ec2021-08-09 15:45:38 +02001276
1277 // Transition consumes a character
Akronb4bbb472021-08-09 11:49:38 +02001278 if a != dat.epsilon {
1279
Akron3610f102021-08-08 14:13:25 +02001280 buffo++
Akronb4bbb472021-08-09 11:49:38 +02001281
Akron439f4ec2021-08-09 15:45:38 +02001282 // Transition does not produce a character
Akronf1a16502021-08-16 15:24:38 +02001283 if buffo == 1 && ta.isNonToken() {
Akron3610f102021-08-08 14:13:25 +02001284 if DEBUG {
1285 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
1286 }
Akron439f4ec2021-08-09 15:45:38 +02001287 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001288 }
Akron3f8571a2021-08-05 11:18:10 +02001289 }
Akron068874c2021-08-04 15:19:56 +02001290
Akronc5d8d432021-08-10 16:48:44 +02001291 // Transition marks the end of a token - so flush the buffer
Akronf1a16502021-08-16 15:24:38 +02001292 if ta.isTokenEnd() {
Akron524c5432021-08-05 14:14:27 +02001293
Akronc5d8d432021-08-10 16:48:44 +02001294 if buffi > 0 {
Akronc5d8d432021-08-10 16:48:44 +02001295 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +02001296 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
Akronc5d8d432021-08-10 16:48:44 +02001297 }
Akron01912fc2021-08-12 11:41:58 +02001298 writer.WriteString(string(buffer[:buffo]))
Akronc5d8d432021-08-10 16:48:44 +02001299 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001300 }
Akron1594cb82021-08-11 11:14:56 +02001301 if DEBUG {
1302 fmt.Println("-> Newline")
1303 }
1304 writer.WriteRune('\n')
Akron439f4ec2021-08-09 15:45:38 +02001305 }
Akron3610f102021-08-08 14:13:25 +02001306
Akronc5d8d432021-08-10 16:48:44 +02001307 // Rewind the buffer if necessary
Akron439f4ec2021-08-09 15:45:38 +02001308 if rewindBuffer {
1309
1310 // TODO: Better as a ring buffer
Akron3610f102021-08-08 14:13:25 +02001311 for x, i := range buffer[buffo:buffi] {
1312 buffer[x] = i
1313 }
Akronb4bbb472021-08-09 11:49:38 +02001314
Akron3610f102021-08-08 14:13:25 +02001315 buffi -= buffo
1316 epsilonOffset -= buffo
1317 buffo = 0
1318 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001319 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
Akron3610f102021-08-08 14:13:25 +02001320 }
Akron84d68e62021-08-04 17:06:52 +02001321 }
1322
Akronb7e1f132021-08-10 11:52:31 +02001323 // Move to representative state
Akronf1a16502021-08-16 15:24:38 +02001324 if ta.isSeparate() {
1325 t = ta.getBase()
1326 ta = dat.array[t]
Akronb7e1f132021-08-10 11:52:31 +02001327
1328 if DEBUG {
1329 fmt.Println("Representative pointing to", t)
1330 }
1331 }
1332
Akronc5d8d432021-08-10 16:48:44 +02001333 newchar = true
1334
Akron068874c2021-08-04 15:19:56 +02001335 // TODO:
1336 // Prevent endless epsilon loops!
1337 }
1338
Akron439f4ec2021-08-09 15:45:38 +02001339 // Input reader is not yet finished
Akron3f8571a2021-08-05 11:18:10 +02001340 if !eof {
Akron068874c2021-08-04 15:19:56 +02001341 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001342 fmt.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +02001343 }
1344 return false
1345 }
1346
Akronb7e1f132021-08-10 11:52:31 +02001347 if DEBUG {
1348 fmt.Println("Entering final check")
1349 }
1350
Akronc5d8d432021-08-10 16:48:44 +02001351 // Automaton is in a final state, so flush the buffer and return
Akronf1a16502021-08-16 15:24:38 +02001352 x := dat.array[t].getBase() + uint32(dat.final)
1353
1354 if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
Akronb4bbb472021-08-09 11:49:38 +02001355
1356 if buffi > 0 {
Akronb4bbb472021-08-09 11:49:38 +02001357 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +02001358 fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
Akron3f8571a2021-08-05 11:18:10 +02001359 }
Akron01912fc2021-08-12 11:41:58 +02001360 writer.WriteString(string(buffer[:buffi]))
Akron6e70dc82021-08-11 11:33:18 +02001361
Akronf1a16502021-08-16 15:24:38 +02001362 if dat.array[t].isTokenEnd() {
Akrondf0a3ef2021-08-09 15:53:45 +02001363 writer.WriteRune('\n')
Akronc5d8d432021-08-10 16:48:44 +02001364 if DEBUG {
1365 fmt.Println("-> Newline")
1366 }
Akrondf0a3ef2021-08-09 15:53:45 +02001367 }
Akron84d68e62021-08-04 17:06:52 +02001368 }
1369
Akron6e70dc82021-08-11 11:33:18 +02001370 // Add an additional sentence ending, if the file is over but no explicit
1371 // sentence split was reached. This may be controversial and therefore
1372 // optional via parameter.
Akronf1a16502021-08-16 15:24:38 +02001373 if !dat.array[t0].isTokenEnd() {
Akron6e70dc82021-08-11 11:33:18 +02001374 writer.WriteRune('\n')
1375 if DEBUG {
1376 fmt.Println("-> Newline")
1377 }
1378 }
1379
Akrone61380b2021-08-16 10:10:46 +02001380 // TODO:
1381 // There may be a new line at the end, from an epsilon,
1382 // so we may need to go on!
Akron068874c2021-08-04 15:19:56 +02001383 return true
1384 }
1385
Akronc5d8d432021-08-10 16:48:44 +02001386 // Check epsilon transitions until a final state is reached
1387 t0 = t
Akronf1a16502021-08-16 15:24:38 +02001388 t = dat.array[t0].getBase() + uint32(dat.epsilon)
Akron01912fc2021-08-12 11:41:58 +02001389 a = dat.epsilon
1390 newchar = false
Akronf1a16502021-08-16 15:24:38 +02001391 if dat.array[t].getCheck() == t0 {
Akronc5d8d432021-08-10 16:48:44 +02001392 // Remember state for backtracking to last tokenend state
Akronc5d8d432021-08-10 16:48:44 +02001393 goto PARSECHAR
Akrone61380b2021-08-16 10:10:46 +02001394
Akronc5d8d432021-08-10 16:48:44 +02001395 } else if epsilonState != 0 {
Akronb7e1f132021-08-10 11:52:31 +02001396 t0 = epsilonState
1397 epsilonState = 0 // reset
1398 buffo = epsilonOffset
Akron068874c2021-08-04 15:19:56 +02001399 if DEBUG {
Akronc5d8d432021-08-10 16:48:44 +02001400 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
Akron068874c2021-08-04 15:19:56 +02001401 }
Akronc5d8d432021-08-10 16:48:44 +02001402 goto PARSECHAR
Akron068874c2021-08-04 15:19:56 +02001403 }
Akronc5d8d432021-08-10 16:48:44 +02001404 return false
Akron068874c2021-08-04 15:19:56 +02001405}