blob: 5dfe0ca5eb3a8d19902a26ca1fd785f7f11c7730 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
Akron03c92fe2021-08-09 14:07:57 +020013//
14// Serialization is little endian.
Akron83e75a22021-08-04 13:14:06 +020015
Akron740f3d72021-08-03 12:12:34 +020016// TODO:
17// - replace maxSize with the check value
Akron7b1faa62021-09-02 16:10:21 +020018// - Check if final states can be optional.
19// - Introduce ELM (Morita et al. 2001) to speed
20// up construction. Can be ignored on serialization
21// to improve compression.
Akron3a063ef2021-08-05 19:36:35 +020022// - Add checksum to serialization.
Akrone61380b2021-08-16 10:10:46 +020023// - Replace/Enhance table with a map
24// - Provide a bufio.Scanner compatible interface.
Akron8e1d69b2021-08-12 17:38:49 +020025// - Mark epsilon transitions in bytes
Akron740f3d72021-08-03 12:12:34 +020026
Akron8ef408b2021-08-02 22:11:04 +020027import (
28 "bufio"
29 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020030 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020031 "fmt"
32 "io"
Akron679b4862021-09-02 16:59:26 +020033 "math"
Akron8ef408b2021-08-02 22:11:04 +020034 "os"
Akronc9d84a62021-08-03 15:56:03 +020035 "sort"
Akron8ef408b2021-08-02 22:11:04 +020036 "strconv"
37 "strings"
38 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020039
Akron527c10c2021-08-13 01:45:18 +020040 "log"
Akron8ef408b2021-08-02 22:11:04 +020041)
42
43const (
Akron2a4b9292021-08-04 15:35:22 +020044 PROPS = 1
45 SIGMA = 2
46 STATES = 3
47 NONE = 4
Akron2a4b9292021-08-04 15:35:22 +020048 DEBUG = false
49 MAGIC = "DATOK"
50 VERSION = uint16(1)
Akron03c92fe2021-08-09 14:07:57 +020051 FIRSTBIT uint32 = 1 << 31
52 SECONDBIT uint32 = 1 << 30
53 RESTBIT uint32 = ^uint32(0) &^ (FIRSTBIT | SECONDBIT)
Akron8ef408b2021-08-02 22:11:04 +020054)
55
Akron03c92fe2021-08-09 14:07:57 +020056// Serialization is always little endian
Akron6247a5d2021-08-03 19:18:28 +020057var bo binary.ByteOrder = binary.LittleEndian
58
Akron8ef408b2021-08-02 22:11:04 +020059type mapping struct {
60 source int
Akron3fdfec62021-08-04 11:40:10 +020061 target uint32
Akron8ef408b2021-08-02 22:11:04 +020062}
63
64type edge struct {
Akron83e75a22021-08-04 13:14:06 +020065 inSym int
66 outSym int
67 end int
68 nontoken bool
69 tokenend bool
Akron8ef408b2021-08-02 22:11:04 +020070}
71
Akron03c92fe2021-08-09 14:07:57 +020072// Tokenizer is the intermediate representation
73// of the tokenizer.
Akron8ef408b2021-08-02 22:11:04 +020074type Tokenizer struct {
Akron740f3d72021-08-03 12:12:34 +020075 sigmaRev map[int]rune
Akron31f3c062021-08-27 10:15:13 +020076 sigmaMCS map[int]string
Akron740f3d72021-08-03 12:12:34 +020077 arcCount int
Akron740f3d72021-08-03 12:12:34 +020078 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020079 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020080
81 // Special symbols in sigma
82 epsilon int
83 unknown int
84 identity int
85 final int
Akron03c92fe2021-08-09 14:07:57 +020086 tokenend int
Akron8ef408b2021-08-02 22:11:04 +020087}
88
Akronf1a16502021-08-16 15:24:38 +020089type bc struct {
90 base uint32
91 check uint32
92}
93
Akron03c92fe2021-08-09 14:07:57 +020094// DaTokenizer represents a tokenizer implemented as a
95// Double Array FSA.
Akronf2120ca2021-08-03 16:26:41 +020096type DaTokenizer struct {
Akronea46e8a2021-08-17 00:36:31 +020097 sigma map[rune]int
98 sigmaASCII [256]int
Akron03a3c612021-08-04 11:51:27 +020099 maxSize int
Akron4fa28b32021-08-27 10:55:41 +0200100 transCount int
Akronf1a16502021-08-16 15:24:38 +0200101 array []bc
Akronc17f1ca2021-08-03 19:47:27 +0200102
103 // Special symbols in sigma
104 epsilon int
105 unknown int
106 identity int
107 final int
Akron03c92fe2021-08-09 14:07:57 +0200108 tokenend int
Akronf2120ca2021-08-03 16:26:41 +0200109}
110
Akron03c92fe2021-08-09 14:07:57 +0200111// ParseFoma reads the FST from a foma file
112// and creates an internal representation,
113// in case it follows the tokenizer's convention.
Akron64ffd9a2021-08-03 19:55:21 +0200114func LoadFomaFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200115 f, err := os.Open(file)
116 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200117 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200118 return nil
Akron8ef408b2021-08-02 22:11:04 +0200119 }
120 defer f.Close()
121
122 gz, err := gzip.NewReader(f)
123 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200124 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200125 return nil
Akron8ef408b2021-08-02 22:11:04 +0200126 }
127 defer gz.Close()
128
Akron3fdfec62021-08-04 11:40:10 +0200129 return ParseFoma(gz)
Akron8ef408b2021-08-02 22:11:04 +0200130}
131
Akron03c92fe2021-08-09 14:07:57 +0200132// ParseFoma reads the FST from a foma file reader
133// and creates an internal representation,
134// in case it follows the tokenizer's convention.
Akron3fdfec62021-08-04 11:40:10 +0200135func ParseFoma(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200136 r := bufio.NewReader(ior)
137
138 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200139 sigmaRev: make(map[int]rune),
Akron31f3c062021-08-27 10:15:13 +0200140 sigmaMCS: make(map[int]string),
Akronc17f1ca2021-08-03 19:47:27 +0200141 epsilon: -1,
142 unknown: -1,
143 identity: -1,
144 final: -1,
Akron03c92fe2021-08-09 14:07:57 +0200145 tokenend: -1,
Akron8ef408b2021-08-02 22:11:04 +0200146 }
147
Akron740f3d72021-08-03 12:12:34 +0200148 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200149
150 mode := 0
151 var elem []string
152 var elemint [5]int
153
Akron03c92fe2021-08-09 14:07:57 +0200154 // Iterate over all lines of the file.
155 // This is mainly based on foma2js,
156 // licensed under the Apache License, version 2,
157 // and written by Mans Hulden.
Akron8ef408b2021-08-02 22:11:04 +0200158 for {
159 line, err := r.ReadString('\n')
160 if err != nil {
161 if err == io.EOF {
162 break
163 }
Akron527c10c2021-08-13 01:45:18 +0200164 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200165 return nil
Akron8ef408b2021-08-02 22:11:04 +0200166 }
Akron8ef408b2021-08-02 22:11:04 +0200167
Akron439f4ec2021-08-09 15:45:38 +0200168 // Read parser mode for the following lines
169 if strings.HasPrefix(line, "##") {
170 if strings.HasPrefix(line, "##props##") {
171 mode = PROPS
172
173 } else if strings.HasPrefix(line, "##states##") {
174 mode = STATES
175
176 // Adds a final transition symbol to sigma
177 // written as '#' in Mizobuchi et al (2000)
178 tok.sigmaCount++
179 tok.final = tok.sigmaCount
180
181 } else if strings.HasPrefix(line, "##sigma##") {
182
183 mode = SIGMA
184
185 } else if strings.HasPrefix(line, "##end##") {
186
187 mode = NONE
188
189 } else if !strings.HasPrefix(line, "##foma-net") {
Akron527c10c2021-08-13 01:45:18 +0200190 log.Print("Unknown input line")
Akron439f4ec2021-08-09 15:45:38 +0200191 break
192 }
Akron8ef408b2021-08-02 22:11:04 +0200193 continue
194 }
195
Akron439f4ec2021-08-09 15:45:38 +0200196 // Based on the current parser mode, interpret the lines
Akron8ef408b2021-08-02 22:11:04 +0200197 switch mode {
198 case PROPS:
199 {
200 elem = strings.Split(line, " ")
201 /*
202 fmt.Println("arity: " + elem[0])
203 fmt.Println("arccount: " + elem[1])
204 fmt.Println("statecount: " + elem[2])
205 fmt.Println("linecount: " + elem[3])
206 fmt.Println("finalcount: " + elem[4])
207 fmt.Println("pathcount: " + elem[5])
208 fmt.Println("is_deterministic: " + elem[6])
209 fmt.Println("is_pruned: " + elem[7])
210 fmt.Println("is_minimized: " + elem[8])
211 fmt.Println("is_epsilon_free: " + elem[9])
212 fmt.Println("is_loop_free: " + elem[10])
213 fmt.Println("extras: " + elem[11])
214 fmt.Println("name: " + elem[12])
215 */
216 if elem[6] != "1" {
Akron527c10c2021-08-13 01:45:18 +0200217 log.Print("The FST needs to be deterministic")
Akron4db3ecf2021-08-11 18:49:03 +0200218 return nil
Akron8ef408b2021-08-02 22:11:04 +0200219 }
Akron439f4ec2021-08-09 15:45:38 +0200220
Akron8ef408b2021-08-02 22:11:04 +0200221 if elem[9] != "1" {
Akron527c10c2021-08-13 01:45:18 +0200222 log.Print("The FST needs to be epsilon free")
Akron4db3ecf2021-08-11 18:49:03 +0200223 return nil
Akron8ef408b2021-08-02 22:11:04 +0200224 }
225
226 elemint[0], err = strconv.Atoi(elem[1])
227 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200228 log.Print("Can't read arccount")
Akron4db3ecf2021-08-11 18:49:03 +0200229 return nil
Akron8ef408b2021-08-02 22:11:04 +0200230 }
Akron740f3d72021-08-03 12:12:34 +0200231 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200232
Akron8ef408b2021-08-02 22:11:04 +0200233 elemint[0], err = strconv.Atoi(elem[2])
234 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200235 log.Print("Can't read statecount")
Akron4db3ecf2021-08-11 18:49:03 +0200236 return nil
Akron8ef408b2021-08-02 22:11:04 +0200237 }
Akron439f4ec2021-08-09 15:45:38 +0200238
239 // States start at 1 in Mizobuchi et al (2000),
240 // as the state 0 is associated with a fail.
241 // Initialize states and transitions
Akron8ef408b2021-08-02 22:11:04 +0200242 tok.transitions = make([]map[int]*edge, elemint[0]+1)
243 continue
244 }
245 case STATES:
246 {
247 elem = strings.Split(line[0:len(line)-1], " ")
248 if elem[0] == "-1" {
Akron3610f102021-08-08 14:13:25 +0200249 if DEBUG {
250 fmt.Println("Skip", elem)
251 }
Akron8ef408b2021-08-02 22:11:04 +0200252 continue
253 }
254 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200255 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200256 fmt.Println("Unable to translate", elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200257 break
258 }
Akron8ef408b2021-08-02 22:11:04 +0200259
260 if len(elem) > 1 {
261 elemint[1], err = strconv.Atoi(elem[1])
262 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200263 fmt.Println("Unable to translate", elem[1])
Akron8ef408b2021-08-02 22:11:04 +0200264 break
265 }
266 if len(elem) > 2 {
267 elemint[2], err = strconv.Atoi(elem[2])
268 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200269 fmt.Println("Unable to translate", elem[2])
Akron8ef408b2021-08-02 22:11:04 +0200270 break
271 }
272 if len(elem) > 3 {
273 elemint[3], err = strconv.Atoi(elem[3])
274 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200275 fmt.Println("Unable to translate", elem[3])
Akron8ef408b2021-08-02 22:11:04 +0200276 break
277 }
278 if len(elem) > 4 {
279 elemint[4], err = strconv.Atoi(elem[4])
280 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200281 fmt.Println("Unable to translate", elem[4])
Akron8ef408b2021-08-02 22:11:04 +0200282 break
283 }
284 }
285 }
286 }
287 }
288
289 switch len(elem) {
290 case 5:
291 {
Akron740f3d72021-08-03 12:12:34 +0200292 state = elemint[0]
293 inSym = elemint[1]
294 outSym = elemint[2]
295 end = elemint[3]
296 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200297 }
298 case 4:
299 {
300 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200301 state = elemint[0]
302 final = elemint[3]
Akron0630be52021-08-28 09:06:16 +0200303
304 // Final state that has no outgoing edges
305 if final == 1 {
306
307 // Initialize outgoing states
308 if tok.transitions[state+1] == nil {
309 tok.transitions[state+1] = make(map[int]*edge)
310 }
311
312 // TODO:
313 // Maybe this is less relevant for tokenizers
314 tok.transitions[state+1][tok.final] = &edge{}
315 }
316 continue
Akron8ef408b2021-08-02 22:11:04 +0200317 } else {
Akron740f3d72021-08-03 12:12:34 +0200318 state = elemint[0]
319 inSym = elemint[1]
320 end = elemint[2]
321 final = elemint[3]
322 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200323 }
324 }
325 case 3:
326 {
Akron740f3d72021-08-03 12:12:34 +0200327 inSym = elemint[0]
328 outSym = elemint[1]
329 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200330 }
331 case 2:
332 {
Akron740f3d72021-08-03 12:12:34 +0200333 inSym = elemint[0]
334 end = elemint[1]
335 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200336 }
337 }
338
Akron83e75a22021-08-04 13:14:06 +0200339 nontoken := false
340 tokenend := false
341
Akron439f4ec2021-08-09 15:45:38 +0200342 // While the states in foma start with 0, the states in the
343 // Mizobuchi FSA start with one - so we increase every state by 1.
344 // We also increase sigma by 1, so there are no 0 transitions.
Akron524c5432021-08-05 14:14:27 +0200345 inSym++
346 outSym++
347
Akron439f4ec2021-08-09 15:45:38 +0200348 // Only a limited list of transitions are allowed
Akron740f3d72021-08-03 12:12:34 +0200349 if inSym != outSym {
Akron01912fc2021-08-12 11:41:58 +0200350 if outSym == tok.tokenend && inSym == tok.epsilon {
Akron83e75a22021-08-04 13:14:06 +0200351 tokenend = true
352 } else if outSym == tok.epsilon {
353 nontoken = true
354 } else {
Akron527c10c2021-08-13 01:45:18 +0200355 log.Println(
Akron740f3d72021-08-03 12:12:34 +0200356 "Unsupported transition: " +
357 strconv.Itoa(state) +
358 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200359 " (" +
Akron740f3d72021-08-03 12:12:34 +0200360 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200361 ":" +
Akron740f3d72021-08-03 12:12:34 +0200362 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200363 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200364 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200365 ":" +
Akron740f3d72021-08-03 12:12:34 +0200366 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200367 ")")
Akron4db3ecf2021-08-11 18:49:03 +0200368 return nil
Akron75ebe7f2021-08-03 10:34:10 +0200369 }
Akron92704eb2021-08-27 10:59:46 +0200370 } else if inSym == tok.tokenend {
371 // Ignore tokenend accepting arcs
372 continue
Akron83e75a22021-08-04 13:14:06 +0200373 } else if inSym == tok.epsilon {
Akron527c10c2021-08-13 01:45:18 +0200374 log.Println("General epsilon transitions are not supported")
Akron4db3ecf2021-08-11 18:49:03 +0200375 return nil
Akron31f3c062021-08-27 10:15:13 +0200376 } else if tok.sigmaMCS[inSym] != "" {
Akron34dbe972021-08-29 17:44:34 +0200377 // log.Fatalln("Non supported character", tok.sigmaMCS[inSym])
378 // Ignore MCS transitions
379 continue
Akron8ef408b2021-08-02 22:11:04 +0200380 }
381
Akron03c92fe2021-08-09 14:07:57 +0200382 // Create an edge based on the collected information
Akron8ef408b2021-08-02 22:11:04 +0200383 targetObj := &edge{
Akron83e75a22021-08-04 13:14:06 +0200384 inSym: inSym,
385 outSym: outSym,
386 end: end + 1,
387 tokenend: tokenend,
388 nontoken: nontoken,
Akron8ef408b2021-08-02 22:11:04 +0200389 }
390
Akron740f3d72021-08-03 12:12:34 +0200391 // Initialize outgoing states
392 if tok.transitions[state+1] == nil {
393 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200394 }
395
Akron740f3d72021-08-03 12:12:34 +0200396 // Ignore transitions with invalid symbols
397 if inSym >= 0 {
398 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200399 }
Akron8ef408b2021-08-02 22:11:04 +0200400
Akron740f3d72021-08-03 12:12:34 +0200401 // Add final transition
402 if final == 1 {
Akron03c92fe2021-08-09 14:07:57 +0200403 // TODO:
404 // Maybe this is less relevant for tokenizers
Akronc17f1ca2021-08-03 19:47:27 +0200405 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200406 }
407
Akronb4bbb472021-08-09 11:49:38 +0200408 if DEBUG {
Akron740f3d72021-08-03 12:12:34 +0200409 fmt.Println("Add",
410 state+1, "->", end+1,
411 "(",
412 inSym,
413 ":",
414 outSym,
415 ") (",
416 string(tok.sigmaRev[inSym]),
417 ":",
418 string(tok.sigmaRev[outSym]),
Akron524c5432021-08-05 14:14:27 +0200419 ")",
420 ";",
421 "TE:", tokenend,
Akron3610f102021-08-08 14:13:25 +0200422 "NT:", nontoken,
423 "FIN:", final)
Akron740f3d72021-08-03 12:12:34 +0200424 }
Akron75ebe7f2021-08-03 10:34:10 +0200425
Akron8ef408b2021-08-02 22:11:04 +0200426 continue
427 }
428 case SIGMA:
429 {
430 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
431
432 // Turn string into sigma id
433 number, err := strconv.Atoi(elem[0])
434
Akron524c5432021-08-05 14:14:27 +0200435 // ID needs to be > 1
436 number++
437
Akron8ef408b2021-08-02 22:11:04 +0200438 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200439 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200440 return nil
Akron8ef408b2021-08-02 22:11:04 +0200441 }
442
Akron740f3d72021-08-03 12:12:34 +0200443 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200444
445 var symbol rune
446
447 // Read rune
448 if utf8.RuneCountInString(elem[1]) == 1 {
449 symbol = []rune(elem[1])[0]
450
Akron8ef408b2021-08-02 22:11:04 +0200451 } else if utf8.RuneCountInString(elem[1]) > 1 {
Akron439f4ec2021-08-09 15:45:38 +0200452
453 // Probably a MCS
Akron8ef408b2021-08-02 22:11:04 +0200454 switch elem[1] {
455 case "@_EPSILON_SYMBOL_@":
456 {
Akronc17f1ca2021-08-03 19:47:27 +0200457 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200458 }
459 case "@_UNKNOWN_SYMBOL_@":
460 {
Akronc17f1ca2021-08-03 19:47:27 +0200461 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200462 }
463
464 case "@_IDENTITY_SYMBOL_@":
465 {
Akronc17f1ca2021-08-03 19:47:27 +0200466 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200467 }
Akron03c92fe2021-08-09 14:07:57 +0200468
469 case "@_TOKEN_SYMBOL_@":
470 {
471 tok.tokenend = number
Akron03c92fe2021-08-09 14:07:57 +0200472 }
Akron8ef408b2021-08-02 22:11:04 +0200473 default:
Akron740f3d72021-08-03 12:12:34 +0200474 {
Akron31f3c062021-08-27 10:15:13 +0200475 // MCS not supported
476 tok.sigmaMCS[number] = line
Akron740f3d72021-08-03 12:12:34 +0200477 }
Akron8ef408b2021-08-02 22:11:04 +0200478 }
Akron439f4ec2021-08-09 15:45:38 +0200479 continue
Akron8ef408b2021-08-02 22:11:04 +0200480
Akron740f3d72021-08-03 12:12:34 +0200481 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200482 line, err = r.ReadString('\n')
483 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200484 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200485 return nil
Akron8ef408b2021-08-02 22:11:04 +0200486 }
487 if len(line) != 1 {
Akron31f3c062021-08-27 10:15:13 +0200488 // MCS not supported
489 tok.sigmaMCS[number] = line
490 continue
Akron8ef408b2021-08-02 22:11:04 +0200491 }
Akron03c92fe2021-08-09 14:07:57 +0200492 symbol = rune('\n')
Akron8ef408b2021-08-02 22:11:04 +0200493 }
494
Akron740f3d72021-08-03 12:12:34 +0200495 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200496 }
497 }
498 }
Akron31f3c062021-08-27 10:15:13 +0200499 tok.sigmaMCS = nil
Akron8ef408b2021-08-02 22:11:04 +0200500 return tok
501}
502
Akron64ffd9a2021-08-03 19:55:21 +0200503// Set alphabet A to the list of all symbols
504// outgoing from s
Akron439f4ec2021-08-09 15:45:38 +0200505func (tok *Tokenizer) getSet(s int, A *[]int) {
Akron64ffd9a2021-08-03 19:55:21 +0200506 for a := range tok.transitions[s] {
507 *A = append(*A, a)
508 }
509
510 // Not required, but simplifies bug hunting
Akron439f4ec2021-08-09 15:45:38 +0200511 // sort.Ints(*A)
Akron64ffd9a2021-08-03 19:55:21 +0200512}
513
Akron439f4ec2021-08-09 15:45:38 +0200514// ToDoubleArray turns the intermediate tokenizer representation
515// into a double array representation.
516//
517// This is based on Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200518func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
519
520 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +0200521 sigma: make(map[rune]int),
Akron4fa28b32021-08-27 10:55:41 +0200522 transCount: -1,
Akron03a3c612021-08-04 11:51:27 +0200523 final: tok.final,
524 unknown: tok.unknown,
525 identity: tok.identity,
526 epsilon: tok.epsilon,
Akron03c92fe2021-08-09 14:07:57 +0200527 tokenend: tok.tokenend,
Akronf2120ca2021-08-03 16:26:41 +0200528 }
529
Akronf1a16502021-08-16 15:24:38 +0200530 dat.resize(dat.final)
531
Akronf2120ca2021-08-03 16:26:41 +0200532 for num, sym := range tok.sigmaRev {
Akronea46e8a2021-08-17 00:36:31 +0200533 if int(sym) < 256 {
534 dat.sigmaASCII[int(sym)] = num
535 }
Akronf2120ca2021-08-03 16:26:41 +0200536 dat.sigma[sym] = num
537 }
Akron8ef408b2021-08-02 22:11:04 +0200538
539 mark := 0
540 size := 0
Akron6f1c16c2021-08-17 10:45:42 +0200541 var base uint32
Akronde18e902021-08-27 09:34:12 +0200542 var atrans *edge
543 var s, s1 int
544 var t, t1 uint32
Akron8ef408b2021-08-02 22:11:04 +0200545
Akron439f4ec2021-08-09 15:45:38 +0200546 // Create a mapping from s (in Ms aka Intermediate FSA)
547 // to t (in Mt aka Double Array FSA)
Akron740f3d72021-08-03 12:12:34 +0200548 table := make([]*mapping, tok.arcCount+1)
Akron7b1faa62021-09-02 16:10:21 +0200549 // tableQueue := make([]int, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200550
Akron439f4ec2021-08-09 15:45:38 +0200551 // Initialize with the start state
Akron8ef408b2021-08-02 22:11:04 +0200552 table[size] = &mapping{source: 1, target: 1}
Akron7b1faa62021-09-02 16:10:21 +0200553 // tableQueue[size] = 1
Akron8ef408b2021-08-02 22:11:04 +0200554 size++
555
Akron740f3d72021-08-03 12:12:34 +0200556 // Allocate space for the outgoing symbol range
557 A := make([]int, 0, tok.sigmaCount)
Akron7b1faa62021-09-02 16:10:21 +0200558 // tableLookup := make([]uint32, tok.arcCount+2) // make(map[int]uint32)
559 // tableLookup[1] = 1
Akron8ef408b2021-08-02 22:11:04 +0200560
Akron7b1faa62021-09-02 16:10:21 +0200561 // block_begin_pos := uint32(1)
Akronde18e902021-08-27 09:34:12 +0200562
Akron8ef408b2021-08-02 22:11:04 +0200563 for mark < size {
Akronde18e902021-08-27 09:34:12 +0200564 s = table[mark].source // This is a state in Ms
565 t = table[mark].target // This is a state in Mt
Akron7b1faa62021-09-02 16:10:21 +0200566 // s = tableQueue[mark]
567 // t = tableLookup[s]
Akron8ef408b2021-08-02 22:11:04 +0200568 mark++
Akron740f3d72021-08-03 12:12:34 +0200569
570 // Following the paper, here the state t can be remembered
571 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200572 A = A[:0]
Akron439f4ec2021-08-09 15:45:38 +0200573 tok.getSet(s, &A)
Akron8ef408b2021-08-02 22:11:04 +0200574
Akron740f3d72021-08-03 12:12:34 +0200575 // Set base to the first free slot in the double array
Akron29e306f2021-09-02 18:29:56 +0200576 // base = dat.xCheck(A)
Akron679b4862021-09-02 16:59:26 +0200577 // base = dat.xCheckSkip(A)
Akron7b1faa62021-09-02 16:10:21 +0200578 // base = dat.xCheckNiu(A, &block_begin_pos)
Akron29e306f2021-09-02 18:29:56 +0200579 base = dat.xCheckSkipNiu(A)
Akron6f1c16c2021-08-17 10:45:42 +0200580 dat.array[t].setBase(base)
Akron8ef408b2021-08-02 22:11:04 +0200581
Akron773b1ef2021-08-03 17:37:20 +0200582 // TODO:
Akron068874c2021-08-04 15:19:56 +0200583 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200584 // outdegree of .end
585
Akron740f3d72021-08-03 12:12:34 +0200586 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200587 for _, a := range A {
588
Akronc17f1ca2021-08-03 19:47:27 +0200589 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200590
Akronde18e902021-08-27 09:34:12 +0200591 atrans = tok.transitions[s][a]
592
Akron740f3d72021-08-03 12:12:34 +0200593 // Aka g(s, a)
Akronde18e902021-08-27 09:34:12 +0200594 s1 = atrans.end
Akron8ef408b2021-08-02 22:11:04 +0200595
Akron740f3d72021-08-03 12:12:34 +0200596 // Store the transition
Akronde18e902021-08-27 09:34:12 +0200597 t1 = base + uint32(a)
Akronf1a16502021-08-16 15:24:38 +0200598 dat.array[t1].setCheck(t)
599
600 // Set maxSize
601 if dat.maxSize < int(t1) {
602 dat.maxSize = int(t1)
603 }
Akron8ef408b2021-08-02 22:11:04 +0200604
Akron439f4ec2021-08-09 15:45:38 +0200605 if DEBUG {
Akron524c5432021-08-05 14:14:27 +0200606 fmt.Println("Translate transition",
607 s, "->", s1, "(", a, ")", "to", t, "->", t1)
608 }
609
Akron83e75a22021-08-04 13:14:06 +0200610 // Mark the state as being the target of a nontoken transition
Akronde18e902021-08-27 09:34:12 +0200611 if atrans.nontoken {
Akronf1a16502021-08-16 15:24:38 +0200612 dat.array[t1].setNonToken(true)
Akron524c5432021-08-05 14:14:27 +0200613 if DEBUG {
614 fmt.Println("Set", t1, "to nontoken")
615 }
Akron83e75a22021-08-04 13:14:06 +0200616 }
617
Akron84d68e62021-08-04 17:06:52 +0200618 // Mark the state as being the target of a tokenend transition
Akronde18e902021-08-27 09:34:12 +0200619 if atrans.tokenend {
Akronf1a16502021-08-16 15:24:38 +0200620 dat.array[t1].setTokenEnd(true)
Akron524c5432021-08-05 14:14:27 +0200621 if DEBUG {
622 fmt.Println("Set", t1, "to tokenend")
623 }
Akron84d68e62021-08-04 17:06:52 +0200624 }
625
Akron740f3d72021-08-03 12:12:34 +0200626 // Check for representative states
Akron439f4ec2021-08-09 15:45:38 +0200627 r := stateAlreadyInTable(s1, table, size)
Akronde18e902021-08-27 09:34:12 +0200628 // r := tableLookup[s1]
Akron740f3d72021-08-03 12:12:34 +0200629
Akron439f4ec2021-08-09 15:45:38 +0200630 // No representative found
Akron8ef408b2021-08-02 22:11:04 +0200631 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200632 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200633 table[size] = &mapping{source: s1, target: t1}
Akron7b1faa62021-09-02 16:10:21 +0200634 // tableQueue[size] = s1
Akronde18e902021-08-27 09:34:12 +0200635 // tableLookup[s1] = t1
Akron8ef408b2021-08-02 22:11:04 +0200636 size++
637 } else {
Akron740f3d72021-08-03 12:12:34 +0200638 // Overwrite with the representative state
Akronf1a16502021-08-16 15:24:38 +0200639 dat.array[t1].setBase(r)
640 dat.array[t1].setSeparate(true)
Akron8ef408b2021-08-02 22:11:04 +0200641 }
642 } else {
Akron740f3d72021-08-03 12:12:34 +0200643 // Store a final transition
Akron6f1c16c2021-08-17 10:45:42 +0200644 dat.array[base+uint32(dat.final)].setCheck(t)
Akronf1a16502021-08-16 15:24:38 +0200645
Akronde18e902021-08-27 09:34:12 +0200646 if dat.maxSize < int(base)+dat.final {
647 dat.maxSize = int(base) + dat.final
Akronf1a16502021-08-16 15:24:38 +0200648 }
Akron8ef408b2021-08-02 22:11:04 +0200649 }
650 }
651 }
652
653 // Following Mizobuchi et al (2000) the size of the
654 // FSA should be stored in check(1).
Akron29e306f2021-09-02 18:29:56 +0200655 // We make the size a bit larger so we never have to check for boundaries.
656 dat.setSize(dat.maxSize + dat.final)
657 if len(dat.array) < dat.maxSize+dat.final {
658 dat.array = append(dat.array, make([]bc, dat.final)...)
659 }
660 dat.array = dat.array[:dat.maxSize+dat.final]
Akronf2120ca2021-08-03 16:26:41 +0200661 return dat
Akron8ef408b2021-08-02 22:11:04 +0200662}
663
Akron8ef408b2021-08-02 22:11:04 +0200664// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200665// exists and return this as a representative.
666// Currently iterates through the whole table
667// in a bruteforce manner.
Akron439f4ec2021-08-09 15:45:38 +0200668func stateAlreadyInTable(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200669 for x := 0; x < size; x++ {
670 if table[x].source == s {
671 return table[x].target
672 }
673 }
674 return 0
675}
676
Akron64ffd9a2021-08-03 19:55:21 +0200677// Resize double array when necessary
678func (dat *DaTokenizer) resize(l int) {
679 // TODO:
680 // This is a bit too aggressive atm and should be calmed down.
681 if len(dat.array) <= l {
Akronf1a16502021-08-16 15:24:38 +0200682 dat.array = append(dat.array, make([]bc, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200683 }
Akron64ffd9a2021-08-03 19:55:21 +0200684}
Akronc9d84a62021-08-03 15:56:03 +0200685
Akron64ffd9a2021-08-03 19:55:21 +0200686// Set base value in double array
Akronf1a16502021-08-16 15:24:38 +0200687func (bc *bc) setBase(v uint32) {
688 bc.base = v
Akron439f4ec2021-08-09 15:45:38 +0200689}
690
691// Get base value in double array
Akronf1a16502021-08-16 15:24:38 +0200692func (bc *bc) getBase() uint32 {
693 return bc.base & RESTBIT
Akron439f4ec2021-08-09 15:45:38 +0200694}
695
696// Set check value in double array
Akronf1a16502021-08-16 15:24:38 +0200697func (bc *bc) setCheck(v uint32) {
698 bc.check = v
Akron439f4ec2021-08-09 15:45:38 +0200699}
700
701// Get check value in double array
Akronf1a16502021-08-16 15:24:38 +0200702func (bc *bc) getCheck() uint32 {
703 return bc.check & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200704}
705
Akron3fdfec62021-08-04 11:40:10 +0200706// Returns true if a state is separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200707func (bc *bc) isSeparate() bool {
708 return bc.base&FIRSTBIT != 0
Akron3fdfec62021-08-04 11:40:10 +0200709}
710
711// Mark a state as separate pointing to a representative
Akronf1a16502021-08-16 15:24:38 +0200712func (bc *bc) setSeparate(sep bool) {
Akron3fdfec62021-08-04 11:40:10 +0200713 if sep {
Akronf1a16502021-08-16 15:24:38 +0200714 bc.base |= FIRSTBIT
Akron3fdfec62021-08-04 11:40:10 +0200715 } else {
Akronf1a16502021-08-16 15:24:38 +0200716 bc.base &= (RESTBIT | SECONDBIT)
Akron3fdfec62021-08-04 11:40:10 +0200717 }
718}
719
Akron83e75a22021-08-04 13:14:06 +0200720// Returns true if a state is the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200721func (bc *bc) isNonToken() bool {
722 return bc.check&FIRSTBIT != 0
Akron83e75a22021-08-04 13:14:06 +0200723}
724
725// Mark a state as being the target of a nontoken transition
Akronf1a16502021-08-16 15:24:38 +0200726func (bc *bc) setNonToken(sep bool) {
Akron83e75a22021-08-04 13:14:06 +0200727 if sep {
Akronf1a16502021-08-16 15:24:38 +0200728 bc.check |= FIRSTBIT
Akron83e75a22021-08-04 13:14:06 +0200729 } else {
Akronf1a16502021-08-16 15:24:38 +0200730 bc.check &= (RESTBIT | SECONDBIT)
Akron83e75a22021-08-04 13:14:06 +0200731 }
732}
733
Akron84d68e62021-08-04 17:06:52 +0200734// Returns true if a state is the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200735func (bc *bc) isTokenEnd() bool {
736 return bc.check&SECONDBIT != 0
Akron84d68e62021-08-04 17:06:52 +0200737}
738
739// Mark a state as being the target of a tokenend transition
Akronf1a16502021-08-16 15:24:38 +0200740func (bc *bc) setTokenEnd(sep bool) {
Akron84d68e62021-08-04 17:06:52 +0200741 if sep {
Akronf1a16502021-08-16 15:24:38 +0200742 bc.check |= SECONDBIT
Akron84d68e62021-08-04 17:06:52 +0200743 } else {
Akronf1a16502021-08-16 15:24:38 +0200744 bc.check &= (RESTBIT | FIRSTBIT)
Akron84d68e62021-08-04 17:06:52 +0200745 }
746}
747
Akron64ffd9a2021-08-03 19:55:21 +0200748// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200749func (dat *DaTokenizer) setSize(v int) {
Akronf1a16502021-08-16 15:24:38 +0200750 dat.array[1].setCheck(uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200751}
752
753// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200754func (dat *DaTokenizer) GetSize() int {
Akronf1a16502021-08-16 15:24:38 +0200755 return int(dat.array[1].getCheck())
Akron8ef408b2021-08-02 22:11:04 +0200756}
757
758// Based on Mizobuchi et al (2000), p. 124
759// This iterates for every state through the complete double array
760// structure until it finds a gap that fits all outgoing transitions
761// of the state. This is extremely slow, but is only necessary in the
762// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200763func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200764
765 // Start at the first entry of the double array list
Akron6f1c16c2021-08-17 10:45:42 +0200766 base := uint32(1)
767
Akron8ef408b2021-08-02 22:11:04 +0200768OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200769 // Resize the array if necessary
Akronf1a16502021-08-16 15:24:38 +0200770 dat.resize(int(base) + dat.final)
Akron8ef408b2021-08-02 22:11:04 +0200771 for _, a := range symbols {
Akronf1a16502021-08-16 15:24:38 +0200772 if dat.array[int(base)+a].getCheck() != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200773 base++
774 goto OVERLAP
775 }
776 }
Akron8ef408b2021-08-02 22:11:04 +0200777 return base
778}
779
Akron679b4862021-09-02 16:59:26 +0200780// This is an implementation of xCheck with the skip-improvement
781// proposed by Morita et al. (2001)
782func (dat *DaTokenizer) xCheckSkip(symbols []int) uint32 {
783
784 // Start at the first entry of the double array list
785 base := uint32(math.Abs(float64(dat.maxSize-1) * .9))
786
787OVERLAP:
788 // Resize the array if necessary
789 dat.resize(int(base) + dat.final)
790 for _, a := range symbols {
791 if dat.array[int(base)+a].getCheck() != 0 {
792 base++
793 goto OVERLAP
794 }
795 }
796 return base
797}
798
Akron29e306f2021-09-02 18:29:56 +0200799// This is an implementation of xCheck with the skip-improvement
800// proposed by Morita et al. (2001) for higher outdegrees as
801// proposed by Niu et al. (2013)
802func (dat *DaTokenizer) xCheckSkipNiu(symbols []int) uint32 {
803
804 // Start at the first entry of the double array list
805 base := uint32(1)
806
807 // Or skip the first few entries
808 if len(symbols) >= 3 {
809 base = uint32(math.Abs(float64(dat.maxSize-1)*.9)) + 1
810 }
811
812OVERLAP:
813 // Resize the array if necessary
814 dat.resize(int(base) + dat.final + 1)
815 for _, a := range symbols {
816 if dat.array[int(base)+a].getCheck() != 0 {
817 base++
818 goto OVERLAP
819 }
820 }
821 return base
822}
823
Akron7b1faa62021-09-02 16:10:21 +0200824// This is an implementation of xCheck wit an improvement
825// proposed by Niu et al. (2013)
826func (dat *DaTokenizer) xCheckNiu(symbols []int, block_begin_pos *uint32) uint32 {
827
828 // Start at the first entry of the double array list
829 base := uint32(1)
830
831 if len(symbols) > 3 {
832 sort.Ints(symbols)
833 if *block_begin_pos > uint32(symbols[0]) {
834 dat.resize(int(*block_begin_pos) + dat.final)
835 *block_begin_pos += uint32(symbols[len(symbols)-1] + 1)
836 return *block_begin_pos - uint32(symbols[0])
837 }
838 }
839
840OVERLAP:
841 // Resize the array if necessary
842 dat.resize(int(base) + dat.final)
843 for _, a := range symbols {
844 if dat.array[int(base)+a].getCheck() != 0 {
845 base++
846 goto OVERLAP
847 }
848 }
849 return base
850}
851
Akron3610f102021-08-08 14:13:25 +0200852// List all outgoing transitions for a state
853// for testing purposes
854func (dat *DaTokenizer) outgoing(t uint32) []int {
855
856 valid := make([]int, 0, len(dat.sigma))
857
858 for _, a := range dat.sigma {
Akronf1a16502021-08-16 15:24:38 +0200859 t1 := dat.array[t].getBase() + uint32(a)
860 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200861 valid = append(valid, a)
862 }
863 }
864
865 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
Akronf1a16502021-08-16 15:24:38 +0200866 t1 := dat.array[t].getBase() + uint32(a)
867 if t1 <= dat.array[1].getCheck() && dat.array[t1].getCheck() == t {
Akron3610f102021-08-08 14:13:25 +0200868 valid = append(valid, -1*a)
869 }
870 }
871
872 sort.Ints(valid)
873
874 return valid
875}
876
Akron4fa28b32021-08-27 10:55:41 +0200877// TransCount as the number of transitions aka arcs in the
878// finite state automaton
879func (dat *DaTokenizer) TransCount() int {
880 // Cache the transCount
881 if dat.transCount > 0 {
882 return dat.transCount
883 }
884
885 dat.transCount = 0
886 for x := 1; x < len(dat.array); x++ {
887 if dat.array[x].getBase() != 0 {
888 dat.transCount++
889 }
890 }
891
892 return dat.transCount
893}
894
Akron03a3c612021-08-04 11:51:27 +0200895// LoadFactor as defined in Kanda et al (2018),
896// i.e. the proportion of non-empty elements to all elements.
897func (dat *DaTokenizer) LoadFactor() float64 {
Akron4fa28b32021-08-27 10:55:41 +0200898 return float64(dat.TransCount()) / float64(len(dat.array)) * 100
Akrond66a9262021-08-03 17:09:09 +0200899}
900
Akron439f4ec2021-08-09 15:45:38 +0200901// Save stores the double array data in a file
Akron3a063ef2021-08-05 19:36:35 +0200902func (dat *DaTokenizer) Save(file string) (n int64, err error) {
903 f, err := os.Create(file)
904 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200905 log.Println(err)
Akron439f4ec2021-08-09 15:45:38 +0200906 return 0, err
Akron3a063ef2021-08-05 19:36:35 +0200907 }
908 defer f.Close()
909 gz := gzip.NewWriter(f)
910 defer gz.Close()
911 n, err = dat.WriteTo(gz)
912 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200913 log.Println(err)
Akron3a063ef2021-08-05 19:36:35 +0200914 return n, err
915 }
916 gz.Flush()
917 return n, nil
918}
919
920// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200921func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
922
Akron3a063ef2021-08-05 19:36:35 +0200923 wb := bufio.NewWriter(w)
924 defer wb.Flush()
925
Akron6247a5d2021-08-03 19:18:28 +0200926 // Store magical header
Akron3a063ef2021-08-05 19:36:35 +0200927 all, err := wb.Write([]byte(MAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200928 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200929 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200930 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200931 }
932
933 // Get sigma as a list
934 sigmalist := make([]rune, len(dat.sigma)+16)
935 max := 0
936 for sym, num := range dat.sigma {
937 sigmalist[num] = sym
938 if num > max {
939 max = num
940 }
941 }
942
943 sigmalist = sigmalist[:max+1]
944
Akron3f8571a2021-08-05 11:18:10 +0200945 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200946 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200947 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
948 bo.PutUint16(buf[4:6], uint16(dat.unknown))
949 bo.PutUint16(buf[6:8], uint16(dat.identity))
950 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200951 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akronf1a16502021-08-16 15:24:38 +0200952 bo.PutUint32(buf[12:16], uint32(len(dat.array)*2)) // Legacy support
Akron3a063ef2021-08-05 19:36:35 +0200953 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200954 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200955 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200956 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200957 }
958
959 all += more
960
Akron6247a5d2021-08-03 19:18:28 +0200961 // Write sigma
962 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200963
Akron3a063ef2021-08-05 19:36:35 +0200964 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200965 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200966 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200967 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200968 }
969 all += more
970 }
Akron439f4ec2021-08-09 15:45:38 +0200971
Akron6247a5d2021-08-03 19:18:28 +0200972 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200973 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200974 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200975 }
Akron6247a5d2021-08-03 19:18:28 +0200976
977 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200978 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200979 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200980 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200981 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200982 }
983 all += more
984
Akronf1a16502021-08-16 15:24:38 +0200985 // for x := 0; x < len(dat.array); x++ {
986 for _, bc := range dat.array {
987 bo.PutUint32(buf[0:4], bc.base)
988 more, err = wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200989 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200990 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200991 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200992 }
Akron439f4ec2021-08-09 15:45:38 +0200993 all += more
Akron3a063ef2021-08-05 19:36:35 +0200994 if more != 4 {
Akronf1a16502021-08-16 15:24:38 +0200995 log.Println("Can not write base uint32")
996 return int64(all), err
997 }
998 bo.PutUint32(buf[0:4], bc.check)
999 more, err = wb.Write(buf[0:4])
1000 if err != nil {
1001 log.Println(err)
1002 return int64(all), err
1003 }
1004 all += more
1005 if more != 4 {
1006 log.Println("Can not write check uint32")
Akron3a063ef2021-08-05 19:36:35 +02001007 return int64(all), err
1008 }
Akron6247a5d2021-08-03 19:18:28 +02001009 }
1010
1011 return int64(all), err
1012}
1013
Akron439f4ec2021-08-09 15:45:38 +02001014// LoadDatokFile reads a double array represented tokenizer
1015// from a file.
Akron3f8571a2021-08-05 11:18:10 +02001016func LoadDatokFile(file string) *DaTokenizer {
1017 f, err := os.Open(file)
1018 if err != nil {
Akron527c10c2021-08-13 01:45:18 +02001019 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +02001020 return nil
Akron3f8571a2021-08-05 11:18:10 +02001021 }
1022 defer f.Close()
1023
1024 gz, err := gzip.NewReader(f)
1025 if err != nil {
Akron527c10c2021-08-13 01:45:18 +02001026 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +02001027 return nil
Akron3f8571a2021-08-05 11:18:10 +02001028 }
1029 defer gz.Close()
1030
Akron3a063ef2021-08-05 19:36:35 +02001031 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +02001032 return ParseDatok(gz)
1033}
1034
Akron439f4ec2021-08-09 15:45:38 +02001035// LoadDatokFile reads a double array represented tokenizer
1036// from an io.Reader
Akron3f8571a2021-08-05 11:18:10 +02001037func ParseDatok(ior io.Reader) *DaTokenizer {
1038
Akron439f4ec2021-08-09 15:45:38 +02001039 // Initialize tokenizer with default values
Akron3f8571a2021-08-05 11:18:10 +02001040 dat := &DaTokenizer{
1041 sigma: make(map[rune]int),
1042 epsilon: 0,
1043 unknown: 0,
1044 identity: 0,
1045 final: 0,
Akron4fa28b32021-08-27 10:55:41 +02001046 transCount: 0,
Akron3f8571a2021-08-05 11:18:10 +02001047 }
1048
1049 r := bufio.NewReader(ior)
1050
Akron3f8571a2021-08-05 11:18:10 +02001051 buf := make([]byte, 1024)
1052 buf = buf[0:len(MAGIC)]
1053
Akron439f4ec2021-08-09 15:45:38 +02001054 _, err := r.Read(buf)
Akron3f8571a2021-08-05 11:18:10 +02001055
1056 if err != nil {
Akron527c10c2021-08-13 01:45:18 +02001057 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +02001058 return nil
1059 }
1060
Akron3f8571a2021-08-05 11:18:10 +02001061 if string(MAGIC) != string(buf) {
Akron527c10c2021-08-13 01:45:18 +02001062 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +02001063 return nil
1064 }
1065
Akron439f4ec2021-08-09 15:45:38 +02001066 more, err := io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +02001067 if err != nil {
Akron527c10c2021-08-13 01:45:18 +02001068 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +02001069 return nil
1070 }
1071
Akron439f4ec2021-08-09 15:45:38 +02001072 if more != 16 {
Akron527c10c2021-08-13 01:45:18 +02001073 log.Println("Read bytes do not fit")
Akron439f4ec2021-08-09 15:45:38 +02001074 return nil
1075 }
Akron3f8571a2021-08-05 11:18:10 +02001076
Akron3a063ef2021-08-05 19:36:35 +02001077 version := bo.Uint16(buf[0:2])
1078
1079 if version != VERSION {
Akron527c10c2021-08-13 01:45:18 +02001080 log.Println("Version not compatible")
Akron3a063ef2021-08-05 19:36:35 +02001081 return nil
1082 }
1083
Akron3f8571a2021-08-05 11:18:10 +02001084 dat.epsilon = int(bo.Uint16(buf[2:4]))
1085 dat.unknown = int(bo.Uint16(buf[4:6]))
1086 dat.identity = int(bo.Uint16(buf[6:8]))
1087 dat.final = int(bo.Uint16(buf[8:10]))
1088
1089 sigmaCount := int(bo.Uint16(buf[10:12]))
Akronf1a16502021-08-16 15:24:38 +02001090 arraySize := int(bo.Uint32(buf[12:16])) / 2 // Legacy support
Akron3f8571a2021-08-05 11:18:10 +02001091
Akron3a063ef2021-08-05 19:36:35 +02001092 // Shouldn't be relevant though
1093 dat.maxSize = arraySize - 1
1094
Akron3f8571a2021-08-05 11:18:10 +02001095 for x := 0; x < sigmaCount; x++ {
Akron439f4ec2021-08-09 15:45:38 +02001096 sym, _, err := r.ReadRune()
Akron3f8571a2021-08-05 11:18:10 +02001097 if err == nil && sym != 0 {
Akronea46e8a2021-08-17 00:36:31 +02001098 if int(sym) < 256 {
1099 dat.sigmaASCII[int(sym)] = x
1100 }
Akron3f8571a2021-08-05 11:18:10 +02001101 dat.sigma[sym] = x
1102 }
Akron3f8571a2021-08-05 11:18:10 +02001103 }
1104
Akron439f4ec2021-08-09 15:45:38 +02001105 _, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +02001106
1107 if err != nil {
Akron527c10c2021-08-13 01:45:18 +02001108 log.Print(err)
Akron3f8571a2021-08-05 11:18:10 +02001109 return nil
1110 }
1111
Akron3f8571a2021-08-05 11:18:10 +02001112 if string("T") != string(buf[0:1]) {
Akron527c10c2021-08-13 01:45:18 +02001113 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +02001114 return nil
1115 }
1116
1117 // Read based on length
Akronf1a16502021-08-16 15:24:38 +02001118 dat.array = make([]bc, arraySize)
Akron3f8571a2021-08-05 11:18:10 +02001119
Akronbb4aac52021-08-13 00:52:27 +02001120 dataArray, err := io.ReadAll(r)
Akron439f4ec2021-08-09 15:45:38 +02001121
Akronbb4aac52021-08-13 00:52:27 +02001122 if err == io.EOF {
Akron527c10c2021-08-13 01:45:18 +02001123 log.Println(err)
Akronbb4aac52021-08-13 00:52:27 +02001124 return nil
1125 }
1126
Akronf1a16502021-08-16 15:24:38 +02001127 if len(dataArray) < arraySize*8 {
Akron527c10c2021-08-13 01:45:18 +02001128 log.Println("Not enough bytes read")
Akronbb4aac52021-08-13 00:52:27 +02001129 return nil
1130 }
1131
1132 for x := 0; x < arraySize; x++ {
Akronf1a16502021-08-16 15:24:38 +02001133 dat.array[x].base = bo.Uint32(dataArray[x*8 : (x*8)+4])
1134 dat.array[x].check = bo.Uint32(dataArray[(x*8)+4 : (x*8)+8])
Akron3f8571a2021-08-05 11:18:10 +02001135 }
1136
1137 return dat
1138}
1139
Akron439f4ec2021-08-09 15:45:38 +02001140// Show the current state of the buffer,
1141// for testing puroses
Akron3610f102021-08-08 14:13:25 +02001142func showBuffer(buffer []rune, buffo int, buffi int) string {
1143 out := make([]rune, 0, 1024)
1144 for x := 0; x < len(buffer); x++ {
1145 if buffi == x {
1146 out = append(out, '^')
1147 }
1148 if buffo == x {
1149 out = append(out, '[', buffer[x], ']')
1150 } else {
1151 out = append(out, buffer[x])
1152 }
1153 }
1154 return string(out)
1155}
1156
Akron84d68e62021-08-04 17:06:52 +02001157// Transduce an input string against the double array
Akron3610f102021-08-08 14:13:25 +02001158// FSA. The rules are always greedy. If the automaton fails,
1159// it takes the last possible token ending branch.
Akron068874c2021-08-04 15:19:56 +02001160//
Akron4db3ecf2021-08-11 18:49:03 +02001161// Based on Mizobuchi et al (2000), p. 129,
1162// with additional support for IDENTITY, UNKNOWN
1163// and EPSILON transitions and NONTOKEN and TOKENEND handling.
Akron3f8571a2021-08-05 11:18:10 +02001164func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron068874c2021-08-04 15:19:56 +02001165 var a int
Akronb4bbb472021-08-09 11:49:38 +02001166 var t0 uint32
Akronb7e1f132021-08-10 11:52:31 +02001167 t := uint32(1) // Initial state
1168 var ok, rewindBuffer bool
Akron068874c2021-08-04 15:19:56 +02001169
Akron3610f102021-08-08 14:13:25 +02001170 // Remember the last position of a possible tokenend,
1171 // in case the automaton fails.
1172 epsilonState := uint32(0)
1173 epsilonOffset := 0
1174
1175 // Implement a low level buffer for full control,
1176 // however - it is probably better to introduce
1177 // this on a higher level with a io.Reader interface
1178 // The buffer stores a single word and may have white
1179 // space at the end (but not at the beginning).
1180 //
1181 // This is the only backtracking requirement because of
1182 // epsilon transitions, to support tokenizations like:
1183 // "this is an example|.| And it works." vs
1184 // "this is an example.com| application."
Akronb7e1f132021-08-10 11:52:31 +02001185 //
1186 // TODO:
1187 // Store a translation buffer as well, so characters don't
1188 // have to be translated multiple times!
Akron3610f102021-08-08 14:13:25 +02001189 buffer := make([]rune, 1024)
1190 buffo := 0 // Buffer offset
1191 buffi := 0 // Buffer length
1192
Akron3f8571a2021-08-05 11:18:10 +02001193 reader := bufio.NewReader(r)
1194 writer := bufio.NewWriter(w)
1195 defer writer.Flush()
Akron068874c2021-08-04 15:19:56 +02001196
Akron3f8571a2021-08-05 11:18:10 +02001197 var char rune
1198 var err error
1199 eof := false
Akronb7e1f132021-08-10 11:52:31 +02001200 newchar := true
Akron3f8571a2021-08-05 11:18:10 +02001201
Akronc5d8d432021-08-10 16:48:44 +02001202PARSECHAR:
Akron3f8571a2021-08-05 11:18:10 +02001203 for {
1204
Akronb7e1f132021-08-10 11:52:31 +02001205 if newchar {
1206 // Get from reader if buffer is empty
1207 if buffo >= buffi {
Akron1594cb82021-08-11 11:14:56 +02001208 if eof {
1209 break
1210 }
Akronb7e1f132021-08-10 11:52:31 +02001211 char, _, err = reader.ReadRune()
Akron439f4ec2021-08-09 15:45:38 +02001212
Akronb7e1f132021-08-10 11:52:31 +02001213 // No more runes to read
1214 if err != nil {
1215 eof = true
1216 break
1217 }
1218 buffer[buffi] = char
1219 buffi++
Akron3f8571a2021-08-05 11:18:10 +02001220 }
Akronb7e1f132021-08-10 11:52:31 +02001221
1222 char = buffer[buffo]
1223
1224 if DEBUG {
1225 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
1226 }
1227
Akron6f1c16c2021-08-17 10:45:42 +02001228 // TODO:
1229 // Better not repeatedly check for a!
1230 // Possibly keep a buffer with a.
Akronea46e8a2021-08-17 00:36:31 +02001231 if int(char) < 256 {
1232 a = dat.sigmaASCII[int(char)]
1233 } else {
1234 a, ok = dat.sigma[char]
1235 if !ok {
1236 a = 0
1237 }
1238 }
Akronb7e1f132021-08-10 11:52:31 +02001239
1240 // Use identity symbol if character is not in sigma
Akronea46e8a2021-08-17 00:36:31 +02001241 if a == 0 && dat.identity != -1 {
Akronb7e1f132021-08-10 11:52:31 +02001242 a = dat.identity
1243 }
1244
1245 t0 = t
1246
1247 // Check for epsilon transitions and remember
Akronf1a16502021-08-16 15:24:38 +02001248 if dat.array[dat.array[t0].getBase()+uint32(dat.epsilon)].getCheck() == t0 {
Akronb7e1f132021-08-10 11:52:31 +02001249 // Remember state for backtracking to last tokenend state
1250 epsilonState = t0
1251 epsilonOffset = buffo
1252 }
Akron3f8571a2021-08-05 11:18:10 +02001253 }
Akron3610f102021-08-08 14:13:25 +02001254
Akronb7e1f132021-08-10 11:52:31 +02001255 // Checks a transition based on t0, a and buffo
Akronf1a16502021-08-16 15:24:38 +02001256 t = dat.array[t0].getBase() + uint32(a)
1257 ta := dat.array[t]
Akron068874c2021-08-04 15:19:56 +02001258
Akron524c5432021-08-05 14:14:27 +02001259 if DEBUG {
Akronb7e1f132021-08-10 11:52:31 +02001260 // Char is only relevant if set
1261 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
1262 if false {
1263 fmt.Println(dat.outgoing(t0))
1264 }
Akron524c5432021-08-05 14:14:27 +02001265 }
1266
Akronb7e1f132021-08-10 11:52:31 +02001267 // Check if the transition is invalid according to the double array
Akronf1a16502021-08-16 15:24:38 +02001268 if t > dat.array[1].getCheck() || ta.getCheck() != t0 {
Akron068874c2021-08-04 15:19:56 +02001269
1270 if DEBUG {
Akronf1a16502021-08-16 15:24:38 +02001271 fmt.Println("Match is not fine!", t, "and", ta.getCheck(), "vs", t0)
Akron068874c2021-08-04 15:19:56 +02001272 }
1273
1274 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +02001275
Akron068874c2021-08-04 15:19:56 +02001276 // Try again with unknown symbol, in case identity failed
Akronb7e1f132021-08-10 11:52:31 +02001277 // Char is only relevant when set
Akron068874c2021-08-04 15:19:56 +02001278 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001279 fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +02001280 }
1281 a = dat.unknown
1282
1283 } else if a != dat.epsilon {
Akronb4bbb472021-08-09 11:49:38 +02001284
Akron068874c2021-08-04 15:19:56 +02001285 // Try again with epsilon symbol, in case everything else failed
Akronb4bbb472021-08-09 11:49:38 +02001286 t0 = epsilonState
Akron3610f102021-08-08 14:13:25 +02001287 epsilonState = 0 // reset
1288 buffo = epsilonOffset
Akron439f4ec2021-08-09 15:45:38 +02001289 a = dat.epsilon
1290
Akron3610f102021-08-08 14:13:25 +02001291 if DEBUG {
1292 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
1293 }
Akronb4bbb472021-08-09 11:49:38 +02001294
Akron068874c2021-08-04 15:19:56 +02001295 } else {
1296 break
1297 }
Akron068874c2021-08-04 15:19:56 +02001298
Akronb7e1f132021-08-10 11:52:31 +02001299 newchar = false
1300 continue
Akronb4bbb472021-08-09 11:49:38 +02001301 }
1302
Akronb7e1f132021-08-10 11:52:31 +02001303 // Transition was successful
1304 rewindBuffer = false
Akron439f4ec2021-08-09 15:45:38 +02001305
1306 // Transition consumes a character
Akronb4bbb472021-08-09 11:49:38 +02001307 if a != dat.epsilon {
1308
Akron3610f102021-08-08 14:13:25 +02001309 buffo++
Akronb4bbb472021-08-09 11:49:38 +02001310
Akron439f4ec2021-08-09 15:45:38 +02001311 // Transition does not produce a character
Akronf1a16502021-08-16 15:24:38 +02001312 if buffo == 1 && ta.isNonToken() {
Akron3610f102021-08-08 14:13:25 +02001313 if DEBUG {
1314 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
1315 }
Akron439f4ec2021-08-09 15:45:38 +02001316 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001317 }
Akron3f8571a2021-08-05 11:18:10 +02001318 }
Akron068874c2021-08-04 15:19:56 +02001319
Akronc5d8d432021-08-10 16:48:44 +02001320 // Transition marks the end of a token - so flush the buffer
Akronf1a16502021-08-16 15:24:38 +02001321 if ta.isTokenEnd() {
Akron524c5432021-08-05 14:14:27 +02001322
Akronc5d8d432021-08-10 16:48:44 +02001323 if buffi > 0 {
Akronc5d8d432021-08-10 16:48:44 +02001324 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +02001325 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
Akronc5d8d432021-08-10 16:48:44 +02001326 }
Akron01912fc2021-08-12 11:41:58 +02001327 writer.WriteString(string(buffer[:buffo]))
Akronc5d8d432021-08-10 16:48:44 +02001328 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001329 }
Akron1594cb82021-08-11 11:14:56 +02001330 if DEBUG {
1331 fmt.Println("-> Newline")
1332 }
1333 writer.WriteRune('\n')
Akron439f4ec2021-08-09 15:45:38 +02001334 }
Akron3610f102021-08-08 14:13:25 +02001335
Akronc5d8d432021-08-10 16:48:44 +02001336 // Rewind the buffer if necessary
Akron439f4ec2021-08-09 15:45:38 +02001337 if rewindBuffer {
1338
1339 // TODO: Better as a ring buffer
Akron3610f102021-08-08 14:13:25 +02001340 for x, i := range buffer[buffo:buffi] {
1341 buffer[x] = i
1342 }
Akronb4bbb472021-08-09 11:49:38 +02001343
Akron3610f102021-08-08 14:13:25 +02001344 buffi -= buffo
1345 epsilonOffset -= buffo
1346 buffo = 0
1347 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001348 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
Akron3610f102021-08-08 14:13:25 +02001349 }
Akron84d68e62021-08-04 17:06:52 +02001350 }
1351
Akronb7e1f132021-08-10 11:52:31 +02001352 // Move to representative state
Akronf1a16502021-08-16 15:24:38 +02001353 if ta.isSeparate() {
1354 t = ta.getBase()
1355 ta = dat.array[t]
Akronb7e1f132021-08-10 11:52:31 +02001356
1357 if DEBUG {
1358 fmt.Println("Representative pointing to", t)
1359 }
1360 }
1361
Akronc5d8d432021-08-10 16:48:44 +02001362 newchar = true
1363
Akron068874c2021-08-04 15:19:56 +02001364 // TODO:
1365 // Prevent endless epsilon loops!
1366 }
1367
Akron439f4ec2021-08-09 15:45:38 +02001368 // Input reader is not yet finished
Akron3f8571a2021-08-05 11:18:10 +02001369 if !eof {
Akron068874c2021-08-04 15:19:56 +02001370 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001371 fmt.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +02001372 }
1373 return false
1374 }
1375
Akronb7e1f132021-08-10 11:52:31 +02001376 if DEBUG {
1377 fmt.Println("Entering final check")
1378 }
1379
Akronc5d8d432021-08-10 16:48:44 +02001380 // Automaton is in a final state, so flush the buffer and return
Akronf1a16502021-08-16 15:24:38 +02001381 x := dat.array[t].getBase() + uint32(dat.final)
1382
1383 if x < dat.array[1].getCheck() && dat.array[x].getCheck() == t {
Akronb4bbb472021-08-09 11:49:38 +02001384
1385 if buffi > 0 {
Akronb4bbb472021-08-09 11:49:38 +02001386 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +02001387 fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
Akron3f8571a2021-08-05 11:18:10 +02001388 }
Akron01912fc2021-08-12 11:41:58 +02001389 writer.WriteString(string(buffer[:buffi]))
Akron6e70dc82021-08-11 11:33:18 +02001390
Akronf1a16502021-08-16 15:24:38 +02001391 if dat.array[t].isTokenEnd() {
Akrondf0a3ef2021-08-09 15:53:45 +02001392 writer.WriteRune('\n')
Akronc5d8d432021-08-10 16:48:44 +02001393 if DEBUG {
1394 fmt.Println("-> Newline")
1395 }
Akrondf0a3ef2021-08-09 15:53:45 +02001396 }
Akron84d68e62021-08-04 17:06:52 +02001397 }
1398
Akron6e70dc82021-08-11 11:33:18 +02001399 // Add an additional sentence ending, if the file is over but no explicit
1400 // sentence split was reached. This may be controversial and therefore
1401 // optional via parameter.
Akronf1a16502021-08-16 15:24:38 +02001402 if !dat.array[t0].isTokenEnd() {
Akron6e70dc82021-08-11 11:33:18 +02001403 writer.WriteRune('\n')
1404 if DEBUG {
1405 fmt.Println("-> Newline")
1406 }
1407 }
1408
Akrone61380b2021-08-16 10:10:46 +02001409 // TODO:
1410 // There may be a new line at the end, from an epsilon,
1411 // so we may need to go on!
Akron068874c2021-08-04 15:19:56 +02001412 return true
1413 }
1414
Akronc5d8d432021-08-10 16:48:44 +02001415 // Check epsilon transitions until a final state is reached
1416 t0 = t
Akronf1a16502021-08-16 15:24:38 +02001417 t = dat.array[t0].getBase() + uint32(dat.epsilon)
Akron01912fc2021-08-12 11:41:58 +02001418 a = dat.epsilon
1419 newchar = false
Akronf1a16502021-08-16 15:24:38 +02001420 if dat.array[t].getCheck() == t0 {
Akronc5d8d432021-08-10 16:48:44 +02001421 // Remember state for backtracking to last tokenend state
Akronc5d8d432021-08-10 16:48:44 +02001422 goto PARSECHAR
Akrone61380b2021-08-16 10:10:46 +02001423
Akronc5d8d432021-08-10 16:48:44 +02001424 } else if epsilonState != 0 {
Akronb7e1f132021-08-10 11:52:31 +02001425 t0 = epsilonState
1426 epsilonState = 0 // reset
1427 buffo = epsilonOffset
Akron068874c2021-08-04 15:19:56 +02001428 if DEBUG {
Akronc5d8d432021-08-10 16:48:44 +02001429 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
Akron068874c2021-08-04 15:19:56 +02001430 }
Akronc5d8d432021-08-10 16:48:44 +02001431 goto PARSECHAR
Akron068874c2021-08-04 15:19:56 +02001432 }
Akronc5d8d432021-08-10 16:48:44 +02001433 return false
Akron068874c2021-08-04 15:19:56 +02001434}