blob: f1f8b0c8a753382ccfa1469d5a7cdccdc7042a09 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
Akron03c92fe2021-08-09 14:07:57 +020013//
14// Serialization is little endian.
Akron83e75a22021-08-04 13:14:06 +020015
Akron740f3d72021-08-03 12:12:34 +020016// TODO:
Akron8e1d69b2021-08-12 17:38:49 +020017// - Write simple main function.
18// - Turn sigma into an array instead of using a map.
Akron740f3d72021-08-03 12:12:34 +020019// - replace maxSize with the check value
Akron3a063ef2021-08-05 19:36:35 +020020// - Add checksum to serialization.
Akron03c92fe2021-08-09 14:07:57 +020021// - Introduce methods on BC array entries instead of
22// jumping into the entries all the time!
23// - Instead of memoizing the loadFactor, better remember
24// the number of set transitions
Akron4db3ecf2021-08-11 18:49:03 +020025// - Replace table with a map
Akron8e1d69b2021-08-12 17:38:49 +020026// - Mark epsilon transitions in bytes
Akron740f3d72021-08-03 12:12:34 +020027
Akron8ef408b2021-08-02 22:11:04 +020028import (
29 "bufio"
30 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020031 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020032 "fmt"
33 "io"
34 "os"
Akronc9d84a62021-08-03 15:56:03 +020035 "sort"
Akron8ef408b2021-08-02 22:11:04 +020036 "strconv"
37 "strings"
38 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020039
40 "github.com/rs/zerolog/log"
Akron8ef408b2021-08-02 22:11:04 +020041)
42
43const (
Akron2a4b9292021-08-04 15:35:22 +020044 PROPS = 1
45 SIGMA = 2
46 STATES = 3
47 NONE = 4
Akron2a4b9292021-08-04 15:35:22 +020048 DEBUG = false
49 MAGIC = "DATOK"
50 VERSION = uint16(1)
Akron03c92fe2021-08-09 14:07:57 +020051 FIRSTBIT uint32 = 1 << 31
52 SECONDBIT uint32 = 1 << 30
53 RESTBIT uint32 = ^uint32(0) &^ (FIRSTBIT | SECONDBIT)
Akron8ef408b2021-08-02 22:11:04 +020054)
55
Akron03c92fe2021-08-09 14:07:57 +020056// Serialization is always little endian
Akron6247a5d2021-08-03 19:18:28 +020057var bo binary.ByteOrder = binary.LittleEndian
58
Akron8ef408b2021-08-02 22:11:04 +020059type mapping struct {
60 source int
Akron3fdfec62021-08-04 11:40:10 +020061 target uint32
Akron8ef408b2021-08-02 22:11:04 +020062}
63
64type edge struct {
Akron83e75a22021-08-04 13:14:06 +020065 inSym int
66 outSym int
67 end int
68 nontoken bool
69 tokenend bool
Akron8ef408b2021-08-02 22:11:04 +020070}
71
Akron03c92fe2021-08-09 14:07:57 +020072// Tokenizer is the intermediate representation
73// of the tokenizer.
Akron8ef408b2021-08-02 22:11:04 +020074type Tokenizer struct {
Akron740f3d72021-08-03 12:12:34 +020075 sigmaRev map[int]rune
76 arcCount int
Akron740f3d72021-08-03 12:12:34 +020077 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020078 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020079
80 // Special symbols in sigma
81 epsilon int
82 unknown int
83 identity int
84 final int
Akron03c92fe2021-08-09 14:07:57 +020085 tokenend int
Akron8ef408b2021-08-02 22:11:04 +020086}
87
Akron03c92fe2021-08-09 14:07:57 +020088// DaTokenizer represents a tokenizer implemented as a
89// Double Array FSA.
Akronf2120ca2021-08-03 16:26:41 +020090type DaTokenizer struct {
Akron03a3c612021-08-04 11:51:27 +020091 sigma map[rune]int
92 maxSize int
93 loadFactor float64
94 array []uint32
Akron3f8571a2021-08-05 11:18:10 +020095 // lastFilledBase uint32
Akronc17f1ca2021-08-03 19:47:27 +020096
97 // Special symbols in sigma
98 epsilon int
99 unknown int
100 identity int
101 final int
Akron03c92fe2021-08-09 14:07:57 +0200102 tokenend int
Akronf2120ca2021-08-03 16:26:41 +0200103}
104
Akron03c92fe2021-08-09 14:07:57 +0200105// ParseFoma reads the FST from a foma file
106// and creates an internal representation,
107// in case it follows the tokenizer's convention.
Akron64ffd9a2021-08-03 19:55:21 +0200108func LoadFomaFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200109 f, err := os.Open(file)
110 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200111 log.Error().Err(err)
Akron4db3ecf2021-08-11 18:49:03 +0200112 return nil
Akron8ef408b2021-08-02 22:11:04 +0200113 }
114 defer f.Close()
115
116 gz, err := gzip.NewReader(f)
117 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200118 log.Error().Err(err)
Akron4db3ecf2021-08-11 18:49:03 +0200119 return nil
Akron8ef408b2021-08-02 22:11:04 +0200120 }
121 defer gz.Close()
122
Akron3fdfec62021-08-04 11:40:10 +0200123 return ParseFoma(gz)
Akron8ef408b2021-08-02 22:11:04 +0200124}
125
Akron03c92fe2021-08-09 14:07:57 +0200126// ParseFoma reads the FST from a foma file reader
127// and creates an internal representation,
128// in case it follows the tokenizer's convention.
Akron3fdfec62021-08-04 11:40:10 +0200129func ParseFoma(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200130 r := bufio.NewReader(ior)
131
132 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200133 sigmaRev: make(map[int]rune),
Akronc17f1ca2021-08-03 19:47:27 +0200134 epsilon: -1,
135 unknown: -1,
136 identity: -1,
137 final: -1,
Akron03c92fe2021-08-09 14:07:57 +0200138 tokenend: -1,
Akron8ef408b2021-08-02 22:11:04 +0200139 }
140
Akron740f3d72021-08-03 12:12:34 +0200141 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200142
143 mode := 0
144 var elem []string
145 var elemint [5]int
146
Akron03c92fe2021-08-09 14:07:57 +0200147 // Iterate over all lines of the file.
148 // This is mainly based on foma2js,
149 // licensed under the Apache License, version 2,
150 // and written by Mans Hulden.
Akron8ef408b2021-08-02 22:11:04 +0200151 for {
152 line, err := r.ReadString('\n')
153 if err != nil {
154 if err == io.EOF {
155 break
156 }
Akron740f3d72021-08-03 12:12:34 +0200157 log.Error().Err(err)
Akron4db3ecf2021-08-11 18:49:03 +0200158 return nil
Akron8ef408b2021-08-02 22:11:04 +0200159 }
Akron8ef408b2021-08-02 22:11:04 +0200160
Akron439f4ec2021-08-09 15:45:38 +0200161 // Read parser mode for the following lines
162 if strings.HasPrefix(line, "##") {
163 if strings.HasPrefix(line, "##props##") {
164 mode = PROPS
165
166 } else if strings.HasPrefix(line, "##states##") {
167 mode = STATES
168
169 // Adds a final transition symbol to sigma
170 // written as '#' in Mizobuchi et al (2000)
171 tok.sigmaCount++
172 tok.final = tok.sigmaCount
173
174 } else if strings.HasPrefix(line, "##sigma##") {
175
176 mode = SIGMA
177
178 } else if strings.HasPrefix(line, "##end##") {
179
180 mode = NONE
181
182 } else if !strings.HasPrefix(line, "##foma-net") {
183 log.Error().Msg("Unknown input line")
184 break
185 }
Akron8ef408b2021-08-02 22:11:04 +0200186 continue
187 }
188
Akron439f4ec2021-08-09 15:45:38 +0200189 // Based on the current parser mode, interpret the lines
Akron8ef408b2021-08-02 22:11:04 +0200190 switch mode {
191 case PROPS:
192 {
193 elem = strings.Split(line, " ")
194 /*
195 fmt.Println("arity: " + elem[0])
196 fmt.Println("arccount: " + elem[1])
197 fmt.Println("statecount: " + elem[2])
198 fmt.Println("linecount: " + elem[3])
199 fmt.Println("finalcount: " + elem[4])
200 fmt.Println("pathcount: " + elem[5])
201 fmt.Println("is_deterministic: " + elem[6])
202 fmt.Println("is_pruned: " + elem[7])
203 fmt.Println("is_minimized: " + elem[8])
204 fmt.Println("is_epsilon_free: " + elem[9])
205 fmt.Println("is_loop_free: " + elem[10])
206 fmt.Println("extras: " + elem[11])
207 fmt.Println("name: " + elem[12])
208 */
209 if elem[6] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200210 log.Error().Msg("The FST needs to be deterministic")
Akron4db3ecf2021-08-11 18:49:03 +0200211 return nil
Akron8ef408b2021-08-02 22:11:04 +0200212 }
Akron439f4ec2021-08-09 15:45:38 +0200213
Akron8ef408b2021-08-02 22:11:04 +0200214 if elem[9] != "1" {
Akron740f3d72021-08-03 12:12:34 +0200215 log.Error().Msg("The FST needs to be epsilon free")
Akron4db3ecf2021-08-11 18:49:03 +0200216 return nil
Akron8ef408b2021-08-02 22:11:04 +0200217 }
218
219 elemint[0], err = strconv.Atoi(elem[1])
220 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200221 log.Error().Msg("Can't read arccount")
Akron4db3ecf2021-08-11 18:49:03 +0200222 return nil
Akron8ef408b2021-08-02 22:11:04 +0200223 }
Akron740f3d72021-08-03 12:12:34 +0200224 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200225
Akron8ef408b2021-08-02 22:11:04 +0200226 elemint[0], err = strconv.Atoi(elem[2])
227 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200228 log.Error().Msg("Can't read statecount")
Akron4db3ecf2021-08-11 18:49:03 +0200229 return nil
Akron8ef408b2021-08-02 22:11:04 +0200230 }
Akron439f4ec2021-08-09 15:45:38 +0200231
232 // States start at 1 in Mizobuchi et al (2000),
233 // as the state 0 is associated with a fail.
234 // Initialize states and transitions
Akron8ef408b2021-08-02 22:11:04 +0200235 tok.transitions = make([]map[int]*edge, elemint[0]+1)
236 continue
237 }
238 case STATES:
239 {
240 elem = strings.Split(line[0:len(line)-1], " ")
241 if elem[0] == "-1" {
Akron3610f102021-08-08 14:13:25 +0200242 if DEBUG {
243 fmt.Println("Skip", elem)
244 }
Akron8ef408b2021-08-02 22:11:04 +0200245 continue
246 }
247 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200248 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200249 fmt.Println("Unable to translate", elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200250 break
251 }
Akron8ef408b2021-08-02 22:11:04 +0200252
253 if len(elem) > 1 {
254 elemint[1], err = strconv.Atoi(elem[1])
255 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200256 fmt.Println("Unable to translate", elem[1])
Akron8ef408b2021-08-02 22:11:04 +0200257 break
258 }
259 if len(elem) > 2 {
260 elemint[2], err = strconv.Atoi(elem[2])
261 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200262 fmt.Println("Unable to translate", elem[2])
Akron8ef408b2021-08-02 22:11:04 +0200263 break
264 }
265 if len(elem) > 3 {
266 elemint[3], err = strconv.Atoi(elem[3])
267 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200268 fmt.Println("Unable to translate", elem[3])
Akron8ef408b2021-08-02 22:11:04 +0200269 break
270 }
271 if len(elem) > 4 {
272 elemint[4], err = strconv.Atoi(elem[4])
273 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200274 fmt.Println("Unable to translate", elem[4])
Akron8ef408b2021-08-02 22:11:04 +0200275 break
276 }
277 }
278 }
279 }
280 }
281
282 switch len(elem) {
283 case 5:
284 {
Akron740f3d72021-08-03 12:12:34 +0200285 state = elemint[0]
286 inSym = elemint[1]
287 outSym = elemint[2]
288 end = elemint[3]
289 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200290 }
291 case 4:
292 {
293 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200294 state = elemint[0]
295 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200296 } else {
Akron740f3d72021-08-03 12:12:34 +0200297 state = elemint[0]
298 inSym = elemint[1]
299 end = elemint[2]
300 final = elemint[3]
301 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200302 }
303 }
304 case 3:
305 {
Akron740f3d72021-08-03 12:12:34 +0200306 inSym = elemint[0]
307 outSym = elemint[1]
308 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200309 }
310 case 2:
311 {
Akron740f3d72021-08-03 12:12:34 +0200312 inSym = elemint[0]
313 end = elemint[1]
314 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200315 }
316 }
317
Akron83e75a22021-08-04 13:14:06 +0200318 nontoken := false
319 tokenend := false
320
Akron439f4ec2021-08-09 15:45:38 +0200321 // While the states in foma start with 0, the states in the
322 // Mizobuchi FSA start with one - so we increase every state by 1.
323 // We also increase sigma by 1, so there are no 0 transitions.
Akron524c5432021-08-05 14:14:27 +0200324 inSym++
325 outSym++
326
Akron439f4ec2021-08-09 15:45:38 +0200327 // Only a limited list of transitions are allowed
Akron740f3d72021-08-03 12:12:34 +0200328 if inSym != outSym {
Akron01912fc2021-08-12 11:41:58 +0200329 if outSym == tok.tokenend && inSym == tok.epsilon {
Akron83e75a22021-08-04 13:14:06 +0200330 tokenend = true
331 } else if outSym == tok.epsilon {
332 nontoken = true
333 } else {
Akron740f3d72021-08-03 12:12:34 +0200334 log.Error().Msg(
335 "Unsupported transition: " +
336 strconv.Itoa(state) +
337 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200338 " (" +
Akron740f3d72021-08-03 12:12:34 +0200339 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200340 ":" +
Akron740f3d72021-08-03 12:12:34 +0200341 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200342 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200343 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200344 ":" +
Akron740f3d72021-08-03 12:12:34 +0200345 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200346 ")")
Akron4db3ecf2021-08-11 18:49:03 +0200347 return nil
Akron75ebe7f2021-08-03 10:34:10 +0200348 }
Akron83e75a22021-08-04 13:14:06 +0200349
Akron83e75a22021-08-04 13:14:06 +0200350 } else if inSym == tok.epsilon {
Akron439f4ec2021-08-09 15:45:38 +0200351 log.Error().Msg("General epsilon transitions are not supported")
Akron4db3ecf2021-08-11 18:49:03 +0200352 return nil
Akron8ef408b2021-08-02 22:11:04 +0200353 }
354
Akron03c92fe2021-08-09 14:07:57 +0200355 // Create an edge based on the collected information
Akron8ef408b2021-08-02 22:11:04 +0200356 targetObj := &edge{
Akron83e75a22021-08-04 13:14:06 +0200357 inSym: inSym,
358 outSym: outSym,
359 end: end + 1,
360 tokenend: tokenend,
361 nontoken: nontoken,
Akron8ef408b2021-08-02 22:11:04 +0200362 }
363
Akron740f3d72021-08-03 12:12:34 +0200364 // Initialize outgoing states
365 if tok.transitions[state+1] == nil {
366 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200367 }
368
Akron740f3d72021-08-03 12:12:34 +0200369 // Ignore transitions with invalid symbols
370 if inSym >= 0 {
371 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200372 }
Akron8ef408b2021-08-02 22:11:04 +0200373
Akron740f3d72021-08-03 12:12:34 +0200374 // Add final transition
375 if final == 1 {
Akron03c92fe2021-08-09 14:07:57 +0200376 // TODO:
377 // Maybe this is less relevant for tokenizers
Akronc17f1ca2021-08-03 19:47:27 +0200378 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200379 }
380
Akronb4bbb472021-08-09 11:49:38 +0200381 if DEBUG {
Akron740f3d72021-08-03 12:12:34 +0200382 fmt.Println("Add",
383 state+1, "->", end+1,
384 "(",
385 inSym,
386 ":",
387 outSym,
388 ") (",
389 string(tok.sigmaRev[inSym]),
390 ":",
391 string(tok.sigmaRev[outSym]),
Akron524c5432021-08-05 14:14:27 +0200392 ")",
393 ";",
394 "TE:", tokenend,
Akron3610f102021-08-08 14:13:25 +0200395 "NT:", nontoken,
396 "FIN:", final)
Akron740f3d72021-08-03 12:12:34 +0200397 }
Akron75ebe7f2021-08-03 10:34:10 +0200398
Akron8ef408b2021-08-02 22:11:04 +0200399 continue
400 }
401 case SIGMA:
402 {
403 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
404
405 // Turn string into sigma id
406 number, err := strconv.Atoi(elem[0])
407
Akron524c5432021-08-05 14:14:27 +0200408 // ID needs to be > 1
409 number++
410
Akron8ef408b2021-08-02 22:11:04 +0200411 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200412 log.Error().Err(err)
Akron4db3ecf2021-08-11 18:49:03 +0200413 return nil
Akron8ef408b2021-08-02 22:11:04 +0200414 }
415
Akron740f3d72021-08-03 12:12:34 +0200416 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200417
418 var symbol rune
419
420 // Read rune
421 if utf8.RuneCountInString(elem[1]) == 1 {
422 symbol = []rune(elem[1])[0]
423
Akron8ef408b2021-08-02 22:11:04 +0200424 } else if utf8.RuneCountInString(elem[1]) > 1 {
Akron439f4ec2021-08-09 15:45:38 +0200425
426 // Probably a MCS
Akron8ef408b2021-08-02 22:11:04 +0200427 switch elem[1] {
428 case "@_EPSILON_SYMBOL_@":
429 {
Akronc17f1ca2021-08-03 19:47:27 +0200430 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200431 }
432 case "@_UNKNOWN_SYMBOL_@":
433 {
Akronc17f1ca2021-08-03 19:47:27 +0200434 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200435 }
436
437 case "@_IDENTITY_SYMBOL_@":
438 {
Akronc17f1ca2021-08-03 19:47:27 +0200439 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200440 }
Akron03c92fe2021-08-09 14:07:57 +0200441
442 case "@_TOKEN_SYMBOL_@":
443 {
444 tok.tokenend = number
Akron03c92fe2021-08-09 14:07:57 +0200445 }
Akron8ef408b2021-08-02 22:11:04 +0200446 default:
Akron740f3d72021-08-03 12:12:34 +0200447 {
448 log.Error().Msg("MCS not supported: " + line)
Akron4db3ecf2021-08-11 18:49:03 +0200449 return nil
Akron740f3d72021-08-03 12:12:34 +0200450 }
Akron8ef408b2021-08-02 22:11:04 +0200451 }
Akron439f4ec2021-08-09 15:45:38 +0200452 continue
Akron8ef408b2021-08-02 22:11:04 +0200453
Akron740f3d72021-08-03 12:12:34 +0200454 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200455 line, err = r.ReadString('\n')
456 if err != nil {
Akron740f3d72021-08-03 12:12:34 +0200457 log.Error().Err(err)
Akron4db3ecf2021-08-11 18:49:03 +0200458 return nil
Akron8ef408b2021-08-02 22:11:04 +0200459 }
460 if len(line) != 1 {
Akron740f3d72021-08-03 12:12:34 +0200461 log.Error().Msg("MCS not supported:" + line)
Akron4db3ecf2021-08-11 18:49:03 +0200462 return nil
Akron8ef408b2021-08-02 22:11:04 +0200463 }
Akron03c92fe2021-08-09 14:07:57 +0200464 symbol = rune('\n')
Akron8ef408b2021-08-02 22:11:04 +0200465 }
466
Akron740f3d72021-08-03 12:12:34 +0200467 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200468 }
469 }
470 }
471
472 return tok
473}
474
Akron64ffd9a2021-08-03 19:55:21 +0200475// Set alphabet A to the list of all symbols
476// outgoing from s
Akron439f4ec2021-08-09 15:45:38 +0200477func (tok *Tokenizer) getSet(s int, A *[]int) {
Akron64ffd9a2021-08-03 19:55:21 +0200478 for a := range tok.transitions[s] {
479 *A = append(*A, a)
480 }
481
482 // Not required, but simplifies bug hunting
Akron439f4ec2021-08-09 15:45:38 +0200483 // sort.Ints(*A)
Akron64ffd9a2021-08-03 19:55:21 +0200484}
485
Akron439f4ec2021-08-09 15:45:38 +0200486// ToDoubleArray turns the intermediate tokenizer representation
487// into a double array representation.
488//
489// This is based on Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200490func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
491
492 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +0200493 sigma: make(map[rune]int),
494 loadFactor: -1,
495 final: tok.final,
496 unknown: tok.unknown,
497 identity: tok.identity,
498 epsilon: tok.epsilon,
Akron03c92fe2021-08-09 14:07:57 +0200499 tokenend: tok.tokenend,
Akron3f8571a2021-08-05 11:18:10 +0200500 // lastFilledBase: 1,
Akronf2120ca2021-08-03 16:26:41 +0200501 }
502
503 for num, sym := range tok.sigmaRev {
504 dat.sigma[sym] = num
505 }
Akron8ef408b2021-08-02 22:11:04 +0200506
507 mark := 0
508 size := 0
509
Akron439f4ec2021-08-09 15:45:38 +0200510 // Create a mapping from s (in Ms aka Intermediate FSA)
511 // to t (in Mt aka Double Array FSA)
Akron740f3d72021-08-03 12:12:34 +0200512 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200513
Akron439f4ec2021-08-09 15:45:38 +0200514 // Initialize with the start state
Akron8ef408b2021-08-02 22:11:04 +0200515 table[size] = &mapping{source: 1, target: 1}
516 size++
517
Akron740f3d72021-08-03 12:12:34 +0200518 // Allocate space for the outgoing symbol range
519 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200520
521 for mark < size {
522 s := table[mark].source // This is a state in Ms
523 t := table[mark].target // This is a state in Mt
524 mark++
Akron740f3d72021-08-03 12:12:34 +0200525
526 // Following the paper, here the state t can be remembered
527 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200528 A = A[:0]
Akron439f4ec2021-08-09 15:45:38 +0200529 tok.getSet(s, &A)
Akron8ef408b2021-08-02 22:11:04 +0200530
Akron740f3d72021-08-03 12:12:34 +0200531 // Set base to the first free slot in the double array
Akronf2120ca2021-08-03 16:26:41 +0200532 dat.setBase(t, dat.xCheck(A))
Akron8ef408b2021-08-02 22:11:04 +0200533
Akron773b1ef2021-08-03 17:37:20 +0200534 // TODO:
Akron068874c2021-08-04 15:19:56 +0200535 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200536 // outdegree of .end
537
Akron740f3d72021-08-03 12:12:34 +0200538 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200539 for _, a := range A {
540
Akronc17f1ca2021-08-03 19:47:27 +0200541 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200542
Akron740f3d72021-08-03 12:12:34 +0200543 // Aka g(s, a)
544 s1 := tok.transitions[s][a].end
Akron8ef408b2021-08-02 22:11:04 +0200545
Akron740f3d72021-08-03 12:12:34 +0200546 // Store the transition
Akron3fdfec62021-08-04 11:40:10 +0200547 t1 := dat.getBase(t) + uint32(a)
Akronf2120ca2021-08-03 16:26:41 +0200548 dat.setCheck(t1, t)
Akron8ef408b2021-08-02 22:11:04 +0200549
Akron439f4ec2021-08-09 15:45:38 +0200550 if DEBUG {
Akron524c5432021-08-05 14:14:27 +0200551 fmt.Println("Translate transition",
552 s, "->", s1, "(", a, ")", "to", t, "->", t1)
553 }
554
Akron83e75a22021-08-04 13:14:06 +0200555 // Mark the state as being the target of a nontoken transition
556 if tok.transitions[s][a].nontoken {
Akron068874c2021-08-04 15:19:56 +0200557 dat.setNonToken(t1, true)
Akron524c5432021-08-05 14:14:27 +0200558 if DEBUG {
559 fmt.Println("Set", t1, "to nontoken")
560 }
Akron83e75a22021-08-04 13:14:06 +0200561 }
562
Akron84d68e62021-08-04 17:06:52 +0200563 // Mark the state as being the target of a tokenend transition
564 if tok.transitions[s][a].tokenend {
565 dat.setTokenEnd(t1, true)
Akron524c5432021-08-05 14:14:27 +0200566 if DEBUG {
567 fmt.Println("Set", t1, "to tokenend")
568 }
Akron84d68e62021-08-04 17:06:52 +0200569 }
570
Akron740f3d72021-08-03 12:12:34 +0200571 // Check for representative states
Akron439f4ec2021-08-09 15:45:38 +0200572 r := stateAlreadyInTable(s1, table, size)
Akron740f3d72021-08-03 12:12:34 +0200573
Akron439f4ec2021-08-09 15:45:38 +0200574 // No representative found
Akron8ef408b2021-08-02 22:11:04 +0200575 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200576 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200577 table[size] = &mapping{source: s1, target: t1}
578 size++
579 } else {
Akron740f3d72021-08-03 12:12:34 +0200580 // Overwrite with the representative state
Akron3fdfec62021-08-04 11:40:10 +0200581 dat.setBase(t1, r)
582 dat.setSeparate(t1, true)
Akron8ef408b2021-08-02 22:11:04 +0200583 }
584 } else {
Akron740f3d72021-08-03 12:12:34 +0200585 // Store a final transition
Akron3fdfec62021-08-04 11:40:10 +0200586 dat.setCheck(dat.getBase(t)+uint32(dat.final), t)
Akron8ef408b2021-08-02 22:11:04 +0200587 }
588 }
589 }
590
591 // Following Mizobuchi et al (2000) the size of the
592 // FSA should be stored in check(1).
Akron3fdfec62021-08-04 11:40:10 +0200593 dat.setSize(dat.maxSize + 1)
Akronf2120ca2021-08-03 16:26:41 +0200594 dat.array = dat.array[:dat.maxSize+1]
595 return dat
Akron8ef408b2021-08-02 22:11:04 +0200596}
597
Akron8ef408b2021-08-02 22:11:04 +0200598// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200599// exists and return this as a representative.
600// Currently iterates through the whole table
601// in a bruteforce manner.
Akron439f4ec2021-08-09 15:45:38 +0200602func stateAlreadyInTable(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200603 for x := 0; x < size; x++ {
604 if table[x].source == s {
605 return table[x].target
606 }
607 }
608 return 0
609}
610
Akron64ffd9a2021-08-03 19:55:21 +0200611// Resize double array when necessary
612func (dat *DaTokenizer) resize(l int) {
613 // TODO:
614 // This is a bit too aggressive atm and should be calmed down.
615 if len(dat.array) <= l {
Akron3fdfec62021-08-04 11:40:10 +0200616 dat.array = append(dat.array, make([]uint32, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200617 }
Akron64ffd9a2021-08-03 19:55:21 +0200618}
Akronc9d84a62021-08-03 15:56:03 +0200619
Akron64ffd9a2021-08-03 19:55:21 +0200620// Set base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200621func (dat *DaTokenizer) setBase(p uint32, v uint32) {
622 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200623 if dat.maxSize < l {
Akron439f4ec2021-08-09 15:45:38 +0200624 dat.resize(l)
Akron64ffd9a2021-08-03 19:55:21 +0200625 dat.maxSize = l
626 }
Akron439f4ec2021-08-09 15:45:38 +0200627 dat.array[l-1] = v
628}
629
630// Get base value in double array
631func (dat *DaTokenizer) getBase(p uint32) uint32 {
632 if int(p*2) > dat.maxSize {
633 return 0
634 }
635 return dat.array[p*2] & RESTBIT
636}
637
638// Set check value in double array
639func (dat *DaTokenizer) setCheck(p uint32, v uint32) {
640 l := int(p*2 + 1)
641 if dat.maxSize < l {
642 dat.resize(l)
643 dat.maxSize = l
644 }
645 dat.array[l] = v
646}
647
648// Get check value in double array
649func (dat *DaTokenizer) getCheck(p uint32) uint32 {
650 if int((p*2)+1) > dat.maxSize {
651 return 0
652 }
653 return dat.array[(p*2)+1] & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200654}
655
Akron3fdfec62021-08-04 11:40:10 +0200656// Returns true if a state is separate pointing to a representative
657func (dat *DaTokenizer) isSeparate(p uint32) bool {
Akron03c92fe2021-08-09 14:07:57 +0200658 return dat.array[p*2]&FIRSTBIT != 0
Akron3fdfec62021-08-04 11:40:10 +0200659}
660
661// Mark a state as separate pointing to a representative
662func (dat *DaTokenizer) setSeparate(p uint32, sep bool) {
663 if sep {
Akron03c92fe2021-08-09 14:07:57 +0200664 dat.array[p*2] |= FIRSTBIT
Akron3fdfec62021-08-04 11:40:10 +0200665 } else {
Akron03c92fe2021-08-09 14:07:57 +0200666 dat.array[p*2] &= (RESTBIT | SECONDBIT)
Akron3fdfec62021-08-04 11:40:10 +0200667 }
668}
669
Akron83e75a22021-08-04 13:14:06 +0200670// Returns true if a state is the target of a nontoken transition
671func (dat *DaTokenizer) isNonToken(p uint32) bool {
Akron03c92fe2021-08-09 14:07:57 +0200672 return dat.array[p*2+1]&FIRSTBIT != 0
Akron83e75a22021-08-04 13:14:06 +0200673}
674
675// Mark a state as being the target of a nontoken transition
676func (dat *DaTokenizer) setNonToken(p uint32, sep bool) {
677 if sep {
Akron03c92fe2021-08-09 14:07:57 +0200678 dat.array[p*2+1] |= FIRSTBIT
Akron83e75a22021-08-04 13:14:06 +0200679 } else {
Akron03c92fe2021-08-09 14:07:57 +0200680 dat.array[p*2+1] &= (RESTBIT | SECONDBIT)
Akron83e75a22021-08-04 13:14:06 +0200681 }
682}
683
Akron84d68e62021-08-04 17:06:52 +0200684// Returns true if a state is the target of a tokenend transition
685func (dat *DaTokenizer) isTokenEnd(p uint32) bool {
Akron03c92fe2021-08-09 14:07:57 +0200686 return dat.array[p*2+1]&SECONDBIT != 0
Akron84d68e62021-08-04 17:06:52 +0200687}
688
689// Mark a state as being the target of a tokenend transition
690func (dat *DaTokenizer) setTokenEnd(p uint32, sep bool) {
691 if sep {
Akron03c92fe2021-08-09 14:07:57 +0200692 dat.array[p*2+1] |= SECONDBIT
Akron84d68e62021-08-04 17:06:52 +0200693 } else {
Akron03c92fe2021-08-09 14:07:57 +0200694 dat.array[p*2+1] &= (RESTBIT | FIRSTBIT)
Akron84d68e62021-08-04 17:06:52 +0200695 }
696}
697
Akron64ffd9a2021-08-03 19:55:21 +0200698// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200699func (dat *DaTokenizer) setSize(v int) {
700 dat.setCheck(1, uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200701}
702
703// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200704func (dat *DaTokenizer) GetSize() int {
705 return int(dat.getCheck(1))
Akron8ef408b2021-08-02 22:11:04 +0200706}
707
708// Based on Mizobuchi et al (2000), p. 124
709// This iterates for every state through the complete double array
710// structure until it finds a gap that fits all outgoing transitions
711// of the state. This is extremely slow, but is only necessary in the
712// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200713func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200714
715 // Start at the first entry of the double array list
Akron3f8571a2021-08-05 11:18:10 +0200716 base := uint32(1) // dat.lastFilledBase
717 // skip := false
Akron8ef408b2021-08-02 22:11:04 +0200718OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200719
Akron3f8571a2021-08-05 11:18:10 +0200720 /*
721 if !skip {
722 if dat.getCheck(base) != 0 {
723 dat.lastFilledBase = base
724 } else {
725 skip = true
726 }
727 }
728 */
729
Akron740f3d72021-08-03 12:12:34 +0200730 // Resize the array if necessary
Akron3fdfec62021-08-04 11:40:10 +0200731 dat.resize((int(base) + dat.final) * 2)
Akron8ef408b2021-08-02 22:11:04 +0200732 for _, a := range symbols {
Akron3fdfec62021-08-04 11:40:10 +0200733 if dat.getCheck(base+uint32(a)) != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200734 base++
735 goto OVERLAP
736 }
737 }
Akron8ef408b2021-08-02 22:11:04 +0200738 return base
739}
740
Akron3610f102021-08-08 14:13:25 +0200741// List all outgoing transitions for a state
742// for testing purposes
743func (dat *DaTokenizer) outgoing(t uint32) []int {
744
745 valid := make([]int, 0, len(dat.sigma))
746
747 for _, a := range dat.sigma {
748 t1 := dat.getBase(t) + uint32(a)
749 if t1 <= dat.getCheck(1) && dat.getCheck(t1) == t {
750 valid = append(valid, a)
751 }
752 }
753
754 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
755 t1 := dat.getBase(t) + uint32(a)
756 if t1 <= dat.getCheck(1) && dat.getCheck(t1) == t {
757 valid = append(valid, -1*a)
758 }
759 }
760
761 sort.Ints(valid)
762
763 return valid
764}
765
Akron03a3c612021-08-04 11:51:27 +0200766// LoadFactor as defined in Kanda et al (2018),
767// i.e. the proportion of non-empty elements to all elements.
768func (dat *DaTokenizer) LoadFactor() float64 {
Akrond66a9262021-08-03 17:09:09 +0200769
Akron03a3c612021-08-04 11:51:27 +0200770 // Cache the loadfactor
Akron3f8571a2021-08-05 11:18:10 +0200771 if dat.loadFactor > 0 {
Akron03a3c612021-08-04 11:51:27 +0200772 return dat.loadFactor
Akron773b1ef2021-08-03 17:37:20 +0200773 }
Akrond66a9262021-08-03 17:09:09 +0200774 nonEmpty := 0
775 all := len(dat.array) / 2
776 for x := 1; x <= len(dat.array); x = x + 2 {
777 if dat.array[x] != 0 {
778 nonEmpty++
779 }
780 }
Akron03a3c612021-08-04 11:51:27 +0200781 dat.loadFactor = float64(nonEmpty) / float64(all) * 100
782 return dat.loadFactor
Akrond66a9262021-08-03 17:09:09 +0200783}
784
Akron439f4ec2021-08-09 15:45:38 +0200785// Save stores the double array data in a file
Akron3a063ef2021-08-05 19:36:35 +0200786func (dat *DaTokenizer) Save(file string) (n int64, err error) {
787 f, err := os.Create(file)
788 if err != nil {
789 log.Error().Err(err)
Akron439f4ec2021-08-09 15:45:38 +0200790 return 0, err
Akron3a063ef2021-08-05 19:36:35 +0200791 }
792 defer f.Close()
793 gz := gzip.NewWriter(f)
794 defer gz.Close()
795 n, err = dat.WriteTo(gz)
796 if err != nil {
797 log.Error().Err(err)
798 return n, err
799 }
800 gz.Flush()
801 return n, nil
802}
803
804// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200805func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
806
Akron3a063ef2021-08-05 19:36:35 +0200807 wb := bufio.NewWriter(w)
808 defer wb.Flush()
809
Akron6247a5d2021-08-03 19:18:28 +0200810 // Store magical header
Akron3a063ef2021-08-05 19:36:35 +0200811 all, err := wb.Write([]byte(MAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200812 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200813 log.Error().Err(err)
814 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200815 }
816
817 // Get sigma as a list
818 sigmalist := make([]rune, len(dat.sigma)+16)
819 max := 0
820 for sym, num := range dat.sigma {
821 sigmalist[num] = sym
822 if num > max {
823 max = num
824 }
825 }
826
827 sigmalist = sigmalist[:max+1]
828
Akron3f8571a2021-08-05 11:18:10 +0200829 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200830 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200831 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
832 bo.PutUint16(buf[4:6], uint16(dat.unknown))
833 bo.PutUint16(buf[6:8], uint16(dat.identity))
834 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200835 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akron3f8571a2021-08-05 11:18:10 +0200836 bo.PutUint32(buf[12:16], uint32(len(dat.array)))
Akron3a063ef2021-08-05 19:36:35 +0200837 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200838 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200839 log.Error().Err(err)
840 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200841 }
842
843 all += more
844
Akron6247a5d2021-08-03 19:18:28 +0200845 // Write sigma
846 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200847
Akron3a063ef2021-08-05 19:36:35 +0200848 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200849 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200850 log.Error().Err(err)
851 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200852 }
853 all += more
854 }
Akron439f4ec2021-08-09 15:45:38 +0200855
Akron6247a5d2021-08-03 19:18:28 +0200856 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200857 log.Error().Err(err)
858 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200859 }
Akron6247a5d2021-08-03 19:18:28 +0200860
861 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200862 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200863 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200864 log.Error().Err(err)
865 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200866 }
867 all += more
868
Akron3a063ef2021-08-05 19:36:35 +0200869 for x := 0; x < len(dat.array); x++ {
870 // for _, d := range dat.array {
871 bo.PutUint32(buf[0:4], dat.array[x])
872 more, err := wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200873 if err != nil {
Akron3f8571a2021-08-05 11:18:10 +0200874 log.Error().Err(err)
875 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200876 }
Akron439f4ec2021-08-09 15:45:38 +0200877 all += more
Akron3a063ef2021-08-05 19:36:35 +0200878 if more != 4 {
879 log.Error().Msg("Can not write uint32")
880 return int64(all), err
881 }
Akron6247a5d2021-08-03 19:18:28 +0200882 }
883
884 return int64(all), err
885}
886
Akron439f4ec2021-08-09 15:45:38 +0200887// LoadDatokFile reads a double array represented tokenizer
888// from a file.
Akron3f8571a2021-08-05 11:18:10 +0200889func LoadDatokFile(file string) *DaTokenizer {
890 f, err := os.Open(file)
891 if err != nil {
892 log.Error().Err(err)
Akron4db3ecf2021-08-11 18:49:03 +0200893 return nil
Akron3f8571a2021-08-05 11:18:10 +0200894 }
895 defer f.Close()
896
897 gz, err := gzip.NewReader(f)
898 if err != nil {
899 log.Error().Err(err)
Akron4db3ecf2021-08-11 18:49:03 +0200900 return nil
Akron3f8571a2021-08-05 11:18:10 +0200901 }
902 defer gz.Close()
903
Akron3a063ef2021-08-05 19:36:35 +0200904 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +0200905 return ParseDatok(gz)
906}
907
Akron439f4ec2021-08-09 15:45:38 +0200908// LoadDatokFile reads a double array represented tokenizer
909// from an io.Reader
Akron3f8571a2021-08-05 11:18:10 +0200910func ParseDatok(ior io.Reader) *DaTokenizer {
911
Akron439f4ec2021-08-09 15:45:38 +0200912 // Initialize tokenizer with default values
Akron3f8571a2021-08-05 11:18:10 +0200913 dat := &DaTokenizer{
914 sigma: make(map[rune]int),
915 epsilon: 0,
916 unknown: 0,
917 identity: 0,
918 final: 0,
919 loadFactor: 0,
920 }
921
922 r := bufio.NewReader(ior)
923
Akron3f8571a2021-08-05 11:18:10 +0200924 buf := make([]byte, 1024)
925 buf = buf[0:len(MAGIC)]
926
Akron439f4ec2021-08-09 15:45:38 +0200927 _, err := r.Read(buf)
Akron3f8571a2021-08-05 11:18:10 +0200928
929 if err != nil {
930 log.Error().Err(err)
931 return nil
932 }
933
Akron3f8571a2021-08-05 11:18:10 +0200934 if string(MAGIC) != string(buf) {
935 log.Error().Msg("Not a datok file")
936 return nil
937 }
938
Akron439f4ec2021-08-09 15:45:38 +0200939 more, err := io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +0200940 if err != nil {
941 log.Error().Err(err)
942 return nil
943 }
944
Akron439f4ec2021-08-09 15:45:38 +0200945 if more != 16 {
946 log.Error().Msg("Read bytes do not fit")
947 return nil
948 }
Akron3f8571a2021-08-05 11:18:10 +0200949
Akron3a063ef2021-08-05 19:36:35 +0200950 version := bo.Uint16(buf[0:2])
951
952 if version != VERSION {
953 log.Error().Msg("Version not compatible")
954 return nil
955 }
956
Akron3f8571a2021-08-05 11:18:10 +0200957 dat.epsilon = int(bo.Uint16(buf[2:4]))
958 dat.unknown = int(bo.Uint16(buf[4:6]))
959 dat.identity = int(bo.Uint16(buf[6:8]))
960 dat.final = int(bo.Uint16(buf[8:10]))
961
962 sigmaCount := int(bo.Uint16(buf[10:12]))
963 arraySize := int(bo.Uint32(buf[12:16]))
964
Akron3a063ef2021-08-05 19:36:35 +0200965 // Shouldn't be relevant though
966 dat.maxSize = arraySize - 1
967
Akron3f8571a2021-08-05 11:18:10 +0200968 for x := 0; x < sigmaCount; x++ {
Akron439f4ec2021-08-09 15:45:38 +0200969 sym, _, err := r.ReadRune()
Akron3f8571a2021-08-05 11:18:10 +0200970 if err == nil && sym != 0 {
971 dat.sigma[sym] = x
972 }
Akron3f8571a2021-08-05 11:18:10 +0200973 }
974
Akron439f4ec2021-08-09 15:45:38 +0200975 _, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +0200976
977 if err != nil {
978 log.Error().Err(err)
979 return nil
980 }
981
Akron3f8571a2021-08-05 11:18:10 +0200982 if string("T") != string(buf[0:1]) {
983 log.Error().Msg("Not a datok file")
984 return nil
985 }
986
987 // Read based on length
988 dat.array = make([]uint32, arraySize)
989
Akronbb4aac52021-08-13 00:52:27 +0200990 dataArray, err := io.ReadAll(r)
Akron439f4ec2021-08-09 15:45:38 +0200991
Akronbb4aac52021-08-13 00:52:27 +0200992 if err == io.EOF {
993 log.Error().Err(err)
994 return nil
995 }
996
997 if len(dataArray) < arraySize*4 {
998 log.Error().Msg("Not enough bytes read")
999 return nil
1000 }
1001
1002 for x := 0; x < arraySize; x++ {
1003 dat.array[x] = bo.Uint32(dataArray[x*4 : (x*4)+4])
Akron3f8571a2021-08-05 11:18:10 +02001004 }
1005
1006 return dat
1007}
1008
Akron439f4ec2021-08-09 15:45:38 +02001009// Show the current state of the buffer,
1010// for testing puroses
Akron3610f102021-08-08 14:13:25 +02001011func showBuffer(buffer []rune, buffo int, buffi int) string {
1012 out := make([]rune, 0, 1024)
1013 for x := 0; x < len(buffer); x++ {
1014 if buffi == x {
1015 out = append(out, '^')
1016 }
1017 if buffo == x {
1018 out = append(out, '[', buffer[x], ']')
1019 } else {
1020 out = append(out, buffer[x])
1021 }
1022 }
1023 return string(out)
1024}
1025
Akron84d68e62021-08-04 17:06:52 +02001026// Transduce an input string against the double array
Akron3610f102021-08-08 14:13:25 +02001027// FSA. The rules are always greedy. If the automaton fails,
1028// it takes the last possible token ending branch.
Akron068874c2021-08-04 15:19:56 +02001029//
Akron4db3ecf2021-08-11 18:49:03 +02001030// Based on Mizobuchi et al (2000), p. 129,
1031// with additional support for IDENTITY, UNKNOWN
1032// and EPSILON transitions and NONTOKEN and TOKENEND handling.
Akron3f8571a2021-08-05 11:18:10 +02001033func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron068874c2021-08-04 15:19:56 +02001034 var a int
Akronb4bbb472021-08-09 11:49:38 +02001035 var t0 uint32
Akronb7e1f132021-08-10 11:52:31 +02001036 t := uint32(1) // Initial state
1037 var ok, rewindBuffer bool
Akron068874c2021-08-04 15:19:56 +02001038
Akron3610f102021-08-08 14:13:25 +02001039 // Remember the last position of a possible tokenend,
1040 // in case the automaton fails.
1041 epsilonState := uint32(0)
1042 epsilonOffset := 0
1043
1044 // Implement a low level buffer for full control,
1045 // however - it is probably better to introduce
1046 // this on a higher level with a io.Reader interface
1047 // The buffer stores a single word and may have white
1048 // space at the end (but not at the beginning).
1049 //
1050 // This is the only backtracking requirement because of
1051 // epsilon transitions, to support tokenizations like:
1052 // "this is an example|.| And it works." vs
1053 // "this is an example.com| application."
Akronb7e1f132021-08-10 11:52:31 +02001054 //
1055 // TODO:
1056 // Store a translation buffer as well, so characters don't
1057 // have to be translated multiple times!
Akron3610f102021-08-08 14:13:25 +02001058 buffer := make([]rune, 1024)
1059 buffo := 0 // Buffer offset
1060 buffi := 0 // Buffer length
1061
Akron3f8571a2021-08-05 11:18:10 +02001062 reader := bufio.NewReader(r)
1063 writer := bufio.NewWriter(w)
1064 defer writer.Flush()
Akron068874c2021-08-04 15:19:56 +02001065
Akron3f8571a2021-08-05 11:18:10 +02001066 var char rune
1067 var err error
1068 eof := false
Akronb7e1f132021-08-10 11:52:31 +02001069 newchar := true
Akron3f8571a2021-08-05 11:18:10 +02001070
Akronc5d8d432021-08-10 16:48:44 +02001071PARSECHAR:
Akron3f8571a2021-08-05 11:18:10 +02001072 for {
1073
Akronb7e1f132021-08-10 11:52:31 +02001074 if newchar {
1075 // Get from reader if buffer is empty
1076 if buffo >= buffi {
Akron1594cb82021-08-11 11:14:56 +02001077 if eof {
1078 break
1079 }
Akronb7e1f132021-08-10 11:52:31 +02001080 char, _, err = reader.ReadRune()
Akron439f4ec2021-08-09 15:45:38 +02001081
Akronb7e1f132021-08-10 11:52:31 +02001082 // No more runes to read
1083 if err != nil {
1084 eof = true
1085 break
1086 }
1087 buffer[buffi] = char
1088 buffi++
Akron3f8571a2021-08-05 11:18:10 +02001089 }
Akronb7e1f132021-08-10 11:52:31 +02001090
1091 char = buffer[buffo]
1092
1093 if DEBUG {
1094 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
1095 }
1096
1097 // TODO: Better not repeatedly check for a!
1098 a, ok = dat.sigma[char]
1099
1100 // Use identity symbol if character is not in sigma
1101 if !ok && dat.identity != -1 {
1102 a = dat.identity
1103 }
1104
1105 t0 = t
1106
1107 // Check for epsilon transitions and remember
1108 if dat.getCheck(dat.getBase(t0)+uint32(dat.epsilon)) == t0 {
1109 // Remember state for backtracking to last tokenend state
1110 epsilonState = t0
1111 epsilonOffset = buffo
1112 }
Akron3f8571a2021-08-05 11:18:10 +02001113 }
Akron3610f102021-08-08 14:13:25 +02001114
Akronb7e1f132021-08-10 11:52:31 +02001115 // Checks a transition based on t0, a and buffo
Akronb4bbb472021-08-09 11:49:38 +02001116 t = dat.getBase(t0) + uint32(a)
Akron068874c2021-08-04 15:19:56 +02001117
Akron524c5432021-08-05 14:14:27 +02001118 if DEBUG {
Akronb7e1f132021-08-10 11:52:31 +02001119 // Char is only relevant if set
1120 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
1121 if false {
1122 fmt.Println(dat.outgoing(t0))
1123 }
Akron524c5432021-08-05 14:14:27 +02001124 }
1125
Akronb7e1f132021-08-10 11:52:31 +02001126 // Check if the transition is invalid according to the double array
Akronb4bbb472021-08-09 11:49:38 +02001127 if t > dat.getCheck(1) || dat.getCheck(t) != t0 {
Akron068874c2021-08-04 15:19:56 +02001128
1129 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001130 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", t0)
Akron068874c2021-08-04 15:19:56 +02001131 }
1132
1133 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +02001134
Akron068874c2021-08-04 15:19:56 +02001135 // Try again with unknown symbol, in case identity failed
Akronb7e1f132021-08-10 11:52:31 +02001136 // Char is only relevant when set
Akron068874c2021-08-04 15:19:56 +02001137 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001138 fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +02001139 }
1140 a = dat.unknown
1141
1142 } else if a != dat.epsilon {
Akronb4bbb472021-08-09 11:49:38 +02001143
Akron068874c2021-08-04 15:19:56 +02001144 // Try again with epsilon symbol, in case everything else failed
Akronb4bbb472021-08-09 11:49:38 +02001145 t0 = epsilonState
Akron3610f102021-08-08 14:13:25 +02001146 epsilonState = 0 // reset
1147 buffo = epsilonOffset
Akron439f4ec2021-08-09 15:45:38 +02001148 a = dat.epsilon
1149
Akron3610f102021-08-08 14:13:25 +02001150 if DEBUG {
1151 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
1152 }
Akronb4bbb472021-08-09 11:49:38 +02001153
Akron068874c2021-08-04 15:19:56 +02001154 } else {
1155 break
1156 }
Akron068874c2021-08-04 15:19:56 +02001157
Akronb7e1f132021-08-10 11:52:31 +02001158 newchar = false
1159 continue
Akronb4bbb472021-08-09 11:49:38 +02001160 }
1161
Akronb7e1f132021-08-10 11:52:31 +02001162 // Transition was successful
1163 rewindBuffer = false
Akron439f4ec2021-08-09 15:45:38 +02001164
1165 // Transition consumes a character
Akronb4bbb472021-08-09 11:49:38 +02001166 if a != dat.epsilon {
1167
Akron3610f102021-08-08 14:13:25 +02001168 buffo++
Akronb4bbb472021-08-09 11:49:38 +02001169
Akron439f4ec2021-08-09 15:45:38 +02001170 // Transition does not produce a character
Akronc5d8d432021-08-10 16:48:44 +02001171 if buffo == 1 && dat.isNonToken(t) {
Akron3610f102021-08-08 14:13:25 +02001172 if DEBUG {
1173 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
1174 }
Akron439f4ec2021-08-09 15:45:38 +02001175 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001176 }
Akron3f8571a2021-08-05 11:18:10 +02001177 }
Akron068874c2021-08-04 15:19:56 +02001178
Akronc5d8d432021-08-10 16:48:44 +02001179 // Transition marks the end of a token - so flush the buffer
Akronb7e1f132021-08-10 11:52:31 +02001180 if dat.isTokenEnd(t) {
Akron524c5432021-08-05 14:14:27 +02001181
Akronc5d8d432021-08-10 16:48:44 +02001182 if buffi > 0 {
Akronc5d8d432021-08-10 16:48:44 +02001183 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +02001184 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
Akronc5d8d432021-08-10 16:48:44 +02001185 }
Akron01912fc2021-08-12 11:41:58 +02001186 writer.WriteString(string(buffer[:buffo]))
Akronc5d8d432021-08-10 16:48:44 +02001187 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001188 }
Akron1594cb82021-08-11 11:14:56 +02001189 if DEBUG {
1190 fmt.Println("-> Newline")
1191 }
1192 writer.WriteRune('\n')
Akron439f4ec2021-08-09 15:45:38 +02001193 }
Akron3610f102021-08-08 14:13:25 +02001194
Akronc5d8d432021-08-10 16:48:44 +02001195 // Rewind the buffer if necessary
Akron439f4ec2021-08-09 15:45:38 +02001196 if rewindBuffer {
1197
1198 // TODO: Better as a ring buffer
Akron3610f102021-08-08 14:13:25 +02001199 for x, i := range buffer[buffo:buffi] {
1200 buffer[x] = i
1201 }
Akronb4bbb472021-08-09 11:49:38 +02001202
Akron3610f102021-08-08 14:13:25 +02001203 buffi -= buffo
1204 epsilonOffset -= buffo
1205 buffo = 0
1206 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001207 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
Akron3610f102021-08-08 14:13:25 +02001208 }
Akron84d68e62021-08-04 17:06:52 +02001209 }
1210
Akronb7e1f132021-08-10 11:52:31 +02001211 // Move to representative state
1212 if dat.isSeparate(t) {
1213 t = dat.getBase(t)
1214
1215 if DEBUG {
1216 fmt.Println("Representative pointing to", t)
1217 }
1218 }
1219
Akronc5d8d432021-08-10 16:48:44 +02001220 newchar = true
1221
Akron068874c2021-08-04 15:19:56 +02001222 // TODO:
1223 // Prevent endless epsilon loops!
1224 }
1225
Akron439f4ec2021-08-09 15:45:38 +02001226 // Input reader is not yet finished
Akron3f8571a2021-08-05 11:18:10 +02001227 if !eof {
Akron068874c2021-08-04 15:19:56 +02001228 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001229 fmt.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +02001230 }
1231 return false
1232 }
1233
Akronb7e1f132021-08-10 11:52:31 +02001234 if DEBUG {
1235 fmt.Println("Entering final check")
1236 }
1237
Akronc5d8d432021-08-10 16:48:44 +02001238 // Automaton is in a final state, so flush the buffer and return
Akron068874c2021-08-04 15:19:56 +02001239 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
Akronb4bbb472021-08-09 11:49:38 +02001240
1241 if buffi > 0 {
Akronb4bbb472021-08-09 11:49:38 +02001242 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +02001243 fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
Akron3f8571a2021-08-05 11:18:10 +02001244 }
Akron01912fc2021-08-12 11:41:58 +02001245 writer.WriteString(string(buffer[:buffi]))
Akron6e70dc82021-08-11 11:33:18 +02001246
Akrondf0a3ef2021-08-09 15:53:45 +02001247 if dat.isTokenEnd(t) {
1248 writer.WriteRune('\n')
Akronc5d8d432021-08-10 16:48:44 +02001249 if DEBUG {
1250 fmt.Println("-> Newline")
1251 }
Akrondf0a3ef2021-08-09 15:53:45 +02001252 }
Akron84d68e62021-08-04 17:06:52 +02001253 }
1254
Akron6e70dc82021-08-11 11:33:18 +02001255 // Add an additional sentence ending, if the file is over but no explicit
1256 // sentence split was reached. This may be controversial and therefore
1257 // optional via parameter.
1258 if !dat.isTokenEnd(t0) {
1259 writer.WriteRune('\n')
1260 if DEBUG {
1261 fmt.Println("-> Newline")
1262 }
1263 }
1264
Akron84d68e62021-08-04 17:06:52 +02001265 // There may be a new line at the end, from an epsilon, so we go on!
Akron068874c2021-08-04 15:19:56 +02001266 return true
1267 }
1268
Akronc5d8d432021-08-10 16:48:44 +02001269 // Check epsilon transitions until a final state is reached
1270 t0 = t
1271 t = dat.getBase(t0) + uint32(dat.epsilon)
Akron01912fc2021-08-12 11:41:58 +02001272 a = dat.epsilon
1273 newchar = false
Akronc5d8d432021-08-10 16:48:44 +02001274 if dat.getCheck(t) == t0 {
1275 // Remember state for backtracking to last tokenend state
Akronc5d8d432021-08-10 16:48:44 +02001276 goto PARSECHAR
1277 } else if epsilonState != 0 {
Akronb7e1f132021-08-10 11:52:31 +02001278 t0 = epsilonState
1279 epsilonState = 0 // reset
1280 buffo = epsilonOffset
Akron068874c2021-08-04 15:19:56 +02001281 if DEBUG {
Akronc5d8d432021-08-10 16:48:44 +02001282 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
Akron068874c2021-08-04 15:19:56 +02001283 }
Akronc5d8d432021-08-10 16:48:44 +02001284 goto PARSECHAR
Akron068874c2021-08-04 15:19:56 +02001285 }
Akronc5d8d432021-08-10 16:48:44 +02001286 return false
Akron068874c2021-08-04 15:19:56 +02001287}