blob: 4e870d9bde165a1cb5933a264e911ece1f3b7d93 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
Akronb4bbb472021-08-09 11:49:38 +02009// The maximum number of states is 1.073.741.823 (30bit),
Akron83e75a22021-08-04 13:14:06 +020010// with a loadfactor of ~70, this means roughly 70 million
11// states in the FSA, which is sufficient for the current
12// job.
Akron03c92fe2021-08-09 14:07:57 +020013//
14// Serialization is little endian.
Akron83e75a22021-08-04 13:14:06 +020015
Akron740f3d72021-08-03 12:12:34 +020016// TODO:
Akrone61380b2021-08-16 10:10:46 +020017// - Turn sigma into an array instead of using a map
18// and improve the mapping beforehand so that ASCII
19// is mapped directly and only non-ASCII needs to be
20// looked up in a map or similar.
Akron740f3d72021-08-03 12:12:34 +020021// - replace maxSize with the check value
Akron3a063ef2021-08-05 19:36:35 +020022// - Add checksum to serialization.
Akron03c92fe2021-08-09 14:07:57 +020023// - Introduce methods on BC array entries instead of
24// jumping into the entries all the time!
25// - Instead of memoizing the loadFactor, better remember
26// the number of set transitions
Akrone61380b2021-08-16 10:10:46 +020027// - Replace/Enhance table with a map
28// - Provide a bufio.Scanner compatible interface.
Akron8e1d69b2021-08-12 17:38:49 +020029// - Mark epsilon transitions in bytes
Akron740f3d72021-08-03 12:12:34 +020030
Akron8ef408b2021-08-02 22:11:04 +020031import (
32 "bufio"
33 "compress/gzip"
Akron6247a5d2021-08-03 19:18:28 +020034 "encoding/binary"
Akron8ef408b2021-08-02 22:11:04 +020035 "fmt"
36 "io"
37 "os"
Akronc9d84a62021-08-03 15:56:03 +020038 "sort"
Akron8ef408b2021-08-02 22:11:04 +020039 "strconv"
40 "strings"
41 "unicode/utf8"
Akron740f3d72021-08-03 12:12:34 +020042
Akron527c10c2021-08-13 01:45:18 +020043 "log"
Akron8ef408b2021-08-02 22:11:04 +020044)
45
46const (
Akron2a4b9292021-08-04 15:35:22 +020047 PROPS = 1
48 SIGMA = 2
49 STATES = 3
50 NONE = 4
Akron2a4b9292021-08-04 15:35:22 +020051 DEBUG = false
52 MAGIC = "DATOK"
53 VERSION = uint16(1)
Akron03c92fe2021-08-09 14:07:57 +020054 FIRSTBIT uint32 = 1 << 31
55 SECONDBIT uint32 = 1 << 30
56 RESTBIT uint32 = ^uint32(0) &^ (FIRSTBIT | SECONDBIT)
Akron8ef408b2021-08-02 22:11:04 +020057)
58
Akron03c92fe2021-08-09 14:07:57 +020059// Serialization is always little endian
Akron6247a5d2021-08-03 19:18:28 +020060var bo binary.ByteOrder = binary.LittleEndian
61
Akron8ef408b2021-08-02 22:11:04 +020062type mapping struct {
63 source int
Akron3fdfec62021-08-04 11:40:10 +020064 target uint32
Akron8ef408b2021-08-02 22:11:04 +020065}
66
67type edge struct {
Akron83e75a22021-08-04 13:14:06 +020068 inSym int
69 outSym int
70 end int
71 nontoken bool
72 tokenend bool
Akron8ef408b2021-08-02 22:11:04 +020073}
74
Akron03c92fe2021-08-09 14:07:57 +020075// Tokenizer is the intermediate representation
76// of the tokenizer.
Akron8ef408b2021-08-02 22:11:04 +020077type Tokenizer struct {
Akron740f3d72021-08-03 12:12:34 +020078 sigmaRev map[int]rune
79 arcCount int
Akron740f3d72021-08-03 12:12:34 +020080 sigmaCount int
Akron8ef408b2021-08-02 22:11:04 +020081 transitions []map[int]*edge
Akronc17f1ca2021-08-03 19:47:27 +020082
83 // Special symbols in sigma
84 epsilon int
85 unknown int
86 identity int
87 final int
Akron03c92fe2021-08-09 14:07:57 +020088 tokenend int
Akron8ef408b2021-08-02 22:11:04 +020089}
90
Akron03c92fe2021-08-09 14:07:57 +020091// DaTokenizer represents a tokenizer implemented as a
92// Double Array FSA.
Akronf2120ca2021-08-03 16:26:41 +020093type DaTokenizer struct {
Akrone61380b2021-08-16 10:10:46 +020094 sigma map[rune]int
95 // sigmaList []rune
Akron03a3c612021-08-04 11:51:27 +020096 maxSize int
97 loadFactor float64
98 array []uint32
Akron3f8571a2021-08-05 11:18:10 +020099 // lastFilledBase uint32
Akronc17f1ca2021-08-03 19:47:27 +0200100
101 // Special symbols in sigma
102 epsilon int
103 unknown int
104 identity int
105 final int
Akron03c92fe2021-08-09 14:07:57 +0200106 tokenend int
Akronf2120ca2021-08-03 16:26:41 +0200107}
108
Akron03c92fe2021-08-09 14:07:57 +0200109// ParseFoma reads the FST from a foma file
110// and creates an internal representation,
111// in case it follows the tokenizer's convention.
Akron64ffd9a2021-08-03 19:55:21 +0200112func LoadFomaFile(file string) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200113 f, err := os.Open(file)
114 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200115 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200116 return nil
Akron8ef408b2021-08-02 22:11:04 +0200117 }
118 defer f.Close()
119
120 gz, err := gzip.NewReader(f)
121 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200122 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200123 return nil
Akron8ef408b2021-08-02 22:11:04 +0200124 }
125 defer gz.Close()
126
Akron3fdfec62021-08-04 11:40:10 +0200127 return ParseFoma(gz)
Akron8ef408b2021-08-02 22:11:04 +0200128}
129
Akron03c92fe2021-08-09 14:07:57 +0200130// ParseFoma reads the FST from a foma file reader
131// and creates an internal representation,
132// in case it follows the tokenizer's convention.
Akron3fdfec62021-08-04 11:40:10 +0200133func ParseFoma(ior io.Reader) *Tokenizer {
Akron8ef408b2021-08-02 22:11:04 +0200134 r := bufio.NewReader(ior)
135
136 tok := &Tokenizer{
Akron740f3d72021-08-03 12:12:34 +0200137 sigmaRev: make(map[int]rune),
Akronc17f1ca2021-08-03 19:47:27 +0200138 epsilon: -1,
139 unknown: -1,
140 identity: -1,
141 final: -1,
Akron03c92fe2021-08-09 14:07:57 +0200142 tokenend: -1,
Akron8ef408b2021-08-02 22:11:04 +0200143 }
144
Akron740f3d72021-08-03 12:12:34 +0200145 var state, inSym, outSym, end, final int
Akron8ef408b2021-08-02 22:11:04 +0200146
147 mode := 0
148 var elem []string
149 var elemint [5]int
150
Akron03c92fe2021-08-09 14:07:57 +0200151 // Iterate over all lines of the file.
152 // This is mainly based on foma2js,
153 // licensed under the Apache License, version 2,
154 // and written by Mans Hulden.
Akron8ef408b2021-08-02 22:11:04 +0200155 for {
156 line, err := r.ReadString('\n')
157 if err != nil {
158 if err == io.EOF {
159 break
160 }
Akron527c10c2021-08-13 01:45:18 +0200161 log.Print(err)
Akron4db3ecf2021-08-11 18:49:03 +0200162 return nil
Akron8ef408b2021-08-02 22:11:04 +0200163 }
Akron8ef408b2021-08-02 22:11:04 +0200164
Akron439f4ec2021-08-09 15:45:38 +0200165 // Read parser mode for the following lines
166 if strings.HasPrefix(line, "##") {
167 if strings.HasPrefix(line, "##props##") {
168 mode = PROPS
169
170 } else if strings.HasPrefix(line, "##states##") {
171 mode = STATES
172
173 // Adds a final transition symbol to sigma
174 // written as '#' in Mizobuchi et al (2000)
175 tok.sigmaCount++
176 tok.final = tok.sigmaCount
177
178 } else if strings.HasPrefix(line, "##sigma##") {
179
180 mode = SIGMA
181
182 } else if strings.HasPrefix(line, "##end##") {
183
184 mode = NONE
185
186 } else if !strings.HasPrefix(line, "##foma-net") {
Akron527c10c2021-08-13 01:45:18 +0200187 log.Print("Unknown input line")
Akron439f4ec2021-08-09 15:45:38 +0200188 break
189 }
Akron8ef408b2021-08-02 22:11:04 +0200190 continue
191 }
192
Akron439f4ec2021-08-09 15:45:38 +0200193 // Based on the current parser mode, interpret the lines
Akron8ef408b2021-08-02 22:11:04 +0200194 switch mode {
195 case PROPS:
196 {
197 elem = strings.Split(line, " ")
198 /*
199 fmt.Println("arity: " + elem[0])
200 fmt.Println("arccount: " + elem[1])
201 fmt.Println("statecount: " + elem[2])
202 fmt.Println("linecount: " + elem[3])
203 fmt.Println("finalcount: " + elem[4])
204 fmt.Println("pathcount: " + elem[5])
205 fmt.Println("is_deterministic: " + elem[6])
206 fmt.Println("is_pruned: " + elem[7])
207 fmt.Println("is_minimized: " + elem[8])
208 fmt.Println("is_epsilon_free: " + elem[9])
209 fmt.Println("is_loop_free: " + elem[10])
210 fmt.Println("extras: " + elem[11])
211 fmt.Println("name: " + elem[12])
212 */
213 if elem[6] != "1" {
Akron527c10c2021-08-13 01:45:18 +0200214 log.Print("The FST needs to be deterministic")
Akron4db3ecf2021-08-11 18:49:03 +0200215 return nil
Akron8ef408b2021-08-02 22:11:04 +0200216 }
Akron439f4ec2021-08-09 15:45:38 +0200217
Akron8ef408b2021-08-02 22:11:04 +0200218 if elem[9] != "1" {
Akron527c10c2021-08-13 01:45:18 +0200219 log.Print("The FST needs to be epsilon free")
Akron4db3ecf2021-08-11 18:49:03 +0200220 return nil
Akron8ef408b2021-08-02 22:11:04 +0200221 }
222
223 elemint[0], err = strconv.Atoi(elem[1])
224 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200225 log.Print("Can't read arccount")
Akron4db3ecf2021-08-11 18:49:03 +0200226 return nil
Akron8ef408b2021-08-02 22:11:04 +0200227 }
Akron740f3d72021-08-03 12:12:34 +0200228 tok.arcCount = elemint[0]
Akron8ef408b2021-08-02 22:11:04 +0200229
Akron8ef408b2021-08-02 22:11:04 +0200230 elemint[0], err = strconv.Atoi(elem[2])
231 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200232 log.Print("Can't read statecount")
Akron4db3ecf2021-08-11 18:49:03 +0200233 return nil
Akron8ef408b2021-08-02 22:11:04 +0200234 }
Akron439f4ec2021-08-09 15:45:38 +0200235
236 // States start at 1 in Mizobuchi et al (2000),
237 // as the state 0 is associated with a fail.
238 // Initialize states and transitions
Akron8ef408b2021-08-02 22:11:04 +0200239 tok.transitions = make([]map[int]*edge, elemint[0]+1)
240 continue
241 }
242 case STATES:
243 {
244 elem = strings.Split(line[0:len(line)-1], " ")
245 if elem[0] == "-1" {
Akron3610f102021-08-08 14:13:25 +0200246 if DEBUG {
247 fmt.Println("Skip", elem)
248 }
Akron8ef408b2021-08-02 22:11:04 +0200249 continue
250 }
251 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200252 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200253 fmt.Println("Unable to translate", elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200254 break
255 }
Akron8ef408b2021-08-02 22:11:04 +0200256
257 if len(elem) > 1 {
258 elemint[1], err = strconv.Atoi(elem[1])
259 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200260 fmt.Println("Unable to translate", elem[1])
Akron8ef408b2021-08-02 22:11:04 +0200261 break
262 }
263 if len(elem) > 2 {
264 elemint[2], err = strconv.Atoi(elem[2])
265 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200266 fmt.Println("Unable to translate", elem[2])
Akron8ef408b2021-08-02 22:11:04 +0200267 break
268 }
269 if len(elem) > 3 {
270 elemint[3], err = strconv.Atoi(elem[3])
271 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200272 fmt.Println("Unable to translate", elem[3])
Akron8ef408b2021-08-02 22:11:04 +0200273 break
274 }
275 if len(elem) > 4 {
276 elemint[4], err = strconv.Atoi(elem[4])
277 if err != nil {
Akron3610f102021-08-08 14:13:25 +0200278 fmt.Println("Unable to translate", elem[4])
Akron8ef408b2021-08-02 22:11:04 +0200279 break
280 }
281 }
282 }
283 }
284 }
285
286 switch len(elem) {
287 case 5:
288 {
Akron740f3d72021-08-03 12:12:34 +0200289 state = elemint[0]
290 inSym = elemint[1]
291 outSym = elemint[2]
292 end = elemint[3]
293 final = elemint[4]
Akron8ef408b2021-08-02 22:11:04 +0200294 }
295 case 4:
296 {
297 if elemint[1] == -1 {
Akron740f3d72021-08-03 12:12:34 +0200298 state = elemint[0]
299 final = elemint[3]
Akron8ef408b2021-08-02 22:11:04 +0200300 } else {
Akron740f3d72021-08-03 12:12:34 +0200301 state = elemint[0]
302 inSym = elemint[1]
303 end = elemint[2]
304 final = elemint[3]
305 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200306 }
307 }
308 case 3:
309 {
Akron740f3d72021-08-03 12:12:34 +0200310 inSym = elemint[0]
311 outSym = elemint[1]
312 end = elemint[2]
Akron8ef408b2021-08-02 22:11:04 +0200313 }
314 case 2:
315 {
Akron740f3d72021-08-03 12:12:34 +0200316 inSym = elemint[0]
317 end = elemint[1]
318 outSym = inSym
Akron8ef408b2021-08-02 22:11:04 +0200319 }
320 }
321
Akron83e75a22021-08-04 13:14:06 +0200322 nontoken := false
323 tokenend := false
324
Akron439f4ec2021-08-09 15:45:38 +0200325 // While the states in foma start with 0, the states in the
326 // Mizobuchi FSA start with one - so we increase every state by 1.
327 // We also increase sigma by 1, so there are no 0 transitions.
Akron524c5432021-08-05 14:14:27 +0200328 inSym++
329 outSym++
330
Akron439f4ec2021-08-09 15:45:38 +0200331 // Only a limited list of transitions are allowed
Akron740f3d72021-08-03 12:12:34 +0200332 if inSym != outSym {
Akron01912fc2021-08-12 11:41:58 +0200333 if outSym == tok.tokenend && inSym == tok.epsilon {
Akron83e75a22021-08-04 13:14:06 +0200334 tokenend = true
335 } else if outSym == tok.epsilon {
336 nontoken = true
337 } else {
Akron527c10c2021-08-13 01:45:18 +0200338 log.Println(
Akron740f3d72021-08-03 12:12:34 +0200339 "Unsupported transition: " +
340 strconv.Itoa(state) +
341 " -> " + strconv.Itoa(end) +
Akron75ebe7f2021-08-03 10:34:10 +0200342 " (" +
Akron740f3d72021-08-03 12:12:34 +0200343 strconv.Itoa(inSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200344 ":" +
Akron740f3d72021-08-03 12:12:34 +0200345 strconv.Itoa(outSym) +
Akron75ebe7f2021-08-03 10:34:10 +0200346 ") (" +
Akron740f3d72021-08-03 12:12:34 +0200347 string(tok.sigmaRev[inSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200348 ":" +
Akron740f3d72021-08-03 12:12:34 +0200349 string(tok.sigmaRev[outSym]) +
Akron75ebe7f2021-08-03 10:34:10 +0200350 ")")
Akron4db3ecf2021-08-11 18:49:03 +0200351 return nil
Akron75ebe7f2021-08-03 10:34:10 +0200352 }
Akron83e75a22021-08-04 13:14:06 +0200353
Akron83e75a22021-08-04 13:14:06 +0200354 } else if inSym == tok.epsilon {
Akron527c10c2021-08-13 01:45:18 +0200355 log.Println("General epsilon transitions are not supported")
Akron4db3ecf2021-08-11 18:49:03 +0200356 return nil
Akron8ef408b2021-08-02 22:11:04 +0200357 }
358
Akron03c92fe2021-08-09 14:07:57 +0200359 // Create an edge based on the collected information
Akron8ef408b2021-08-02 22:11:04 +0200360 targetObj := &edge{
Akron83e75a22021-08-04 13:14:06 +0200361 inSym: inSym,
362 outSym: outSym,
363 end: end + 1,
364 tokenend: tokenend,
365 nontoken: nontoken,
Akron8ef408b2021-08-02 22:11:04 +0200366 }
367
Akron740f3d72021-08-03 12:12:34 +0200368 // Initialize outgoing states
369 if tok.transitions[state+1] == nil {
370 tok.transitions[state+1] = make(map[int]*edge)
Akron8ef408b2021-08-02 22:11:04 +0200371 }
372
Akron740f3d72021-08-03 12:12:34 +0200373 // Ignore transitions with invalid symbols
374 if inSym >= 0 {
375 tok.transitions[state+1][inSym] = targetObj
Akron75ebe7f2021-08-03 10:34:10 +0200376 }
Akron8ef408b2021-08-02 22:11:04 +0200377
Akron740f3d72021-08-03 12:12:34 +0200378 // Add final transition
379 if final == 1 {
Akron03c92fe2021-08-09 14:07:57 +0200380 // TODO:
381 // Maybe this is less relevant for tokenizers
Akronc17f1ca2021-08-03 19:47:27 +0200382 tok.transitions[state+1][tok.final] = &edge{}
Akron8ef408b2021-08-02 22:11:04 +0200383 }
384
Akronb4bbb472021-08-09 11:49:38 +0200385 if DEBUG {
Akron740f3d72021-08-03 12:12:34 +0200386 fmt.Println("Add",
387 state+1, "->", end+1,
388 "(",
389 inSym,
390 ":",
391 outSym,
392 ") (",
393 string(tok.sigmaRev[inSym]),
394 ":",
395 string(tok.sigmaRev[outSym]),
Akron524c5432021-08-05 14:14:27 +0200396 ")",
397 ";",
398 "TE:", tokenend,
Akron3610f102021-08-08 14:13:25 +0200399 "NT:", nontoken,
400 "FIN:", final)
Akron740f3d72021-08-03 12:12:34 +0200401 }
Akron75ebe7f2021-08-03 10:34:10 +0200402
Akron8ef408b2021-08-02 22:11:04 +0200403 continue
404 }
405 case SIGMA:
406 {
407 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
408
409 // Turn string into sigma id
410 number, err := strconv.Atoi(elem[0])
411
Akron524c5432021-08-05 14:14:27 +0200412 // ID needs to be > 1
413 number++
414
Akron8ef408b2021-08-02 22:11:04 +0200415 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200416 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200417 return nil
Akron8ef408b2021-08-02 22:11:04 +0200418 }
419
Akron740f3d72021-08-03 12:12:34 +0200420 tok.sigmaCount = number
Akron8ef408b2021-08-02 22:11:04 +0200421
422 var symbol rune
423
424 // Read rune
425 if utf8.RuneCountInString(elem[1]) == 1 {
426 symbol = []rune(elem[1])[0]
427
Akron8ef408b2021-08-02 22:11:04 +0200428 } else if utf8.RuneCountInString(elem[1]) > 1 {
Akron439f4ec2021-08-09 15:45:38 +0200429
430 // Probably a MCS
Akron8ef408b2021-08-02 22:11:04 +0200431 switch elem[1] {
432 case "@_EPSILON_SYMBOL_@":
433 {
Akronc17f1ca2021-08-03 19:47:27 +0200434 tok.epsilon = number
Akron8ef408b2021-08-02 22:11:04 +0200435 }
436 case "@_UNKNOWN_SYMBOL_@":
437 {
Akronc17f1ca2021-08-03 19:47:27 +0200438 tok.unknown = number
Akron8ef408b2021-08-02 22:11:04 +0200439 }
440
441 case "@_IDENTITY_SYMBOL_@":
442 {
Akronc17f1ca2021-08-03 19:47:27 +0200443 tok.identity = number
Akron8ef408b2021-08-02 22:11:04 +0200444 }
Akron03c92fe2021-08-09 14:07:57 +0200445
446 case "@_TOKEN_SYMBOL_@":
447 {
448 tok.tokenend = number
Akron03c92fe2021-08-09 14:07:57 +0200449 }
Akron8ef408b2021-08-02 22:11:04 +0200450 default:
Akron740f3d72021-08-03 12:12:34 +0200451 {
Akron527c10c2021-08-13 01:45:18 +0200452 log.Println("MCS not supported: " + line)
Akron4db3ecf2021-08-11 18:49:03 +0200453 return nil
Akron740f3d72021-08-03 12:12:34 +0200454 }
Akron8ef408b2021-08-02 22:11:04 +0200455 }
Akron439f4ec2021-08-09 15:45:38 +0200456 continue
Akron8ef408b2021-08-02 22:11:04 +0200457
Akron740f3d72021-08-03 12:12:34 +0200458 } else { // Probably a new line symbol
Akron8ef408b2021-08-02 22:11:04 +0200459 line, err = r.ReadString('\n')
460 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200461 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200462 return nil
Akron8ef408b2021-08-02 22:11:04 +0200463 }
464 if len(line) != 1 {
Akron527c10c2021-08-13 01:45:18 +0200465 log.Println("MCS not supported:" + line)
Akron4db3ecf2021-08-11 18:49:03 +0200466 return nil
Akron8ef408b2021-08-02 22:11:04 +0200467 }
Akron03c92fe2021-08-09 14:07:57 +0200468 symbol = rune('\n')
Akron8ef408b2021-08-02 22:11:04 +0200469 }
470
Akron740f3d72021-08-03 12:12:34 +0200471 tok.sigmaRev[number] = symbol
Akron8ef408b2021-08-02 22:11:04 +0200472 }
473 }
474 }
475
476 return tok
477}
478
Akron64ffd9a2021-08-03 19:55:21 +0200479// Set alphabet A to the list of all symbols
480// outgoing from s
Akron439f4ec2021-08-09 15:45:38 +0200481func (tok *Tokenizer) getSet(s int, A *[]int) {
Akron64ffd9a2021-08-03 19:55:21 +0200482 for a := range tok.transitions[s] {
483 *A = append(*A, a)
484 }
485
486 // Not required, but simplifies bug hunting
Akron439f4ec2021-08-09 15:45:38 +0200487 // sort.Ints(*A)
Akron64ffd9a2021-08-03 19:55:21 +0200488}
489
Akron439f4ec2021-08-09 15:45:38 +0200490// ToDoubleArray turns the intermediate tokenizer representation
491// into a double array representation.
492//
493// This is based on Mizobuchi et al (2000), p.128
Akronf2120ca2021-08-03 16:26:41 +0200494func (tok *Tokenizer) ToDoubleArray() *DaTokenizer {
495
496 dat := &DaTokenizer{
Akron03a3c612021-08-04 11:51:27 +0200497 sigma: make(map[rune]int),
498 loadFactor: -1,
499 final: tok.final,
500 unknown: tok.unknown,
501 identity: tok.identity,
502 epsilon: tok.epsilon,
Akron03c92fe2021-08-09 14:07:57 +0200503 tokenend: tok.tokenend,
Akron3f8571a2021-08-05 11:18:10 +0200504 // lastFilledBase: 1,
Akronf2120ca2021-08-03 16:26:41 +0200505 }
506
Akrone61380b2021-08-16 10:10:46 +0200507 // dat.sigmaList = make([]rune, tok.sigmaCount)
508
Akronf2120ca2021-08-03 16:26:41 +0200509 for num, sym := range tok.sigmaRev {
510 dat.sigma[sym] = num
Akrone61380b2021-08-16 10:10:46 +0200511 // dat.sigmaList[num] = sym
Akronf2120ca2021-08-03 16:26:41 +0200512 }
Akron8ef408b2021-08-02 22:11:04 +0200513
514 mark := 0
515 size := 0
516
Akron439f4ec2021-08-09 15:45:38 +0200517 // Create a mapping from s (in Ms aka Intermediate FSA)
518 // to t (in Mt aka Double Array FSA)
Akron740f3d72021-08-03 12:12:34 +0200519 table := make([]*mapping, tok.arcCount+1)
Akron8ef408b2021-08-02 22:11:04 +0200520
Akron439f4ec2021-08-09 15:45:38 +0200521 // Initialize with the start state
Akron8ef408b2021-08-02 22:11:04 +0200522 table[size] = &mapping{source: 1, target: 1}
523 size++
524
Akron740f3d72021-08-03 12:12:34 +0200525 // Allocate space for the outgoing symbol range
526 A := make([]int, 0, tok.sigmaCount)
Akron8ef408b2021-08-02 22:11:04 +0200527
528 for mark < size {
529 s := table[mark].source // This is a state in Ms
530 t := table[mark].target // This is a state in Mt
531 mark++
Akron740f3d72021-08-03 12:12:34 +0200532
533 // Following the paper, here the state t can be remembered
534 // in the set of states St
Akron8ef408b2021-08-02 22:11:04 +0200535 A = A[:0]
Akron439f4ec2021-08-09 15:45:38 +0200536 tok.getSet(s, &A)
Akron8ef408b2021-08-02 22:11:04 +0200537
Akron740f3d72021-08-03 12:12:34 +0200538 // Set base to the first free slot in the double array
Akronf2120ca2021-08-03 16:26:41 +0200539 dat.setBase(t, dat.xCheck(A))
Akron8ef408b2021-08-02 22:11:04 +0200540
Akron773b1ef2021-08-03 17:37:20 +0200541 // TODO:
Akron068874c2021-08-04 15:19:56 +0200542 // Sort the outgoing transitions based on the
Akron773b1ef2021-08-03 17:37:20 +0200543 // outdegree of .end
544
Akron740f3d72021-08-03 12:12:34 +0200545 // Iterate over all outgoing symbols
Akron8ef408b2021-08-02 22:11:04 +0200546 for _, a := range A {
547
Akronc17f1ca2021-08-03 19:47:27 +0200548 if a != tok.final {
Akron8ef408b2021-08-02 22:11:04 +0200549
Akron740f3d72021-08-03 12:12:34 +0200550 // Aka g(s, a)
551 s1 := tok.transitions[s][a].end
Akron8ef408b2021-08-02 22:11:04 +0200552
Akron740f3d72021-08-03 12:12:34 +0200553 // Store the transition
Akron3fdfec62021-08-04 11:40:10 +0200554 t1 := dat.getBase(t) + uint32(a)
Akronf2120ca2021-08-03 16:26:41 +0200555 dat.setCheck(t1, t)
Akron8ef408b2021-08-02 22:11:04 +0200556
Akron439f4ec2021-08-09 15:45:38 +0200557 if DEBUG {
Akron524c5432021-08-05 14:14:27 +0200558 fmt.Println("Translate transition",
559 s, "->", s1, "(", a, ")", "to", t, "->", t1)
560 }
561
Akron83e75a22021-08-04 13:14:06 +0200562 // Mark the state as being the target of a nontoken transition
563 if tok.transitions[s][a].nontoken {
Akron068874c2021-08-04 15:19:56 +0200564 dat.setNonToken(t1, true)
Akron524c5432021-08-05 14:14:27 +0200565 if DEBUG {
566 fmt.Println("Set", t1, "to nontoken")
567 }
Akron83e75a22021-08-04 13:14:06 +0200568 }
569
Akron84d68e62021-08-04 17:06:52 +0200570 // Mark the state as being the target of a tokenend transition
571 if tok.transitions[s][a].tokenend {
572 dat.setTokenEnd(t1, true)
Akron524c5432021-08-05 14:14:27 +0200573 if DEBUG {
574 fmt.Println("Set", t1, "to tokenend")
575 }
Akron84d68e62021-08-04 17:06:52 +0200576 }
577
Akron740f3d72021-08-03 12:12:34 +0200578 // Check for representative states
Akron439f4ec2021-08-09 15:45:38 +0200579 r := stateAlreadyInTable(s1, table, size)
Akron740f3d72021-08-03 12:12:34 +0200580
Akron439f4ec2021-08-09 15:45:38 +0200581 // No representative found
Akron8ef408b2021-08-02 22:11:04 +0200582 if r == 0 {
Akron740f3d72021-08-03 12:12:34 +0200583 // Remember the mapping
Akron8ef408b2021-08-02 22:11:04 +0200584 table[size] = &mapping{source: s1, target: t1}
585 size++
586 } else {
Akron740f3d72021-08-03 12:12:34 +0200587 // Overwrite with the representative state
Akron3fdfec62021-08-04 11:40:10 +0200588 dat.setBase(t1, r)
589 dat.setSeparate(t1, true)
Akron8ef408b2021-08-02 22:11:04 +0200590 }
591 } else {
Akron740f3d72021-08-03 12:12:34 +0200592 // Store a final transition
Akron3fdfec62021-08-04 11:40:10 +0200593 dat.setCheck(dat.getBase(t)+uint32(dat.final), t)
Akron8ef408b2021-08-02 22:11:04 +0200594 }
595 }
596 }
597
598 // Following Mizobuchi et al (2000) the size of the
599 // FSA should be stored in check(1).
Akron3fdfec62021-08-04 11:40:10 +0200600 dat.setSize(dat.maxSize + 1)
Akronf2120ca2021-08-03 16:26:41 +0200601 dat.array = dat.array[:dat.maxSize+1]
602 return dat
Akron8ef408b2021-08-02 22:11:04 +0200603}
604
Akron8ef408b2021-08-02 22:11:04 +0200605// Check the table if a mapping of s
Akron740f3d72021-08-03 12:12:34 +0200606// exists and return this as a representative.
607// Currently iterates through the whole table
608// in a bruteforce manner.
Akron439f4ec2021-08-09 15:45:38 +0200609func stateAlreadyInTable(s int, table []*mapping, size int) uint32 {
Akron8ef408b2021-08-02 22:11:04 +0200610 for x := 0; x < size; x++ {
611 if table[x].source == s {
612 return table[x].target
613 }
614 }
615 return 0
616}
617
Akron64ffd9a2021-08-03 19:55:21 +0200618// Resize double array when necessary
619func (dat *DaTokenizer) resize(l int) {
620 // TODO:
621 // This is a bit too aggressive atm and should be calmed down.
622 if len(dat.array) <= l {
Akron3fdfec62021-08-04 11:40:10 +0200623 dat.array = append(dat.array, make([]uint32, l)...)
Akron8ef408b2021-08-02 22:11:04 +0200624 }
Akron64ffd9a2021-08-03 19:55:21 +0200625}
Akronc9d84a62021-08-03 15:56:03 +0200626
Akron64ffd9a2021-08-03 19:55:21 +0200627// Set base value in double array
Akron3fdfec62021-08-04 11:40:10 +0200628func (dat *DaTokenizer) setBase(p uint32, v uint32) {
629 l := int(p*2 + 1)
Akron64ffd9a2021-08-03 19:55:21 +0200630 if dat.maxSize < l {
Akron439f4ec2021-08-09 15:45:38 +0200631 dat.resize(l)
Akron64ffd9a2021-08-03 19:55:21 +0200632 dat.maxSize = l
633 }
Akron439f4ec2021-08-09 15:45:38 +0200634 dat.array[l-1] = v
635}
636
637// Get base value in double array
638func (dat *DaTokenizer) getBase(p uint32) uint32 {
639 if int(p*2) > dat.maxSize {
640 return 0
641 }
642 return dat.array[p*2] & RESTBIT
643}
644
645// Set check value in double array
646func (dat *DaTokenizer) setCheck(p uint32, v uint32) {
647 l := int(p*2 + 1)
648 if dat.maxSize < l {
649 dat.resize(l)
650 dat.maxSize = l
651 }
652 dat.array[l] = v
653}
654
655// Get check value in double array
656func (dat *DaTokenizer) getCheck(p uint32) uint32 {
657 if int((p*2)+1) > dat.maxSize {
658 return 0
659 }
660 return dat.array[(p*2)+1] & RESTBIT
Akron64ffd9a2021-08-03 19:55:21 +0200661}
662
Akron3fdfec62021-08-04 11:40:10 +0200663// Returns true if a state is separate pointing to a representative
664func (dat *DaTokenizer) isSeparate(p uint32) bool {
Akron03c92fe2021-08-09 14:07:57 +0200665 return dat.array[p*2]&FIRSTBIT != 0
Akron3fdfec62021-08-04 11:40:10 +0200666}
667
668// Mark a state as separate pointing to a representative
669func (dat *DaTokenizer) setSeparate(p uint32, sep bool) {
670 if sep {
Akron03c92fe2021-08-09 14:07:57 +0200671 dat.array[p*2] |= FIRSTBIT
Akron3fdfec62021-08-04 11:40:10 +0200672 } else {
Akron03c92fe2021-08-09 14:07:57 +0200673 dat.array[p*2] &= (RESTBIT | SECONDBIT)
Akron3fdfec62021-08-04 11:40:10 +0200674 }
675}
676
Akron83e75a22021-08-04 13:14:06 +0200677// Returns true if a state is the target of a nontoken transition
678func (dat *DaTokenizer) isNonToken(p uint32) bool {
Akron03c92fe2021-08-09 14:07:57 +0200679 return dat.array[p*2+1]&FIRSTBIT != 0
Akron83e75a22021-08-04 13:14:06 +0200680}
681
682// Mark a state as being the target of a nontoken transition
683func (dat *DaTokenizer) setNonToken(p uint32, sep bool) {
684 if sep {
Akron03c92fe2021-08-09 14:07:57 +0200685 dat.array[p*2+1] |= FIRSTBIT
Akron83e75a22021-08-04 13:14:06 +0200686 } else {
Akron03c92fe2021-08-09 14:07:57 +0200687 dat.array[p*2+1] &= (RESTBIT | SECONDBIT)
Akron83e75a22021-08-04 13:14:06 +0200688 }
689}
690
Akron84d68e62021-08-04 17:06:52 +0200691// Returns true if a state is the target of a tokenend transition
692func (dat *DaTokenizer) isTokenEnd(p uint32) bool {
Akron03c92fe2021-08-09 14:07:57 +0200693 return dat.array[p*2+1]&SECONDBIT != 0
Akron84d68e62021-08-04 17:06:52 +0200694}
695
696// Mark a state as being the target of a tokenend transition
697func (dat *DaTokenizer) setTokenEnd(p uint32, sep bool) {
698 if sep {
Akron03c92fe2021-08-09 14:07:57 +0200699 dat.array[p*2+1] |= SECONDBIT
Akron84d68e62021-08-04 17:06:52 +0200700 } else {
Akron03c92fe2021-08-09 14:07:57 +0200701 dat.array[p*2+1] &= (RESTBIT | FIRSTBIT)
Akron84d68e62021-08-04 17:06:52 +0200702 }
703}
704
Akron64ffd9a2021-08-03 19:55:21 +0200705// Set size of double array
Akron3fdfec62021-08-04 11:40:10 +0200706func (dat *DaTokenizer) setSize(v int) {
707 dat.setCheck(1, uint32(v))
Akron64ffd9a2021-08-03 19:55:21 +0200708}
709
710// Get size of double array
Akron3fdfec62021-08-04 11:40:10 +0200711func (dat *DaTokenizer) GetSize() int {
712 return int(dat.getCheck(1))
Akron8ef408b2021-08-02 22:11:04 +0200713}
714
715// Based on Mizobuchi et al (2000), p. 124
716// This iterates for every state through the complete double array
717// structure until it finds a gap that fits all outgoing transitions
718// of the state. This is extremely slow, but is only necessary in the
719// construction phase of the tokenizer.
Akron3fdfec62021-08-04 11:40:10 +0200720func (dat *DaTokenizer) xCheck(symbols []int) uint32 {
Akron740f3d72021-08-03 12:12:34 +0200721
722 // Start at the first entry of the double array list
Akron3f8571a2021-08-05 11:18:10 +0200723 base := uint32(1) // dat.lastFilledBase
724 // skip := false
Akron8ef408b2021-08-02 22:11:04 +0200725OVERLAP:
Akron740f3d72021-08-03 12:12:34 +0200726
Akron3f8571a2021-08-05 11:18:10 +0200727 /*
728 if !skip {
729 if dat.getCheck(base) != 0 {
730 dat.lastFilledBase = base
731 } else {
732 skip = true
733 }
734 }
735 */
736
Akron740f3d72021-08-03 12:12:34 +0200737 // Resize the array if necessary
Akron3fdfec62021-08-04 11:40:10 +0200738 dat.resize((int(base) + dat.final) * 2)
Akron8ef408b2021-08-02 22:11:04 +0200739 for _, a := range symbols {
Akron3fdfec62021-08-04 11:40:10 +0200740 if dat.getCheck(base+uint32(a)) != 0 {
Akron8ef408b2021-08-02 22:11:04 +0200741 base++
742 goto OVERLAP
743 }
744 }
Akron8ef408b2021-08-02 22:11:04 +0200745 return base
746}
747
Akron3610f102021-08-08 14:13:25 +0200748// List all outgoing transitions for a state
749// for testing purposes
750func (dat *DaTokenizer) outgoing(t uint32) []int {
751
752 valid := make([]int, 0, len(dat.sigma))
753
754 for _, a := range dat.sigma {
755 t1 := dat.getBase(t) + uint32(a)
756 if t1 <= dat.getCheck(1) && dat.getCheck(t1) == t {
757 valid = append(valid, a)
758 }
759 }
760
761 for _, a := range []int{dat.epsilon, dat.unknown, dat.identity, dat.final} {
762 t1 := dat.getBase(t) + uint32(a)
763 if t1 <= dat.getCheck(1) && dat.getCheck(t1) == t {
764 valid = append(valid, -1*a)
765 }
766 }
767
768 sort.Ints(valid)
769
770 return valid
771}
772
Akron03a3c612021-08-04 11:51:27 +0200773// LoadFactor as defined in Kanda et al (2018),
774// i.e. the proportion of non-empty elements to all elements.
775func (dat *DaTokenizer) LoadFactor() float64 {
Akrond66a9262021-08-03 17:09:09 +0200776
Akron03a3c612021-08-04 11:51:27 +0200777 // Cache the loadfactor
Akron3f8571a2021-08-05 11:18:10 +0200778 if dat.loadFactor > 0 {
Akron03a3c612021-08-04 11:51:27 +0200779 return dat.loadFactor
Akron773b1ef2021-08-03 17:37:20 +0200780 }
Akrond66a9262021-08-03 17:09:09 +0200781 nonEmpty := 0
782 all := len(dat.array) / 2
783 for x := 1; x <= len(dat.array); x = x + 2 {
784 if dat.array[x] != 0 {
785 nonEmpty++
786 }
787 }
Akron03a3c612021-08-04 11:51:27 +0200788 dat.loadFactor = float64(nonEmpty) / float64(all) * 100
789 return dat.loadFactor
Akrond66a9262021-08-03 17:09:09 +0200790}
791
Akron439f4ec2021-08-09 15:45:38 +0200792// Save stores the double array data in a file
Akron3a063ef2021-08-05 19:36:35 +0200793func (dat *DaTokenizer) Save(file string) (n int64, err error) {
794 f, err := os.Create(file)
795 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200796 log.Println(err)
Akron439f4ec2021-08-09 15:45:38 +0200797 return 0, err
Akron3a063ef2021-08-05 19:36:35 +0200798 }
799 defer f.Close()
800 gz := gzip.NewWriter(f)
801 defer gz.Close()
802 n, err = dat.WriteTo(gz)
803 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200804 log.Println(err)
Akron3a063ef2021-08-05 19:36:35 +0200805 return n, err
806 }
807 gz.Flush()
808 return n, nil
809}
810
811// WriteTo stores the double array data in an io.Writer.
Akron6247a5d2021-08-03 19:18:28 +0200812func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
813
Akron3a063ef2021-08-05 19:36:35 +0200814 wb := bufio.NewWriter(w)
815 defer wb.Flush()
816
Akron6247a5d2021-08-03 19:18:28 +0200817 // Store magical header
Akron3a063ef2021-08-05 19:36:35 +0200818 all, err := wb.Write([]byte(MAGIC))
Akron6247a5d2021-08-03 19:18:28 +0200819 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200820 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200821 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200822 }
823
824 // Get sigma as a list
825 sigmalist := make([]rune, len(dat.sigma)+16)
826 max := 0
827 for sym, num := range dat.sigma {
828 sigmalist[num] = sym
829 if num > max {
830 max = num
831 }
832 }
833
834 sigmalist = sigmalist[:max+1]
835
Akron3f8571a2021-08-05 11:18:10 +0200836 buf := make([]byte, 0, 16)
Akron6247a5d2021-08-03 19:18:28 +0200837 bo.PutUint16(buf[0:2], VERSION)
Akronc17f1ca2021-08-03 19:47:27 +0200838 bo.PutUint16(buf[2:4], uint16(dat.epsilon))
839 bo.PutUint16(buf[4:6], uint16(dat.unknown))
840 bo.PutUint16(buf[6:8], uint16(dat.identity))
841 bo.PutUint16(buf[8:10], uint16(dat.final))
Akron6247a5d2021-08-03 19:18:28 +0200842 bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
Akron3f8571a2021-08-05 11:18:10 +0200843 bo.PutUint32(buf[12:16], uint32(len(dat.array)))
Akron3a063ef2021-08-05 19:36:35 +0200844 more, err := wb.Write(buf[0:16])
Akron6247a5d2021-08-03 19:18:28 +0200845 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200846 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200847 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200848 }
849
850 all += more
851
Akron6247a5d2021-08-03 19:18:28 +0200852 // Write sigma
853 for _, sym := range sigmalist {
Akron3f8571a2021-08-05 11:18:10 +0200854
Akron3a063ef2021-08-05 19:36:35 +0200855 more, err = wb.WriteRune(sym)
Akron6247a5d2021-08-03 19:18:28 +0200856 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200857 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200858 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200859 }
860 all += more
861 }
Akron439f4ec2021-08-09 15:45:38 +0200862
Akron6247a5d2021-08-03 19:18:28 +0200863 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200864 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200865 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200866 }
Akron6247a5d2021-08-03 19:18:28 +0200867
868 // Test marker - could be checksum
Akron3a063ef2021-08-05 19:36:35 +0200869 more, err = wb.Write([]byte("T"))
Akron6247a5d2021-08-03 19:18:28 +0200870 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200871 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200872 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200873 }
874 all += more
875
Akron3a063ef2021-08-05 19:36:35 +0200876 for x := 0; x < len(dat.array); x++ {
877 // for _, d := range dat.array {
878 bo.PutUint32(buf[0:4], dat.array[x])
879 more, err := wb.Write(buf[0:4])
Akron6247a5d2021-08-03 19:18:28 +0200880 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200881 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200882 return int64(all), err
Akron6247a5d2021-08-03 19:18:28 +0200883 }
Akron439f4ec2021-08-09 15:45:38 +0200884 all += more
Akron3a063ef2021-08-05 19:36:35 +0200885 if more != 4 {
Akron527c10c2021-08-13 01:45:18 +0200886 log.Println("Can not write uint32")
Akron3a063ef2021-08-05 19:36:35 +0200887 return int64(all), err
888 }
Akron6247a5d2021-08-03 19:18:28 +0200889 }
890
891 return int64(all), err
892}
893
Akron439f4ec2021-08-09 15:45:38 +0200894// LoadDatokFile reads a double array represented tokenizer
895// from a file.
Akron3f8571a2021-08-05 11:18:10 +0200896func LoadDatokFile(file string) *DaTokenizer {
897 f, err := os.Open(file)
898 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200899 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200900 return nil
Akron3f8571a2021-08-05 11:18:10 +0200901 }
902 defer f.Close()
903
904 gz, err := gzip.NewReader(f)
905 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200906 log.Println(err)
Akron4db3ecf2021-08-11 18:49:03 +0200907 return nil
Akron3f8571a2021-08-05 11:18:10 +0200908 }
909 defer gz.Close()
910
Akron3a063ef2021-08-05 19:36:35 +0200911 // Todo: Read the whole file!
Akron3f8571a2021-08-05 11:18:10 +0200912 return ParseDatok(gz)
913}
914
Akron439f4ec2021-08-09 15:45:38 +0200915// LoadDatokFile reads a double array represented tokenizer
916// from an io.Reader
Akron3f8571a2021-08-05 11:18:10 +0200917func ParseDatok(ior io.Reader) *DaTokenizer {
918
Akron439f4ec2021-08-09 15:45:38 +0200919 // Initialize tokenizer with default values
Akron3f8571a2021-08-05 11:18:10 +0200920 dat := &DaTokenizer{
921 sigma: make(map[rune]int),
922 epsilon: 0,
923 unknown: 0,
924 identity: 0,
925 final: 0,
926 loadFactor: 0,
927 }
928
929 r := bufio.NewReader(ior)
930
Akron3f8571a2021-08-05 11:18:10 +0200931 buf := make([]byte, 1024)
932 buf = buf[0:len(MAGIC)]
933
Akron439f4ec2021-08-09 15:45:38 +0200934 _, err := r.Read(buf)
Akron3f8571a2021-08-05 11:18:10 +0200935
936 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200937 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200938 return nil
939 }
940
Akron3f8571a2021-08-05 11:18:10 +0200941 if string(MAGIC) != string(buf) {
Akron527c10c2021-08-13 01:45:18 +0200942 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200943 return nil
944 }
945
Akron439f4ec2021-08-09 15:45:38 +0200946 more, err := io.ReadFull(r, buf[0:16])
Akron3f8571a2021-08-05 11:18:10 +0200947 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200948 log.Println(err)
Akron3f8571a2021-08-05 11:18:10 +0200949 return nil
950 }
951
Akron439f4ec2021-08-09 15:45:38 +0200952 if more != 16 {
Akron527c10c2021-08-13 01:45:18 +0200953 log.Println("Read bytes do not fit")
Akron439f4ec2021-08-09 15:45:38 +0200954 return nil
955 }
Akron3f8571a2021-08-05 11:18:10 +0200956
Akron3a063ef2021-08-05 19:36:35 +0200957 version := bo.Uint16(buf[0:2])
958
959 if version != VERSION {
Akron527c10c2021-08-13 01:45:18 +0200960 log.Println("Version not compatible")
Akron3a063ef2021-08-05 19:36:35 +0200961 return nil
962 }
963
Akron3f8571a2021-08-05 11:18:10 +0200964 dat.epsilon = int(bo.Uint16(buf[2:4]))
965 dat.unknown = int(bo.Uint16(buf[4:6]))
966 dat.identity = int(bo.Uint16(buf[6:8]))
967 dat.final = int(bo.Uint16(buf[8:10]))
968
969 sigmaCount := int(bo.Uint16(buf[10:12]))
970 arraySize := int(bo.Uint32(buf[12:16]))
971
Akron3a063ef2021-08-05 19:36:35 +0200972 // Shouldn't be relevant though
973 dat.maxSize = arraySize - 1
974
Akrone61380b2021-08-16 10:10:46 +0200975 // dat.sigmaList = make([]rune, sigmaCount)
976
Akron3f8571a2021-08-05 11:18:10 +0200977 for x := 0; x < sigmaCount; x++ {
Akron439f4ec2021-08-09 15:45:38 +0200978 sym, _, err := r.ReadRune()
Akron3f8571a2021-08-05 11:18:10 +0200979 if err == nil && sym != 0 {
980 dat.sigma[sym] = x
981 }
Akrone61380b2021-08-16 10:10:46 +0200982 /*
983 if err == nil {
984 dat.sigmaList[x] = sym
985 }
986 */
Akron3f8571a2021-08-05 11:18:10 +0200987 }
988
Akron439f4ec2021-08-09 15:45:38 +0200989 _, err = io.ReadFull(r, buf[0:1])
Akron3f8571a2021-08-05 11:18:10 +0200990
991 if err != nil {
Akron527c10c2021-08-13 01:45:18 +0200992 log.Print(err)
Akron3f8571a2021-08-05 11:18:10 +0200993 return nil
994 }
995
Akron3f8571a2021-08-05 11:18:10 +0200996 if string("T") != string(buf[0:1]) {
Akron527c10c2021-08-13 01:45:18 +0200997 log.Println("Not a datok file")
Akron3f8571a2021-08-05 11:18:10 +0200998 return nil
999 }
1000
1001 // Read based on length
1002 dat.array = make([]uint32, arraySize)
1003
Akronbb4aac52021-08-13 00:52:27 +02001004 dataArray, err := io.ReadAll(r)
Akron439f4ec2021-08-09 15:45:38 +02001005
Akronbb4aac52021-08-13 00:52:27 +02001006 if err == io.EOF {
Akron527c10c2021-08-13 01:45:18 +02001007 log.Println(err)
Akronbb4aac52021-08-13 00:52:27 +02001008 return nil
1009 }
1010
1011 if len(dataArray) < arraySize*4 {
Akron527c10c2021-08-13 01:45:18 +02001012 log.Println("Not enough bytes read")
Akronbb4aac52021-08-13 00:52:27 +02001013 return nil
1014 }
1015
1016 for x := 0; x < arraySize; x++ {
1017 dat.array[x] = bo.Uint32(dataArray[x*4 : (x*4)+4])
Akron3f8571a2021-08-05 11:18:10 +02001018 }
1019
1020 return dat
1021}
1022
Akron439f4ec2021-08-09 15:45:38 +02001023// Show the current state of the buffer,
1024// for testing puroses
Akron3610f102021-08-08 14:13:25 +02001025func showBuffer(buffer []rune, buffo int, buffi int) string {
1026 out := make([]rune, 0, 1024)
1027 for x := 0; x < len(buffer); x++ {
1028 if buffi == x {
1029 out = append(out, '^')
1030 }
1031 if buffo == x {
1032 out = append(out, '[', buffer[x], ']')
1033 } else {
1034 out = append(out, buffer[x])
1035 }
1036 }
1037 return string(out)
1038}
1039
Akrone61380b2021-08-16 10:10:46 +02001040/*
1041func (dat *DaTokenizer) LookupSigma(r rune) (int, bool) {
1042 for i, l := range dat.sigmaList {
1043 if l == r {
1044 return i, true
1045 } else if l > r {
1046 return 0, false
1047 }
1048 }
1049 return 0, false
1050}
1051*/
1052
Akron84d68e62021-08-04 17:06:52 +02001053// Transduce an input string against the double array
Akron3610f102021-08-08 14:13:25 +02001054// FSA. The rules are always greedy. If the automaton fails,
1055// it takes the last possible token ending branch.
Akron068874c2021-08-04 15:19:56 +02001056//
Akron4db3ecf2021-08-11 18:49:03 +02001057// Based on Mizobuchi et al (2000), p. 129,
1058// with additional support for IDENTITY, UNKNOWN
1059// and EPSILON transitions and NONTOKEN and TOKENEND handling.
Akron3f8571a2021-08-05 11:18:10 +02001060func (dat *DaTokenizer) Transduce(r io.Reader, w io.Writer) bool {
Akron068874c2021-08-04 15:19:56 +02001061 var a int
Akronb4bbb472021-08-09 11:49:38 +02001062 var t0 uint32
Akronb7e1f132021-08-10 11:52:31 +02001063 t := uint32(1) // Initial state
1064 var ok, rewindBuffer bool
Akron068874c2021-08-04 15:19:56 +02001065
Akron3610f102021-08-08 14:13:25 +02001066 // Remember the last position of a possible tokenend,
1067 // in case the automaton fails.
1068 epsilonState := uint32(0)
1069 epsilonOffset := 0
1070
1071 // Implement a low level buffer for full control,
1072 // however - it is probably better to introduce
1073 // this on a higher level with a io.Reader interface
1074 // The buffer stores a single word and may have white
1075 // space at the end (but not at the beginning).
1076 //
1077 // This is the only backtracking requirement because of
1078 // epsilon transitions, to support tokenizations like:
1079 // "this is an example|.| And it works." vs
1080 // "this is an example.com| application."
Akronb7e1f132021-08-10 11:52:31 +02001081 //
1082 // TODO:
1083 // Store a translation buffer as well, so characters don't
1084 // have to be translated multiple times!
Akron3610f102021-08-08 14:13:25 +02001085 buffer := make([]rune, 1024)
1086 buffo := 0 // Buffer offset
1087 buffi := 0 // Buffer length
1088
Akron3f8571a2021-08-05 11:18:10 +02001089 reader := bufio.NewReader(r)
1090 writer := bufio.NewWriter(w)
1091 defer writer.Flush()
Akron068874c2021-08-04 15:19:56 +02001092
Akron3f8571a2021-08-05 11:18:10 +02001093 var char rune
1094 var err error
1095 eof := false
Akronb7e1f132021-08-10 11:52:31 +02001096 newchar := true
Akron3f8571a2021-08-05 11:18:10 +02001097
Akronc5d8d432021-08-10 16:48:44 +02001098PARSECHAR:
Akron3f8571a2021-08-05 11:18:10 +02001099 for {
1100
Akronb7e1f132021-08-10 11:52:31 +02001101 if newchar {
1102 // Get from reader if buffer is empty
1103 if buffo >= buffi {
Akron1594cb82021-08-11 11:14:56 +02001104 if eof {
1105 break
1106 }
Akronb7e1f132021-08-10 11:52:31 +02001107 char, _, err = reader.ReadRune()
Akron439f4ec2021-08-09 15:45:38 +02001108
Akronb7e1f132021-08-10 11:52:31 +02001109 // No more runes to read
1110 if err != nil {
1111 eof = true
1112 break
1113 }
1114 buffer[buffi] = char
1115 buffi++
Akron3f8571a2021-08-05 11:18:10 +02001116 }
Akronb7e1f132021-08-10 11:52:31 +02001117
1118 char = buffer[buffo]
1119
1120 if DEBUG {
1121 fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
1122 }
1123
1124 // TODO: Better not repeatedly check for a!
1125 a, ok = dat.sigma[char]
Akrone61380b2021-08-16 10:10:46 +02001126 // a, ok = dat.LookupSigma(char)
Akronb7e1f132021-08-10 11:52:31 +02001127
1128 // Use identity symbol if character is not in sigma
1129 if !ok && dat.identity != -1 {
1130 a = dat.identity
1131 }
1132
1133 t0 = t
1134
1135 // Check for epsilon transitions and remember
1136 if dat.getCheck(dat.getBase(t0)+uint32(dat.epsilon)) == t0 {
1137 // Remember state for backtracking to last tokenend state
1138 epsilonState = t0
1139 epsilonOffset = buffo
1140 }
Akron3f8571a2021-08-05 11:18:10 +02001141 }
Akron3610f102021-08-08 14:13:25 +02001142
Akronb7e1f132021-08-10 11:52:31 +02001143 // Checks a transition based on t0, a and buffo
Akronb4bbb472021-08-09 11:49:38 +02001144 t = dat.getBase(t0) + uint32(a)
Akron068874c2021-08-04 15:19:56 +02001145
Akron524c5432021-08-05 14:14:27 +02001146 if DEBUG {
Akronb7e1f132021-08-10 11:52:31 +02001147 // Char is only relevant if set
1148 fmt.Println("Check", t0, "-", a, "(", string(char), ")", "->", t)
1149 if false {
1150 fmt.Println(dat.outgoing(t0))
1151 }
Akron524c5432021-08-05 14:14:27 +02001152 }
1153
Akronb7e1f132021-08-10 11:52:31 +02001154 // Check if the transition is invalid according to the double array
Akronb4bbb472021-08-09 11:49:38 +02001155 if t > dat.getCheck(1) || dat.getCheck(t) != t0 {
Akron068874c2021-08-04 15:19:56 +02001156
1157 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001158 fmt.Println("Match is not fine!", t, "and", dat.getCheck(t), "vs", t0)
Akron068874c2021-08-04 15:19:56 +02001159 }
1160
1161 if !ok && a == dat.identity {
Akronb4bbb472021-08-09 11:49:38 +02001162
Akron068874c2021-08-04 15:19:56 +02001163 // Try again with unknown symbol, in case identity failed
Akronb7e1f132021-08-10 11:52:31 +02001164 // Char is only relevant when set
Akron068874c2021-08-04 15:19:56 +02001165 if DEBUG {
Akron3f8571a2021-08-05 11:18:10 +02001166 fmt.Println("UNKNOWN symbol", string(char), "->", dat.unknown)
Akron068874c2021-08-04 15:19:56 +02001167 }
1168 a = dat.unknown
1169
1170 } else if a != dat.epsilon {
Akronb4bbb472021-08-09 11:49:38 +02001171
Akron068874c2021-08-04 15:19:56 +02001172 // Try again with epsilon symbol, in case everything else failed
Akronb4bbb472021-08-09 11:49:38 +02001173 t0 = epsilonState
Akron3610f102021-08-08 14:13:25 +02001174 epsilonState = 0 // reset
1175 buffo = epsilonOffset
Akron439f4ec2021-08-09 15:45:38 +02001176 a = dat.epsilon
1177
Akron3610f102021-08-08 14:13:25 +02001178 if DEBUG {
1179 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
1180 }
Akronb4bbb472021-08-09 11:49:38 +02001181
Akron068874c2021-08-04 15:19:56 +02001182 } else {
1183 break
1184 }
Akron068874c2021-08-04 15:19:56 +02001185
Akronb7e1f132021-08-10 11:52:31 +02001186 newchar = false
1187 continue
Akronb4bbb472021-08-09 11:49:38 +02001188 }
1189
Akronb7e1f132021-08-10 11:52:31 +02001190 // Transition was successful
1191 rewindBuffer = false
Akron439f4ec2021-08-09 15:45:38 +02001192
1193 // Transition consumes a character
Akronb4bbb472021-08-09 11:49:38 +02001194 if a != dat.epsilon {
1195
Akron3610f102021-08-08 14:13:25 +02001196 buffo++
Akronb4bbb472021-08-09 11:49:38 +02001197
Akron439f4ec2021-08-09 15:45:38 +02001198 // Transition does not produce a character
Akronc5d8d432021-08-10 16:48:44 +02001199 if buffo == 1 && dat.isNonToken(t) {
Akron3610f102021-08-08 14:13:25 +02001200 if DEBUG {
1201 fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
1202 }
Akron439f4ec2021-08-09 15:45:38 +02001203 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001204 }
Akron3f8571a2021-08-05 11:18:10 +02001205 }
Akron068874c2021-08-04 15:19:56 +02001206
Akronc5d8d432021-08-10 16:48:44 +02001207 // Transition marks the end of a token - so flush the buffer
Akronb7e1f132021-08-10 11:52:31 +02001208 if dat.isTokenEnd(t) {
Akron524c5432021-08-05 14:14:27 +02001209
Akronc5d8d432021-08-10 16:48:44 +02001210 if buffi > 0 {
Akronc5d8d432021-08-10 16:48:44 +02001211 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +02001212 fmt.Println("-> Flush buffer: [", string(buffer[:buffo]), "]", showBuffer(buffer, buffo, buffi))
Akronc5d8d432021-08-10 16:48:44 +02001213 }
Akron01912fc2021-08-12 11:41:58 +02001214 writer.WriteString(string(buffer[:buffo]))
Akronc5d8d432021-08-10 16:48:44 +02001215 rewindBuffer = true
Akron3610f102021-08-08 14:13:25 +02001216 }
Akron1594cb82021-08-11 11:14:56 +02001217 if DEBUG {
1218 fmt.Println("-> Newline")
1219 }
1220 writer.WriteRune('\n')
Akron439f4ec2021-08-09 15:45:38 +02001221 }
Akron3610f102021-08-08 14:13:25 +02001222
Akronc5d8d432021-08-10 16:48:44 +02001223 // Rewind the buffer if necessary
Akron439f4ec2021-08-09 15:45:38 +02001224 if rewindBuffer {
1225
1226 // TODO: Better as a ring buffer
Akron3610f102021-08-08 14:13:25 +02001227 for x, i := range buffer[buffo:buffi] {
1228 buffer[x] = i
1229 }
Akronb4bbb472021-08-09 11:49:38 +02001230
Akron3610f102021-08-08 14:13:25 +02001231 buffi -= buffo
1232 epsilonOffset -= buffo
1233 buffo = 0
1234 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001235 fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
Akron3610f102021-08-08 14:13:25 +02001236 }
Akron84d68e62021-08-04 17:06:52 +02001237 }
1238
Akronb7e1f132021-08-10 11:52:31 +02001239 // Move to representative state
1240 if dat.isSeparate(t) {
1241 t = dat.getBase(t)
1242
1243 if DEBUG {
1244 fmt.Println("Representative pointing to", t)
1245 }
1246 }
1247
Akronc5d8d432021-08-10 16:48:44 +02001248 newchar = true
1249
Akron068874c2021-08-04 15:19:56 +02001250 // TODO:
1251 // Prevent endless epsilon loops!
1252 }
1253
Akron439f4ec2021-08-09 15:45:38 +02001254 // Input reader is not yet finished
Akron3f8571a2021-08-05 11:18:10 +02001255 if !eof {
Akron068874c2021-08-04 15:19:56 +02001256 if DEBUG {
Akronb4bbb472021-08-09 11:49:38 +02001257 fmt.Println("Not at the end - problem", t0, ":", dat.outgoing(t0))
Akron068874c2021-08-04 15:19:56 +02001258 }
1259 return false
1260 }
1261
Akronb7e1f132021-08-10 11:52:31 +02001262 if DEBUG {
1263 fmt.Println("Entering final check")
1264 }
1265
Akronc5d8d432021-08-10 16:48:44 +02001266 // Automaton is in a final state, so flush the buffer and return
Akron068874c2021-08-04 15:19:56 +02001267 if dat.getCheck(dat.getBase(t)+uint32(dat.final)) == t {
Akronb4bbb472021-08-09 11:49:38 +02001268
1269 if buffi > 0 {
Akronb4bbb472021-08-09 11:49:38 +02001270 if DEBUG {
Akron01912fc2021-08-12 11:41:58 +02001271 fmt.Println("-> Flush buffer: [", string(buffer[:buffi]), "]")
Akron3f8571a2021-08-05 11:18:10 +02001272 }
Akron01912fc2021-08-12 11:41:58 +02001273 writer.WriteString(string(buffer[:buffi]))
Akron6e70dc82021-08-11 11:33:18 +02001274
Akrondf0a3ef2021-08-09 15:53:45 +02001275 if dat.isTokenEnd(t) {
1276 writer.WriteRune('\n')
Akronc5d8d432021-08-10 16:48:44 +02001277 if DEBUG {
1278 fmt.Println("-> Newline")
1279 }
Akrondf0a3ef2021-08-09 15:53:45 +02001280 }
Akron84d68e62021-08-04 17:06:52 +02001281 }
1282
Akron6e70dc82021-08-11 11:33:18 +02001283 // Add an additional sentence ending, if the file is over but no explicit
1284 // sentence split was reached. This may be controversial and therefore
1285 // optional via parameter.
1286 if !dat.isTokenEnd(t0) {
1287 writer.WriteRune('\n')
1288 if DEBUG {
1289 fmt.Println("-> Newline")
1290 }
1291 }
1292
Akrone61380b2021-08-16 10:10:46 +02001293 // TODO:
1294 // There may be a new line at the end, from an epsilon,
1295 // so we may need to go on!
Akron068874c2021-08-04 15:19:56 +02001296 return true
1297 }
1298
Akronc5d8d432021-08-10 16:48:44 +02001299 // Check epsilon transitions until a final state is reached
1300 t0 = t
1301 t = dat.getBase(t0) + uint32(dat.epsilon)
Akron01912fc2021-08-12 11:41:58 +02001302 a = dat.epsilon
1303 newchar = false
Akronc5d8d432021-08-10 16:48:44 +02001304 if dat.getCheck(t) == t0 {
1305 // Remember state for backtracking to last tokenend state
Akronc5d8d432021-08-10 16:48:44 +02001306 goto PARSECHAR
Akrone61380b2021-08-16 10:10:46 +02001307
Akronc5d8d432021-08-10 16:48:44 +02001308 } else if epsilonState != 0 {
Akronb7e1f132021-08-10 11:52:31 +02001309 t0 = epsilonState
1310 epsilonState = 0 // reset
1311 buffo = epsilonOffset
Akron068874c2021-08-04 15:19:56 +02001312 if DEBUG {
Akronc5d8d432021-08-10 16:48:44 +02001313 fmt.Println("Get from epsilon stack and set buffo!", showBuffer(buffer, buffo, buffi))
Akron068874c2021-08-04 15:19:56 +02001314 }
Akronc5d8d432021-08-10 16:48:44 +02001315 goto PARSECHAR
Akron068874c2021-08-04 15:19:56 +02001316 }
Akronc5d8d432021-08-10 16:48:44 +02001317 return false
Akron068874c2021-08-04 15:19:56 +02001318}