blob: 1d46500087c87d3439e10103dba69fd3e24ee092 [file] [log] [blame]
Akron0d0daa22021-09-21 16:32:23 +02001package datok
2
3import (
4 "bufio"
5 "compress/gzip"
6 "fmt"
7 "io"
8 "log"
9 "os"
10 "strconv"
11 "strings"
12 "unicode/utf8"
13)
14
15const (
16 PROPS = 1
17 SIGMA = 2
18 STATES = 3
19 NONE = 4
20)
21
22type edge struct {
23 inSym int
24 outSym int
25 end int
26 nontoken bool
27 tokenend bool
28}
29
Akron1c34ce62021-09-23 23:27:39 +020030type Tokenizer interface {
31 Transduce(r io.Reader, w io.Writer) bool
32}
33
34// Automaton is the intermediate representation
Akron0d0daa22021-09-21 16:32:23 +020035// of the tokenizer.
Akron1c34ce62021-09-23 23:27:39 +020036type Automaton struct {
Akron0d0daa22021-09-21 16:32:23 +020037 sigmaRev map[int]rune
38 sigmaMCS map[int]string
39 arcCount int
40 sigmaCount int
Akron1c34ce62021-09-23 23:27:39 +020041 stateCount int
Akron0d0daa22021-09-21 16:32:23 +020042 transitions []map[int]*edge
43
44 // Special symbols in sigma
45 epsilon int
46 unknown int
47 identity int
48 final int
49 tokenend int
50}
51
52// ParseFoma reads the FST from a foma file
53// and creates an internal representation,
54// in case it follows the tokenizer's convention.
Akron1c34ce62021-09-23 23:27:39 +020055func LoadFomaFile(file string) *Automaton {
Akron0d0daa22021-09-21 16:32:23 +020056 f, err := os.Open(file)
57 if err != nil {
58 log.Print(err)
59 return nil
60 }
61 defer f.Close()
62
63 gz, err := gzip.NewReader(f)
64 if err != nil {
65 log.Print(err)
66 return nil
67 }
68 defer gz.Close()
69
70 return ParseFoma(gz)
71}
72
73// ParseFoma reads the FST from a foma file reader
74// and creates an internal representation,
75// in case it follows the tokenizer's convention.
Akron1c34ce62021-09-23 23:27:39 +020076func ParseFoma(ior io.Reader) *Automaton {
Akron0d0daa22021-09-21 16:32:23 +020077 r := bufio.NewReader(ior)
78
Akron1c34ce62021-09-23 23:27:39 +020079 auto := &Automaton{
Akron0d0daa22021-09-21 16:32:23 +020080 sigmaRev: make(map[int]rune),
81 sigmaMCS: make(map[int]string),
82 epsilon: -1,
83 unknown: -1,
84 identity: -1,
85 final: -1,
86 tokenend: -1,
87 }
88
89 var state, inSym, outSym, end, final int
90
91 mode := 0
92 var elem []string
93 var elemint [5]int
94
95 // Iterate over all lines of the file.
96 // This is mainly based on foma2js,
97 // licensed under the Apache License, version 2,
98 // and written by Mans Hulden.
99 for {
100 line, err := r.ReadString('\n')
101 if err != nil {
102 if err == io.EOF {
103 break
104 }
105 log.Print(err)
106 return nil
107 }
108
109 // Read parser mode for the following lines
110 if strings.HasPrefix(line, "##") {
111 if strings.HasPrefix(line, "##props##") {
112 mode = PROPS
113
114 } else if strings.HasPrefix(line, "##states##") {
115 mode = STATES
116
117 // Adds a final transition symbol to sigma
118 // written as '#' in Mizobuchi et al (2000)
Akron1c34ce62021-09-23 23:27:39 +0200119 auto.sigmaCount++
120 auto.final = auto.sigmaCount
Akron0d0daa22021-09-21 16:32:23 +0200121
122 } else if strings.HasPrefix(line, "##sigma##") {
123
124 mode = SIGMA
125
126 } else if strings.HasPrefix(line, "##end##") {
127
128 mode = NONE
129
130 } else if !strings.HasPrefix(line, "##foma-net") {
131 log.Print("Unknown input line")
132 break
133 }
134 continue
135 }
136
137 // Based on the current parser mode, interpret the lines
138 switch mode {
139 case PROPS:
140 {
141 elem = strings.Split(line, " ")
142 /*
143 fmt.Println("arity: " + elem[0])
144 fmt.Println("arccount: " + elem[1])
145 fmt.Println("statecount: " + elem[2])
146 fmt.Println("linecount: " + elem[3])
147 fmt.Println("finalcount: " + elem[4])
148 fmt.Println("pathcount: " + elem[5])
149 fmt.Println("is_deterministic: " + elem[6])
150 fmt.Println("is_pruned: " + elem[7])
151 fmt.Println("is_minimized: " + elem[8])
152 fmt.Println("is_epsilon_free: " + elem[9])
153 fmt.Println("is_loop_free: " + elem[10])
154 fmt.Println("extras: " + elem[11])
155 fmt.Println("name: " + elem[12])
156 */
157 if elem[6] != "1" {
158 log.Print("The FST needs to be deterministic")
159 return nil
160 }
161
162 if elem[9] != "1" {
163 log.Print("The FST needs to be epsilon free")
164 return nil
165 }
166
167 elemint[0], err = strconv.Atoi(elem[1])
168 if err != nil {
169 log.Print("Can't read arccount")
170 return nil
171 }
Akron1c34ce62021-09-23 23:27:39 +0200172 auto.arcCount = elemint[0]
Akron0d0daa22021-09-21 16:32:23 +0200173
174 elemint[0], err = strconv.Atoi(elem[2])
175 if err != nil {
176 log.Print("Can't read statecount")
177 return nil
178 }
179
180 // States start at 1 in Mizobuchi et al (2000),
181 // as the state 0 is associated with a fail.
182 // Initialize states and transitions
Akron1c34ce62021-09-23 23:27:39 +0200183 auto.stateCount = elemint[0]
184 auto.transitions = make([]map[int]*edge, elemint[0]+1)
Akron0d0daa22021-09-21 16:32:23 +0200185 continue
186 }
187 case STATES:
188 {
189 elem = strings.Split(line[0:len(line)-1], " ")
190 if elem[0] == "-1" {
191 if DEBUG {
192 fmt.Println("Skip", elem)
193 }
194 continue
195 }
196 elemint[0], err = strconv.Atoi(elem[0])
197 if err != nil {
198 fmt.Println("Unable to translate", elem[0])
199 break
200 }
201
202 if len(elem) > 1 {
203 elemint[1], err = strconv.Atoi(elem[1])
204 if err != nil {
205 fmt.Println("Unable to translate", elem[1])
206 break
207 }
208 if len(elem) > 2 {
209 elemint[2], err = strconv.Atoi(elem[2])
210 if err != nil {
211 fmt.Println("Unable to translate", elem[2])
212 break
213 }
214 if len(elem) > 3 {
215 elemint[3], err = strconv.Atoi(elem[3])
216 if err != nil {
217 fmt.Println("Unable to translate", elem[3])
218 break
219 }
220 if len(elem) > 4 {
221 elemint[4], err = strconv.Atoi(elem[4])
222 if err != nil {
223 fmt.Println("Unable to translate", elem[4])
224 break
225 }
226 }
227 }
228 }
229 }
230
231 switch len(elem) {
232 case 5:
233 {
234 state = elemint[0]
235 inSym = elemint[1]
236 outSym = elemint[2]
237 end = elemint[3]
238 final = elemint[4]
239 }
240 case 4:
241 {
242 if elemint[1] == -1 {
243 state = elemint[0]
244 final = elemint[3]
245
246 // Final state that has no outgoing edges
247 if final == 1 {
248
249 // Initialize outgoing states
Akron1c34ce62021-09-23 23:27:39 +0200250 if auto.transitions[state+1] == nil {
251 auto.transitions[state+1] = make(map[int]*edge)
Akron0d0daa22021-09-21 16:32:23 +0200252 }
253
254 // TODO:
255 // Maybe this is less relevant for tokenizers
Akron1c34ce62021-09-23 23:27:39 +0200256 auto.transitions[state+1][auto.final] = &edge{}
Akron0d0daa22021-09-21 16:32:23 +0200257 }
258 continue
259 } else {
260 state = elemint[0]
261 inSym = elemint[1]
262 end = elemint[2]
263 final = elemint[3]
264 outSym = inSym
265 }
266 }
267 case 3:
268 {
269 inSym = elemint[0]
270 outSym = elemint[1]
271 end = elemint[2]
272 }
273 case 2:
274 {
275 inSym = elemint[0]
276 end = elemint[1]
277 outSym = inSym
278 }
279 }
280
281 nontoken := false
282 tokenend := false
283
284 // While the states in foma start with 0, the states in the
285 // Mizobuchi FSA start with one - so we increase every state by 1.
286 // We also increase sigma by 1, so there are no 0 transitions.
287 inSym++
288 outSym++
289
290 // Only a limited list of transitions are allowed
291 if inSym != outSym {
Akron1c34ce62021-09-23 23:27:39 +0200292 if outSym == auto.tokenend && inSym == auto.epsilon {
Akron0d0daa22021-09-21 16:32:23 +0200293 tokenend = true
Akron1c34ce62021-09-23 23:27:39 +0200294 } else if outSym == auto.epsilon {
Akron0d0daa22021-09-21 16:32:23 +0200295 nontoken = true
296 } else {
297 log.Println(
298 "Unsupported transition: " +
299 strconv.Itoa(state) +
300 " -> " + strconv.Itoa(end) +
301 " (" +
302 strconv.Itoa(inSym) +
303 ":" +
304 strconv.Itoa(outSym) +
305 ") (" +
Akron1c34ce62021-09-23 23:27:39 +0200306 string(auto.sigmaRev[inSym]) +
Akron0d0daa22021-09-21 16:32:23 +0200307 ":" +
Akron1c34ce62021-09-23 23:27:39 +0200308 string(auto.sigmaRev[outSym]) +
Akron0d0daa22021-09-21 16:32:23 +0200309 ")")
310 return nil
311 }
Akron1c34ce62021-09-23 23:27:39 +0200312 } else if inSym == auto.tokenend {
Akron0d0daa22021-09-21 16:32:23 +0200313 // Ignore tokenend accepting arcs
314 continue
Akron1c34ce62021-09-23 23:27:39 +0200315 } else if inSym == auto.epsilon {
Akron0d0daa22021-09-21 16:32:23 +0200316 log.Println("General epsilon transitions are not supported")
317 return nil
Akron1c34ce62021-09-23 23:27:39 +0200318 } else if auto.sigmaMCS[inSym] != "" {
Akron0d0daa22021-09-21 16:32:23 +0200319 // log.Fatalln("Non supported character", tok.sigmaMCS[inSym])
320 // Ignore MCS transitions
321 continue
322 }
323
324 // Create an edge based on the collected information
325 targetObj := &edge{
326 inSym: inSym,
327 outSym: outSym,
328 end: end + 1,
329 tokenend: tokenend,
330 nontoken: nontoken,
331 }
332
333 // Initialize outgoing states
Akron1c34ce62021-09-23 23:27:39 +0200334 if auto.transitions[state+1] == nil {
335 auto.transitions[state+1] = make(map[int]*edge)
Akron0d0daa22021-09-21 16:32:23 +0200336 }
337
338 // Ignore transitions with invalid symbols
339 if inSym >= 0 {
Akron1c34ce62021-09-23 23:27:39 +0200340 auto.transitions[state+1][inSym] = targetObj
Akron0d0daa22021-09-21 16:32:23 +0200341 }
342
343 // Add final transition
344 if final == 1 {
345 // TODO:
346 // Maybe this is less relevant for tokenizers
Akron1c34ce62021-09-23 23:27:39 +0200347 auto.transitions[state+1][auto.final] = &edge{}
Akron0d0daa22021-09-21 16:32:23 +0200348 }
349
350 if DEBUG {
351 fmt.Println("Add",
352 state+1, "->", end+1,
353 "(",
354 inSym,
355 ":",
356 outSym,
357 ") (",
Akron1c34ce62021-09-23 23:27:39 +0200358 string(auto.sigmaRev[inSym]),
Akron0d0daa22021-09-21 16:32:23 +0200359 ":",
Akron1c34ce62021-09-23 23:27:39 +0200360 string(auto.sigmaRev[outSym]),
Akron0d0daa22021-09-21 16:32:23 +0200361 ")",
362 ";",
363 "TE:", tokenend,
364 "NT:", nontoken,
365 "FIN:", final)
366 }
367
368 continue
369 }
370 case SIGMA:
371 {
372 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
373
374 // Turn string into sigma id
375 number, err := strconv.Atoi(elem[0])
376
377 // ID needs to be > 1
378 number++
379
380 if err != nil {
381 log.Println(err)
382 return nil
383 }
384
Akron1c34ce62021-09-23 23:27:39 +0200385 auto.sigmaCount = number
Akron0d0daa22021-09-21 16:32:23 +0200386
387 var symbol rune
388
389 // Read rune
390 if utf8.RuneCountInString(elem[1]) == 1 {
391 symbol = []rune(elem[1])[0]
392
393 } else if utf8.RuneCountInString(elem[1]) > 1 {
394
395 // Probably a MCS
396 switch elem[1] {
397 case "@_EPSILON_SYMBOL_@":
398 {
Akron1c34ce62021-09-23 23:27:39 +0200399 auto.epsilon = number
Akron0d0daa22021-09-21 16:32:23 +0200400 }
401 case "@_UNKNOWN_SYMBOL_@":
402 {
Akron1c34ce62021-09-23 23:27:39 +0200403 auto.unknown = number
Akron0d0daa22021-09-21 16:32:23 +0200404 }
405
406 case "@_IDENTITY_SYMBOL_@":
407 {
Akron1c34ce62021-09-23 23:27:39 +0200408 auto.identity = number
Akron0d0daa22021-09-21 16:32:23 +0200409 }
410
411 case "@_TOKEN_SYMBOL_@":
412 {
Akron1c34ce62021-09-23 23:27:39 +0200413 auto.tokenend = number
Akron0d0daa22021-09-21 16:32:23 +0200414 }
415 default:
416 {
417 // MCS not supported
Akron1c34ce62021-09-23 23:27:39 +0200418 auto.sigmaMCS[number] = line
Akron0d0daa22021-09-21 16:32:23 +0200419 }
420 }
421 continue
422
423 } else { // Probably a new line symbol
424 line, err = r.ReadString('\n')
425 if err != nil {
426 log.Println(err)
427 return nil
428 }
429 if len(line) != 1 {
430 // MCS not supported
Akron1c34ce62021-09-23 23:27:39 +0200431 auto.sigmaMCS[number] = line
Akron0d0daa22021-09-21 16:32:23 +0200432 continue
433 }
434 symbol = rune('\n')
435 }
436
Akron1c34ce62021-09-23 23:27:39 +0200437 auto.sigmaRev[number] = symbol
Akron0d0daa22021-09-21 16:32:23 +0200438 }
439 }
440 }
Akron1c34ce62021-09-23 23:27:39 +0200441 auto.sigmaMCS = nil
442 return auto
Akron0d0daa22021-09-21 16:32:23 +0200443}
444
445// Set alphabet A to the list of all symbols
446// outgoing from s
Akron1c34ce62021-09-23 23:27:39 +0200447func (auto *Automaton) getSet(s int, A *[]int) {
448 for a := range auto.transitions[s] {
Akron0d0daa22021-09-21 16:32:23 +0200449 *A = append(*A, a)
450 }
451
452 // Not required, but simplifies bug hunting
453 // sort.Ints(*A)
454}