blob: 46f9d710c15717ac0813f242361b9fc038e585f8 [file] [log] [blame]
Akron0d0daa22021-09-21 16:32:23 +02001package datok
2
3import (
4 "bufio"
5 "compress/gzip"
6 "fmt"
7 "io"
8 "log"
9 "os"
10 "strconv"
11 "strings"
12 "unicode/utf8"
13)
14
// Parser modes for the sections of a foma file,
// switched by the "##...##" marker lines.
const (
	PROPS = iota + 1 // "##props##" section
	SIGMA            // "##sigma##" section
	STATES           // "##states##" section
	NONE             // "##end##" reached
)
21
// edge represents a single FST transition: the
// input/output symbol pair, the target state, and
// flags derived from the tokenizer's convention.
type edge struct {
	inSym    int  // input symbol (sigma id, shifted by +1 during parsing)
	outSym   int  // output symbol (sigma id, shifted by +1 during parsing)
	end      int  // target state (1-based; 0 means fail)
	nontoken bool // set when the output symbol was epsilon (no token material)
	tokenend bool // set when the edge marks the end of a token
}
29
// Tokenizer is the intermediate representation
// of the tokenizer.
type Tokenizer struct {
	sigmaRev   map[int]rune   // sigma id -> rune
	sigmaMCS   map[int]string // sigma id -> multi-char symbol line; only used while parsing, nil afterwards
	arcCount   int            // arc count as stated in the props section
	sigmaCount int            // highest sigma id assigned so far
	// transitions holds the outgoing edges per state,
	// keyed by input symbol. States are 1-based, as
	// state 0 is associated with a fail.
	transitions []map[int]*edge

	// Special symbols in sigma (-1 if not present)
	epsilon  int
	unknown  int
	identity int
	final    int // synthetic final-transition symbol ('#' in Mizobuchi et al 2000)
	tokenend int
}
46
47// ParseFoma reads the FST from a foma file
48// and creates an internal representation,
49// in case it follows the tokenizer's convention.
50func LoadFomaFile(file string) *Tokenizer {
51 f, err := os.Open(file)
52 if err != nil {
53 log.Print(err)
54 return nil
55 }
56 defer f.Close()
57
58 gz, err := gzip.NewReader(f)
59 if err != nil {
60 log.Print(err)
61 return nil
62 }
63 defer gz.Close()
64
65 return ParseFoma(gz)
66}
67
// ParseFoma reads the FST from a foma file reader
// and creates an internal representation,
// in case it follows the tokenizer's convention.
// It returns nil (after logging) on read errors, on
// non-deterministic or non-epsilon-free FSTs, and on
// unsupported transitions.
func ParseFoma(ior io.Reader) *Tokenizer {
	r := bufio.NewReader(ior)

	// All special sigma symbols start at -1, i.e. "not defined".
	tok := &Tokenizer{
		sigmaRev: make(map[int]rune),
		sigmaMCS: make(map[int]string),
		epsilon:  -1,
		unknown:  -1,
		identity: -1,
		final:    -1,
		tokenend: -1,
	}

	var state, inSym, outSym, end, final int

	// Current file section; switched by the "##...##" marker lines.
	mode := 0

	// Scratch buffers reused for every parsed line.
	var elem []string
	var elemint [5]int

	// Iterate over all lines of the file.
	// This is mainly based on foma2js,
	// licensed under the Apache License, version 2,
	// and written by Mans Hulden.
	for {
		line, err := r.ReadString('\n')
		if err != nil {
			if err == io.EOF {
				break
			}
			log.Print(err)
			return nil
		}

		// Read parser mode for the following lines
		if strings.HasPrefix(line, "##") {
			if strings.HasPrefix(line, "##props##") {
				mode = PROPS

			} else if strings.HasPrefix(line, "##states##") {
				mode = STATES

				// Adds a final transition symbol to sigma
				// written as '#' in Mizobuchi et al (2000)
				tok.sigmaCount++
				tok.final = tok.sigmaCount

			} else if strings.HasPrefix(line, "##sigma##") {

				mode = SIGMA

			} else if strings.HasPrefix(line, "##end##") {

				mode = NONE

			} else if !strings.HasPrefix(line, "##foma-net") {
				log.Print("Unknown input line")
				break
			}
			continue
		}

		// Based on the current parser mode, interpret the lines
		switch mode {
		case PROPS:
			{
				// The props section is a single line of
				// space separated values.
				elem = strings.Split(line, " ")
				/*
					fmt.Println("arity: " + elem[0])
					fmt.Println("arccount: " + elem[1])
					fmt.Println("statecount: " + elem[2])
					fmt.Println("linecount: " + elem[3])
					fmt.Println("finalcount: " + elem[4])
					fmt.Println("pathcount: " + elem[5])
					fmt.Println("is_deterministic: " + elem[6])
					fmt.Println("is_pruned: " + elem[7])
					fmt.Println("is_minimized: " + elem[8])
					fmt.Println("is_epsilon_free: " + elem[9])
					fmt.Println("is_loop_free: " + elem[10])
					fmt.Println("extras: " + elem[11])
					fmt.Println("name: " + elem[12])
				*/
				// The tokenizer requires a deterministic,
				// epsilon free FST.
				if elem[6] != "1" {
					log.Print("The FST needs to be deterministic")
					return nil
				}

				if elem[9] != "1" {
					log.Print("The FST needs to be epsilon free")
					return nil
				}

				elemint[0], err = strconv.Atoi(elem[1])
				if err != nil {
					log.Print("Can't read arccount")
					return nil
				}
				tok.arcCount = elemint[0]

				elemint[0], err = strconv.Atoi(elem[2])
				if err != nil {
					log.Print("Can't read statecount")
					return nil
				}

				// States start at 1 in Mizobuchi et al (2000),
				// as the state 0 is associated with a fail.
				// Initialize states and transitions
				tok.transitions = make([]map[int]*edge, elemint[0]+1)
				continue
			}
		case STATES:
			{
				// A state line consists of 2 to 5 space
				// separated integers (trailing newline stripped).
				elem = strings.Split(line[0:len(line)-1], " ")
				if elem[0] == "-1" {
					if DEBUG {
						fmt.Println("Skip", elem)
					}
					continue
				}
				elemint[0], err = strconv.Atoi(elem[0])
				if err != nil {
					// NOTE(review): this break (and the ones below)
					// only leaves the switch statement, so parsing
					// continues with the next line.
					fmt.Println("Unable to translate", elem[0])
					break
				}

				// Convert all present fields to integers;
				// absent fields keep their previous elemint value.
				if len(elem) > 1 {
					elemint[1], err = strconv.Atoi(elem[1])
					if err != nil {
						fmt.Println("Unable to translate", elem[1])
						break
					}
					if len(elem) > 2 {
						elemint[2], err = strconv.Atoi(elem[2])
						if err != nil {
							fmt.Println("Unable to translate", elem[2])
							break
						}
						if len(elem) > 3 {
							elemint[3], err = strconv.Atoi(elem[3])
							if err != nil {
								fmt.Println("Unable to translate", elem[3])
								break
							}
							if len(elem) > 4 {
								elemint[4], err = strconv.Atoi(elem[4])
								if err != nil {
									fmt.Println("Unable to translate", elem[4])
									break
								}
							}
						}
					}
				}

				// Interpret the fields depending on how many
				// were present. For the short forms, state (and
				// where applicable other values) carry over from
				// the previous line — presumably foma's compressed
				// format; TODO confirm against the foma text format.
				switch len(elem) {
				case 5:
					{
						state = elemint[0]
						inSym = elemint[1]
						outSym = elemint[2]
						end = elemint[3]
						final = elemint[4]
					}
				case 4:
					{
						if elemint[1] == -1 {
							state = elemint[0]
							final = elemint[3]

							// Final state that has no outgoing edges
							if final == 1 {

								// Initialize outgoing states
								if tok.transitions[state+1] == nil {
									tok.transitions[state+1] = make(map[int]*edge)
								}

								// TODO:
								// Maybe this is less relevant for tokenizers
								tok.transitions[state+1][tok.final] = &edge{}
							}
							continue
						} else {
							state = elemint[0]
							inSym = elemint[1]
							end = elemint[2]
							final = elemint[3]
							outSym = inSym
						}
					}
				case 3:
					{
						inSym = elemint[0]
						outSym = elemint[1]
						end = elemint[2]
					}
				case 2:
					{
						inSym = elemint[0]
						end = elemint[1]
						outSym = inSym
					}
				}

				nontoken := false
				tokenend := false

				// While the states in foma start with 0, the states in the
				// Mizobuchi FSA start with one - so we increase every state by 1.
				// We also increase sigma by 1, so there are no 0 transitions.
				inSym++
				outSym++

				// Only a limited list of transitions are allowed
				if inSym != outSym {
					if outSym == tok.tokenend && inSym == tok.epsilon {
						tokenend = true
					} else if outSym == tok.epsilon {
						nontoken = true
					} else {
						log.Println(
							"Unsupported transition: " +
								strconv.Itoa(state) +
								" -> " + strconv.Itoa(end) +
								" (" +
								strconv.Itoa(inSym) +
								":" +
								strconv.Itoa(outSym) +
								") (" +
								string(tok.sigmaRev[inSym]) +
								":" +
								string(tok.sigmaRev[outSym]) +
								")")
						return nil
					}
				} else if inSym == tok.tokenend {
					// Ignore tokenend accepting arcs
					continue
				} else if inSym == tok.epsilon {
					log.Println("General epsilon transitions are not supported")
					return nil
				} else if tok.sigmaMCS[inSym] != "" {
					// log.Fatalln("Non supported character", tok.sigmaMCS[inSym])
					// Ignore MCS transitions
					continue
				}

				// Create an edge based on the collected information
				targetObj := &edge{
					inSym:    inSym,
					outSym:   outSym,
					end:      end + 1,
					tokenend: tokenend,
					nontoken: nontoken,
				}

				// Initialize outgoing states
				if tok.transitions[state+1] == nil {
					tok.transitions[state+1] = make(map[int]*edge)
				}

				// Ignore transitions with invalid symbols
				if inSym >= 0 {
					tok.transitions[state+1][inSym] = targetObj
				}

				// Add final transition
				if final == 1 {
					// TODO:
					// Maybe this is less relevant for tokenizers
					tok.transitions[state+1][tok.final] = &edge{}
				}

				if DEBUG {
					fmt.Println("Add",
						state+1, "->", end+1,
						"(",
						inSym,
						":",
						outSym,
						") (",
						string(tok.sigmaRev[inSym]),
						":",
						string(tok.sigmaRev[outSym]),
						")",
						";",
						"TE:", tokenend,
						"NT:", nontoken,
						"FIN:", final)
				}

				continue
			}
		case SIGMA:
			{
				// A sigma line is "<id> <symbol>"; split once so
				// the symbol itself may contain spaces.
				elem = strings.SplitN(line[0:len(line)-1], " ", 2)

				// Turn string into sigma id
				number, err := strconv.Atoi(elem[0])

				// ID needs to be > 1
				// NOTE(review): number is incremented before err is
				// checked; harmless, as we return on error below.
				number++

				if err != nil {
					log.Println(err)
					return nil
				}

				tok.sigmaCount = number

				var symbol rune

				// Read rune
				if utf8.RuneCountInString(elem[1]) == 1 {
					symbol = []rune(elem[1])[0]

				} else if utf8.RuneCountInString(elem[1]) > 1 {

					// Probably a MCS
					switch elem[1] {
					case "@_EPSILON_SYMBOL_@":
						{
							tok.epsilon = number
						}
					case "@_UNKNOWN_SYMBOL_@":
						{
							tok.unknown = number
						}

					case "@_IDENTITY_SYMBOL_@":
						{
							tok.identity = number
						}

					case "@_TOKEN_SYMBOL_@":
						{
							tok.tokenend = number
						}
					default:
						{
							// MCS not supported
							tok.sigmaMCS[number] = line
						}
					}
					continue

				} else { // Probably a new line symbol
					// The symbol was an actual newline, so the
					// rest of it is on the following input line.
					line, err = r.ReadString('\n')
					if err != nil {
						log.Println(err)
						return nil
					}
					if len(line) != 1 {
						// MCS not supported
						tok.sigmaMCS[number] = line
						continue
					}
					symbol = rune('\n')
				}

				tok.sigmaRev[number] = symbol
			}
		}
	}
	// The MCS map is only needed during parsing.
	tok.sigmaMCS = nil
	return tok
}
438
// getSet appends to A every symbol (sigma id) that has
// an outgoing transition from state s.
func (tok *Tokenizer) getSet(s int, A *[]int) {
	for a := range tok.transitions[s] {
		*A = append(*A, a)
	}

	// Not required, but simplifies bug hunting
	// sort.Ints(*A)
448}