blob: 33128a2118bdfcd1d2aec10be949b1e95e3e5612 [file] [log] [blame]
Akron0d0daa22021-09-21 16:32:23 +02001package datok
2
3import (
4 "bufio"
5 "compress/gzip"
6 "fmt"
7 "io"
8 "log"
9 "os"
10 "strconv"
11 "strings"
12 "unicode/utf8"
13)
14
15const (
16 PROPS = 1
17 SIGMA = 2
18 STATES = 3
19 NONE = 4
20)
21
22type edge struct {
23 inSym int
24 outSym int
25 end int
26 nontoken bool
27 tokenend bool
28}
29
Akron1c34ce62021-09-23 23:27:39 +020030type Tokenizer interface {
31 Transduce(r io.Reader, w io.Writer) bool
Akron4f6b28c2021-10-25 00:52:03 +020032 TransduceTokenWriter(r io.Reader, w *TokenWriter) bool
Akron941f2152021-09-26 15:14:25 +020033 Type() string
Akron1c34ce62021-09-23 23:27:39 +020034}
35
36// Automaton is the intermediate representation
Akron0d0daa22021-09-21 16:32:23 +020037// of the tokenizer.
Akron1c34ce62021-09-23 23:27:39 +020038type Automaton struct {
Akron0d0daa22021-09-21 16:32:23 +020039 sigmaRev map[int]rune
40 sigmaMCS map[int]string
41 arcCount int
42 sigmaCount int
Akron1c34ce62021-09-23 23:27:39 +020043 stateCount int
Akron0d0daa22021-09-21 16:32:23 +020044 transitions []map[int]*edge
45
46 // Special symbols in sigma
47 epsilon int
48 unknown int
49 identity int
50 final int
51 tokenend int
52}
53
54// ParseFoma reads the FST from a foma file
55// and creates an internal representation,
56// in case it follows the tokenizer's convention.
Akron1c34ce62021-09-23 23:27:39 +020057func LoadFomaFile(file string) *Automaton {
Akron0d0daa22021-09-21 16:32:23 +020058 f, err := os.Open(file)
59 if err != nil {
60 log.Print(err)
61 return nil
62 }
63 defer f.Close()
64
65 gz, err := gzip.NewReader(f)
66 if err != nil {
67 log.Print(err)
68 return nil
69 }
70 defer gz.Close()
71
72 return ParseFoma(gz)
73}
74
75// ParseFoma reads the FST from a foma file reader
76// and creates an internal representation,
77// in case it follows the tokenizer's convention.
Akron1c34ce62021-09-23 23:27:39 +020078func ParseFoma(ior io.Reader) *Automaton {
Akron0d0daa22021-09-21 16:32:23 +020079 r := bufio.NewReader(ior)
80
Akron1c34ce62021-09-23 23:27:39 +020081 auto := &Automaton{
Akron0d0daa22021-09-21 16:32:23 +020082 sigmaRev: make(map[int]rune),
83 sigmaMCS: make(map[int]string),
84 epsilon: -1,
85 unknown: -1,
86 identity: -1,
87 final: -1,
88 tokenend: -1,
89 }
90
91 var state, inSym, outSym, end, final int
92
93 mode := 0
94 var elem []string
95 var elemint [5]int
96
97 // Iterate over all lines of the file.
98 // This is mainly based on foma2js,
99 // licensed under the Apache License, version 2,
100 // and written by Mans Hulden.
101 for {
102 line, err := r.ReadString('\n')
103 if err != nil {
104 if err == io.EOF {
105 break
106 }
107 log.Print(err)
108 return nil
109 }
110
111 // Read parser mode for the following lines
112 if strings.HasPrefix(line, "##") {
113 if strings.HasPrefix(line, "##props##") {
114 mode = PROPS
115
116 } else if strings.HasPrefix(line, "##states##") {
117 mode = STATES
118
119 // Adds a final transition symbol to sigma
120 // written as '#' in Mizobuchi et al (2000)
Akron1c34ce62021-09-23 23:27:39 +0200121 auto.sigmaCount++
122 auto.final = auto.sigmaCount
Akron0d0daa22021-09-21 16:32:23 +0200123
124 } else if strings.HasPrefix(line, "##sigma##") {
125
126 mode = SIGMA
127
128 } else if strings.HasPrefix(line, "##end##") {
129
130 mode = NONE
131
132 } else if !strings.HasPrefix(line, "##foma-net") {
133 log.Print("Unknown input line")
134 break
135 }
136 continue
137 }
138
139 // Based on the current parser mode, interpret the lines
140 switch mode {
141 case PROPS:
142 {
143 elem = strings.Split(line, " ")
144 /*
145 fmt.Println("arity: " + elem[0])
146 fmt.Println("arccount: " + elem[1])
147 fmt.Println("statecount: " + elem[2])
148 fmt.Println("linecount: " + elem[3])
149 fmt.Println("finalcount: " + elem[4])
150 fmt.Println("pathcount: " + elem[5])
151 fmt.Println("is_deterministic: " + elem[6])
152 fmt.Println("is_pruned: " + elem[7])
153 fmt.Println("is_minimized: " + elem[8])
154 fmt.Println("is_epsilon_free: " + elem[9])
155 fmt.Println("is_loop_free: " + elem[10])
156 fmt.Println("extras: " + elem[11])
157 fmt.Println("name: " + elem[12])
158 */
159 if elem[6] != "1" {
160 log.Print("The FST needs to be deterministic")
161 return nil
162 }
163
164 if elem[9] != "1" {
165 log.Print("The FST needs to be epsilon free")
166 return nil
167 }
168
169 elemint[0], err = strconv.Atoi(elem[1])
170 if err != nil {
171 log.Print("Can't read arccount")
172 return nil
173 }
Akron1c34ce62021-09-23 23:27:39 +0200174 auto.arcCount = elemint[0]
Akron0d0daa22021-09-21 16:32:23 +0200175
176 elemint[0], err = strconv.Atoi(elem[2])
177 if err != nil {
178 log.Print("Can't read statecount")
179 return nil
180 }
181
182 // States start at 1 in Mizobuchi et al (2000),
183 // as the state 0 is associated with a fail.
184 // Initialize states and transitions
Akron1c34ce62021-09-23 23:27:39 +0200185 auto.stateCount = elemint[0]
186 auto.transitions = make([]map[int]*edge, elemint[0]+1)
Akron0d0daa22021-09-21 16:32:23 +0200187 continue
188 }
189 case STATES:
190 {
191 elem = strings.Split(line[0:len(line)-1], " ")
192 if elem[0] == "-1" {
193 if DEBUG {
194 fmt.Println("Skip", elem)
195 }
196 continue
197 }
198 elemint[0], err = strconv.Atoi(elem[0])
199 if err != nil {
200 fmt.Println("Unable to translate", elem[0])
201 break
202 }
203
204 if len(elem) > 1 {
205 elemint[1], err = strconv.Atoi(elem[1])
206 if err != nil {
207 fmt.Println("Unable to translate", elem[1])
208 break
209 }
210 if len(elem) > 2 {
211 elemint[2], err = strconv.Atoi(elem[2])
212 if err != nil {
213 fmt.Println("Unable to translate", elem[2])
214 break
215 }
216 if len(elem) > 3 {
217 elemint[3], err = strconv.Atoi(elem[3])
218 if err != nil {
219 fmt.Println("Unable to translate", elem[3])
220 break
221 }
222 if len(elem) > 4 {
223 elemint[4], err = strconv.Atoi(elem[4])
224 if err != nil {
225 fmt.Println("Unable to translate", elem[4])
226 break
227 }
228 }
229 }
230 }
231 }
232
233 switch len(elem) {
234 case 5:
235 {
236 state = elemint[0]
237 inSym = elemint[1]
238 outSym = elemint[2]
239 end = elemint[3]
240 final = elemint[4]
241 }
242 case 4:
243 {
244 if elemint[1] == -1 {
245 state = elemint[0]
246 final = elemint[3]
247
248 // Final state that has no outgoing edges
249 if final == 1 {
250
251 // Initialize outgoing states
Akron1c34ce62021-09-23 23:27:39 +0200252 if auto.transitions[state+1] == nil {
253 auto.transitions[state+1] = make(map[int]*edge)
Akron0d0daa22021-09-21 16:32:23 +0200254 }
255
256 // TODO:
257 // Maybe this is less relevant for tokenizers
Akron1c34ce62021-09-23 23:27:39 +0200258 auto.transitions[state+1][auto.final] = &edge{}
Akron0d0daa22021-09-21 16:32:23 +0200259 }
260 continue
261 } else {
262 state = elemint[0]
263 inSym = elemint[1]
264 end = elemint[2]
265 final = elemint[3]
266 outSym = inSym
267 }
268 }
269 case 3:
270 {
271 inSym = elemint[0]
272 outSym = elemint[1]
273 end = elemint[2]
274 }
275 case 2:
276 {
277 inSym = elemint[0]
278 end = elemint[1]
279 outSym = inSym
280 }
281 }
282
283 nontoken := false
284 tokenend := false
285
286 // While the states in foma start with 0, the states in the
287 // Mizobuchi FSA start with one - so we increase every state by 1.
288 // We also increase sigma by 1, so there are no 0 transitions.
289 inSym++
290 outSym++
291
292 // Only a limited list of transitions are allowed
293 if inSym != outSym {
Akron1c34ce62021-09-23 23:27:39 +0200294 if outSym == auto.tokenend && inSym == auto.epsilon {
Akron0d0daa22021-09-21 16:32:23 +0200295 tokenend = true
Akron1c34ce62021-09-23 23:27:39 +0200296 } else if outSym == auto.epsilon {
Akron0d0daa22021-09-21 16:32:23 +0200297 nontoken = true
298 } else {
299 log.Println(
300 "Unsupported transition: " +
301 strconv.Itoa(state) +
302 " -> " + strconv.Itoa(end) +
303 " (" +
304 strconv.Itoa(inSym) +
305 ":" +
306 strconv.Itoa(outSym) +
307 ") (" +
Akron1c34ce62021-09-23 23:27:39 +0200308 string(auto.sigmaRev[inSym]) +
Akron0d0daa22021-09-21 16:32:23 +0200309 ":" +
Akron1c34ce62021-09-23 23:27:39 +0200310 string(auto.sigmaRev[outSym]) +
Akron0d0daa22021-09-21 16:32:23 +0200311 ")")
312 return nil
313 }
Akron1c34ce62021-09-23 23:27:39 +0200314 } else if inSym == auto.tokenend {
Akron0d0daa22021-09-21 16:32:23 +0200315 // Ignore tokenend accepting arcs
316 continue
Akron1c34ce62021-09-23 23:27:39 +0200317 } else if inSym == auto.epsilon {
Akron0d0daa22021-09-21 16:32:23 +0200318 log.Println("General epsilon transitions are not supported")
319 return nil
Akron1c34ce62021-09-23 23:27:39 +0200320 } else if auto.sigmaMCS[inSym] != "" {
Akron0d0daa22021-09-21 16:32:23 +0200321 // log.Fatalln("Non supported character", tok.sigmaMCS[inSym])
322 // Ignore MCS transitions
323 continue
324 }
325
326 // Create an edge based on the collected information
327 targetObj := &edge{
328 inSym: inSym,
329 outSym: outSym,
330 end: end + 1,
331 tokenend: tokenend,
332 nontoken: nontoken,
333 }
334
335 // Initialize outgoing states
Akron1c34ce62021-09-23 23:27:39 +0200336 if auto.transitions[state+1] == nil {
337 auto.transitions[state+1] = make(map[int]*edge)
Akron0d0daa22021-09-21 16:32:23 +0200338 }
339
340 // Ignore transitions with invalid symbols
341 if inSym >= 0 {
Akron1c34ce62021-09-23 23:27:39 +0200342 auto.transitions[state+1][inSym] = targetObj
Akron0d0daa22021-09-21 16:32:23 +0200343 }
344
345 // Add final transition
346 if final == 1 {
347 // TODO:
348 // Maybe this is less relevant for tokenizers
Akron1c34ce62021-09-23 23:27:39 +0200349 auto.transitions[state+1][auto.final] = &edge{}
Akron0d0daa22021-09-21 16:32:23 +0200350 }
351
352 if DEBUG {
353 fmt.Println("Add",
354 state+1, "->", end+1,
355 "(",
356 inSym,
357 ":",
358 outSym,
359 ") (",
Akron1c34ce62021-09-23 23:27:39 +0200360 string(auto.sigmaRev[inSym]),
Akron0d0daa22021-09-21 16:32:23 +0200361 ":",
Akron1c34ce62021-09-23 23:27:39 +0200362 string(auto.sigmaRev[outSym]),
Akron0d0daa22021-09-21 16:32:23 +0200363 ")",
364 ";",
365 "TE:", tokenend,
366 "NT:", nontoken,
367 "FIN:", final)
368 }
369
370 continue
371 }
372 case SIGMA:
373 {
374 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
375
376 // Turn string into sigma id
377 number, err := strconv.Atoi(elem[0])
378
379 // ID needs to be > 1
380 number++
381
382 if err != nil {
383 log.Println(err)
384 return nil
385 }
386
Akron1c34ce62021-09-23 23:27:39 +0200387 auto.sigmaCount = number
Akron0d0daa22021-09-21 16:32:23 +0200388
389 var symbol rune
390
391 // Read rune
392 if utf8.RuneCountInString(elem[1]) == 1 {
393 symbol = []rune(elem[1])[0]
394
395 } else if utf8.RuneCountInString(elem[1]) > 1 {
396
397 // Probably a MCS
398 switch elem[1] {
399 case "@_EPSILON_SYMBOL_@":
400 {
Akron1c34ce62021-09-23 23:27:39 +0200401 auto.epsilon = number
Akron0d0daa22021-09-21 16:32:23 +0200402 }
403 case "@_UNKNOWN_SYMBOL_@":
404 {
Akron1c34ce62021-09-23 23:27:39 +0200405 auto.unknown = number
Akron0d0daa22021-09-21 16:32:23 +0200406 }
407
408 case "@_IDENTITY_SYMBOL_@":
409 {
Akron1c34ce62021-09-23 23:27:39 +0200410 auto.identity = number
Akron0d0daa22021-09-21 16:32:23 +0200411 }
412
413 case "@_TOKEN_SYMBOL_@":
414 {
Akron1c34ce62021-09-23 23:27:39 +0200415 auto.tokenend = number
Akron0d0daa22021-09-21 16:32:23 +0200416 }
417 default:
418 {
419 // MCS not supported
Akron1c34ce62021-09-23 23:27:39 +0200420 auto.sigmaMCS[number] = line
Akron0d0daa22021-09-21 16:32:23 +0200421 }
422 }
423 continue
424
425 } else { // Probably a new line symbol
426 line, err = r.ReadString('\n')
427 if err != nil {
428 log.Println(err)
429 return nil
430 }
431 if len(line) != 1 {
432 // MCS not supported
Akron1c34ce62021-09-23 23:27:39 +0200433 auto.sigmaMCS[number] = line
Akron0d0daa22021-09-21 16:32:23 +0200434 continue
435 }
436 symbol = rune('\n')
437 }
438
Akron1c34ce62021-09-23 23:27:39 +0200439 auto.sigmaRev[number] = symbol
Akron0d0daa22021-09-21 16:32:23 +0200440 }
441 }
442 }
Akron1c34ce62021-09-23 23:27:39 +0200443 auto.sigmaMCS = nil
444 return auto
Akron0d0daa22021-09-21 16:32:23 +0200445}
446
Akron941f2152021-09-26 15:14:25 +0200447func LoadTokenizerFile(file string) Tokenizer {
448 f, err := os.Open(file)
449 if err != nil {
450 log.Println(err)
451 return nil
452 }
453 defer f.Close()
454
455 gz, err := gzip.NewReader(f)
456 if err != nil {
457 log.Println(err)
458 return nil
459 }
460 defer gz.Close()
461
462 r := bufio.NewReader(gz)
463
464 mstr, err := r.Peek(len(DAMAGIC))
465
466 if err != nil {
467 log.Println(err)
468 return nil
469 }
470
471 if string(mstr) == MAMAGIC {
472 return ParseMatrix(r)
473 } else if string(mstr) == DAMAGIC {
474 return ParseDatok(r)
475 }
476
477 log.Println("Neither a matrix nor a datok file")
478 return nil
479}
480
Akron0d0daa22021-09-21 16:32:23 +0200481// Set alphabet A to the list of all symbols
482// outgoing from s
Akron1c34ce62021-09-23 23:27:39 +0200483func (auto *Automaton) getSet(s int, A *[]int) {
484 for a := range auto.transitions[s] {
Akron0d0daa22021-09-21 16:32:23 +0200485 *A = append(*A, a)
486 }
487
488 // Not required, but simplifies bug hunting
489 // sort.Ints(*A)
490}