blob: 9d7106c7f1fbf79aac4fd05a9e5037ccac220b33 [file] [log] [blame]
Akron0d0daa22021-09-21 16:32:23 +02001package datok
2
3import (
4 "bufio"
5 "compress/gzip"
Akron0d0daa22021-09-21 16:32:23 +02006 "io"
7 "log"
8 "os"
9 "strconv"
10 "strings"
11 "unicode/utf8"
12)
13
14const (
15 PROPS = 1
16 SIGMA = 2
17 STATES = 3
18 NONE = 4
19)
20
21type edge struct {
22 inSym int
23 outSym int
24 end int
25 nontoken bool
26 tokenend bool
27}
28
Akron1c34ce62021-09-23 23:27:39 +020029type Tokenizer interface {
30 Transduce(r io.Reader, w io.Writer) bool
Akron4f6b28c2021-10-25 00:52:03 +020031 TransduceTokenWriter(r io.Reader, w *TokenWriter) bool
Akron941f2152021-09-26 15:14:25 +020032 Type() string
Akron1c34ce62021-09-23 23:27:39 +020033}
34
35// Automaton is the intermediate representation
Akron0d0daa22021-09-21 16:32:23 +020036// of the tokenizer.
Akron1c34ce62021-09-23 23:27:39 +020037type Automaton struct {
Akron0d0daa22021-09-21 16:32:23 +020038 sigmaRev map[int]rune
39 sigmaMCS map[int]string
40 arcCount int
41 sigmaCount int
Akron1c34ce62021-09-23 23:27:39 +020042 stateCount int
Akron0d0daa22021-09-21 16:32:23 +020043 transitions []map[int]*edge
44
45 // Special symbols in sigma
46 epsilon int
47 unknown int
48 identity int
49 final int
50 tokenend int
51}
52
53// ParseFoma reads the FST from a foma file
54// and creates an internal representation,
55// in case it follows the tokenizer's convention.
Akron1c34ce62021-09-23 23:27:39 +020056func LoadFomaFile(file string) *Automaton {
Akron0d0daa22021-09-21 16:32:23 +020057 f, err := os.Open(file)
58 if err != nil {
59 log.Print(err)
60 return nil
61 }
62 defer f.Close()
63
64 gz, err := gzip.NewReader(f)
65 if err != nil {
66 log.Print(err)
67 return nil
68 }
69 defer gz.Close()
70
71 return ParseFoma(gz)
72}
73
74// ParseFoma reads the FST from a foma file reader
75// and creates an internal representation,
76// in case it follows the tokenizer's convention.
Akron1c34ce62021-09-23 23:27:39 +020077func ParseFoma(ior io.Reader) *Automaton {
Akron0d0daa22021-09-21 16:32:23 +020078 r := bufio.NewReader(ior)
79
Akron1c34ce62021-09-23 23:27:39 +020080 auto := &Automaton{
Akron0d0daa22021-09-21 16:32:23 +020081 sigmaRev: make(map[int]rune),
82 sigmaMCS: make(map[int]string),
83 epsilon: -1,
84 unknown: -1,
85 identity: -1,
86 final: -1,
87 tokenend: -1,
88 }
89
90 var state, inSym, outSym, end, final int
91
92 mode := 0
93 var elem []string
94 var elemint [5]int
95
96 // Iterate over all lines of the file.
97 // This is mainly based on foma2js,
98 // licensed under the Apache License, version 2,
99 // and written by Mans Hulden.
100 for {
101 line, err := r.ReadString('\n')
102 if err != nil {
103 if err == io.EOF {
104 break
105 }
106 log.Print(err)
107 return nil
108 }
109
110 // Read parser mode for the following lines
111 if strings.HasPrefix(line, "##") {
112 if strings.HasPrefix(line, "##props##") {
113 mode = PROPS
114
115 } else if strings.HasPrefix(line, "##states##") {
116 mode = STATES
117
118 // Adds a final transition symbol to sigma
119 // written as '#' in Mizobuchi et al (2000)
Akron1c34ce62021-09-23 23:27:39 +0200120 auto.sigmaCount++
121 auto.final = auto.sigmaCount
Akron0d0daa22021-09-21 16:32:23 +0200122
123 } else if strings.HasPrefix(line, "##sigma##") {
124
125 mode = SIGMA
126
127 } else if strings.HasPrefix(line, "##end##") {
128
129 mode = NONE
130
131 } else if !strings.HasPrefix(line, "##foma-net") {
132 log.Print("Unknown input line")
133 break
134 }
135 continue
136 }
137
138 // Based on the current parser mode, interpret the lines
139 switch mode {
140 case PROPS:
141 {
142 elem = strings.Split(line, " ")
143 /*
Akron9c3bf7f2021-11-03 19:52:12 +0100144 log.Println("arity: " + elem[0])
145 log.Println("arccount: " + elem[1])
146 log.Println("statecount: " + elem[2])
147 log.Println("linecount: " + elem[3])
148 log.Println("finalcount: " + elem[4])
149 log.Println("pathcount: " + elem[5])
150 log.Println("is_deterministic: " + elem[6])
151 log.Println("is_pruned: " + elem[7])
152 log.Println("is_minimized: " + elem[8])
153 log.Println("is_epsilon_free: " + elem[9])
154 log.Println("is_loop_free: " + elem[10])
155 log.Println("extras: " + elem[11])
156 log.Println("name: " + elem[12])
Akron0d0daa22021-09-21 16:32:23 +0200157 */
158 if elem[6] != "1" {
159 log.Print("The FST needs to be deterministic")
160 return nil
161 }
162
163 if elem[9] != "1" {
164 log.Print("The FST needs to be epsilon free")
165 return nil
166 }
167
168 elemint[0], err = strconv.Atoi(elem[1])
169 if err != nil {
170 log.Print("Can't read arccount")
171 return nil
172 }
Akron1c34ce62021-09-23 23:27:39 +0200173 auto.arcCount = elemint[0]
Akron0d0daa22021-09-21 16:32:23 +0200174
175 elemint[0], err = strconv.Atoi(elem[2])
176 if err != nil {
177 log.Print("Can't read statecount")
178 return nil
179 }
180
181 // States start at 1 in Mizobuchi et al (2000),
182 // as the state 0 is associated with a fail.
183 // Initialize states and transitions
Akron1c34ce62021-09-23 23:27:39 +0200184 auto.stateCount = elemint[0]
185 auto.transitions = make([]map[int]*edge, elemint[0]+1)
Akron0d0daa22021-09-21 16:32:23 +0200186 continue
187 }
188 case STATES:
189 {
190 elem = strings.Split(line[0:len(line)-1], " ")
191 if elem[0] == "-1" {
192 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100193 log.Println("Skip", elem)
Akron0d0daa22021-09-21 16:32:23 +0200194 }
195 continue
196 }
197 elemint[0], err = strconv.Atoi(elem[0])
198 if err != nil {
Akron9c3bf7f2021-11-03 19:52:12 +0100199 log.Println("Unable to translate", elem[0])
Akron0d0daa22021-09-21 16:32:23 +0200200 break
201 }
202
203 if len(elem) > 1 {
204 elemint[1], err = strconv.Atoi(elem[1])
205 if err != nil {
Akron9c3bf7f2021-11-03 19:52:12 +0100206 log.Println("Unable to translate", elem[1])
Akron0d0daa22021-09-21 16:32:23 +0200207 break
208 }
209 if len(elem) > 2 {
210 elemint[2], err = strconv.Atoi(elem[2])
211 if err != nil {
Akron9c3bf7f2021-11-03 19:52:12 +0100212 log.Println("Unable to translate", elem[2])
Akron0d0daa22021-09-21 16:32:23 +0200213 break
214 }
215 if len(elem) > 3 {
216 elemint[3], err = strconv.Atoi(elem[3])
217 if err != nil {
Akron9c3bf7f2021-11-03 19:52:12 +0100218 log.Println("Unable to translate", elem[3])
Akron0d0daa22021-09-21 16:32:23 +0200219 break
220 }
221 if len(elem) > 4 {
222 elemint[4], err = strconv.Atoi(elem[4])
223 if err != nil {
Akron9c3bf7f2021-11-03 19:52:12 +0100224 log.Println("Unable to translate", elem[4])
Akron0d0daa22021-09-21 16:32:23 +0200225 break
226 }
227 }
228 }
229 }
230 }
231
232 switch len(elem) {
233 case 5:
234 {
235 state = elemint[0]
236 inSym = elemint[1]
237 outSym = elemint[2]
238 end = elemint[3]
239 final = elemint[4]
240 }
241 case 4:
242 {
243 if elemint[1] == -1 {
244 state = elemint[0]
245 final = elemint[3]
246
247 // Final state that has no outgoing edges
248 if final == 1 {
249
250 // Initialize outgoing states
Akron1c34ce62021-09-23 23:27:39 +0200251 if auto.transitions[state+1] == nil {
252 auto.transitions[state+1] = make(map[int]*edge)
Akron0d0daa22021-09-21 16:32:23 +0200253 }
254
255 // TODO:
256 // Maybe this is less relevant for tokenizers
Akron1c34ce62021-09-23 23:27:39 +0200257 auto.transitions[state+1][auto.final] = &edge{}
Akron0d0daa22021-09-21 16:32:23 +0200258 }
259 continue
260 } else {
261 state = elemint[0]
262 inSym = elemint[1]
263 end = elemint[2]
264 final = elemint[3]
265 outSym = inSym
266 }
267 }
268 case 3:
269 {
270 inSym = elemint[0]
271 outSym = elemint[1]
272 end = elemint[2]
273 }
274 case 2:
275 {
276 inSym = elemint[0]
277 end = elemint[1]
278 outSym = inSym
279 }
280 }
281
282 nontoken := false
283 tokenend := false
284
285 // While the states in foma start with 0, the states in the
286 // Mizobuchi FSA start with one - so we increase every state by 1.
287 // We also increase sigma by 1, so there are no 0 transitions.
288 inSym++
289 outSym++
290
291 // Only a limited list of transitions are allowed
292 if inSym != outSym {
Akron1c34ce62021-09-23 23:27:39 +0200293 if outSym == auto.tokenend && inSym == auto.epsilon {
Akron0d0daa22021-09-21 16:32:23 +0200294 tokenend = true
Akron1c34ce62021-09-23 23:27:39 +0200295 } else if outSym == auto.epsilon {
Akron0d0daa22021-09-21 16:32:23 +0200296 nontoken = true
297 } else {
298 log.Println(
299 "Unsupported transition: " +
300 strconv.Itoa(state) +
301 " -> " + strconv.Itoa(end) +
302 " (" +
303 strconv.Itoa(inSym) +
304 ":" +
305 strconv.Itoa(outSym) +
306 ") (" +
Akron1c34ce62021-09-23 23:27:39 +0200307 string(auto.sigmaRev[inSym]) +
Akron0d0daa22021-09-21 16:32:23 +0200308 ":" +
Akron1c34ce62021-09-23 23:27:39 +0200309 string(auto.sigmaRev[outSym]) +
Akron0d0daa22021-09-21 16:32:23 +0200310 ")")
311 return nil
312 }
Akron1c34ce62021-09-23 23:27:39 +0200313 } else if inSym == auto.tokenend {
Akron0d0daa22021-09-21 16:32:23 +0200314 // Ignore tokenend accepting arcs
315 continue
Akron1c34ce62021-09-23 23:27:39 +0200316 } else if inSym == auto.epsilon {
Akron0d0daa22021-09-21 16:32:23 +0200317 log.Println("General epsilon transitions are not supported")
318 return nil
Akron1c34ce62021-09-23 23:27:39 +0200319 } else if auto.sigmaMCS[inSym] != "" {
Akron0d0daa22021-09-21 16:32:23 +0200320 // log.Fatalln("Non supported character", tok.sigmaMCS[inSym])
321 // Ignore MCS transitions
322 continue
323 }
324
325 // Create an edge based on the collected information
326 targetObj := &edge{
327 inSym: inSym,
328 outSym: outSym,
329 end: end + 1,
330 tokenend: tokenend,
331 nontoken: nontoken,
332 }
333
334 // Initialize outgoing states
Akron1c34ce62021-09-23 23:27:39 +0200335 if auto.transitions[state+1] == nil {
336 auto.transitions[state+1] = make(map[int]*edge)
Akron0d0daa22021-09-21 16:32:23 +0200337 }
338
339 // Ignore transitions with invalid symbols
340 if inSym >= 0 {
Akron1c34ce62021-09-23 23:27:39 +0200341 auto.transitions[state+1][inSym] = targetObj
Akron0d0daa22021-09-21 16:32:23 +0200342 }
343
344 // Add final transition
345 if final == 1 {
346 // TODO:
347 // Maybe this is less relevant for tokenizers
Akron1c34ce62021-09-23 23:27:39 +0200348 auto.transitions[state+1][auto.final] = &edge{}
Akron0d0daa22021-09-21 16:32:23 +0200349 }
350
351 if DEBUG {
Akron9c3bf7f2021-11-03 19:52:12 +0100352 log.Println("Add",
Akron0d0daa22021-09-21 16:32:23 +0200353 state+1, "->", end+1,
354 "(",
355 inSym,
356 ":",
357 outSym,
358 ") (",
Akron1c34ce62021-09-23 23:27:39 +0200359 string(auto.sigmaRev[inSym]),
Akron0d0daa22021-09-21 16:32:23 +0200360 ":",
Akron1c34ce62021-09-23 23:27:39 +0200361 string(auto.sigmaRev[outSym]),
Akron0d0daa22021-09-21 16:32:23 +0200362 ")",
363 ";",
364 "TE:", tokenend,
365 "NT:", nontoken,
366 "FIN:", final)
367 }
368
369 continue
370 }
371 case SIGMA:
372 {
373 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
374
375 // Turn string into sigma id
376 number, err := strconv.Atoi(elem[0])
377
378 // ID needs to be > 1
379 number++
380
381 if err != nil {
382 log.Println(err)
383 return nil
384 }
385
Akron1c34ce62021-09-23 23:27:39 +0200386 auto.sigmaCount = number
Akron0d0daa22021-09-21 16:32:23 +0200387
388 var symbol rune
389
390 // Read rune
391 if utf8.RuneCountInString(elem[1]) == 1 {
392 symbol = []rune(elem[1])[0]
393
394 } else if utf8.RuneCountInString(elem[1]) > 1 {
395
396 // Probably a MCS
397 switch elem[1] {
398 case "@_EPSILON_SYMBOL_@":
399 {
Akron1c34ce62021-09-23 23:27:39 +0200400 auto.epsilon = number
Akron0d0daa22021-09-21 16:32:23 +0200401 }
402 case "@_UNKNOWN_SYMBOL_@":
403 {
Akron1c34ce62021-09-23 23:27:39 +0200404 auto.unknown = number
Akron0d0daa22021-09-21 16:32:23 +0200405 }
406
407 case "@_IDENTITY_SYMBOL_@":
408 {
Akron1c34ce62021-09-23 23:27:39 +0200409 auto.identity = number
Akron0d0daa22021-09-21 16:32:23 +0200410 }
411
Akronb15acb92022-04-16 11:01:46 +0200412 // Deprecated
Akron0d0daa22021-09-21 16:32:23 +0200413 case "@_TOKEN_SYMBOL_@":
414 {
Akron1c34ce62021-09-23 23:27:39 +0200415 auto.tokenend = number
Akron0d0daa22021-09-21 16:32:23 +0200416 }
Akronb15acb92022-04-16 11:01:46 +0200417
418 case "@_TOKEN_BOUND_@":
419 {
420 auto.tokenend = number
421 }
Akron0d0daa22021-09-21 16:32:23 +0200422 default:
423 {
424 // MCS not supported
Akron1c34ce62021-09-23 23:27:39 +0200425 auto.sigmaMCS[number] = line
Akron0d0daa22021-09-21 16:32:23 +0200426 }
427 }
428 continue
429
430 } else { // Probably a new line symbol
431 line, err = r.ReadString('\n')
432 if err != nil {
433 log.Println(err)
434 return nil
435 }
436 if len(line) != 1 {
437 // MCS not supported
Akron1c34ce62021-09-23 23:27:39 +0200438 auto.sigmaMCS[number] = line
Akron0d0daa22021-09-21 16:32:23 +0200439 continue
440 }
441 symbol = rune('\n')
442 }
443
Akron1c34ce62021-09-23 23:27:39 +0200444 auto.sigmaRev[number] = symbol
Akron0d0daa22021-09-21 16:32:23 +0200445 }
446 }
447 }
Akron1c34ce62021-09-23 23:27:39 +0200448 auto.sigmaMCS = nil
449 return auto
Akron0d0daa22021-09-21 16:32:23 +0200450}
451
Akron941f2152021-09-26 15:14:25 +0200452func LoadTokenizerFile(file string) Tokenizer {
453 f, err := os.Open(file)
454 if err != nil {
455 log.Println(err)
456 return nil
457 }
458 defer f.Close()
459
460 gz, err := gzip.NewReader(f)
461 if err != nil {
462 log.Println(err)
463 return nil
464 }
465 defer gz.Close()
466
467 r := bufio.NewReader(gz)
468
469 mstr, err := r.Peek(len(DAMAGIC))
470
471 if err != nil {
472 log.Println(err)
473 return nil
474 }
475
476 if string(mstr) == MAMAGIC {
477 return ParseMatrix(r)
478 } else if string(mstr) == DAMAGIC {
479 return ParseDatok(r)
480 }
481
482 log.Println("Neither a matrix nor a datok file")
483 return nil
484}
485
Akron0d0daa22021-09-21 16:32:23 +0200486// Set alphabet A to the list of all symbols
487// outgoing from s
Akron1c34ce62021-09-23 23:27:39 +0200488func (auto *Automaton) getSet(s int, A *[]int) {
489 for a := range auto.transitions[s] {
Akron0d0daa22021-09-21 16:32:23 +0200490 *A = append(*A, a)
491 }
492
493 // Not required, but simplifies bug hunting
494 // sort.Ints(*A)
495}