package datokenizer

/**
 * The file reader is basically a port of foma2js,
 * licensed under the Apache License, version 2,
 * and written by Mans Hulden.
 */

import (
	"bufio"
	"compress/gzip"
	"fmt"
	"io"
	"os"
	"strconv"
	"strings"
	"unicode/utf8"
)

const (
	PROPS  = 1
	SIGMA  = 2
	STATES = 3
	NONE   = 4
)

// Special symbols in sigma
var EPSILON, UNKNOWN, IDENTITY, FINAL int

type mapping struct {
	source int
	target int
}

type edge struct {
	in     int
	out    int
	target int
}

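// Tokenizer represents a finite state tokenizer read from a foma
// file. sigma maps input runes to symbol numbers (sigma_rev is the
// inverse); transitions collects one outgoing edge map per state
// while parsing, and buildDA compiles these maps into the double
// array stored in array.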
type Tokenizer struct {
	sigma       map[rune]int
	sigma_rev   map[int]rune
	arccount    int
	statecount  int
	sigmacount  int
	maxsize     int
	array       []int
	transitions []map[int]*edge
}

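// parse_file opens a gzip compressed foma file and parses it
// into a Tokenizer.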
func parse_file(file string) *Tokenizer {
	f, err := os.Open(file)
	if err != nil {
		panic(err)
	}
	defer f.Close()

	gz, err := gzip.NewReader(f)
	if err != nil {
		panic(err)
	}
	defer gz.Close()

	return parse(gz)
}

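// parse reads a foma file in its plain text format section by
// section (##props##, ##sigma##, ##states##) and collects the
// alphabet and all transitions of the automaton.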
func parse(ior io.Reader) *Tokenizer {
	r := bufio.NewReader(ior)

	tok := &Tokenizer{
		sigma:     make(map[rune]int),
		sigma_rev: make(map[int]rune),
	}

	final := false

	var arrstate, arrin, arrout, arrtarget, arrfinal int

	mode := 0
	var elem []string
	var elemint [5]int

	for {
		line, err := r.ReadString('\n')
		if err != nil {
			if err == io.EOF {
				break
			}
			panic(err)
		}
		if strings.HasPrefix(line, "##foma-net") {
			continue
		}
		if strings.HasPrefix(line, "##props##") {
			mode = PROPS
			continue
		}
		if strings.HasPrefix(line, "##states##") {
			mode = STATES

			// Adds a final transition symbol to sigma
			// written as '#' in Mizobuchi et al (2000)
			tok.sigmacount++
			FINAL = tok.sigmacount
			continue
		}
		if strings.HasPrefix(line, "##sigma##") {
			mode = SIGMA
			continue
		}
		if strings.HasPrefix(line, "##end##") {
			mode = NONE
			continue
		}

		switch mode {
		case PROPS:
			{
				elem = strings.Split(line, " ")
				/*
					fmt.Println("arity: " + elem[0])
					fmt.Println("arccount: " + elem[1])
					fmt.Println("statecount: " + elem[2])
					fmt.Println("linecount: " + elem[3])
					fmt.Println("finalcount: " + elem[4])
					fmt.Println("pathcount: " + elem[5])
					fmt.Println("is_deterministic: " + elem[6])
					fmt.Println("is_pruned: " + elem[7])
					fmt.Println("is_minimized: " + elem[8])
					fmt.Println("is_epsilon_free: " + elem[9])
					fmt.Println("is_loop_free: " + elem[10])
					fmt.Println("extras: " + elem[11])
					fmt.Println("name: " + elem[12])
				*/
				if elem[6] != "1" {
					panic("The FST needs to be deterministic")
				}
				if elem[9] != "1" {
					panic("The FST needs to be epsilon free")
				}

				elemint[0], err = strconv.Atoi(elem[1])
				if err != nil {
					panic("Can't read arccount")
				}
				tok.arccount = elemint[0]

				// States start at 1 in Mizobuchi et al (2000),
				// as the state 0 is associated with a fail.
				// Initialize states and transitions
				elemint[0], err = strconv.Atoi(elem[2])
				if err != nil {
					panic("Can't read statecount")
				}
				tok.statecount = elemint[0]
				tok.transitions = make([]map[int]*edge, elemint[0]+1)
				continue
			}
		case STATES:
			{
				elem = strings.Split(line[0:len(line)-1], " ")
				if elem[0] == "-1" {
					continue
				}
				elemint[0], err = strconv.Atoi(elem[0])
				if err != nil {
					break
				}

				if len(elem) > 1 {
					elemint[1], err = strconv.Atoi(elem[1])
					if err != nil {
						break
					}
					if len(elem) > 2 {
						elemint[2], err = strconv.Atoi(elem[2])
						if err != nil {
							break
						}
						if len(elem) > 3 {
							elemint[3], err = strconv.Atoi(elem[3])
							if err != nil {
								break
							}
							if len(elem) > 4 {
								elemint[4], err = strconv.Atoi(elem[4])
								if err != nil {
									break
								}
							}
						}
					}
				}

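				// foma abbreviates the state lines: a full line is
				// "state in out target final"; shorter lines omit the
				// state (reusing the previous one) and/or the output
				// symbol (then out == in). A line with in == -1 only
				// declares whether the state is final.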
				switch len(elem) {
				case 5:
					{
						arrstate = elemint[0]
						arrin = elemint[1]
						arrout = elemint[2]
						arrtarget = elemint[3]
						arrfinal = elemint[4]
					}
				case 4:
					{
						if elemint[1] == -1 {
							arrstate = elemint[0]
							arrfinal = elemint[3]
						} else {
							arrstate = elemint[0]
							arrin = elemint[1]
							arrtarget = elemint[2]
							arrfinal = elemint[3]
							arrout = arrin
						}
					}
				case 3:
					{
						arrin = elemint[0]
						arrout = elemint[1]
						arrtarget = elemint[2]
					}
				case 2:
					{
						arrin = elemint[0]
						arrtarget = elemint[1]
						arrout = arrin
					}
				}

				// This collects all edges until arrstate changes
				if arrfinal == 1 {
					final = true
				} else {
					final = false
				}

				// While the states in foma start with 0, the states in the
				// Mizobuchi FSA start with 1 - so we increase every state by 1.

				if arrin != arrout && arrin != EPSILON && tok.sigma_rev[arrin] != '\n' {
					panic("Problem: " + strconv.Itoa(arrstate) + " -> " + strconv.Itoa(arrtarget) + " (" + strconv.Itoa(arrin) + ":" + strconv.Itoa(arrout) + ") ")
				}

				// TODO:
				// if arrin == EPSILON && arrout == TOKENEND, mark state as newline
				// if the next transition is the same, remove TOKENEND and add SENTENCEEND
				// This requires removing the transition altogether and marking the state instead.

				// TODO:
				// if arrout == EPSILON, mark the transition as NOTOKEN

				targetObj := &edge{
					in:     arrin,
					out:    arrout,
					target: arrtarget + 1,
				}

				// Initialize outgoing state
				if tok.transitions[arrstate+1] == nil {
					tok.transitions[arrstate+1] = make(map[int]*edge)
				}

				tok.transitions[arrstate+1][arrin] = targetObj

				if final {
					tok.transitions[arrstate+1][FINAL] = &edge{}
				}

				continue
			}
		case SIGMA:
			{
				elem = strings.SplitN(line[0:len(line)-1], " ", 2)

				// Turn string into sigma id
				number, err := strconv.Atoi(elem[0])

				if err != nil {
					panic(err)
				}

				tok.sigmacount = number

				var symbol rune

				// Read rune
				if utf8.RuneCountInString(elem[1]) == 1 {
					symbol = []rune(elem[1])[0]

					// Probably a MCS
				} else if utf8.RuneCountInString(elem[1]) > 1 {
					switch elem[1] {
					case "@_EPSILON_SYMBOL_@":
						{
							EPSILON = number
							continue
						}
					case "@_UNKNOWN_SYMBOL_@":
						{
							UNKNOWN = number
							continue
						}

					case "@_IDENTITY_SYMBOL_@":
						{
							IDENTITY = number
							continue
						}
					default:
						panic("MCS not supported: " + line)
					}

					// Probably a new line symbol
				} else {
					line, err = r.ReadString('\n')
					if err != nil {
						panic(err)
					}
					if len(line) != 1 {
						panic("MCS not supported: " + line)
					}
					symbol = rune('\n')
				}

				tok.sigma[symbol] = number
				tok.sigma_rev[number] = symbol
			}
		}
	}

	return tok
}

// Implementation of Mizobuchi et al (2000), p. 128
func (tok *Tokenizer) buildDA() *Tokenizer {

	mark := 0
	size := 0

	// Create a mapping from s to t
	table := make([]*mapping, tok.arccount+1)

	table[size] = &mapping{source: 1, target: 1}
	size++

	A := make([]int, 0, 256)

	for mark < size {
		s := table[mark].source // This is a state in Ms
		t := table[mark].target // This is a state in Mt
		mark++
		// fmt.Println("Increase mark", mark)
		// St := append(St, t)
		A = A[:0]
		tok.get_set(s, &A)

		// fmt.Println("Outgoing arcs from t", t, A)

		// tok.array[t].base = tok.x_check(A)
		tok.set_base(t, tok.x_check(A))

		for _, a := range A {

			if a != FINAL {
				s1 := tok.transitions[s][a].target // g(s, a)

				// fmt.Println("Found", s, "to", s1, "via", a)

				t1 := tok.get_base(t) + a
				tok.set_check(t1, t)

				r := in_table(s1, table, size)
				if r == 0 {
					// fmt.Println("Increase size", t1)
					table[size] = &mapping{source: s1, target: t1}
					size++
				} else {
					// fmt.Println("Rep is there", t1, r)
					tok.set_base(t1, -1*r)
					// tok.array[t1].base = -1 * r
				}
			} else {
				fmt.Println("I set a final")
				// t1 := tok.array[t].base + FINAL
				t1 := tok.get_base(t) + FINAL
				// tok.array[t1].check = t
				tok.set_check(t1, t)
			}
		}
	}

	// Following Mizobuchi et al (2000) the size of the
	// FSA should be stored in check(1).
	tok.set_check(1, tok.maxsize+1)
	tok.array = tok.array[:tok.maxsize+1]
	return tok
}

func (tok *Tokenizer) resize(l int) {
	if len(tok.array) < l {
		tok.array = append(tok.array, make([]int, l)...)
	}
}

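// base and check values are interleaved in a single int slice:
// base(p) is stored at array[2*p] and check(p) at array[2*p+1].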
func (tok *Tokenizer) set_base(p int, v int) {
	l := p*2 + 1
	tok.resize(l)
	if tok.maxsize < l {
		tok.maxsize = l
	}
	tok.array[p*2] = v
}

func (tok *Tokenizer) get_base(p int) int {
	if p*2 >= len(tok.array) {
		return 0
	}
	return tok.array[p*2]
}

func (tok *Tokenizer) set_check(p int, v int) {
	l := p*2 + 1
	// the check value is stored at index 2p+1, so the array
	// needs to be long enough to hold that index
	tok.resize(l + 1)
	if tok.maxsize < l {
		tok.maxsize = l
	}
	tok.array[(p*2)+1] = v
}

func (tok *Tokenizer) get_check(p int) int {
	if (p*2)+1 >= len(tok.array) {
		return 0
	}
	return tok.array[(p*2)+1]
}

// Check whether a mapping for s exists in the table
// and return its representative
func in_table(s int, table []*mapping, size int) int {
	for x := 0; x < size; x++ {
		if table[x].source == s {
			return table[x].target
		}
	}
	return 0
}

// Set alphabet A to the list of all symbols
// outgoing from s
func (tok *Tokenizer) get_set(s int, A *[]int) {
	for a := range tok.transitions[s] {
		*A = append(*A, a)
	}
}

// Based on Mizobuchi et al (2000), p. 124
// This iterates for every state through the complete double array
// structure until it finds a gap that fits all outgoing transitions
// of the state. This is extremely slow, but is only necessary in the
// construction phase of the tokenizer.
func (tok *Tokenizer) x_check(symbols []int) int {
	// see https://github.com/bramstein/datrie/blob/master/lib/trie.js
	base := 1

	// fmt.Println("Resize", len(tok.linarray), "<", ((base + FINAL + 1) * 2))

OVERLAP:
	tok.resize((base + FINAL) * 2)
	for _, a := range symbols {
		// if tok.array[base+a].check != 0 {
		if tok.get_check(base+a) != 0 {
			base++
			goto OVERLAP
		}
	}
	fmt.Println("Found a nice place at", base, "for", len(symbols))
	return base
}

// Based on Mizobuchi et al (2000), p. 129
func (tok *Tokenizer) match(input string) bool {
	t := 1 // Start position
	chars := []rune(input)
	i := 0
	for ; i < len(chars); i++ {
		a := tok.sigma[chars[i]]
		tu := t
		t = tok.get_base(tu) + a
		fmt.Println("Check", a, t, tok.get_check(1))
		if t > tok.get_check(1) {
			break
		} else if tok.get_check(t) != tu {
			break
		} else if tok.get_base(t) < 0 {
			t = -1 * tok.get_base(t)
			// fmt.Println("Match is representative!")
		} else {
			// fmt.Println("Match is fine!")
		}
	}

	if i == len(chars) {
		fmt.Println("At the end")
	} else {
		return false
	}

	// fmt.Println("Hmm...", tok.get_check(tok.get_base(t)+FINAL), "-", t)

	if tok.get_check(tok.get_base(t)+FINAL) == t {
		fmt.Println("FINE")
		return true
	}
	return false
}
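
// A minimal usage sketch (the file name and the input string are
// hypothetical); buildDA has to be called before match can be used:
//
//	tok := parse_file("tokenizer.fst.gz")
//	tok.buildDA()
//	fmt.Println(tok.match("wald"))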

// In the final realization, the states can only have 30 bits:
// base[1] -> is final
// base[2] -> is_separate
// check[1] -> translates to epsilon
// check[2] -> appends newline (Maybe)
// If check[1] && check[2] is set, this translates to a sentence split (Maybe)
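
// The following is a minimal sketch of the flag encoding described
// above, assuming 32 bit cells in which the two highest bits carry
// the flags and the lower 30 bits carry the value. The names
// (firstBit, secondBit, valueMask, pack30, unpack30) are
// hypothetical and not used anywhere else in this package.

const (
	firstBit  uint32 = 1 << 31
	secondBit uint32 = 1 << 30
	valueMask        = ^(firstBit | secondBit) // lower 30 bits
)

// pack30 stores v in the lower 30 bits of a cell and sets the two flag bits.
func pack30(v uint32, flag1 bool, flag2 bool) uint32 {
	cell := v & valueMask
	if flag1 {
		cell |= firstBit
	}
	if flag2 {
		cell |= secondBit
	}
	return cell
}

// unpack30 returns the 30 bit value and the two flags of a cell.
func unpack30(cell uint32) (uint32, bool, bool) {
	return cell & valueMask, cell&firstBit != 0, cell&secondBit != 0
}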