blob: cf3f9c44230262af4ddecea17627c02001c9c226 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
9import (
10 "bufio"
11 "compress/gzip"
12 "fmt"
13 "io"
14 "os"
15 "strconv"
16 "strings"
17 "unicode/utf8"
18)
19
20const (
Akron75ebe7f2021-08-03 10:34:10 +020021 PROPS = 1
22 SIGMA = 2
23 STATES = 3
24 NONE = 4
25 NEWLINE = '\u000a'
Akron8ef408b2021-08-02 22:11:04 +020026)
27
28// Special symbols in sigma
29var EPSILON, UNKNOWN, IDENTITY, FINAL int
30
31type mapping struct {
32 source int
33 target int
34}
35
36type edge struct {
37 in int
38 out int
39 target int
40}
41
42type Tokenizer struct {
43 sigma map[rune]int
44 sigma_rev map[int]rune
45 arccount int
46 statecount int
47 sigmacount int
48 maxsize int
49 array []int
50 transitions []map[int]*edge
51}
52
53func parse_file(file string) *Tokenizer {
54 f, err := os.Open(file)
55 if err != nil {
56 panic(err)
57 }
58 defer f.Close()
59
60 gz, err := gzip.NewReader(f)
61 if err != nil {
62 panic(err)
63 }
64 defer gz.Close()
65
66 return parse(gz)
67}
68
69func parse(ior io.Reader) *Tokenizer {
70 r := bufio.NewReader(ior)
71
72 tok := &Tokenizer{
73 sigma: make(map[rune]int),
74 sigma_rev: make(map[int]rune),
75 }
76
77 final := false
78
79 var arrstate, arrin, arrout, arrtarget, arrfinal int
80
81 mode := 0
82 var elem []string
83 var elemint [5]int
84
85 for {
86 line, err := r.ReadString('\n')
87 if err != nil {
88 if err == io.EOF {
89 break
90 }
91 panic(err)
92 }
93 if strings.HasPrefix(line, "##foma-net") {
94 continue
95 }
96 if strings.HasPrefix(line, "##props##") {
97 mode = PROPS
98 continue
99 }
100 if strings.HasPrefix(line, "##states##") {
101 mode = STATES
102
103 // Adds a final transition symbol to sigma
104 // written as '#' in Mizobuchi et al (2000)
105 tok.sigmacount++
106 FINAL = tok.sigmacount
107 continue
108 }
109 if strings.HasPrefix(line, "##sigma##") {
110 mode = SIGMA
111 continue
112 }
113 if strings.HasPrefix(line, "##end##") {
114 mode = NONE
115 continue
116 }
117
118 switch mode {
119 case PROPS:
120 {
121 elem = strings.Split(line, " ")
122 /*
123 fmt.Println("arity: " + elem[0])
124 fmt.Println("arccount: " + elem[1])
125 fmt.Println("statecount: " + elem[2])
126 fmt.Println("linecount: " + elem[3])
127 fmt.Println("finalcount: " + elem[4])
128 fmt.Println("pathcount: " + elem[5])
129 fmt.Println("is_deterministic: " + elem[6])
130 fmt.Println("is_pruned: " + elem[7])
131 fmt.Println("is_minimized: " + elem[8])
132 fmt.Println("is_epsilon_free: " + elem[9])
133 fmt.Println("is_loop_free: " + elem[10])
134 fmt.Println("extras: " + elem[11])
135 fmt.Println("name: " + elem[12])
136 */
137 if elem[6] != "1" {
138 panic("The FST needs to be deterministic")
139 }
140 if elem[9] != "1" {
141 panic("The FST needs to be epsilon free")
142 }
143
144 elemint[0], err = strconv.Atoi(elem[1])
145 if err != nil {
146 panic("Can't read arccount")
147 }
148 tok.arccount = elemint[0]
149
150 // States start at 1 in Mizobuchi et al (2000),
151 // as the state 0 is associated with a fail.
152 // Initialize states and transitions
153 elemint[0], err = strconv.Atoi(elem[2])
154 if err != nil {
155 panic("Can't read statecount")
156 }
157 tok.statecount = elemint[0]
158 tok.transitions = make([]map[int]*edge, elemint[0]+1)
159 continue
160 }
161 case STATES:
162 {
163 elem = strings.Split(line[0:len(line)-1], " ")
164 if elem[0] == "-1" {
165 continue
166 }
167 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200168 if err != nil {
169 break
170 }
Akron8ef408b2021-08-02 22:11:04 +0200171
172 if len(elem) > 1 {
173 elemint[1], err = strconv.Atoi(elem[1])
174 if err != nil {
175 break
176 }
177 if len(elem) > 2 {
178 elemint[2], err = strconv.Atoi(elem[2])
179 if err != nil {
180 break
181 }
182 if len(elem) > 3 {
183 elemint[3], err = strconv.Atoi(elem[3])
184 if err != nil {
185 break
186 }
187 if len(elem) > 4 {
188 elemint[4], err = strconv.Atoi(elem[4])
189 if err != nil {
190 break
191 }
192 }
193 }
194 }
195 }
196
197 switch len(elem) {
198 case 5:
199 {
200 arrstate = elemint[0]
201 arrin = elemint[1]
202 arrout = elemint[2]
203 arrtarget = elemint[3]
204 arrfinal = elemint[4]
205 }
206 case 4:
207 {
208 if elemint[1] == -1 {
209 arrstate = elemint[0]
210 arrfinal = elemint[3]
211 } else {
212 arrstate = elemint[0]
213 arrin = elemint[1]
214 arrtarget = elemint[2]
215 arrfinal = elemint[3]
216 arrout = arrin
217 }
218 }
219 case 3:
220 {
221 arrin = elemint[0]
222 arrout = elemint[1]
223 arrtarget = elemint[2]
224 }
225 case 2:
226 {
227 arrin = elemint[0]
Akron75ebe7f2021-08-03 10:34:10 +0200228 arrtarget = elemint[1]
Akron8ef408b2021-08-02 22:11:04 +0200229 arrout = arrin
230 }
231 }
232
233 // This collects all edges until arrstate changes
234 if arrfinal == 1 {
235 final = true
236 } else {
237 final = false
238 }
239
240 // While the states in foma start with 0, the states in the
241 // Mizobuchi FSA start with one - so we increase every state by 1.
242
Akron75ebe7f2021-08-03 10:34:10 +0200243 /*
244 if arrin != arrout && arrin != EPSILON && tok.sigma_rev[arrin] != '\n' {
245 panic("Problem: " + strconv.Itoa(arrstate) + " -> " + strconv.Itoa(arrtarget) + " (" + strconv.Itoa(arrin) + ":" + strconv.Itoa(arrout) + ") ")
246 }
247 */
248 if arrin != arrout {
249 if arrin == EPSILON && tok.sigma_rev[arrout] == NEWLINE {
250 } else if arrin != EPSILON && arrout == EPSILON {
251 } else {
252 panic(
253 "Problem: " +
254 strconv.Itoa(arrstate) +
255 " -> " + strconv.Itoa(arrtarget) +
256 " (" +
257 strconv.Itoa(arrin) +
258 ":" +
259 strconv.Itoa(arrout) +
260 ") (" +
261 string(tok.sigma_rev[arrin]) +
262 ":" +
263 string(tok.sigma_rev[arrout]) +
264 ")")
265 }
Akron8ef408b2021-08-02 22:11:04 +0200266 }
267
268 // TODO:
269 // if arrin == EPSILON && arrout == TOKENEND, mark state as newline
270 // if the next transition is the same, remove TOKENEND and add SENTENCEEND
271 // This requires to remove the transition alltogether and marks the state instead.
272
273 // TODO:
274 // if arrout == EPSILON, mark the transition as NOTOKEN
275
276 targetObj := &edge{
277 in: arrin,
278 out: arrout,
279 target: arrtarget + 1,
280 }
281
282 // Initialize outgoing state
283 if tok.transitions[arrstate+1] == nil {
284 tok.transitions[arrstate+1] = make(map[int]*edge)
285 }
286
Akron75ebe7f2021-08-03 10:34:10 +0200287 if arrin >= 0 {
288 tok.transitions[arrstate+1][arrin] = targetObj
289 }
Akron8ef408b2021-08-02 22:11:04 +0200290
291 if final {
292 tok.transitions[arrstate+1][FINAL] = &edge{}
293 }
294
Akron75ebe7f2021-08-03 10:34:10 +0200295 fmt.Println("Add", arrstate+1, "->", arrtarget+1, "(", string(tok.sigma_rev[arrin]), ":", string(tok.sigma_rev[arrout]), ")")
296
Akron8ef408b2021-08-02 22:11:04 +0200297 continue
298 }
299 case SIGMA:
300 {
301 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
302
303 // Turn string into sigma id
304 number, err := strconv.Atoi(elem[0])
305
306 if err != nil {
307 panic(err)
308 }
309
310 tok.sigmacount = number
311
312 var symbol rune
313
314 // Read rune
315 if utf8.RuneCountInString(elem[1]) == 1 {
316 symbol = []rune(elem[1])[0]
317
318 // Probably a MCS
319 } else if utf8.RuneCountInString(elem[1]) > 1 {
320 switch elem[1] {
321 case "@_EPSILON_SYMBOL_@":
322 {
323 EPSILON = number
324 continue
325 }
326 case "@_UNKNOWN_SYMBOL_@":
327 {
328 UNKNOWN = number
329 continue
330 }
331
332 case "@_IDENTITY_SYMBOL_@":
333 {
334 IDENTITY = number
335 continue
336 }
337 default:
338 panic("MCS not supported: " + line)
339 }
340
341 // Probably a new line symbol
342 } else {
343 line, err = r.ReadString('\n')
344 if err != nil {
345 panic(err)
346 }
347 if len(line) != 1 {
348 panic("MCS not supported:" + line)
349 }
350 symbol = rune('\n')
351 }
352
353 tok.sigma[symbol] = number
354 tok.sigma_rev[number] = symbol
355 }
356 }
357 }
358
359 return tok
360}
361
362// Implementation of Mizobuchi et al (2000), p.128
363func (tok *Tokenizer) buildDA() *Tokenizer {
364
365 mark := 0
366 size := 0
367
368 // Create a mapping from s to t
369 table := make([]*mapping, tok.arccount+1)
370
371 table[size] = &mapping{source: 1, target: 1}
372 size++
373
374 A := make([]int, 0, 256)
375
376 for mark < size {
377 s := table[mark].source // This is a state in Ms
378 t := table[mark].target // This is a state in Mt
379 mark++
380 // fmt.Println("Increase mark", mark)
381 // St := append(St, t)
382 A = A[:0]
383 tok.get_set(s, &A)
384
385 // fmt.Println("Outgoing arcs from t", t, A)
386
387 // tok.array[t].base = tok.x_check(A)
388 tok.set_base(t, tok.x_check(A))
389
390 for _, a := range A {
391
392 if a != FINAL {
393 s1 := tok.transitions[s][a].target // g(s, a)
394
395 // fmt.Println("Found", s, "to", s1, "via", a)
396
397 t1 := tok.get_base(t) + a
398 tok.set_check(t1, t)
399
400 r := in_table(s1, table, size)
401 if r == 0 {
402 // fmt.Println("Increase size", t1)
403 table[size] = &mapping{source: s1, target: t1}
404 size++
405 } else {
406 //fmt.Println("Rep is there", t1, r)
407 tok.set_base(t1, -1*r)
408 // tok.array[t1].base = -1 * r
409 }
410 } else {
411 fmt.Println("I set a final")
412 // t1 := tok.array[t].base + FINAL
413 t1 := tok.get_base(t) + FINAL
414 // tok.array[t1].check = t
415 tok.set_check(t1, t)
416 }
417 }
418 }
419
420 // Following Mizobuchi et al (2000) the size of the
421 // FSA should be stored in check(1).
422 tok.set_check(1, tok.maxsize+1)
423 tok.array = tok.array[:tok.maxsize+1]
424 return tok
425}
426
427func (tok *Tokenizer) resize(l int) {
Akron75ebe7f2021-08-03 10:34:10 +0200428 if len(tok.array) <= l {
Akron8ef408b2021-08-02 22:11:04 +0200429 tok.array = append(tok.array, make([]int, l)...)
430 }
431}
432
433func (tok *Tokenizer) set_base(p int, v int) {
434 l := p*2 + 1
435 tok.resize(l)
436 if tok.maxsize < l {
437 tok.maxsize = l
438 }
439 tok.array[p*2] = v
440}
441
442func (tok *Tokenizer) get_base(p int) int {
443 if p*2 >= len(tok.array) {
444 return 0
445 }
446 return tok.array[p*2]
447}
448
449func (tok *Tokenizer) set_check(p int, v int) {
450 l := p*2 + 1
451 tok.resize(l)
452 if tok.maxsize < l {
453 tok.maxsize = l
454 }
455 tok.array[(p*2)+1] = v
456}
457
458func (tok *Tokenizer) get_check(p int) int {
459 if (p*2)+1 >= len(tok.array) {
460 return 0
461 }
462 return tok.array[(p*2)+1]
463}
464
465// Check the table if a mapping of s
466// exists and return this as a representative
467func in_table(s int, table []*mapping, size int) int {
468 for x := 0; x < size; x++ {
469 if table[x].source == s {
470 return table[x].target
471 }
472 }
473 return 0
474}
475
476// Set alphabet A to the list of all symbols
477// outgoing from s
478func (tok *Tokenizer) get_set(s int, A *[]int) {
Akron75ebe7f2021-08-03 10:34:10 +0200479 for a := range tok.transitions[s] {
Akron8ef408b2021-08-02 22:11:04 +0200480 *A = append(*A, a)
481 }
482}
483
484// Based on Mizobuchi et al (2000), p. 124
485// This iterates for every state through the complete double array
486// structure until it finds a gap that fits all outgoing transitions
487// of the state. This is extremely slow, but is only necessary in the
488// construction phase of the tokenizer.
489func (tok *Tokenizer) x_check(symbols []int) int {
490 // see https://github.com/bramstein/datrie/blob/master/lib/trie.js
491 base := 1
492
493 // fmt.Println("Resize", len(tok.linarray), "<", ((base + FINAL + 1) * 2))
494
495OVERLAP:
496 tok.resize((base + FINAL) * 2)
497 for _, a := range symbols {
498 // if tok.array[base+a].check != 0 {
499 if tok.get_check(base+a) != 0 {
500 base++
501 goto OVERLAP
502 }
503 }
Akron75ebe7f2021-08-03 10:34:10 +0200504 // fmt.Println("Found a nice place at", base, "for", len(symbols))
Akron8ef408b2021-08-02 22:11:04 +0200505 return base
506}
507
508// Based on Mizobuchi et al (2000), p. 129
509func (tok *Tokenizer) match(input string) bool {
510 t := 1 // Start position
511 chars := []rune(input)
512 i := 0
Akron75ebe7f2021-08-03 10:34:10 +0200513 fmt.Println("Length of string is", len(chars))
Akron8ef408b2021-08-02 22:11:04 +0200514 for ; i < len(chars); i++ {
515 a := tok.sigma[chars[i]]
516 tu := t
517 t = tok.get_base(tu) + a
Akron75ebe7f2021-08-03 10:34:10 +0200518 fmt.Println("Check", string(tok.sigma_rev[a]), ":", t)
Akron8ef408b2021-08-02 22:11:04 +0200519 if t > tok.get_check(1) {
Akron75ebe7f2021-08-03 10:34:10 +0200520 fmt.Println("Out of array")
Akron8ef408b2021-08-02 22:11:04 +0200521 break
522 } else if tok.get_check(t) != tu {
Akron75ebe7f2021-08-03 10:34:10 +0200523 fmt.Println("Match is not fine!", t, "and", tok.get_check(t), "vs", tu)
Akron8ef408b2021-08-02 22:11:04 +0200524 break
525 } else if tok.get_base(t) < 0 {
526 t = -1 * tok.get_base(t)
Akron75ebe7f2021-08-03 10:34:10 +0200527 // } else {
Akron8ef408b2021-08-02 22:11:04 +0200528 }
529 }
530
531 if i == len(chars) {
532 fmt.Println("At the end")
533 } else {
Akron75ebe7f2021-08-03 10:34:10 +0200534 fmt.Println("Not at the end")
Akron8ef408b2021-08-02 22:11:04 +0200535 return false
536 }
537
538 // fmt.Println("Hmm...", tok.get_check(tok.get_base(t)+FINAL), "-", t)
539
540 if tok.get_check(tok.get_base(t)+FINAL) == t {
Akron8ef408b2021-08-02 22:11:04 +0200541 return true
542 }
543 return false
544}
545
546// In the final realization, the states can only have 30 bits:
547// base[1] -> is final
548// base[2] -> is_separate
549// check[1] -> translates to epsilon
550// check[2] -> appends newine (Maybe)
551// If check[1] && check[2] is set, this translates to a sentence split (Maybe)