blob: fe57dde7686644798f448422a28b54fc78fb8226 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3/**
4 * The file reader is basically a port of foma2js,
5 * licensed under the Apache License, version 2,
6 * and written by Mans Hulden.
7 */
8
9import (
10 "bufio"
11 "compress/gzip"
12 "fmt"
13 "io"
14 "os"
15 "strconv"
16 "strings"
17 "unicode/utf8"
18)
19
20const (
Akron75ebe7f2021-08-03 10:34:10 +020021 PROPS = 1
22 SIGMA = 2
23 STATES = 3
24 NONE = 4
25 NEWLINE = '\u000a'
Akron8ef408b2021-08-02 22:11:04 +020026)
27
28// Special symbols in sigma
Akron730a79c2021-08-03 11:05:29 +020029var EPSILON = -1
30var UNKNOWN = -1
31var IDENTITY = -1
32var FINAL = -1
Akron8ef408b2021-08-02 22:11:04 +020033
34type mapping struct {
35 source int
36 target int
37}
38
39type edge struct {
40 in int
41 out int
42 target int
43}
44
45type Tokenizer struct {
46 sigma map[rune]int
47 sigma_rev map[int]rune
48 arccount int
49 statecount int
50 sigmacount int
51 maxsize int
52 array []int
53 transitions []map[int]*edge
54}
55
56func parse_file(file string) *Tokenizer {
57 f, err := os.Open(file)
58 if err != nil {
59 panic(err)
60 }
61 defer f.Close()
62
63 gz, err := gzip.NewReader(f)
64 if err != nil {
65 panic(err)
66 }
67 defer gz.Close()
68
69 return parse(gz)
70}
71
72func parse(ior io.Reader) *Tokenizer {
73 r := bufio.NewReader(ior)
74
75 tok := &Tokenizer{
76 sigma: make(map[rune]int),
77 sigma_rev: make(map[int]rune),
78 }
79
80 final := false
81
82 var arrstate, arrin, arrout, arrtarget, arrfinal int
83
84 mode := 0
85 var elem []string
86 var elemint [5]int
87
88 for {
89 line, err := r.ReadString('\n')
90 if err != nil {
91 if err == io.EOF {
92 break
93 }
94 panic(err)
95 }
96 if strings.HasPrefix(line, "##foma-net") {
97 continue
98 }
99 if strings.HasPrefix(line, "##props##") {
100 mode = PROPS
101 continue
102 }
103 if strings.HasPrefix(line, "##states##") {
104 mode = STATES
105
106 // Adds a final transition symbol to sigma
107 // written as '#' in Mizobuchi et al (2000)
108 tok.sigmacount++
109 FINAL = tok.sigmacount
110 continue
111 }
112 if strings.HasPrefix(line, "##sigma##") {
113 mode = SIGMA
114 continue
115 }
116 if strings.HasPrefix(line, "##end##") {
117 mode = NONE
118 continue
119 }
120
121 switch mode {
122 case PROPS:
123 {
124 elem = strings.Split(line, " ")
125 /*
126 fmt.Println("arity: " + elem[0])
127 fmt.Println("arccount: " + elem[1])
128 fmt.Println("statecount: " + elem[2])
129 fmt.Println("linecount: " + elem[3])
130 fmt.Println("finalcount: " + elem[4])
131 fmt.Println("pathcount: " + elem[5])
132 fmt.Println("is_deterministic: " + elem[6])
133 fmt.Println("is_pruned: " + elem[7])
134 fmt.Println("is_minimized: " + elem[8])
135 fmt.Println("is_epsilon_free: " + elem[9])
136 fmt.Println("is_loop_free: " + elem[10])
137 fmt.Println("extras: " + elem[11])
138 fmt.Println("name: " + elem[12])
139 */
140 if elem[6] != "1" {
141 panic("The FST needs to be deterministic")
142 }
143 if elem[9] != "1" {
144 panic("The FST needs to be epsilon free")
145 }
146
147 elemint[0], err = strconv.Atoi(elem[1])
148 if err != nil {
149 panic("Can't read arccount")
150 }
151 tok.arccount = elemint[0]
152
153 // States start at 1 in Mizobuchi et al (2000),
154 // as the state 0 is associated with a fail.
155 // Initialize states and transitions
156 elemint[0], err = strconv.Atoi(elem[2])
157 if err != nil {
158 panic("Can't read statecount")
159 }
160 tok.statecount = elemint[0]
161 tok.transitions = make([]map[int]*edge, elemint[0]+1)
162 continue
163 }
164 case STATES:
165 {
166 elem = strings.Split(line[0:len(line)-1], " ")
167 if elem[0] == "-1" {
168 continue
169 }
170 elemint[0], err = strconv.Atoi(elem[0])
Akron75ebe7f2021-08-03 10:34:10 +0200171 if err != nil {
172 break
173 }
Akron8ef408b2021-08-02 22:11:04 +0200174
175 if len(elem) > 1 {
176 elemint[1], err = strconv.Atoi(elem[1])
177 if err != nil {
178 break
179 }
180 if len(elem) > 2 {
181 elemint[2], err = strconv.Atoi(elem[2])
182 if err != nil {
183 break
184 }
185 if len(elem) > 3 {
186 elemint[3], err = strconv.Atoi(elem[3])
187 if err != nil {
188 break
189 }
190 if len(elem) > 4 {
191 elemint[4], err = strconv.Atoi(elem[4])
192 if err != nil {
193 break
194 }
195 }
196 }
197 }
198 }
199
200 switch len(elem) {
201 case 5:
202 {
203 arrstate = elemint[0]
204 arrin = elemint[1]
205 arrout = elemint[2]
206 arrtarget = elemint[3]
207 arrfinal = elemint[4]
208 }
209 case 4:
210 {
211 if elemint[1] == -1 {
212 arrstate = elemint[0]
213 arrfinal = elemint[3]
214 } else {
215 arrstate = elemint[0]
216 arrin = elemint[1]
217 arrtarget = elemint[2]
218 arrfinal = elemint[3]
219 arrout = arrin
220 }
221 }
222 case 3:
223 {
224 arrin = elemint[0]
225 arrout = elemint[1]
226 arrtarget = elemint[2]
227 }
228 case 2:
229 {
230 arrin = elemint[0]
Akron75ebe7f2021-08-03 10:34:10 +0200231 arrtarget = elemint[1]
Akron8ef408b2021-08-02 22:11:04 +0200232 arrout = arrin
233 }
234 }
235
236 // This collects all edges until arrstate changes
237 if arrfinal == 1 {
238 final = true
239 } else {
240 final = false
241 }
242
243 // While the states in foma start with 0, the states in the
244 // Mizobuchi FSA start with one - so we increase every state by 1.
245
Akron75ebe7f2021-08-03 10:34:10 +0200246 /*
247 if arrin != arrout && arrin != EPSILON && tok.sigma_rev[arrin] != '\n' {
248 panic("Problem: " + strconv.Itoa(arrstate) + " -> " + strconv.Itoa(arrtarget) + " (" + strconv.Itoa(arrin) + ":" + strconv.Itoa(arrout) + ") ")
249 }
250 */
251 if arrin != arrout {
252 if arrin == EPSILON && tok.sigma_rev[arrout] == NEWLINE {
253 } else if arrin != EPSILON && arrout == EPSILON {
254 } else {
255 panic(
256 "Problem: " +
257 strconv.Itoa(arrstate) +
258 " -> " + strconv.Itoa(arrtarget) +
259 " (" +
260 strconv.Itoa(arrin) +
261 ":" +
262 strconv.Itoa(arrout) +
263 ") (" +
264 string(tok.sigma_rev[arrin]) +
265 ":" +
266 string(tok.sigma_rev[arrout]) +
267 ")")
268 }
Akron8ef408b2021-08-02 22:11:04 +0200269 }
270
271 // TODO:
272 // if arrin == EPSILON && arrout == TOKENEND, mark state as newline
273 // if the next transition is the same, remove TOKENEND and add SENTENCEEND
274 // This requires to remove the transition alltogether and marks the state instead.
275
276 // TODO:
277 // if arrout == EPSILON, mark the transition as NOTOKEN
278
279 targetObj := &edge{
280 in: arrin,
281 out: arrout,
282 target: arrtarget + 1,
283 }
284
285 // Initialize outgoing state
286 if tok.transitions[arrstate+1] == nil {
287 tok.transitions[arrstate+1] = make(map[int]*edge)
288 }
289
Akron75ebe7f2021-08-03 10:34:10 +0200290 if arrin >= 0 {
291 tok.transitions[arrstate+1][arrin] = targetObj
292 }
Akron8ef408b2021-08-02 22:11:04 +0200293
294 if final {
295 tok.transitions[arrstate+1][FINAL] = &edge{}
296 }
297
Akron730a79c2021-08-03 11:05:29 +0200298 fmt.Println("Add",
299 arrstate+1, "->", arrtarget+1,
300 "(",
301 arrin,
302 ":",
303 arrout,
304 ") (",
305 string(tok.sigma_rev[arrin]),
306 ":",
307 string(tok.sigma_rev[arrout]),
308 ")")
Akron75ebe7f2021-08-03 10:34:10 +0200309
Akron8ef408b2021-08-02 22:11:04 +0200310 continue
311 }
312 case SIGMA:
313 {
314 elem = strings.SplitN(line[0:len(line)-1], " ", 2)
315
316 // Turn string into sigma id
317 number, err := strconv.Atoi(elem[0])
318
319 if err != nil {
320 panic(err)
321 }
322
323 tok.sigmacount = number
324
325 var symbol rune
326
327 // Read rune
328 if utf8.RuneCountInString(elem[1]) == 1 {
329 symbol = []rune(elem[1])[0]
330
331 // Probably a MCS
332 } else if utf8.RuneCountInString(elem[1]) > 1 {
333 switch elem[1] {
334 case "@_EPSILON_SYMBOL_@":
335 {
336 EPSILON = number
337 continue
338 }
339 case "@_UNKNOWN_SYMBOL_@":
340 {
341 UNKNOWN = number
342 continue
343 }
344
345 case "@_IDENTITY_SYMBOL_@":
346 {
347 IDENTITY = number
348 continue
349 }
350 default:
351 panic("MCS not supported: " + line)
352 }
353
354 // Probably a new line symbol
355 } else {
356 line, err = r.ReadString('\n')
357 if err != nil {
358 panic(err)
359 }
360 if len(line) != 1 {
361 panic("MCS not supported:" + line)
362 }
363 symbol = rune('\n')
364 }
365
366 tok.sigma[symbol] = number
367 tok.sigma_rev[number] = symbol
368 }
369 }
370 }
371
372 return tok
373}
374
375// Implementation of Mizobuchi et al (2000), p.128
376func (tok *Tokenizer) buildDA() *Tokenizer {
377
378 mark := 0
379 size := 0
380
381 // Create a mapping from s to t
382 table := make([]*mapping, tok.arccount+1)
383
384 table[size] = &mapping{source: 1, target: 1}
385 size++
386
387 A := make([]int, 0, 256)
388
389 for mark < size {
390 s := table[mark].source // This is a state in Ms
391 t := table[mark].target // This is a state in Mt
392 mark++
393 // fmt.Println("Increase mark", mark)
394 // St := append(St, t)
395 A = A[:0]
396 tok.get_set(s, &A)
397
398 // fmt.Println("Outgoing arcs from t", t, A)
399
400 // tok.array[t].base = tok.x_check(A)
401 tok.set_base(t, tok.x_check(A))
402
403 for _, a := range A {
404
405 if a != FINAL {
406 s1 := tok.transitions[s][a].target // g(s, a)
407
408 // fmt.Println("Found", s, "to", s1, "via", a)
409
410 t1 := tok.get_base(t) + a
411 tok.set_check(t1, t)
412
413 r := in_table(s1, table, size)
414 if r == 0 {
415 // fmt.Println("Increase size", t1)
416 table[size] = &mapping{source: s1, target: t1}
417 size++
418 } else {
419 //fmt.Println("Rep is there", t1, r)
420 tok.set_base(t1, -1*r)
421 // tok.array[t1].base = -1 * r
422 }
423 } else {
424 fmt.Println("I set a final")
425 // t1 := tok.array[t].base + FINAL
426 t1 := tok.get_base(t) + FINAL
427 // tok.array[t1].check = t
428 tok.set_check(t1, t)
429 }
430 }
431 }
432
433 // Following Mizobuchi et al (2000) the size of the
434 // FSA should be stored in check(1).
435 tok.set_check(1, tok.maxsize+1)
436 tok.array = tok.array[:tok.maxsize+1]
437 return tok
438}
439
440func (tok *Tokenizer) resize(l int) {
Akron75ebe7f2021-08-03 10:34:10 +0200441 if len(tok.array) <= l {
Akron8ef408b2021-08-02 22:11:04 +0200442 tok.array = append(tok.array, make([]int, l)...)
443 }
444}
445
446func (tok *Tokenizer) set_base(p int, v int) {
447 l := p*2 + 1
448 tok.resize(l)
449 if tok.maxsize < l {
450 tok.maxsize = l
451 }
452 tok.array[p*2] = v
453}
454
455func (tok *Tokenizer) get_base(p int) int {
456 if p*2 >= len(tok.array) {
457 return 0
458 }
459 return tok.array[p*2]
460}
461
462func (tok *Tokenizer) set_check(p int, v int) {
463 l := p*2 + 1
464 tok.resize(l)
465 if tok.maxsize < l {
466 tok.maxsize = l
467 }
468 tok.array[(p*2)+1] = v
469}
470
471func (tok *Tokenizer) get_check(p int) int {
472 if (p*2)+1 >= len(tok.array) {
473 return 0
474 }
475 return tok.array[(p*2)+1]
476}
477
478// Check the table if a mapping of s
479// exists and return this as a representative
480func in_table(s int, table []*mapping, size int) int {
481 for x := 0; x < size; x++ {
482 if table[x].source == s {
483 return table[x].target
484 }
485 }
486 return 0
487}
488
489// Set alphabet A to the list of all symbols
490// outgoing from s
491func (tok *Tokenizer) get_set(s int, A *[]int) {
Akron75ebe7f2021-08-03 10:34:10 +0200492 for a := range tok.transitions[s] {
Akron8ef408b2021-08-02 22:11:04 +0200493 *A = append(*A, a)
494 }
495}
496
497// Based on Mizobuchi et al (2000), p. 124
498// This iterates for every state through the complete double array
499// structure until it finds a gap that fits all outgoing transitions
500// of the state. This is extremely slow, but is only necessary in the
501// construction phase of the tokenizer.
502func (tok *Tokenizer) x_check(symbols []int) int {
503 // see https://github.com/bramstein/datrie/blob/master/lib/trie.js
504 base := 1
505
506 // fmt.Println("Resize", len(tok.linarray), "<", ((base + FINAL + 1) * 2))
507
508OVERLAP:
509 tok.resize((base + FINAL) * 2)
510 for _, a := range symbols {
511 // if tok.array[base+a].check != 0 {
512 if tok.get_check(base+a) != 0 {
513 base++
514 goto OVERLAP
515 }
516 }
Akron75ebe7f2021-08-03 10:34:10 +0200517 // fmt.Println("Found a nice place at", base, "for", len(symbols))
Akron8ef408b2021-08-02 22:11:04 +0200518 return base
519}
520
521// Based on Mizobuchi et al (2000), p. 129
522func (tok *Tokenizer) match(input string) bool {
523 t := 1 // Start position
524 chars := []rune(input)
525 i := 0
Akron465a0992021-08-03 11:28:48 +0200526 var a int
527 var tu int
528 var ok bool
529
Akron730a79c2021-08-03 11:05:29 +0200530 // fmt.Println("Length of string is", len(chars))
Akron8ef408b2021-08-02 22:11:04 +0200531 for ; i < len(chars); i++ {
Akron465a0992021-08-03 11:28:48 +0200532 a, ok = tok.sigma[chars[i]]
Akron730a79c2021-08-03 11:05:29 +0200533
534 // Support identity symbol if char not in sigma
535 if !ok && IDENTITY != -1 {
536 fmt.Println("IDENTITY symbol", chars[i], "->", IDENTITY)
537 a = IDENTITY
538 }
Akron465a0992021-08-03 11:28:48 +0200539 tu = t
Akron730a79c2021-08-03 11:05:29 +0200540 CHECK:
Akron8ef408b2021-08-02 22:11:04 +0200541 t = tok.get_base(tu) + a
Akron730a79c2021-08-03 11:05:29 +0200542 if t > tok.get_check(1) || tok.get_check(t) != tu {
Akron75ebe7f2021-08-03 10:34:10 +0200543 fmt.Println("Match is not fine!", t, "and", tok.get_check(t), "vs", tu)
Akron730a79c2021-08-03 11:05:29 +0200544
545 // Try again with unknown symbol, in case identity failed
Akron465a0992021-08-03 11:28:48 +0200546 if ok {
547 if a == IDENTITY {
548 fmt.Println("UNKNOWN symbol", chars[i], "->", UNKNOWN)
549 a = UNKNOWN
550 goto CHECK
551 } else if a == UNKNOWN {
552 fmt.Println("EPSILON symbol", chars[i], "->", EPSILON)
553 a = EPSILON
554 goto CHECK
555 }
Akron730a79c2021-08-03 11:05:29 +0200556 }
Akron8ef408b2021-08-02 22:11:04 +0200557 break
558 } else if tok.get_base(t) < 0 {
Akron730a79c2021-08-03 11:05:29 +0200559 // Move to representative state
Akron8ef408b2021-08-02 22:11:04 +0200560 t = -1 * tok.get_base(t)
Akron8ef408b2021-08-02 22:11:04 +0200561 }
562 }
563
564 if i == len(chars) {
565 fmt.Println("At the end")
566 } else {
Akron75ebe7f2021-08-03 10:34:10 +0200567 fmt.Println("Not at the end")
Akron8ef408b2021-08-02 22:11:04 +0200568 return false
569 }
570
Akron465a0992021-08-03 11:28:48 +0200571 // fmt.Println("Hmm...", tok.get_check(tok.get_base(t)+FINAL), "-", t)
Akron8ef408b2021-08-02 22:11:04 +0200572
Akron465a0992021-08-03 11:28:48 +0200573FINALCHECK:
Akron8ef408b2021-08-02 22:11:04 +0200574 if tok.get_check(tok.get_base(t)+FINAL) == t {
Akron8ef408b2021-08-02 22:11:04 +0200575 return true
576 }
Akron465a0992021-08-03 11:28:48 +0200577
578 // if a != EPSILON {
579 // EPSILONCHECK:
580 tu = t
581 a = EPSILON
582
583 t = tok.get_base(tu) + a
584 if t > tok.get_check(1) || tok.get_check(t) != tu {
585 fmt.Println("xMatch is not fine!", t, "and", tok.get_check(t), "vs", tu)
586 return false
587 } else if tok.get_base(t) < 0 {
588 // Move to representative state
589 t = -1 * tok.get_base(t)
590 goto FINALCHECK
591 }
592 goto FINALCHECK
593 // return true
594 /*
595 }
596
597 return false
598 */
Akron8ef408b2021-08-02 22:11:04 +0200599}
600
601// In the final realization, the states can only have 30 bits:
602// base[1] -> is final
603// base[2] -> is_separate
604// check[1] -> translates to epsilon
605// check[2] -> appends newine (Maybe)
606// If check[1] && check[2] is set, this translates to a sentence split (Maybe)