Fix foma format parser

In the two-field arc lines of the foma states section, the target
state is the second field, not the third. Also bail out of a state
line when a field is not numeric, restrict asymmetric arcs to
epsilon-to-newline and anything-to-epsilon transitions, skip arcs
with negative input symbols, fix an off-by-one in resize(), and
load the test FSTs from testdata/ instead of an inline foma dump.
diff --git a/datokenizer.go b/datokenizer.go
index 8a68753..cf3f9c4 100644
--- a/datokenizer.go
+++ b/datokenizer.go
@@ -18,10 +18,11 @@
)
const (
- PROPS = 1
- SIGMA = 2
- STATES = 3
- NONE = 4
+ PROPS = 1
+ SIGMA = 2
+ STATES = 3
+ NONE = 4
+ NEWLINE = '\u000a' // i.e. '\n'
)
// Special symbols in sigma
@@ -164,6 +165,9 @@
continue
}
elemint[0], err = strconv.Atoi(elem[0])
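+ // Bail out if the field is not numeric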
+ if err != nil {
+ break
+ }
if len(elem) > 1 {
elemint[1], err = strconv.Atoi(elem[1])
@@ -221,7 +225,7 @@
case 2:
{
arrin = elemint[0]
- arrtarget = elemint[2]
+ arrtarget = elemint[1]
arrout = arrin
}
}
@@ -236,8 +240,29 @@
// While the states in foma start with 0, the states in the
// Mizobuchi FSA start with one - so we increase every state by 1.
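// For example, the foma arc 0 -> 6 is stored as 1 -> 7 here.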
- if arrin != arrout && arrin != EPSILON && tok.sigma_rev[arrin] != '\n' {
- panic("Problem: " + strconv.Itoa(arrstate) + " -> " + strconv.Itoa(arrtarget) + " (" + strconv.Itoa(arrin) + ":" + strconv.Itoa(arrout) + ") ")
+ /*
+ if arrin != arrout && arrin != EPSILON && tok.sigma_rev[arrin] != '\n' {
+ panic("Problem: " + strconv.Itoa(arrstate) + " -> " + strconv.Itoa(arrtarget) + " (" + strconv.Itoa(arrin) + ":" + strconv.Itoa(arrout) + ") ")
+ }
+ */
+ if arrin != arrout {
+ if arrin == EPSILON && tok.sigma_rev[arrout] == NEWLINE {
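+ // Allowed: an epsilon arc that emits a newline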
+ } else if arrin != EPSILON && arrout == EPSILON {
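+ // Allowed: an arc whose output is epsilon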
+ } else {
+ panic(
+ "Problem: " +
+ strconv.Itoa(arrstate) +
+ " -> " + strconv.Itoa(arrtarget) +
+ " (" +
+ strconv.Itoa(arrin) +
+ ":" +
+ strconv.Itoa(arrout) +
+ ") (" +
+ string(tok.sigma_rev[arrin]) +
+ ":" +
+ string(tok.sigma_rev[arrout]) +
+ ")")
+ }
}
// TODO:
@@ -259,12 +284,16 @@
tok.transitions[arrstate+1] = make(map[int]*edge)
}
- tok.transitions[arrstate+1][arrin] = targetObj
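+ // Skip pseudo arcs with negative input symbols,
+ // e.g. the "-1 -1" fields of final state lines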
+ if arrin >= 0 {
+ tok.transitions[arrstate+1][arrin] = targetObj
+ }
if final {
tok.transitions[arrstate+1][FINAL] = &edge{}
}
+ fmt.Println("Add", arrstate+1, "->", arrtarget+1, "(", string(tok.sigma_rev[arrin]), ":", string(tok.sigma_rev[arrout]), ")")
+
continue
}
case SIGMA:
@@ -396,7 +425,7 @@
}
func (tok *Tokenizer) resize(l int) {
- if len(tok.array) < l {
+ if len(tok.array) <= l { // <=, so that index l stays addressable
tok.array = append(tok.array, make([]int, l)...)
}
}
@@ -447,7 +476,7 @@
// Set alphabet A to the list of all symbols
// outgoing from s
func (tok *Tokenizer) get_set(s int, A *[]int) {
- for a, _ := range tok.transitions[s] {
+ for a := range tok.transitions[s] {
*A = append(*A, a)
}
}
@@ -472,7 +501,7 @@
goto OVERLAP
}
}
- fmt.Println("Found a nice place at", base, "for", len(symbols))
+ // fmt.Println("Found a nice place at", base, "for", len(symbols))
return base
}
@@ -481,33 +510,34 @@
t := 1 // Start position
chars := []rune(input)
i := 0
+ fmt.Println("Length of string is", len(chars))
for ; i < len(chars); i++ {
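// The next state in the double array is base(t) plus the symbol's code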
a := tok.sigma[chars[i]]
tu := t
t = tok.get_base(tu) + a
- fmt.Println("Check", a, t, tok.get_check(1))
+ fmt.Println("Check", string(tok.sigma_rev[a]), ":", t)
if t > tok.get_check(1) {
+ fmt.Println("Out of array")
break
} else if tok.get_check(t) != tu {
+ fmt.Println("Match is not fine!", t, "and", tok.get_check(t), "vs", tu)
break
} else if tok.get_base(t) < 0 {
t = -1 * tok.get_base(t)
- // fmt.Println("Match is representative!")
- } else {
- // fmt.Println("Match is fine!")
+ // } else {
}
}
if i == len(chars) {
fmt.Println("At the end")
} else {
+ fmt.Println("Not at the end")
return false
}
// fmt.Println("Hmm...", tok.get_check(tok.get_base(t)+FINAL), "-", t)
if tok.get_check(tok.get_base(t)+FINAL) == t {
- fmt.Println("FINE")
return true
}
return false
diff --git a/datokenizer_test.go b/datokenizer_test.go
index 8945f91..f714e10 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -1,7 +1,6 @@
package datokenizer
import (
- "strings"
"testing"
"github.com/stretchr/testify/assert"
@@ -11,30 +10,23 @@
assert := assert.New(t)
// bau | bauamt
- r := strings.NewReader(`##foma-net 1.0##
-##props##
-1 6 7 8 2 2 1 1 1 1 1 2 5B57D486
-##sigma##
-0 @_EPSILON_SYMBOL_@
-3 a
-4 b
-5 m
-6 t
-7 u
-##states##
-0 4 1 0
-1 3 2 0
-2 7 3 0
-3 3 4 1
-4 5 5 0
-5 6 6 0
-6 -1 -1 1
--1 -1 -1 -1 -1
-##end##`)
-
- tok := parse(r) // ("tokenizer.fst")
+ tok := parse_file("testdata/bauamt.fst")
tok.buildDA()
assert.True(tok.match("bau"))
assert.True(tok.match("bauamt"))
assert.False(tok.match("baum"))
}
+
+func TestSimpleBranches(t *testing.T) {
+ assert := assert.New(t)
+
+ // (bau | wahl) (amt | en)
+ tok := parse_file("testdata/wahlamt.fst")
+ tok.buildDA()
+ assert.False(tok.match("bau"))
+ assert.True(tok.match("bauamt"))
+ assert.True(tok.match("wahlamt"))
+ assert.True(tok.match("bauen"))
+ assert.True(tok.match("wahlen"))
+ assert.False(tok.match("baum"))
+}
diff --git a/testdata/bauamt.fst b/testdata/bauamt.fst
new file mode 100644
index 0000000..0e572c3
--- /dev/null
+++ b/testdata/bauamt.fst
Binary files differ
diff --git a/testdata/wahlamt.fst b/testdata/wahlamt.fst
new file mode 100644
index 0000000..b89fe54
--- /dev/null
+++ b/testdata/wahlamt.fst
Binary files differ