Fix foma format parser

In the two-field arc lines of the foma states section, the target
state is the second field, not the third. Also bail out of a state
line when a field is not numeric, restrict asymmetric arcs to
epsilon-to-newline and anything-to-epsilon transitions, skip arcs
with negative input symbols, fix an off-by-one in resize(), and
load the test FSTs from testdata/ instead of an inline foma dump.
diff --git a/datokenizer.go b/datokenizer.go
index 8a68753..cf3f9c4 100644
--- a/datokenizer.go
+++ b/datokenizer.go
@@ -18,10 +18,11 @@
)
const (
- PROPS = 1
- SIGMA = 2
- STATES = 3
- NONE = 4
+ PROPS = 1
+ SIGMA = 2
+ STATES = 3
+ NONE = 4
+ NEWLINE = '\u000a' // i.e. '\n'
)
// Special symbols in sigma
@@ -164,6 +165,9 @@
continue
}
elemint[0], err = strconv.Atoi(elem[0])
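+ // Bail out if the field is not numeric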
+ if err != nil {
+ break
+ }
if len(elem) > 1 {
elemint[1], err = strconv.Atoi(elem[1])
@@ -221,7 +225,7 @@
case 2:
{
arrin = elemint[0]
- arrtarget = elemint[2]
+ arrtarget = elemint[1]
arrout = arrin
}
}
@@ -236,8 +240,29 @@
// While the states in foma start with 0, the states in the
// Mizobuchi FSA start with one - so we increase every state by 1.
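// For example, the foma arc 0 -> 6 is stored as 1 -> 7 here.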
- if arrin != arrout && arrin != EPSILON && tok.sigma_rev[arrin] != '\n' {
- panic("Problem: " + strconv.Itoa(arrstate) + " -> " + strconv.Itoa(arrtarget) + " (" + strconv.Itoa(arrin) + ":" + strconv.Itoa(arrout) + ") ")
+ /*
+ if arrin != arrout && arrin != EPSILON && tok.sigma_rev[arrin] != '\n' {
+ panic("Problem: " + strconv.Itoa(arrstate) + " -> " + strconv.Itoa(arrtarget) + " (" + strconv.Itoa(arrin) + ":" + strconv.Itoa(arrout) + ") ")
+ }
+ */
+ if arrin != arrout {
+ if arrin == EPSILON && tok.sigma_rev[arrout] == NEWLINE {
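+ // Allowed: an epsilon arc that emits a newline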
+ } else if arrin != EPSILON && arrout == EPSILON {
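+ // Allowed: an arc whose output is epsilon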
+ } else {
+ panic(
+ "Problem: " +
+ strconv.Itoa(arrstate) +
+ " -> " + strconv.Itoa(arrtarget) +
+ " (" +
+ strconv.Itoa(arrin) +
+ ":" +
+ strconv.Itoa(arrout) +
+ ") (" +
+ string(tok.sigma_rev[arrin]) +
+ ":" +
+ string(tok.sigma_rev[arrout]) +
+ ")")
+ }
}
// TODO:
@@ -259,12 +284,16 @@
tok.transitions[arrstate+1] = make(map[int]*edge)
}
- tok.transitions[arrstate+1][arrin] = targetObj
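+ // Skip pseudo arcs with negative input symbols,
+ // e.g. the "-1 -1" fields of final state lines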
+ if arrin >= 0 {
+ tok.transitions[arrstate+1][arrin] = targetObj
+ }
if final {
tok.transitions[arrstate+1][FINAL] = &edge{}
}
+ fmt.Println("Add", arrstate+1, "->", arrtarget+1, "(", string(tok.sigma_rev[arrin]), ":", string(tok.sigma_rev[arrout]), ")")
+
continue
}
case SIGMA:
@@ -396,7 +425,7 @@
}
func (tok *Tokenizer) resize(l int) {
- if len(tok.array) < l {
+ if len(tok.array) <= l { // <=, so that index l stays addressable
tok.array = append(tok.array, make([]int, l)...)
}
}
@@ -447,7 +476,7 @@
// Set alphabet A to the list of all symbols
// outgoing from s
func (tok *Tokenizer) get_set(s int, A *[]int) {
- for a, _ := range tok.transitions[s] {
+ for a := range tok.transitions[s] {
*A = append(*A, a)
}
}
@@ -472,7 +501,7 @@
goto OVERLAP
}
}
- fmt.Println("Found a nice place at", base, "for", len(symbols))
+ // fmt.Println("Found a nice place at", base, "for", len(symbols))
return base
}
@@ -481,33 +510,34 @@
t := 1 // Start position
chars := []rune(input)
i := 0
+ fmt.Println("Length of string is", len(chars))
for ; i < len(chars); i++ {
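// The next state in the double array is base(t) plus the symbol's code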
a := tok.sigma[chars[i]]
tu := t
t = tok.get_base(tu) + a
- fmt.Println("Check", a, t, tok.get_check(1))
+ fmt.Println("Check", string(tok.sigma_rev[a]), ":", t)
if t > tok.get_check(1) {
+ fmt.Println("Out of array")
break
} else if tok.get_check(t) != tu {
+ fmt.Println("Match is not fine!", t, "and", tok.get_check(t), "vs", tu)
break
} else if tok.get_base(t) < 0 {
t = -1 * tok.get_base(t)
- // fmt.Println("Match is representative!")
- } else {
- // fmt.Println("Match is fine!")
+ // } else {
}
}
if i == len(chars) {
fmt.Println("At the end")
} else {
+ fmt.Println("Not at the end")
return false
}
// fmt.Println("Hmm...", tok.get_check(tok.get_base(t)+FINAL), "-", t)
if tok.get_check(tok.get_base(t)+FINAL) == t {
- fmt.Println("FINE")
return true
}
return false
diff --git a/datokenizer_test.go b/datokenizer_test.go
index 8945f91..f714e10 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -1,7 +1,6 @@
package datokenizer
import (
- "strings"
"testing"
"github.com/stretchr/testify/assert"
@@ -11,30 +10,23 @@
assert := assert.New(t)
// bau | bauamt
- r := strings.NewReader(`##foma-net 1.0##
-##props##
-1 6 7 8 2 2 1 1 1 1 1 2 5B57D486
-##sigma##
-0 @_EPSILON_SYMBOL_@
-3 a
-4 b
-5 m
-6 t
-7 u
-##states##
-0 4 1 0
-1 3 2 0
-2 7 3 0
-3 3 4 1
-4 5 5 0
-5 6 6 0
-6 -1 -1 1
--1 -1 -1 -1 -1
-##end##`)
-
- tok := parse(r) // ("tokenizer.fst")
+ tok := parse_file("testdata/bauamt.fst")
tok.buildDA()
assert.True(tok.match("bau"))
assert.True(tok.match("bauamt"))
assert.False(tok.match("baum"))
}
+
+func TestSimpleBranches(t *testing.T) {
+ assert := assert.New(t)
+
+ // (bau | wahl) (amt | en)
+ tok := parse_file("testdata/wahlamt.fst")
+ tok.buildDA()
+ assert.False(tok.match("bau"))
+ assert.True(tok.match("bauamt"))
+ assert.True(tok.match("wahlamt"))
+ assert.True(tok.match("bauen"))
+ assert.True(tok.match("wahlen"))
+ assert.False(tok.match("baum"))
+}
diff --git a/testdata/bauamt.fst b/testdata/bauamt.fst
new file mode 100644
index 0000000..0e572c3
--- /dev/null
+++ b/testdata/bauamt.fst
Binary files differ
diff --git a/testdata/wahlamt.fst b/testdata/wahlamt.fst
new file mode 100644
index 0000000..b89fe54
--- /dev/null
+++ b/testdata/wahlamt.fst
Binary files differ