Support unknown and identity symbols
diff --git a/datokenizer.go b/datokenizer.go
index cf3f9c4..5868a36 100644
--- a/datokenizer.go
+++ b/datokenizer.go
@@ -26,7 +26,10 @@
)
// Special symbols in sigma
-var EPSILON, UNKNOWN, IDENTITY, FINAL int
+var EPSILON = -1
+var UNKNOWN = -1
+var IDENTITY = -1
+var FINAL = -1
type mapping struct {
source int
@@ -292,7 +295,17 @@
tok.transitions[arrstate+1][FINAL] = &edge{}
}
- fmt.Println("Add", arrstate+1, "->", arrtarget+1, "(", string(tok.sigma_rev[arrin]), ":", string(tok.sigma_rev[arrout]), ")")
+ fmt.Println("Add",
+ arrstate+1, "->", arrtarget+1,
+ "(",
+ arrin,
+ ":",
+ arrout,
+ ") (",
+ string(tok.sigma_rev[arrin]),
+ ":",
+ string(tok.sigma_rev[arrout]),
+ ")")
continue
}
@@ -510,21 +523,30 @@
t := 1 // Start position
chars := []rune(input)
i := 0
- fmt.Println("Length of string is", len(chars))
+ // fmt.Println("Length of string is", len(chars))
for ; i < len(chars); i++ {
- a := tok.sigma[chars[i]]
+ a, ok := tok.sigma[chars[i]]
+
+ // Fall back to the identity symbol if the char is not in sigma and IDENTITY is set (not -1); otherwise a keeps the map's zero value
+ if !ok && IDENTITY != -1 {
+ fmt.Println("IDENTITY symbol", chars[i], "->", IDENTITY)
+ a = IDENTITY
+ }
tu := t
+ CHECK:
t = tok.get_base(tu) + a
- fmt.Println("Check", string(tok.sigma_rev[a]), ":", t)
- if t > tok.get_check(1) {
- fmt.Println("Out of array")
- break
- } else if tok.get_check(t) != tu {
+ if t > tok.get_check(1) || tok.get_check(t) != tu {
fmt.Println("Match is not fine!", t, "and", tok.get_check(t), "vs", tu)
+
+ // Retry the same transition check with the unknown symbol if the identity symbol did not match
+ if !ok && a == IDENTITY {
+ a = UNKNOWN
+ goto CHECK
+ }
break
} else if tok.get_base(t) < 0 {
+ // Move to representative state
t = -1 * tok.get_base(t)
- // } else {
}
}
@@ -535,7 +557,7 @@
return false
}
- // fmt.Println("Hmm...", tok.get_check(tok.get_base(t)+FINAL), "-", t)
+ fmt.Println("Hmm...", tok.get_check(tok.get_base(t)+FINAL), "-", t)
if tok.get_check(tok.get_base(t)+FINAL) == t {
return true
diff --git a/datokenizer_test.go b/datokenizer_test.go
index f714e10..64631c4 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -1,6 +1,7 @@
package datokenizer
import (
+ "fmt"
"testing"
"github.com/stretchr/testify/assert"
@@ -30,3 +31,17 @@
assert.True(tok.match("wahlen"))
assert.False(tok.match("baum"))
}
+
+func TestSimpleTokenizer(t *testing.T) {
+ assert := assert.New(t)
+
+ fmt.Println("-------------------")
+
+ tok := parse_file("testdata/simpletok.fst")
+ tok.buildDA()
+ assert.True(tok.match("bau"))
+ /*
+ assert.True(tok.match("bad"))
+ assert.True(tok.match("wald gehen"))
+ */
+}
diff --git a/testdata/simpletok.fst b/testdata/simpletok.fst
new file mode 100644
index 0000000..1cb6d68
--- /dev/null
+++ b/testdata/simpletok.fst
Binary files differ