Turn special sigma values into properties
diff --git a/datokenizer.go b/datokenizer.go
index 1ebcd91..826ba87 100644
--- a/datokenizer.go
+++ b/datokenizer.go
@@ -9,9 +9,6 @@
// TODO:
// - replace maxSize with the check value
// - Strip first state and make everything start with 0!
-// - Serialize!
-// - Split Tokenizer and DATokenizer
-// - Make epsilon etc. properties
import (
"bufio"
@@ -40,12 +37,6 @@
VERSION = uint16(1)
)
-// Special symbols in sigma
-var EPSILON = -1
-var UNKNOWN = -1
-var IDENTITY = -1
-var FINAL = -1
-
var bo binary.ByteOrder = binary.LittleEndian
type mapping struct {
@@ -66,6 +57,12 @@
stateCount int
sigmaCount int
transitions []map[int]*edge
+
+ // Special symbols in sigma
+ epsilon int
+ unknown int
+ identity int
+ final int
}
type DaTokenizer struct {
@@ -74,9 +71,15 @@
maxSize int
loadLevel float64
array []int
+
+ // Special symbols in sigma
+ epsilon int
+ unknown int
+ identity int
+ final int
}
-func ParseFile(file string) *Tokenizer {
+func ParseFoma(file string) *Tokenizer {
f, err := os.Open(file)
if err != nil {
log.Error().Err(err)
@@ -98,8 +101,11 @@
r := bufio.NewReader(ior)
tok := &Tokenizer{
- // sigma: make(map[rune]int),
sigmaRev: make(map[int]rune),
+ epsilon: -1,
+ unknown: -1,
+ identity: -1,
+ final: -1,
}
var state, inSym, outSym, end, final int
@@ -130,7 +136,7 @@
// Adds a final transition symbol to sigma
// written as '#' in Mizobuchi et al (2000)
tok.sigmaCount++
- FINAL = tok.sigmaCount
+ tok.final = tok.sigmaCount
continue
}
if strings.HasPrefix(line, "##sigma##") {
@@ -267,10 +273,10 @@
if inSym != outSym {
// Allow any epsilon to become a newline
- if !(inSym == EPSILON && tok.sigmaRev[outSym] == NEWLINE) &&
+ if !(inSym == tok.epsilon && tok.sigmaRev[outSym] == NEWLINE) &&
// Allow any whitespace to be ignored
- !(inSym != EPSILON && outSym == EPSILON) &&
+ !(inSym != tok.epsilon && outSym == tok.epsilon) &&
// Allow any whitespace to become a new line
!(tok.sigmaRev[outSym] == NEWLINE) {
@@ -320,7 +326,7 @@
// Add final transition
if final == 1 {
- tok.transitions[state+1][FINAL] = &edge{}
+ tok.transitions[state+1][tok.final] = &edge{}
}
if DEBUG {
@@ -364,18 +370,18 @@
switch elem[1] {
case "@_EPSILON_SYMBOL_@":
{
- EPSILON = number
+ tok.epsilon = number
continue
}
case "@_UNKNOWN_SYMBOL_@":
{
- UNKNOWN = number
+ tok.unknown = number
continue
}
case "@_IDENTITY_SYMBOL_@":
{
- IDENTITY = number
+ tok.identity = number
continue
}
default:
@@ -412,6 +418,10 @@
dat := &DaTokenizer{
sigma: make(map[rune]int),
loadLevel: -1,
+ final: tok.final,
+ unknown: tok.unknown,
+ identity: tok.identity,
+ epsilon: tok.epsilon,
}
for num, sym := range tok.sigmaRev {
@@ -450,7 +460,7 @@
// Iterate over all outgoing symbols
for _, a := range A {
- if a != FINAL {
+ if a != tok.final {
// Aka g(s, a)
s1 := tok.transitions[s][a].end
@@ -472,7 +482,7 @@
}
} else {
// Store a final transition
- dat.setCheck(dat.getBase(t)+FINAL, t)
+ dat.setCheck(dat.getBase(t)+dat.final, t)
}
}
}
@@ -576,7 +586,7 @@
OVERLAP:
// Resize the array if necessary
- dat.resize((base + FINAL) * 2)
+ dat.resize((base + dat.final) * 2)
for _, a := range symbols {
if dat.getCheck(base+a) != 0 {
base++
@@ -625,10 +635,10 @@
buf := make([]byte, 0, 12)
bo.PutUint16(buf[0:2], VERSION)
- bo.PutUint16(buf[2:4], uint16(EPSILON))
- bo.PutUint16(buf[4:6], uint16(UNKNOWN))
- bo.PutUint16(buf[6:8], uint16(IDENTITY))
- bo.PutUint16(buf[8:10], uint16(FINAL))
+ bo.PutUint16(buf[2:4], uint16(dat.epsilon))
+ bo.PutUint16(buf[4:6], uint16(dat.unknown))
+ bo.PutUint16(buf[6:8], uint16(dat.identity))
+ bo.PutUint16(buf[8:10], uint16(dat.final))
bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
more, err := w.Write(buf[0:12])
if err != nil {
@@ -695,11 +705,11 @@
a, ok = tok.sigma[chars[i]]
// Support identity symbol if character is not in sigma
- if !ok && IDENTITY != -1 {
+ if !ok && tok.identity != -1 {
if DEBUG {
- fmt.Println("IDENTITY symbol", string(chars[i]), "->", IDENTITY)
+ fmt.Println("IDENTITY symbol", string(chars[i]), "->", tok.identity)
}
- a = IDENTITY
+ a = tok.identity
} else if DEBUG {
fmt.Println("Sigma transition is okay for [", string(chars[i]), "]")
}
@@ -714,19 +724,19 @@
fmt.Println("Match is not fine!", t, "and", tok.getCheck(t), "vs", tu)
}
- if !ok && a == IDENTITY {
+ if !ok && a == tok.identity {
// Try again with unknown symbol, in case identity failed
if DEBUG {
- fmt.Println("UNKNOWN symbol", string(chars[i]), "->", UNKNOWN)
+ fmt.Println("UNKNOWN symbol", string(chars[i]), "->", tok.unknown)
}
- a = UNKNOWN
+ a = tok.unknown
- } else if a != EPSILON {
+ } else if a != tok.epsilon {
// Try again with epsilon symbol, in case everything else failed
if DEBUG {
- fmt.Println("EPSILON symbol", string(chars[i]), "->", EPSILON)
+ fmt.Println("EPSILON symbol", string(chars[i]), "->", tok.epsilon)
}
- a = EPSILON
+ a = tok.epsilon
} else {
break
}
@@ -737,7 +747,7 @@
}
// Transition is fine
- if a != EPSILON {
+ if a != tok.epsilon {
// Character consumed
i++
}
@@ -755,13 +765,13 @@
FINALCHECK:
// Automaton is in a final state
- if tok.getCheck(tok.getBase(t)+FINAL) == t {
+ if tok.getCheck(tok.getBase(t)+tok.final) == t {
return true
}
// Check epsilon transitions until a final state is reached
tu = t
- t = tok.getBase(tu) + EPSILON
+ t = tok.getBase(tu) + tok.epsilon
// Epsilon transition failed
if t > tok.getCheck(1) || tok.getCheck(t) != tu {
diff --git a/datokenizer_test.go b/datokenizer_test.go
index 4cc5619..7f1d1d5 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -11,7 +11,7 @@
assert := assert.New(t)
// bau | bauamt
- tok := ParseFile("testdata/bauamt.fst")
+ tok := ParseFoma("testdata/bauamt.fst")
dat := tok.ToDoubleArray()
assert.True(dat.Match("bau"))
assert.True(dat.Match("bauamt"))
@@ -22,7 +22,7 @@
assert := assert.New(t)
// (bau | wahl) (amt | en)
- tok := ParseFile("testdata/wahlamt.fst")
+ tok := ParseFoma("testdata/wahlamt.fst")
dat := tok.ToDoubleArray()
assert.False(dat.Match("bau"))
assert.True(dat.Match("bauamt"))
@@ -34,7 +34,7 @@
func TestSimpleTokenizer(t *testing.T) {
assert := assert.New(t)
- tok := ParseFile("testdata/simpletok.fst")
+ tok := ParseFoma("testdata/simpletok.fst")
dat := tok.ToDoubleArray()
assert.True(dat.Match("bau"))
assert.True(dat.Match("bad"))
@@ -43,7 +43,7 @@
func TestWriteTokenizer(t *testing.T) {
assert := assert.New(t)
- tok := ParseFile("testdata/simpletok.fst")
+ tok := ParseFoma("testdata/simpletok.fst")
dat := tok.ToDoubleArray()
assert.True(dat.Match("bau"))
assert.True(dat.Match("bad"))