Serialize and deserialize matrix representation
diff --git a/datok.go b/datok.go
index 4f7ac6f..7812981 100644
--- a/datok.go
+++ b/datok.go
@@ -39,7 +39,7 @@
const (
DEBUG = false
- MAGIC = "DATOK"
+ DAMAGIC = "DATOK"
VERSION = uint16(1)
FIRSTBIT uint32 = 1 << 31
SECONDBIT uint32 = 1 << 30
@@ -489,7 +489,7 @@
defer wb.Flush()
// Store magical header
- all, err := wb.Write([]byte(MAGIC))
+ all, err := wb.Write([]byte(DAMAGIC))
if err != nil {
log.Println(err)
return int64(all), err
@@ -614,7 +614,7 @@
r := bufio.NewReader(ior)
buf := make([]byte, 1024)
- buf = buf[0:len(MAGIC)]
+ buf = buf[0:len(DAMAGIC)]
_, err := r.Read(buf)
@@ -623,7 +623,7 @@
return nil
}
- if string(MAGIC) != string(buf) {
+ if string(DAMAGIC) != string(buf) {
log.Println("Not a datok file")
return nil
}
@@ -907,7 +907,8 @@
}
buffi -= buffo
- epsilonOffset -= buffo
+ // epsilonOffset -= buffo
+ epsilonOffset = buffo
buffo = 0
if DEBUG {
fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
diff --git a/matrix.go b/matrix.go
index 356efa9..4ce82c9 100644
--- a/matrix.go
+++ b/matrix.go
@@ -2,22 +2,29 @@
import (
"bufio"
+ "compress/gzip"
"fmt"
"io"
+ "log"
+ "os"
+)
+
+const (
+ MAMAGIC = "MATOK"
)
type MatrixTokenizer struct {
sigma map[rune]int
sigmaASCII [256]int
- array []int
+ array []uint32
stateCount int
// Special symbols in sigma
epsilon int
unknown int
identity int
- final int
- tokenend int
+ // final int
+ // tokenend int
}
// ToMatrix turns the intermediate tokenizer into a
@@ -25,16 +32,16 @@
func (auto *Automaton) ToMatrix() *MatrixTokenizer {
mat := &MatrixTokenizer{
- sigma: make(map[rune]int),
- final: auto.final,
- unknown: auto.unknown,
- identity: auto.identity,
- epsilon: auto.epsilon,
- tokenend: auto.tokenend,
+ sigma: make(map[rune]int),
+ // final: auto.final,
+ unknown: auto.unknown,
+ identity: auto.identity,
+ epsilon: auto.epsilon,
+ // tokenend: auto.tokenend,
stateCount: auto.stateCount,
}
- mat.array = make([]int, (auto.stateCount+1)*(auto.sigmaCount+1))
+ mat.array = make([]uint32, (auto.stateCount+1)*(auto.sigmaCount))
for num, sym := range auto.sigmaRev {
if int(sym) < 256 {
@@ -48,9 +55,9 @@
remember := make([]bool, auto.stateCount+2)
// Store all transitions in matrix
- var toMatrix func([]int, int)
+ var toMatrix func([]uint32, int)
- toMatrix = func(matrix []int, start int) {
+ toMatrix = func(matrix []uint32, start int) {
if start > auto.stateCount {
panic("stateCount is smaller")
}
@@ -59,11 +66,11 @@
}
remember[start] = true
for alpha, t := range auto.transitions[start] {
- matrix[(alpha-1)*auto.stateCount+start] = t.end
+ matrix[(alpha-1)*auto.stateCount+start] = uint32(t.end)
// Mark nontoken transitions
if t.nontoken {
- matrix[(alpha-1)*auto.stateCount+start] *= -1
+ matrix[(alpha-1)*auto.stateCount+start] |= FIRSTBIT
}
toMatrix(matrix, t.end)
@@ -75,15 +82,258 @@
return mat
}
+// Save stores the matrix data in a file
+func (mat *MatrixTokenizer) Save(file string) (n int64, err error) {
+ f, err := os.Create(file)
+ if err != nil {
+ log.Println(err)
+ return 0, err
+ }
+ defer f.Close()
+ gz := gzip.NewWriter(f)
+ defer gz.Close()
+ n, err = mat.WriteTo(gz)
+ if err != nil {
+ log.Println(err)
+ return n, err
+ }
+ gz.Flush()
+ return n, nil
+}
+
+// WriteTo stores the matrix data in an io.Writer.
+func (mat *MatrixTokenizer) WriteTo(w io.Writer) (n int64, err error) {
+
+ wb := bufio.NewWriter(w)
+ defer wb.Flush()
+
+ // Store magical header
+ all, err := wb.Write([]byte(MAMAGIC))
+ if err != nil {
+ log.Println(err)
+ return int64(all), err
+ }
+
+ // Get sigma as a list
+ sigmalist := make([]rune, len(mat.sigma)+12)
+ max := 0
+ for sym, num := range mat.sigma {
+ sigmalist[num] = sym
+ if num > max {
+ max = num
+ }
+ }
+
+ sigmalist = sigmalist[:max+1]
+
+ buf := make([]byte, 0, 12)
+ bo.PutUint16(buf[0:2], VERSION)
+ bo.PutUint16(buf[2:4], uint16(mat.epsilon))
+ bo.PutUint16(buf[4:6], uint16(mat.unknown))
+ bo.PutUint16(buf[6:8], uint16(mat.identity))
+ bo.PutUint16(buf[8:10], uint16(mat.stateCount))
+ bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
+ // bo.PutUint32(buf[12:16], uint32(len(mat.array)*2)) // Legacy support
+ more, err := wb.Write(buf[0:12])
+ if err != nil {
+ log.Println(err)
+ return int64(all), err
+ }
+
+ all += more
+
+ // Write sigma
+ for _, sym := range sigmalist {
+
+ more, err = wb.WriteRune(sym)
+ if err != nil {
+ log.Println(err)
+ return int64(all), err
+ }
+ all += more
+ }
+
+ if err != nil {
+ log.Println(err)
+ return int64(all), err
+ }
+
+ // Test marker - could be checksum
+ more, err = wb.Write([]byte("M"))
+ if err != nil {
+ log.Println(err)
+ return int64(all), err
+ }
+ all += more
+
+ // for x := 0; x < len(dat.array); x++ {
+ for _, x := range mat.array {
+ bo.PutUint32(buf[0:4], uint32(x))
+ more, err = wb.Write(buf[0:4])
+ if err != nil {
+ log.Println(err)
+ return int64(all), err
+ }
+ all += more
+ if more != 4 {
+ log.Println("Can not write base uint32")
+ return int64(all), err
+ }
+ /*
+ bo.PutUint32(buf[0:4], bc.check)
+ more, err = wb.Write(buf[0:4])
+ if err != nil {
+ log.Println(err)
+ return int64(all), err
+ }
+ all += more
+ if more != 4 {
+ log.Println("Can not write check uint32")
+ return int64(all), err
+ }
+ */
+ }
+
+ return int64(all), err
+}
+
+// LoadMatrixFile reads a matrix represented tokenizer
+// from a file.
+func LoadMatrixFile(file string) *MatrixTokenizer {
+ f, err := os.Open(file)
+ if err != nil {
+ log.Println(err)
+ return nil
+ }
+ defer f.Close()
+
+ gz, err := gzip.NewReader(f)
+ if err != nil {
+ log.Println(err)
+ return nil
+ }
+ defer gz.Close()
+
+ // Todo: Read the whole file!
+ return ParseMatrix(gz)
+}
+
+// ParseMatrix reads a matrix represented tokenizer
+// from an io.Reader
+func ParseMatrix(ior io.Reader) *MatrixTokenizer {
+
+ // Initialize tokenizer with default values
+ mat := &MatrixTokenizer{
+ sigma: make(map[rune]int),
+ epsilon: 0,
+ unknown: 0,
+ identity: 0,
+ // final: 0,
+ stateCount: 0,
+ // transCount: 0,
+ }
+
+ r := bufio.NewReader(ior)
+
+ buf := make([]byte, 1024)
+ buf = buf[0:len(MAMAGIC)]
+
+ _, err := r.Read(buf)
+
+ if err != nil {
+ log.Println(err)
+ return nil
+ }
+
+ if string(MAMAGIC) != string(buf) {
+ log.Println("Not a matok file")
+ return nil
+ }
+
+ more, err := io.ReadFull(r, buf[0:12])
+ if err != nil {
+ log.Println(err)
+ return nil
+ }
+
+ if more != 12 {
+ log.Println("Read bytes do not fit")
+ return nil
+ }
+
+ version := bo.Uint16(buf[0:2])
+
+ if version != VERSION {
+ log.Println("Version not compatible")
+ return nil
+ }
+
+ mat.epsilon = int(bo.Uint16(buf[2:4]))
+ mat.unknown = int(bo.Uint16(buf[4:6]))
+ mat.identity = int(bo.Uint16(buf[6:8]))
+ mat.stateCount = int(bo.Uint16(buf[8:10]))
+
+ sigmaCount := int(bo.Uint16(buf[10:12]))
+ arraySize := (mat.stateCount + 1) * (sigmaCount + 1)
+ // int(bo.Uint32(buf[12:16]))
+
+ // Shouldn't be relevant though
+ // mat.maxSize = arraySize - 1
+
+ for x := 0; x < sigmaCount; x++ {
+ sym, _, err := r.ReadRune()
+ if err == nil && sym != 0 {
+ if int(sym) < 256 {
+ mat.sigmaASCII[int(sym)] = x
+ }
+ mat.sigma[sym] = x
+ }
+ }
+
+ _, err = io.ReadFull(r, buf[0:1])
+
+ if err != nil {
+ log.Print(err)
+ return nil
+ }
+
+ if string("M") != string(buf[0:1]) {
+ log.Println("Not a matok file")
+ return nil
+ }
+
+ // Read based on length
+ mat.array = make([]uint32, arraySize)
+
+ dataArray, err := io.ReadAll(r)
+
+ if err == io.EOF {
+ log.Println(err)
+ return nil
+ }
+
+ if len(dataArray) < arraySize*4 {
+ log.Println("Not enough bytes read", len(dataArray), arraySize)
+ return nil
+ }
+
+ for x := 0; x < arraySize; x++ {
+ // mat.array[x] = bo.Uint32(dataArray[x*8 : (x*8)+4])
+ mat.array[x] = bo.Uint32(dataArray[x*4 : (x*4)+4])
+ }
+
+ return mat
+}
+
func (mat *MatrixTokenizer) Transduce(r io.Reader, w io.Writer) bool {
var a int
- var t0 int
- t := int(1) // Initial state
+ var t0 uint32
+ t := uint32(1) // Initial state
var ok, rewindBuffer bool
// Remember the last position of a possible tokenend,
// in case the automaton fails.
- epsilonState := int(0)
+ epsilonState := uint32(0)
epsilonOffset := 0
// Remember if the last transition was epsilon
@@ -150,10 +400,24 @@
// Check for epsilon transitions and remember
- if mat.array[(mat.epsilon-1)*mat.stateCount+t0] != 0 {
+ // TODO: Can t0 be negative here?
+ if mat.array[(mat.epsilon-1)*mat.stateCount+int(t0)] != 0 {
// Remember state for backtracking to last tokenend state
+
+ // Maybe not necessary - and should be simpler!
+ // Just Remove
+ t0 &= ^FIRSTBIT
+ /*
+ if (t0 & FIRSTBIT) != 0 {
+ t0 ^= FIRSTBIT
+ }
+ */
epsilonState = t0
epsilonOffset = buffo
+
+ if DEBUG {
+ fmt.Println("epsilonOffset is set to", buffo)
+ }
}
}
@@ -219,7 +483,7 @@
// Transition does not produce a character
// if buffo == 1 && ta.isNonToken() {
- if buffo == 1 && t < 0 {
+ if buffo == 1 && (t&FIRSTBIT) != 0 {
if DEBUG {
fmt.Println("Nontoken forward", showBuffer(buffer, buffo, buffi))
}
@@ -248,13 +512,20 @@
// Rewind the buffer if necessary
if rewindBuffer {
+ if DEBUG {
+ fmt.Println("-> Rewind buffer", buffo, buffi, epsilonOffset)
+ }
+
// TODO: Better as a ring buffer
for x, i := range buffer[buffo:buffi] {
buffer[x] = i
}
buffi -= buffo
- epsilonOffset -= buffo
+ // epsilonOffset -= buffo
+ epsilonOffset = 0
+ epsilonState = 0
+
buffo = 0
if DEBUG {
fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
@@ -274,9 +545,12 @@
*/
// Ignore nontoken mark
- if t < 0 {
- t *= -1
- }
+ /*
+ if t < 0 {
+ t *= -1
+ }
+ */
+ t &= ^FIRSTBIT
newchar = true
@@ -340,7 +614,7 @@
newchar = false
// if dat.array[t].getCheck() == t0 {
// t can't be < 0
- if t > 0 {
+ if t != 0 {
// Remember state for backtracking to last tokenend state
goto PARSECHARM
diff --git a/matrix_test.go b/matrix_test.go
index 49a1523..37a61b2 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -55,6 +55,36 @@
assert.Equal(6, len(tokens))
}
+func TestReadWriteMatrixTokenizer(t *testing.T) {
+ assert := assert.New(t)
+ foma := LoadFomaFile("testdata/simpletok.fst")
+ assert.NotNil(foma)
+
+ mat := foma.ToMatrix()
+ assert.NotNil(foma)
+
+ assert.True(tmatch(mat, "bau"))
+ assert.True(tmatch(mat, "bad"))
+ assert.True(tmatch(mat, "wald gehen"))
+ b := make([]byte, 0, 1024)
+ buf := bytes.NewBuffer(b)
+ n, err := mat.WriteTo(buf)
+ assert.Nil(err)
+ assert.Equal(int64(248), n)
+ mat2 := ParseMatrix(buf)
+ assert.NotNil(mat2)
+ assert.Equal(mat.sigma, mat2.sigma)
+ assert.Equal(mat.epsilon, mat2.epsilon)
+ assert.Equal(mat.unknown, mat2.unknown)
+ assert.Equal(mat.identity, mat2.identity)
+ assert.Equal(mat.stateCount, mat2.stateCount)
+ assert.Equal(len(mat.array), len(mat2.array))
+ assert.Equal(mat.array, mat2.array)
+ assert.True(tmatch(mat2, "bau"))
+ assert.True(tmatch(mat2, "bad"))
+ assert.True(tmatch(mat2, "wald gehen"))
+}
+
func TestFullTokenizerMatrixSentenceSplitter(t *testing.T) {
assert := assert.New(t)
foma := LoadFomaFile("testdata/tokenizer.fst")