Fix loading routine

Read the serialized data with io.ReadFull() instead of Read(),
since a gzip reader may return fewer bytes than requested. Also
buffer all writes in WriteTo(), check the version number when
parsing, and add a Save() helper for writing gzip-compressed files.
diff --git a/datokenizer.go b/datokenizer.go
index b4a3f36..2cdac27 100644
--- a/datokenizer.go
+++ b/datokenizer.go
@@ -14,10 +14,10 @@
// TODO:
// - replace maxSize with the check value
// - Strip first state and make everything start with 0!
+// - Add checksum to serialization.
import (
"bufio"
- "bytes"
"compress/gzip"
"encoding/binary"
"fmt"
@@ -717,10 +717,32 @@
}
-// WriteTo stores the double array data in an io.Writer.
+// Save stores the double array data gzip-compressed in a file.
+func (dat *DaTokenizer) Save(file string) (n int64, err error) {
+ f, err := os.Create(file)
+ if err != nil {
+ log.Error().Err(err)
+ return 0, err
+ }
+ defer f.Close()
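+ // The deferred gz.Close() finalizes the gzip stream by writing
+ // the footer; gz.Flush() alone does not.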
+ gz := gzip.NewWriter(f)
+ defer gz.Close()
+ n, err = dat.WriteTo(gz)
+ if err != nil {
+ log.Error().Err(err)
+ return n, err
+ }
+ gz.Flush()
+ return n, nil
+}
+
+// WriteTo stores the double array data in an io.Writer.
func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
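+ // Buffer the writer: this batches the many small writes below
+ // and provides WriteRune() for serializing the sigma symbols.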
+ wb := bufio.NewWriter(w)
+ defer wb.Flush()
+
// Store magical header
- all, err := w.Write([]byte(MAGIC))
+ all, err := wb.Write([]byte(MAGIC))
if err != nil {
log.Error().Err(err)
return int64(all), err
@@ -746,7 +768,7 @@
bo.PutUint16(buf[8:10], uint16(dat.final))
bo.PutUint16(buf[10:12], uint16(len(sigmalist)))
bo.PutUint32(buf[12:16], uint32(len(dat.array)))
- more, err := w.Write(buf[0:16])
+ more, err := wb.Write(buf[0:16])
if err != nil {
log.Error().Err(err)
return int64(all), err
@@ -754,47 +776,54 @@
all += more
- wbuf := bytes.NewBuffer(nil)
- wbufWrap := bufio.NewWriter(wbuf)
// Write sigma
for _, sym := range sigmalist {
- more, err = wbufWrap.WriteRune(sym)
+ more, err = wb.WriteRune(sym)
if err != nil {
log.Error().Err(err)
return int64(all), err
}
all += more
}
- wbufWrap.Flush()
- more, err = w.Write(wbuf.Bytes())
- if err != nil {
- log.Error().Err(err)
- return int64(all), err
- }
- all += more
// Test marker - could be checksum
- more, err = w.Write([]byte("T"))
+ more, err = wb.Write([]byte("T"))
if err != nil {
log.Error().Err(err)
return int64(all), err
}
all += more
- wbuf.Reset()
- for _, d := range dat.array {
- bo.PutUint32(buf[0:4], d)
- more, err := w.Write(buf[0:4])
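+ // Write each cell of the double array as a 4-byte value
+ // in the serialization byte order.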
+ for x := 0; x < len(dat.array); x++ {
+ bo.PutUint32(buf[0:4], dat.array[x])
+ more, err := wb.Write(buf[0:4])
if err != nil {
log.Error().Err(err)
return int64(all), err
}
+ if more != 4 {
+ log.Error().Msg("Cannot write uint32")
+ return int64(all), fmt.Errorf("short write of %d bytes", more)
+ }
all += more
}
+
return int64(all), err
}
@@ -813,6 +842,7 @@
}
defer gz.Close()
+ // TODO: Read the whole file!
return ParseDatok(gz)
}
@@ -848,7 +878,7 @@
return nil
}
- more, err = r.Read(buf[0:16])
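+ // io.ReadFull() fills the buffer completely or returns an error;
+ // a plain Read() on a gzip stream may return fewer bytes.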
+ more, err = io.ReadFull(r, buf[0:16])
if err != nil {
log.Error().Err(err)
return nil
@@ -856,7 +886,13 @@
all += more
- // version := bo.Uint16(buf[0:2])
+ version := bo.Uint16(buf[0:2])
+
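+ // Refuse to load data serialized with a different VERSION.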
+ if version != VERSION {
+ log.Error().Msg("Version not compatible")
+ return nil
+ }
+
dat.epsilon = int(bo.Uint16(buf[2:4]))
dat.unknown = int(bo.Uint16(buf[4:6]))
dat.identity = int(bo.Uint16(buf[6:8]))
@@ -865,6 +901,9 @@
sigmaCount := int(bo.Uint16(buf[10:12]))
arraySize := int(bo.Uint32(buf[12:16]))
+ // Restore maxSize from the serialized array length;
+ // it shouldn't be relevant after loading, though.
+ dat.maxSize = arraySize - 1
+
for x := 0; x < sigmaCount; x++ {
sym, more, err := r.ReadRune()
if err == nil && sym != 0 {
@@ -873,7 +912,7 @@
all += more
}
- more, err = r.Read(buf[0:1])
+ more, err = io.ReadFull(r, buf[0:1])
if err != nil {
log.Error().Err(err)
@@ -891,8 +930,12 @@
dat.array = make([]uint32, arraySize)
for x := 0; x < arraySize; x++ {
- more, err = r.Read(buf[0:4])
+ more, err = io.ReadFull(r, buf[0:4])
if err != nil {
+ if err == io.EOF {
+ log.Error().Msgf("Unexpected EOF at entry %d of %d", x, arraySize)
+ break
+ }
log.Error().Err(err)
return nil
}
diff --git a/datokenizer_test.go b/datokenizer_test.go
index efd1e7c..692112e 100644
--- a/datokenizer_test.go
+++ b/datokenizer_test.go
@@ -100,7 +100,7 @@
buf := bytes.NewBuffer(b)
n, err := dat.WriteTo(buf)
assert.Nil(err)
- assert.Equal(int64(218), n)
+ assert.Equal(int64(208), n)
dat2 := ParseDatok(buf)
assert.NotNil(dat2)
@@ -117,30 +117,33 @@
}
func TestFullTokenizer(t *testing.T) {
+ assert := assert.New(t)
/*
- assert := assert.New(t)
tok := LoadFomaFile("testdata/tokenizer.fst")
dat := tok.ToDoubleArray()
-
- f, _ := os.Create("testdata/tokenizer.datok")
- gz := gzip.NewWriter(f)
- defer f.Close()
- dat.WriteTo(gz)
- assert.NotNil(gz)
-
- assert.True(dat.LoadFactor() >= 70)
- assert.True(dat.Match("bau"))
- assert.True(dat.Match("bad"))
- assert.True(dat.Match("wald gehen"))
+ dat.Save("testdata/tokenizer.datok")
*/
+ dat := LoadDatokFile("testdata/tokenizer.datok")
+ assert.NotNil(dat)
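+
+ // The expected values below are properties of the checked-in
+ // testdata/tokenizer.datok binary.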
+ assert.True(dat.LoadFactor() >= 70)
+ assert.Equal(dat.epsilon, 1)
+ assert.Equal(dat.unknown, 2)
+ assert.Equal(dat.identity, 3)
+ assert.Equal(dat.final, 135)
+ assert.Equal(len(dat.sigma), 131)
+ assert.Equal(len(dat.array), 3771904)
+ assert.Equal(dat.maxSize, 3771903)
+
+ assert.True(dat.Match("bau"))
+ assert.True(dat.Match("bad"))
+ assert.True(dat.Match("wald gehen"))
}
func TestFullTokenizerTransduce(t *testing.T) {
/*
assert := assert.New(t)
- // tok := LoadFomaFile("testdata/tokenizer.fst")
- tok := LoadFomaFile("testdata/simpletok.fst")
- dat := tok.ToDoubleArray()
dat := LoadDatokFile("testdata/tokenizer.datok")
+ assert.NotNil(dat)
r := strings.NewReader("wald gehen! Da kann\t man was \"erleben\"!")
diff --git a/testdata/tokenizer.datok b/testdata/tokenizer.datok
new file mode 100644
index 0000000..e0efd4b
--- /dev/null
+++ b/testdata/tokenizer.datok
Binary files differ