blob: e8a32d96fb005d37fbc5a2a22ebbcae602c44318 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3import (
Akron6247a5d2021-08-03 19:18:28 +02004 "bytes"
Akron3f8571a2021-08-05 11:18:10 +02005 "strings"
Akron8ef408b2021-08-02 22:11:04 +02006 "testing"
7
8 "github.com/stretchr/testify/assert"
9)
10
11func TestSimpleString(t *testing.T) {
12 assert := assert.New(t)
13
14 // bau | bauamt
Akron64ffd9a2021-08-03 19:55:21 +020015 tok := LoadFomaFile("testdata/bauamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020016 dat := tok.ToDoubleArray()
17 assert.True(dat.Match("bau"))
18 assert.True(dat.Match("bauamt"))
19 assert.False(dat.Match("baum"))
Akron8ef408b2021-08-02 22:11:04 +020020}
Akron75ebe7f2021-08-03 10:34:10 +020021
22func TestSimpleBranches(t *testing.T) {
23 assert := assert.New(t)
24
25 // (bau | wahl) (amt | en)
Akron64ffd9a2021-08-03 19:55:21 +020026 tok := LoadFomaFile("testdata/wahlamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020027 dat := tok.ToDoubleArray()
28 assert.False(dat.Match("bau"))
29 assert.True(dat.Match("bauamt"))
30 assert.True(dat.Match("wahlamt"))
31 assert.True(dat.Match("bauen"))
32 assert.True(dat.Match("wahlen"))
33 assert.False(dat.Match("baum"))
Akron75ebe7f2021-08-03 10:34:10 +020034}
Akron730a79c2021-08-03 11:05:29 +020035
36func TestSimpleTokenizer(t *testing.T) {
37 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020038 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020039 dat := tok.ToDoubleArray()
40 assert.True(dat.Match("bau"))
41 assert.True(dat.Match("bad"))
42 assert.True(dat.Match("wald gehen"))
Akron730a79c2021-08-03 11:05:29 +020043}
Akron740f3d72021-08-03 12:12:34 +020044
Akron068874c2021-08-04 15:19:56 +020045func TestSimpleTokenizerTransduce(t *testing.T) {
Akron84d68e62021-08-04 17:06:52 +020046 assert := assert.New(t)
47 tok := LoadFomaFile("testdata/simpletok.fst")
Akron84d68e62021-08-04 17:06:52 +020048 dat := tok.ToDoubleArray()
Akron3f8571a2021-08-05 11:18:10 +020049
50 r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
51 b := make([]byte, 0, 2048)
52 w := bytes.NewBuffer(b)
53 dat.Transduce(r, w)
54
55 tokens := strings.Split(string(w.Bytes()), "\n")
56 assert.Equal("wald", tokens[0])
57 assert.Equal("gehen", tokens[1])
58 assert.Equal("Da", tokens[2])
59 assert.Equal("kann", tokens[3])
60 assert.Equal("man", tokens[4])
61 assert.Equal("was", tokens[5])
62 assert.Equal("\"erleben\"", tokens[6])
63
64 /*
65 r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
66 w.Reset()
67 dat.Transduce(r, w)
68
69 tokens = strings.Split(string(w.Bytes()), "\n")
70 assert.Equal("In", tokens[0])
71 assert.Equal("den", tokens[1])
72 assert.Equal("Wald", tokens[2])
73 assert.Equal("gehen", tokens[3])
74 assert.Equal("?", tokens[4])
75 assert.Equal("--", tokens[5])
76 */
Akron068874c2021-08-04 15:19:56 +020077}
78
Akron3f8571a2021-08-05 11:18:10 +020079func TestReadWriteTokenizer(t *testing.T) {
Akron740f3d72021-08-03 12:12:34 +020080 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020081 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020082 dat := tok.ToDoubleArray()
Akronf2120ca2021-08-03 16:26:41 +020083 assert.True(dat.Match("bau"))
84 assert.True(dat.Match("bad"))
85 assert.True(dat.Match("wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +020086
Akron03a3c612021-08-04 11:51:27 +020087 assert.True(dat.LoadFactor() >= 70)
Akron6247a5d2021-08-03 19:18:28 +020088
Akron3f8571a2021-08-05 11:18:10 +020089 b := make([]byte, 0, 1024)
Akron6247a5d2021-08-03 19:18:28 +020090 buf := bytes.NewBuffer(b)
91 n, err := dat.WriteTo(buf)
92 assert.Nil(err)
Akron3f8571a2021-08-05 11:18:10 +020093 assert.Equal(int64(208), n)
94
95 dat2 := ParseDatok(buf)
96 assert.NotNil(dat2)
97 assert.Equal(dat.array, dat2.array)
98 assert.Equal(dat.sigma, dat2.sigma)
99 assert.Equal(dat.epsilon, dat2.epsilon)
100 assert.Equal(dat.unknown, dat2.unknown)
101 assert.Equal(dat.identity, dat2.identity)
102 assert.Equal(dat.final, dat2.final)
103 assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
104 assert.True(dat2.Match("bau"))
105 assert.True(dat2.Match("bad"))
106 assert.True(dat2.Match("wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +0200107}
108
// TestFullTokenizer is currently disabled: it depends on the full
// testdata/tokenizer.fst transducer and would also (re)generate the
// serialized testdata/tokenizer.datok file as a side effect.
// NOTE(review): presumably commented out because the full FST is too
// large/slow or not yet checked in — confirm before re-enabling.
func TestFullTokenizer(t *testing.T) {
	/*
		assert := assert.New(t)
		tok := LoadFomaFile("testdata/tokenizer.fst")
		dat := tok.ToDoubleArray()

		f, _ := os.Create("testdata/tokenizer.datok")
		gz := gzip.NewWriter(f)
		defer f.Close()
		dat.WriteTo(gz)
		assert.NotNil(gz)

		assert.True(dat.LoadFactor() >= 70)
		assert.True(dat.Match("bau"))
		assert.True(dat.Match("bad"))
		assert.True(dat.Match("wald gehen"))
	*/
}
Akron3f8571a2021-08-05 11:18:10 +0200127
// TestFullTokenizerTransduce is currently disabled: it expects the
// serialized testdata/tokenizer.datok produced by the (also disabled)
// TestFullTokenizer, and its draft body ends in a deliberate
// assert.Fail, marking it as work in progress.
func TestFullTokenizerTransduce(t *testing.T) {
	/*
		assert := assert.New(t)
		// tok := LoadFomaFile("testdata/tokenizer.fst")
		tok := LoadFomaFile("testdata/simpletok.fst")
		dat := tok.ToDoubleArray()

		dat := LoadDatokFile("testdata/tokenizer.datok")
		r := strings.NewReader("wald gehen! Da kann\t man was \"erleben\"!")
		assert.True(dat.Transduce(r, os.Stdout))

		assert.Fail("!")
	*/
}