blob: a977b8d4c4e09a3f1de22f12a5b1db5a54b2eae2 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3import (
Akron6247a5d2021-08-03 19:18:28 +02004 "bytes"
Akron3f8571a2021-08-05 11:18:10 +02005 "strings"
Akron8ef408b2021-08-02 22:11:04 +02006 "testing"
7
8 "github.com/stretchr/testify/assert"
9)
10
11func TestSimpleString(t *testing.T) {
12 assert := assert.New(t)
13
14 // bau | bauamt
Akron64ffd9a2021-08-03 19:55:21 +020015 tok := LoadFomaFile("testdata/bauamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020016 dat := tok.ToDoubleArray()
17 assert.True(dat.Match("bau"))
18 assert.True(dat.Match("bauamt"))
19 assert.False(dat.Match("baum"))
Akron8ef408b2021-08-02 22:11:04 +020020}
Akron75ebe7f2021-08-03 10:34:10 +020021
22func TestSimpleBranches(t *testing.T) {
23 assert := assert.New(t)
24
25 // (bau | wahl) (amt | en)
Akron64ffd9a2021-08-03 19:55:21 +020026 tok := LoadFomaFile("testdata/wahlamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020027 dat := tok.ToDoubleArray()
28 assert.False(dat.Match("bau"))
29 assert.True(dat.Match("bauamt"))
30 assert.True(dat.Match("wahlamt"))
31 assert.True(dat.Match("bauen"))
32 assert.True(dat.Match("wahlen"))
33 assert.False(dat.Match("baum"))
Akron75ebe7f2021-08-03 10:34:10 +020034}
Akron730a79c2021-08-03 11:05:29 +020035
36func TestSimpleTokenizer(t *testing.T) {
37 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020038 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020039 dat := tok.ToDoubleArray()
40 assert.True(dat.Match("bau"))
41 assert.True(dat.Match("bad"))
42 assert.True(dat.Match("wald gehen"))
Akron730a79c2021-08-03 11:05:29 +020043}
Akron740f3d72021-08-03 12:12:34 +020044
Akron068874c2021-08-04 15:19:56 +020045func TestSimpleTokenizerTransduce(t *testing.T) {
Akron84d68e62021-08-04 17:06:52 +020046 assert := assert.New(t)
47 tok := LoadFomaFile("testdata/simpletok.fst")
Akron84d68e62021-08-04 17:06:52 +020048 dat := tok.ToDoubleArray()
Akron3f8571a2021-08-05 11:18:10 +020049
50 r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
51 b := make([]byte, 0, 2048)
52 w := bytes.NewBuffer(b)
Akron524c5432021-08-05 14:14:27 +020053 var tokens []string
Akron524c5432021-08-05 14:14:27 +020054 dat.Transduce(r, w)
55 tokens = strings.Split(w.String(), "\n")
Akron3f8571a2021-08-05 11:18:10 +020056 assert.Equal("wald", tokens[0])
57 assert.Equal("gehen", tokens[1])
58 assert.Equal("Da", tokens[2])
59 assert.Equal("kann", tokens[3])
60 assert.Equal("man", tokens[4])
61 assert.Equal("was", tokens[5])
62 assert.Equal("\"erleben\"", tokens[6])
63
Akron524c5432021-08-05 14:14:27 +020064 r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
65 w.Reset()
66 dat.Transduce(r, w)
67 tokens = strings.Split(w.String(), "\n")
68 assert.Equal("In", tokens[0])
69 assert.Equal("den", tokens[1])
70 assert.Equal("Wald", tokens[2])
71 assert.Equal("gehen", tokens[3])
72 assert.Equal("?", tokens[4])
73 assert.Equal("--", tokens[5])
Akron3f8571a2021-08-05 11:18:10 +020074
Akron524c5432021-08-05 14:14:27 +020075 r = strings.NewReader(" g? -- D")
76 w.Reset()
77 dat.Transduce(r, w)
78 tokens = strings.Split(w.String(), "\n")
79 assert.Equal("g", tokens[0])
80 assert.Equal("?", tokens[1])
81 assert.Equal("--", tokens[2])
82 assert.Equal("D", tokens[3])
83 assert.Equal("", tokens[4])
84 assert.Equal("", tokens[5])
85 assert.Equal(6, len(tokens))
Akron068874c2021-08-04 15:19:56 +020086}
87
Akron3f8571a2021-08-05 11:18:10 +020088func TestReadWriteTokenizer(t *testing.T) {
Akron740f3d72021-08-03 12:12:34 +020089 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020090 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020091 dat := tok.ToDoubleArray()
Akronf2120ca2021-08-03 16:26:41 +020092 assert.True(dat.Match("bau"))
93 assert.True(dat.Match("bad"))
94 assert.True(dat.Match("wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +020095
Akron03a3c612021-08-04 11:51:27 +020096 assert.True(dat.LoadFactor() >= 70)
Akron6247a5d2021-08-03 19:18:28 +020097
Akron3f8571a2021-08-05 11:18:10 +020098 b := make([]byte, 0, 1024)
Akron6247a5d2021-08-03 19:18:28 +020099 buf := bytes.NewBuffer(b)
100 n, err := dat.WriteTo(buf)
101 assert.Nil(err)
Akron3a063ef2021-08-05 19:36:35 +0200102 assert.Equal(int64(208), n)
Akron3f8571a2021-08-05 11:18:10 +0200103
104 dat2 := ParseDatok(buf)
105 assert.NotNil(dat2)
106 assert.Equal(dat.array, dat2.array)
107 assert.Equal(dat.sigma, dat2.sigma)
108 assert.Equal(dat.epsilon, dat2.epsilon)
109 assert.Equal(dat.unknown, dat2.unknown)
110 assert.Equal(dat.identity, dat2.identity)
111 assert.Equal(dat.final, dat2.final)
112 assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
113 assert.True(dat2.Match("bau"))
114 assert.True(dat2.Match("bad"))
115 assert.True(dat2.Match("wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +0200116}
117
118func TestFullTokenizer(t *testing.T) {
Akron3a063ef2021-08-05 19:36:35 +0200119 assert := assert.New(t)
Akron2a4b9292021-08-04 15:35:22 +0200120 /*
Akron2a4b9292021-08-04 15:35:22 +0200121 tok := LoadFomaFile("testdata/tokenizer.fst")
122 dat := tok.ToDoubleArray()
Akron3a063ef2021-08-05 19:36:35 +0200123 dat.Save("testdata/tokenizer.datok")
Akron2a4b9292021-08-04 15:35:22 +0200124 */
Akron3a063ef2021-08-05 19:36:35 +0200125 dat := LoadDatokFile("testdata/tokenizer.datok")
126 assert.NotNil(dat)
127 assert.True(dat.LoadFactor() >= 70)
128 assert.Equal(dat.epsilon, 1)
129 assert.Equal(dat.unknown, 2)
130 assert.Equal(dat.identity, 3)
131 assert.Equal(dat.final, 135)
132 assert.Equal(len(dat.sigma), 131)
Akronb4bbb472021-08-09 11:49:38 +0200133 assert.Equal(len(dat.array), 3771624)
134 assert.Equal(dat.maxSize, 3771623)
Akron3a063ef2021-08-05 19:36:35 +0200135
136 assert.True(dat.Match("bau"))
137 assert.True(dat.Match("bad"))
138 assert.True(dat.Match("wald gehen"))
Akron740f3d72021-08-03 12:12:34 +0200139}
Akron3f8571a2021-08-05 11:18:10 +0200140
141func TestFullTokenizerTransduce(t *testing.T) {
Akron3610f102021-08-08 14:13:25 +0200142 assert := assert.New(t)
143
Akron3f8571a2021-08-05 11:18:10 +0200144 /*
Akron3610f102021-08-08 14:13:25 +0200145 tok := LoadFomaFile("testdata/tokenizer.fst")
146 dat := tok.ToDoubleArray()
Akronb4bbb472021-08-09 11:49:38 +0200147 dat.Save("testdata/tokenizer.datok")
Akron3f8571a2021-08-05 11:18:10 +0200148 */
Akron3610f102021-08-08 14:13:25 +0200149 dat := LoadDatokFile("testdata/tokenizer.datok")
150
151 assert.NotNil(dat)
152
153 r := strings.NewReader("tra. und Du?")
154
155 b := make([]byte, 0, 2048)
156 w := bytes.NewBuffer(b)
157 var tokens []string
158
159 assert.True(dat.Transduce(r, w))
160
161 tokens = strings.Split(w.String(), "\n")
162 assert.Equal("tra", tokens[0])
163 assert.Equal(".", tokens[1])
Akronb4bbb472021-08-09 11:49:38 +0200164 assert.Equal("", tokens[2])
165 assert.Equal("und", tokens[3])
166 assert.Equal("Du", tokens[4])
167 assert.Equal("?", tokens[5])
Akron3610f102021-08-08 14:13:25 +0200168 assert.Equal("", tokens[6])
Akronb4bbb472021-08-09 11:49:38 +0200169 assert.Equal("", tokens[7])
170 assert.Equal("", tokens[8])
171 assert.Equal(9, len(tokens))
Akron3f8571a2021-08-05 11:18:10 +0200172}