blob: cee2eb95446236de8421e8bc436d3bb6b4182bde [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3import (
Akron6247a5d2021-08-03 19:18:28 +02004 "bytes"
Akron3f8571a2021-08-05 11:18:10 +02005 "strings"
Akron8ef408b2021-08-02 22:11:04 +02006 "testing"
7
8 "github.com/stretchr/testify/assert"
9)
10
11func TestSimpleString(t *testing.T) {
12 assert := assert.New(t)
13
14 // bau | bauamt
Akron64ffd9a2021-08-03 19:55:21 +020015 tok := LoadFomaFile("testdata/bauamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020016 dat := tok.ToDoubleArray()
17 assert.True(dat.Match("bau"))
18 assert.True(dat.Match("bauamt"))
19 assert.False(dat.Match("baum"))
Akron8ef408b2021-08-02 22:11:04 +020020}
Akron75ebe7f2021-08-03 10:34:10 +020021
22func TestSimpleBranches(t *testing.T) {
23 assert := assert.New(t)
24
25 // (bau | wahl) (amt | en)
Akron64ffd9a2021-08-03 19:55:21 +020026 tok := LoadFomaFile("testdata/wahlamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020027 dat := tok.ToDoubleArray()
28 assert.False(dat.Match("bau"))
29 assert.True(dat.Match("bauamt"))
30 assert.True(dat.Match("wahlamt"))
31 assert.True(dat.Match("bauen"))
32 assert.True(dat.Match("wahlen"))
33 assert.False(dat.Match("baum"))
Akron75ebe7f2021-08-03 10:34:10 +020034}
Akron730a79c2021-08-03 11:05:29 +020035
36func TestSimpleTokenizer(t *testing.T) {
37 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020038 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020039 dat := tok.ToDoubleArray()
40 assert.True(dat.Match("bau"))
41 assert.True(dat.Match("bad"))
42 assert.True(dat.Match("wald gehen"))
Akron730a79c2021-08-03 11:05:29 +020043}
Akron740f3d72021-08-03 12:12:34 +020044
Akron068874c2021-08-04 15:19:56 +020045func TestSimpleTokenizerTransduce(t *testing.T) {
Akron84d68e62021-08-04 17:06:52 +020046 assert := assert.New(t)
47 tok := LoadFomaFile("testdata/simpletok.fst")
Akron84d68e62021-08-04 17:06:52 +020048 dat := tok.ToDoubleArray()
Akron3f8571a2021-08-05 11:18:10 +020049
50 r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
51 b := make([]byte, 0, 2048)
52 w := bytes.NewBuffer(b)
Akron524c5432021-08-05 14:14:27 +020053 var tokens []string
Akron524c5432021-08-05 14:14:27 +020054 dat.Transduce(r, w)
55 tokens = strings.Split(w.String(), "\n")
Akron3f8571a2021-08-05 11:18:10 +020056 assert.Equal("wald", tokens[0])
57 assert.Equal("gehen", tokens[1])
58 assert.Equal("Da", tokens[2])
59 assert.Equal("kann", tokens[3])
60 assert.Equal("man", tokens[4])
61 assert.Equal("was", tokens[5])
62 assert.Equal("\"erleben\"", tokens[6])
63
Akron524c5432021-08-05 14:14:27 +020064 r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
65 w.Reset()
66 dat.Transduce(r, w)
67 tokens = strings.Split(w.String(), "\n")
68 assert.Equal("In", tokens[0])
69 assert.Equal("den", tokens[1])
70 assert.Equal("Wald", tokens[2])
71 assert.Equal("gehen", tokens[3])
72 assert.Equal("?", tokens[4])
73 assert.Equal("--", tokens[5])
Akron3f8571a2021-08-05 11:18:10 +020074
Akron524c5432021-08-05 14:14:27 +020075 r = strings.NewReader(" g? -- D")
76 w.Reset()
77 dat.Transduce(r, w)
78 tokens = strings.Split(w.String(), "\n")
79 assert.Equal("g", tokens[0])
80 assert.Equal("?", tokens[1])
81 assert.Equal("--", tokens[2])
82 assert.Equal("D", tokens[3])
83 assert.Equal("", tokens[4])
84 assert.Equal("", tokens[5])
85 assert.Equal(6, len(tokens))
Akron068874c2021-08-04 15:19:56 +020086}
87
Akron3f8571a2021-08-05 11:18:10 +020088func TestReadWriteTokenizer(t *testing.T) {
Akron740f3d72021-08-03 12:12:34 +020089 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020090 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020091 dat := tok.ToDoubleArray()
Akronf2120ca2021-08-03 16:26:41 +020092 assert.True(dat.Match("bau"))
93 assert.True(dat.Match("bad"))
94 assert.True(dat.Match("wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +020095
Akron03a3c612021-08-04 11:51:27 +020096 assert.True(dat.LoadFactor() >= 70)
Akron6247a5d2021-08-03 19:18:28 +020097
Akron3f8571a2021-08-05 11:18:10 +020098 b := make([]byte, 0, 1024)
Akron6247a5d2021-08-03 19:18:28 +020099 buf := bytes.NewBuffer(b)
100 n, err := dat.WriteTo(buf)
101 assert.Nil(err)
Akron03c92fe2021-08-09 14:07:57 +0200102 assert.Equal(int64(224), n)
Akron3f8571a2021-08-05 11:18:10 +0200103
104 dat2 := ParseDatok(buf)
105 assert.NotNil(dat2)
106 assert.Equal(dat.array, dat2.array)
107 assert.Equal(dat.sigma, dat2.sigma)
108 assert.Equal(dat.epsilon, dat2.epsilon)
109 assert.Equal(dat.unknown, dat2.unknown)
110 assert.Equal(dat.identity, dat2.identity)
111 assert.Equal(dat.final, dat2.final)
112 assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
113 assert.True(dat2.Match("bau"))
114 assert.True(dat2.Match("bad"))
115 assert.True(dat2.Match("wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +0200116}
117
118func TestFullTokenizer(t *testing.T) {
Akron3a063ef2021-08-05 19:36:35 +0200119 assert := assert.New(t)
Akron2a4b9292021-08-04 15:35:22 +0200120 /*
Akron2a4b9292021-08-04 15:35:22 +0200121 tok := LoadFomaFile("testdata/tokenizer.fst")
122 dat := tok.ToDoubleArray()
Akron3a063ef2021-08-05 19:36:35 +0200123 dat.Save("testdata/tokenizer.datok")
Akron2a4b9292021-08-04 15:35:22 +0200124 */
Akron3a063ef2021-08-05 19:36:35 +0200125 dat := LoadDatokFile("testdata/tokenizer.datok")
126 assert.NotNil(dat)
127 assert.True(dat.LoadFactor() >= 70)
128 assert.Equal(dat.epsilon, 1)
129 assert.Equal(dat.unknown, 2)
130 assert.Equal(dat.identity, 3)
Akron03c92fe2021-08-09 14:07:57 +0200131 assert.Equal(dat.final, 136)
Akron3a063ef2021-08-05 19:36:35 +0200132 assert.Equal(len(dat.sigma), 131)
Akron03c92fe2021-08-09 14:07:57 +0200133 assert.Equal(len(dat.array), 3806280)
134 assert.Equal(dat.maxSize, 3806279)
Akron3a063ef2021-08-05 19:36:35 +0200135
136 assert.True(dat.Match("bau"))
137 assert.True(dat.Match("bad"))
138 assert.True(dat.Match("wald gehen"))
Akron740f3d72021-08-03 12:12:34 +0200139}
Akron3f8571a2021-08-05 11:18:10 +0200140
141func TestFullTokenizerTransduce(t *testing.T) {
Akron3610f102021-08-08 14:13:25 +0200142 assert := assert.New(t)
143
Akron03c92fe2021-08-09 14:07:57 +0200144 var dat *DaTokenizer
Akron3610f102021-08-08 14:13:25 +0200145
Akron03c92fe2021-08-09 14:07:57 +0200146 if false {
147 tok := LoadFomaFile("testdata/tokenizer.fst")
148 dat = tok.ToDoubleArray()
149 dat.Save("testdata/tokenizer.datok")
150 } else {
151 dat = LoadDatokFile("testdata/tokenizer.datok")
152 }
Akron3610f102021-08-08 14:13:25 +0200153 assert.NotNil(dat)
154
155 r := strings.NewReader("tra. und Du?")
156
157 b := make([]byte, 0, 2048)
158 w := bytes.NewBuffer(b)
159 var tokens []string
160
161 assert.True(dat.Transduce(r, w))
162
163 tokens = strings.Split(w.String(), "\n")
164 assert.Equal("tra", tokens[0])
165 assert.Equal(".", tokens[1])
Akronb4bbb472021-08-09 11:49:38 +0200166 assert.Equal("", tokens[2])
167 assert.Equal("und", tokens[3])
168 assert.Equal("Du", tokens[4])
169 assert.Equal("?", tokens[5])
Akron3610f102021-08-08 14:13:25 +0200170 assert.Equal("", tokens[6])
Akronb4bbb472021-08-09 11:49:38 +0200171 assert.Equal("", tokens[7])
172 assert.Equal("", tokens[8])
173 assert.Equal(9, len(tokens))
Akron3f8571a2021-08-05 11:18:10 +0200174}