blob: e861b0875a5522facf69d702ee1064f4f003c36a [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3import (
Akron6247a5d2021-08-03 19:18:28 +02004 "bytes"
Akron3f8571a2021-08-05 11:18:10 +02005 "strings"
Akron8ef408b2021-08-02 22:11:04 +02006 "testing"
7
8 "github.com/stretchr/testify/assert"
9)
10
11func TestSimpleString(t *testing.T) {
12 assert := assert.New(t)
13
14 // bau | bauamt
Akron64ffd9a2021-08-03 19:55:21 +020015 tok := LoadFomaFile("testdata/bauamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020016 dat := tok.ToDoubleArray()
17 assert.True(dat.Match("bau"))
18 assert.True(dat.Match("bauamt"))
19 assert.False(dat.Match("baum"))
Akron8ef408b2021-08-02 22:11:04 +020020}
Akron75ebe7f2021-08-03 10:34:10 +020021
22func TestSimpleBranches(t *testing.T) {
23 assert := assert.New(t)
24
25 // (bau | wahl) (amt | en)
Akron64ffd9a2021-08-03 19:55:21 +020026 tok := LoadFomaFile("testdata/wahlamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020027 dat := tok.ToDoubleArray()
28 assert.False(dat.Match("bau"))
29 assert.True(dat.Match("bauamt"))
30 assert.True(dat.Match("wahlamt"))
31 assert.True(dat.Match("bauen"))
32 assert.True(dat.Match("wahlen"))
33 assert.False(dat.Match("baum"))
Akron75ebe7f2021-08-03 10:34:10 +020034}
Akron730a79c2021-08-03 11:05:29 +020035
36func TestSimpleTokenizer(t *testing.T) {
37 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020038 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020039 dat := tok.ToDoubleArray()
40 assert.True(dat.Match("bau"))
41 assert.True(dat.Match("bad"))
42 assert.True(dat.Match("wald gehen"))
Akron730a79c2021-08-03 11:05:29 +020043}
Akron740f3d72021-08-03 12:12:34 +020044
Akron068874c2021-08-04 15:19:56 +020045func TestSimpleTokenizerTransduce(t *testing.T) {
Akron84d68e62021-08-04 17:06:52 +020046 assert := assert.New(t)
47 tok := LoadFomaFile("testdata/simpletok.fst")
Akron84d68e62021-08-04 17:06:52 +020048 dat := tok.ToDoubleArray()
Akron3f8571a2021-08-05 11:18:10 +020049
50 r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
51 b := make([]byte, 0, 2048)
52 w := bytes.NewBuffer(b)
Akron524c5432021-08-05 14:14:27 +020053 var tokens []string
Akron524c5432021-08-05 14:14:27 +020054 dat.Transduce(r, w)
55 tokens = strings.Split(w.String(), "\n")
Akron3f8571a2021-08-05 11:18:10 +020056 assert.Equal("wald", tokens[0])
57 assert.Equal("gehen", tokens[1])
58 assert.Equal("Da", tokens[2])
59 assert.Equal("kann", tokens[3])
60 assert.Equal("man", tokens[4])
61 assert.Equal("was", tokens[5])
62 assert.Equal("\"erleben\"", tokens[6])
63
Akron524c5432021-08-05 14:14:27 +020064 r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
65 w.Reset()
66 dat.Transduce(r, w)
67 tokens = strings.Split(w.String(), "\n")
68 assert.Equal("In", tokens[0])
69 assert.Equal("den", tokens[1])
70 assert.Equal("Wald", tokens[2])
71 assert.Equal("gehen", tokens[3])
72 assert.Equal("?", tokens[4])
73 assert.Equal("--", tokens[5])
Akron3f8571a2021-08-05 11:18:10 +020074
Akron524c5432021-08-05 14:14:27 +020075 r = strings.NewReader(" g? -- D")
76 w.Reset()
77 dat.Transduce(r, w)
78 tokens = strings.Split(w.String(), "\n")
79 assert.Equal("g", tokens[0])
80 assert.Equal("?", tokens[1])
81 assert.Equal("--", tokens[2])
82 assert.Equal("D", tokens[3])
83 assert.Equal("", tokens[4])
Akrondf0a3ef2021-08-09 15:53:45 +020084 assert.Equal(5, len(tokens))
Akron068874c2021-08-04 15:19:56 +020085}
86
Akron3f8571a2021-08-05 11:18:10 +020087func TestReadWriteTokenizer(t *testing.T) {
Akron740f3d72021-08-03 12:12:34 +020088 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020089 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020090 dat := tok.ToDoubleArray()
Akronf2120ca2021-08-03 16:26:41 +020091 assert.True(dat.Match("bau"))
92 assert.True(dat.Match("bad"))
93 assert.True(dat.Match("wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +020094
Akron03a3c612021-08-04 11:51:27 +020095 assert.True(dat.LoadFactor() >= 70)
Akron6247a5d2021-08-03 19:18:28 +020096
Akron3f8571a2021-08-05 11:18:10 +020097 b := make([]byte, 0, 1024)
Akron6247a5d2021-08-03 19:18:28 +020098 buf := bytes.NewBuffer(b)
99 n, err := dat.WriteTo(buf)
100 assert.Nil(err)
Akron03c92fe2021-08-09 14:07:57 +0200101 assert.Equal(int64(224), n)
Akron3f8571a2021-08-05 11:18:10 +0200102
103 dat2 := ParseDatok(buf)
104 assert.NotNil(dat2)
105 assert.Equal(dat.array, dat2.array)
106 assert.Equal(dat.sigma, dat2.sigma)
107 assert.Equal(dat.epsilon, dat2.epsilon)
108 assert.Equal(dat.unknown, dat2.unknown)
109 assert.Equal(dat.identity, dat2.identity)
110 assert.Equal(dat.final, dat2.final)
111 assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
112 assert.True(dat2.Match("bau"))
113 assert.True(dat2.Match("bad"))
114 assert.True(dat2.Match("wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +0200115}
116
117func TestFullTokenizer(t *testing.T) {
Akron3a063ef2021-08-05 19:36:35 +0200118 assert := assert.New(t)
Akron2a4b9292021-08-04 15:35:22 +0200119 /*
Akron2a4b9292021-08-04 15:35:22 +0200120 tok := LoadFomaFile("testdata/tokenizer.fst")
121 dat := tok.ToDoubleArray()
Akron3a063ef2021-08-05 19:36:35 +0200122 dat.Save("testdata/tokenizer.datok")
Akron2a4b9292021-08-04 15:35:22 +0200123 */
Akron3a063ef2021-08-05 19:36:35 +0200124 dat := LoadDatokFile("testdata/tokenizer.datok")
125 assert.NotNil(dat)
126 assert.True(dat.LoadFactor() >= 70)
127 assert.Equal(dat.epsilon, 1)
128 assert.Equal(dat.unknown, 2)
129 assert.Equal(dat.identity, 3)
Akron03c92fe2021-08-09 14:07:57 +0200130 assert.Equal(dat.final, 136)
Akron3a063ef2021-08-05 19:36:35 +0200131 assert.Equal(len(dat.sigma), 131)
Akron03c92fe2021-08-09 14:07:57 +0200132 assert.Equal(len(dat.array), 3806280)
133 assert.Equal(dat.maxSize, 3806279)
Akron3a063ef2021-08-05 19:36:35 +0200134
135 assert.True(dat.Match("bau"))
136 assert.True(dat.Match("bad"))
137 assert.True(dat.Match("wald gehen"))
Akron740f3d72021-08-03 12:12:34 +0200138}
Akron3f8571a2021-08-05 11:18:10 +0200139
140func TestFullTokenizerTransduce(t *testing.T) {
Akron3610f102021-08-08 14:13:25 +0200141 assert := assert.New(t)
142
Akron03c92fe2021-08-09 14:07:57 +0200143 var dat *DaTokenizer
Akron3610f102021-08-08 14:13:25 +0200144
Akron03c92fe2021-08-09 14:07:57 +0200145 if false {
146 tok := LoadFomaFile("testdata/tokenizer.fst")
147 dat = tok.ToDoubleArray()
Akron439f4ec2021-08-09 15:45:38 +0200148 // dat.Save("testdata/tokenizer.datok")
Akron03c92fe2021-08-09 14:07:57 +0200149 } else {
150 dat = LoadDatokFile("testdata/tokenizer.datok")
151 }
Akron3610f102021-08-08 14:13:25 +0200152 assert.NotNil(dat)
153
154 r := strings.NewReader("tra. und Du?")
155
156 b := make([]byte, 0, 2048)
157 w := bytes.NewBuffer(b)
158 var tokens []string
159
160 assert.True(dat.Transduce(r, w))
161
162 tokens = strings.Split(w.String(), "\n")
163 assert.Equal("tra", tokens[0])
164 assert.Equal(".", tokens[1])
Akronb4bbb472021-08-09 11:49:38 +0200165 assert.Equal("", tokens[2])
166 assert.Equal("und", tokens[3])
167 assert.Equal("Du", tokens[4])
168 assert.Equal("?", tokens[5])
Akron3610f102021-08-08 14:13:25 +0200169 assert.Equal("", tokens[6])
Akronb4bbb472021-08-09 11:49:38 +0200170 assert.Equal("", tokens[7])
Akrondf0a3ef2021-08-09 15:53:45 +0200171 assert.Equal(8, len(tokens))
Akron3f8571a2021-08-05 11:18:10 +0200172}
Akronb7e1f132021-08-10 11:52:31 +0200173
174func TestFullTokenizerSentenceSplitter(t *testing.T) {
175 assert := assert.New(t)
176 dat := LoadDatokFile("testdata/tokenizer.datok")
177 assert.NotNil(dat)
178
179 b := make([]byte, 0, 2048)
180 w := bytes.NewBuffer(b)
181 var sentences []string
182
183 // testSentSplitterSimple
184 r := strings.NewReader("Mann.")
185 assert.True(dat.Transduce(r, w))
186 sentences = strings.Split(w.String(), "\n\n")
187 assert.Equal(len(sentences), 1)
188}