blob: 692112e895d3fa3dc0b2b0895fe067398f704d14 [file] [log] [blame]
Akron8ef408b2021-08-02 22:11:04 +02001package datokenizer
2
3import (
Akron6247a5d2021-08-03 19:18:28 +02004 "bytes"
Akron3f8571a2021-08-05 11:18:10 +02005 "strings"
Akron8ef408b2021-08-02 22:11:04 +02006 "testing"
7
8 "github.com/stretchr/testify/assert"
9)
10
11func TestSimpleString(t *testing.T) {
12 assert := assert.New(t)
13
14 // bau | bauamt
Akron64ffd9a2021-08-03 19:55:21 +020015 tok := LoadFomaFile("testdata/bauamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020016 dat := tok.ToDoubleArray()
17 assert.True(dat.Match("bau"))
18 assert.True(dat.Match("bauamt"))
19 assert.False(dat.Match("baum"))
Akron8ef408b2021-08-02 22:11:04 +020020}
Akron75ebe7f2021-08-03 10:34:10 +020021
22func TestSimpleBranches(t *testing.T) {
23 assert := assert.New(t)
24
25 // (bau | wahl) (amt | en)
Akron64ffd9a2021-08-03 19:55:21 +020026 tok := LoadFomaFile("testdata/wahlamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020027 dat := tok.ToDoubleArray()
28 assert.False(dat.Match("bau"))
29 assert.True(dat.Match("bauamt"))
30 assert.True(dat.Match("wahlamt"))
31 assert.True(dat.Match("bauen"))
32 assert.True(dat.Match("wahlen"))
33 assert.False(dat.Match("baum"))
Akron75ebe7f2021-08-03 10:34:10 +020034}
Akron730a79c2021-08-03 11:05:29 +020035
36func TestSimpleTokenizer(t *testing.T) {
37 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020038 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020039 dat := tok.ToDoubleArray()
40 assert.True(dat.Match("bau"))
41 assert.True(dat.Match("bad"))
42 assert.True(dat.Match("wald gehen"))
Akron730a79c2021-08-03 11:05:29 +020043}
Akron740f3d72021-08-03 12:12:34 +020044
Akron068874c2021-08-04 15:19:56 +020045func TestSimpleTokenizerTransduce(t *testing.T) {
Akron84d68e62021-08-04 17:06:52 +020046 assert := assert.New(t)
47 tok := LoadFomaFile("testdata/simpletok.fst")
Akron84d68e62021-08-04 17:06:52 +020048 dat := tok.ToDoubleArray()
Akron3f8571a2021-08-05 11:18:10 +020049
50 r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
51 b := make([]byte, 0, 2048)
52 w := bytes.NewBuffer(b)
Akron524c5432021-08-05 14:14:27 +020053 var tokens []string
Akron3f8571a2021-08-05 11:18:10 +020054
Akron524c5432021-08-05 14:14:27 +020055 dat.Transduce(r, w)
56 tokens = strings.Split(w.String(), "\n")
Akron3f8571a2021-08-05 11:18:10 +020057 assert.Equal("wald", tokens[0])
58 assert.Equal("gehen", tokens[1])
59 assert.Equal("Da", tokens[2])
60 assert.Equal("kann", tokens[3])
61 assert.Equal("man", tokens[4])
62 assert.Equal("was", tokens[5])
63 assert.Equal("\"erleben\"", tokens[6])
64
Akron524c5432021-08-05 14:14:27 +020065 r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
66 w.Reset()
67 dat.Transduce(r, w)
68 tokens = strings.Split(w.String(), "\n")
69 assert.Equal("In", tokens[0])
70 assert.Equal("den", tokens[1])
71 assert.Equal("Wald", tokens[2])
72 assert.Equal("gehen", tokens[3])
73 assert.Equal("?", tokens[4])
74 assert.Equal("--", tokens[5])
Akron3f8571a2021-08-05 11:18:10 +020075
Akron524c5432021-08-05 14:14:27 +020076 r = strings.NewReader(" g? -- D")
77 w.Reset()
78 dat.Transduce(r, w)
79 tokens = strings.Split(w.String(), "\n")
80 assert.Equal("g", tokens[0])
81 assert.Equal("?", tokens[1])
82 assert.Equal("--", tokens[2])
83 assert.Equal("D", tokens[3])
84 assert.Equal("", tokens[4])
85 assert.Equal("", tokens[5])
86 assert.Equal(6, len(tokens))
Akron068874c2021-08-04 15:19:56 +020087}
88
Akron3f8571a2021-08-05 11:18:10 +020089func TestReadWriteTokenizer(t *testing.T) {
Akron740f3d72021-08-03 12:12:34 +020090 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020091 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020092 dat := tok.ToDoubleArray()
Akronf2120ca2021-08-03 16:26:41 +020093 assert.True(dat.Match("bau"))
94 assert.True(dat.Match("bad"))
95 assert.True(dat.Match("wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +020096
Akron03a3c612021-08-04 11:51:27 +020097 assert.True(dat.LoadFactor() >= 70)
Akron6247a5d2021-08-03 19:18:28 +020098
Akron3f8571a2021-08-05 11:18:10 +020099 b := make([]byte, 0, 1024)
Akron6247a5d2021-08-03 19:18:28 +0200100 buf := bytes.NewBuffer(b)
101 n, err := dat.WriteTo(buf)
102 assert.Nil(err)
Akron3a063ef2021-08-05 19:36:35 +0200103 assert.Equal(int64(208), n)
Akron3f8571a2021-08-05 11:18:10 +0200104
105 dat2 := ParseDatok(buf)
106 assert.NotNil(dat2)
107 assert.Equal(dat.array, dat2.array)
108 assert.Equal(dat.sigma, dat2.sigma)
109 assert.Equal(dat.epsilon, dat2.epsilon)
110 assert.Equal(dat.unknown, dat2.unknown)
111 assert.Equal(dat.identity, dat2.identity)
112 assert.Equal(dat.final, dat2.final)
113 assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
114 assert.True(dat2.Match("bau"))
115 assert.True(dat2.Match("bad"))
116 assert.True(dat2.Match("wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +0200117}
118
119func TestFullTokenizer(t *testing.T) {
Akron3a063ef2021-08-05 19:36:35 +0200120 assert := assert.New(t)
Akron2a4b9292021-08-04 15:35:22 +0200121 /*
Akron2a4b9292021-08-04 15:35:22 +0200122 tok := LoadFomaFile("testdata/tokenizer.fst")
123 dat := tok.ToDoubleArray()
Akron3a063ef2021-08-05 19:36:35 +0200124 dat.Save("testdata/tokenizer.datok")
Akron2a4b9292021-08-04 15:35:22 +0200125 */
Akron3a063ef2021-08-05 19:36:35 +0200126 dat := LoadDatokFile("testdata/tokenizer.datok")
127 assert.NotNil(dat)
128 assert.True(dat.LoadFactor() >= 70)
129 assert.Equal(dat.epsilon, 1)
130 assert.Equal(dat.unknown, 2)
131 assert.Equal(dat.identity, 3)
132 assert.Equal(dat.final, 135)
133 assert.Equal(len(dat.sigma), 131)
134 assert.Equal(len(dat.array), 3771904)
135 assert.Equal(dat.maxSize, 3771903)
136
137 assert.True(dat.Match("bau"))
138 assert.True(dat.Match("bad"))
139 assert.True(dat.Match("wald gehen"))
Akron740f3d72021-08-03 12:12:34 +0200140}
Akron3f8571a2021-08-05 11:18:10 +0200141
// TestFullTokenizerTransduce is currently a placeholder: its whole body is
// commented out, so the test runs no assertions.
func TestFullTokenizerTransduce(t *testing.T) {
	// NOTE(review): the disabled snippet below cannot be re-enabled as is.
	// It declares dat twice with := in the same scope, writes to os.Stdout
	// although os is not among the imports visible in this file, and ends
	// with an unconditional assert.Fail.
	/*
		assert := assert.New(t)
		dat := LoadDatokFile("testdata/tokenizer.datok")
		assert.NotNil(dat)

		dat := LoadDatokFile("testdata/tokenizer.datok")
		r := strings.NewReader("wald gehen! Da kann\t man was \"erleben\"!")
		assert.True(dat.Transduce(r, os.Stdout))

		assert.Fail("!")
	*/
}