package datokenizer

import (
	"bytes"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

func TestSimpleString(t *testing.T) {
	assert := assert.New(t)

	// bau | bauamt
	tok := LoadFomaFile("testdata/bauamt.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bauamt"))
	assert.False(dat.Match("baum"))
}
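
// The following benchmark is not part of the original test file; it is a
// minimal sketch of how the Match lookups exercised above could be timed,
// assuming the same bauamt.fst fixture. Loading and conversion happen
// outside the measured loop.
func BenchmarkDoubleArrayMatch(b *testing.B) {
	dat := LoadFomaFile("testdata/bauamt.fst").ToDoubleArray()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Alternate between an accepted and a rejected string.
		dat.Match("bauamt")
		dat.Match("baum")
	}
}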

func TestSimpleBranches(t *testing.T) {
	assert := assert.New(t)

	// (bau | wahl) (amt | en)
	tok := LoadFomaFile("testdata/wahlamt.fst")
	dat := tok.ToDoubleArray()
	assert.False(dat.Match("bau"))
	assert.True(dat.Match("bauamt"))
	assert.True(dat.Match("wahlamt"))
	assert.True(dat.Match("bauen"))
	assert.True(dat.Match("wahlen"))
	assert.False(dat.Match("baum"))
}

func TestSimpleTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))
}

func TestSimpleTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal(6, len(tokens))
}
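
// The benchmark below is likewise not part of the original suite; it is a
// minimal sketch of how the Transduce call used in the test above could be
// timed, assuming the simpletok.fst fixture. The output buffer is reused
// across iterations so only transduction itself is measured.
func BenchmarkSimpleTransduce(b *testing.B) {
	dat := LoadFomaFile("testdata/simpletok.fst").ToDoubleArray()
	input := " In den Wald gehen? -- Da kann\t man was \"erleben\"!"
	w := bytes.NewBuffer(make([]byte, 0, 2048))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		w.Reset()
		dat.Transduce(strings.NewReader(input), w)
	}
}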

func TestReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))

	assert.True(dat.LoadFactor() >= 70)

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := dat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(218), n)

	dat2 := ParseDatok(buf)
	assert.NotNil(dat2)
	assert.Equal(dat.array, dat2.array)
	assert.Equal(dat.sigma, dat2.sigma)
	assert.Equal(dat.epsilon, dat2.epsilon)
	assert.Equal(dat.unknown, dat2.unknown)
	assert.Equal(dat.identity, dat2.identity)
	assert.Equal(dat.final, dat2.final)
	assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
	assert.True(dat2.Match("bau"))
	assert.True(dat2.Match("bad"))
	assert.True(dat2.Match("wald gehen"))
}
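
// Again not part of the original file: a minimal sketch of a serialization
// round trip, assuming the same fixture and the WriteTo/ParseDatok pair
// exercised in TestReadWriteTokenizer above. The parsed result is discarded;
// only the round-trip cost is of interest here.
func BenchmarkWriteParseRoundtrip(b *testing.B) {
	dat := LoadFomaFile("testdata/simpletok.fst").ToDoubleArray()
	buf := bytes.NewBuffer(make([]byte, 0, 1024))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		buf.Reset()
		if _, err := dat.WriteTo(buf); err != nil {
			b.Fatal(err)
		}
		ParseDatok(buf)
	}
}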

func TestFullTokenizer(t *testing.T) {
	/*
		assert := assert.New(t)
		tok := LoadFomaFile("testdata/tokenizer.fst")
		dat := tok.ToDoubleArray()

		f, _ := os.Create("testdata/tokenizer.datok")
		gz := gzip.NewWriter(f)
		defer f.Close()
		dat.WriteTo(gz)
		assert.NotNil(gz)

		assert.True(dat.LoadFactor() >= 70)
		assert.True(dat.Match("bau"))
		assert.True(dat.Match("bad"))
		assert.True(dat.Match("wald gehen"))
	*/
}

func TestFullTokenizerTransduce(t *testing.T) {
	/*
		assert := assert.New(t)

		// Either convert a foma transducer on the fly ...
		// tok := LoadFomaFile("testdata/tokenizer.fst")
		// tok := LoadFomaFile("testdata/simpletok.fst")
		// dat := tok.ToDoubleArray()

		// ... or load the precompiled double array.
		dat := LoadDatokFile("testdata/tokenizer.datok")
		r := strings.NewReader("wald gehen! Da kann\t man was \"erleben\"!")
		assert.True(dat.Transduce(r, os.Stdout))

		assert.Fail("!")
	*/
}