package datokenizer

import (
	"bytes"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

func TestSimpleString(t *testing.T) {
	assert := assert.New(t)

	// bau | bauamt
	tok := LoadFomaFile("testdata/bauamt.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bauamt"))
	assert.False(dat.Match("baum"))
}

func TestSimpleBranches(t *testing.T) {
	assert := assert.New(t)

	// (bau | wahl) (amt | en)
	tok := LoadFomaFile("testdata/wahlamt.fst")
	dat := tok.ToDoubleArray()
	assert.False(dat.Match("bau"))
	assert.True(dat.Match("bauamt"))
	assert.True(dat.Match("wahlamt"))
	assert.True(dat.Match("bauen"))
	assert.True(dat.Match("wahlen"))
	assert.False(dat.Match("baum"))
}

func TestSimpleTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))
}

func TestSimpleTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()

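	// Transduce writes the token stream to w, one token per line,
	// as the splits below assume.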
	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal(5, len(tokens))
}

func TestReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))

	assert.True(dat.LoadFactor() >= 70)

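	// Serialize the double array into a buffer and parse it back;
	// the copy should match the original field by field.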
	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := dat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(224), n)

	dat2 := ParseDatok(buf)
	assert.NotNil(dat2)
	assert.Equal(dat.array, dat2.array)
	assert.Equal(dat.sigma, dat2.sigma)
	assert.Equal(dat.epsilon, dat2.epsilon)
	assert.Equal(dat.unknown, dat2.unknown)
	assert.Equal(dat.identity, dat2.identity)
	assert.Equal(dat.final, dat2.final)
	assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
	assert.True(dat2.Match("bau"))
	assert.True(dat2.Match("bad"))
	assert.True(dat2.Match("wald gehen"))
}

func TestFullTokenizer(t *testing.T) {
	assert := assert.New(t)
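	// Kept commented out: regenerates testdata/tokenizer.datok
	// from the foma source.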
	/*
		tok := LoadFomaFile("testdata/tokenizer.fst")
		dat := tok.ToDoubleArray()
		dat.Save("testdata/tokenizer.datok")
	*/
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)
	assert.True(dat.LoadFactor() >= 70)
	assert.Equal(1, dat.epsilon)
	assert.Equal(2, dat.unknown)
	assert.Equal(3, dat.identity)
	assert.Equal(136, dat.final)
	assert.Equal(131, len(dat.sigma))
	assert.Equal(3806280, len(dat.array))
	assert.Equal(3806279, dat.maxSize)

	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))
}

func TestFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	var dat *DaTokenizer

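	// Development toggle: flip the condition to true to rebuild the
	// double array from the foma file instead of loading the
	// precompiled binary.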
	if false {
		tok := LoadFomaFile("testdata/tokenizer.fst")
		dat = tok.ToDoubleArray()
		// dat.Save("testdata/tokenizer.datok")
	} else {
		dat = LoadDatokFile("testdata/tokenizer.datok")
	}
	assert.NotNil(dat)

	r := strings.NewReader("tra. u Du?")

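	// Reading the assertions below: the empty token after "." seems
	// to mark a sentence boundary in the output stream.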
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(dat.Transduce(r, w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	// assert.Equal("", tokens[7])
	assert.Equal(7, len(tokens))
}

func TestFullTokenizerSentenceSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

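	// Sentences appear as segments separated by an empty line,
	// i.e. "\n\n", in the transducer output.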
	// testSentSplitterSimple
	r := strings.NewReader("Mann.")
	assert.True(dat.Transduce(r, w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(1, len(sentences))
}