package datokenizer

import (
	"bytes"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

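// TestSimpleString matches single words against a double array built
// from a minimal acceptor for (bau | bauamt).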
func TestSimpleString(t *testing.T) {
	assert := assert.New(t)

	// bau | bauamt
	tok := LoadFomaFile("testdata/bauamt.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bauamt"))
	assert.False(dat.Match("baum"))
}

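// TestSimpleBranches matches against an acceptor with two branching
// paths, (bau | wahl) (amt | en).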
func TestSimpleBranches(t *testing.T) {
	assert := assert.New(t)

	// (bau | wahl) (amt | en)
	tok := LoadFomaFile("testdata/wahlamt.fst")
	dat := tok.ToDoubleArray()
	assert.False(dat.Match("bau"))
	assert.True(dat.Match("bauamt"))
	assert.True(dat.Match("wahlamt"))
	assert.True(dat.Match("bauen"))
	assert.True(dat.Match("wahlen"))
	assert.False(dat.Match("baum"))
}

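// TestSimpleTokenizer checks that the simple tokenizer FST, converted
// to a double array, accepts single words and multi-word strings.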
func TestSimpleTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))
}

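// TestSimpleTokenizerTransduce transduces short inputs and checks the
// newline-separated token output, including whitespace handling,
// punctuation splitting, and the trailing empty element after the
// final newline.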
func TestSimpleTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal(5, len(tokens))
}

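// TestReadWriteTokenizer serializes a double array with WriteTo,
// reparses it with ParseDatok, and checks that all fields and the
// matching behavior survive the round trip.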
func TestReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))

	assert.True(dat.LoadFactor() >= 70)

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := dat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(224), n)

	dat2 := ParseDatok(buf)
	assert.NotNil(dat2)
	assert.Equal(dat.array, dat2.array)
	assert.Equal(dat.sigma, dat2.sigma)
	assert.Equal(dat.epsilon, dat2.epsilon)
	assert.Equal(dat.unknown, dat2.unknown)
	assert.Equal(dat.identity, dat2.identity)
	assert.Equal(dat.final, dat2.final)
	assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
	assert.True(dat2.Match("bau"))
	assert.True(dat2.Match("bad"))
	assert.True(dat2.Match("wald gehen"))
}

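// TestFullTokenizer loads the precompiled full tokenizer and checks
// its metadata (special transition symbols, sigma and array sizes) as
// well as basic matching. The commented-out block regenerates
// testdata/tokenizer.datok from the foma source.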
func TestFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	/*
		tok := LoadFomaFile("testdata/tokenizer.fst")
		dat := tok.ToDoubleArray()
		dat.Save("testdata/tokenizer.datok")
	*/
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)
	assert.True(dat.LoadFactor() >= 70)
	assert.Equal(dat.epsilon, 1)
	assert.Equal(dat.unknown, 2)
	assert.Equal(dat.identity, 3)
	assert.Equal(dat.final, 136)
	assert.Equal(len(dat.sigma), 131)
	assert.Equal(len(dat.array), 3806280)
	assert.Equal(dat.maxSize, 3806279)

	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))
}

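// TestFullTokenizerTransduce tokenizes a short string with the full
// tokenizer; tokens are newline-separated and sentence boundaries are
// marked by an additional empty line.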
func TestFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	var dat *DaTokenizer

	if false {
		tok := LoadFomaFile("testdata/tokenizer.fst")
		dat = tok.ToDoubleArray()
		// dat.Save("testdata/tokenizer.datok")
	} else {
		dat = LoadDatokFile("testdata/tokenizer.datok")
	}
	assert.NotNil(dat)

	r := strings.NewReader("tra. u Du?")

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(dat.Transduce(r, w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(8, len(tokens))
}

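// TestFullTokenizerSentenceSplitter checks end-of-sentence detection
// by splitting the transducer output on double newlines. Abbreviations
// ("Abk."), domains ("wikipedia.org.") and email addresses must not
// introduce a boundary mid-sentence; the commented-out cases (URLs, IP
// addresses, percentages, dates, file names, repeated punctuation) are
// currently disabled.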
func TestFullTokenizerSentenceSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 1)
	assert.Equal("", sentences[0])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	/*
		w.Reset()
		assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
		sentences = strings.Split(w.String(), "\n\n")
		assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum\n", sentences[0])
		assert.Equal(len(sentences), 1)

		w.Reset()
		assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
		sentences = strings.Split(w.String(), "\n\n")
		assert.Equal(len(sentences), 1)

		w.Reset()
		assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
		sentences = strings.Split(w.String(), "\n\n")
		assert.Equal(len(sentences), 1)

		w.Reset()
		assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
		sentences = strings.Split(w.String(), "\n\n")
		assert.Equal(len(sentences), 1)

		w.Reset()
		assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
		sentences = strings.Split(w.String(), "\n\n")
		assert.Equal(len(sentences), 1)
		assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen\n", sentences[0])

		w.Reset()
		assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
		sentences = strings.Split(w.String(), "\n\n")
		assert.Equal(len(sentences), 2)
		assert.Equal("Ausschalten\n!!!", sentences[0])
		assert.Equal("Hast\nDu\nnicht\ngehört\n???\n", sentences[1])
	*/

	/*
		w.Reset()
		assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
		sentences = strings.Split(w.String(), "\n\n")
		assert.Equal(len(sentences), 1)
	*/

	/*
		Test:
		"\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	*/
}
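
// tokenize is a minimal convenience sketch, not part of the original
// suite, showing how the Transduce output format checked above is
// meant to be consumed: tokens arrive newline-separated and an empty
// element marks a sentence boundary. The helper name and signature are
// illustrative assumptions, not an API of this package. For example,
// tokenize(dat, "Der alte Mann.") yields
// []string{"Der", "alte", "Mann", ".", ""}.
func tokenize(dat *DaTokenizer, input string) []string {
	w := bytes.NewBuffer(make([]byte, 0, 2048))
	if !dat.Transduce(strings.NewReader(input), w) {
		return nil
	}
	// Split on newlines and drop the final empty string that follows
	// the trailing newline of the transducer output.
	tokens := strings.Split(w.String(), "\n")
	return tokens[:len(tokens)-1]
}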