Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 1 | package datokenizer |
| 2 | |
| 3 | import ( |
Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 4 | "bytes" |
Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame] | 5 | "strings" |
Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 6 | "testing" |
| 7 | |
| 8 | "github.com/stretchr/testify/assert" |
| 9 | ) |
| 10 | |
| 11 | func TestSimpleString(t *testing.T) { |
| 12 | assert := assert.New(t) |
| 13 | |
| 14 | // bau | bauamt |
Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame] | 15 | tok := LoadFomaFile("testdata/bauamt.fst") |
Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 16 | dat := tok.ToDoubleArray() |
| 17 | assert.True(dat.Match("bau")) |
| 18 | assert.True(dat.Match("bauamt")) |
| 19 | assert.False(dat.Match("baum")) |
Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 20 | } |
Akron | 75ebe7f | 2021-08-03 10:34:10 +0200 | [diff] [blame] | 21 | |
| 22 | func TestSimpleBranches(t *testing.T) { |
| 23 | assert := assert.New(t) |
| 24 | |
| 25 | // (bau | wahl) (amt | en) |
Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame] | 26 | tok := LoadFomaFile("testdata/wahlamt.fst") |
Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 27 | dat := tok.ToDoubleArray() |
| 28 | assert.False(dat.Match("bau")) |
| 29 | assert.True(dat.Match("bauamt")) |
| 30 | assert.True(dat.Match("wahlamt")) |
| 31 | assert.True(dat.Match("bauen")) |
| 32 | assert.True(dat.Match("wahlen")) |
| 33 | assert.False(dat.Match("baum")) |
Akron | 75ebe7f | 2021-08-03 10:34:10 +0200 | [diff] [blame] | 34 | } |
Akron | 730a79c | 2021-08-03 11:05:29 +0200 | [diff] [blame] | 35 | |
| 36 | func TestSimpleTokenizer(t *testing.T) { |
| 37 | assert := assert.New(t) |
Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame] | 38 | tok := LoadFomaFile("testdata/simpletok.fst") |
Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 39 | dat := tok.ToDoubleArray() |
| 40 | assert.True(dat.Match("bau")) |
| 41 | assert.True(dat.Match("bad")) |
| 42 | assert.True(dat.Match("wald gehen")) |
Akron | 730a79c | 2021-08-03 11:05:29 +0200 | [diff] [blame] | 43 | } |
Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 44 | |
Akron | 068874c | 2021-08-04 15:19:56 +0200 | [diff] [blame] | 45 | func TestSimpleTokenizerTransduce(t *testing.T) { |
Akron | 84d68e6 | 2021-08-04 17:06:52 +0200 | [diff] [blame] | 46 | assert := assert.New(t) |
| 47 | tok := LoadFomaFile("testdata/simpletok.fst") |
Akron | 84d68e6 | 2021-08-04 17:06:52 +0200 | [diff] [blame] | 48 | dat := tok.ToDoubleArray() |
Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame] | 49 | |
| 50 | r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!") |
| 51 | b := make([]byte, 0, 2048) |
| 52 | w := bytes.NewBuffer(b) |
Akron | 524c543 | 2021-08-05 14:14:27 +0200 | [diff] [blame] | 53 | var tokens []string |
Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame] | 54 | |
Akron | 524c543 | 2021-08-05 14:14:27 +0200 | [diff] [blame] | 55 | dat.Transduce(r, w) |
| 56 | tokens = strings.Split(w.String(), "\n") |
Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame] | 57 | assert.Equal("wald", tokens[0]) |
| 58 | assert.Equal("gehen", tokens[1]) |
| 59 | assert.Equal("Da", tokens[2]) |
| 60 | assert.Equal("kann", tokens[3]) |
| 61 | assert.Equal("man", tokens[4]) |
| 62 | assert.Equal("was", tokens[5]) |
| 63 | assert.Equal("\"erleben\"", tokens[6]) |
| 64 | |
Akron | 524c543 | 2021-08-05 14:14:27 +0200 | [diff] [blame] | 65 | r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!") |
| 66 | w.Reset() |
| 67 | dat.Transduce(r, w) |
| 68 | tokens = strings.Split(w.String(), "\n") |
| 69 | assert.Equal("In", tokens[0]) |
| 70 | assert.Equal("den", tokens[1]) |
| 71 | assert.Equal("Wald", tokens[2]) |
| 72 | assert.Equal("gehen", tokens[3]) |
| 73 | assert.Equal("?", tokens[4]) |
| 74 | assert.Equal("--", tokens[5]) |
Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame] | 75 | |
Akron | 524c543 | 2021-08-05 14:14:27 +0200 | [diff] [blame] | 76 | r = strings.NewReader(" g? -- D") |
| 77 | w.Reset() |
| 78 | dat.Transduce(r, w) |
| 79 | tokens = strings.Split(w.String(), "\n") |
| 80 | assert.Equal("g", tokens[0]) |
| 81 | assert.Equal("?", tokens[1]) |
| 82 | assert.Equal("--", tokens[2]) |
| 83 | assert.Equal("D", tokens[3]) |
| 84 | assert.Equal("", tokens[4]) |
| 85 | assert.Equal("", tokens[5]) |
| 86 | assert.Equal(6, len(tokens)) |
Akron | 068874c | 2021-08-04 15:19:56 +0200 | [diff] [blame] | 87 | } |
| 88 | |
Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame] | 89 | func TestReadWriteTokenizer(t *testing.T) { |
Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 90 | assert := assert.New(t) |
Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame] | 91 | tok := LoadFomaFile("testdata/simpletok.fst") |
Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 92 | dat := tok.ToDoubleArray() |
Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 93 | assert.True(dat.Match("bau")) |
| 94 | assert.True(dat.Match("bad")) |
| 95 | assert.True(dat.Match("wald gehen")) |
Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 96 | |
Akron | 03a3c61 | 2021-08-04 11:51:27 +0200 | [diff] [blame] | 97 | assert.True(dat.LoadFactor() >= 70) |
Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 98 | |
Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame] | 99 | b := make([]byte, 0, 1024) |
Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 100 | buf := bytes.NewBuffer(b) |
| 101 | n, err := dat.WriteTo(buf) |
| 102 | assert.Nil(err) |
Akron | 3a063ef | 2021-08-05 19:36:35 +0200 | [diff] [blame^] | 103 | assert.Equal(int64(208), n) |
Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame] | 104 | |
| 105 | dat2 := ParseDatok(buf) |
| 106 | assert.NotNil(dat2) |
| 107 | assert.Equal(dat.array, dat2.array) |
| 108 | assert.Equal(dat.sigma, dat2.sigma) |
| 109 | assert.Equal(dat.epsilon, dat2.epsilon) |
| 110 | assert.Equal(dat.unknown, dat2.unknown) |
| 111 | assert.Equal(dat.identity, dat2.identity) |
| 112 | assert.Equal(dat.final, dat2.final) |
| 113 | assert.Equal(dat.LoadFactor(), dat2.LoadFactor()) |
| 114 | assert.True(dat2.Match("bau")) |
| 115 | assert.True(dat2.Match("bad")) |
| 116 | assert.True(dat2.Match("wald gehen")) |
Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 117 | } |
| 118 | |
| 119 | func TestFullTokenizer(t *testing.T) { |
Akron | 3a063ef | 2021-08-05 19:36:35 +0200 | [diff] [blame^] | 120 | assert := assert.New(t) |
Akron | 2a4b929 | 2021-08-04 15:35:22 +0200 | [diff] [blame] | 121 | /* |
Akron | 2a4b929 | 2021-08-04 15:35:22 +0200 | [diff] [blame] | 122 | tok := LoadFomaFile("testdata/tokenizer.fst") |
| 123 | dat := tok.ToDoubleArray() |
Akron | 3a063ef | 2021-08-05 19:36:35 +0200 | [diff] [blame^] | 124 | dat.Save("testdata/tokenizer.datok") |
Akron | 2a4b929 | 2021-08-04 15:35:22 +0200 | [diff] [blame] | 125 | */ |
Akron | 3a063ef | 2021-08-05 19:36:35 +0200 | [diff] [blame^] | 126 | dat := LoadDatokFile("testdata/tokenizer.datok") |
| 127 | assert.NotNil(dat) |
| 128 | assert.True(dat.LoadFactor() >= 70) |
| 129 | assert.Equal(dat.epsilon, 1) |
| 130 | assert.Equal(dat.unknown, 2) |
| 131 | assert.Equal(dat.identity, 3) |
| 132 | assert.Equal(dat.final, 135) |
| 133 | assert.Equal(len(dat.sigma), 131) |
| 134 | assert.Equal(len(dat.array), 3771904) |
| 135 | assert.Equal(dat.maxSize, 3771903) |
| 136 | |
| 137 | assert.True(dat.Match("bau")) |
| 138 | assert.True(dat.Match("bad")) |
| 139 | assert.True(dat.Match("wald gehen")) |
Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 140 | } |
Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame] | 141 | |
// TestFullTokenizerTransduce is currently disabled; it requires the
// large testdata/tokenizer.datok snapshot and writes to stdout.
// FIX: the disabled code previously declared dat twice (once via
// LoadDatokFile before the NotNil check and once again below), which
// would not compile when re-enabled; it also needs the "os" import.
func TestFullTokenizerTransduce(t *testing.T) {
	/*
		assert := assert.New(t)
		dat := LoadDatokFile("testdata/tokenizer.datok")
		assert.NotNil(dat)

		r := strings.NewReader("wald gehen! Da kann\t man was \"erleben\"!")
		assert.True(dat.Transduce(r, os.Stdout))

		assert.Fail("!")
	*/
}