Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 1 | package datokenizer |
| 2 | |
| 3 | import ( |
Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 4 | "bytes" |
Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame^] | 5 | "strings" |
Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 6 | "testing" |
| 7 | |
| 8 | "github.com/stretchr/testify/assert" |
| 9 | ) |
| 10 | |
| 11 | func TestSimpleString(t *testing.T) { |
| 12 | assert := assert.New(t) |
| 13 | |
| 14 | // bau | bauamt |
Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame] | 15 | tok := LoadFomaFile("testdata/bauamt.fst") |
Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 16 | dat := tok.ToDoubleArray() |
| 17 | assert.True(dat.Match("bau")) |
| 18 | assert.True(dat.Match("bauamt")) |
| 19 | assert.False(dat.Match("baum")) |
Akron | 8ef408b | 2021-08-02 22:11:04 +0200 | [diff] [blame] | 20 | } |
Akron | 75ebe7f | 2021-08-03 10:34:10 +0200 | [diff] [blame] | 21 | |
| 22 | func TestSimpleBranches(t *testing.T) { |
| 23 | assert := assert.New(t) |
| 24 | |
| 25 | // (bau | wahl) (amt | en) |
Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame] | 26 | tok := LoadFomaFile("testdata/wahlamt.fst") |
Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 27 | dat := tok.ToDoubleArray() |
| 28 | assert.False(dat.Match("bau")) |
| 29 | assert.True(dat.Match("bauamt")) |
| 30 | assert.True(dat.Match("wahlamt")) |
| 31 | assert.True(dat.Match("bauen")) |
| 32 | assert.True(dat.Match("wahlen")) |
| 33 | assert.False(dat.Match("baum")) |
Akron | 75ebe7f | 2021-08-03 10:34:10 +0200 | [diff] [blame] | 34 | } |
Akron | 730a79c | 2021-08-03 11:05:29 +0200 | [diff] [blame] | 35 | |
| 36 | func TestSimpleTokenizer(t *testing.T) { |
| 37 | assert := assert.New(t) |
Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame] | 38 | tok := LoadFomaFile("testdata/simpletok.fst") |
Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 39 | dat := tok.ToDoubleArray() |
| 40 | assert.True(dat.Match("bau")) |
| 41 | assert.True(dat.Match("bad")) |
| 42 | assert.True(dat.Match("wald gehen")) |
Akron | 730a79c | 2021-08-03 11:05:29 +0200 | [diff] [blame] | 43 | } |
Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 44 | |
Akron | 068874c | 2021-08-04 15:19:56 +0200 | [diff] [blame] | 45 | func TestSimpleTokenizerTransduce(t *testing.T) { |
Akron | 84d68e6 | 2021-08-04 17:06:52 +0200 | [diff] [blame] | 46 | assert := assert.New(t) |
| 47 | tok := LoadFomaFile("testdata/simpletok.fst") |
Akron | 84d68e6 | 2021-08-04 17:06:52 +0200 | [diff] [blame] | 48 | dat := tok.ToDoubleArray() |
Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame^] | 49 | |
| 50 | r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!") |
| 51 | b := make([]byte, 0, 2048) |
| 52 | w := bytes.NewBuffer(b) |
| 53 | dat.Transduce(r, w) |
| 54 | |
| 55 | tokens := strings.Split(string(w.Bytes()), "\n") |
| 56 | assert.Equal("wald", tokens[0]) |
| 57 | assert.Equal("gehen", tokens[1]) |
| 58 | assert.Equal("Da", tokens[2]) |
| 59 | assert.Equal("kann", tokens[3]) |
| 60 | assert.Equal("man", tokens[4]) |
| 61 | assert.Equal("was", tokens[5]) |
| 62 | assert.Equal("\"erleben\"", tokens[6]) |
| 63 | |
| 64 | /* |
| 65 | r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!") |
| 66 | w.Reset() |
| 67 | dat.Transduce(r, w) |
| 68 | |
| 69 | tokens = strings.Split(string(w.Bytes()), "\n") |
| 70 | assert.Equal("In", tokens[0]) |
| 71 | assert.Equal("den", tokens[1]) |
| 72 | assert.Equal("Wald", tokens[2]) |
| 73 | assert.Equal("gehen", tokens[3]) |
| 74 | assert.Equal("?", tokens[4]) |
| 75 | assert.Equal("--", tokens[5]) |
| 76 | */ |
Akron | 068874c | 2021-08-04 15:19:56 +0200 | [diff] [blame] | 77 | } |
| 78 | |
Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame^] | 79 | func TestReadWriteTokenizer(t *testing.T) { |
Akron | 740f3d7 | 2021-08-03 12:12:34 +0200 | [diff] [blame] | 80 | assert := assert.New(t) |
Akron | 64ffd9a | 2021-08-03 19:55:21 +0200 | [diff] [blame] | 81 | tok := LoadFomaFile("testdata/simpletok.fst") |
Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 82 | dat := tok.ToDoubleArray() |
Akron | f2120ca | 2021-08-03 16:26:41 +0200 | [diff] [blame] | 83 | assert.True(dat.Match("bau")) |
| 84 | assert.True(dat.Match("bad")) |
| 85 | assert.True(dat.Match("wald gehen")) |
Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 86 | |
Akron | 03a3c61 | 2021-08-04 11:51:27 +0200 | [diff] [blame] | 87 | assert.True(dat.LoadFactor() >= 70) |
Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 88 | |
Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame^] | 89 | b := make([]byte, 0, 1024) |
Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 90 | buf := bytes.NewBuffer(b) |
| 91 | n, err := dat.WriteTo(buf) |
| 92 | assert.Nil(err) |
Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame^] | 93 | assert.Equal(int64(208), n) |
| 94 | |
| 95 | dat2 := ParseDatok(buf) |
| 96 | assert.NotNil(dat2) |
| 97 | assert.Equal(dat.array, dat2.array) |
| 98 | assert.Equal(dat.sigma, dat2.sigma) |
| 99 | assert.Equal(dat.epsilon, dat2.epsilon) |
| 100 | assert.Equal(dat.unknown, dat2.unknown) |
| 101 | assert.Equal(dat.identity, dat2.identity) |
| 102 | assert.Equal(dat.final, dat2.final) |
| 103 | assert.Equal(dat.LoadFactor(), dat2.LoadFactor()) |
| 104 | assert.True(dat2.Match("bau")) |
| 105 | assert.True(dat2.Match("bad")) |
| 106 | assert.True(dat2.Match("wald gehen")) |
Akron | 6247a5d | 2021-08-03 19:18:28 +0200 | [diff] [blame] | 107 | } |
| 108 | |
// TestFullTokenizer is currently disabled: its body is commented out.
// It builds the full tokenizer automaton and serializes it gzipped to
// testdata/tokenizer.datok. NOTE(review): it would need "os" and
// "compress/gzip" imports when re-enabled, and the gzip writer is
// never closed (only the file is) — confirm before re-enabling.
func TestFullTokenizer(t *testing.T) {
	/*
		assert := assert.New(t)
		tok := LoadFomaFile("testdata/tokenizer.fst")
		dat := tok.ToDoubleArray()

		f, _ := os.Create("testdata/tokenizer.datok")
		gz := gzip.NewWriter(f)
		defer f.Close()
		dat.WriteTo(gz)
		assert.NotNil(gz)

		assert.True(dat.LoadFactor() >= 70)
		assert.True(dat.Match("bau"))
		assert.True(dat.Match("bad"))
		assert.True(dat.Match("wald gehen"))
	*/
}
Akron | 3f8571a | 2021-08-05 11:18:10 +0200 | [diff] [blame^] | 127 | |
// TestFullTokenizerTransduce is currently disabled: its body is
// commented out. NOTE(review): the commented code declares `dat` twice
// with `:=` (once from ToDoubleArray, once from LoadDatokFile), which
// would not compile if re-enabled as-is — drop one of the two before
// re-enabling. It would also need the "os" import for os.Stdout.
func TestFullTokenizerTransduce(t *testing.T) {
	/*
		assert := assert.New(t)
		// tok := LoadFomaFile("testdata/tokenizer.fst")
		tok := LoadFomaFile("testdata/simpletok.fst")
		dat := tok.ToDoubleArray()

		dat := LoadDatokFile("testdata/tokenizer.datok")
		r := strings.NewReader("wald gehen! Da kann\t man was \"erleben\"!")
		assert.True(dat.Transduce(r, os.Stdout))

		assert.Fail("!")
	*/
}