Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 1 | package datok |
| 2 | |
| 3 | import ( |
| 4 | "bytes" |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 5 | "strings" |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 6 | "testing" |
| 7 | |
| 8 | "github.com/stretchr/testify/assert" |
| 9 | ) |
| 10 | |
| 11 | func TestTokenWriterSimple(t *testing.T) { |
| 12 | assert := assert.New(t) |
| 13 | |
| 14 | b := make([]byte, 0, 2048) |
| 15 | w := bytes.NewBuffer(b) |
| 16 | |
Akron | 96fdc9b | 2021-10-27 21:11:17 +0200 | [diff] [blame] | 17 | tws := NewTokenWriter(w, SIMPLE) |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 18 | |
| 19 | assert.NotNil(tws) |
| 20 | |
| 21 | tws.Token(0, []rune{'a', 'b', 'c'}) |
| 22 | |
Akron | 32416ce | 2021-10-23 17:09:41 +0200 | [diff] [blame] | 23 | tws.Token(1, []rune{'d', 'e', 'f'}) |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 24 | |
Akron | a854faa | 2021-10-22 19:31:08 +0200 | [diff] [blame] | 25 | tws.SentenceEnd(0) |
| 26 | |
| 27 | tws.TextEnd(0) |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 28 | |
| 29 | tws.Flush() |
| 30 | |
Akron | 32416ce | 2021-10-23 17:09:41 +0200 | [diff] [blame] | 31 | assert.Equal("abc\nef\n\n\n", w.String()) |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 32 | } |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 33 | |
| 34 | func TestTokenWriterFromOptions(t *testing.T) { |
| 35 | assert := assert.New(t) |
| 36 | |
Akron | 0139bc5 | 2023-08-31 16:35:58 +0200 | [diff] [blame^] | 37 | mat := LoadMatrixFile("testdata/tokenizer_de.matok") |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 38 | assert.NotNil(mat) |
| 39 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 40 | b := make([]byte, 0, 2048) |
| 41 | w := bytes.NewBuffer(b) |
| 42 | |
Akron | 96fdc9b | 2021-10-27 21:11:17 +0200 | [diff] [blame] | 43 | tws := NewTokenWriter(w, TOKENS|SENTENCES|TOKEN_POS) |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 44 | |
| 45 | assert.True(mat.TransduceTokenWriter( |
| 46 | strings.NewReader("This.\x0a\x04And.\n\x04\n"), tws), |
| 47 | ) |
| 48 | |
| 49 | matStr := w.String() |
| 50 | assert.Equal("This\n.\n\n0 4 4 5\nAnd\n.\n\n0 3 3 4\n", matStr) |
Akron | 8cc2dd9 | 2021-10-25 19:49:41 +0200 | [diff] [blame] | 51 | |
| 52 | w.Reset() |
| 53 | mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws) |
| 54 | |
| 55 | matStr = w.String() |
| 56 | assert.Equal("This\n.\n\n1 5 5 6\nAnd\n.\n\n1 4 4 5\n", matStr) |
| 57 | |
Akron | a9e0c42 | 2021-10-27 19:01:17 +0200 | [diff] [blame] | 58 | // |
Akron | e9431ec | 2021-10-25 21:35:33 +0200 | [diff] [blame] | 59 | // Accept newline after EOT |
Akron | 96fdc9b | 2021-10-27 21:11:17 +0200 | [diff] [blame] | 60 | tws = NewTokenWriter(w, TOKENS|SENTENCES|TOKEN_POS|NEWLINE_AFTER_EOT) |
Akron | e9431ec | 2021-10-25 21:35:33 +0200 | [diff] [blame] | 61 | |
| 62 | w.Reset() |
| 63 | mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws) |
| 64 | |
| 65 | matStr = w.String() |
| 66 | assert.Equal("This\n.\n\n1 5 5 6\nAnd\n.\n\n0 3 3 4\n", matStr) |
Akron | a9e0c42 | 2021-10-27 19:01:17 +0200 | [diff] [blame] | 67 | |
| 68 | // |
| 69 | // Write no tokens |
Akron | 96fdc9b | 2021-10-27 21:11:17 +0200 | [diff] [blame] | 70 | tws = NewTokenWriter(w, SENTENCES|TOKEN_POS|NEWLINE_AFTER_EOT) |
Akron | a9e0c42 | 2021-10-27 19:01:17 +0200 | [diff] [blame] | 71 | |
| 72 | w.Reset() |
| 73 | mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws) |
| 74 | |
| 75 | matStr = w.String() |
| 76 | assert.Equal("\n1 5 5 6\n\n0 3 3 4\n", matStr) |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 77 | |
| 78 | // |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 79 | // Write sentence offsets |
Akron | 96fdc9b | 2021-10-27 21:11:17 +0200 | [diff] [blame] | 80 | tws = NewTokenWriter(w, TOKEN_POS|SENTENCE_POS|NEWLINE_AFTER_EOT) |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 81 | |
| 82 | w.Reset() |
| 83 | mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws) |
| 84 | |
| 85 | matStr = w.String() |
| 86 | assert.Equal("1 5 5 6\n1 6\n0 3 3 4\n0 4\n", matStr) |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 87 | |
| 88 | // |
| 89 | // Write sentence offsets without token offsets |
| 90 | tws = NewTokenWriter(w, SENTENCE_POS|NEWLINE_AFTER_EOT) |
| 91 | |
| 92 | w.Reset() |
| 93 | mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws) |
| 94 | |
| 95 | matStr = w.String() |
| 96 | assert.Equal("1 6\n0 4\n", matStr) |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 97 | } |