blob: 601da0eb03bdbfb80b2a24600b56eada57ca9fc2 [file] [log] [blame]
Akrone396a932021-10-19 01:06:13 +02001package datok
2
3import (
4 "bytes"
Akron4f6b28c2021-10-25 00:52:03 +02005 "strings"
Akrone396a932021-10-19 01:06:13 +02006 "testing"
7
8 "github.com/stretchr/testify/assert"
9)
10
11func TestTokenWriterSimple(t *testing.T) {
12 assert := assert.New(t)
13
14 b := make([]byte, 0, 2048)
15 w := bytes.NewBuffer(b)
16
Akron4f6b28c2021-10-25 00:52:03 +020017 tws := NewTokenWriter(w)
Akrone396a932021-10-19 01:06:13 +020018
19 assert.NotNil(tws)
20
21 tws.Token(0, []rune{'a', 'b', 'c'})
22
Akron32416ce2021-10-23 17:09:41 +020023 tws.Token(1, []rune{'d', 'e', 'f'})
Akrone396a932021-10-19 01:06:13 +020024
Akrona854faa2021-10-22 19:31:08 +020025 tws.SentenceEnd(0)
26
27 tws.TextEnd(0)
Akrone396a932021-10-19 01:06:13 +020028
29 tws.Flush()
30
Akron32416ce2021-10-23 17:09:41 +020031 assert.Equal("abc\nef\n\n\n", w.String())
Akrone396a932021-10-19 01:06:13 +020032}
Akron4f6b28c2021-10-25 00:52:03 +020033
34func TestTokenWriterFromOptions(t *testing.T) {
35 assert := assert.New(t)
36
37 b := make([]byte, 0, 2048)
38 w := bytes.NewBuffer(b)
39
Akronfceddb62021-10-27 19:27:54 +020040 tws := NewTokenWriterFromOptions(w, true, true, true, false, false)
Akron4f6b28c2021-10-25 00:52:03 +020041
42 mat := LoadMatrixFile("testdata/tokenizer.matok")
43
44 assert.NotNil(mat)
45
46 assert.True(mat.TransduceTokenWriter(
47 strings.NewReader("This.\x0a\x04And.\n\x04\n"), tws),
48 )
49
50 matStr := w.String()
51 assert.Equal("This\n.\n\n0 4 4 5\nAnd\n.\n\n0 3 3 4\n", matStr)
Akron8cc2dd92021-10-25 19:49:41 +020052
53 w.Reset()
54 mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)
55
56 matStr = w.String()
57 assert.Equal("This\n.\n\n1 5 5 6\nAnd\n.\n\n1 4 4 5\n", matStr)
58
Akrona9e0c422021-10-27 19:01:17 +020059 //
Akrone9431ec2021-10-25 21:35:33 +020060 // Accept newline after EOT
Akronfceddb62021-10-27 19:27:54 +020061 tws = NewTokenWriterFromOptions(w, true, true, true, false, true)
Akrone9431ec2021-10-25 21:35:33 +020062
63 w.Reset()
64 mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)
65
66 matStr = w.String()
67 assert.Equal("This\n.\n\n1 5 5 6\nAnd\n.\n\n0 3 3 4\n", matStr)
Akrona9e0c422021-10-27 19:01:17 +020068
69 //
70 // Write no tokens
Akronfceddb62021-10-27 19:27:54 +020071 tws = NewTokenWriterFromOptions(w, true, false, true, false, true)
Akrona9e0c422021-10-27 19:01:17 +020072
73 w.Reset()
74 mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)
75
76 matStr = w.String()
77 assert.Equal("\n1 5 5 6\n\n0 3 3 4\n", matStr)
Akronfceddb62021-10-27 19:27:54 +020078
79 //
80 // Write sentences
81 tws = NewTokenWriterFromOptions(w, true, false, false, true, true)
82
83 w.Reset()
84 mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)
85
86 matStr = w.String()
87 assert.Equal("1 5 5 6\n1 6\n0 3 3 4\n0 4\n", matStr)
Akron4f6b28c2021-10-25 00:52:03 +020088}