blob: a27ae1c8767339d59a6de31985440df54fb4bd86 [file] [log] [blame]
Akrone396a932021-10-19 01:06:13 +02001package datok
2
3import (
4 "bytes"
Akron4f6b28c2021-10-25 00:52:03 +02005 "strings"
Akrone396a932021-10-19 01:06:13 +02006 "testing"
7
8 "github.com/stretchr/testify/assert"
9)
10
11func TestTokenWriterSimple(t *testing.T) {
12 assert := assert.New(t)
13
14 b := make([]byte, 0, 2048)
15 w := bytes.NewBuffer(b)
16
Akron96fdc9b2021-10-27 21:11:17 +020017 tws := NewTokenWriter(w, SIMPLE)
Akrone396a932021-10-19 01:06:13 +020018
19 assert.NotNil(tws)
20
21 tws.Token(0, []rune{'a', 'b', 'c'})
22
Akron32416ce2021-10-23 17:09:41 +020023 tws.Token(1, []rune{'d', 'e', 'f'})
Akrone396a932021-10-19 01:06:13 +020024
Akrona854faa2021-10-22 19:31:08 +020025 tws.SentenceEnd(0)
26
27 tws.TextEnd(0)
Akrone396a932021-10-19 01:06:13 +020028
29 tws.Flush()
30
Akron32416ce2021-10-23 17:09:41 +020031 assert.Equal("abc\nef\n\n\n", w.String())
Akrone396a932021-10-19 01:06:13 +020032}
Akron4f6b28c2021-10-25 00:52:03 +020033
34func TestTokenWriterFromOptions(t *testing.T) {
35 assert := assert.New(t)
36
Akron0f087ea2021-10-27 19:40:15 +020037 mat := LoadMatrixFile("testdata/tokenizer.matok")
38 assert.NotNil(mat)
39
Akron4f6b28c2021-10-25 00:52:03 +020040 b := make([]byte, 0, 2048)
41 w := bytes.NewBuffer(b)
42
Akron96fdc9b2021-10-27 21:11:17 +020043 tws := NewTokenWriter(w, TOKENS|SENTENCES|TOKEN_POS)
Akron4f6b28c2021-10-25 00:52:03 +020044
45 assert.True(mat.TransduceTokenWriter(
46 strings.NewReader("This.\x0a\x04And.\n\x04\n"), tws),
47 )
48
49 matStr := w.String()
50 assert.Equal("This\n.\n\n0 4 4 5\nAnd\n.\n\n0 3 3 4\n", matStr)
Akron8cc2dd92021-10-25 19:49:41 +020051
52 w.Reset()
53 mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)
54
55 matStr = w.String()
56 assert.Equal("This\n.\n\n1 5 5 6\nAnd\n.\n\n1 4 4 5\n", matStr)
57
Akrona9e0c422021-10-27 19:01:17 +020058 //
Akrone9431ec2021-10-25 21:35:33 +020059 // Accept newline after EOT
Akron96fdc9b2021-10-27 21:11:17 +020060 tws = NewTokenWriter(w, TOKENS|SENTENCES|TOKEN_POS|NEWLINE_AFTER_EOT)
Akrone9431ec2021-10-25 21:35:33 +020061
62 w.Reset()
63 mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)
64
65 matStr = w.String()
66 assert.Equal("This\n.\n\n1 5 5 6\nAnd\n.\n\n0 3 3 4\n", matStr)
Akrona9e0c422021-10-27 19:01:17 +020067
68 //
69 // Write no tokens
Akron96fdc9b2021-10-27 21:11:17 +020070 tws = NewTokenWriter(w, SENTENCES|TOKEN_POS|NEWLINE_AFTER_EOT)
Akrona9e0c422021-10-27 19:01:17 +020071
72 w.Reset()
73 mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)
74
75 matStr = w.String()
76 assert.Equal("\n1 5 5 6\n\n0 3 3 4\n", matStr)
Akronfceddb62021-10-27 19:27:54 +020077
78 //
Akron0f087ea2021-10-27 19:40:15 +020079 // Write sentence offsets
Akron96fdc9b2021-10-27 21:11:17 +020080 tws = NewTokenWriter(w, TOKEN_POS|SENTENCE_POS|NEWLINE_AFTER_EOT)
Akronfceddb62021-10-27 19:27:54 +020081
82 w.Reset()
83 mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)
84
85 matStr = w.String()
86 assert.Equal("1 5 5 6\n1 6\n0 3 3 4\n0 4\n", matStr)
Akron7035d2e2021-10-28 00:54:01 +020087
88 //
89 // Write sentence offsets without token offsets
90 tws = NewTokenWriter(w, SENTENCE_POS|NEWLINE_AFTER_EOT)
91
92 w.Reset()
93 mat.TransduceTokenWriter(strings.NewReader("\nThis.\x0a\x04\nAnd.\n\x04\n"), tws)
94
95 matStr = w.String()
96 assert.Equal("1 6\n0 4\n", matStr)
Akron4f6b28c2021-10-25 00:52:03 +020097}