blob: 9c095067f4908cb6886b339eb575ae375bbe8570 [file] [log] [blame]
Akrone396a932021-10-19 01:06:13 +02001package datok
2
3import (
4 "bufio"
5 "io"
Akron4f6b28c2021-10-25 00:52:03 +02006 "strconv"
Akrone396a932021-10-19 01:06:13 +02007)
8
// Bits is a bit set of output options understood by NewTokenWriter.
// Individual options are combined with the bitwise OR operator.
type Bits uint8

// TODO-Perf:
//   - TokenWriter may support AvailableBuffer(), so tokens can be written
//     directly without a separate buffer. Copying from the same underlying
//     byte array is then a no-op (Go 1.18).

// Output options for NewTokenWriter.
const (
	// TOKENS writes every token on a line of its own.
	TOKENS Bits = 1 << 0

	// SENTENCES writes an additional newline at every sentence end.
	SENTENCES Bits = 1 << 1

	// TOKEN_POS writes, at the end of each text, one line with the
	// space-separated start and end positions of the collected tokens.
	TOKEN_POS Bits = 1 << 2

	// SENTENCE_POS writes, at the end of each text, one line with the
	// space-separated start and end positions of the collected sentences.
	SENTENCE_POS Bits = 1 << 3

	// NEWLINE_AFTER_EOT excludes a newline that leads the first token
	// after a text end from the position count.
	NEWLINE_AFTER_EOT Bits = 1 << 4

	// SIMPLE is the combination of TOKENS and SENTENCES.
	SIMPLE Bits = TOKENS | SENTENCES
)
26
// TokenWriter is a set of callbacks through which tokenization events are
// reported. All fields are populated by NewTokenWriter.
type TokenWriter struct {
	// SentenceEnd is called at the end of a sentence and TextEnd at the
	// end of a text. The writers built by NewTokenWriter ignore the
	// integer argument.
	SentenceEnd, TextEnd func(int)

	// Flush writes any buffered output and reports the write error, if any.
	Flush func() error

	// Token is called with an offset and a rune buffer. The writers built
	// by NewTokenWriter treat buf[offset:] as the token text and the runes
	// before offset as skipped input that still counts for positions.
	Token func(int, []rune)

	// Fail func(int)
}
34
Akron4f6b28c2021-10-25 00:52:03 +020035// Create a new token writer based on the options
Akron96fdc9b2021-10-27 21:11:17 +020036func NewTokenWriter(w io.Writer, flags Bits) *TokenWriter {
Akron4f6b28c2021-10-25 00:52:03 +020037 writer := bufio.NewWriter(w)
38 posC := 0
Akronfceddb62021-10-27 19:27:54 +020039 pos := make([]int, 0, 1024)
40 sentB := true
41 sent := make([]int, 0, 1024)
Akron4a6e0ff2021-11-04 00:15:54 +010042 init := true
Akrone396a932021-10-19 01:06:13 +020043
Akron4f6b28c2021-10-25 00:52:03 +020044 tw := &TokenWriter{}
Akrone396a932021-10-19 01:06:13 +020045
Akrondf275812022-03-27 12:54:46 +020046 // tw.Fail = func(_ int) {}
47
Akron7035d2e2021-10-28 00:54:01 +020048 // Collect token positions and maybe tokens
49 if flags&(TOKEN_POS|SENTENCE_POS) != 0 {
50
51 // TODO:
52 // Split to
53 // - Token_pos+Tokens+Newline
54 // - Token_pos+Newline
55 // - Token_pos|Sentence_pos
56 // - Sentence_pos
57 // - Tokens
58
Akron4f6b28c2021-10-25 00:52:03 +020059 tw.Token = func(offset int, buf []rune) {
Akrona854faa2021-10-22 19:31:08 +020060
Akron4f6b28c2021-10-25 00:52:03 +020061 // TODO:
62 // Store in []uint16
63 // and write to string
Akron8cc2dd92021-10-25 19:49:41 +020064
Akrone9431ec2021-10-25 21:35:33 +020065 // Accept newline after EOT
Akrone87906b2021-11-24 10:39:14 +010066 if posC == 0 && flags&NEWLINE_AFTER_EOT != 0 && buf[0] == '\n' && !init {
Akrone9431ec2021-10-25 21:35:33 +020067 posC--
68 }
69
Akron4a6e0ff2021-11-04 00:15:54 +010070 init = false
71
Akron4f6b28c2021-10-25 00:52:03 +020072 posC += offset
73 pos = append(pos, posC)
Akronfceddb62021-10-27 19:27:54 +020074
75 // Token is the start of a sentence
76 if sentB {
77 sentB = false
78 sent = append(sent, posC)
79 }
Akron4f6b28c2021-10-25 00:52:03 +020080 posC += len(buf) - offset
81 pos = append(pos, posC)
Akrone396a932021-10-19 01:06:13 +020082
Akron7035d2e2021-10-28 00:54:01 +020083 // Collect tokens also
Akron0f087ea2021-10-27 19:40:15 +020084 if flags&TOKENS != 0 {
Akrona9e0c422021-10-27 19:01:17 +020085 writer.WriteString(string(buf[offset:]))
Akron7035d2e2021-10-28 00:54:01 +020086 writer.WriteByte('\n')
Akrona9e0c422021-10-27 19:01:17 +020087 }
Akron4f6b28c2021-10-25 00:52:03 +020088 }
Akronfceddb62021-10-27 19:27:54 +020089
Akron7035d2e2021-10-28 00:54:01 +020090 // Collect tokens
Akron96fdc9b2021-10-27 21:11:17 +020091 } else if flags&TOKENS != 0 {
Akron4f6b28c2021-10-25 00:52:03 +020092 tw.Token = func(offset int, buf []rune) {
93 writer.WriteString(string(buf[offset:]))
Akron7035d2e2021-10-28 00:54:01 +020094 writer.WriteByte('\n')
Akron4f6b28c2021-10-25 00:52:03 +020095 }
Akron7035d2e2021-10-28 00:54:01 +020096
97 // Ignore tokens
Akron96fdc9b2021-10-27 21:11:17 +020098 } else {
99 tw.Token = func(_ int, _ []rune) {}
Akron4f6b28c2021-10-25 00:52:03 +0200100 }
101
Akron7035d2e2021-10-28 00:54:01 +0200102 // Collect sentence positions and maybe sentence boundaries
Akron96fdc9b2021-10-27 21:11:17 +0200103 if flags&SENTENCE_POS != 0 {
Akron9fb63af2021-10-28 01:15:53 +0200104 tw.SentenceEnd = func(_ int) {
Akronfceddb62021-10-27 19:27:54 +0200105
106 // Add end position of last token to sentence boundary
Akron0f087ea2021-10-27 19:40:15 +0200107 // TODO: This only works if token positions are taking into account
Akronfceddb62021-10-27 19:27:54 +0200108 sent = append(sent, pos[len(pos)-1])
109 sentB = true
110
Akron7035d2e2021-10-28 00:54:01 +0200111 // Collect sentences also
Akron0f087ea2021-10-27 19:40:15 +0200112 if flags&SENTENCES != 0 {
Akron7035d2e2021-10-28 00:54:01 +0200113 writer.WriteByte('\n')
Akronfceddb62021-10-27 19:27:54 +0200114 }
115 }
116
Akron7035d2e2021-10-28 00:54:01 +0200117 // Collect sentence boundaries
Akron0f087ea2021-10-27 19:40:15 +0200118 } else if flags&SENTENCES != 0 {
Akronfceddb62021-10-27 19:27:54 +0200119 tw.SentenceEnd = func(_ int) {
Akron7035d2e2021-10-28 00:54:01 +0200120 writer.WriteByte('\n')
Akron274600e2021-11-03 20:09:06 +0100121 writer.Flush()
Akronfceddb62021-10-27 19:27:54 +0200122 }
123
124 // Ignore sentence boundaries
125 } else {
126 tw.SentenceEnd = func(_ int) {}
Akron4f6b28c2021-10-25 00:52:03 +0200127 }
128
Akron7035d2e2021-10-28 00:54:01 +0200129 // Write token or sentence positions
Akron0f087ea2021-10-27 19:40:15 +0200130 if flags&(TOKEN_POS|SENTENCE_POS) != 0 {
Akrone9431ec2021-10-25 21:35:33 +0200131 tw.TextEnd = func(_ int) {
Akron4f6b28c2021-10-25 00:52:03 +0200132
Akron7035d2e2021-10-28 00:54:01 +0200133 // Write token positions
Akron0f087ea2021-10-27 19:40:15 +0200134 if flags&TOKEN_POS != 0 {
Akronfceddb62021-10-27 19:27:54 +0200135 writer.WriteString(strconv.Itoa(pos[0]))
136 for _, x := range pos[1:] {
137 writer.WriteByte(' ')
138 writer.WriteString(strconv.Itoa(x))
139 }
Akron7035d2e2021-10-28 00:54:01 +0200140 writer.WriteByte('\n')
Akron4f6b28c2021-10-25 00:52:03 +0200141 }
Akronfceddb62021-10-27 19:27:54 +0200142
Akron7035d2e2021-10-28 00:54:01 +0200143 // Write sentence positions
Akron0f087ea2021-10-27 19:40:15 +0200144 if flags&SENTENCE_POS != 0 {
Akronfceddb62021-10-27 19:27:54 +0200145 writer.WriteString(strconv.Itoa(sent[0]))
146 for _, x := range sent[1:] {
147 writer.WriteByte(' ')
148 writer.WriteString(strconv.Itoa(x))
149 }
Akron7035d2e2021-10-28 00:54:01 +0200150 writer.WriteByte('\n')
Akronfceddb62021-10-27 19:27:54 +0200151 sent = sent[:0]
152 sentB = true
153 }
Akron4f6b28c2021-10-25 00:52:03 +0200154
Akron274600e2021-11-03 20:09:06 +0100155 writer.Flush()
156
Akron8cc2dd92021-10-25 19:49:41 +0200157 posC = 0
Akron4f6b28c2021-10-25 00:52:03 +0200158 pos = pos[:0]
159 }
Akron7035d2e2021-10-28 00:54:01 +0200160
161 // Collect text ends
Akron4f6b28c2021-10-25 00:52:03 +0200162 } else {
163 tw.TextEnd = func(_ int) {
Akrone87906b2021-11-24 10:39:14 +0100164 writer.WriteByte('\n')
Akron4f6b28c2021-10-25 00:52:03 +0200165 writer.Flush()
166 }
Akron4f6b28c2021-10-25 00:52:03 +0200167 }
168
Akron7035d2e2021-10-28 00:54:01 +0200169 // Flush the writer
Akron4f6b28c2021-10-25 00:52:03 +0200170 tw.Flush = func() error {
171 return writer.Flush()
172 }
173
174 return tw
Akrone396a932021-10-19 01:06:13 +0200175}