blob: f6dffa495d9a727569c0c2de378b44e9f9a8cebf [file] [log] [blame]
Akrone396a932021-10-19 01:06:13 +02001package datok
2
3import (
4 "bufio"
5 "io"
Akron4f6b28c2021-10-25 00:52:03 +02006 "strconv"
Akrone396a932021-10-19 01:06:13 +02007)
8
// Bits is a bit set of output options understood by NewTokenWriter.
type Bits uint8

// Output options. Each option occupies its own bit, so options can be
// combined with | and tested with &.
const (
	// TOKENS makes the writer print every token on its own line.
	TOKENS Bits = 1 << 0
	// SENTENCES makes the writer print a newline at every sentence end.
	SENTENCES Bits = 1 << 1
	// TOKEN_POS makes the writer print the collected token positions
	// at the end of a text.
	TOKEN_POS Bits = 1 << 2
	// SENTENCE_POS makes the writer print the collected sentence
	// positions at the end of a text.
	SENTENCE_POS Bits = 1 << 3
	// NEWLINE_AFTER_EOT makes the position counter discount a newline
	// that opens the first token of any text but the very first one.
	NEWLINE_AFTER_EOT Bits = 1 << 4

	// SIMPLE prints tokens and sentence boundaries without positions.
	SIMPLE = TOKENS | SENTENCES
)
20
// TokenWriter bundles the callbacks through which tokenization results
// are reported. NewTokenWriter fills in every field, so none of them is
// nil on a writer obtained from it.
type TokenWriter struct {
	// SentenceEnd is the callback for the end of a sentence. The
	// writers built by NewTokenWriter ignore the int argument.
	SentenceEnd func(int)

	// TextEnd is the callback for the end of a text. The writers built
	// by NewTokenWriter ignore the int argument.
	TextEnd func(int)

	// Flush writes buffered output to the underlying writer and
	// returns the error of that operation, if any.
	Flush func() error

	// Token is the callback for a single token. The token proper is
	// buf[offset:]; the first offset runes of buf precede the token.
	Token func(offset int, buf []rune)

	// Fail func(int)
}
28
Akron4f6b28c2021-10-25 00:52:03 +020029// Create a new token writer based on the options
Akron96fdc9b2021-10-27 21:11:17 +020030func NewTokenWriter(w io.Writer, flags Bits) *TokenWriter {
Akron4f6b28c2021-10-25 00:52:03 +020031 writer := bufio.NewWriter(w)
32 posC := 0
Akronfceddb62021-10-27 19:27:54 +020033 pos := make([]int, 0, 1024)
34 sentB := true
35 sent := make([]int, 0, 1024)
Akron4a6e0ff2021-11-04 00:15:54 +010036 init := true
Akrone396a932021-10-19 01:06:13 +020037
Akron4f6b28c2021-10-25 00:52:03 +020038 tw := &TokenWriter{}
Akrone396a932021-10-19 01:06:13 +020039
Akrondf275812022-03-27 12:54:46 +020040 // tw.Fail = func(_ int) {}
41
Akron7035d2e2021-10-28 00:54:01 +020042 // Collect token positions and maybe tokens
43 if flags&(TOKEN_POS|SENTENCE_POS) != 0 {
44
45 // TODO:
46 // Split to
47 // - Token_pos+Tokens+Newline
48 // - Token_pos+Newline
49 // - Token_pos|Sentence_pos
50 // - Sentence_pos
51 // - Tokens
52
Akron4f6b28c2021-10-25 00:52:03 +020053 tw.Token = func(offset int, buf []rune) {
Akrona854faa2021-10-22 19:31:08 +020054
Akron4f6b28c2021-10-25 00:52:03 +020055 // TODO:
56 // Store in []uint16
57 // and write to string
Akron8cc2dd92021-10-25 19:49:41 +020058
Akrone9431ec2021-10-25 21:35:33 +020059 // Accept newline after EOT
Akrone87906b2021-11-24 10:39:14 +010060 if posC == 0 && flags&NEWLINE_AFTER_EOT != 0 && buf[0] == '\n' && !init {
Akrone9431ec2021-10-25 21:35:33 +020061 posC--
62 }
63
Akron4a6e0ff2021-11-04 00:15:54 +010064 init = false
65
Akron4f6b28c2021-10-25 00:52:03 +020066 posC += offset
67 pos = append(pos, posC)
Akronfceddb62021-10-27 19:27:54 +020068
69 // Token is the start of a sentence
70 if sentB {
71 sentB = false
72 sent = append(sent, posC)
73 }
Akron4f6b28c2021-10-25 00:52:03 +020074 posC += len(buf) - offset
75 pos = append(pos, posC)
Akrone396a932021-10-19 01:06:13 +020076
Akron7035d2e2021-10-28 00:54:01 +020077 // Collect tokens also
Akron0f087ea2021-10-27 19:40:15 +020078 if flags&TOKENS != 0 {
Akrona9e0c422021-10-27 19:01:17 +020079 writer.WriteString(string(buf[offset:]))
Akron7035d2e2021-10-28 00:54:01 +020080 writer.WriteByte('\n')
Akrona9e0c422021-10-27 19:01:17 +020081 }
Akron4f6b28c2021-10-25 00:52:03 +020082 }
Akronfceddb62021-10-27 19:27:54 +020083
Akron7035d2e2021-10-28 00:54:01 +020084 // Collect tokens
Akron96fdc9b2021-10-27 21:11:17 +020085 } else if flags&TOKENS != 0 {
Akron4f6b28c2021-10-25 00:52:03 +020086 tw.Token = func(offset int, buf []rune) {
87 writer.WriteString(string(buf[offset:]))
Akron7035d2e2021-10-28 00:54:01 +020088 writer.WriteByte('\n')
Akron4f6b28c2021-10-25 00:52:03 +020089 }
Akron7035d2e2021-10-28 00:54:01 +020090
91 // Ignore tokens
Akron96fdc9b2021-10-27 21:11:17 +020092 } else {
93 tw.Token = func(_ int, _ []rune) {}
Akron4f6b28c2021-10-25 00:52:03 +020094 }
95
Akron7035d2e2021-10-28 00:54:01 +020096 // Collect sentence positions and maybe sentence boundaries
Akron96fdc9b2021-10-27 21:11:17 +020097 if flags&SENTENCE_POS != 0 {
Akron9fb63af2021-10-28 01:15:53 +020098 tw.SentenceEnd = func(_ int) {
Akronfceddb62021-10-27 19:27:54 +020099
100 // Add end position of last token to sentence boundary
Akron0f087ea2021-10-27 19:40:15 +0200101 // TODO: This only works if token positions are taking into account
Akronfceddb62021-10-27 19:27:54 +0200102 sent = append(sent, pos[len(pos)-1])
103 sentB = true
104
Akron7035d2e2021-10-28 00:54:01 +0200105 // Collect sentences also
Akron0f087ea2021-10-27 19:40:15 +0200106 if flags&SENTENCES != 0 {
Akron7035d2e2021-10-28 00:54:01 +0200107 writer.WriteByte('\n')
Akronfceddb62021-10-27 19:27:54 +0200108 }
109 }
110
Akron7035d2e2021-10-28 00:54:01 +0200111 // Collect sentence boundaries
Akron0f087ea2021-10-27 19:40:15 +0200112 } else if flags&SENTENCES != 0 {
Akronfceddb62021-10-27 19:27:54 +0200113 tw.SentenceEnd = func(_ int) {
Akron7035d2e2021-10-28 00:54:01 +0200114 writer.WriteByte('\n')
Akron274600e2021-11-03 20:09:06 +0100115 writer.Flush()
Akronfceddb62021-10-27 19:27:54 +0200116 }
117
118 // Ignore sentence boundaries
119 } else {
120 tw.SentenceEnd = func(_ int) {}
Akron4f6b28c2021-10-25 00:52:03 +0200121 }
122
Akron7035d2e2021-10-28 00:54:01 +0200123 // Write token or sentence positions
Akron0f087ea2021-10-27 19:40:15 +0200124 if flags&(TOKEN_POS|SENTENCE_POS) != 0 {
Akrone9431ec2021-10-25 21:35:33 +0200125 tw.TextEnd = func(_ int) {
Akron4f6b28c2021-10-25 00:52:03 +0200126
Akron7035d2e2021-10-28 00:54:01 +0200127 // Write token positions
Akron0f087ea2021-10-27 19:40:15 +0200128 if flags&TOKEN_POS != 0 {
Akronfceddb62021-10-27 19:27:54 +0200129 writer.WriteString(strconv.Itoa(pos[0]))
130 for _, x := range pos[1:] {
131 writer.WriteByte(' ')
132 writer.WriteString(strconv.Itoa(x))
133 }
Akron7035d2e2021-10-28 00:54:01 +0200134 writer.WriteByte('\n')
Akron4f6b28c2021-10-25 00:52:03 +0200135 }
Akronfceddb62021-10-27 19:27:54 +0200136
Akron7035d2e2021-10-28 00:54:01 +0200137 // Write sentence positions
Akron0f087ea2021-10-27 19:40:15 +0200138 if flags&SENTENCE_POS != 0 {
Akronfceddb62021-10-27 19:27:54 +0200139 writer.WriteString(strconv.Itoa(sent[0]))
140 for _, x := range sent[1:] {
141 writer.WriteByte(' ')
142 writer.WriteString(strconv.Itoa(x))
143 }
Akron7035d2e2021-10-28 00:54:01 +0200144 writer.WriteByte('\n')
Akronfceddb62021-10-27 19:27:54 +0200145 sent = sent[:0]
146 sentB = true
147 }
Akron4f6b28c2021-10-25 00:52:03 +0200148
Akron274600e2021-11-03 20:09:06 +0100149 writer.Flush()
150
Akron8cc2dd92021-10-25 19:49:41 +0200151 posC = 0
Akron4f6b28c2021-10-25 00:52:03 +0200152 pos = pos[:0]
153 }
Akron7035d2e2021-10-28 00:54:01 +0200154
155 // Collect text ends
Akron4f6b28c2021-10-25 00:52:03 +0200156 } else {
157 tw.TextEnd = func(_ int) {
Akrone87906b2021-11-24 10:39:14 +0100158 writer.WriteByte('\n')
Akron4f6b28c2021-10-25 00:52:03 +0200159 writer.Flush()
160 }
Akron4f6b28c2021-10-25 00:52:03 +0200161 }
162
Akron7035d2e2021-10-28 00:54:01 +0200163 // Flush the writer
Akron4f6b28c2021-10-25 00:52:03 +0200164 tw.Flush = func() error {
165 return writer.Flush()
166 }
167
168 return tw
Akrone396a932021-10-19 01:06:13 +0200169}