blob: ed580ef955e4605171d80f7f335ae0995115c32f [file] [log] [blame]
Akrone396a932021-10-19 01:06:13 +02001package datok
2
3import (
4 "bufio"
5 "io"
Akron4f6b28c2021-10-25 00:52:03 +02006 "strconv"
Akrone396a932021-10-19 01:06:13 +02007)
8
// Bits is a set of bit flags that selects which callbacks NewTokenWriter
// installs and what they write.
type Bits uint8

// Output flags for NewTokenWriter. The identifiers keep their original
// spelling because they are part of the package's exported API.
const (
	// TOKENS writes the surface form of every token, followed by a newline.
	TOKENS Bits = 1 << iota
	// SENTENCES writes an additional newline at every sentence end.
	SENTENCES
	// TOKEN_POS collects the start and end offset of every token and writes
	// them as one space-separated line at the end of each text.
	TOKEN_POS
	// SENTENCE_POS collects the start and end offset of every sentence and
	// writes them as one space-separated line at the end of each text.
	SENTENCE_POS
	// NEWLINE_AFTER_EOT makes the offset counter step back by one when a
	// text other than the first starts with '\n'; presumably this skips the
	// newline that follows an end-of-text marker in the input.
	NEWLINE_AFTER_EOT

	// SIMPLE is the combination of TOKENS and SENTENCES.
	SIMPLE = TOKENS | SENTENCES
)
20
// TokenWriter bundles the callbacks that receive tokenizer events.
// NewTokenWriter fills in every field, so none of them is nil on a
// writer obtained from it.
type TokenWriter struct {
	// SentenceEnd is called at a sentence boundary. The callbacks installed
	// by NewTokenWriter ignore the integer argument.
	SentenceEnd func(int)
	// TextEnd is called at the end of a text. The callbacks installed by
	// NewTokenWriter ignore the integer argument.
	TextEnd func(int)
	// Flush writes any buffered output to the underlying writer.
	Flush func() error
	// Token is called for every token with an offset and a rune buffer;
	// the token itself is buf[offset:].
	Token func(int, []rune)
}
27
Akron4f6b28c2021-10-25 00:52:03 +020028// Create a new token writer based on the options
Akron96fdc9b2021-10-27 21:11:17 +020029func NewTokenWriter(w io.Writer, flags Bits) *TokenWriter {
Akron4f6b28c2021-10-25 00:52:03 +020030 writer := bufio.NewWriter(w)
31 posC := 0
Akronfceddb62021-10-27 19:27:54 +020032 pos := make([]int, 0, 1024)
33 sentB := true
34 sent := make([]int, 0, 1024)
Akron4a6e0ff2021-11-04 00:15:54 +010035 init := true
Akrone396a932021-10-19 01:06:13 +020036
Akron4f6b28c2021-10-25 00:52:03 +020037 tw := &TokenWriter{}
Akrone396a932021-10-19 01:06:13 +020038
Akron7035d2e2021-10-28 00:54:01 +020039 // Collect token positions and maybe tokens
40 if flags&(TOKEN_POS|SENTENCE_POS) != 0 {
41
42 // TODO:
43 // Split to
44 // - Token_pos+Tokens+Newline
45 // - Token_pos+Newline
46 // - Token_pos|Sentence_pos
47 // - Sentence_pos
48 // - Tokens
49
Akron4f6b28c2021-10-25 00:52:03 +020050 tw.Token = func(offset int, buf []rune) {
Akrona854faa2021-10-22 19:31:08 +020051
Akron4f6b28c2021-10-25 00:52:03 +020052 // TODO:
53 // Store in []uint16
54 // and write to string
Akron8cc2dd92021-10-25 19:49:41 +020055
Akrone9431ec2021-10-25 21:35:33 +020056 // Accept newline after EOT
Akrone87906b2021-11-24 10:39:14 +010057 if posC == 0 && flags&NEWLINE_AFTER_EOT != 0 && buf[0] == '\n' && !init {
Akrone9431ec2021-10-25 21:35:33 +020058 posC--
59 }
60
Akron4a6e0ff2021-11-04 00:15:54 +010061 init = false
62
Akron4f6b28c2021-10-25 00:52:03 +020063 posC += offset
64 pos = append(pos, posC)
Akronfceddb62021-10-27 19:27:54 +020065
66 // Token is the start of a sentence
67 if sentB {
68 sentB = false
69 sent = append(sent, posC)
70 }
Akron4f6b28c2021-10-25 00:52:03 +020071 posC += len(buf) - offset
72 pos = append(pos, posC)
Akrone396a932021-10-19 01:06:13 +020073
Akron7035d2e2021-10-28 00:54:01 +020074 // Collect tokens also
Akron0f087ea2021-10-27 19:40:15 +020075 if flags&TOKENS != 0 {
Akrona9e0c422021-10-27 19:01:17 +020076 writer.WriteString(string(buf[offset:]))
Akron7035d2e2021-10-28 00:54:01 +020077 writer.WriteByte('\n')
Akrona9e0c422021-10-27 19:01:17 +020078 }
Akron4f6b28c2021-10-25 00:52:03 +020079 }
Akronfceddb62021-10-27 19:27:54 +020080
Akron7035d2e2021-10-28 00:54:01 +020081 // Collect tokens
Akron96fdc9b2021-10-27 21:11:17 +020082 } else if flags&TOKENS != 0 {
Akron4f6b28c2021-10-25 00:52:03 +020083 tw.Token = func(offset int, buf []rune) {
84 writer.WriteString(string(buf[offset:]))
Akron7035d2e2021-10-28 00:54:01 +020085 writer.WriteByte('\n')
Akron4f6b28c2021-10-25 00:52:03 +020086 }
Akron7035d2e2021-10-28 00:54:01 +020087
88 // Ignore tokens
Akron96fdc9b2021-10-27 21:11:17 +020089 } else {
90 tw.Token = func(_ int, _ []rune) {}
Akron4f6b28c2021-10-25 00:52:03 +020091 }
92
Akron7035d2e2021-10-28 00:54:01 +020093 // Collect sentence positions and maybe sentence boundaries
Akron96fdc9b2021-10-27 21:11:17 +020094 if flags&SENTENCE_POS != 0 {
Akron9fb63af2021-10-28 01:15:53 +020095 tw.SentenceEnd = func(_ int) {
Akronfceddb62021-10-27 19:27:54 +020096
97 // Add end position of last token to sentence boundary
Akron0f087ea2021-10-27 19:40:15 +020098 // TODO: This only works if token positions are taking into account
Akronfceddb62021-10-27 19:27:54 +020099 sent = append(sent, pos[len(pos)-1])
100 sentB = true
101
Akron7035d2e2021-10-28 00:54:01 +0200102 // Collect sentences also
Akron0f087ea2021-10-27 19:40:15 +0200103 if flags&SENTENCES != 0 {
Akron7035d2e2021-10-28 00:54:01 +0200104 writer.WriteByte('\n')
Akronfceddb62021-10-27 19:27:54 +0200105 }
106 }
107
Akron7035d2e2021-10-28 00:54:01 +0200108 // Collect sentence boundaries
Akron0f087ea2021-10-27 19:40:15 +0200109 } else if flags&SENTENCES != 0 {
Akronfceddb62021-10-27 19:27:54 +0200110 tw.SentenceEnd = func(_ int) {
Akron7035d2e2021-10-28 00:54:01 +0200111 writer.WriteByte('\n')
Akron274600e2021-11-03 20:09:06 +0100112 writer.Flush()
Akronfceddb62021-10-27 19:27:54 +0200113 }
114
115 // Ignore sentence boundaries
116 } else {
117 tw.SentenceEnd = func(_ int) {}
Akron4f6b28c2021-10-25 00:52:03 +0200118 }
119
Akron7035d2e2021-10-28 00:54:01 +0200120 // Write token or sentence positions
Akron0f087ea2021-10-27 19:40:15 +0200121 if flags&(TOKEN_POS|SENTENCE_POS) != 0 {
Akrone9431ec2021-10-25 21:35:33 +0200122 tw.TextEnd = func(_ int) {
Akron4f6b28c2021-10-25 00:52:03 +0200123
Akron7035d2e2021-10-28 00:54:01 +0200124 // Write token positions
Akron0f087ea2021-10-27 19:40:15 +0200125 if flags&TOKEN_POS != 0 {
Akronfceddb62021-10-27 19:27:54 +0200126 writer.WriteString(strconv.Itoa(pos[0]))
127 for _, x := range pos[1:] {
128 writer.WriteByte(' ')
129 writer.WriteString(strconv.Itoa(x))
130 }
Akron7035d2e2021-10-28 00:54:01 +0200131 writer.WriteByte('\n')
Akron4f6b28c2021-10-25 00:52:03 +0200132 }
Akronfceddb62021-10-27 19:27:54 +0200133
Akron7035d2e2021-10-28 00:54:01 +0200134 // Write sentence positions
Akron0f087ea2021-10-27 19:40:15 +0200135 if flags&SENTENCE_POS != 0 {
Akronfceddb62021-10-27 19:27:54 +0200136 writer.WriteString(strconv.Itoa(sent[0]))
137 for _, x := range sent[1:] {
138 writer.WriteByte(' ')
139 writer.WriteString(strconv.Itoa(x))
140 }
Akron7035d2e2021-10-28 00:54:01 +0200141 writer.WriteByte('\n')
Akronfceddb62021-10-27 19:27:54 +0200142 sent = sent[:0]
143 sentB = true
144 }
Akron4f6b28c2021-10-25 00:52:03 +0200145
Akron274600e2021-11-03 20:09:06 +0100146 writer.Flush()
147
Akron8cc2dd92021-10-25 19:49:41 +0200148 posC = 0
Akron4f6b28c2021-10-25 00:52:03 +0200149 pos = pos[:0]
150 }
Akron7035d2e2021-10-28 00:54:01 +0200151
152 // Collect text ends
Akron4f6b28c2021-10-25 00:52:03 +0200153 } else {
154 tw.TextEnd = func(_ int) {
Akrone87906b2021-11-24 10:39:14 +0100155 writer.WriteByte('\n')
Akron4f6b28c2021-10-25 00:52:03 +0200156 writer.Flush()
157 }
Akron4f6b28c2021-10-25 00:52:03 +0200158 }
159
Akron7035d2e2021-10-28 00:54:01 +0200160 // Flush the writer
Akron4f6b28c2021-10-25 00:52:03 +0200161 tw.Flush = func() error {
162 return writer.Flush()
163 }
164
165 return tw
Akrone396a932021-10-19 01:06:13 +0200166}