blob: 11179d392462f741e64bdf3a3ebf0a5dd822a22c [file] [log] [blame]
Akrone396a932021-10-19 01:06:13 +02001package datok
2
3import (
4 "bufio"
5 "io"
Akron4f6b28c2021-10-25 00:52:03 +02006 "strconv"
Akrone396a932021-10-19 01:06:13 +02007)
8
// Bits is a set of bit flags that selects what a TokenWriter created by
// NewTokenWriter emits. Flags are combined with bitwise OR.
type Bits uint8

const (
	// TOKENS makes the writer emit the surface form of every token,
	// each followed by a newline.
	TOKENS Bits = 1 << iota

	// SENTENCES makes the writer emit a newline at every sentence end.
	SENTENCES

	// TOKEN_POS makes the writer emit, at the end of each text, a line
	// with the start and end positions of all tokens of that text.
	TOKEN_POS

	// SENTENCE_POS makes the writer emit, at the end of each text, a line
	// with the start and end positions of all sentences of that text.
	SENTENCE_POS

	// NEWLINE_AFTER_EOT makes position counting tolerate a newline that
	// directly follows the end of the previous text: such a leading
	// newline is not counted in the positions of the next text.
	NEWLINE_AFTER_EOT

	// SIMPLE is the combination of TOKENS and SENTENCES.
	SIMPLE = TOKENS | SENTENCES
)
20
// TokenWriter bundles the callbacks through which tokenization results are
// emitted. NewTokenWriter sets all four fields; a zero TokenWriter has nil
// callbacks and must not be called.
type TokenWriter struct {
	// SentenceEnd is called when a sentence ends. The integer argument is
	// not used by any of the callbacks installed by NewTokenWriter.
	SentenceEnd func(int)

	// TextEnd is called when a text ends. The integer argument is not
	// used by any of the callbacks installed by NewTokenWriter.
	TextEnd func(int)

	// Flush writes any buffered output to the underlying writer and
	// returns the error of that operation, if any.
	Flush func() error

	// Token is called for every token. The first argument is the offset
	// of the token inside the rune buffer: buf[offset:] is the token
	// itself, buf[:offset] is what precedes it in the buffer.
	Token func(int, []rune)
}
27
Akron4f6b28c2021-10-25 00:52:03 +020028// Create a new token writer based on the options
Akron96fdc9b2021-10-27 21:11:17 +020029func NewTokenWriter(w io.Writer, flags Bits) *TokenWriter {
Akron4f6b28c2021-10-25 00:52:03 +020030 writer := bufio.NewWriter(w)
31 posC := 0
Akronfceddb62021-10-27 19:27:54 +020032 pos := make([]int, 0, 1024)
33 sentB := true
34 sent := make([]int, 0, 1024)
Akrone396a932021-10-19 01:06:13 +020035
Akron4f6b28c2021-10-25 00:52:03 +020036 tw := &TokenWriter{}
Akrone396a932021-10-19 01:06:13 +020037
Akron7035d2e2021-10-28 00:54:01 +020038 // Collect token positions and maybe tokens
39 if flags&(TOKEN_POS|SENTENCE_POS) != 0 {
40
41 // TODO:
42 // Split to
43 // - Token_pos+Tokens+Newline
44 // - Token_pos+Newline
45 // - Token_pos|Sentence_pos
46 // - Sentence_pos
47 // - Tokens
48
Akron4f6b28c2021-10-25 00:52:03 +020049 tw.Token = func(offset int, buf []rune) {
Akrona854faa2021-10-22 19:31:08 +020050
Akron4f6b28c2021-10-25 00:52:03 +020051 // TODO:
52 // Store in []uint16
53 // and write to string
Akron8cc2dd92021-10-25 19:49:41 +020054
Akrone9431ec2021-10-25 21:35:33 +020055 // Accept newline after EOT
Akron0f087ea2021-10-27 19:40:15 +020056 if flags&NEWLINE_AFTER_EOT != 0 && posC == 0 && buf[0] == '\n' && writer.Buffered() != 0 {
Akrone9431ec2021-10-25 21:35:33 +020057 posC--
58 }
59
Akron4f6b28c2021-10-25 00:52:03 +020060 posC += offset
61 pos = append(pos, posC)
Akronfceddb62021-10-27 19:27:54 +020062
63 // Token is the start of a sentence
64 if sentB {
65 sentB = false
66 sent = append(sent, posC)
67 }
Akron4f6b28c2021-10-25 00:52:03 +020068 posC += len(buf) - offset
69 pos = append(pos, posC)
Akrone396a932021-10-19 01:06:13 +020070
Akron7035d2e2021-10-28 00:54:01 +020071 // Collect tokens also
Akron0f087ea2021-10-27 19:40:15 +020072 if flags&TOKENS != 0 {
Akrona9e0c422021-10-27 19:01:17 +020073 writer.WriteString(string(buf[offset:]))
Akron7035d2e2021-10-28 00:54:01 +020074 writer.WriteByte('\n')
Akrona9e0c422021-10-27 19:01:17 +020075 }
Akron4f6b28c2021-10-25 00:52:03 +020076 }
Akronfceddb62021-10-27 19:27:54 +020077
Akron7035d2e2021-10-28 00:54:01 +020078 // Collect tokens
Akron96fdc9b2021-10-27 21:11:17 +020079 } else if flags&TOKENS != 0 {
Akron4f6b28c2021-10-25 00:52:03 +020080 tw.Token = func(offset int, buf []rune) {
81 writer.WriteString(string(buf[offset:]))
Akron7035d2e2021-10-28 00:54:01 +020082 writer.WriteByte('\n')
Akron4f6b28c2021-10-25 00:52:03 +020083 }
Akron7035d2e2021-10-28 00:54:01 +020084
85 // Ignore tokens
Akron96fdc9b2021-10-27 21:11:17 +020086 } else {
87 tw.Token = func(_ int, _ []rune) {}
Akron4f6b28c2021-10-25 00:52:03 +020088 }
89
Akron7035d2e2021-10-28 00:54:01 +020090 // Collect sentence positions and maybe sentence boundaries
Akron96fdc9b2021-10-27 21:11:17 +020091 if flags&SENTENCE_POS != 0 {
Akronfceddb62021-10-27 19:27:54 +020092 tw.SentenceEnd = func(offset int) {
93
94 // Add end position of last token to sentence boundary
Akron0f087ea2021-10-27 19:40:15 +020095 // TODO: This only works if token positions are taking into account
Akronfceddb62021-10-27 19:27:54 +020096 sent = append(sent, pos[len(pos)-1])
97 sentB = true
98
Akron7035d2e2021-10-28 00:54:01 +020099 // Collect sentences also
Akron0f087ea2021-10-27 19:40:15 +0200100 if flags&SENTENCES != 0 {
Akron7035d2e2021-10-28 00:54:01 +0200101 writer.WriteByte('\n')
Akronfceddb62021-10-27 19:27:54 +0200102 }
103 }
104
Akron7035d2e2021-10-28 00:54:01 +0200105 // Collect sentence boundaries
Akron0f087ea2021-10-27 19:40:15 +0200106 } else if flags&SENTENCES != 0 {
Akronfceddb62021-10-27 19:27:54 +0200107 tw.SentenceEnd = func(_ int) {
Akron7035d2e2021-10-28 00:54:01 +0200108 writer.WriteByte('\n')
Akronfceddb62021-10-27 19:27:54 +0200109 }
110
111 // Ignore sentence boundaries
112 } else {
113 tw.SentenceEnd = func(_ int) {}
Akron4f6b28c2021-10-25 00:52:03 +0200114 }
115
Akron7035d2e2021-10-28 00:54:01 +0200116 // Write token or sentence positions
Akron0f087ea2021-10-27 19:40:15 +0200117 if flags&(TOKEN_POS|SENTENCE_POS) != 0 {
Akrone9431ec2021-10-25 21:35:33 +0200118 tw.TextEnd = func(_ int) {
Akron4f6b28c2021-10-25 00:52:03 +0200119 writer.Flush()
120
Akron7035d2e2021-10-28 00:54:01 +0200121 // Write token positions
Akron0f087ea2021-10-27 19:40:15 +0200122 if flags&TOKEN_POS != 0 {
Akronfceddb62021-10-27 19:27:54 +0200123 writer.WriteString(strconv.Itoa(pos[0]))
124 for _, x := range pos[1:] {
125 writer.WriteByte(' ')
126 writer.WriteString(strconv.Itoa(x))
127 }
Akron7035d2e2021-10-28 00:54:01 +0200128 writer.WriteByte('\n')
Akron4f6b28c2021-10-25 00:52:03 +0200129 }
Akronfceddb62021-10-27 19:27:54 +0200130
Akron7035d2e2021-10-28 00:54:01 +0200131 // Write sentence positions
Akron0f087ea2021-10-27 19:40:15 +0200132 if flags&SENTENCE_POS != 0 {
Akronfceddb62021-10-27 19:27:54 +0200133 writer.WriteString(strconv.Itoa(sent[0]))
134 for _, x := range sent[1:] {
135 writer.WriteByte(' ')
136 writer.WriteString(strconv.Itoa(x))
137 }
Akron7035d2e2021-10-28 00:54:01 +0200138 writer.WriteByte('\n')
Akronfceddb62021-10-27 19:27:54 +0200139 sent = sent[:0]
140 sentB = true
141 }
Akron4f6b28c2021-10-25 00:52:03 +0200142
Akron8cc2dd92021-10-25 19:49:41 +0200143 posC = 0
Akron4f6b28c2021-10-25 00:52:03 +0200144 pos = pos[:0]
145 }
Akron7035d2e2021-10-28 00:54:01 +0200146
147 // Collect text ends
Akron4f6b28c2021-10-25 00:52:03 +0200148 } else {
149 tw.TextEnd = func(_ int) {
150 writer.WriteRune('\n')
151 writer.Flush()
152 }
Akron4f6b28c2021-10-25 00:52:03 +0200153 }
154
Akron7035d2e2021-10-28 00:54:01 +0200155 // Flush the writer
Akron4f6b28c2021-10-25 00:52:03 +0200156 tw.Flush = func() error {
157 return writer.Flush()
158 }
159
160 return tw
Akrone396a932021-10-19 01:06:13 +0200161}