blob: aa9c32dafb8ada8f90e80cdc00497bc55ead0952 [file] [log] [blame]
Akrone396a932021-10-19 01:06:13 +02001package datok
2
3import (
4 "bufio"
5 "io"
Akron4f6b28c2021-10-25 00:52:03 +02006 "strconv"
Akrone396a932021-10-19 01:06:13 +02007)
8
// Bits is a set of output option flags. Flags are combined with bitwise OR
// and passed to NewTokenWriterFromOptions.
type Bits uint8

// Output options understood by NewTokenWriterFromOptions.
//
// The names keep their original ALL_CAPS spelling because they are part of
// the package's exported interface.
const (
	// TOKENS requests that token surfaces are written, one per line.
	// It is only consulted when positions are collected; without any
	// position flag, tokens are always written.
	TOKENS Bits = 1 << iota

	// SENTENCES requests an additional newline at every sentence end.
	SENTENCES

	// TOKEN_POS requests that the start and end offsets (counted in runes)
	// of all tokens are written on one line at the end of each text.
	TOKEN_POS

	// SENTENCE_POS requests that the start and end offsets (counted in
	// runes) of all sentences are written on one line at the end of each
	// text.
	SENTENCE_POS

	// NEWLINE_AFTER_EOT makes the position counter skip a newline that
	// directly follows the end of the previous text.
	NEWLINE_AFTER_EOT
)
18
// TokenWriter bundles the callbacks that receive tokenization events.
// The constructors in this file fill in all four fields.
type TokenWriter struct {
	// SentenceEnd is called when a sentence has ended. The writers built
	// in this file ignore the integer argument.
	SentenceEnd func(int)

	// TextEnd is called when a text has ended. The writers built in this
	// file ignore the integer argument.
	TextEnd func(int)

	// Flush writes all buffered output to the underlying writer and
	// reports the first error that occurred.
	Flush func() error

	// Token is called for every token with an offset and a rune buffer.
	// The token surface is buf[offset:]; the runes before offset belong to
	// the input but not to the token.
	Token func(int, []rune)
}
25
Akron4f6b28c2021-10-25 00:52:03 +020026func NewTokenWriter(w io.Writer) *TokenWriter {
27 writer := bufio.NewWriter(w)
Akrone396a932021-10-19 01:06:13 +020028
Akron4f6b28c2021-10-25 00:52:03 +020029 return &TokenWriter{
30 SentenceEnd: func(_ int) {
31 writer.WriteRune('\n')
32 },
33 TextEnd: func(_ int) {
34 writer.WriteRune('\n')
35 writer.Flush()
36 },
37 Token: func(offset int, buf []rune) {
38 writer.WriteString(string(buf[offset:]))
39 writer.WriteRune('\n')
40 },
41 Flush: func() error {
42 return writer.Flush()
43 },
44 }
Akrone396a932021-10-19 01:06:13 +020045}
46
Akron4f6b28c2021-10-25 00:52:03 +020047// Create a new token writer based on the options
Akron0f087ea2021-10-27 19:40:15 +020048func NewTokenWriterFromOptions(w io.Writer, flags Bits) *TokenWriter {
Akron4f6b28c2021-10-25 00:52:03 +020049 writer := bufio.NewWriter(w)
50 posC := 0
Akronfceddb62021-10-27 19:27:54 +020051 pos := make([]int, 0, 1024)
52 sentB := true
53 sent := make([]int, 0, 1024)
Akrone396a932021-10-19 01:06:13 +020054
Akron4f6b28c2021-10-25 00:52:03 +020055 tw := &TokenWriter{}
Akrone396a932021-10-19 01:06:13 +020056
Akron0f087ea2021-10-27 19:40:15 +020057 if flags&TOKEN_POS != 0 {
Akron4f6b28c2021-10-25 00:52:03 +020058 tw.Token = func(offset int, buf []rune) {
Akrona854faa2021-10-22 19:31:08 +020059
Akron4f6b28c2021-10-25 00:52:03 +020060 // TODO:
61 // Store in []uint16
62 // and write to string
Akron8cc2dd92021-10-25 19:49:41 +020063
Akrone9431ec2021-10-25 21:35:33 +020064 // Accept newline after EOT
Akron0f087ea2021-10-27 19:40:15 +020065 if flags&NEWLINE_AFTER_EOT != 0 && posC == 0 && buf[0] == '\n' && writer.Buffered() != 0 {
Akrone9431ec2021-10-25 21:35:33 +020066 posC--
67 }
68
Akron4f6b28c2021-10-25 00:52:03 +020069 posC += offset
70 pos = append(pos, posC)
Akronfceddb62021-10-27 19:27:54 +020071
72 // Token is the start of a sentence
73 if sentB {
74 sentB = false
75 sent = append(sent, posC)
76 }
Akron4f6b28c2021-10-25 00:52:03 +020077 posC += len(buf) - offset
78 pos = append(pos, posC)
Akrone396a932021-10-19 01:06:13 +020079
Akron0f087ea2021-10-27 19:40:15 +020080 if flags&TOKENS != 0 {
Akrona9e0c422021-10-27 19:01:17 +020081 writer.WriteString(string(buf[offset:]))
82 writer.WriteRune('\n')
83 }
Akron4f6b28c2021-10-25 00:52:03 +020084 }
Akronfceddb62021-10-27 19:27:54 +020085
86 // Only print one token per line
Akron4f6b28c2021-10-25 00:52:03 +020087 } else {
88 tw.Token = func(offset int, buf []rune) {
89 writer.WriteString(string(buf[offset:]))
90 writer.WriteRune('\n')
91 }
92 }
93
Akronfceddb62021-10-27 19:27:54 +020094 // Print sentence boundaries
Akron0f087ea2021-10-27 19:40:15 +020095 if flags&(SENTENCES|SENTENCE_POS) != 0 {
Akronfceddb62021-10-27 19:27:54 +020096 tw.SentenceEnd = func(offset int) {
97
98 // Add end position of last token to sentence boundary
Akron0f087ea2021-10-27 19:40:15 +020099 // TODO: This only works if token positions are taking into account
Akronfceddb62021-10-27 19:27:54 +0200100 sent = append(sent, pos[len(pos)-1])
101 sentB = true
102
Akron0f087ea2021-10-27 19:40:15 +0200103 if flags&SENTENCES != 0 {
Akronfceddb62021-10-27 19:27:54 +0200104 writer.WriteRune('\n')
105 }
106 }
107
108 // Print sentence boundaries as newlines
Akron0f087ea2021-10-27 19:40:15 +0200109 } else if flags&SENTENCES != 0 {
Akronfceddb62021-10-27 19:27:54 +0200110 tw.SentenceEnd = func(_ int) {
111 writer.WriteRune('\n')
112 }
113
114 // Ignore sentence boundaries
115 } else {
116 tw.SentenceEnd = func(_ int) {}
Akron4f6b28c2021-10-25 00:52:03 +0200117 }
118
Akron0f087ea2021-10-27 19:40:15 +0200119 if flags&(TOKEN_POS|SENTENCE_POS) != 0 {
Akrone9431ec2021-10-25 21:35:33 +0200120 tw.TextEnd = func(_ int) {
Akron4f6b28c2021-10-25 00:52:03 +0200121 writer.Flush()
122
Akron0f087ea2021-10-27 19:40:15 +0200123 if flags&TOKEN_POS != 0 {
Akronfceddb62021-10-27 19:27:54 +0200124 writer.WriteString(strconv.Itoa(pos[0]))
125 for _, x := range pos[1:] {
126 writer.WriteByte(' ')
127 writer.WriteString(strconv.Itoa(x))
128 }
129 writer.WriteRune('\n')
Akron4f6b28c2021-10-25 00:52:03 +0200130 }
Akronfceddb62021-10-27 19:27:54 +0200131
Akron0f087ea2021-10-27 19:40:15 +0200132 if flags&SENTENCE_POS != 0 {
Akronfceddb62021-10-27 19:27:54 +0200133 writer.WriteString(strconv.Itoa(sent[0]))
134 for _, x := range sent[1:] {
135 writer.WriteByte(' ')
136 writer.WriteString(strconv.Itoa(x))
137 }
138 writer.WriteRune('\n')
139 sent = sent[:0]
140 sentB = true
141 }
Akron4f6b28c2021-10-25 00:52:03 +0200142
Akron8cc2dd92021-10-25 19:49:41 +0200143 posC = 0
Akron4f6b28c2021-10-25 00:52:03 +0200144 pos = pos[:0]
145 }
146 } else {
147 tw.TextEnd = func(_ int) {
148 writer.WriteRune('\n')
149 writer.Flush()
150 }
151
152 }
153
154 tw.Flush = func() error {
155 return writer.Flush()
156 }
157
158 return tw
Akrone396a932021-10-19 01:06:13 +0200159}