blob: dd528b5fd1a02d32a23845a27ae22f6fcb250412 [file] [log] [blame]
Akrone396a932021-10-19 01:06:13 +02001package datok
2
3import (
4 "bufio"
5 "io"
Akron4f6b28c2021-10-25 00:52:03 +02006 "strconv"
Akrone396a932021-10-19 01:06:13 +02007)
8
// TokenWriter bundles the callbacks through which tokenizer output is
// emitted. The constructors in this file populate every field with a
// closure that writes to a shared buffered writer.
type TokenWriter struct {
	// SentenceEnd handles a sentence boundary. The constructors in this
	// file ignore the int argument.
	SentenceEnd func(int)

	// TextEnd handles a text boundary. The constructors in this file
	// ignore the int argument.
	TextEnd func(int)

	// Flush writes buffered output to the underlying writer and returns
	// the error of that operation.
	Flush func() error

	// Token handles a single token. It receives an offset and a rune
	// buffer; the token's surface form is buf[offset:].
	Token func(int, []rune)
}
15
Akron4f6b28c2021-10-25 00:52:03 +020016func NewTokenWriter(w io.Writer) *TokenWriter {
17 writer := bufio.NewWriter(w)
Akrone396a932021-10-19 01:06:13 +020018
Akron4f6b28c2021-10-25 00:52:03 +020019 return &TokenWriter{
20 SentenceEnd: func(_ int) {
21 writer.WriteRune('\n')
22 },
23 TextEnd: func(_ int) {
24 writer.WriteRune('\n')
25 writer.Flush()
26 },
27 Token: func(offset int, buf []rune) {
28 writer.WriteString(string(buf[offset:]))
29 writer.WriteRune('\n')
30 },
31 Flush: func() error {
32 return writer.Flush()
33 },
34 }
Akrone396a932021-10-19 01:06:13 +020035}
36
Akron4f6b28c2021-10-25 00:52:03 +020037// Create a new token writer based on the options
Akronfceddb62021-10-27 19:27:54 +020038func NewTokenWriterFromOptions(w io.Writer, positionFlag bool, tokenFlag bool, sentenceFlag bool, sentencePositionFlag bool, newlineAfterEot bool) *TokenWriter {
Akron4f6b28c2021-10-25 00:52:03 +020039 writer := bufio.NewWriter(w)
40 posC := 0
Akronfceddb62021-10-27 19:27:54 +020041 pos := make([]int, 0, 1024)
42 sentB := true
43 sent := make([]int, 0, 1024)
Akrone396a932021-10-19 01:06:13 +020044
Akron4f6b28c2021-10-25 00:52:03 +020045 tw := &TokenWriter{}
Akrone396a932021-10-19 01:06:13 +020046
Akron4f6b28c2021-10-25 00:52:03 +020047 if positionFlag {
48 tw.Token = func(offset int, buf []rune) {
Akrona854faa2021-10-22 19:31:08 +020049
Akron4f6b28c2021-10-25 00:52:03 +020050 // TODO:
51 // Store in []uint16
52 // and write to string
Akron8cc2dd92021-10-25 19:49:41 +020053
Akrone9431ec2021-10-25 21:35:33 +020054 // Accept newline after EOT
55 if newlineAfterEot && posC == 0 && buf[0] == '\n' && writer.Buffered() != 0 {
56 posC--
57 }
58
Akron4f6b28c2021-10-25 00:52:03 +020059 posC += offset
60 pos = append(pos, posC)
Akronfceddb62021-10-27 19:27:54 +020061
62 // Token is the start of a sentence
63 if sentB {
64 sentB = false
65 sent = append(sent, posC)
66 }
Akron4f6b28c2021-10-25 00:52:03 +020067 posC += len(buf) - offset
68 pos = append(pos, posC)
Akrone396a932021-10-19 01:06:13 +020069
Akrona9e0c422021-10-27 19:01:17 +020070 if tokenFlag {
71 writer.WriteString(string(buf[offset:]))
72 writer.WriteRune('\n')
73 }
Akron4f6b28c2021-10-25 00:52:03 +020074 }
Akronfceddb62021-10-27 19:27:54 +020075
76 // Only print one token per line
Akron4f6b28c2021-10-25 00:52:03 +020077 } else {
78 tw.Token = func(offset int, buf []rune) {
79 writer.WriteString(string(buf[offset:]))
80 writer.WriteRune('\n')
81 }
82 }
83
Akronfceddb62021-10-27 19:27:54 +020084 // Print sentence boundaries
85 if sentenceFlag || sentencePositionFlag {
86 tw.SentenceEnd = func(offset int) {
87
88 // Add end position of last token to sentence boundary
89 sent = append(sent, pos[len(pos)-1])
90 sentB = true
91
92 if sentenceFlag {
93 writer.WriteRune('\n')
94 }
95 }
96
97 // Print sentence boundaries as newlines
98 } else if sentenceFlag {
99 tw.SentenceEnd = func(_ int) {
100 writer.WriteRune('\n')
101 }
102
103 // Ignore sentence boundaries
104 } else {
105 tw.SentenceEnd = func(_ int) {}
Akron4f6b28c2021-10-25 00:52:03 +0200106 }
107
Akronfceddb62021-10-27 19:27:54 +0200108 if positionFlag || sentencePositionFlag {
Akrone9431ec2021-10-25 21:35:33 +0200109 tw.TextEnd = func(_ int) {
Akron4f6b28c2021-10-25 00:52:03 +0200110 writer.Flush()
111
Akronfceddb62021-10-27 19:27:54 +0200112 if positionFlag {
113 writer.WriteString(strconv.Itoa(pos[0]))
114 for _, x := range pos[1:] {
115 writer.WriteByte(' ')
116 writer.WriteString(strconv.Itoa(x))
117 }
118 writer.WriteRune('\n')
Akron4f6b28c2021-10-25 00:52:03 +0200119 }
Akronfceddb62021-10-27 19:27:54 +0200120
121 if sentencePositionFlag {
122 writer.WriteString(strconv.Itoa(sent[0]))
123 for _, x := range sent[1:] {
124 writer.WriteByte(' ')
125 writer.WriteString(strconv.Itoa(x))
126 }
127 writer.WriteRune('\n')
128 sent = sent[:0]
129 sentB = true
130 }
Akron4f6b28c2021-10-25 00:52:03 +0200131
Akron8cc2dd92021-10-25 19:49:41 +0200132 posC = 0
Akron4f6b28c2021-10-25 00:52:03 +0200133 pos = pos[:0]
134 }
135 } else {
136 tw.TextEnd = func(_ int) {
137 writer.WriteRune('\n')
138 writer.Flush()
139 }
140
141 }
142
143 tw.Flush = func() error {
144 return writer.Flush()
145 }
146
147 return tw
Akrone396a932021-10-19 01:06:13 +0200148}