blob: da4ae4d72ee6f36ac2c7e558989d4e64a8a20e2e [file] [log] [blame]
Akrone396a932021-10-19 01:06:13 +02001package datok
2
3import (
4 "bufio"
5 "io"
Akron4f6b28c2021-10-25 00:52:03 +02006 "strconv"
Akrone396a932021-10-19 01:06:13 +02007)
8
// Bits is a set of output options for a TokenWriter. Individual options
// are combined with bitwise OR and passed to NewTokenWriter.
type Bits uint8

const (
	// TOKENS writes every token on a line of its own.
	TOKENS Bits = 1 << iota
	// SENTENCES writes an additional newline at every sentence end.
	SENTENCES
	// TOKEN_POS writes, at the end of each text, a line with the
	// space-separated start and end offsets of all tokens.
	TOKEN_POS
	// SENTENCE_POS writes, at the end of each text, a line with the
	// space-separated start and end offsets of all sentences.
	SENTENCE_POS
	// NEWLINE_AFTER_EOT excludes a newline that opens the first token
	// following the end of a text from the offset count.
	NEWLINE_AFTER_EOT

	// SIMPLE is the combination of TOKENS and SENTENCES.
	SIMPLE = TOKENS | SENTENCES
)
20
// TokenWriter bundles the callbacks through which tokens, sentence ends
// and text ends are reported. Instances are created by NewTokenWriter,
// which wires every field to a function matching the requested options.
type TokenWriter struct {
	// SentenceEnd is called at the end of a sentence. The writers built by
	// NewTokenWriter do not interpret the int argument.
	SentenceEnd func(int)

	// TextEnd is called at the end of a text. The writers built by
	// NewTokenWriter do not interpret the int argument.
	TextEnd func(int)

	// Flush writes any buffered output to the underlying writer and
	// returns the error of that operation, if any.
	Flush func() error

	// Token is called for every token with an offset and a rune buffer.
	// The token itself is buf[offset:]; the runes before offset belong to
	// the text but not to the token.
	Token func(int, []rune)
}
27
Akron4f6b28c2021-10-25 00:52:03 +020028// Create a new token writer based on the options
Akron96fdc9b2021-10-27 21:11:17 +020029func NewTokenWriter(w io.Writer, flags Bits) *TokenWriter {
Akron4f6b28c2021-10-25 00:52:03 +020030 writer := bufio.NewWriter(w)
31 posC := 0
Akronfceddb62021-10-27 19:27:54 +020032 pos := make([]int, 0, 1024)
33 sentB := true
34 sent := make([]int, 0, 1024)
Akrone396a932021-10-19 01:06:13 +020035
Akron4f6b28c2021-10-25 00:52:03 +020036 tw := &TokenWriter{}
Akrone396a932021-10-19 01:06:13 +020037
Akron0f087ea2021-10-27 19:40:15 +020038 if flags&TOKEN_POS != 0 {
Akron4f6b28c2021-10-25 00:52:03 +020039 tw.Token = func(offset int, buf []rune) {
Akrona854faa2021-10-22 19:31:08 +020040
Akron4f6b28c2021-10-25 00:52:03 +020041 // TODO:
42 // Store in []uint16
43 // and write to string
Akron8cc2dd92021-10-25 19:49:41 +020044
Akrone9431ec2021-10-25 21:35:33 +020045 // Accept newline after EOT
Akron0f087ea2021-10-27 19:40:15 +020046 if flags&NEWLINE_AFTER_EOT != 0 && posC == 0 && buf[0] == '\n' && writer.Buffered() != 0 {
Akrone9431ec2021-10-25 21:35:33 +020047 posC--
48 }
49
Akron4f6b28c2021-10-25 00:52:03 +020050 posC += offset
51 pos = append(pos, posC)
Akronfceddb62021-10-27 19:27:54 +020052
53 // Token is the start of a sentence
54 if sentB {
55 sentB = false
56 sent = append(sent, posC)
57 }
Akron4f6b28c2021-10-25 00:52:03 +020058 posC += len(buf) - offset
59 pos = append(pos, posC)
Akrone396a932021-10-19 01:06:13 +020060
Akron0f087ea2021-10-27 19:40:15 +020061 if flags&TOKENS != 0 {
Akrona9e0c422021-10-27 19:01:17 +020062 writer.WriteString(string(buf[offset:]))
63 writer.WriteRune('\n')
64 }
Akron4f6b28c2021-10-25 00:52:03 +020065 }
Akronfceddb62021-10-27 19:27:54 +020066
67 // Only print one token per line
Akron96fdc9b2021-10-27 21:11:17 +020068 } else if flags&TOKENS != 0 {
Akron4f6b28c2021-10-25 00:52:03 +020069 tw.Token = func(offset int, buf []rune) {
70 writer.WriteString(string(buf[offset:]))
71 writer.WriteRune('\n')
72 }
Akron96fdc9b2021-10-27 21:11:17 +020073 } else {
74 tw.Token = func(_ int, _ []rune) {}
Akron4f6b28c2021-10-25 00:52:03 +020075 }
76
Akronfceddb62021-10-27 19:27:54 +020077 // Print sentence boundaries
Akron96fdc9b2021-10-27 21:11:17 +020078 if flags&SENTENCE_POS != 0 {
Akronfceddb62021-10-27 19:27:54 +020079 tw.SentenceEnd = func(offset int) {
80
81 // Add end position of last token to sentence boundary
Akron0f087ea2021-10-27 19:40:15 +020082 // TODO: This only works if token positions are taking into account
Akronfceddb62021-10-27 19:27:54 +020083 sent = append(sent, pos[len(pos)-1])
84 sentB = true
85
Akron0f087ea2021-10-27 19:40:15 +020086 if flags&SENTENCES != 0 {
Akronfceddb62021-10-27 19:27:54 +020087 writer.WriteRune('\n')
88 }
89 }
90
91 // Print sentence boundaries as newlines
Akron0f087ea2021-10-27 19:40:15 +020092 } else if flags&SENTENCES != 0 {
Akronfceddb62021-10-27 19:27:54 +020093 tw.SentenceEnd = func(_ int) {
94 writer.WriteRune('\n')
95 }
96
97 // Ignore sentence boundaries
98 } else {
99 tw.SentenceEnd = func(_ int) {}
Akron4f6b28c2021-10-25 00:52:03 +0200100 }
101
Akron0f087ea2021-10-27 19:40:15 +0200102 if flags&(TOKEN_POS|SENTENCE_POS) != 0 {
Akrone9431ec2021-10-25 21:35:33 +0200103 tw.TextEnd = func(_ int) {
Akron4f6b28c2021-10-25 00:52:03 +0200104 writer.Flush()
105
Akron0f087ea2021-10-27 19:40:15 +0200106 if flags&TOKEN_POS != 0 {
Akronfceddb62021-10-27 19:27:54 +0200107 writer.WriteString(strconv.Itoa(pos[0]))
108 for _, x := range pos[1:] {
109 writer.WriteByte(' ')
110 writer.WriteString(strconv.Itoa(x))
111 }
112 writer.WriteRune('\n')
Akron4f6b28c2021-10-25 00:52:03 +0200113 }
Akronfceddb62021-10-27 19:27:54 +0200114
Akron0f087ea2021-10-27 19:40:15 +0200115 if flags&SENTENCE_POS != 0 {
Akronfceddb62021-10-27 19:27:54 +0200116 writer.WriteString(strconv.Itoa(sent[0]))
117 for _, x := range sent[1:] {
118 writer.WriteByte(' ')
119 writer.WriteString(strconv.Itoa(x))
120 }
121 writer.WriteRune('\n')
122 sent = sent[:0]
123 sentB = true
124 }
Akron4f6b28c2021-10-25 00:52:03 +0200125
Akron8cc2dd92021-10-25 19:49:41 +0200126 posC = 0
Akron4f6b28c2021-10-25 00:52:03 +0200127 pos = pos[:0]
128 }
129 } else {
130 tw.TextEnd = func(_ int) {
131 writer.WriteRune('\n')
132 writer.Flush()
133 }
Akron4f6b28c2021-10-25 00:52:03 +0200134 }
135
136 tw.Flush = func() error {
137 return writer.Flush()
138 }
139
140 return tw
Akrone396a932021-10-19 01:06:13 +0200141}