blob: 5e4b72dca8fe8617f343edcf8ae67100d0a06a2c [file] [log] [blame]
Akrone396a932021-10-19 01:06:13 +02001package datok
2
3import (
4 "bufio"
5 "io"
Akron4f6b28c2021-10-25 00:52:03 +02006 "strconv"
Akrone396a932021-10-19 01:06:13 +02007)
8
// TokenWriter bundles the callbacks a tokenizer invokes to emit its output.
//
// The fields are plain function values so that a constructor can assemble a
// writer from closures sharing the same output buffer. The zero value has
// nil callbacks and must not be used; obtain a TokenWriter from one of the
// constructors in this package instead.
type TokenWriter struct {
	// SentenceEnd is called at the end of a sentence. The constructors in
	// this file ignore the integer argument.
	SentenceEnd func(int)

	// TextEnd is called at the end of a text. The constructors in this file
	// ignore the integer argument.
	TextEnd func(int)

	// Flush writes all buffered output to the underlying writer and reports
	// any error that occurred while writing.
	Flush func() error

	// Token is called for every token. The integer is the offset into the
	// rune buffer at which the token surface starts; the constructors in
	// this file emit buf[offset:].
	Token func(int, []rune)
}
15
Akron4f6b28c2021-10-25 00:52:03 +020016func NewTokenWriter(w io.Writer) *TokenWriter {
17 writer := bufio.NewWriter(w)
Akrone396a932021-10-19 01:06:13 +020018
Akron4f6b28c2021-10-25 00:52:03 +020019 return &TokenWriter{
20 SentenceEnd: func(_ int) {
21 writer.WriteRune('\n')
22 },
23 TextEnd: func(_ int) {
24 writer.WriteRune('\n')
25 writer.Flush()
26 },
27 Token: func(offset int, buf []rune) {
28 writer.WriteString(string(buf[offset:]))
29 writer.WriteRune('\n')
30 },
31 Flush: func() error {
32 return writer.Flush()
33 },
34 }
Akrone396a932021-10-19 01:06:13 +020035}
36
Akron4f6b28c2021-10-25 00:52:03 +020037// Create a new token writer based on the options
Akrona9e0c422021-10-27 19:01:17 +020038func NewTokenWriterFromOptions(w io.Writer, positionFlag bool, tokenFlag bool, newlineAfterEot bool) *TokenWriter {
Akron4f6b28c2021-10-25 00:52:03 +020039 writer := bufio.NewWriter(w)
40 posC := 0
41 pos := make([]int, 0, 200)
Akrone396a932021-10-19 01:06:13 +020042
Akron4f6b28c2021-10-25 00:52:03 +020043 tw := &TokenWriter{}
Akrone396a932021-10-19 01:06:13 +020044
Akron4f6b28c2021-10-25 00:52:03 +020045 if positionFlag {
46 tw.Token = func(offset int, buf []rune) {
Akrona854faa2021-10-22 19:31:08 +020047
Akron4f6b28c2021-10-25 00:52:03 +020048 // TODO:
49 // Store in []uint16
50 // and write to string
Akron8cc2dd92021-10-25 19:49:41 +020051
Akrone9431ec2021-10-25 21:35:33 +020052 // Accept newline after EOT
53 if newlineAfterEot && posC == 0 && buf[0] == '\n' && writer.Buffered() != 0 {
54 posC--
55 }
56
Akron4f6b28c2021-10-25 00:52:03 +020057 posC += offset
58 pos = append(pos, posC)
59 posC += len(buf) - offset
60 pos = append(pos, posC)
Akrone396a932021-10-19 01:06:13 +020061
Akrona9e0c422021-10-27 19:01:17 +020062 if tokenFlag {
63 writer.WriteString(string(buf[offset:]))
64 writer.WriteRune('\n')
65 }
Akron4f6b28c2021-10-25 00:52:03 +020066 }
67 } else {
68 tw.Token = func(offset int, buf []rune) {
69 writer.WriteString(string(buf[offset:]))
70 writer.WriteRune('\n')
71 }
72 }
73
74 tw.SentenceEnd = func(_ int) {
75 writer.WriteRune('\n')
76 }
77
78 if positionFlag {
Akrone9431ec2021-10-25 21:35:33 +020079 tw.TextEnd = func(_ int) {
Akron4f6b28c2021-10-25 00:52:03 +020080 writer.Flush()
81
82 writer.WriteString(strconv.Itoa(pos[0]))
83 for _, x := range pos[1:] {
84 writer.WriteByte(' ')
85 writer.WriteString(strconv.Itoa(x))
86 }
87 writer.WriteRune('\n')
88
Akron8cc2dd92021-10-25 19:49:41 +020089 posC = 0
Akron4f6b28c2021-10-25 00:52:03 +020090 pos = pos[:0]
91 }
92 } else {
93 tw.TextEnd = func(_ int) {
94 writer.WriteRune('\n')
95 writer.Flush()
96 }
97
98 }
99
100 tw.Flush = func() error {
101 return writer.Flush()
102 }
103
104 return tw
Akrone396a932021-10-19 01:06:13 +0200105}