Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 1 | package datok |
| 2 | |
| 3 | import ( |
| 4 | "bufio" |
| 5 | "io" |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 6 | "strconv" |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 7 | ) |
| 8 | |
// Bits is a bit set of output options for a TokenWriter. Individual
// options are combined with bitwise OR and tested with bitwise AND.
type Bits uint8
| 10 | |
// TODO-Perf:
//   - TokenWriter may support AvailableBuffer(), so tokens can be written
//     directly without a separate buffer. Copying from the same underlying
//     byte array is then a no-op (Go 1.18).
| 15 | |
| 16 | |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 17 | const ( |
| 18 | TOKENS Bits = 1 << iota |
| 19 | SENTENCES |
| 20 | TOKEN_POS |
| 21 | SENTENCE_POS |
| 22 | NEWLINE_AFTER_EOT |
Akron | 96fdc9b | 2021-10-27 21:11:17 +0200 | [diff] [blame] | 23 | |
| 24 | SIMPLE = TOKENS | SENTENCES |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 25 | ) |
| 26 | |
// TokenWriter bundles the callbacks that are invoked while a text is
// being tokenized. The callbacks are plain function values, so the
// behavior of a writer is chosen once, when it is constructed (see
// NewTokenWriter), instead of being re-evaluated on every call.
type TokenWriter struct {
	// SentenceEnd is called when a sentence ends.
	SentenceEnd func(int)
	// TextEnd is called when a text ends.
	TextEnd func(int)
	// Flush writes all buffered output and returns the write error, if any.
	Flush func() error
	// Token is called for every token with an offset and a rune buffer;
	// buf[offset:] is the token surface, buf[:offset] precedes the token.
	Token func(int, []rune)
	// Fail func(int)
}
| 34 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 35 | // Create a new token writer based on the options |
Akron | 96fdc9b | 2021-10-27 21:11:17 +0200 | [diff] [blame] | 36 | func NewTokenWriter(w io.Writer, flags Bits) *TokenWriter { |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 37 | writer := bufio.NewWriter(w) |
| 38 | posC := 0 |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 39 | pos := make([]int, 0, 1024) |
| 40 | sentB := true |
| 41 | sent := make([]int, 0, 1024) |
Akron | 4a6e0ff | 2021-11-04 00:15:54 +0100 | [diff] [blame] | 42 | init := true |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 43 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 44 | tw := &TokenWriter{} |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 45 | |
Akron | df27581 | 2022-03-27 12:54:46 +0200 | [diff] [blame] | 46 | // tw.Fail = func(_ int) {} |
| 47 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 48 | // Collect token positions and maybe tokens |
| 49 | if flags&(TOKEN_POS|SENTENCE_POS) != 0 { |
| 50 | |
| 51 | // TODO: |
| 52 | // Split to |
| 53 | // - Token_pos+Tokens+Newline |
| 54 | // - Token_pos+Newline |
| 55 | // - Token_pos|Sentence_pos |
| 56 | // - Sentence_pos |
| 57 | // - Tokens |
| 58 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 59 | tw.Token = func(offset int, buf []rune) { |
Akron | a854faa | 2021-10-22 19:31:08 +0200 | [diff] [blame] | 60 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 61 | // TODO: |
| 62 | // Store in []uint16 |
| 63 | // and write to string |
Akron | 8cc2dd9 | 2021-10-25 19:49:41 +0200 | [diff] [blame] | 64 | |
Akron | e9431ec | 2021-10-25 21:35:33 +0200 | [diff] [blame] | 65 | // Accept newline after EOT |
Akron | e87906b | 2021-11-24 10:39:14 +0100 | [diff] [blame] | 66 | if posC == 0 && flags&NEWLINE_AFTER_EOT != 0 && buf[0] == '\n' && !init { |
Akron | e9431ec | 2021-10-25 21:35:33 +0200 | [diff] [blame] | 67 | posC-- |
| 68 | } |
| 69 | |
Akron | 4a6e0ff | 2021-11-04 00:15:54 +0100 | [diff] [blame] | 70 | init = false |
| 71 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 72 | posC += offset |
| 73 | pos = append(pos, posC) |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 74 | |
| 75 | // Token is the start of a sentence |
| 76 | if sentB { |
| 77 | sentB = false |
| 78 | sent = append(sent, posC) |
| 79 | } |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 80 | posC += len(buf) - offset |
| 81 | pos = append(pos, posC) |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 82 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 83 | // Collect tokens also |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 84 | if flags&TOKENS != 0 { |
Akron | a9e0c42 | 2021-10-27 19:01:17 +0200 | [diff] [blame] | 85 | writer.WriteString(string(buf[offset:])) |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 86 | writer.WriteByte('\n') |
Akron | a9e0c42 | 2021-10-27 19:01:17 +0200 | [diff] [blame] | 87 | } |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 88 | } |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 89 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 90 | // Collect tokens |
Akron | 96fdc9b | 2021-10-27 21:11:17 +0200 | [diff] [blame] | 91 | } else if flags&TOKENS != 0 { |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 92 | tw.Token = func(offset int, buf []rune) { |
| 93 | writer.WriteString(string(buf[offset:])) |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 94 | writer.WriteByte('\n') |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 95 | } |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 96 | |
| 97 | // Ignore tokens |
Akron | 96fdc9b | 2021-10-27 21:11:17 +0200 | [diff] [blame] | 98 | } else { |
| 99 | tw.Token = func(_ int, _ []rune) {} |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 100 | } |
| 101 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 102 | // Collect sentence positions and maybe sentence boundaries |
Akron | 96fdc9b | 2021-10-27 21:11:17 +0200 | [diff] [blame] | 103 | if flags&SENTENCE_POS != 0 { |
Akron | 9fb63af | 2021-10-28 01:15:53 +0200 | [diff] [blame] | 104 | tw.SentenceEnd = func(_ int) { |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 105 | |
| 106 | // Add end position of last token to sentence boundary |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 107 | // TODO: This only works if token positions are taking into account |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 108 | sent = append(sent, pos[len(pos)-1]) |
| 109 | sentB = true |
| 110 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 111 | // Collect sentences also |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 112 | if flags&SENTENCES != 0 { |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 113 | writer.WriteByte('\n') |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 114 | } |
| 115 | } |
| 116 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 117 | // Collect sentence boundaries |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 118 | } else if flags&SENTENCES != 0 { |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 119 | tw.SentenceEnd = func(_ int) { |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 120 | writer.WriteByte('\n') |
Akron | 274600e | 2021-11-03 20:09:06 +0100 | [diff] [blame] | 121 | writer.Flush() |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 122 | } |
| 123 | |
| 124 | // Ignore sentence boundaries |
| 125 | } else { |
| 126 | tw.SentenceEnd = func(_ int) {} |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 127 | } |
| 128 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 129 | // Write token or sentence positions |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 130 | if flags&(TOKEN_POS|SENTENCE_POS) != 0 { |
Akron | e9431ec | 2021-10-25 21:35:33 +0200 | [diff] [blame] | 131 | tw.TextEnd = func(_ int) { |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 132 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 133 | // Write token positions |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 134 | if flags&TOKEN_POS != 0 { |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 135 | writer.WriteString(strconv.Itoa(pos[0])) |
| 136 | for _, x := range pos[1:] { |
| 137 | writer.WriteByte(' ') |
| 138 | writer.WriteString(strconv.Itoa(x)) |
| 139 | } |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 140 | writer.WriteByte('\n') |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 141 | } |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 142 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 143 | // Write sentence positions |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 144 | if flags&SENTENCE_POS != 0 { |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 145 | writer.WriteString(strconv.Itoa(sent[0])) |
| 146 | for _, x := range sent[1:] { |
| 147 | writer.WriteByte(' ') |
| 148 | writer.WriteString(strconv.Itoa(x)) |
| 149 | } |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 150 | writer.WriteByte('\n') |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 151 | sent = sent[:0] |
| 152 | sentB = true |
| 153 | } |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 154 | |
Akron | 274600e | 2021-11-03 20:09:06 +0100 | [diff] [blame] | 155 | writer.Flush() |
| 156 | |
Akron | 8cc2dd9 | 2021-10-25 19:49:41 +0200 | [diff] [blame] | 157 | posC = 0 |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 158 | pos = pos[:0] |
| 159 | } |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 160 | |
| 161 | // Collect text ends |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 162 | } else { |
| 163 | tw.TextEnd = func(_ int) { |
Akron | e87906b | 2021-11-24 10:39:14 +0100 | [diff] [blame] | 164 | writer.WriteByte('\n') |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 165 | writer.Flush() |
| 166 | } |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 167 | } |
| 168 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 169 | // Flush the writer |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 170 | tw.Flush = func() error { |
| 171 | return writer.Flush() |
| 172 | } |
| 173 | |
| 174 | return tw |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 175 | } |