Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 1 | package datok |
| 2 | |
| 3 | import ( |
| 4 | "bufio" |
| 5 | "io" |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 6 | "strconv" |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 7 | ) |
| 8 | |
// Bits is a set of output options for NewTokenWriter. Options are single
// bits and are combined with bitwise OR.
type Bits uint8

const (
	// TOKENS makes the writer emit the surface form of every token.
	TOKENS Bits = 1 << 0
	// SENTENCES makes the writer emit a newline at every sentence end.
	SENTENCES Bits = 1 << 1
	// TOKEN_POS makes the writer emit token positions at every text end.
	TOKEN_POS Bits = 1 << 2
	// SENTENCE_POS makes the writer emit sentence positions at every text end.
	SENTENCE_POS Bits = 1 << 3
	// NEWLINE_AFTER_EOT makes the writer ignore, for position counting, a
	// newline that directly follows the end of a text.
	NEWLINE_AFTER_EOT Bits = 1 << 4

	// SIMPLE is the combination of TOKENS and SENTENCES.
	SIMPLE Bits = TOKENS | SENTENCES
)
| 20 | |
// TokenWriter bundles the callbacks that receive tokenization events.
// NewTokenWriter fills in every field, so none of them is nil on a writer
// created that way.
type TokenWriter struct {
	// SentenceEnd is called at the end of a sentence. The callbacks built
	// by NewTokenWriter ignore the argument.
	SentenceEnd func(int)

	// TextEnd is called at the end of a text. The callbacks built by
	// NewTokenWriter ignore the argument.
	TextEnd func(int)

	// Flush writes all buffered output to the underlying writer.
	Flush func() error

	// Token is called for every token. The token itself is buf[offset:];
	// the runes before offset precede the token in the buffer.
	Token func(offset int, buf []rune)

	// Fail func(int) // currently disabled
}
| 28 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 29 | // Create a new token writer based on the options |
Akron | 96fdc9b | 2021-10-27 21:11:17 +0200 | [diff] [blame] | 30 | func NewTokenWriter(w io.Writer, flags Bits) *TokenWriter { |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 31 | writer := bufio.NewWriter(w) |
| 32 | posC := 0 |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 33 | pos := make([]int, 0, 1024) |
| 34 | sentB := true |
| 35 | sent := make([]int, 0, 1024) |
Akron | 4a6e0ff | 2021-11-04 00:15:54 +0100 | [diff] [blame] | 36 | init := true |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 37 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 38 | tw := &TokenWriter{} |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 39 | |
Akron | df27581 | 2022-03-27 12:54:46 +0200 | [diff] [blame] | 40 | // tw.Fail = func(_ int) {} |
| 41 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 42 | // Collect token positions and maybe tokens |
| 43 | if flags&(TOKEN_POS|SENTENCE_POS) != 0 { |
| 44 | |
| 45 | // TODO: |
| 46 | // Split to |
| 47 | // - Token_pos+Tokens+Newline |
| 48 | // - Token_pos+Newline |
| 49 | // - Token_pos|Sentence_pos |
| 50 | // - Sentence_pos |
| 51 | // - Tokens |
| 52 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 53 | tw.Token = func(offset int, buf []rune) { |
Akron | a854faa | 2021-10-22 19:31:08 +0200 | [diff] [blame] | 54 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 55 | // TODO: |
| 56 | // Store in []uint16 |
| 57 | // and write to string |
Akron | 8cc2dd9 | 2021-10-25 19:49:41 +0200 | [diff] [blame] | 58 | |
Akron | e9431ec | 2021-10-25 21:35:33 +0200 | [diff] [blame] | 59 | // Accept newline after EOT |
Akron | e87906b | 2021-11-24 10:39:14 +0100 | [diff] [blame] | 60 | if posC == 0 && flags&NEWLINE_AFTER_EOT != 0 && buf[0] == '\n' && !init { |
Akron | e9431ec | 2021-10-25 21:35:33 +0200 | [diff] [blame] | 61 | posC-- |
| 62 | } |
| 63 | |
Akron | 4a6e0ff | 2021-11-04 00:15:54 +0100 | [diff] [blame] | 64 | init = false |
| 65 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 66 | posC += offset |
| 67 | pos = append(pos, posC) |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 68 | |
| 69 | // Token is the start of a sentence |
| 70 | if sentB { |
| 71 | sentB = false |
| 72 | sent = append(sent, posC) |
| 73 | } |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 74 | posC += len(buf) - offset |
| 75 | pos = append(pos, posC) |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 76 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 77 | // Collect tokens also |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 78 | if flags&TOKENS != 0 { |
Akron | a9e0c42 | 2021-10-27 19:01:17 +0200 | [diff] [blame] | 79 | writer.WriteString(string(buf[offset:])) |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 80 | writer.WriteByte('\n') |
Akron | a9e0c42 | 2021-10-27 19:01:17 +0200 | [diff] [blame] | 81 | } |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 82 | } |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 83 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 84 | // Collect tokens |
Akron | 96fdc9b | 2021-10-27 21:11:17 +0200 | [diff] [blame] | 85 | } else if flags&TOKENS != 0 { |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 86 | tw.Token = func(offset int, buf []rune) { |
| 87 | writer.WriteString(string(buf[offset:])) |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 88 | writer.WriteByte('\n') |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 89 | } |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 90 | |
| 91 | // Ignore tokens |
Akron | 96fdc9b | 2021-10-27 21:11:17 +0200 | [diff] [blame] | 92 | } else { |
| 93 | tw.Token = func(_ int, _ []rune) {} |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 94 | } |
| 95 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 96 | // Collect sentence positions and maybe sentence boundaries |
Akron | 96fdc9b | 2021-10-27 21:11:17 +0200 | [diff] [blame] | 97 | if flags&SENTENCE_POS != 0 { |
Akron | 9fb63af | 2021-10-28 01:15:53 +0200 | [diff] [blame] | 98 | tw.SentenceEnd = func(_ int) { |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 99 | |
| 100 | // Add end position of last token to sentence boundary |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 101 | // TODO: This only works if token positions are taking into account |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 102 | sent = append(sent, pos[len(pos)-1]) |
| 103 | sentB = true |
| 104 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 105 | // Collect sentences also |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 106 | if flags&SENTENCES != 0 { |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 107 | writer.WriteByte('\n') |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 108 | } |
| 109 | } |
| 110 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 111 | // Collect sentence boundaries |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 112 | } else if flags&SENTENCES != 0 { |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 113 | tw.SentenceEnd = func(_ int) { |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 114 | writer.WriteByte('\n') |
Akron | 274600e | 2021-11-03 20:09:06 +0100 | [diff] [blame] | 115 | writer.Flush() |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 116 | } |
| 117 | |
| 118 | // Ignore sentence boundaries |
| 119 | } else { |
| 120 | tw.SentenceEnd = func(_ int) {} |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 121 | } |
| 122 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 123 | // Write token or sentence positions |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 124 | if flags&(TOKEN_POS|SENTENCE_POS) != 0 { |
Akron | e9431ec | 2021-10-25 21:35:33 +0200 | [diff] [blame] | 125 | tw.TextEnd = func(_ int) { |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 126 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 127 | // Write token positions |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 128 | if flags&TOKEN_POS != 0 { |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 129 | writer.WriteString(strconv.Itoa(pos[0])) |
| 130 | for _, x := range pos[1:] { |
| 131 | writer.WriteByte(' ') |
| 132 | writer.WriteString(strconv.Itoa(x)) |
| 133 | } |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 134 | writer.WriteByte('\n') |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 135 | } |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 136 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 137 | // Write sentence positions |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 138 | if flags&SENTENCE_POS != 0 { |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 139 | writer.WriteString(strconv.Itoa(sent[0])) |
| 140 | for _, x := range sent[1:] { |
| 141 | writer.WriteByte(' ') |
| 142 | writer.WriteString(strconv.Itoa(x)) |
| 143 | } |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 144 | writer.WriteByte('\n') |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 145 | sent = sent[:0] |
| 146 | sentB = true |
| 147 | } |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 148 | |
Akron | 274600e | 2021-11-03 20:09:06 +0100 | [diff] [blame] | 149 | writer.Flush() |
| 150 | |
Akron | 8cc2dd9 | 2021-10-25 19:49:41 +0200 | [diff] [blame] | 151 | posC = 0 |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 152 | pos = pos[:0] |
| 153 | } |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 154 | |
| 155 | // Collect text ends |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 156 | } else { |
| 157 | tw.TextEnd = func(_ int) { |
Akron | e87906b | 2021-11-24 10:39:14 +0100 | [diff] [blame] | 158 | writer.WriteByte('\n') |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 159 | writer.Flush() |
| 160 | } |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 161 | } |
| 162 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 163 | // Flush the writer |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 164 | tw.Flush = func() error { |
| 165 | return writer.Flush() |
| 166 | } |
| 167 | |
| 168 | return tw |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 169 | } |