Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 1 | package datok |
| 2 | |
| 3 | import ( |
| 4 | "bufio" |
| 5 | "io" |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 6 | "strconv" |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 7 | ) |
| 8 | |
// Bits is a set of output options for a TokenWriter, combined with
// bitwise OR and passed to NewTokenWriter.
type Bits uint8

// Output options. Each option occupies its own bit, so any combination
// can be expressed in a single Bits value.
const (
	// TOKENS requests that the token surfaces are written.
	TOKENS Bits = 1 << iota

	// SENTENCES requests that sentence boundaries are written.
	SENTENCES

	// TOKEN_POS requests that token positions are written.
	TOKEN_POS

	// SENTENCE_POS requests that sentence positions are written.
	SENTENCE_POS

	// NEWLINE_AFTER_EOT makes the position counter accept a newline
	// that directly follows the end of a text.
	NEWLINE_AFTER_EOT

	// SIMPLE is the combination of TOKENS and SENTENCES.
	SIMPLE Bits = TOKENS | SENTENCES
)
| 20 | |
// TokenWriter is a bundle of callbacks that receive tokenization events.
// NewTokenWriter fills in every field, so a writer obtained from it can
// be called without nil checks.
type TokenWriter struct {
	// SentenceEnd handles the end of a sentence. The int argument is
	// ignored by every writer built by NewTokenWriter.
	SentenceEnd func(int)

	// TextEnd handles the end of a text. The int argument is ignored by
	// every writer built by NewTokenWriter.
	TextEnd func(int)

	// Flush flushes buffered output and reports any write error.
	Flush func() error

	// Token handles a single token. buf[offset:] is the token surface;
	// the first offset runes of buf precede the token.
	Token func(offset int, buf []rune)
}
| 27 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 28 | // Create a new token writer based on the options |
Akron | 96fdc9b | 2021-10-27 21:11:17 +0200 | [diff] [blame] | 29 | func NewTokenWriter(w io.Writer, flags Bits) *TokenWriter { |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 30 | writer := bufio.NewWriter(w) |
| 31 | posC := 0 |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 32 | pos := make([]int, 0, 1024) |
| 33 | sentB := true |
| 34 | sent := make([]int, 0, 1024) |
Akron | 4a6e0ff | 2021-11-04 00:15:54 +0100 | [diff] [blame] | 35 | init := true |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 36 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 37 | tw := &TokenWriter{} |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 38 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 39 | // Collect token positions and maybe tokens |
| 40 | if flags&(TOKEN_POS|SENTENCE_POS) != 0 { |
| 41 | |
| 42 | // TODO: |
| 43 | // Split to |
| 44 | // - Token_pos+Tokens+Newline |
| 45 | // - Token_pos+Newline |
| 46 | // - Token_pos|Sentence_pos |
| 47 | // - Sentence_pos |
| 48 | // - Tokens |
| 49 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 50 | tw.Token = func(offset int, buf []rune) { |
Akron | a854faa | 2021-10-22 19:31:08 +0200 | [diff] [blame] | 51 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 52 | // TODO: |
| 53 | // Store in []uint16 |
| 54 | // and write to string |
Akron | 8cc2dd9 | 2021-10-25 19:49:41 +0200 | [diff] [blame] | 55 | |
Akron | e9431ec | 2021-10-25 21:35:33 +0200 | [diff] [blame] | 56 | // Accept newline after EOT |
Akron | e87906b | 2021-11-24 10:39:14 +0100 | [diff] [blame] | 57 | if posC == 0 && flags&NEWLINE_AFTER_EOT != 0 && buf[0] == '\n' && !init { |
Akron | e9431ec | 2021-10-25 21:35:33 +0200 | [diff] [blame] | 58 | posC-- |
| 59 | } |
| 60 | |
Akron | 4a6e0ff | 2021-11-04 00:15:54 +0100 | [diff] [blame] | 61 | init = false |
| 62 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 63 | posC += offset |
| 64 | pos = append(pos, posC) |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 65 | |
| 66 | // Token is the start of a sentence |
| 67 | if sentB { |
| 68 | sentB = false |
| 69 | sent = append(sent, posC) |
| 70 | } |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 71 | posC += len(buf) - offset |
| 72 | pos = append(pos, posC) |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 73 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 74 | // Collect tokens also |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 75 | if flags&TOKENS != 0 { |
Akron | a9e0c42 | 2021-10-27 19:01:17 +0200 | [diff] [blame] | 76 | writer.WriteString(string(buf[offset:])) |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 77 | writer.WriteByte('\n') |
Akron | a9e0c42 | 2021-10-27 19:01:17 +0200 | [diff] [blame] | 78 | } |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 79 | } |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 80 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 81 | // Collect tokens |
Akron | 96fdc9b | 2021-10-27 21:11:17 +0200 | [diff] [blame] | 82 | } else if flags&TOKENS != 0 { |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 83 | tw.Token = func(offset int, buf []rune) { |
| 84 | writer.WriteString(string(buf[offset:])) |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 85 | writer.WriteByte('\n') |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 86 | } |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 87 | |
| 88 | // Ignore tokens |
Akron | 96fdc9b | 2021-10-27 21:11:17 +0200 | [diff] [blame] | 89 | } else { |
| 90 | tw.Token = func(_ int, _ []rune) {} |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 91 | } |
| 92 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 93 | // Collect sentence positions and maybe sentence boundaries |
Akron | 96fdc9b | 2021-10-27 21:11:17 +0200 | [diff] [blame] | 94 | if flags&SENTENCE_POS != 0 { |
Akron | 9fb63af | 2021-10-28 01:15:53 +0200 | [diff] [blame] | 95 | tw.SentenceEnd = func(_ int) { |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 96 | |
| 97 | // Add end position of last token to sentence boundary |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 98 | // TODO: This only works if token positions are taking into account |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 99 | sent = append(sent, pos[len(pos)-1]) |
| 100 | sentB = true |
| 101 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 102 | // Collect sentences also |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 103 | if flags&SENTENCES != 0 { |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 104 | writer.WriteByte('\n') |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 105 | } |
| 106 | } |
| 107 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 108 | // Collect sentence boundaries |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 109 | } else if flags&SENTENCES != 0 { |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 110 | tw.SentenceEnd = func(_ int) { |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 111 | writer.WriteByte('\n') |
Akron | 274600e | 2021-11-03 20:09:06 +0100 | [diff] [blame] | 112 | writer.Flush() |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 113 | } |
| 114 | |
| 115 | // Ignore sentence boundaries |
| 116 | } else { |
| 117 | tw.SentenceEnd = func(_ int) {} |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 118 | } |
| 119 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 120 | // Write token or sentence positions |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 121 | if flags&(TOKEN_POS|SENTENCE_POS) != 0 { |
Akron | e9431ec | 2021-10-25 21:35:33 +0200 | [diff] [blame] | 122 | tw.TextEnd = func(_ int) { |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 123 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 124 | // Write token positions |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 125 | if flags&TOKEN_POS != 0 { |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 126 | writer.WriteString(strconv.Itoa(pos[0])) |
| 127 | for _, x := range pos[1:] { |
| 128 | writer.WriteByte(' ') |
| 129 | writer.WriteString(strconv.Itoa(x)) |
| 130 | } |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 131 | writer.WriteByte('\n') |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 132 | } |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 133 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 134 | // Write sentence positions |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 135 | if flags&SENTENCE_POS != 0 { |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 136 | writer.WriteString(strconv.Itoa(sent[0])) |
| 137 | for _, x := range sent[1:] { |
| 138 | writer.WriteByte(' ') |
| 139 | writer.WriteString(strconv.Itoa(x)) |
| 140 | } |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 141 | writer.WriteByte('\n') |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 142 | sent = sent[:0] |
| 143 | sentB = true |
| 144 | } |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 145 | |
Akron | 274600e | 2021-11-03 20:09:06 +0100 | [diff] [blame] | 146 | writer.Flush() |
| 147 | |
Akron | 8cc2dd9 | 2021-10-25 19:49:41 +0200 | [diff] [blame] | 148 | posC = 0 |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 149 | pos = pos[:0] |
| 150 | } |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 151 | |
| 152 | // Collect text ends |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 153 | } else { |
| 154 | tw.TextEnd = func(_ int) { |
Akron | e87906b | 2021-11-24 10:39:14 +0100 | [diff] [blame] | 155 | writer.WriteByte('\n') |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 156 | writer.Flush() |
| 157 | } |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 158 | } |
| 159 | |
Akron | 7035d2e | 2021-10-28 00:54:01 +0200 | [diff] [blame] | 160 | // Flush the writer |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 161 | tw.Flush = func() error { |
| 162 | return writer.Flush() |
| 163 | } |
| 164 | |
| 165 | return tw |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 166 | } |