Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 1 | package datok |
| 2 | |
| 3 | import ( |
| 4 | "bufio" |
| 5 | "io" |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 6 | "strconv" |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 7 | ) |
| 8 | |
// Bits is a bit set of output options understood by
// NewTokenWriterFromOptions.
type Bits uint8

// Output option flags. Every flag occupies its own bit, so several
// flags can be combined with bitwise OR.
const (
	// TOKENS (1) asks for the tokens themselves, one per line.
	TOKENS Bits = 1 << iota
	// SENTENCES (2) asks for a newline at every sentence boundary.
	SENTENCES
	// TOKEN_POS (4) asks for the token positions, written at the end
	// of each text.
	TOKEN_POS
	// SENTENCE_POS (8) asks for the sentence positions, written at the
	// end of each text.
	SENTENCE_POS
	// NEWLINE_AFTER_EOT (16) accepts a newline directly following an
	// end of text; it affects how token positions are counted.
	NEWLINE_AFTER_EOT
)
| 18 | |
// TokenWriter bundles the callbacks that receive the events of a
// tokenization run. The constructors in this file fill in every field.
type TokenWriter struct {
	// SentenceEnd is called at the end of a sentence. The constructors
	// in this file ignore the int argument.
	SentenceEnd func(int)

	// TextEnd is called at the end of a text. The constructors in this
	// file ignore the int argument.
	TextEnd func(int)

	// Flush writes all buffered output to the underlying writer and
	// returns the error of that operation, if any.
	Flush func() error

	// Token is called for every token. The token text is buf[offset:];
	// the first offset runes of buf precede the token and are skipped.
	Token func(int, []rune)
}
| 25 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 26 | func NewTokenWriter(w io.Writer) *TokenWriter { |
| 27 | writer := bufio.NewWriter(w) |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 28 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 29 | return &TokenWriter{ |
| 30 | SentenceEnd: func(_ int) { |
| 31 | writer.WriteRune('\n') |
| 32 | }, |
| 33 | TextEnd: func(_ int) { |
| 34 | writer.WriteRune('\n') |
| 35 | writer.Flush() |
| 36 | }, |
| 37 | Token: func(offset int, buf []rune) { |
| 38 | writer.WriteString(string(buf[offset:])) |
| 39 | writer.WriteRune('\n') |
| 40 | }, |
| 41 | Flush: func() error { |
| 42 | return writer.Flush() |
| 43 | }, |
| 44 | } |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 45 | } |
| 46 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 47 | // Create a new token writer based on the options |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 48 | func NewTokenWriterFromOptions(w io.Writer, flags Bits) *TokenWriter { |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 49 | writer := bufio.NewWriter(w) |
| 50 | posC := 0 |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 51 | pos := make([]int, 0, 1024) |
| 52 | sentB := true |
| 53 | sent := make([]int, 0, 1024) |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 54 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 55 | tw := &TokenWriter{} |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 56 | |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 57 | if flags&TOKEN_POS != 0 { |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 58 | tw.Token = func(offset int, buf []rune) { |
Akron | a854faa | 2021-10-22 19:31:08 +0200 | [diff] [blame] | 59 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 60 | // TODO: |
| 61 | // Store in []uint16 |
| 62 | // and write to string |
Akron | 8cc2dd9 | 2021-10-25 19:49:41 +0200 | [diff] [blame] | 63 | |
Akron | e9431ec | 2021-10-25 21:35:33 +0200 | [diff] [blame] | 64 | // Accept newline after EOT |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 65 | if flags&NEWLINE_AFTER_EOT != 0 && posC == 0 && buf[0] == '\n' && writer.Buffered() != 0 { |
Akron | e9431ec | 2021-10-25 21:35:33 +0200 | [diff] [blame] | 66 | posC-- |
| 67 | } |
| 68 | |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 69 | posC += offset |
| 70 | pos = append(pos, posC) |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 71 | |
| 72 | // Token is the start of a sentence |
| 73 | if sentB { |
| 74 | sentB = false |
| 75 | sent = append(sent, posC) |
| 76 | } |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 77 | posC += len(buf) - offset |
| 78 | pos = append(pos, posC) |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 79 | |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 80 | if flags&TOKENS != 0 { |
Akron | a9e0c42 | 2021-10-27 19:01:17 +0200 | [diff] [blame] | 81 | writer.WriteString(string(buf[offset:])) |
| 82 | writer.WriteRune('\n') |
| 83 | } |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 84 | } |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 85 | |
| 86 | // Only print one token per line |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 87 | } else { |
| 88 | tw.Token = func(offset int, buf []rune) { |
| 89 | writer.WriteString(string(buf[offset:])) |
| 90 | writer.WriteRune('\n') |
| 91 | } |
| 92 | } |
| 93 | |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 94 | // Print sentence boundaries |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 95 | if flags&(SENTENCES|SENTENCE_POS) != 0 { |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 96 | tw.SentenceEnd = func(offset int) { |
| 97 | |
| 98 | // Add end position of last token to sentence boundary |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 99 | // TODO: This only works if token positions are taking into account |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 100 | sent = append(sent, pos[len(pos)-1]) |
| 101 | sentB = true |
| 102 | |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 103 | if flags&SENTENCES != 0 { |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 104 | writer.WriteRune('\n') |
| 105 | } |
| 106 | } |
| 107 | |
| 108 | // Print sentence boundaries as newlines |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 109 | } else if flags&SENTENCES != 0 { |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 110 | tw.SentenceEnd = func(_ int) { |
| 111 | writer.WriteRune('\n') |
| 112 | } |
| 113 | |
| 114 | // Ignore sentence boundaries |
| 115 | } else { |
| 116 | tw.SentenceEnd = func(_ int) {} |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 117 | } |
| 118 | |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 119 | if flags&(TOKEN_POS|SENTENCE_POS) != 0 { |
Akron | e9431ec | 2021-10-25 21:35:33 +0200 | [diff] [blame] | 120 | tw.TextEnd = func(_ int) { |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 121 | writer.Flush() |
| 122 | |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 123 | if flags&TOKEN_POS != 0 { |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 124 | writer.WriteString(strconv.Itoa(pos[0])) |
| 125 | for _, x := range pos[1:] { |
| 126 | writer.WriteByte(' ') |
| 127 | writer.WriteString(strconv.Itoa(x)) |
| 128 | } |
| 129 | writer.WriteRune('\n') |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 130 | } |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 131 | |
Akron | 0f087ea | 2021-10-27 19:40:15 +0200 | [diff] [blame] | 132 | if flags&SENTENCE_POS != 0 { |
Akron | fceddb6 | 2021-10-27 19:27:54 +0200 | [diff] [blame] | 133 | writer.WriteString(strconv.Itoa(sent[0])) |
| 134 | for _, x := range sent[1:] { |
| 135 | writer.WriteByte(' ') |
| 136 | writer.WriteString(strconv.Itoa(x)) |
| 137 | } |
| 138 | writer.WriteRune('\n') |
| 139 | sent = sent[:0] |
| 140 | sentB = true |
| 141 | } |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 142 | |
Akron | 8cc2dd9 | 2021-10-25 19:49:41 +0200 | [diff] [blame] | 143 | posC = 0 |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 144 | pos = pos[:0] |
| 145 | } |
| 146 | } else { |
| 147 | tw.TextEnd = func(_ int) { |
| 148 | writer.WriteRune('\n') |
| 149 | writer.Flush() |
| 150 | } |
| 151 | |
| 152 | } |
| 153 | |
| 154 | tw.Flush = func() error { |
| 155 | return writer.Flush() |
| 156 | } |
| 157 | |
| 158 | return tw |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 159 | } |