Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 1 | package datok |
| 2 | |
| 3 | import ( |
| 4 | "bufio" |
| 5 | "io" |
Akron | 4f6b28c | 2021-10-25 00:52:03 +0200 | [diff] [blame] | 6 | "strconv" |
Akron | e396a93 | 2021-10-19 01:06:13 +0200 | [diff] [blame] | 7 | ) |
| 8 | |
// Bits is a set of flags selecting what a TokenWriter emits.
type Bits uint8

const (
	// TOKENS writes the text of every token on a line of its own.
	TOKENS Bits = 1 << iota
	// SENTENCES writes an additional newline at every sentence end.
	SENTENCES
	// TOKEN_POS writes, at the end of every text, one line holding the
	// space separated start and end positions of all tokens of the text.
	TOKEN_POS
	// SENTENCE_POS writes, at the end of every text, one line holding the
	// space separated start and end positions of all sentences of the text.
	SENTENCE_POS
	// NEWLINE_AFTER_EOT shifts the positions of a text by one if the text
	// follows another text and its first token buffer starts with a newline
	// (presumably the newline directly following the end-of-text marker).
	NEWLINE_AFTER_EOT

	// SIMPLE writes tokens and sentence boundaries, but no positions.
	SIMPLE = TOKENS | SENTENCES
)

// TokenWriter bundles the callbacks a tokenizer invokes to emit its results.
// All fields are set by NewTokenWriter.
type TokenWriter struct {
	// SentenceEnd is called at the end of a sentence. The argument is
	// ignored by the callbacks created by NewTokenWriter.
	SentenceEnd func(int)
	// TextEnd is called at the end of a text. The argument is ignored by
	// the callbacks created by NewTokenWriter.
	TextEnd func(int)
	// Flush writes all buffered output to the underlying writer.
	Flush func() error
	// Token is called for every token with an offset and a rune buffer;
	// buf[offset:] is the surface form of the token.
	Token func(int, []rune)
}

// NewTokenWriter creates a new token writer that writes to w and emits
// the information selected by flags.
//
// Positions are counted in runes relative to the start of the current text.
// They are collected whenever TOKEN_POS or SENTENCE_POS is set, so sentence
// positions can be requested without token positions.
//
// Write errors are not reported by the Token, SentenceEnd and TextEnd
// callbacks. The underlying bufio.Writer keeps the first error and returns
// it from Flush.
func NewTokenWriter(w io.Writer, flags Bits) *TokenWriter {
	writer := bufio.NewWriter(w)
	posC := 0
	pos := make([]int, 0, 1024)
	sentB := true
	sent := make([]int, 0, 1024)

	// afterEOT is true as soon as at least one text has ended.
	afterEOT := false

	tw := &TokenWriter{}

	// writeLine writes xs as one line of space separated integers.
	// An empty xs yields an empty line, so every text contributes
	// exactly one line per requested kind of position.
	writeLine := func(xs []int) {
		for i, x := range xs {
			if i > 0 {
				writer.WriteByte(' ')
			}
			writer.WriteString(strconv.Itoa(x))
		}
		writer.WriteRune('\n')
	}

	switch {

	// Collect positions and maybe print tokens
	case flags&(TOKEN_POS|SENTENCE_POS) != 0:
		tw.Token = func(offset int, buf []rune) {

			// Accept newline after EOT
			if flags&NEWLINE_AFTER_EOT != 0 && afterEOT && posC == 0 &&
				len(buf) > 0 && buf[0] == '\n' {
				posC--
			}

			// Start position of the token
			posC += offset
			pos = append(pos, posC)

			// Token is the start of a sentence
			if sentB {
				sentB = false
				sent = append(sent, posC)
			}

			// End position of the token
			posC += len(buf) - offset
			pos = append(pos, posC)

			if flags&TOKENS != 0 {
				writer.WriteString(string(buf[offset:]))
				writer.WriteRune('\n')
			}
		}

	// Only print one token per line
	case flags&TOKENS != 0:
		tw.Token = func(offset int, buf []rune) {
			writer.WriteString(string(buf[offset:]))
			writer.WriteRune('\n')
		}

	// Ignore tokens
	default:
		tw.Token = func(_ int, _ []rune) {}
	}

	switch {

	// Collect sentence boundaries
	case flags&SENTENCE_POS != 0:
		tw.SentenceEnd = func(_ int) {

			// Add end position of last token to sentence boundary.
			// Without any token in the text there is nothing to close.
			if len(pos) > 0 {
				sent = append(sent, pos[len(pos)-1])
			}
			sentB = true

			if flags&SENTENCES != 0 {
				writer.WriteRune('\n')
			}
		}

	// Print sentence boundaries as newlines
	case flags&SENTENCES != 0:
		tw.SentenceEnd = func(_ int) {
			writer.WriteRune('\n')
		}

	// Ignore sentence boundaries
	default:
		tw.SentenceEnd = func(_ int) {}
	}

	if flags&(TOKEN_POS|SENTENCE_POS) != 0 {
		tw.TextEnd = func(_ int) {
			// Flush the tokens of the text; the position lines written
			// below stay buffered until the next flush.
			writer.Flush()

			if flags&TOKEN_POS != 0 {
				writeLine(pos)
			}

			if flags&SENTENCE_POS != 0 {
				writeLine(sent)
			}

			// Reset the state for the next text
			sent = sent[:0]
			sentB = true
			posC = 0
			pos = pos[:0]
			afterEOT = true
		}
	} else {
		tw.TextEnd = func(_ int) {
			writer.WriteRune('\n')
			writer.Flush()
		}
	}

	tw.Flush = func() error {
		return writer.Flush()
	}

	return tw
}