| Leo Repp | 58b9f11 | 2021-11-22 11:57:47 +0100 | [diff] [blame^] | 1 | var util = require('./util'); |
| 2 | var types = require('./types'); |
| 3 | var sets = require('./sets'); |
| 4 | var positions = require('./positions'); |
| 5 | |
| 6 | |
| 7 | module.exports = function(regexpStr) { |
| 8 | var i = 0, l, c, |
| 9 | start = { type: types.ROOT, stack: []}, |
| 10 | |
| 11 | // Keep track of last clause/group and stack. |
| 12 | lastGroup = start, |
| 13 | last = start.stack, |
| 14 | groupStack = []; |
| 15 | |
| 16 | |
| 17 | var repeatErr = function(i) { |
| 18 | util.error(regexpStr, 'Nothing to repeat at column ' + (i - 1)); |
| 19 | }; |
| 20 | |
| 21 | // Decode a few escaped characters. |
| 22 | var str = util.strToChars(regexpStr); |
| 23 | l = str.length; |
| 24 | |
| 25 | // Iterate through each character in string. |
| 26 | while (i < l) { |
| 27 | c = str[i++]; |
| 28 | |
| 29 | switch (c) { |
| 30 | // Handle escaped characters, inclues a few sets. |
| 31 | case '\\': |
| 32 | c = str[i++]; |
| 33 | |
| 34 | switch (c) { |
| 35 | case 'b': |
| 36 | last.push(positions.wordBoundary()); |
| 37 | break; |
| 38 | |
| 39 | case 'B': |
| 40 | last.push(positions.nonWordBoundary()); |
| 41 | break; |
| 42 | |
| 43 | case 'w': |
| 44 | last.push(sets.words()); |
| 45 | break; |
| 46 | |
| 47 | case 'W': |
| 48 | last.push(sets.notWords()); |
| 49 | break; |
| 50 | |
| 51 | case 'd': |
| 52 | last.push(sets.ints()); |
| 53 | break; |
| 54 | |
| 55 | case 'D': |
| 56 | last.push(sets.notInts()); |
| 57 | break; |
| 58 | |
| 59 | case 's': |
| 60 | last.push(sets.whitespace()); |
| 61 | break; |
| 62 | |
| 63 | case 'S': |
| 64 | last.push(sets.notWhitespace()); |
| 65 | break; |
| 66 | |
| 67 | default: |
| 68 | // Check if c is integer. |
| 69 | // In which case it's a reference. |
| 70 | if (/\d/.test(c)) { |
| 71 | last.push({ type: types.REFERENCE, value: parseInt(c, 10) }); |
| 72 | |
| 73 | // Escaped character. |
| 74 | } else { |
| 75 | last.push({ type: types.CHAR, value: c.charCodeAt(0) }); |
| 76 | } |
| 77 | } |
| 78 | |
| 79 | break; |
| 80 | |
| 81 | |
| 82 | // Positionals. |
| 83 | case '^': |
| 84 | last.push(positions.begin()); |
| 85 | break; |
| 86 | |
| 87 | case '$': |
| 88 | last.push(positions.end()); |
| 89 | break; |
| 90 | |
| 91 | |
| 92 | // Handle custom sets. |
| 93 | case '[': |
| 94 | // Check if this class is 'anti' i.e. [^abc]. |
| 95 | var not; |
| 96 | if (str[i] === '^') { |
| 97 | not = true; |
| 98 | i++; |
| 99 | } else { |
| 100 | not = false; |
| 101 | } |
| 102 | |
| 103 | // Get all the characters in class. |
| 104 | var classTokens = util.tokenizeClass(str.slice(i), regexpStr); |
| 105 | |
| 106 | // Increase index by length of class. |
| 107 | i += classTokens[1]; |
| 108 | last.push({ |
| 109 | type: types.SET, |
| 110 | set: classTokens[0], |
| 111 | not: not, |
| 112 | }); |
| 113 | |
| 114 | break; |
| 115 | |
| 116 | |
| 117 | // Class of any character except \n. |
| 118 | case '.': |
| 119 | last.push(sets.anyChar()); |
| 120 | break; |
| 121 | |
| 122 | |
| 123 | // Push group onto stack. |
| 124 | case '(': |
| 125 | // Create group. |
| 126 | var group = { |
| 127 | type: types.GROUP, |
| 128 | stack: [], |
| 129 | remember: true, |
| 130 | }; |
| 131 | |
| 132 | c = str[i]; |
| 133 | |
| 134 | // If if this is a special kind of group. |
| 135 | if (c === '?') { |
| 136 | c = str[i + 1]; |
| 137 | i += 2; |
| 138 | |
| 139 | // Match if followed by. |
| 140 | if (c === '=') { |
| 141 | group.followedBy = true; |
| 142 | |
| 143 | // Match if not followed by. |
| 144 | } else if (c === '!') { |
| 145 | group.notFollowedBy = true; |
| 146 | |
| 147 | } else if (c !== ':') { |
| 148 | util.error(regexpStr, |
| 149 | 'Invalid group, character \'' + c + |
| 150 | '\' after \'?\' at column ' + (i - 1)); |
| 151 | } |
| 152 | |
| 153 | group.remember = false; |
| 154 | } |
| 155 | |
| 156 | // Insert subgroup into current group stack. |
| 157 | last.push(group); |
| 158 | |
| 159 | // Remember the current group for when the group closes. |
| 160 | groupStack.push(lastGroup); |
| 161 | |
| 162 | // Make this new group the current group. |
| 163 | lastGroup = group; |
| 164 | last = group.stack; |
| 165 | break; |
| 166 | |
| 167 | |
| 168 | // Pop group out of stack. |
| 169 | case ')': |
| 170 | if (groupStack.length === 0) { |
| 171 | util.error(regexpStr, 'Unmatched ) at column ' + (i - 1)); |
| 172 | } |
| 173 | lastGroup = groupStack.pop(); |
| 174 | |
| 175 | // Check if this group has a PIPE. |
| 176 | // To get back the correct last stack. |
| 177 | last = lastGroup.options ? |
| 178 | lastGroup.options[lastGroup.options.length - 1] : lastGroup.stack; |
| 179 | break; |
| 180 | |
| 181 | |
| 182 | // Use pipe character to give more choices. |
| 183 | case '|': |
| 184 | // Create array where options are if this is the first PIPE |
| 185 | // in this clause. |
| 186 | if (!lastGroup.options) { |
| 187 | lastGroup.options = [lastGroup.stack]; |
| 188 | delete lastGroup.stack; |
| 189 | } |
| 190 | |
| 191 | // Create a new stack and add to options for rest of clause. |
| 192 | var stack = []; |
| 193 | lastGroup.options.push(stack); |
| 194 | last = stack; |
| 195 | break; |
| 196 | |
| 197 | |
| 198 | // Repetition. |
| 199 | // For every repetition, remove last element from last stack |
| 200 | // then insert back a RANGE object. |
| 201 | // This design is chosen because there could be more than |
| 202 | // one repetition symbols in a regex i.e. `a?+{2,3}`. |
| 203 | case '{': |
| 204 | var rs = /^(\d+)(,(\d+)?)?\}/.exec(str.slice(i)), min, max; |
| 205 | if (rs !== null) { |
| 206 | if (last.length === 0) { |
| 207 | repeatErr(i); |
| 208 | } |
| 209 | min = parseInt(rs[1], 10); |
| 210 | max = rs[2] ? rs[3] ? parseInt(rs[3], 10) : Infinity : min; |
| 211 | i += rs[0].length; |
| 212 | |
| 213 | last.push({ |
| 214 | type: types.REPETITION, |
| 215 | min: min, |
| 216 | max: max, |
| 217 | value: last.pop(), |
| 218 | }); |
| 219 | } else { |
| 220 | last.push({ |
| 221 | type: types.CHAR, |
| 222 | value: 123, |
| 223 | }); |
| 224 | } |
| 225 | break; |
| 226 | |
| 227 | case '?': |
| 228 | if (last.length === 0) { |
| 229 | repeatErr(i); |
| 230 | } |
| 231 | last.push({ |
| 232 | type: types.REPETITION, |
| 233 | min: 0, |
| 234 | max: 1, |
| 235 | value: last.pop(), |
| 236 | }); |
| 237 | break; |
| 238 | |
| 239 | case '+': |
| 240 | if (last.length === 0) { |
| 241 | repeatErr(i); |
| 242 | } |
| 243 | last.push({ |
| 244 | type: types.REPETITION, |
| 245 | min: 1, |
| 246 | max: Infinity, |
| 247 | value: last.pop(), |
| 248 | }); |
| 249 | break; |
| 250 | |
| 251 | case '*': |
| 252 | if (last.length === 0) { |
| 253 | repeatErr(i); |
| 254 | } |
| 255 | last.push({ |
| 256 | type: types.REPETITION, |
| 257 | min: 0, |
| 258 | max: Infinity, |
| 259 | value: last.pop(), |
| 260 | }); |
| 261 | break; |
| 262 | |
| 263 | |
| 264 | // Default is a character that is not `\[](){}?+*^$`. |
| 265 | default: |
| 266 | last.push({ |
| 267 | type: types.CHAR, |
| 268 | value: c.charCodeAt(0), |
| 269 | }); |
| 270 | } |
| 271 | |
| 272 | } |
| 273 | |
| 274 | // Check if any groups have not been closed. |
| 275 | if (groupStack.length !== 0) { |
| 276 | util.error(regexpStr, 'Unterminated group'); |
| 277 | } |
| 278 | |
| 279 | return start; |
| 280 | }; |
| 281 | |
| 282 | module.exports.types = types; |