ast.js 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588
  1. // parse a single path portion
  2. import { parseClass } from './brace-expressions.js';
  3. import { unescape } from './unescape.js';
  4. const types = new Set(['!', '?', '+', '*', '@']);
  5. const isExtglobType = (c) => types.has(c);
  6. // Patterns that get prepended to bind to the start of either the
  7. // entire string, or just a single path portion, to prevent dots
  8. // and/or traversal patterns, when needed.
  9. // Exts don't need the ^ or / bit, because the root binds that already.
  10. const startNoTraversal = '(?!(?:^|/)\\.\\.?(?:$|/))';
  11. const startNoDot = '(?!\\.)';
  12. // characters that indicate a start of pattern needs the "no dots" bit,
  13. // because a dot *might* be matched. ( is not in the list, because in
  14. // the case of a child extglob, it will handle the prevention itself.
  15. const addPatternStart = new Set(['[', '.']);
  16. // cases where traversal is A-OK, no dot prevention needed
  17. const justDots = new Set(['..', '.']);
  18. const reSpecials = new Set('().*{}+?[]^$\\!');
  19. const regExpEscape = (s) => s.replace(/[-[\]{}()*+?.,\\^$|#\s]/g, '\\$&');
  20. // any single thing other than /
  21. const qmark = '[^/]';
  22. // * => any number of characters
  23. const star = qmark + '*?';
  24. // use + when we need to ensure that *something* matches, because the * is
  25. // the only thing in the path portion.
  26. const starNoEmpty = qmark + '+?';
  27. // remove the \ chars that we added if we end up doing a nonmagic compare
  28. // const deslash = (s: string) => s.replace(/\\(.)/g, '$1')
  29. export class AST {
  30. type;
  31. #root;
  32. #hasMagic;
  33. #uflag = false;
  34. #parts = [];
  35. #parent;
  36. #parentIndex;
  37. #negs;
  38. #filledNegs = false;
  39. #options;
  40. #toString;
  41. // set to true if it's an extglob with no children
  42. // (which really means one child of '')
  43. #emptyExt = false;
  44. constructor(type, parent, options = {}) {
  45. this.type = type;
  46. // extglobs are inherently magical
  47. if (type)
  48. this.#hasMagic = true;
  49. this.#parent = parent;
  50. this.#root = this.#parent ? this.#parent.#root : this;
  51. this.#options = this.#root === this ? options : this.#root.#options;
  52. this.#negs = this.#root === this ? [] : this.#root.#negs;
  53. if (type === '!' && !this.#root.#filledNegs)
  54. this.#negs.push(this);
  55. this.#parentIndex = this.#parent ? this.#parent.#parts.length : 0;
  56. }
  57. get hasMagic() {
  58. /* c8 ignore start */
  59. if (this.#hasMagic !== undefined)
  60. return this.#hasMagic;
  61. /* c8 ignore stop */
  62. for (const p of this.#parts) {
  63. if (typeof p === 'string')
  64. continue;
  65. if (p.type || p.hasMagic)
  66. return (this.#hasMagic = true);
  67. }
  68. // note: will be undefined until we generate the regexp src and find out
  69. return this.#hasMagic;
  70. }
  71. // reconstructs the pattern
  72. toString() {
  73. if (this.#toString !== undefined)
  74. return this.#toString;
  75. if (!this.type) {
  76. return (this.#toString = this.#parts.map(p => String(p)).join(''));
  77. }
  78. else {
  79. return (this.#toString =
  80. this.type + '(' + this.#parts.map(p => String(p)).join('|') + ')');
  81. }
  82. }
  83. #fillNegs() {
  84. /* c8 ignore start */
  85. if (this !== this.#root)
  86. throw new Error('should only call on root');
  87. if (this.#filledNegs)
  88. return this;
  89. /* c8 ignore stop */
  90. // call toString() once to fill this out
  91. this.toString();
  92. this.#filledNegs = true;
  93. let n;
  94. while ((n = this.#negs.pop())) {
  95. if (n.type !== '!')
  96. continue;
  97. // walk up the tree, appending everthing that comes AFTER parentIndex
  98. let p = n;
  99. let pp = p.#parent;
  100. while (pp) {
  101. for (let i = p.#parentIndex + 1; !pp.type && i < pp.#parts.length; i++) {
  102. for (const part of n.#parts) {
  103. /* c8 ignore start */
  104. if (typeof part === 'string') {
  105. throw new Error('string part in extglob AST??');
  106. }
  107. /* c8 ignore stop */
  108. part.copyIn(pp.#parts[i]);
  109. }
  110. }
  111. p = pp;
  112. pp = p.#parent;
  113. }
  114. }
  115. return this;
  116. }
  117. push(...parts) {
  118. for (const p of parts) {
  119. if (p === '')
  120. continue;
  121. /* c8 ignore start */
  122. if (typeof p !== 'string' && !(p instanceof AST && p.#parent === this)) {
  123. throw new Error('invalid part: ' + p);
  124. }
  125. /* c8 ignore stop */
  126. this.#parts.push(p);
  127. }
  128. }
  129. toJSON() {
  130. const ret = this.type === null
  131. ? this.#parts.slice().map(p => (typeof p === 'string' ? p : p.toJSON()))
  132. : [this.type, ...this.#parts.map(p => p.toJSON())];
  133. if (this.isStart() && !this.type)
  134. ret.unshift([]);
  135. if (this.isEnd() &&
  136. (this === this.#root ||
  137. (this.#root.#filledNegs && this.#parent?.type === '!'))) {
  138. ret.push({});
  139. }
  140. return ret;
  141. }
  142. isStart() {
  143. if (this.#root === this)
  144. return true;
  145. // if (this.type) return !!this.#parent?.isStart()
  146. if (!this.#parent?.isStart())
  147. return false;
  148. if (this.#parentIndex === 0)
  149. return true;
  150. // if everything AHEAD of this is a negation, then it's still the "start"
  151. const p = this.#parent;
  152. for (let i = 0; i < this.#parentIndex; i++) {
  153. const pp = p.#parts[i];
  154. if (!(pp instanceof AST && pp.type === '!')) {
  155. return false;
  156. }
  157. }
  158. return true;
  159. }
  160. isEnd() {
  161. if (this.#root === this)
  162. return true;
  163. if (this.#parent?.type === '!')
  164. return true;
  165. if (!this.#parent?.isEnd())
  166. return false;
  167. if (!this.type)
  168. return this.#parent?.isEnd();
  169. // if not root, it'll always have a parent
  170. /* c8 ignore start */
  171. const pl = this.#parent ? this.#parent.#parts.length : 0;
  172. /* c8 ignore stop */
  173. return this.#parentIndex === pl - 1;
  174. }
  175. copyIn(part) {
  176. if (typeof part === 'string')
  177. this.push(part);
  178. else
  179. this.push(part.clone(this));
  180. }
  181. clone(parent) {
  182. const c = new AST(this.type, parent);
  183. for (const p of this.#parts) {
  184. c.copyIn(p);
  185. }
  186. return c;
  187. }
  188. static #parseAST(str, ast, pos, opt) {
  189. let escaping = false;
  190. let inBrace = false;
  191. let braceStart = -1;
  192. let braceNeg = false;
  193. if (ast.type === null) {
  194. // outside of a extglob, append until we find a start
  195. let i = pos;
  196. let acc = '';
  197. while (i < str.length) {
  198. const c = str.charAt(i++);
  199. // still accumulate escapes at this point, but we do ignore
  200. // starts that are escaped
  201. if (escaping || c === '\\') {
  202. escaping = !escaping;
  203. acc += c;
  204. continue;
  205. }
  206. if (inBrace) {
  207. if (i === braceStart + 1) {
  208. if (c === '^' || c === '!') {
  209. braceNeg = true;
  210. }
  211. }
  212. else if (c === ']' && !(i === braceStart + 2 && braceNeg)) {
  213. inBrace = false;
  214. }
  215. acc += c;
  216. continue;
  217. }
  218. else if (c === '[') {
  219. inBrace = true;
  220. braceStart = i;
  221. braceNeg = false;
  222. acc += c;
  223. continue;
  224. }
  225. if (!opt.noext && isExtglobType(c) && str.charAt(i) === '(') {
  226. ast.push(acc);
  227. acc = '';
  228. const ext = new AST(c, ast);
  229. i = AST.#parseAST(str, ext, i, opt);
  230. ast.push(ext);
  231. continue;
  232. }
  233. acc += c;
  234. }
  235. ast.push(acc);
  236. return i;
  237. }
  238. // some kind of extglob, pos is at the (
  239. // find the next | or )
  240. let i = pos + 1;
  241. let part = new AST(null, ast);
  242. const parts = [];
  243. let acc = '';
  244. while (i < str.length) {
  245. const c = str.charAt(i++);
  246. // still accumulate escapes at this point, but we do ignore
  247. // starts that are escaped
  248. if (escaping || c === '\\') {
  249. escaping = !escaping;
  250. acc += c;
  251. continue;
  252. }
  253. if (inBrace) {
  254. if (i === braceStart + 1) {
  255. if (c === '^' || c === '!') {
  256. braceNeg = true;
  257. }
  258. }
  259. else if (c === ']' && !(i === braceStart + 2 && braceNeg)) {
  260. inBrace = false;
  261. }
  262. acc += c;
  263. continue;
  264. }
  265. else if (c === '[') {
  266. inBrace = true;
  267. braceStart = i;
  268. braceNeg = false;
  269. acc += c;
  270. continue;
  271. }
  272. if (isExtglobType(c) && str.charAt(i) === '(') {
  273. part.push(acc);
  274. acc = '';
  275. const ext = new AST(c, part);
  276. part.push(ext);
  277. i = AST.#parseAST(str, ext, i, opt);
  278. continue;
  279. }
  280. if (c === '|') {
  281. part.push(acc);
  282. acc = '';
  283. parts.push(part);
  284. part = new AST(null, ast);
  285. continue;
  286. }
  287. if (c === ')') {
  288. if (acc === '' && ast.#parts.length === 0) {
  289. ast.#emptyExt = true;
  290. }
  291. part.push(acc);
  292. acc = '';
  293. ast.push(...parts, part);
  294. return i;
  295. }
  296. acc += c;
  297. }
  298. // unfinished extglob
  299. // if we got here, it was a malformed extglob! not an extglob, but
  300. // maybe something else in there.
  301. ast.type = null;
  302. ast.#hasMagic = undefined;
  303. ast.#parts = [str.substring(pos - 1)];
  304. return i;
  305. }
  306. static fromGlob(pattern, options = {}) {
  307. const ast = new AST(null, undefined, options);
  308. AST.#parseAST(pattern, ast, 0, options);
  309. return ast;
  310. }
  311. // returns the regular expression if there's magic, or the unescaped
  312. // string if not.
  313. toMMPattern() {
  314. // should only be called on root
  315. /* c8 ignore start */
  316. if (this !== this.#root)
  317. return this.#root.toMMPattern();
  318. /* c8 ignore stop */
  319. const glob = this.toString();
  320. const [re, body, hasMagic, uflag] = this.toRegExpSource();
  321. // if we're in nocase mode, and not nocaseMagicOnly, then we do
  322. // still need a regular expression if we have to case-insensitively
  323. // match capital/lowercase characters.
  324. const anyMagic = hasMagic ||
  325. this.#hasMagic ||
  326. (this.#options.nocase &&
  327. !this.#options.nocaseMagicOnly &&
  328. glob.toUpperCase() !== glob.toLowerCase());
  329. if (!anyMagic) {
  330. return body;
  331. }
  332. const flags = (this.#options.nocase ? 'i' : '') + (uflag ? 'u' : '');
  333. return Object.assign(new RegExp(`^${re}$`, flags), {
  334. _src: re,
  335. _glob: glob,
  336. });
  337. }
  338. get options() {
  339. return this.#options;
  340. }
  341. // returns the string match, the regexp source, whether there's magic
  342. // in the regexp (so a regular expression is required) and whether or
  343. // not the uflag is needed for the regular expression (for posix classes)
  344. // TODO: instead of injecting the start/end at this point, just return
  345. // the BODY of the regexp, along with the start/end portions suitable
  346. // for binding the start/end in either a joined full-path makeRe context
  347. // (where we bind to (^|/), or a standalone matchPart context (where
  348. // we bind to ^, and not /). Otherwise slashes get duped!
  349. //
  350. // In part-matching mode, the start is:
  351. // - if not isStart: nothing
  352. // - if traversal possible, but not allowed: ^(?!\.\.?$)
  353. // - if dots allowed or not possible: ^
  354. // - if dots possible and not allowed: ^(?!\.)
  355. // end is:
  356. // - if not isEnd(): nothing
  357. // - else: $
  358. //
  359. // In full-path matching mode, we put the slash at the START of the
  360. // pattern, so start is:
  361. // - if first pattern: same as part-matching mode
  362. // - if not isStart(): nothing
  363. // - if traversal possible, but not allowed: /(?!\.\.?(?:$|/))
  364. // - if dots allowed or not possible: /
  365. // - if dots possible and not allowed: /(?!\.)
  366. // end is:
  367. // - if last pattern, same as part-matching mode
  368. // - else nothing
  369. //
  370. // Always put the (?:$|/) on negated tails, though, because that has to be
  371. // there to bind the end of the negated pattern portion, and it's easier to
  372. // just stick it in now rather than try to inject it later in the middle of
  373. // the pattern.
  374. //
  375. // We can just always return the same end, and leave it up to the caller
  376. // to know whether it's going to be used joined or in parts.
  377. // And, if the start is adjusted slightly, can do the same there:
  378. // - if not isStart: nothing
  379. // - if traversal possible, but not allowed: (?:/|^)(?!\.\.?$)
  380. // - if dots allowed or not possible: (?:/|^)
  381. // - if dots possible and not allowed: (?:/|^)(?!\.)
  382. //
  383. // But it's better to have a simpler binding without a conditional, for
  384. // performance, so probably better to return both start options.
  385. //
  386. // Then the caller just ignores the end if it's not the first pattern,
  387. // and the start always gets applied.
  388. //
  389. // But that's always going to be $ if it's the ending pattern, or nothing,
  390. // so the caller can just attach $ at the end of the pattern when building.
  391. //
  392. // So the todo is:
  393. // - better detect what kind of start is needed
  394. // - return both flavors of starting pattern
  395. // - attach $ at the end of the pattern when creating the actual RegExp
  396. //
  397. // Ah, but wait, no, that all only applies to the root when the first pattern
  398. // is not an extglob. If the first pattern IS an extglob, then we need all
  399. // that dot prevention biz to live in the extglob portions, because eg
  400. // +(*|.x*) can match .xy but not .yx.
  401. //
  402. // So, return the two flavors if it's #root and the first child is not an
  403. // AST, otherwise leave it to the child AST to handle it, and there,
  404. // use the (?:^|/) style of start binding.
  405. //
  406. // Even simplified further:
  407. // - Since the start for a join is eg /(?!\.) and the start for a part
  408. // is ^(?!\.), we can just prepend (?!\.) to the pattern (either root
  409. // or start or whatever) and prepend ^ or / at the Regexp construction.
  410. toRegExpSource(allowDot) {
  411. const dot = allowDot ?? !!this.#options.dot;
  412. if (this.#root === this)
  413. this.#fillNegs();
  414. if (!this.type) {
  415. const noEmpty = this.isStart() && this.isEnd();
  416. const src = this.#parts
  417. .map(p => {
  418. const [re, _, hasMagic, uflag] = typeof p === 'string'
  419. ? AST.#parseGlob(p, this.#hasMagic, noEmpty)
  420. : p.toRegExpSource(allowDot);
  421. this.#hasMagic = this.#hasMagic || hasMagic;
  422. this.#uflag = this.#uflag || uflag;
  423. return re;
  424. })
  425. .join('');
  426. let start = '';
  427. if (this.isStart()) {
  428. if (typeof this.#parts[0] === 'string') {
  429. // this is the string that will match the start of the pattern,
  430. // so we need to protect against dots and such.
  431. // '.' and '..' cannot match unless the pattern is that exactly,
  432. // even if it starts with . or dot:true is set.
  433. const dotTravAllowed = this.#parts.length === 1 && justDots.has(this.#parts[0]);
  434. if (!dotTravAllowed) {
  435. const aps = addPatternStart;
  436. // check if we have a possibility of matching . or ..,
  437. // and prevent that.
  438. const needNoTrav =
  439. // dots are allowed, and the pattern starts with [ or .
  440. (dot && aps.has(src.charAt(0))) ||
  441. // the pattern starts with \., and then [ or .
  442. (src.startsWith('\\.') && aps.has(src.charAt(2))) ||
  443. // the pattern starts with \.\., and then [ or .
  444. (src.startsWith('\\.\\.') && aps.has(src.charAt(4)));
  445. // no need to prevent dots if it can't match a dot, or if a
  446. // sub-pattern will be preventing it anyway.
  447. const needNoDot = !dot && !allowDot && aps.has(src.charAt(0));
  448. start = needNoTrav ? startNoTraversal : needNoDot ? startNoDot : '';
  449. }
  450. }
  451. }
  452. // append the "end of path portion" pattern to negation tails
  453. let end = '';
  454. if (this.isEnd() &&
  455. this.#root.#filledNegs &&
  456. this.#parent?.type === '!') {
  457. end = '(?:$|\\/)';
  458. }
  459. const final = start + src + end;
  460. return [
  461. final,
  462. unescape(src),
  463. (this.#hasMagic = !!this.#hasMagic),
  464. this.#uflag,
  465. ];
  466. }
  467. // We need to calculate the body *twice* if it's a repeat pattern
  468. // at the start, once in nodot mode, then again in dot mode, so a
  469. // pattern like *(?) can match 'x.y'
  470. const repeated = this.type === '*' || this.type === '+';
  471. // some kind of extglob
  472. const start = this.type === '!' ? '(?:(?!(?:' : '(?:';
  473. let body = this.#partsToRegExp(dot);
  474. if (this.isStart() && this.isEnd() && !body && this.type !== '!') {
  475. // invalid extglob, has to at least be *something* present, if it's
  476. // the entire path portion.
  477. const s = this.toString();
  478. this.#parts = [s];
  479. this.type = null;
  480. this.#hasMagic = undefined;
  481. return [s, unescape(this.toString()), false, false];
  482. }
  483. // XXX abstract out this map method
  484. let bodyDotAllowed = !repeated || allowDot || dot || !startNoDot
  485. ? ''
  486. : this.#partsToRegExp(true);
  487. if (bodyDotAllowed === body) {
  488. bodyDotAllowed = '';
  489. }
  490. if (bodyDotAllowed) {
  491. body = `(?:${body})(?:${bodyDotAllowed})*?`;
  492. }
  493. // an empty !() is exactly equivalent to a starNoEmpty
  494. let final = '';
  495. if (this.type === '!' && this.#emptyExt) {
  496. final = (this.isStart() && !dot ? startNoDot : '') + starNoEmpty;
  497. }
  498. else {
  499. const close = this.type === '!'
  500. ? // !() must match something,but !(x) can match ''
  501. '))' +
  502. (this.isStart() && !dot && !allowDot ? startNoDot : '') +
  503. star +
  504. ')'
  505. : this.type === '@'
  506. ? ')'
  507. : this.type === '?'
  508. ? ')?'
  509. : this.type === '+' && bodyDotAllowed
  510. ? ')'
  511. : this.type === '*' && bodyDotAllowed
  512. ? `)?`
  513. : `)${this.type}`;
  514. final = start + body + close;
  515. }
  516. return [
  517. final,
  518. unescape(body),
  519. (this.#hasMagic = !!this.#hasMagic),
  520. this.#uflag,
  521. ];
  522. }
  523. #partsToRegExp(dot) {
  524. return this.#parts
  525. .map(p => {
  526. // extglob ASTs should only contain parent ASTs
  527. /* c8 ignore start */
  528. if (typeof p === 'string') {
  529. throw new Error('string type in extglob ast??');
  530. }
  531. /* c8 ignore stop */
  532. // can ignore hasMagic, because extglobs are already always magic
  533. const [re, _, _hasMagic, uflag] = p.toRegExpSource(dot);
  534. this.#uflag = this.#uflag || uflag;
  535. return re;
  536. })
  537. .filter(p => !(this.isStart() && this.isEnd()) || !!p)
  538. .join('|');
  539. }
  540. static #parseGlob(glob, hasMagic, noEmpty = false) {
  541. let escaping = false;
  542. let re = '';
  543. let uflag = false;
  544. for (let i = 0; i < glob.length; i++) {
  545. const c = glob.charAt(i);
  546. if (escaping) {
  547. escaping = false;
  548. re += (reSpecials.has(c) ? '\\' : '') + c;
  549. continue;
  550. }
  551. if (c === '\\') {
  552. if (i === glob.length - 1) {
  553. re += '\\\\';
  554. }
  555. else {
  556. escaping = true;
  557. }
  558. continue;
  559. }
  560. if (c === '[') {
  561. const [src, needUflag, consumed, magic] = parseClass(glob, i);
  562. if (consumed) {
  563. re += src;
  564. uflag = uflag || needUflag;
  565. i += consumed - 1;
  566. hasMagic = hasMagic || magic;
  567. continue;
  568. }
  569. }
  570. if (c === '*') {
  571. if (noEmpty && glob === '*')
  572. re += starNoEmpty;
  573. else
  574. re += star;
  575. hasMagic = true;
  576. continue;
  577. }
  578. if (c === '?') {
  579. re += qmark;
  580. hasMagic = true;
  581. continue;
  582. }
  583. re += regExpEscape(c);
  584. }
  585. return [re, unescape(glob), !!hasMagic, uflag];
  586. }
  587. }
  588. //# sourceMappingURL=ast.js.map