rewrite-pattern.js 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003
  1. 'use strict';
  2. const generate = require('regjsgen').generate;
  3. const parse = require('regjsparser').parse;
  4. const regenerate = require('regenerate');
  5. const unicodeMatchProperty = require('unicode-match-property-ecmascript');
  6. const unicodeMatchPropertyValue = require('unicode-match-property-value-ecmascript');
  7. const iuMappings = require('./data/iu-mappings.js');
  8. const iBMPMappings = require('./data/i-bmp-mappings.js');
  9. const iuFoldings = require('./data/iu-foldings.js');
  10. const ESCAPE_SETS = require('./data/character-class-escape-sets.js');
  11. const { UNICODE_SET, UNICODE_IV_SET } = require('./data/all-characters.js');
  12. function flatMap(array, callback) {
  13. const result = [];
  14. array.forEach(item => {
  15. const res = callback(item);
  16. if (Array.isArray(res)) {
  17. result.push.apply(result, res);
  18. } else {
  19. result.push(res);
  20. }
  21. });
  22. return result;
  23. }
  24. function regenerateContainsAstral(regenerateData) {
  25. const data = regenerateData.data;
  26. return data.length >= 1 && data[data.length - 1] >= 0x10000;
  27. }
  28. // https://tc39.es/ecma262/#prod-SyntaxCharacter
  29. const SYNTAX_CHARS = /[\\^$.*+?()[\]{}|]/g;
  30. const ASTRAL_SET = regenerate().addRange(0x10000, 0x10FFFF);
  31. const NEWLINE_SET = regenerate().add(
  32. // `LineTerminator`s (https://mths.be/es6#sec-line-terminators):
  33. 0x000A, // Line Feed <LF>
  34. 0x000D, // Carriage Return <CR>
  35. 0x2028, // Line Separator <LS>
  36. 0x2029 // Paragraph Separator <PS>
  37. );
  38. // Prepare a Regenerate set containing all code points that are supposed to be
  39. // matched by `/./u`. https://mths.be/es6#sec-atom
  40. const DOT_SET_UNICODE = UNICODE_SET.clone() // all Unicode code points
  41. .remove(NEWLINE_SET);
  42. const getCharacterClassEscapeSet = (character, unicode, ignoreCase, shouldApplySCF) => {
  43. if (unicode) {
  44. if (ignoreCase) {
  45. const result = ESCAPE_SETS.UNICODE_IGNORE_CASE.get(character);
  46. if (shouldApplySCF) {
  47. return ESCAPE_SETS.UNICODESET_IGNORE_CASE.get(character);
  48. } else {
  49. return result;
  50. }
  51. }
  52. return ESCAPE_SETS.UNICODE.get(character);
  53. }
  54. return ESCAPE_SETS.REGULAR.get(character);
  55. };
  56. const getUnicodeDotSet = (dotAll) => {
  57. return dotAll ? UNICODE_SET : DOT_SET_UNICODE;
  58. };
  59. const getUnicodePropertyValueSet = (property, value) => {
  60. const path = value ?
  61. `${ property }/${ value }` :
  62. `Binary_Property/${ property }`;
  63. try {
  64. return require(`regenerate-unicode-properties/${ path }.js`);
  65. } catch (exception) {
  66. throw new Error(
  67. `Failed to recognize value \`${ value }\` for property ` +
  68. `\`${ property }\`.`
  69. );
  70. }
  71. };
  72. const handleLoneUnicodePropertyNameOrValue = (value) => {
  73. // It could be a `General_Category` value or a binary property.
  74. // Note: `unicodeMatchPropertyValue` throws on invalid values.
  75. try {
  76. const property = 'General_Category';
  77. const category = unicodeMatchPropertyValue(property, value);
  78. return getUnicodePropertyValueSet(property, category);
  79. } catch (exception) {}
  80. // It’s not a `General_Category` value, so check if it’s a property
  81. // of strings.
  82. try {
  83. return getUnicodePropertyValueSet('Property_of_Strings', value);
  84. } catch (exception) {}
  85. // Lastly, check if it’s a binary property of single code points.
  86. // Note: `unicodeMatchProperty` throws on invalid properties.
  87. const property = unicodeMatchProperty(value);
  88. return getUnicodePropertyValueSet(property);
  89. };
  90. const getUnicodePropertyEscapeSet = (value, isNegative, isUnicodeSetIgnoreCase) => {
  91. const parts = value.split('=');
  92. const firstPart = parts[0];
  93. let set;
  94. if (parts.length == 1) {
  95. set = handleLoneUnicodePropertyNameOrValue(firstPart);
  96. } else {
  97. // The pattern consists of two parts, i.e. `Property=Value`.
  98. const property = unicodeMatchProperty(firstPart);
  99. const value = unicodeMatchPropertyValue(property, parts[1]);
  100. set = getUnicodePropertyValueSet(property, value);
  101. }
  102. if (isNegative) {
  103. if (set.strings) {
  104. throw new Error('Cannot negate Unicode property of strings');
  105. }
  106. return {
  107. characters: (isUnicodeSetIgnoreCase ? UNICODE_IV_SET : UNICODE_SET).clone().remove(set.characters),
  108. strings: new Set()
  109. };
  110. }
  111. return {
  112. characters: set.characters.clone(),
  113. strings: set.strings
  114. // We need to escape strings like *️⃣ to make sure that they can be safely used in unions.
  115. ? new Set(set.strings.map(str => str.replace(SYNTAX_CHARS, '\\$&')))
  116. : new Set()
  117. };
  118. };
  119. const getUnicodePropertyEscapeCharacterClassData = (property, isNegative, isUnicodeSetIgnoreCase, shouldApplySCF) => {
  120. const set = getUnicodePropertyEscapeSet(property, isNegative, isUnicodeSetIgnoreCase);
  121. const data = getCharacterClassEmptyData();
  122. const singleChars = shouldApplySCF ? regenerate(set.characters.toArray().map(ch => simpleCaseFolding(ch))) : set.characters;
  123. const caseEqFlags = configGetCaseEqFlags();
  124. if (caseEqFlags) {
  125. for (const codepoint of singleChars.toArray()) {
  126. const list = getCaseEquivalents(codepoint, caseEqFlags);
  127. if (list) {
  128. singleChars.add(list);
  129. }
  130. }
  131. }
  132. data.singleChars = singleChars;
  133. if (set.strings.size > 0) {
  134. data.longStrings = set.strings;
  135. data.maybeIncludesStrings = true;
  136. }
  137. return data;
  138. };
  139. const CASE_EQ_FLAG_NONE = 0b00;
  140. const CASE_EQ_FLAG_BMP = 0b01;
  141. const CASE_EQ_FLAG_UNICODE = 0b10;
  142. function configGetCaseEqFlags() {
  143. let flags = CASE_EQ_FLAG_NONE;
  144. if (config.modifiersData.i === true) {
  145. if (config.transform.modifiers) {
  146. flags |= CASE_EQ_FLAG_BMP;
  147. if (config.flags.unicode || config.flags.unicodeSets) {
  148. flags |= CASE_EQ_FLAG_UNICODE;
  149. }
  150. }
  151. } else if (config.modifiersData.i === undefined) {
  152. if (config.transform.unicodeFlag && config.flags.ignoreCase) {
  153. flags |= CASE_EQ_FLAG_UNICODE;
  154. }
  155. }
  156. return flags;
  157. }
  158. // Given a range of code points, add any case-equivalent code points in that range
  159. // to a set.
  160. regenerate.prototype.iuAddRange = function(min, max, caseEqFlags) {
  161. const $this = this;
  162. do {
  163. const list = getCaseEquivalents(min, caseEqFlags);
  164. if (list) {
  165. $this.add(list);
  166. }
  167. } while (++min <= max);
  168. return $this;
  169. };
  170. regenerate.prototype.iuRemoveRange = function(min, max, caseEqFlags) {
  171. const $this = this;
  172. do {
  173. const list = getCaseEquivalents(min, caseEqFlags);
  174. if (list) {
  175. $this.remove(list);
  176. }
  177. } while (++min <= max);
  178. return $this;
  179. };
  180. const update = (item, pattern) => {
  181. let tree = parse(pattern, config.useUnicodeFlag ? 'u' : '', {
  182. lookbehind: true,
  183. namedGroups: true,
  184. unicodePropertyEscape: true,
  185. unicodeSet: true,
  186. modifiers: true,
  187. });
  188. switch (tree.type) {
  189. case 'characterClass':
  190. case 'group':
  191. case 'value':
  192. // No wrapping needed.
  193. break;
  194. default:
  195. // Wrap the pattern in a non-capturing group.
  196. tree = wrap(tree, pattern);
  197. }
  198. Object.assign(item, tree);
  199. };
  200. const wrap = (tree, pattern) => {
  201. // Wrap the pattern in a non-capturing group.
  202. return {
  203. 'type': 'group',
  204. 'behavior': 'ignore',
  205. 'body': [tree],
  206. 'raw': `(?:${ pattern })`
  207. };
  208. };
  209. /**
  210. * Given any codepoint ch, returns false or an array of characters,
  211. * such that for every c in the array,
  212. * c != ch and Canonicalize(~, c) == Canonicalize(~, ch)
  213. *
  214. * where Canonicalize is defined in
  215. * https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch
  216. * @param {number} codePoint input code point
  217. * @param {number} flags bitwise flags composed of CASE_EQ_FLAG_*
  218. * @returns false | number[]
  219. */
  220. const getCaseEquivalents = (codePoint, flags) => {
  221. if (flags === CASE_EQ_FLAG_NONE) {
  222. return false;
  223. }
  224. let result = ((flags & CASE_EQ_FLAG_UNICODE) ? iuMappings.get(codePoint) : undefined) || [];
  225. if (typeof result === "number") result = [result];
  226. if (flags & CASE_EQ_FLAG_BMP) {
  227. for (const cp of [codePoint].concat(result)) {
  228. // Fast path for ASCII characters
  229. if (cp >= 0x41 && cp <= 0x5a) {
  230. result.push(cp + 0x20);
  231. } else if (cp >= 0x61 && cp <= 0x7a) {
  232. result.push(cp - 0x20);
  233. } else {
  234. result = result.concat(iBMPMappings.get(cp) || []);
  235. }
  236. }
  237. }
  238. return result.length == 0 ? false : result;
  239. };
  240. // https://tc39.es/ecma262/#sec-maybesimplecasefolding
  241. const simpleCaseFolding = (codePoint) => {
  242. // Fast path for ASCII characters
  243. if (codePoint <= 0x7F) {
  244. if (codePoint >= 0x41 && codePoint <= 0x5A) {
  245. return codePoint + 0x20;
  246. }
  247. return codePoint;
  248. }
  249. return iuFoldings.get(codePoint) || codePoint;
  250. }
  251. const buildHandler = (action) => {
  252. switch (action) {
  253. case 'union':
  254. return {
  255. single: (data, cp) => {
  256. data.singleChars.add(cp);
  257. },
  258. regSet: (data, set2) => {
  259. data.singleChars.add(set2);
  260. },
  261. range: (data, start, end) => {
  262. data.singleChars.addRange(start, end);
  263. },
  264. iuRange: (data, start, end, caseEqFlags) => {
  265. data.singleChars.iuAddRange(start, end, caseEqFlags);
  266. },
  267. nested: (data, nestedData) => {
  268. data.singleChars.add(nestedData.singleChars);
  269. for (const str of nestedData.longStrings) data.longStrings.add(str);
  270. if (nestedData.maybeIncludesStrings) data.maybeIncludesStrings = true;
  271. }
  272. };
  273. case 'union-negative': {
  274. const regSet = (data, set2) => {
  275. data.singleChars = UNICODE_SET.clone().remove(set2).add(data.singleChars);
  276. };
  277. return {
  278. single: (data, cp) => {
  279. const unicode = UNICODE_SET.clone();
  280. data.singleChars = data.singleChars.contains(cp) ? unicode : unicode.remove(cp);
  281. },
  282. regSet: regSet,
  283. range: (data, start, end) => {
  284. data.singleChars = UNICODE_SET.clone().removeRange(start, end).add(data.singleChars);
  285. },
  286. iuRange: (data, start, end, caseEqFlags) => {
  287. data.singleChars = UNICODE_SET.clone().iuRemoveRange(start, end, caseEqFlags).add(data.singleChars);
  288. },
  289. nested: (data, nestedData) => {
  290. regSet(data, nestedData.singleChars);
  291. if (nestedData.maybeIncludesStrings) throw new Error('ASSERTION ERROR');
  292. }
  293. };
  294. }
  295. case 'intersection': {
  296. const regSet = (data, set2) => {
  297. if (data.first) data.singleChars = set2;
  298. else data.singleChars.intersection(set2);
  299. };
  300. return {
  301. single: (data, cp) => {
  302. data.singleChars = data.first || data.singleChars.contains(cp) ? regenerate(cp) : regenerate();
  303. data.longStrings.clear();
  304. data.maybeIncludesStrings = false;
  305. },
  306. regSet: (data, set) => {
  307. regSet(data, set);
  308. data.longStrings.clear();
  309. data.maybeIncludesStrings = false;
  310. },
  311. range: (data, start, end) => {
  312. if (data.first) data.singleChars.addRange(start, end);
  313. else data.singleChars.intersection(regenerate().addRange(start, end));
  314. data.longStrings.clear();
  315. data.maybeIncludesStrings = false;
  316. },
  317. iuRange: (data, start, end, caseEqFlags) => {
  318. if (data.first) data.singleChars.iuAddRange(start, end, caseEqFlags);
  319. else data.singleChars.intersection(regenerate().iuAddRange(start, end, caseEqFlags));
  320. data.longStrings.clear();
  321. data.maybeIncludesStrings = false;
  322. },
  323. nested: (data, nestedData) => {
  324. regSet(data, nestedData.singleChars);
  325. if (data.first) {
  326. data.longStrings = nestedData.longStrings;
  327. data.maybeIncludesStrings = nestedData.maybeIncludesStrings;
  328. } else {
  329. for (const str of data.longStrings) {
  330. if (!nestedData.longStrings.has(str)) data.longStrings.delete(str);
  331. }
  332. if (!nestedData.maybeIncludesStrings) data.maybeIncludesStrings = false;
  333. }
  334. }
  335. };
  336. }
  337. case 'subtraction': {
  338. const regSet = (data, set2) => {
  339. if (data.first) data.singleChars.add(set2);
  340. else data.singleChars.remove(set2);
  341. };
  342. return {
  343. single: (data, cp) => {
  344. if (data.first) data.singleChars.add(cp);
  345. else data.singleChars.remove(cp);
  346. },
  347. regSet: regSet,
  348. range: (data, start, end) => {
  349. if (data.first) data.singleChars.addRange(start, end);
  350. else data.singleChars.removeRange(start, end);
  351. },
  352. iuRange: (data, start, end, caseEqFlags) => {
  353. if (data.first) data.singleChars.iuAddRange(start, end, caseEqFlags);
  354. else data.singleChars.iuRemoveRange(start, end, caseEqFlags);
  355. },
  356. nested: (data, nestedData) => {
  357. regSet(data, nestedData.singleChars);
  358. if (data.first) {
  359. data.longStrings = nestedData.longStrings;
  360. data.maybeIncludesStrings = nestedData.maybeIncludesStrings;
  361. } else {
  362. for (const str of data.longStrings) {
  363. if (nestedData.longStrings.has(str)) data.longStrings.delete(str);
  364. }
  365. }
  366. }
  367. };
  368. }
  369. // The `default` clause is only here as a safeguard; it should never be
  370. // reached. Code coverage tools should ignore it.
  371. /* node:coverage ignore next */
  372. default:
  373. throw new Error(`Unknown set action: ${ characterClassItem.kind }`);
  374. }
  375. };
  376. const getCharacterClassEmptyData = () => ({
  377. transformed: config.transform.unicodeFlag,
  378. singleChars: regenerate(),
  379. longStrings: new Set(),
  380. hasEmptyString: false,
  381. first: true,
  382. maybeIncludesStrings: false
  383. });
  384. const concatCaseEquivalents = (codePoint, caseEqFlags) => {
  385. const caseEquivalents = getCaseEquivalents(codePoint, caseEqFlags);
  386. if (caseEquivalents) {
  387. return [codePoint, ...caseEquivalents];
  388. }
  389. return [codePoint];
  390. };
  391. const computeClassStrings = (classStrings, regenerateOptions, caseEqFlags, shouldApplySCF) => {
  392. let data = getCharacterClassEmptyData();
  393. for (const string of classStrings.strings) {
  394. if (string.characters.length === 1) {
  395. const codePoint = shouldApplySCF ? simpleCaseFolding(string.characters[0].codePoint) : string.characters[0].codePoint
  396. concatCaseEquivalents(codePoint, caseEqFlags).forEach((cp) => {
  397. data.singleChars.add(cp);
  398. });
  399. } else {
  400. let stringifiedString = '';
  401. if (caseEqFlags) {
  402. for (const ch of string.characters) {
  403. const codePoint = shouldApplySCF ? simpleCaseFolding(ch.codePoint) : ch.codePoint;
  404. const set = regenerate(concatCaseEquivalents(codePoint, caseEqFlags));
  405. stringifiedString += set.toString(regenerateOptions);
  406. }
  407. } else {
  408. for (const ch of string.characters) {
  409. const codePoint = shouldApplySCF ? simpleCaseFolding(ch.codePoint) : ch.codePoint;
  410. if (codePoint !== ch.codePoint) {
  411. stringifiedString += regenerate(codePoint).toString(regenerateOptions);
  412. } else {
  413. stringifiedString += generate(ch);
  414. }
  415. }
  416. }
  417. data.longStrings.add(stringifiedString);
  418. data.maybeIncludesStrings = true;
  419. }
  420. }
  421. return data;
  422. }
  423. const computeCharacterClass = (characterClassItem, regenerateOptions, shouldApplySCF) => {
  424. let data = getCharacterClassEmptyData();
  425. let handlePositive;
  426. let handleNegative;
  427. let caseEqFlags = configGetCaseEqFlags();
  428. switch (characterClassItem.kind) {
  429. case 'union':
  430. handlePositive = buildHandler('union');
  431. handleNegative = buildHandler('union-negative');
  432. break;
  433. case 'intersection':
  434. handlePositive = buildHandler('intersection');
  435. handleNegative = buildHandler('subtraction');
  436. if (config.transform.unicodeSetsFlag) data.transformed = true;
  437. if (config.isIgnoreCaseMode) {
  438. shouldApplySCF = true;
  439. }
  440. break;
  441. case 'subtraction':
  442. handlePositive = buildHandler('subtraction');
  443. handleNegative = buildHandler('intersection');
  444. if (config.transform.unicodeSetsFlag) data.transformed = true;
  445. if (config.isIgnoreCaseMode) {
  446. shouldApplySCF = true;
  447. }
  448. break;
  449. // The `default` clause is only here as a safeguard; it should never be
  450. // reached. Code coverage tools should ignore it.
  451. /* node:coverage ignore next */
  452. default:
  453. throw new Error(`Unknown character class kind: ${ characterClassItem.kind }`);
  454. }
  455. for (const item of characterClassItem.body) {
  456. switch (item.type) {
  457. case 'value':
  458. const codePoint = shouldApplySCF ? simpleCaseFolding(item.codePoint) : item.codePoint;
  459. const list = concatCaseEquivalents(codePoint, caseEqFlags);
  460. handlePositive.regSet(data, regenerate(list));
  461. if (list.length > 1) {
  462. data.transformed = true;
  463. }
  464. break;
  465. case 'characterClassRange':
  466. const min = item.min.codePoint;
  467. const max = item.max.codePoint;
  468. if (shouldApplySCF) {
  469. let list = [];
  470. for (let cp = min; cp <= max; cp++) {
  471. list.push(simpleCaseFolding(cp));
  472. }
  473. handlePositive.regSet(data, regenerate(list));
  474. } else {
  475. handlePositive.range(data, min, max);
  476. }
  477. if (caseEqFlags) {
  478. // If shouldApplySCF is true, it is still ok to call iuRange because
  479. // the set [min, max] shares the same case equivalents with scf([min, max])
  480. handlePositive.iuRange(data, min, max, caseEqFlags);
  481. data.transformed = true;
  482. }
  483. break;
  484. case 'characterClassEscape':
  485. handlePositive.regSet(data, getCharacterClassEscapeSet(
  486. item.value,
  487. config.flags.unicode || config.flags.unicodeSets,
  488. config.flags.ignoreCase,
  489. shouldApplySCF
  490. ));
  491. break;
  492. case 'unicodePropertyEscape':
  493. const nestedData = getUnicodePropertyEscapeCharacterClassData(
  494. item.value,
  495. item.negative,
  496. config.flags.unicodeSets && config.isIgnoreCaseMode,
  497. shouldApplySCF
  498. );
  499. handlePositive.nested(data, nestedData);
  500. data.transformed =
  501. data.transformed ||
  502. config.transform.unicodePropertyEscapes ||
  503. (config.transform.unicodeSetsFlag && (nestedData.maybeIncludesStrings || characterClassItem.kind !== "union" || item.negative));
  504. break;
  505. case 'characterClass':
  506. const handler = item.negative ? handleNegative : handlePositive;
  507. const res = computeCharacterClass(item, regenerateOptions, shouldApplySCF);
  508. handler.nested(data, res);
  509. data.transformed = true;
  510. break;
  511. case 'classStrings':
  512. handlePositive.nested(data, computeClassStrings(item, regenerateOptions, caseEqFlags, shouldApplySCF));
  513. data.transformed = true;
  514. break;
  515. // The `default` clause is only here as a safeguard; it should never be
  516. // reached. Code coverage tools should ignore it.
  517. /* node:coverage ignore next */
  518. default:
  519. throw new Error(`Unknown term type: ${ item.type }`);
  520. }
  521. data.first = false;
  522. }
  523. if (characterClassItem.negative && data.maybeIncludesStrings) {
  524. throw new SyntaxError('Cannot negate set containing strings');
  525. }
  526. return data;
  527. }
  528. const processCharacterClass = (
  529. characterClassItem,
  530. regenerateOptions,
  531. computed = computeCharacterClass(characterClassItem, regenerateOptions)
  532. ) => {
  533. const negative = characterClassItem.negative;
  534. const { singleChars, transformed, longStrings } = computed;
  535. if (transformed) {
  536. // If single chars already contains some astral character, regenerate (bmpOnly: true) will create valid regex strings
  537. const bmpOnly = regenerateContainsAstral(singleChars);
  538. const setStr = singleChars.toString(Object.assign({}, regenerateOptions, { bmpOnly: bmpOnly }));
  539. if (negative) {
  540. if (config.useUnicodeFlag) {
  541. update(characterClassItem, `[^${setStr[0] === '[' ? setStr.slice(1, -1) : setStr}]`)
  542. } else {
  543. if (config.flags.unicode || config.flags.unicodeSets) {
  544. if (config.flags.ignoreCase) {
  545. const astralCharsSet = singleChars.clone().intersection(ASTRAL_SET);
  546. // Assumption: singleChars do not contain lone surrogates.
  547. // Regex like /[^\ud800]/u is not supported
  548. const surrogateOrBMPSetStr = singleChars
  549. .clone()
  550. .remove(astralCharsSet)
  551. .addRange(0xd800, 0xdfff)
  552. .toString({ bmpOnly: true });
  553. // Don't generate negative lookahead for astral characters
  554. // because the case folding is not working anyway as we break
  555. // code points into surrogate pairs.
  556. const astralNegativeSetStr = ASTRAL_SET
  557. .clone()
  558. .remove(astralCharsSet)
  559. .toString(regenerateOptions);
  560. // The transform here does not support lone surrogates.
  561. update(
  562. characterClassItem,
  563. `(?!${surrogateOrBMPSetStr})[^]|${astralNegativeSetStr}`
  564. );
  565. } else {
  566. // Generate negative set directly when case folding is not involved.
  567. const negativeSet = UNICODE_SET.clone().remove(singleChars);
  568. update(characterClassItem, negativeSet.toString(regenerateOptions));
  569. }
  570. } else {
  571. update(characterClassItem, `(?!${setStr})[^]`);
  572. }
  573. }
  574. } else {
  575. const hasEmptyString = longStrings.has('');
  576. const pieces = Array.from(longStrings).sort((a, b) => b.length - a.length);
  577. if (setStr !== '[]' || longStrings.size === 0) {
  578. pieces.splice(pieces.length - (hasEmptyString ? 1 : 0), 0, setStr);
  579. }
  580. update(characterClassItem, pieces.join('|'));
  581. }
  582. }
  583. return characterClassItem;
  584. };
  585. const assertNoUnmatchedReferences = (groups) => {
  586. const unmatchedReferencesNames = Object.keys(groups.unmatchedReferences);
  587. if (unmatchedReferencesNames.length > 0) {
  588. throw new Error(`Unknown group names: ${unmatchedReferencesNames}`);
  589. }
  590. };
  591. const processModifiers = (item, regenerateOptions, groups) => {
  592. const enabling = item.modifierFlags.enabling;
  593. const disabling = item.modifierFlags.disabling;
  594. const oldData = Object.assign({}, config.modifiersData);
  595. for (const flag of enabling) {
  596. config.modifiersData[flag] = true;
  597. }
  598. for (const flag of disabling) {
  599. config.modifiersData[flag] = false;
  600. }
  601. if (config.transform.modifiers) {
  602. delete item.modifierFlags;
  603. item.behavior = 'ignore';
  604. }
  605. item.body = item.body.map(term => {
  606. return processTerm(term, regenerateOptions, groups);
  607. });
  608. config.modifiersData = oldData;
  609. return item;
  610. }
  611. const processTerm = (item, regenerateOptions, groups) => {
  612. switch (item.type) {
  613. case 'dot':
  614. if (config.transform.unicodeFlag) {
  615. update(
  616. item,
  617. getUnicodeDotSet(config.isDotAllMode).toString(regenerateOptions)
  618. );
  619. } else if ((config.modifiersData.s != null ? config.modifiersData.s && config.transform.modifiers : config.transform.dotAllFlag)) {
  620. // TODO: consider changing this at the regenerate level.
  621. update(item, '[^]');
  622. }
  623. break;
  624. case 'characterClass':
  625. item = processCharacterClass(item, regenerateOptions);
  626. break;
  627. case 'unicodePropertyEscape':
  628. const data = getUnicodePropertyEscapeCharacterClassData(item.value, item.negative, config.flags.unicodeSets && config.isIgnoreCaseMode);
  629. if (data.maybeIncludesStrings) {
  630. if (!config.flags.unicodeSets) {
  631. throw new Error(
  632. 'Properties of strings are only supported when using the unicodeSets (v) flag.'
  633. );
  634. }
  635. if (config.transform.unicodeSetsFlag) {
  636. data.transformed = true;
  637. item = processCharacterClass(item, regenerateOptions, data);
  638. }
  639. } else if (config.transform.unicodePropertyEscapes || configGetCaseEqFlags()) {
  640. update(
  641. item,
  642. data.singleChars.toString(regenerateOptions)
  643. );
  644. }
  645. break;
  646. case 'characterClassEscape':
  647. if (config.transform.unicodeFlag) {
  648. update(
  649. item,
  650. getCharacterClassEscapeSet(
  651. item.value,
  652. /* config.transform.unicodeFlag implies config.flags.unicode */ true,
  653. config.flags.ignoreCase
  654. ).toString(regenerateOptions)
  655. );
  656. }
  657. break;
  658. case 'group':
  659. if (item.behavior == 'normal') {
  660. groups.lastIndex++;
  661. }
  662. if (item.name) {
  663. const name = item.name.value;
  664. if (groups.namesConflicts[name]) {
  665. throw new Error(
  666. `Group '${ name }' has already been defined in this context.`
  667. );
  668. }
  669. groups.namesConflicts[name] = true;
  670. if (config.transform.namedGroups) {
  671. delete item.name;
  672. }
  673. const index = groups.lastIndex;
  674. if (!groups.names[name]) {
  675. groups.names[name] = [];
  676. }
  677. groups.names[name].push(index);
  678. if (groups.onNamedGroup) {
  679. groups.onNamedGroup.call(null, name, index);
  680. }
  681. if (groups.unmatchedReferences[name]) {
  682. delete groups.unmatchedReferences[name];
  683. }
  684. }
  685. if (item.modifierFlags) {
  686. return processModifiers(item, regenerateOptions, groups);
  687. }
  688. /* falls through */
  689. case 'quantifier':
  690. item.body = item.body.map(term => {
  691. return processTerm(term, regenerateOptions, groups);
  692. });
  693. break;
  694. case 'disjunction':
  695. const outerNamesConflicts = groups.namesConflicts;
  696. item.body = item.body.map(term => {
  697. groups.namesConflicts = Object.create(outerNamesConflicts);
  698. return processTerm(term, regenerateOptions, groups);
  699. });
  700. break;
  701. case 'alternative':
  702. item.body = flatMap(item.body, term => {
  703. const res = processTerm(term, regenerateOptions, groups);
  704. // Alternatives cannot contain alternatives; flatten them.
  705. return res.type === 'alternative' ? res.body : res;
  706. });
  707. break;
  708. case 'value':
  709. const codePoint = item.codePoint;
  710. const caseEqFlags = configGetCaseEqFlags();
  711. const list = concatCaseEquivalents(codePoint, caseEqFlags);
  712. if (list.length === 1 && item.kind === "symbol" && codePoint >= 0x20 && codePoint <= 0x7E) {
  713. // skip regenerate when it is a printable ASCII symbol
  714. break;
  715. }
  716. const set = regenerate(list);
  717. update(item, set.toString(regenerateOptions));
  718. break;
  719. case 'reference':
  720. if (item.name) {
  721. const name = item.name.value;
  722. const indexes = groups.names[name];
  723. if (!indexes) {
  724. groups.unmatchedReferences[name] = true;
  725. }
  726. if (config.transform.namedGroups) {
  727. if (indexes) {
  728. const body = indexes.map(index => ({
  729. 'type': 'reference',
  730. 'matchIndex': index,
  731. 'raw': '\\' + index,
  732. }));
  733. if (body.length === 1) {
  734. return body[0];
  735. }
  736. return {
  737. 'type': 'alternative',
  738. 'body': body,
  739. 'raw': body.map(term => term.raw).join(''),
  740. };
  741. }
  742. // This named reference comes before the group where it’s defined,
  743. // so it’s always an empty match.
  744. return {
  745. 'type': 'group',
  746. 'behavior': 'ignore',
  747. 'body': [],
  748. 'raw': '(?:)',
  749. };
  750. }
  751. }
  752. break;
  753. case 'anchor':
  754. if (config.modifiersData.m && config.transform.modifiers) {
  755. if (item.kind == 'start') {
  756. update(item, `(?:^|(?<=${NEWLINE_SET.toString()}))`);
  757. } else if (item.kind == 'end') {
  758. update(item, `(?:$|(?=${NEWLINE_SET.toString()}))`);
  759. }
  760. }
  761. case 'empty':
  762. // Nothing to do here.
  763. break;
  764. // The `default` clause is only here as a safeguard; it should never be
  765. // reached. Code coverage tools should ignore it.
  766. /* node:coverage ignore next */
  767. default:
  768. throw new Error(`Unknown term type: ${ item.type }`);
  769. }
  770. return item;
  771. };
  /**
   * Module-level state shared by the rewriting helpers.
   *
   * `rewritePattern` resets every field at the start of a run; `processTerm`
   * reads the `transform` switches and the modifier overrides while walking
   * the tree.
   */
  const config = {
    // Flags found in the flags string handed to `rewritePattern`.
    flags: {
      ignoreCase: false,
      unicode: false,
      unicodeSets: false,
      dotAll: false,
      multiline: false,
    },
    // Which features have to be rewritten for the current pattern.
    transform: {
      dotAllFlag: false,
      unicodeFlag: false,
      unicodeSetsFlag: false,
      unicodePropertyEscapes: false,
      namedGroups: false,
      modifiers: false,
    },
    // Per-modifier overrides; `undefined` means "no override, use the flag".
    modifiersData: {
      i: undefined,
      s: undefined,
      m: undefined,
    },
    // Truthy when the pattern is in unicode (or unicode sets) mode and the
    // unicode flag itself is not being transformed away.
    get useUnicodeFlag() {
      const { unicode, unicodeSets } = this.flags;
      const inUnicodeMode = unicode || unicodeSets;
      return inUnicodeMode && !this.transform.unicodeFlag;
    },
    // Effective dotAll state: the `s` override when set, else the flag.
    get isDotAllMode() {
      const override = this.modifiersData.s;
      if (override === undefined) {
        return this.flags.dotAll;
      }
      return override;
    },
    // Effective ignoreCase state: the `i` override when set, else the flag.
    get isIgnoreCaseMode() {
      const override = this.modifiersData.i;
      if (override === undefined) {
        return this.flags.ignoreCase;
      }
      return override;
    },
  };
  /**
   * Checks the user-supplied options object and throws on the first key
   * that is unknown or carries an unsupported value.
   *
   * @param {Object} [options] - Options to check; a falsy value is accepted as-is.
   * @throws {Error} When a key is not a known option or its value is invalid.
   */
  const validateOptions = (options) => {
    if (!options) {
      return;
    }

    // Options that only accept `false` (or nullish) and 'transform'.
    const transformOnlyKeys = new Set([
      'dotAllFlag',
      'unicodeFlag',
      'unicodePropertyEscapes',
      'unicodeSetsFlag',
      'namedGroups',
    ]);
    // Options that must be callbacks when present.
    const callbackKeys = new Set(['onNamedGroup', 'onNewFlags']);
    // `null`, `undefined` and `false` all mean "feature left alone".
    const isDisabled = (value) => value == null || value === false;

    Object.keys(options).forEach((key) => {
      const value = options[key];

      if (transformOnlyKeys.has(key)) {
        if (!isDisabled(value) && value !== 'transform') {
          throw new Error(`.${key} must be false (default) or 'transform'.`);
        }
      } else if (key === 'modifiers') {
        // todo: remove modifiers: 'parse' in regexpu-core v7
        const isAccepted = isDisabled(value) || value === 'parse' || value === 'transform';
        if (!isAccepted) {
          throw new Error(`.${key} must be false (default), 'parse' or 'transform'.`);
        }
      } else if (callbackKeys.has(key)) {
        if (value != null && typeof value !== 'function') {
          throw new Error(`.${key} must be a function.`);
        }
      } else {
        throw new Error(`.${key} is not a valid regexpu-core option.`);
      }
    });
  };
  // Reports whether `flag` occurs in `flags`; a missing or empty `flags`
  // value never contains any flag.
  const hasFlag = (flags, flag) => {
    if (!flags) {
      return false;
    }
    return flags.includes(flag);
  };
  // Reports whether the option called `name` is set to 'transform'; without
  // an options object nothing is transformed.
  const transform = (options, name) => {
    if (!options) {
      return false;
    }
    return options[name] === 'transform';
  };
  /**
   * Rewrites a regular expression pattern according to `options`.
   *
   * The call first validates `options`, then resets the shared `config`
   * state from `flags` and `options`, parses the pattern, lets `processTerm`
   * rewrite the tree in place and finally serializes the tree again.
   *
   * @param {string} pattern - Source of the regular expression.
   * @param {string} [flags] - Flags of the regular expression; may be omitted.
   * @param {Object} [options] - Options as accepted by `validateOptions`.
   *   `onNewFlags`, when given, is called once with the flags that remain
   *   after the transformed ones have been removed.
   * @returns {string} The rewritten pattern, as produced by `generate`.
   * @throws {Error} When `options` holds an unknown key or an invalid value.
   */
  const rewritePattern = (pattern, flags, options) => {
    validateOptions(options);

    config.flags.unicode = hasFlag(flags, 'u');
    config.flags.unicodeSets = hasFlag(flags, 'v');
    config.flags.ignoreCase = hasFlag(flags, 'i');
    config.flags.dotAll = hasFlag(flags, 's');
    config.flags.multiline = hasFlag(flags, 'm');

    config.transform.dotAllFlag = config.flags.dotAll && transform(options, 'dotAllFlag');
    config.transform.unicodeFlag = (config.flags.unicode || config.flags.unicodeSets) && transform(options, 'unicodeFlag');
    config.transform.unicodeSetsFlag = config.flags.unicodeSets && transform(options, 'unicodeSetsFlag');
    // unicodeFlag: 'transform' implies unicodePropertyEscapes: 'transform'
    config.transform.unicodePropertyEscapes = (config.flags.unicode || config.flags.unicodeSets) && (
      transform(options, 'unicodeFlag') || transform(options, 'unicodePropertyEscapes')
    );
    config.transform.namedGroups = transform(options, 'namedGroups');
    config.transform.modifiers = transform(options, 'modifiers');

    // `config` is shared between calls, so overrides from a previous pattern
    // must not leak into this one.
    config.modifiersData.i = undefined;
    config.modifiersData.s = undefined;
    config.modifiersData.m = undefined;

    const regjsparserFeatures = {
      // Enable every stable RegExp feature by default
      'modifiers': true,
      'unicodePropertyEscape': true,
      'unicodeSet': true,
      'namedGroups': true,
      'lookbehind': true,
    };

    const regenerateOptions = {
      'hasUnicodeFlag': config.useUnicodeFlag,
      'bmpOnly': !config.flags.unicode && !config.flags.unicodeSets
    };

    const groups = {
      'onNamedGroup': options && options.onNamedGroup,
      'lastIndex': 0,
      'names': Object.create(null), // { [name]: Array<index> }
      'namesConflicts': Object.create(null), // { [name]: true }
      'unmatchedReferences': Object.create(null) // { [name]: true }
    };

    const tree = parse(pattern, flags, regjsparserFeatures);

    if (config.transform.modifiers && /\(\?[a-z]*-[a-z]+:/.test(pattern)) {
      // The pattern _likely_ contains inline disabled modifiers. Traverse the
      // tree to make sure they really are modifiers and to collect them.
      const allDisabledModifiers = Object.create(null);
      const itemStack = [tree];
      // Loop on the stack length rather than on the popped value so that a
      // nullish array element cannot end the traversal early.
      while (itemStack.length > 0) {
        const node = itemStack.pop();
        if (Array.isArray(node)) {
          itemStack.push(...node);
        } else if (typeof node === 'object' && node !== null) {
          for (const key of Object.keys(node)) {
            const value = node[key];
            if (key === 'modifierFlags') {
              for (const flag of value.disabling) {
                allDisabledModifiers[flag] = true;
              }
            } else if (typeof value === 'object' && value !== null) {
              itemStack.push(value);
            }
          }
        }
      }
      // A modifier that is disabled somewhere in the pattern starts out with
      // the value of the corresponding flag.
      if (allDisabledModifiers.i) {
        config.modifiersData.i = config.flags.ignoreCase;
      }
      if (allDisabledModifiers.m) {
        config.modifiersData.m = config.flags.multiline;
      }
      if (allDisabledModifiers.s) {
        config.modifiersData.s = config.flags.dotAll;
      }
    }

    // Note: `processTerm` mutates `tree` and `groups`.
    processTerm(tree, regenerateOptions, groups);
    assertNoUnmatchedReferences(groups);

    const onNewFlags = options && options.onNewFlags;
    if (onNewFlags) {
      // `flags` is optional everywhere else in this function (see `hasFlag`),
      // so a missing value is treated as "no flags" instead of throwing.
      let newFlags = (flags || '')
        .split('')
        .filter((flag) => !config.modifiersData[flag])
        .join('');
      if (config.transform.unicodeSetsFlag) {
        newFlags = newFlags.replace('v', 'u');
      }
      if (config.transform.unicodeFlag) {
        newFlags = newFlags.replace('u', '');
      }
      if (config.transform.dotAllFlag) {
        newFlags = newFlags.replace('s', '');
      }
      onNewFlags(newFlags);
    }

    return generate(tree);
  };
  928. module.exports = rewritePattern;