12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003 |
- 'use strict';
- const generate = require('regjsgen').generate;
- const parse = require('regjsparser').parse;
- const regenerate = require('regenerate');
- const unicodeMatchProperty = require('unicode-match-property-ecmascript');
- const unicodeMatchPropertyValue = require('unicode-match-property-value-ecmascript');
- const iuMappings = require('./data/iu-mappings.js');
- const iBMPMappings = require('./data/i-bmp-mappings.js');
- const iuFoldings = require('./data/iu-foldings.js');
- const ESCAPE_SETS = require('./data/character-class-escape-sets.js');
- const { UNICODE_SET, UNICODE_IV_SET } = require('./data/all-characters.js');
/**
 * Maps every element of `array` through `callback` and flattens the results
 * by one level: array results are spliced in, anything else is appended as-is.
 *
 * @param {Array} array Items to map.
 * @param {Function} callback Invoked with each item only.
 * @returns {Array} A new, one-level-flattened array.
 */
function flatMap(array, callback) {
	return array.reduce((flattened, element) => {
		const mapped = callback(element);
		if (Array.isArray(mapped)) {
			flattened.push(...mapped);
		} else {
			flattened.push(mapped);
		}
		return flattened;
	}, []);
}
/**
 * Reports whether the last entry of a regenerate set's internal `data` array
 * is at or above 0x10000.
 * NOTE(review): presumably `data` is an ascending list of range boundaries,
 * so the last entry is the upper bound of the set — confirm against regenerate.
 *
 * @param {{data: number[]}} regenerateData Object exposing a `data` array.
 * @returns {boolean} `false` for empty data, otherwise the comparison result.
 */
function regenerateContainsAstral(regenerateData) {
	const { data } = regenerateData;
	if (data.length < 1) {
		return false;
	}
	const lastEntry = data[data.length - 1];
	return lastEntry >= 0x10000;
}
// Matches every `SyntaxCharacter` so it can be backslash-escaped.
// https://tc39.es/ecma262/#prod-SyntaxCharacter
const SYNTAX_CHARS = /[\\^$.*+?()[\]{}|]/g;

// Every code point above the Basic Multilingual Plane.
const ASTRAL_SET = regenerate().addRange(0x10000, 0x10FFFF);

// `LineTerminator`s (https://mths.be/es6#sec-line-terminators):
// <LF> Line Feed, <CR> Carriage Return, <LS> Line Separator and
// <PS> Paragraph Separator.
const NEWLINE_SET = regenerate().add(0x000A, 0x000D, 0x2028, 0x2029);

// All code points that `/./u` is supposed to match, i.e. every Unicode code
// point except the line terminators. https://mths.be/es6#sec-atom
const DOT_SET_UNICODE = UNICODE_SET.clone().remove(NEWLINE_SET);
/**
 * Looks up the precomputed set for a character class escape, picking the
 * table of `ESCAPE_SETS` that matches the active mode.
 *
 * @param {string} character Key used to query the escape-set tables.
 * @param {boolean} unicode Whether to use the Unicode tables.
 * @param {boolean} ignoreCase Whether to use the case-insensitive tables.
 * @param {boolean} [shouldApplySCF] Selects `UNICODESET_IGNORE_CASE` over
 *   `UNICODE_IGNORE_CASE` when both `unicode` and `ignoreCase` are set.
 * @returns {*} Whatever the selected table holds for `character`.
 */
const getCharacterClassEscapeSet = (character, unicode, ignoreCase, shouldApplySCF) => {
	if (!unicode) {
		return ESCAPE_SETS.REGULAR.get(character);
	}
	if (!ignoreCase) {
		return ESCAPE_SETS.UNICODE.get(character);
	}
	const ignoreCaseSet = ESCAPE_SETS.UNICODE_IGNORE_CASE.get(character);
	return shouldApplySCF
		? ESCAPE_SETS.UNICODESET_IGNORE_CASE.get(character)
		: ignoreCaseSet;
};
/**
 * Picks the set that `.` expands to in Unicode mode.
 *
 * @param {boolean} dotAll When truthy, line terminators are included.
 * @returns {*} `UNICODE_SET` when `dotAll` is truthy, else `DOT_SET_UNICODE`.
 */
const getUnicodeDotSet = (dotAll) => {
	if (dotAll) {
		return UNICODE_SET;
	}
	return DOT_SET_UNICODE;
};
/**
 * Loads the data module for a Unicode property (and optional value) from
 * `regenerate-unicode-properties`.
 *
 * @param {string} property Property name; treated as a binary property when
 *   `value` is falsy.
 * @param {string} [value] Property value.
 * @returns {*} The export of the required data module.
 * @throws {Error} When the module cannot be required.
 */
const getUnicodePropertyValueSet = (property, value) => {
	let path;
	if (value) {
		path = `${ property }/${ value }`;
	} else {
		path = `Binary_Property/${ property }`;
	}
	try {
		return require(`regenerate-unicode-properties/${ path }.js`);
	} catch (exception) {
		const message = `Failed to recognize value \`${ value }\` for property \`${ property }\`.`;
		throw new Error(message);
	}
};
/**
 * Resolves a property escape that has no `=`. The candidates are tried in
 * order, and a failed lookup simply moves on to the next one:
 *   1. a `General_Category` value,
 *   2. a property of strings,
 *   3. a binary property of single code points.
 *
 * @param {string} value The lone name or value inside the escape.
 * @returns {*} The data set loaded for the first interpretation that works.
 * @throws When the final (binary property) lookup fails as well.
 */
const handleLoneUnicodePropertyNameOrValue = (value) => {
	const fallibleLookups = [
		// Note: `unicodeMatchPropertyValue` throws on invalid values.
		() => {
			const category = unicodeMatchPropertyValue('General_Category', value);
			return getUnicodePropertyValueSet('General_Category', category);
		},
		() => getUnicodePropertyValueSet('Property_of_Strings', value),
	];
	for (const lookup of fallibleLookups) {
		try {
			return lookup();
		} catch (exception) {
			// Not this kind of property; fall through to the next candidate.
		}
	}
	// Note: `unicodeMatchProperty` throws on invalid properties.
	const binaryProperty = unicodeMatchProperty(value);
	return getUnicodePropertyValueSet(binaryProperty);
};
/**
 * Resolves the contents of a Unicode property escape into a set of single
 * characters plus a set of (already escaped) strings.
 *
 * @param {string} value Escape contents: `Name` or `Property=Value`.
 * @param {boolean} isNegative Whether the complement is requested.
 * @param {boolean} isUnicodeSetIgnoreCase Selects `UNICODE_IV_SET` instead of
 *   `UNICODE_SET` as the universe the complement is taken from.
 * @returns {{characters: *, strings: Set}} A fresh character set and strings.
 * @throws {Error} When negating a property that has strings.
 */
const getUnicodePropertyEscapeSet = (value, isNegative, isUnicodeSetIgnoreCase) => {
	const parts = value.split('=');
	const firstPart = parts[0];

	const resolveSet = () => {
		if (parts.length === 1) {
			return handleLoneUnicodePropertyNameOrValue(firstPart);
		}
		// The pattern consists of two parts, i.e. `Property=Value`.
		const property = unicodeMatchProperty(firstPart);
		const propertyValue = unicodeMatchPropertyValue(property, parts[1]);
		return getUnicodePropertyValueSet(property, propertyValue);
	};
	const set = resolveSet();

	if (!isNegative) {
		const characters = set.characters.clone();
		// Strings like *️⃣ must be escaped so that they can be safely used in
		// unions.
		const strings = set.strings
			? new Set(set.strings.map((str) => str.replace(SYNTAX_CHARS, '\\$&')))
			: new Set();
		return { characters, strings };
	}

	if (set.strings) {
		throw new Error('Cannot negate Unicode property of strings');
	}
	const universe = isUnicodeSetIgnoreCase ? UNICODE_IV_SET : UNICODE_SET;
	return {
		characters: universe.clone().remove(set.characters),
		strings: new Set()
	};
};
/**
 * Builds character-class data for a Unicode property escape: optionally
 * case-folds its characters, adds case equivalents when the current
 * configuration asks for them, and carries over its strings.
 *
 * @param {string} property Contents of the property escape.
 * @param {boolean} isNegative Whether the escape is negated.
 * @param {boolean} isUnicodeSetIgnoreCase Forwarded to
 *   `getUnicodePropertyEscapeSet`.
 * @param {boolean} [shouldApplySCF] Whether to map each character through
 *   `simpleCaseFolding` first.
 * @returns {object} Data shaped like `getCharacterClassEmptyData()`.
 */
const getUnicodePropertyEscapeCharacterClassData = (property, isNegative, isUnicodeSetIgnoreCase, shouldApplySCF) => {
	const { characters, strings } = getUnicodePropertyEscapeSet(property, isNegative, isUnicodeSetIgnoreCase);
	const data = getCharacterClassEmptyData();

	let singleChars = characters;
	if (shouldApplySCF) {
		const folded = characters.toArray().map((ch) => simpleCaseFolding(ch));
		singleChars = regenerate(folded);
	}

	const caseEqFlags = configGetCaseEqFlags();
	if (caseEqFlags) {
		// Iterate over a snapshot because the set grows while we walk it.
		singleChars.toArray().forEach((codePoint) => {
			const equivalents = getCaseEquivalents(codePoint, caseEqFlags);
			if (equivalents) {
				singleChars.add(equivalents);
			}
		});
	}
	data.singleChars = singleChars;

	if (strings.size > 0) {
		data.longStrings = strings;
		data.maybeIncludesStrings = true;
	}
	return data;
};
// Bit flags selecting which case-equivalence tables to consult.
const CASE_EQ_FLAG_NONE = 0;
const CASE_EQ_FLAG_BMP = 1 << 0;
const CASE_EQ_FLAG_UNICODE = 1 << 1;
/**
 * Derives the `CASE_EQ_FLAG_*` bitmask from the current `config`.
 *
 * - `i` modifier explicitly on: only when modifiers are transformed, BMP
 *   equivalents, plus Unicode equivalents under the `u`/`v` flags.
 * - `i` modifier untouched: Unicode equivalents when the unicode flag is
 *   being transformed and the `i` flag is set.
 * - Otherwise: no case equivalents.
 *
 * @returns {number} Bitwise combination of `CASE_EQ_FLAG_*`.
 */
function configGetCaseEqFlags() {
	const ignoreCaseModifier = config.modifiersData.i;

	if (ignoreCaseModifier === true) {
		if (!config.transform.modifiers) {
			return CASE_EQ_FLAG_NONE;
		}
		const isUnicodeMode = config.flags.unicode || config.flags.unicodeSets;
		return isUnicodeMode
			? (CASE_EQ_FLAG_BMP | CASE_EQ_FLAG_UNICODE)
			: CASE_EQ_FLAG_BMP;
	}

	if (
		ignoreCaseModifier === undefined &&
		config.transform.unicodeFlag &&
		config.flags.ignoreCase
	) {
		return CASE_EQ_FLAG_UNICODE;
	}
	return CASE_EQ_FLAG_NONE;
}
// For each code point from `min` through `max`, adds its case-equivalent
// code points (if any) to this set. `min` itself is always visited, even
// when it is greater than `max`. Returns the set for chaining.
regenerate.prototype.iuAddRange = function(min, max, caseEqFlags) {
	let codePoint = min;
	do {
		const equivalents = getCaseEquivalents(codePoint, caseEqFlags);
		if (equivalents) {
			this.add(equivalents);
		}
		codePoint++;
	} while (codePoint <= max);
	return this;
};
// For each code point from `min` through `max`, removes its case-equivalent
// code points (if any) from this set. `min` itself is always visited, even
// when it is greater than `max`. Returns the set for chaining.
regenerate.prototype.iuRemoveRange = function(min, max, caseEqFlags) {
	let codePoint = min;
	do {
		const equivalents = getCaseEquivalents(codePoint, caseEqFlags);
		if (equivalents) {
			this.remove(equivalents);
		}
		codePoint++;
	} while (codePoint <= max);
	return this;
};
/**
 * Replaces the contents of the AST node `item` in place with the parse tree
 * of `pattern`. Trees that are not a character class, group or value are
 * first wrapped in a non-capturing group.
 *
 * @param {object} item AST node to overwrite (mutated via `Object.assign`).
 * @param {string} pattern Replacement pattern source.
 */
const update = (item, pattern) => {
	const parserFlags = config.useUnicodeFlag ? 'u' : '';
	const parsed = parse(pattern, parserFlags, {
		lookbehind: true,
		namedGroups: true,
		unicodePropertyEscape: true,
		unicodeSet: true,
		modifiers: true,
	});
	const needsNoWrapping =
		parsed.type === 'characterClass' ||
		parsed.type === 'group' ||
		parsed.type === 'value';
	const replacement = needsNoWrapping ? parsed : wrap(parsed, pattern);
	Object.assign(item, replacement);
};
/**
 * Wraps a parse tree in a non-capturing group node.
 *
 * @param {object} tree Node that becomes the only child of the group.
 * @param {string} pattern Source text of `tree`, used to build `raw`.
 * @returns {object} A `group` node with `behavior: 'ignore'`.
 */
const wrap = (tree, pattern) => {
	const raw = `(?:${ pattern })`;
	const group = {
		'type': 'group',
		'behavior': 'ignore',
		'body': [tree],
		'raw': raw
	};
	return group;
};
/**
 * Given any codepoint ch, returns false or an array of characters,
 * such that for every c in the array,
 * c != ch and Canonicalize(~, c) == Canonicalize(~, ch)
 *
 * where Canonicalize is defined in
 * https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch
 *
 * The returned array is always a fresh one: an array stored in `iuMappings`
 * is copied before anything is appended, so the shared mapping tables are
 * never mutated.
 *
 * @param {number} codePoint input code point
 * @param {number} flags bitwise flags composed of CASE_EQ_FLAG_*
 * @returns false | number[]
 */
const getCaseEquivalents = (codePoint, flags) => {
	if (flags === CASE_EQ_FLAG_NONE) {
		return false;
	}

	const unicodeMapping =
		((flags & CASE_EQ_FLAG_UNICODE) ? iuMappings.get(codePoint) : undefined) || [];
	// A mapping is either a single code point or a list of them. Copy the list
	// so that the `push` calls below cannot leak into the mapping table.
	let result = typeof unicodeMapping === 'number'
		? [unicodeMapping]
		: unicodeMapping.slice();

	if (flags & CASE_EQ_FLAG_BMP) {
		// Iterate over a snapshot: `result` grows while we walk it.
		for (const cp of [codePoint].concat(result)) {
			// Fast path for ASCII characters
			if (cp >= 0x41 && cp <= 0x5a) {
				result.push(cp + 0x20);
			} else if (cp >= 0x61 && cp <= 0x7a) {
				result.push(cp - 0x20);
			} else {
				result = result.concat(iBMPMappings.get(cp) || []);
			}
		}
	}
	return result.length === 0 ? false : result;
};
/**
 * Maps a code point to its simple case folding.
 * https://tc39.es/ecma262/#sec-maybesimplecasefolding
 *
 * @param {number} codePoint Input code point.
 * @returns {number} The folded code point, or `codePoint` itself when there
 *   is no folding for it.
 */
const simpleCaseFolding = (codePoint) => {
	const isAscii = codePoint <= 0x7F;
	if (!isAscii) {
		return iuFoldings.get(codePoint) || codePoint;
	}
	// Fast path for ASCII characters: only A-Z fold (to a-z).
	const isAsciiUppercase = codePoint >= 0x41 && codePoint <= 0x5A;
	return isAsciiUppercase ? codePoint + 0x20 : codePoint;
};
/**
 * Builds the callbacks that fold one class item into the accumulated
 * character-class `data` for a given set operation.
 *
 * Every handler exposes the same five callbacks:
 *   - `single(data, cp)`: a single code point
 *   - `regSet(data, set)`: a regenerate set
 *   - `range(data, start, end)`: a code point range
 *   - `iuRange(data, start, end, caseEqFlags)`: the case equivalents of a range
 *   - `nested(data, nestedData)`: the data of a nested class
 *
 * The intersection and subtraction handlers read `data.first` to tell the
 * first operand (which seeds the result) apart from the following ones.
 *
 * @param {string} action One of `'union'`, `'union-negative'`,
 *   `'intersection'` or `'subtraction'`.
 * @returns {object} The handler for `action`.
 * @throws {Error} When `action` is not one of the known actions.
 */
const buildHandler = (action) => {
	switch (action) {
		case 'union':
			return {
				single: (data, cp) => {
					data.singleChars.add(cp);
				},
				regSet: (data, set2) => {
					data.singleChars.add(set2);
				},
				range: (data, start, end) => {
					data.singleChars.addRange(start, end);
				},
				iuRange: (data, start, end, caseEqFlags) => {
					data.singleChars.iuAddRange(start, end, caseEqFlags);
				},
				nested: (data, nestedData) => {
					data.singleChars.add(nestedData.singleChars);
					for (const str of nestedData.longStrings) data.longStrings.add(str);
					if (nestedData.maybeIncludesStrings) data.maybeIncludesStrings = true;
				}
			};
		case 'union-negative': {
			// Adds the complement of `set2` to what has been collected so far.
			const regSet = (data, set2) => {
				data.singleChars = UNICODE_SET.clone().remove(set2).add(data.singleChars);
			};
			return {
				single: (data, cp) => {
					const unicode = UNICODE_SET.clone();
					// If `cp` was already collected, the union with its complement is
					// everything; otherwise it is everything except `cp`.
					data.singleChars = data.singleChars.contains(cp) ? unicode : unicode.remove(cp);
				},
				regSet: regSet,
				range: (data, start, end) => {
					data.singleChars = UNICODE_SET.clone().removeRange(start, end).add(data.singleChars);
				},
				iuRange: (data, start, end, caseEqFlags) => {
					data.singleChars = UNICODE_SET.clone().iuRemoveRange(start, end, caseEqFlags).add(data.singleChars);
				},
				nested: (data, nestedData) => {
					regSet(data, nestedData.singleChars);
					if (nestedData.maybeIncludesStrings) throw new Error('ASSERTION ERROR');
				}
			};
		}
		case 'intersection': {
			const regSet = (data, set2) => {
				if (data.first) data.singleChars = set2;
				else data.singleChars.intersection(set2);
			};
			return {
				// Operands made of single characters only can never match a
				// string, so each of these callbacks also drops the strings.
				single: (data, cp) => {
					data.singleChars = data.first || data.singleChars.contains(cp) ? regenerate(cp) : regenerate();
					data.longStrings.clear();
					data.maybeIncludesStrings = false;
				},
				regSet: (data, set) => {
					regSet(data, set);
					data.longStrings.clear();
					data.maybeIncludesStrings = false;
				},
				range: (data, start, end) => {
					if (data.first) data.singleChars.addRange(start, end);
					else data.singleChars.intersection(regenerate().addRange(start, end));
					data.longStrings.clear();
					data.maybeIncludesStrings = false;
				},
				iuRange: (data, start, end, caseEqFlags) => {
					if (data.first) data.singleChars.iuAddRange(start, end, caseEqFlags);
					else data.singleChars.intersection(regenerate().iuAddRange(start, end, caseEqFlags));
					data.longStrings.clear();
					data.maybeIncludesStrings = false;
				},
				nested: (data, nestedData) => {
					regSet(data, nestedData.singleChars);
					if (data.first) {
						data.longStrings = nestedData.longStrings;
						data.maybeIncludesStrings = nestedData.maybeIncludesStrings;
					} else {
						// Keep only the strings present in both operands.
						for (const str of data.longStrings) {
							if (!nestedData.longStrings.has(str)) data.longStrings.delete(str);
						}
						if (!nestedData.maybeIncludesStrings) data.maybeIncludesStrings = false;
					}
				}
			};
		}
		case 'subtraction': {
			const regSet = (data, set2) => {
				if (data.first) data.singleChars.add(set2);
				else data.singleChars.remove(set2);
			};
			return {
				single: (data, cp) => {
					if (data.first) data.singleChars.add(cp);
					else data.singleChars.remove(cp);
				},
				regSet: regSet,
				range: (data, start, end) => {
					if (data.first) data.singleChars.addRange(start, end);
					else data.singleChars.removeRange(start, end);
				},
				iuRange: (data, start, end, caseEqFlags) => {
					if (data.first) data.singleChars.iuAddRange(start, end, caseEqFlags);
					else data.singleChars.iuRemoveRange(start, end, caseEqFlags);
				},
				nested: (data, nestedData) => {
					regSet(data, nestedData.singleChars);
					if (data.first) {
						data.longStrings = nestedData.longStrings;
						data.maybeIncludesStrings = nestedData.maybeIncludesStrings;
					} else {
						// Drop the strings that the subtracted operand contains.
						for (const str of data.longStrings) {
							if (nestedData.longStrings.has(str)) data.longStrings.delete(str);
						}
					}
				}
			};
		}
		// The `default` clause is only here as a safeguard; it should never be
		// reached. Code coverage tools should ignore it.
		/* node:coverage ignore next */
		default:
			// Report the offending `action`; no class item is in scope here.
			throw new Error(`Unknown set action: ${ action }`);
	}
};
/**
 * Creates the accumulator used while computing a character class.
 *
 * @returns {object} Fresh data with an empty character set, no strings,
 *   `first: true`, and `transformed` seeded from
 *   `config.transform.unicodeFlag`.
 */
const getCharacterClassEmptyData = () => {
	const emptyData = {
		transformed: config.transform.unicodeFlag,
		singleChars: regenerate(),
		longStrings: new Set(),
		hasEmptyString: false,
		first: true,
		maybeIncludesStrings: false
	};
	return emptyData;
};
/**
 * Returns `codePoint` followed by its case equivalents, if it has any.
 *
 * @param {number} codePoint Input code point.
 * @param {number} caseEqFlags Bitwise flags composed of `CASE_EQ_FLAG_*`.
 * @returns {number[]} A new array that always starts with `codePoint`.
 */
const concatCaseEquivalents = (codePoint, caseEqFlags) => {
	const equivalents = getCaseEquivalents(codePoint, caseEqFlags);
	return equivalents ? [codePoint].concat(equivalents) : [codePoint];
};
/**
 * Computes character-class data for a `classStrings` node. One-character
 * strings go into `singleChars`; every other string is rendered to pattern
 * source and stored in `longStrings`.
 *
 * @param {object} classStrings Node whose `strings` each hold `characters`.
 * @param {object} regenerateOptions Options passed to regenerate's `toString`.
 * @param {number} caseEqFlags Bitwise flags composed of `CASE_EQ_FLAG_*`.
 * @param {boolean} [shouldApplySCF] Whether to case-fold each code point.
 * @returns {object} Data shaped like `getCharacterClassEmptyData()`.
 */
const computeClassStrings = (classStrings, regenerateOptions, caseEqFlags, shouldApplySCF) => {
	const data = getCharacterClassEmptyData();
	const fold = (cp) => (shouldApplySCF ? simpleCaseFolding(cp) : cp);

	// Renders one character of a multi-character string to pattern source.
	const stringifyCharacter = (ch) => {
		const codePoint = fold(ch.codePoint);
		if (caseEqFlags) {
			const set = regenerate(concatCaseEquivalents(codePoint, caseEqFlags));
			return set.toString(regenerateOptions);
		}
		if (codePoint !== ch.codePoint) {
			return regenerate(codePoint).toString(regenerateOptions);
		}
		return generate(ch);
	};

	for (const { characters } of classStrings.strings) {
		if (characters.length === 1) {
			const codePoint = fold(characters[0].codePoint);
			for (const cp of concatCaseEquivalents(codePoint, caseEqFlags)) {
				data.singleChars.add(cp);
			}
			continue;
		}
		let stringifiedString = '';
		for (const ch of characters) {
			stringifiedString += stringifyCharacter(ch);
		}
		data.longStrings.add(stringifiedString);
		data.maybeIncludesStrings = true;
	}
	return data;
};
/**
 * Computes the characters and strings matched by a character class node by
 * folding each of its body items into an accumulator, using the handlers
 * that correspond to the class kind (union, intersection or subtraction).
 *
 * @param {object} characterClassItem `characterClass` AST node.
 * @param {object} regenerateOptions Options passed to regenerate's `toString`.
 * @param {boolean} [shouldApplySCF] Whether code points are case-folded
 *   first; forced on for intersections and subtractions in ignore-case mode.
 * @returns {object} Data shaped like `getCharacterClassEmptyData()`.
 * @throws {SyntaxError} When a negated class may contain strings.
 * @throws {Error} On an unknown class kind or body item type.
 */
const computeCharacterClass = (characterClassItem, regenerateOptions, shouldApplySCF) => {
	const data = getCharacterClassEmptyData();
	const caseEqFlags = configGetCaseEqFlags();
	const classKind = characterClassItem.kind;
	let applySCF = shouldApplySCF;
	let handlePositive;
	let handleNegative;

	switch (classKind) {
		case 'union':
			handlePositive = buildHandler('union');
			handleNegative = buildHandler('union-negative');
			break;
		case 'intersection':
			handlePositive = buildHandler('intersection');
			handleNegative = buildHandler('subtraction');
			break;
		case 'subtraction':
			handlePositive = buildHandler('subtraction');
			handleNegative = buildHandler('intersection');
			break;
		// The `default` clause is only here as a safeguard; it should never be
		// reached. Code coverage tools should ignore it.
		/* node:coverage ignore next */
		default:
			throw new Error(`Unknown character class kind: ${ characterClassItem.kind }`);
	}

	if (classKind !== 'union') {
		if (config.transform.unicodeSetsFlag) {
			data.transformed = true;
		}
		if (config.isIgnoreCaseMode) {
			applySCF = true;
		}
	}

	for (const item of characterClassItem.body) {
		switch (item.type) {
			case 'value': {
				const codePoint = applySCF ? simpleCaseFolding(item.codePoint) : item.codePoint;
				const withEquivalents = concatCaseEquivalents(codePoint, caseEqFlags);
				handlePositive.regSet(data, regenerate(withEquivalents));
				if (withEquivalents.length > 1) {
					data.transformed = true;
				}
				break;
			}
			case 'characterClassRange': {
				const min = item.min.codePoint;
				const max = item.max.codePoint;
				if (applySCF) {
					const folded = [];
					for (let cp = min; cp <= max; cp++) {
						folded.push(simpleCaseFolding(cp));
					}
					handlePositive.regSet(data, regenerate(folded));
				} else {
					handlePositive.range(data, min, max);
				}
				if (caseEqFlags) {
					// If applySCF is true, it is still ok to call iuRange because
					// the set [min, max] shares the same case equivalents with
					// scf([min, max])
					handlePositive.iuRange(data, min, max, caseEqFlags);
					data.transformed = true;
				}
				break;
			}
			case 'characterClassEscape': {
				const escapeSet = getCharacterClassEscapeSet(
					item.value,
					config.flags.unicode || config.flags.unicodeSets,
					config.flags.ignoreCase,
					applySCF
				);
				handlePositive.regSet(data, escapeSet);
				break;
			}
			case 'unicodePropertyEscape': {
				const nestedData = getUnicodePropertyEscapeCharacterClassData(
					item.value,
					item.negative,
					config.flags.unicodeSets && config.isIgnoreCaseMode,
					applySCF
				);
				handlePositive.nested(data, nestedData);
				data.transformed =
					data.transformed ||
					config.transform.unicodePropertyEscapes ||
					(config.transform.unicodeSetsFlag && (nestedData.maybeIncludesStrings || characterClassItem.kind !== "union" || item.negative));
				break;
			}
			case 'characterClass': {
				const handler = item.negative ? handleNegative : handlePositive;
				const nestedData = computeCharacterClass(item, regenerateOptions, applySCF);
				handler.nested(data, nestedData);
				data.transformed = true;
				break;
			}
			case 'classStrings': {
				const stringsData = computeClassStrings(item, regenerateOptions, caseEqFlags, applySCF);
				handlePositive.nested(data, stringsData);
				data.transformed = true;
				break;
			}
			// The `default` clause is only here as a safeguard; it should never be
			// reached. Code coverage tools should ignore it.
			/* node:coverage ignore next */
			default:
				throw new Error(`Unknown term type: ${ item.type }`);
		}
		// Intersection and subtraction handlers treat the first operand
		// specially.
		data.first = false;
	}

	if (characterClassItem.negative && data.maybeIncludesStrings) {
		throw new SyntaxError('Cannot negate set containing strings');
	}
	return data;
};
/**
 * Rewrites a character class node in place when its computed data is marked
 * as `transformed`; otherwise the node is left untouched.
 *
 * @param {object} characterClassItem Node to rewrite (mutated by `update`).
 * @param {object} regenerateOptions Options passed to regenerate's `toString`.
 * @param {object} [computed] Precomputed class data; computed from the node
 *   when omitted.
 * @returns {object} `characterClassItem`.
 */
const processCharacterClass = (
	characterClassItem,
	regenerateOptions,
	computed = computeCharacterClass(characterClassItem, regenerateOptions)
) => {
	const { singleChars, transformed, longStrings } = computed;
	if (!transformed) {
		return characterClassItem;
	}

	// If single chars already contains some astral character, regenerate
	// (bmpOnly: true) will create valid regex strings
	const bmpOnly = regenerateContainsAstral(singleChars);
	const setStr = singleChars.toString(Object.assign({}, regenerateOptions, { bmpOnly: bmpOnly }));

	if (!characterClassItem.negative) {
		// Longest strings go first; the empty string, if any, stays last, with
		// the character set placed right before it.
		const hasEmptyString = longStrings.has('');
		const pieces = Array.from(longStrings).sort((a, b) => b.length - a.length);
		if (setStr !== '[]' || longStrings.size === 0) {
			const insertAt = pieces.length - (hasEmptyString ? 1 : 0);
			pieces.splice(insertAt, 0, setStr);
		}
		update(characterClassItem, pieces.join('|'));
		return characterClassItem;
	}

	if (config.useUnicodeFlag) {
		const inner = setStr[0] === '[' ? setStr.slice(1, -1) : setStr;
		update(characterClassItem, `[^${inner}]`);
		return characterClassItem;
	}

	const isUnicodeMode = config.flags.unicode || config.flags.unicodeSets;
	if (!isUnicodeMode) {
		update(characterClassItem, `(?!${setStr})[^]`);
		return characterClassItem;
	}

	if (!config.flags.ignoreCase) {
		// Generate negative set directly when case folding is not involved.
		const negativeSet = UNICODE_SET.clone().remove(singleChars);
		update(characterClassItem, negativeSet.toString(regenerateOptions));
		return characterClassItem;
	}

	const astralCharsSet = singleChars.clone().intersection(ASTRAL_SET);
	// Assumption: singleChars do not contain lone surrogates.
	// Regex like /[^\ud800]/u is not supported
	const surrogateOrBMPSetStr = singleChars
		.clone()
		.remove(astralCharsSet)
		.addRange(0xd800, 0xdfff)
		.toString({ bmpOnly: true });
	// Don't generate negative lookahead for astral characters
	// because the case folding is not working anyway as we break
	// code points into surrogate pairs.
	const astralNegativeSetStr = ASTRAL_SET
		.clone()
		.remove(astralCharsSet)
		.toString(regenerateOptions);
	// The transform here does not support lone surrogates.
	update(
		characterClassItem,
		`(?!${surrogateOrBMPSetStr})[^]|${astralNegativeSetStr}`
	);
	return characterClassItem;
};
/**
 * Throws when some named reference never found its group.
 *
 * @param {{unmatchedReferences: object}} groups Group bookkeeping; the keys
 *   of `unmatchedReferences` are the unresolved names.
 * @throws {Error} Listing the unresolved group names.
 */
const assertNoUnmatchedReferences = (groups) => {
	const unresolvedNames = Object.keys(groups.unmatchedReferences);
	if (unresolvedNames.length === 0) {
		return;
	}
	throw new Error(`Unknown group names: ${unresolvedNames}`);
};
/**
 * Processes a group that carries inline modifiers: applies the modifiers to
 * `config.modifiersData` while the group body is processed, then restores
 * the previous modifier state.
 *
 * @param {object} item Group node with `modifierFlags` (mutated).
 * @param {object} regenerateOptions Options passed to regenerate's `toString`.
 * @param {object} groups Group bookkeeping shared across the traversal.
 * @returns {object} `item`.
 */
const processModifiers = (item, regenerateOptions, groups) => {
	const { enabling, disabling } = item.modifierFlags;
	const previousModifiers = Object.assign({}, config.modifiersData);

	for (const flag of enabling) {
		config.modifiersData[flag] = true;
	}
	for (const flag of disabling) {
		config.modifiersData[flag] = false;
	}

	if (config.transform.modifiers) {
		// The modifiers are compiled away, leaving a plain non-capturing group.
		delete item.modifierFlags;
		item.behavior = 'ignore';
	}

	item.body = item.body.map((term) => processTerm(term, regenerateOptions, groups));

	config.modifiersData = previousModifiers;
	return item;
};
/**
 * Recursively processes one AST node, rewriting it according to `config`.
 * Mutates `item` and `groups`.
 *
 * @param {object} item AST node to process.
 * @param {object} regenerateOptions Options passed to regenerate's `toString`.
 * @param {object} groups Group bookkeeping (`lastIndex`, `names`,
 *   `namesConflicts`, `unmatchedReferences`, `onNamedGroup`).
 * @returns {object} The processed node; usually `item` itself, but named
 *   references may be replaced by a new node.
 * @throws {Error} On duplicate group names, unsupported properties of
 *   strings, or an unknown node type.
 */
const processTerm = (item, regenerateOptions, groups) => {
	const processChild = (term) => processTerm(term, regenerateOptions, groups);

	switch (item.type) {
		case 'dot': {
			if (config.transform.unicodeFlag) {
				const dotSet = getUnicodeDotSet(config.isDotAllMode);
				update(item, dotSet.toString(regenerateOptions));
			} else if ((config.modifiersData.s != null ? config.modifiersData.s && config.transform.modifiers : config.transform.dotAllFlag)) {
				// TODO: consider changing this at the regenerate level.
				update(item, '[^]');
			}
			break;
		}
		case 'characterClass':
			item = processCharacterClass(item, regenerateOptions);
			break;
		case 'unicodePropertyEscape': {
			const data = getUnicodePropertyEscapeCharacterClassData(item.value, item.negative, config.flags.unicodeSets && config.isIgnoreCaseMode);
			if (data.maybeIncludesStrings) {
				if (!config.flags.unicodeSets) {
					throw new Error(
						'Properties of strings are only supported when using the unicodeSets (v) flag.'
					);
				}
				if (config.transform.unicodeSetsFlag) {
					data.transformed = true;
					item = processCharacterClass(item, regenerateOptions, data);
				}
			} else if (config.transform.unicodePropertyEscapes || configGetCaseEqFlags()) {
				update(item, data.singleChars.toString(regenerateOptions));
			}
			break;
		}
		case 'characterClassEscape': {
			if (config.transform.unicodeFlag) {
				const escapeSet = getCharacterClassEscapeSet(
					item.value,
					/* config.transform.unicodeFlag implies config.flags.unicode */ true,
					config.flags.ignoreCase
				);
				update(item, escapeSet.toString(regenerateOptions));
			}
			break;
		}
		case 'group': {
			if (item.behavior == 'normal') {
				groups.lastIndex++;
			}
			if (item.name) {
				const name = item.name.value;

				if (groups.namesConflicts[name]) {
					throw new Error(
						`Group '${ name }' has already been defined in this context.`
					);
				}
				groups.namesConflicts[name] = true;

				if (config.transform.namedGroups) {
					delete item.name;
				}

				const index = groups.lastIndex;
				if (!groups.names[name]) {
					groups.names[name] = [];
				}
				groups.names[name].push(index);

				if (groups.onNamedGroup) {
					groups.onNamedGroup.call(null, name, index);
				}

				if (groups.unmatchedReferences[name]) {
					delete groups.unmatchedReferences[name];
				}
			}
			if (item.modifierFlags) {
				return processModifiers(item, regenerateOptions, groups);
			}
		}
		/* falls through */
		case 'quantifier':
			item.body = item.body.map(processChild);
			break;
		case 'disjunction': {
			const outerNamesConflicts = groups.namesConflicts;
			item.body = item.body.map((term) => {
				// Every branch gets its own scope for name conflicts, inheriting
				// the names of the enclosing scope.
				groups.namesConflicts = Object.create(outerNamesConflicts);
				return processChild(term);
			});
			break;
		}
		case 'alternative':
			item.body = flatMap(item.body, (term) => {
				const processed = processChild(term);
				// Alternatives cannot contain alternatives; flatten them.
				return processed.type === 'alternative' ? processed.body : processed;
			});
			break;
		case 'value': {
			const codePoint = item.codePoint;
			const caseEqFlags = configGetCaseEqFlags();
			const withEquivalents = concatCaseEquivalents(codePoint, caseEqFlags);
			const isPrintableAsciiSymbol =
				item.kind === "symbol" && codePoint >= 0x20 && codePoint <= 0x7E;
			if (withEquivalents.length === 1 && isPrintableAsciiSymbol) {
				// skip regenerate when it is a printable ASCII symbol
				break;
			}
			update(item, regenerate(withEquivalents).toString(regenerateOptions));
			break;
		}
		case 'reference': {
			if (!item.name) {
				break;
			}
			const name = item.name.value;
			const indexes = groups.names[name];
			if (!indexes) {
				groups.unmatchedReferences[name] = true;
			}
			if (!config.transform.namedGroups) {
				break;
			}
			if (!indexes) {
				// This named reference comes before the group where it’s defined,
				// so it’s always an empty match.
				return {
					'type': 'group',
					'behavior': 'ignore',
					'body': [],
					'raw': '(?:)',
				};
			}
			const body = indexes.map((index) => ({
				'type': 'reference',
				'matchIndex': index,
				'raw': '\\' + index,
			}));
			if (body.length === 1) {
				return body[0];
			}
			return {
				'type': 'alternative',
				'body': body,
				'raw': body.map((term) => term.raw).join(''),
			};
		}
		case 'anchor':
			if (config.modifiersData.m && config.transform.modifiers) {
				if (item.kind == 'start') {
					update(item, `(?:^|(?<=${NEWLINE_SET.toString()}))`);
				} else if (item.kind == 'end') {
					update(item, `(?:$|(?=${NEWLINE_SET.toString()}))`);
				}
			}
			break;
		case 'empty':
			// Nothing to do here.
			break;
		// The `default` clause is only here as a safeguard; it should never be
		// reached. Code coverage tools should ignore it.
		/* node:coverage ignore next */
		default:
			throw new Error(`Unknown term type: ${ item.type }`);
	}
	return item;
};
// Module-level state describing the current rewrite: the flags of the
// pattern, which features must be transformed, and the inline modifiers in
// effect at the point of the tree being processed (`undefined` = not set).
const config = {
	flags: {
		ignoreCase: false,
		unicode: false,
		unicodeSets: false,
		dotAll: false,
		multiline: false,
	},
	transform: {
		dotAllFlag: false,
		unicodeFlag: false,
		unicodeSetsFlag: false,
		unicodePropertyEscapes: false,
		namedGroups: false,
		modifiers: false,
	},
	modifiersData: {
		i: undefined,
		s: undefined,
		m: undefined,
	},
	// Whether the generated pattern still relies on the `u` flag.
	get useUnicodeFlag() {
		const { unicode, unicodeSets } = this.flags;
		return (unicode || unicodeSets) && !this.transform.unicodeFlag;
	},
	// The `s` modifier, when set, takes precedence over the `s` flag.
	get isDotAllMode() {
		const modifier = this.modifiersData.s;
		return modifier === undefined ? this.flags.dotAll : modifier;
	},
	// The `i` modifier, when set, takes precedence over the `i` flag.
	get isIgnoreCaseMode() {
		const modifier = this.modifiersData.i;
		return modifier === undefined ? this.flags.ignoreCase : modifier;
	}
};
/**
 * Validates the user-supplied options object, rejecting unknown keys and
 * values of the wrong kind. Does nothing when `options` is falsy.
 *
 * @param {object} [options] Options passed to `rewritePattern`.
 * @throws {Error} On the first invalid option encountered.
 */
const validateOptions = (options) => {
	if (!options) {
		return;
	}

	const transformOnlyKeys = [
		'dotAllFlag',
		'unicodeFlag',
		'unicodePropertyEscapes',
		'unicodeSetsFlag',
		'namedGroups',
	];
	const callbackKeys = ['onNamedGroup', 'onNewFlags'];

	Object.keys(options).forEach((key) => {
		const value = options[key];
		const isDisabled = value == null || value === false;

		if (transformOnlyKeys.includes(key)) {
			if (!isDisabled && value !== 'transform') {
				throw new Error(`.${key} must be false (default) or 'transform'.`);
			}
			return;
		}
		// todo: remove modifiers: 'parse' in regexpu-core v7
		if (key === 'modifiers') {
			if (!isDisabled && value !== 'parse' && value !== 'transform') {
				throw new Error(`.${key} must be false (default), 'parse' or 'transform'.`);
			}
			return;
		}
		if (callbackKeys.includes(key)) {
			if (value != null && typeof value !== 'function') {
				throw new Error(`.${key} must be a function.`);
			}
			return;
		}
		throw new Error(`.${key} is not a valid regexpu-core option.`);
	});
};
/**
 * Tells whether `flags` contains `flag`; a missing or empty `flags` never
 * contains anything.
 *
 * @param {string} [flags] RegExp flags.
 * @param {string} flag Flag to look for.
 * @returns {boolean}
 */
const hasFlag = (flags, flag) => {
	if (!flags) {
		return false;
	}
	return flags.includes(flag);
};
/**
 * Tells whether the option `name` is set to `'transform'`.
 *
 * @param {object} [options] Options passed to `rewritePattern`.
 * @param {string} name Option name.
 * @returns {boolean} `false` when `options` is falsy.
 */
const transform = (options, name) => {
	if (!options) {
		return false;
	}
	return options[name] === 'transform';
};
/**
 * Rewrites a regular expression pattern so that the features selected for
 * transformation in `options` are compiled away.
 *
 * Resets the module-level `config` from `flags` and `options`, parses the
 * pattern, processes the tree and regenerates the pattern source.
 *
 * @param {string} pattern Source of the regular expression.
 * @param {string} [flags] Flags of the regular expression; may be omitted.
 * @param {object} [options] See `validateOptions` for the accepted keys.
 *   `onNamedGroup(name, index)` is called for every named group and
 *   `onNewFlags(flags)` receives the flags that the rewritten pattern needs.
 * @returns {*} The output of regjsgen's `generate` for the processed tree.
 * @throws {Error} On invalid options, duplicate or unknown group names, or
 *   unsupported constructs.
 */
const rewritePattern = (pattern, flags, options) => {
	validateOptions(options);

	config.flags.unicode = hasFlag(flags, 'u');
	config.flags.unicodeSets = hasFlag(flags, 'v');
	config.flags.ignoreCase = hasFlag(flags, 'i');
	config.flags.dotAll = hasFlag(flags, 's');
	config.flags.multiline = hasFlag(flags, 'm');

	config.transform.dotAllFlag = config.flags.dotAll && transform(options, 'dotAllFlag');
	config.transform.unicodeFlag = (config.flags.unicode || config.flags.unicodeSets) && transform(options, 'unicodeFlag');
	config.transform.unicodeSetsFlag = config.flags.unicodeSets && transform(options, 'unicodeSetsFlag');
	// unicodeFlag: 'transform' implies unicodePropertyEscapes: 'transform'
	config.transform.unicodePropertyEscapes = (config.flags.unicode || config.flags.unicodeSets) && (
		transform(options, 'unicodeFlag') || transform(options, 'unicodePropertyEscapes')
	);
	config.transform.namedGroups = transform(options, 'namedGroups');
	config.transform.modifiers = transform(options, 'modifiers');

	config.modifiersData.i = undefined;
	config.modifiersData.s = undefined;
	config.modifiersData.m = undefined;

	const regjsparserFeatures = {
		// Enable every stable RegExp feature by default
		'modifiers': true,
		'unicodePropertyEscape': true,
		'unicodeSet': true,
		'namedGroups': true,
		'lookbehind': true,
	};

	const regenerateOptions = {
		'hasUnicodeFlag': config.useUnicodeFlag,
		'bmpOnly': !config.flags.unicode && !config.flags.unicodeSets
	};

	const groups = {
		'onNamedGroup': options && options.onNamedGroup,
		'lastIndex': 0,
		'names': Object.create(null), // { [name]: Array<index> }
		'namesConflicts': Object.create(null), // { [name]: true }
		'unmatchedReferences': Object.create(null) // { [name]: true }
	};

	const tree = parse(pattern, flags, regjsparserFeatures);

	if (config.transform.modifiers) {
		if (/\(\?[a-z]*-[a-z]+:/.test(pattern)) {
			// the pattern _likely_ contain inline disabled modifiers
			// we need to traverse to make sure that they are actually modifiers and to collect them
			const allDisabledModifiers = Object.create(null);
			const itemStack = [tree];
			let node;
			while (node = itemStack.pop(), node != undefined) {
				if (Array.isArray(node)) {
					Array.prototype.push.apply(itemStack, node);
				} else if (typeof node == 'object' && node != null) {
					for (const key of Object.keys(node)) {
						const value = node[key];
						if (key == 'modifierFlags') {
							for (const flag of value.disabling) {
								allDisabledModifiers[flag] = true;
							}
						} else if (typeof value == 'object' && value != null) {
							itemStack.push(value);
						}
					}
				}
			}
			// A flag that is disabled somewhere in the pattern is tracked as
			// modifier state from the start, seeded with the flag's own value.
			if (allDisabledModifiers.i) {
				config.modifiersData.i = config.flags.ignoreCase;
			}
			if (allDisabledModifiers.m) {
				config.modifiersData.m = config.flags.multiline;
			}
			if (allDisabledModifiers.s) {
				config.modifiersData.s = config.flags.dotAll;
			}
		}
	}

	// Note: `processTerm` mutates `tree` and `groups`.
	processTerm(tree, regenerateOptions, groups);

	assertNoUnmatchedReferences(groups);

	const onNewFlags = options && options.onNewFlags;
	if (onNewFlags) {
		// `flags` may be omitted (see `hasFlag`); treat it as no flags at all
		// instead of failing on `undefined.split`.
		let newFlags = (flags || '').split('').filter((flag) => !config.modifiersData[flag]).join('');
		if (config.transform.unicodeSetsFlag) {
			newFlags = newFlags.replace('v', 'u');
		}
		if (config.transform.unicodeFlag) {
			newFlags = newFlags.replace('u', '');
		}
		if (config.transform.dotAllFlag) {
			newFlags = newFlags.replace('s', '');
		}
		onNewFlags(newFlags);
	}

	return generate(tree);
};

module.exports = rewritePattern;
|