  // utils.mjs
  //
  // Utilities
  //
  3. import * as mdurl from 'mdurl'
  4. import * as ucmicro from 'uc.micro'
  5. import { decodeHTML } from 'entities'
  /**
   * Returns the internal class tag of a value, e.g. `[object String]`.
   *
   * @param {*} obj - value to inspect
   * @returns {string} result of `Object.prototype.toString` applied to `obj`
   */
  function _class (obj) {
    return Object.prototype.toString.call(obj)
  }

  /**
   * Tells whether a value is a string. Because the check is based on the
   * class tag, both primitive strings and `String` wrapper objects qualify.
   *
   * @param {*} obj - value to test
   * @returns {boolean} `true` when the class tag is `[object String]`
   */
  function isString (obj) {
    return _class(obj) === '[object String]'
  }
  // Reference to the built-in method, so the check does not rely on the
  // inspected object providing (or not shadowing) its own `hasOwnProperty`.
  const _hasOwnProperty = Object.prototype.hasOwnProperty

  /**
   * Tells whether `object` has `key` as its own (non-inherited) property.
   *
   * @param {Object} object - object to inspect
   * @param {string|symbol} key - property key to look up
   * @returns {boolean} `true` when `key` is an own property of `object`
   */
  function has (object, key) {
    return _hasOwnProperty.call(object, key)
  }
  /**
   * Merge objects: copies the own enumerable string-keyed properties of every
   * source into `obj`, left to right, so later sources win on key conflicts.
   *
   * Falsy sources (`null`, `undefined`, `0`, `''`, ...) are skipped silently.
   *
   * @param {Object} obj - target object; it is mutated and returned
   * @param {...Object} sources - objects to copy properties from
   * @returns {Object} the same `obj` instance
   * @throws {TypeError} when a truthy source is not of type `object`
   */
  function assign (obj, ...sources) {
    for (const source of sources) {
      if (!source) { continue }

      if (typeof source !== 'object') {
        // The separating space keeps the offending value readable in the message.
        throw new TypeError(String(source) + ' must be object')
      }

      for (const key of Object.keys(source)) {
        obj[key] = source[key]
      }
    }

    return obj
  }
  /**
   * Remove the element at `pos` from an array and put the contents of another
   * array at that position. Useful for some operations with tokens.
   *
   * The input array is not modified; a new array is returned.
   *
   * @param {Array} src - source array
   * @param {number} pos - index of the element to replace
   * @param {Array} newElements - elements to insert in place of `src[pos]`
   * @returns {Array} new array with the replacement applied
   */
  function arrayReplaceAt (src, pos, newElements) {
    const head = src.slice(0, pos)
    const tail = src.slice(pos + 1)
    return [].concat(head, newElements, tail)
  }
  /**
   * Tells whether a numeric code is acceptable as the value of a numeric
   * character reference.
   *
   * @param {number} c - code point to check
   * @returns {boolean} `false` for surrogates, the U+FDD0..U+FDEF range, codes
   *   whose low 16 bits are 0xFFFE/0xFFFF, the listed control codes and
   *   anything above 0x10FFFF; `true` otherwise
   */
  function isValidEntityCode (c) {
    /* eslint no-bitwise:0 */
    // broken sequence (surrogate halves)
    if (c >= 0xD800 && c <= 0xDFFF) { return false }

    // never used
    if (c >= 0xFDD0 && c <= 0xFDEF) { return false }
    // last two codes of every 64K plane
    const low16 = c & 0xFFFF
    if (low16 === 0xFFFF || low16 === 0xFFFE) { return false }

    // control codes (tab 0x09, LF 0x0A, FF 0x0C and CR 0x0D stay allowed)
    if (c >= 0x00 && c <= 0x08) { return false }
    if (c === 0x0B) { return false }
    if (c >= 0x0E && c <= 0x1F) { return false }
    if (c >= 0x7F && c <= 0x9F) { return false }

    // out of range
    if (c > 0x10FFFF) { return false }

    return true
  }
  /**
   * Converts a code point to a string, encoding values above 0xFFFF as a
   * UTF-16 surrogate pair.
   *
   * @param {number} c - code point
   * @returns {string} one UTF-16 code unit for `c <= 0xFFFF`, two otherwise
   */
  function fromCodePoint (c) {
    /* eslint no-bitwise:0 */
    if (c > 0xffff) {
      // Split the 20-bit offset into high (top 10 bits) and low (bottom 10 bits) halves.
      const offset = c - 0x10000
      const highSurrogate = 0xd800 + (offset >> 10)
      const lowSurrogate = 0xdc00 + (offset & 0x3ff)

      return String.fromCharCode(highSurrogate, lowSurrogate)
    }

    return String.fromCharCode(c)
  }
  // Backslash followed by one ASCII punctuation character (captured in group 1).
  const UNESCAPE_MD_RE = /\\([!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~])/g
  // `&name;` or `&#...;` reference; the part between `&` and `;` is captured.
  const ENTITY_RE = /&([a-z#][a-z0-9]{1,31});/gi
  // Either of the above in one pass: group 1 = escaped char, group 2 = entity name.
  const UNESCAPE_ALL_RE = new RegExp(UNESCAPE_MD_RE.source + '|' + ENTITY_RE.source, 'gi')
  // Numeric reference body: `#` + 1-8 decimal digits, or `#x` + 1-8 hex digits.
  const DIGITAL_ENTITY_TEST_RE = /^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))$/i
  /**
   * Replacement callback for a single `&...;` reference.
   *
   * @param {string} match - the whole matched reference, e.g. `&amp;` or `&#x41;`
   * @param {string} name - the text between `&` and `;`
   * @returns {string} decoded character(s), or `match` itself when the
   *   reference is a numeric one with a disallowed code
   */
  function replaceEntityPattern (match, name) {
    if (name.charCodeAt(0) === 0x23/* # */ && DIGITAL_ENTITY_TEST_RE.test(name)) {
      // Numeric reference: `#x...` is hexadecimal, `#...` is decimal.
      const isHex = name[1].toLowerCase() === 'x'
      const code = isHex
        ? parseInt(name.slice(2), 16)
        : parseInt(name.slice(1), 10)

      return isValidEntityCode(code) ? fromCodePoint(code) : match
    }

    // Anything else is handed to the `entities` decoder. Its result is returned
    // as is: if nothing was decoded the result equals `match` anyway.
    return decodeHTML(match)
  }
  /* function replaceEntities(str) {
    if (str.indexOf('&') < 0) { return str; }
    return str.replace(ENTITY_RE, replaceEntityPattern);
  } */
  /**
   * Removes markdown backslash escapes: `\` followed by a character matched
   * by `UNESCAPE_MD_RE` is replaced with that character alone.
   *
   * @param {string} str - text to unescape
   * @returns {string} unescaped text (the input itself when it has no backslash)
   */
  function unescapeMd (str) {
    // Fast path: nothing to do without a backslash.
    if (str.indexOf('\\') === -1) { return str }

    return str.replace(UNESCAPE_MD_RE, '$1')
  }
  /**
   * Removes markdown backslash escapes and decodes `&...;` references in a
   * single pass over the string.
   *
   * @param {string} str - text to unescape
   * @returns {string} processed text (the input itself when it contains
   *   neither a backslash nor an ampersand)
   */
  function unescapeAll (str) {
    // Fast path: neither kind of sequence can be present.
    if (str.indexOf('\\') === -1 && str.indexOf('&') === -1) { return str }

    return str.replace(UNESCAPE_ALL_RE, function (match, escaped, entity) {
      // Group 1 is set for a backslash escape, group 2 for an entity reference.
      return escaped || replaceEntityPattern(match, entity)
    })
  }
  // Non-global twin of the replace pattern, used only for the cheap pre-check
  // (a non-global regexp keeps no `lastIndex` state between `.test()` calls).
  const HTML_ESCAPE_TEST_RE = /[&<>"]/
  const HTML_ESCAPE_REPLACE_RE = /[&<>"]/g
  // Character -> entity table for the four characters matched above.
  const HTML_REPLACEMENTS = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;'
  }

  /**
   * Maps one matched character to its HTML entity.
   *
   * @param {string} ch - one of `&`, `<`, `>`, `"`
   * @returns {string} the corresponding entity from `HTML_REPLACEMENTS`
   */
  function replaceUnsafeChar (ch) {
    return HTML_REPLACEMENTS[ch]
  }

  /**
   * Escapes `&`, `<`, `>` and `"` in a string as HTML entities.
   * Single quotes are left untouched.
   *
   * @param {string} str - text to escape
   * @returns {string} escaped text (the input itself when nothing needs escaping)
   */
  function escapeHtml (str) {
    if (!HTML_ESCAPE_TEST_RE.test(str)) { return str }

    return str.replace(HTML_ESCAPE_REPLACE_RE, replaceUnsafeChar)
  }
  // Characters that have a special meaning inside a regular expression.
  const REGEXP_ESCAPE_RE = /[.?*+^$[\]\\(){}|-]/g

  /**
   * Escapes regular expression metacharacters in a string, so the result can
   * be embedded in a `RegExp` source and match the original text literally.
   *
   * @param {string} str - literal text
   * @returns {string} text with every metacharacter prefixed by a backslash
   */
  function escapeRE (str) {
    // `$&` inserts the matched character after the added backslash.
    return str.replace(REGEXP_ESCAPE_RE, '\\$&')
  }
  /**
   * Tells whether a character code is a tab (0x09) or a space (0x20).
   *
   * @param {number} code - character code
   * @returns {boolean} `true` for tab or space, `false` for anything else
   */
  function isSpace (code) {
    return code === 0x09 || code === 0x20
  }
  /**
   * Tells whether a character code is whitespace:
   * Zs (unicode class) || [\t\f\v\r\n]
   *
   * @param {number} code - character code
   * @returns {boolean} `true` for the codes listed below, `false` otherwise
   */
  function isWhiteSpace (code) {
    // U+2000..U+200A: the contiguous run of fixed-width spaces
    if (code >= 0x2000 && code <= 0x200A) { return true }

    switch (code) {
      case 0x09: // \t
      case 0x0A: // \n
      case 0x0B: // \v
      case 0x0C: // \f
      case 0x0D: // \r
      case 0x20: // space
      case 0xA0: // no-break space
      case 0x1680:
      case 0x202F:
      case 0x205F:
      case 0x3000:
        return true
      default:
        return false
    }
  }
  /* eslint-disable max-len */
  /**
   * Tells whether a character is matched by `ucmicro.P` or `ucmicro.S`
   * (presumably the Unicode punctuation and symbol classes - confirm against
   * the `uc.micro` package).
   *
   * Currently without astral characters support.
   *
   * @param {string} ch - character to test
   * @returns {boolean} `true` when either pattern matches
   */
  function isPunctChar (ch) {
    return ucmicro.P.test(ch) || ucmicro.S.test(ch)
  }
  /**
   * Markdown ASCII punctuation characters.
   *
   * !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \, ], ^, _, `, {, |, }, or ~
   * http://spec.commonmark.org/0.15/#ascii-punctuation-character
   *
   * Don't confuse with unicode punctuation !!! It lacks some chars in ascii range.
   *
   * @param {number} ch - character code (compared strictly, so strings never match)
   * @returns {boolean} `true` when `ch` is one of the codes listed above
   */
  function isMdAsciiPunct (ch) {
    switch (ch) {
      // 0x21..0x2F
      case 0x21/* ! */:
      case 0x22/* " */:
      case 0x23/* # */:
      case 0x24/* $ */:
      case 0x25/* % */:
      case 0x26/* & */:
      case 0x27/* ' */:
      case 0x28/* ( */:
      case 0x29/* ) */:
      case 0x2A/* * */:
      case 0x2B/* + */:
      case 0x2C/* , */:
      case 0x2D/* - */:
      case 0x2E/* . */:
      case 0x2F/* / */:
      // 0x3A..0x40
      case 0x3A/* : */:
      case 0x3B/* ; */:
      case 0x3C/* < */:
      case 0x3D/* = */:
      case 0x3E/* > */:
      case 0x3F/* ? */:
      case 0x40/* @ */:
      // 0x5B..0x60
      case 0x5B/* [ */:
      case 0x5C/* \ */:
      case 0x5D/* ] */:
      case 0x5E/* ^ */:
      case 0x5F/* _ */:
      case 0x60/* ` */:
      // 0x7B..0x7E
      case 0x7B/* { */:
      case 0x7C/* | */:
      case 0x7D/* } */:
      case 0x7E/* ~ */:
        return true
      default:
        return false
    }
  }
  /**
   * Helper to unify [reference labels]: trims the label, collapses inner
   * whitespace runs to a single space and folds letter case, so labels that
   * differ only in case or spacing produce the same string.
   *
   * @param {string} str - raw label text
   * @returns {string} normalized (uppercased) label
   */
  function normalizeReference (str) {
    // Trim and collapse whitespace
    //
    let label = str.trim().replace(/\s+/g, ' ')

    // In node v10 'ẞ'.toLowerCase() === 'Ṿ', which is presumed to be a bug
    // fixed in v12 (couldn't find any details).
    //
    // So treat this one as a special case
    // (remove this when node v10 is no longer supported).
    //
    if ('ẞ'.toLowerCase() === 'Ṿ') {
      label = label.replace(/ẞ/g, 'ß')
    }

    // .toLowerCase().toUpperCase() should get rid of all differences
    // between letter variants.
    //
    // Simple .toLowerCase() doesn't normalize 125 code points correctly,
    // and .toUpperCase doesn't normalize 6 of them (list of exceptions:
    // İ, ϴ, ẞ, Ω, K, Å - those are already uppercased, but have differently
    // uppercased versions).
    //
    // Here's an example showing how it happens. Let's take the Greek letter theta:
    // uppercase U+0398 (Θ), U+03f4 (ϴ) and lowercase U+03b8 (θ), U+03d1 (ϑ)
    //
    // Unicode entries:
    // 0398;GREEK CAPITAL LETTER THETA;Lu;0;L;;;;;N;;;;03B8;
    // 03B8;GREEK SMALL LETTER THETA;Ll;0;L;;;;;N;;;0398;;0398
    // 03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398
    // 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8;
    //
    // Case-insensitive comparison should treat all of them as equivalent.
    //
    // But .toLowerCase() doesn't change ϑ (it's already lowercase),
    // and .toUpperCase() doesn't change ϴ (already uppercase).
    //
    // Applying first lower then upper case normalizes any character:
    // '\u0398\u03f4\u03b8\u03d1'.toLowerCase().toUpperCase() === '\u0398\u0398\u0398\u0398'
    //
    // Note: this is equivalent to unicode case folding; unicode normalization
    // is a different step that is not required here.
    //
    // Final result should be uppercased, because it's later stored in an object
    // (this avoids a conflict with Object.prototype members,
    // most notably, `__proto__`)
    //
    return label.toLowerCase().toUpperCase()
  }
  // Re-export libraries commonly used in both markdown-it and its plugins,
  // so plugins won't have to depend on them explicitly, which reduces their
  // bundled size (e.g. a browser build).
  //
  const lib = { mdurl, ucmicro }
  246. export {
  247. lib,
  248. assign,
  249. isString,
  250. has,
  251. unescapeMd,
  252. unescapeAll,
  253. isValidEntityCode,
  254. fromCodePoint,
  255. escapeHtml,
  256. arrayReplaceAt,
  257. isSpace,
  258. isWhiteSpace,
  259. isMdAsciiPunct,
  260. isPunctChar,
  261. escapeRE,
  262. normalizeReference
  263. }