// re.mjs (5.4 KB)
  1. import { Any, Cc, Z, P } from 'uc.micro'
  2. export default function (opts) {
  3. const re = {}
  4. opts = opts || {}
  5. re.src_Any = Any.source
  6. re.src_Cc = Cc.source
  7. re.src_Z = Z.source
  8. re.src_P = P.source
  9. // \p{\Z\P\Cc\CF} (white spaces + control + format + punctuation)
  10. re.src_ZPCc = [re.src_Z, re.src_P, re.src_Cc].join('|')
  11. // \p{\Z\Cc} (white spaces + control)
  12. re.src_ZCc = [re.src_Z, re.src_Cc].join('|')
  13. // Experimental. List of chars, completely prohibited in links
  14. // because can separate it from other part of text
  15. const text_separators = '[><\uff5c]'
  16. // All possible word characters (everything without punctuation, spaces & controls)
  17. // Defined via punctuation & spaces to save space
  18. // Should be something like \p{\L\N\S\M} (\w but without `_`)
  19. re.src_pseudo_letter = '(?:(?!' + text_separators + '|' + re.src_ZPCc + ')' + re.src_Any + ')'
  20. // The same as abothe but without [0-9]
  21. // var src_pseudo_letter_non_d = '(?:(?![0-9]|' + src_ZPCc + ')' + src_Any + ')';
  22. re.src_ip4 =
  23. '(?:(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'
  24. // Prohibit any of "@/[]()" in user/pass to avoid wrong domain fetch.
  25. re.src_auth = '(?:(?:(?!' + re.src_ZCc + '|[@/\\[\\]()]).)+@)?'
  26. re.src_port =
  27. '(?::(?:6(?:[0-4]\\d{3}|5(?:[0-4]\\d{2}|5(?:[0-2]\\d|3[0-5])))|[1-5]?\\d{1,4}))?'
  28. re.src_host_terminator =
  29. '(?=$|' + text_separators + '|' + re.src_ZPCc + ')' +
  30. '(?!' + (opts['---'] ? '-(?!--)|' : '-|') + '_|:\\d|\\.-|\\.(?!$|' + re.src_ZPCc + '))'
  31. re.src_path =
  32. '(?:' +
  33. '[/?#]' +
  34. '(?:' +
  35. '(?!' + re.src_ZCc + '|' + text_separators + '|[()[\\]{}.,"\'?!\\-;]).|' +
  36. '\\[(?:(?!' + re.src_ZCc + '|\\]).)*\\]|' +
  37. '\\((?:(?!' + re.src_ZCc + '|[)]).)*\\)|' +
  38. '\\{(?:(?!' + re.src_ZCc + '|[}]).)*\\}|' +
  39. '\\"(?:(?!' + re.src_ZCc + '|["]).)+\\"|' +
  40. "\\'(?:(?!" + re.src_ZCc + "|[']).)+\\'|" +
  41. // allow `I'm_king` if no pair found
  42. "\\'(?=" + re.src_pseudo_letter + '|[-])|' +
  43. // google has many dots in "google search" links (#66, #81).
  44. // github has ... in commit range links,
  45. // Restrict to
  46. // - english
  47. // - percent-encoded
  48. // - parts of file path
  49. // - params separator
  50. // until more examples found.
  51. '\\.{2,}[a-zA-Z0-9%/&]|' +
  52. '\\.(?!' + re.src_ZCc + '|[.]|$)|' +
  53. (opts['---']
  54. ? '\\-(?!--(?:[^-]|$))(?:-*)|' // `---` => long dash, terminate
  55. : '\\-+|'
  56. ) +
  57. // allow `,,,` in paths
  58. ',(?!' + re.src_ZCc + '|$)|' +
  59. // allow `;` if not followed by space-like char
  60. ';(?!' + re.src_ZCc + '|$)|' +
  61. // allow `!!!` in paths, but not at the end
  62. '\\!+(?!' + re.src_ZCc + '|[!]|$)|' +
  63. '\\?(?!' + re.src_ZCc + '|[?]|$)' +
  64. ')+' +
  65. '|\\/' +
  66. ')?'
  67. // Allow anything in markdown spec, forbid quote (") at the first position
  68. // because emails enclosed in quotes are far more common
  69. re.src_email_name =
  70. '[\\-;:&=\\+\\$,\\.a-zA-Z0-9_][\\-;:&=\\+\\$,\\"\\.a-zA-Z0-9_]*'
  71. re.src_xn =
  72. 'xn--[a-z0-9\\-]{1,59}'
  73. // More to read about domain names
  74. // http://serverfault.com/questions/638260/
  75. re.src_domain_root =
  76. // Allow letters & digits (http://test1)
  77. '(?:' +
  78. re.src_xn +
  79. '|' +
  80. re.src_pseudo_letter + '{1,63}' +
  81. ')'
  82. re.src_domain =
  83. '(?:' +
  84. re.src_xn +
  85. '|' +
  86. '(?:' + re.src_pseudo_letter + ')' +
  87. '|' +
  88. '(?:' + re.src_pseudo_letter + '(?:-|' + re.src_pseudo_letter + '){0,61}' + re.src_pseudo_letter + ')' +
  89. ')'
  90. re.src_host =
  91. '(?:' +
  92. // Don't need IP check, because digits are already allowed in normal domain names
  93. // src_ip4 +
  94. // '|' +
  95. '(?:(?:(?:' + re.src_domain + ')\\.)*' + re.src_domain/* _root */ + ')' +
  96. ')'
  97. re.tpl_host_fuzzy =
  98. '(?:' +
  99. re.src_ip4 +
  100. '|' +
  101. '(?:(?:(?:' + re.src_domain + ')\\.)+(?:%TLDS%))' +
  102. ')'
  103. re.tpl_host_no_ip_fuzzy =
  104. '(?:(?:(?:' + re.src_domain + ')\\.)+(?:%TLDS%))'
  105. re.src_host_strict =
  106. re.src_host + re.src_host_terminator
  107. re.tpl_host_fuzzy_strict =
  108. re.tpl_host_fuzzy + re.src_host_terminator
  109. re.src_host_port_strict =
  110. re.src_host + re.src_port + re.src_host_terminator
  111. re.tpl_host_port_fuzzy_strict =
  112. re.tpl_host_fuzzy + re.src_port + re.src_host_terminator
  113. re.tpl_host_port_no_ip_fuzzy_strict =
  114. re.tpl_host_no_ip_fuzzy + re.src_port + re.src_host_terminator
  115. //
  116. // Main rules
  117. //
  118. // Rude test fuzzy links by host, for quick deny
  119. re.tpl_host_fuzzy_test =
  120. 'localhost|www\\.|\\.\\d{1,3}\\.|(?:\\.(?:%TLDS%)(?:' + re.src_ZPCc + '|>|$))'
  121. re.tpl_email_fuzzy =
  122. '(^|' + text_separators + '|"|\\(|' + re.src_ZCc + ')' +
  123. '(' + re.src_email_name + '@' + re.tpl_host_fuzzy_strict + ')'
  124. re.tpl_link_fuzzy =
  125. // Fuzzy link can't be prepended with .:/\- and non punctuation.
  126. // but can start with > (markdown blockquote)
  127. '(^|(?![.:/\\-_@])(?:[$+<=>^`|\uff5c]|' + re.src_ZPCc + '))' +
  128. '((?![$+<=>^`|\uff5c])' + re.tpl_host_port_fuzzy_strict + re.src_path + ')'
  129. re.tpl_link_no_ip_fuzzy =
  130. // Fuzzy link can't be prepended with .:/\- and non punctuation.
  131. // but can start with > (markdown blockquote)
  132. '(^|(?![.:/\\-_@])(?:[$+<=>^`|\uff5c]|' + re.src_ZPCc + '))' +
  133. '((?![$+<=>^`|\uff5c])' + re.tpl_host_port_no_ip_fuzzy_strict + re.src_path + ')'
  134. return re
  135. }