Parser.ts 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405
  1. /*************************************************************
  2. *
  3. * Copyright (c) 2018-2022 The MathJax Consortium
  4. *
  5. * Licensed under the Apache License, Version 2.0 (the "License");
  6. * you may not use this file except in compliance with the License.
  7. * You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. /**
  18. * @fileoverview Implements a lightweight DOM adaptor
  19. *
  20. * @author dpvc@mathjax.org (Davide Cervone)
  21. */
  22. import {AttributeData} from '../../core/DOMAdaptor.js';
  23. import {MinDOMParser} from '../HTMLAdaptor.js';
  24. import * as Entities from '../../util/Entities.js';
  25. import {LiteDocument} from './Document.js';
  26. import {LiteElement} from './Element.js';
  27. import {LiteText, LiteComment} from './Text.js';
  28. import {LiteAdaptor} from '../liteAdaptor.js';
  29. /**
  30. * Patterns used in parsing serialized HTML
  31. */
  32. export namespace PATTERNS {
  33. export const TAGNAME = '[a-z][^\\s\\n>]*';
  34. export const ATTNAME = '[a-z][^\\s\\n>=]*';
  35. export const VALUE = `(?:'[^']*'|"[^"]*"|[^\\s\\n]+)`;
  36. export const VALUESPLIT = `(?:'([^']*)'|"([^"]*)"|([^\\s\\n]+))`;
  37. export const SPACE = '(?:\\s|\\n)+';
  38. export const OPTIONALSPACE = '(?:\\s|\\n)*';
  39. export const ATTRIBUTE = ATTNAME + '(?:' + OPTIONALSPACE + '=' + OPTIONALSPACE + VALUE + ')?';
  40. export const ATTRIBUTESPLIT = '(' + ATTNAME + ')(?:' + OPTIONALSPACE + '=' + OPTIONALSPACE + VALUESPLIT + ')?';
  41. export const TAG = '(<(?:' + TAGNAME + '(?:' + SPACE + ATTRIBUTE + ')*'
  42. + OPTIONALSPACE + '/?|/' + TAGNAME + '|!--[^]*?--|![^]*?)(?:>|$))';
  43. export const tag = new RegExp(TAG, 'i');
  44. export const attr = new RegExp(ATTRIBUTE, 'i');
  45. export const attrsplit = new RegExp(ATTRIBUTESPLIT, 'i');
  46. }
  47. /************************************************************/
  48. /**
  49. * Implements a lightweight DOMParser replacement
  50. * (Not perfect, but handles most well-formed HTML)
  51. */
  52. export class LiteParser implements MinDOMParser<LiteDocument> {
  53. /**
  54. * The list of self-closing tags
  55. */
  56. public static SELF_CLOSING: {[name: string]: boolean} = {
  57. area: true,
  58. base: true,
  59. br: true,
  60. col: true,
  61. command: true,
  62. embed: true,
  63. hr: true,
  64. img: true,
  65. input: true,
  66. keygen: true,
  67. link: true,
  68. menuitem: true,
  69. meta: true,
  70. param: true,
  71. source: true,
  72. track: true,
  73. wbr: true
  74. };
  75. /**
  76. * The list of tags chose content is not parsed (PCDATA)
  77. */
  78. public static PCDATA: {[name: string]: boolean} = {
  79. option: true,
  80. textarea: true,
  81. fieldset: true,
  82. title: true,
  83. style: true,
  84. script: true
  85. };
  86. /**
  87. * The list of attributes that don't get entity translation
  88. */
  89. public static CDATA_ATTR: {[name: string]: boolean} = {
  90. style: true,
  91. datafld: true,
  92. datasrc: true,
  93. href: true,
  94. src: true,
  95. longdesc: true,
  96. usemap: true,
  97. cite: true,
  98. datetime: true,
  99. action: true,
  100. axis: true,
  101. profile: true,
  102. content: true,
  103. scheme: true
  104. };
  105. /**
  106. * @override
  107. */
  108. public parseFromString(text: string, _format: string = 'text/html', adaptor: LiteAdaptor = null) {
  109. const root = adaptor.createDocument();
  110. let node = adaptor.body(root);
  111. //
  112. // Split the HTML into an array of text, tag, text, tag, ...
  113. // Then loop through them and add text nodes and process tags.
  114. //
  115. let parts = text.replace(/<\?.*?\?>/g, '').split(PATTERNS.tag);
  116. while (parts.length) {
  117. const text = parts.shift();
  118. const tag = parts.shift();
  119. if (text) {
  120. this.addText(adaptor, node, text);
  121. }
  122. if (tag && tag.charAt(tag.length - 1) === '>') {
  123. if (tag.charAt(1) === '!') {
  124. this.addComment(adaptor, node, tag);
  125. } else if (tag.charAt(1) === '/') {
  126. node = this.closeTag(adaptor, node, tag);
  127. } else {
  128. node = this.openTag(adaptor, node, tag, parts);
  129. }
  130. }
  131. }
  132. this.checkDocument(adaptor, root);
  133. return root;
  134. }
  135. /**
  136. * @param {LiteAdaptor} adaptor The adaptor for managing nodes
  137. * @param {LiteElement} node The node to add a text element to
  138. * @param {string} text The text for the text node
  139. * @return {LiteText} The text element
  140. */
  141. protected addText(adaptor: LiteAdaptor, node: LiteElement, text: string): LiteText {
  142. text = Entities.translate(text);
  143. return adaptor.append(node, adaptor.text(text)) as LiteText;
  144. }
  145. /**
  146. * @param {LiteAdaptor} adaptor The adaptor for managing nodes
  147. * @param {LiteElement} node The node to add a comment to
  148. * @param {string} comment The text for the comment node
  149. * @return {LiteComment} The comment element
  150. */
  151. protected addComment(adaptor: LiteAdaptor, node: LiteElement, comment: string): LiteComment {
  152. return adaptor.append(node, new LiteComment(comment)) as LiteComment;
  153. }
  154. /**
  155. * @param {LiteAdaptor} adaptor The adaptor for managing nodes
  156. * @param {LiteElement} node The node to close
  157. * @param {string} tag The close tag being processed
  158. * @return {LiteElement} The first unclosed parent node
  159. */
  160. protected closeTag(adaptor: LiteAdaptor, node: LiteElement, tag: string): LiteElement {
  161. const kind = tag.slice(2, tag.length - 1).toLowerCase();
  162. while (adaptor.parent(node) && adaptor.kind(node) !== kind) {
  163. node = adaptor.parent(node);
  164. }
  165. return adaptor.parent(node);
  166. }
  167. /**
  168. * @param {LiteAdaptor} adaptor The adaptor for managing nodes
  169. * @param {LiteElement} node The parent node for the tag
  170. * @param {string} tag The tag being processed
  171. * @param {string[]} parts The rest of the text/tag array
  172. * @return {LiteElement} The node to which the next tag will be added
  173. */
  174. protected openTag(adaptor: LiteAdaptor, node: LiteElement, tag: string, parts: string[]): LiteElement {
  175. const PCDATA = (this.constructor as typeof LiteParser).PCDATA;
  176. const SELF_CLOSING = (this.constructor as typeof LiteParser).SELF_CLOSING;
  177. //
  178. // Get the child to be added to the node
  179. //
  180. const kind = tag.match(/<(.*?)[\s\n>\/]/)[1].toLowerCase();
  181. const child = adaptor.node(kind) as LiteElement;
  182. //
  183. // Split out the tag attributes as an array of space, name, value1, value3, value3,
  184. // where value1, value2, and value3 are the value of the node (only one is defined)
  185. // that come from matching quoted strings with ' (value1), " (value2) or no quotes (value3).
  186. //
  187. const attributes = tag.replace(/^<.*?[\s\n>]/, '').split(PATTERNS.attrsplit);
  188. //
  189. // If the tag was complete (it ends with > or has no attributes)
  190. //
  191. if (attributes.pop().match(/>$/) || attributes.length < 5) {
  192. this.addAttributes(adaptor, child, attributes);
  193. adaptor.append(node, child);
  194. //
  195. // For non-self-closing tags,
  196. // For tags whose contents is PCDATA (like <script>), collect the
  197. // content up until the end tag, and continue adding nee tags
  198. // to the current parent node.
  199. // Otherwise, the child tag becames the parent node to which
  200. // new tags are added
  201. //
  202. if (!SELF_CLOSING[kind] && !tag.match(/\/>$/)) {
  203. if (PCDATA[kind]) {
  204. this.handlePCDATA(adaptor, child, kind, parts);
  205. } else {
  206. node = child;
  207. }
  208. }
  209. }
  210. return node;
  211. }
  212. /**
  213. * @param {LiteAdaptor} adaptor The adaptor for managing nodes
  214. * @param {LiteElement} node The node getting the attributes
  215. * @param {string[]} attributes The array of space, name, value1, value2, value3
  216. * as described above.
  217. */
  218. protected addAttributes(adaptor: LiteAdaptor, node: LiteElement, attributes: string[]) {
  219. const CDATA_ATTR = (this.constructor as typeof LiteParser).CDATA_ATTR;
  220. while (attributes.length) {
  221. let [ , name, v1, v2, v3] = attributes.splice(0, 5);
  222. let value = v1 || v2 || v3 || '';
  223. if (!CDATA_ATTR[name]) {
  224. value = Entities.translate(value);
  225. }
  226. adaptor.setAttribute(node, name, value);
  227. }
  228. }
  229. /**
  230. * @param {LiteAdaptor} adaptor The adaptor for managing nodes
  231. * @param {LiteElement} node The node whose PCDATA content is being collected
  232. * @param {string} kind The tag name being handled
  233. * @param {string[]} parts The array of text/tag data for the document
  234. */
  235. protected handlePCDATA(adaptor: LiteAdaptor, node: LiteElement, kind: string, parts: string[]) {
  236. const pcdata = [] as string[];
  237. const etag = '</' + kind + '>';
  238. let ptag = '';
  239. //
  240. // Look through the parts until the end tag is found
  241. // Add the unmatched tag and the following text
  242. // and try the next tag until we find the end tag.
  243. //
  244. while (parts.length && ptag !== etag) {
  245. pcdata.push(ptag);
  246. pcdata.push(parts.shift());
  247. ptag = parts.shift();
  248. }
  249. //
  250. // Add the collected contents as a text node
  251. //
  252. adaptor.append(node, adaptor.text(pcdata.join('')));
  253. }
  254. /**
  255. * Check the contents of the parsed document and move html, head, and body
  256. * tags into the document structure. That way, you can parse fragments or
  257. * full documents and still get a valid document.
  258. *
  259. * @param {LiteAdaptor} adaptor The adaptor for managing nodes
  260. * @param {LiteDocument} root The document being checked
  261. */
  262. protected checkDocument(adaptor: LiteAdaptor, root: LiteDocument) {
  263. let node = this.getOnlyChild(adaptor, adaptor.body(root));
  264. if (!node) return;
  265. for (const child of adaptor.childNodes(adaptor.body(root))) {
  266. if (child === node) {
  267. break;
  268. }
  269. if (child instanceof LiteComment && child.value.match(/^<!DOCTYPE/)) {
  270. root.type = child.value;
  271. }
  272. }
  273. switch (adaptor.kind(node)) {
  274. case 'html':
  275. //
  276. // Look through the children for the head and body
  277. //
  278. for (const child of node.children) {
  279. switch (adaptor.kind(child)) {
  280. case 'head':
  281. root.head = child as LiteElement;
  282. break;
  283. case 'body':
  284. root.body = child as LiteElement;
  285. break;
  286. }
  287. }
  288. //
  289. // Make sure the elements are linked in properly
  290. //
  291. root.root = node;
  292. adaptor.remove(node);
  293. if (adaptor.parent(root.body) !== node) {
  294. adaptor.append(node, root.body);
  295. }
  296. if (adaptor.parent(root.head) !== node) {
  297. adaptor.insert(root.head, root.body);
  298. }
  299. break;
  300. case 'head':
  301. root.head = adaptor.replace(node, root.head) as LiteElement;
  302. break;
  303. case 'body':
  304. root.body = adaptor.replace(node, root.body) as LiteElement;
  305. break;
  306. }
  307. }
  308. /**
  309. * Checks if the body has only one element child (as opposed to comments or text nodes)
  310. * and returns that sole element (or null if none or more than one)
  311. *
  312. * @param {LiteAdaptor} adaptor The adaptor for managing nodes
  313. * @param {LiteElement} body The body element being checked
  314. * @return {LiteElement} The sole LiteElement child of the body, or null if none or more than one
  315. */
  316. protected getOnlyChild(adaptor: LiteAdaptor, body: LiteElement): LiteElement {
  317. let node: LiteElement = null;
  318. for (const child of adaptor.childNodes(body)) {
  319. if (child instanceof LiteElement) {
  320. if (node) return null;
  321. node = child;
  322. }
  323. }
  324. return node;
  325. }
  326. /**
  327. * @param {LiteAdaptor} adaptor The adaptor for managing nodes
  328. * @param {LiteElement} node The node to serialize
  329. * @param {boolean} xml True when producing XML, false for HTML
  330. * @return {string} The serialized element (like outerHTML)
  331. */
  332. public serialize(adaptor: LiteAdaptor, node: LiteElement, xml: boolean = false): string {
  333. const SELF_CLOSING = (this.constructor as typeof LiteParser).SELF_CLOSING;
  334. const CDATA = (this.constructor as typeof LiteParser).CDATA_ATTR;
  335. const tag = adaptor.kind(node);
  336. const attributes = adaptor.allAttributes(node).map(
  337. (x: AttributeData) => x.name + '="' + (CDATA[x.name] ? x.value : this.protectAttribute(x.value)) + '"'
  338. ).join(' ');
  339. const content = this.serializeInner(adaptor, node, xml);
  340. const html =
  341. '<' + tag + (attributes ? ' ' + attributes : '')
  342. + ((!xml || content) && !SELF_CLOSING[tag] ? `>${content}</${tag}>` : xml ? '/>' : '>');
  343. return html;
  344. }
  345. /**
  346. * @param {LiteAdaptor} adaptor The adaptor for managing nodes
  347. * @param {LiteElement} node The node whose innerHTML is needed
  348. * @return {string} The serialized element (like innerHTML)
  349. */
  350. public serializeInner(adaptor: LiteAdaptor, node: LiteElement, xml: boolean = false): string {
  351. const PCDATA = (this.constructor as typeof LiteParser).PCDATA;
  352. if (PCDATA.hasOwnProperty(node.kind)) {
  353. return adaptor.childNodes(node).map(x => adaptor.value(x)).join('');
  354. }
  355. return adaptor.childNodes(node).map(x => {
  356. const kind = adaptor.kind(x);
  357. return (kind === '#text' ? this.protectHTML(adaptor.value(x)) :
  358. kind === '#comment' ? (x as LiteComment).value :
  359. this.serialize(adaptor, x as LiteElement, xml));
  360. }).join('');
  361. }
  362. /**
  363. * @param {string} text The attribute value to be HTML escaped
  364. * @return {string} The string with " replaced by entities
  365. */
  366. public protectAttribute(text: string): string {
  367. if (typeof text !== 'string') {
  368. text = String(text);
  369. }
  370. return text.replace(/"/g, '&quot;');
  371. }
  372. /**
  373. * @param {string} text The text to be HTML escaped
  374. * @return {string} The string with &, <, and > replaced by entities
  375. */
  376. public protectHTML(text: string): string {
  377. return text.replace(/&/g, '&amp;')
  378. .replace(/</g, '&lt;')
  379. .replace(/>/g, '&gt;');
  380. }
  381. }