HTMLDomStrings.ts 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
  1. /*************************************************************
  2. *
  3. * Copyright (c) 2017-2022 The MathJax Consortium
  4. *
  5. * Licensed under the Apache License, Version 2.0 (the "License");
  6. * you may not use this file except in compliance with the License.
  7. * You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. /**
  18. * @fileoverview Implements the HTMLDomStrings class
  19. *
  20. * @author dpvc@mathjax.org (Davide Cervone)
  21. */
  22. import {userOptions, defaultOptions, OptionList, makeArray} from '../../util/Options.js';
  23. import {DOMAdaptor} from '../../core/DOMAdaptor.js';
  24. /**
  25. * List of consecutive text nodes and their text lengths
  26. *
  27. * @template N The HTMLElement node class
  28. * @template T The Text node class
  29. */
  30. export type HTMLNodeList<N, T> = [N | T, number][];
  31. /*****************************************************************/
  32. /**
  33. * The HTMLDocument class (extends AbstractMathDocument)
  34. *
  35. * A class for extracting the text from DOM trees
  36. *
  37. * @template N The HTMLElement node class
  38. * @template T The Text node class
  39. * @template D The Document class
  40. */
  41. export class HTMLDomStrings<N, T, D> {
  42. /**
  43. * The default options for string processing
  44. */
  45. public static OPTIONS: OptionList = {
  46. skipHtmlTags: ['script', 'noscript', 'style', 'textarea', 'pre', 'code', 'annotation', 'annotation-xml'],
  47. // The names of the tags whose contents will not be
  48. // scanned for math delimiters
  49. includeHtmlTags: {br: '\n', wbr: '', '#comment': ''},
  50. // tags to be included in the text (and what
  51. // text to replace them with)
  52. ignoreHtmlClass: 'mathjax_ignore', // the class name of elements whose contents should
  53. // NOT be processed by tex2jax. Note that this
  54. // is a regular expression, so be sure to quote any
  55. // regexp special characters
  56. processHtmlClass: 'mathjax_process' // the class name of elements whose contents SHOULD
  57. // be processed when they appear inside ones that
  58. // are ignored. Note that this is a regular expression,
  59. // so be sure to quote any regexp special characters
  60. };
  61. /**
  62. * The options for this instance
  63. */
  64. protected options: OptionList;
  65. /**
  66. * The array of strings found in the DOM
  67. */
  68. protected strings: string[];
  69. /**
  70. * The string currently being constructed
  71. */
  72. protected string: string;
  73. /**
  74. * The list of nodes and lengths for the string being constructed
  75. */
  76. protected snodes: HTMLNodeList<N, T>;
  77. /**
  78. * The list of node lists corresponding to the strings in this.strings
  79. */
  80. protected nodes: HTMLNodeList<N, T>[];
  81. /**
  82. * The container nodes that are currently being traversed, and whether their
  83. * contents are being ignored or not
  84. */
  85. protected stack: [N | T, boolean][];
  86. /**
  87. * Regular expression for the tags to be skipped
  88. * processing of math
  89. */
  90. protected skipHtmlTags: RegExp;
  91. /**
  92. * Regular expression for which classes should stop processing of math
  93. */
  94. protected ignoreHtmlClass: RegExp;
  95. /**
  96. * Regular expression for which classes should start processing of math
  97. */
  98. protected processHtmlClass: RegExp;
  99. /**
  100. * The DOM Adaptor to managing HTML elements
  101. */
  102. public adaptor: DOMAdaptor<N, T, D>;
  103. /**
  104. * @param {OptionList} options The user-supplied options
  105. * @constructor
  106. */
  107. constructor(options: OptionList = null) {
  108. let CLASS = this.constructor as typeof HTMLDomStrings;
  109. this.options = userOptions(defaultOptions({}, CLASS.OPTIONS), options);
  110. this.init();
  111. this.getPatterns();
  112. }
  113. /**
  114. * Set the initial values of the main properties
  115. */
  116. protected init() {
  117. this.strings = [];
  118. this.string = '';
  119. this.snodes = [];
  120. this.nodes = [];
  121. this.stack = [];
  122. }
  123. /**
  124. * Create the search patterns for skipHtmlTags, ignoreHtmlClass, and processHtmlClass
  125. */
  126. protected getPatterns() {
  127. let skip = makeArray(this.options['skipHtmlTags']);
  128. let ignore = makeArray(this.options['ignoreHtmlClass']);
  129. let process = makeArray(this.options['processHtmlClass']);
  130. this.skipHtmlTags = new RegExp('^(?:' + skip.join('|') + ')$', 'i');
  131. this.ignoreHtmlClass = new RegExp('(?:^| )(?:' + ignore.join('|') + ')(?: |$)');
  132. this.processHtmlClass = new RegExp('(?:^| )(?:' + process + ')(?: |$)');
  133. }
  134. /**
  135. * Add a string to the string array and record its node list
  136. */
  137. protected pushString() {
  138. if (this.string.match(/\S/)) {
  139. this.strings.push(this.string);
  140. this.nodes.push(this.snodes);
  141. }
  142. this.string = '';
  143. this.snodes = [];
  144. }
  145. /**
  146. * Add more text to the current string, and record the
  147. * node and its position in the string.
  148. *
  149. * @param {N|T} node The node to be pushed
  150. * @param {string} text The text to be added (it may not be the actual text
  151. * of the node, if it is one of the nodes that gets
  152. * translated to text, like <br> to a newline).
  153. */
  154. protected extendString(node: N | T, text: string) {
  155. this.snodes.push([node, text.length]);
  156. this.string += text;
  157. }
  158. /**
  159. * Handle a #text node (add its text to the current string)
  160. *
  161. * @param {T} node The Text node to process
  162. * @param {boolean} ignore Whether we are currently ignoring content
  163. * @return {N | T} The next element to process
  164. */
  165. protected handleText(node: T, ignore: boolean): N | T {
  166. if (!ignore) {
  167. this.extendString(node, this.adaptor.value(node));
  168. }
  169. return this.adaptor.next(node);
  170. }
  171. /**
  172. * Handle a BR, WBR, or #comment element (or others in the includeHtmlTags object).
  173. *
  174. * @param {N} node The node to process
  175. * @param {boolean} ignore Whether we are currently ignoring content
  176. * @return {N | T} The next element to process
  177. */
  178. protected handleTag(node: N, ignore: boolean): N | T {
  179. if (!ignore) {
  180. let text = this.options['includeHtmlTags'][this.adaptor.kind(node)];
  181. this.extendString(node, text);
  182. }
  183. return this.adaptor.next(node);
  184. }
  185. /**
  186. * Handle an arbitrary DOM node:
  187. * Check the class to see if it matches the processHtmlClass regex
  188. * If the node has a child and is not marked as created by MathJax (data-MJX)
  189. * and either it is marked as restarting processing or is not a tag to be skipped, then
  190. * Save the next node (if there is one) and whether we are currently ignoring content
  191. * Move to the first child node
  192. * Update whether we are ignoring content
  193. * Otherwise
  194. * Move on to the next sibling
  195. * Return the next node to process and the ignore state
  196. *
  197. * @param {N} node The node to process
  198. * @param {boolean} ignore Whether we are currently ignoring content
  199. * @return {[N|T, boolean]} The next element to process and whether to ignore its content
  200. */
  201. protected handleContainer(node: N, ignore: boolean): [N | T, boolean] {
  202. this.pushString();
  203. const cname = this.adaptor.getAttribute(node, 'class') || '';
  204. const tname = this.adaptor.kind(node) || '';
  205. const process = this.processHtmlClass.exec(cname);
  206. let next = node as N | T;
  207. if (this.adaptor.firstChild(node) && !this.adaptor.getAttribute(node, 'data-MJX') &&
  208. (process || !this.skipHtmlTags.exec(tname))) {
  209. if (this.adaptor.next(node)) {
  210. this.stack.push([this.adaptor.next(node), ignore]);
  211. }
  212. next = this.adaptor.firstChild(node);
  213. ignore = (ignore || this.ignoreHtmlClass.exec(cname)) && !process;
  214. } else {
  215. next = this.adaptor.next(node);
  216. }
  217. return [next, ignore];
  218. }
  219. /**
  220. * Handle an unknown node type (nodeType other than 1, 3, 8)
  221. *
  222. * @param {N} node The node to process
  223. * @param {boolean} ignore Whether we are currently ignoring content
  224. * @return {N|T} The next element to process
  225. */
  226. protected handleOther(node: N, _ignore: boolean): N | T {
  227. this.pushString();
  228. return this.adaptor.next(node);
  229. }
  230. /**
  231. * Find the strings for a given DOM element:
  232. * Initialize the state
  233. * Get the element where we stop processing
  234. * While we still have a node, and it is not the one where we are to stop:
  235. * If it is a text node, handle it and get the next node
  236. * Otherwise, if it is in the includeHtmlTags list, handle it and get the next node
  237. * Otherwise, handle it as a container and get the next node and ignore status
  238. * If there is no next node, and there are more nodes on the stack:
  239. * Save the current string, and pop the node and ignore status from the stack
  240. * Push the final string
  241. * Get the string array and array of associated DOM nodes
  242. * Clear the internal values (so the memory can be freed)
  243. * Return the strings and node lists
  244. *
  245. * @param {N} node The node to search
  246. * @return {[string[], HTMLNodeList[]]} The array of strings and their associated lists of nodes
  247. */
  248. public find(node: N | T): [string[], HTMLNodeList<N, T>[]] {
  249. this.init();
  250. let stop = this.adaptor.next(node);
  251. let ignore = false;
  252. let include = this.options['includeHtmlTags'];
  253. while (node && node !== stop) {
  254. const kind = this.adaptor.kind(node);
  255. if (kind === '#text') {
  256. node = this.handleText(node as T, ignore);
  257. } else if (include.hasOwnProperty(kind)) {
  258. node = this.handleTag(node as N, ignore);
  259. } else if (kind) {
  260. [node, ignore] = this.handleContainer(node as N, ignore);
  261. } else {
  262. node = this.handleOther(node as N, ignore);
  263. }
  264. if (!node && this.stack.length) {
  265. this.pushString();
  266. [node, ignore] = this.stack.pop();
  267. }
  268. }
  269. this.pushString();
  270. let result = [this.strings, this.nodes] as [string[], HTMLNodeList<N, T>[]];
  271. this.init(); // free up memory
  272. return result;
  273. }
  274. }