123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303 |
- /*************************************************************
- *
- * Copyright (c) 2017-2022 The MathJax Consortium
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- /**
- * @fileoverview Implements the HTMLDomStrings class
- *
- * @author dpvc@mathjax.org (Davide Cervone)
- */
- import {userOptions, defaultOptions, OptionList, makeArray} from '../../util/Options.js';
- import {DOMAdaptor} from '../../core/DOMAdaptor.js';
- /**
- * List of [node, length] pairs for the consecutive nodes that make up one string.
- * Each entry records a node (a text node, or an element whose tag is replaced
- * by text) together with the number of characters it contributes to that string.
- *
- * @template N The HTMLElement node class
- * @template T The Text node class
- */
- export type HTMLNodeList<N, T> = [N | T, number][];
- /*****************************************************************/
/**
 *  The HTMLDomStrings class
 *
 *  A class for extracting the text from DOM trees: it walks a subtree and
 *  collects runs of adjacent text (plus the replacement text of selected tags)
 *  into strings, recording which nodes contributed to each string.
 *
 * @template N  The HTMLElement node class
 * @template T  The Text node class
 * @template D  The Document class
 */
export class HTMLDomStrings<N, T, D> {

  /**
   * The default options for string processing
   */
  public static OPTIONS: OptionList = {
    skipHtmlTags:    ['script', 'noscript', 'style', 'textarea', 'pre', 'code', 'annotation', 'annotation-xml'],
                                        //  The names of the tags whose contents will not be
                                        //  scanned for math delimiters

    includeHtmlTags: {br: '\n', wbr: '', '#comment': ''},
                                        //  tags to be included in the text (and what
                                        //  text to replace them with)

    ignoreHtmlClass: 'mathjax_ignore',  //  the class name of elements whose contents should
                                        //  NOT be processed by tex2jax.  Note that this
                                        //  is a regular expression, so be sure to quote any
                                        //  regexp special characters

    processHtmlClass: 'mathjax_process' //  the class name of elements whose contents SHOULD
                                        //  be processed when they appear inside ones that
                                        //  are ignored.  Note that this is a regular expression,
                                        //  so be sure to quote any regexp special characters
  };

  /**
   * The options for this instance
   */
  protected options: OptionList;

  /**
   * The array of strings found in the DOM
   */
  protected strings: string[];

  /**
   * The string currently being constructed
   */
  protected string: string;

  /**
   * The list of nodes and lengths for the string being constructed
   */
  protected snodes: HTMLNodeList<N, T>;

  /**
   * The list of node lists corresponding to the strings in this.strings
   */
  protected nodes: HTMLNodeList<N, T>[];

  /**
   * The container nodes that are currently being traversed, and whether their
   * contents are being ignored or not
   */
  protected stack: [N | T, boolean][];

  /**
   * Regular expression for the tags whose contents are skipped
   * (i.e., not scanned for math)
   */
  protected skipHtmlTags: RegExp;

  /**
   * Regular expression for which classes should stop processing of math
   */
  protected ignoreHtmlClass: RegExp;

  /**
   * Regular expression for which classes should start processing of math
   */
  protected processHtmlClass: RegExp;

  /**
   * The DOM Adaptor for managing HTML elements
   * (not set by the constructor; it must be assigned before find() is called)
   */
  public adaptor: DOMAdaptor<N, T, D>;

  /**
   * @param {OptionList} options  The user-supplied options
   * @constructor
   */
  constructor(options: OptionList = null) {
    // Use this.constructor so that subclasses can supply their own static OPTIONS
    const CLASS = this.constructor as typeof HTMLDomStrings;
    this.options = userOptions(defaultOptions({}, CLASS.OPTIONS), options);
    this.init();
    this.getPatterns();
  }

  /**
   * Set the initial values of the main properties
   */
  protected init() {
    this.strings = [];
    this.string = '';
    this.snodes = [];
    this.nodes = [];
    this.stack = [];
  }

  /**
   * Create the search patterns for skipHtmlTags, ignoreHtmlClass, and processHtmlClass
   *
   * Each option may be a single value or an array of values; the entries are
   * combined as alternatives of one regular expression.
   */
  protected getPatterns() {
    const skip = makeArray(this.options['skipHtmlTags']);
    const ignore = makeArray(this.options['ignoreHtmlClass']);
    const process = makeArray(this.options['processHtmlClass']);
    this.skipHtmlTags = new RegExp('^(?:' + skip.join('|') + ')$', 'i');
    this.ignoreHtmlClass = new RegExp('(?:^| )(?:' + ignore.join('|') + ')(?: |$)');
    // Join explicitly with '|': concatenating the array itself would stringify it
    // with commas, so a list of several class names would never match any of them.
    this.processHtmlClass = new RegExp('(?:^| )(?:' + process.join('|') + ')(?: |$)');
  }

  /**
   * Add a string to the string array and record its node list
   * (strings containing only whitespace are discarded), then
   * start a new empty string and node list.
   */
  protected pushString() {
    if (this.string.match(/\S/)) {
      this.strings.push(this.string);
      this.nodes.push(this.snodes);
    }
    this.string = '';
    this.snodes = [];
  }

  /**
   * Add more text to the current string, and record the
   *   node and the length of the text it contributes.
   *
   * @param {N|T} node        The node to be pushed
   * @param {string} text     The text to be added (it may not be the actual text
   *                           of the node, if it is one of the nodes that gets
   *                           translated to text, like <br> to a newline).
   */
  protected extendString(node: N | T, text: string) {
    this.snodes.push([node, text.length]);
    this.string += text;
  }

  /**
   * Handle a #text node (add its text to the current string)
   *
   * @param {T} node          The Text node to process
   * @param {boolean} ignore  Whether we are currently ignoring content
   * @return {N | T}          The next element to process
   */
  protected handleText(node: T, ignore: boolean): N | T {
    if (!ignore) {
      this.extendString(node, this.adaptor.value(node));
    }
    return this.adaptor.next(node);
  }

  /**
   * Handle a BR, WBR, or #comment element (or others in the includeHtmlTags object).
   *
   * @param {N} node          The node to process
   * @param {boolean} ignore  Whether we are currently ignoring content
   * @return {N | T}          The next element to process
   */
  protected handleTag(node: N, ignore: boolean): N | T {
    if (!ignore) {
      // The tag itself is not text, so add its configured replacement text
      const text = this.options['includeHtmlTags'][this.adaptor.kind(node)];
      this.extendString(node, text);
    }
    return this.adaptor.next(node);
  }

  /**
   * Handle an arbitrary DOM node:
   *   Check the class to see if it matches the processHtmlClass regex
   *   If the node has a child and is not marked as created by MathJax (data-MJX)
   *       and either it is marked as restarting processing or is not a tag to be skipped, then
   *     Save the next node (if there is one) and whether we are currently ignoring content
   *     Move to the first child node
   *     Update whether we are ignoring content
   *   Otherwise
   *     Move on to the next sibling
   *   Return the next node to process and the ignore state
   *
   * @param {N} node               The node to process
   * @param {boolean} ignore       Whether we are currently ignoring content
   * @return {[N|T, boolean]}      The next element to process and whether to ignore its content
   */
  protected handleContainer(node: N, ignore: boolean): [N | T, boolean] {
    // A container boundary always ends the string being collected
    this.pushString();
    const cname = this.adaptor.getAttribute(node, 'class') || '';
    const tname = this.adaptor.kind(node) || '';
    // Use test() so the flags are real booleans rather than match arrays
    const process = this.processHtmlClass.test(cname);
    let next: N | T;
    if (this.adaptor.firstChild(node) && !this.adaptor.getAttribute(node, 'data-MJX') &&
        (process || !this.skipHtmlTags.test(tname))) {
      const sibling = this.adaptor.next(node);
      if (sibling) {
        // Remember where to resume (and with which ignore state) after the children
        this.stack.push([sibling, ignore]);
      }
      next = this.adaptor.firstChild(node);
      // An explicit "process" class overrides both an inherited and a local "ignore"
      ignore = (ignore || this.ignoreHtmlClass.test(cname)) && !process;
    } else {
      next = this.adaptor.next(node);
    }
    return [next, ignore];
  }

  /**
   * Handle a node for which the adaptor reports no kind:
   *   end the current string and move on to the next sibling
   *
   * @param {N} node           The node to process
   * @param {boolean} _ignore  Whether we are currently ignoring content (unused)
   * @return {N|T}             The next element to process
   */
  protected handleOther(node: N, _ignore: boolean): N | T {
    this.pushString();
    return this.adaptor.next(node);
  }

  /**
   * Find the strings for a given DOM element:
   *   Initialize the state
   *   Get the element where we stop processing
   *   While we still have a node, and it is not the one where we are to stop:
   *     If it is a text node, handle it and get the next node
   *     Otherwise, if it is in the includeHtmlTags list, handle it and get the next node
   *     Otherwise, handle it as a container and get the next node and ignore status
   *     If there is no next node, and there are more nodes on the stack:
   *       Save the current string, and pop the node and ignore status from the stack
   *   Push the final string
   *   Get the string array and array of associated DOM nodes
   *   Clear the internal values (so the memory can be freed)
   *   Return the strings and node lists
   *
   * @param {N|T} node                       The node to search
   * @return {[string[], HTMLNodeList[]]}    The array of strings and their associated lists of nodes
   */
  public find(node: N | T): [string[], HTMLNodeList<N, T>[]] {
    this.init();
    const stop = this.adaptor.next(node);
    let ignore = false;
    const include = this.options['includeHtmlTags'];
    while (node && node !== stop) {
      const kind = this.adaptor.kind(node);
      if (kind === '#text') {
        node = this.handleText(node as T, ignore);
      } else if (Object.prototype.hasOwnProperty.call(include, kind)) {
        // Called via the prototype so the lookup does not depend on the
        // (user-supplied) includeHtmlTags object providing hasOwnProperty itself
        node = this.handleTag(node as N, ignore);
      } else if (kind) {
        [node, ignore] = this.handleContainer(node as N, ignore);
      } else {
        node = this.handleOther(node as N, ignore);
      }
      if (!node && this.stack.length) {
        // Finished a container's children: close the string and resume at the saved sibling
        this.pushString();
        [node, ignore] = this.stack.pop();
      }
    }
    this.pushString();
    const result = [this.strings, this.nodes] as [string[], HTMLNodeList<N, T>[]];
    this.init();  // free up memory
    return result;
  }

}
|