// tokenize.js (12 KB)

  1. "use strict";
  2. module.exports = tokenize;
  3. var delimRe = /[\s{}=;:[\],'"()<>]/g,
  4. stringDoubleRe = /(?:"([^"\\]*(?:\\.[^"\\]*)*)")/g,
  5. stringSingleRe = /(?:'([^'\\]*(?:\\.[^'\\]*)*)')/g;
  6. var setCommentRe = /^ *[*/]+ */,
  7. setCommentAltRe = /^\s*\*?\/*/,
  8. setCommentSplitRe = /\n/g,
  9. whitespaceRe = /\s/,
  10. unescapeRe = /\\(.?)/g;
  11. var unescapeMap = {
  12. "0": "\0",
  13. "r": "\r",
  14. "n": "\n",
  15. "t": "\t"
  16. };
  17. /**
  18. * Unescapes a string.
  19. * @param {string} str String to unescape
  20. * @returns {string} Unescaped string
  21. * @property {Object.<string,string>} map Special characters map
  22. * @memberof tokenize
  23. */
  24. function unescape(str) {
  25. return str.replace(unescapeRe, function($0, $1) {
  26. switch ($1) {
  27. case "\\":
  28. case "":
  29. return $1;
  30. default:
  31. return unescapeMap[$1] || "";
  32. }
  33. });
  34. }
  35. tokenize.unescape = unescape;
  36. /**
  37. * Gets the next token and advances.
  38. * @typedef TokenizerHandleNext
  39. * @type {function}
  40. * @returns {string|null} Next token or `null` on eof
  41. */
  42. /**
  43. * Peeks for the next token.
  44. * @typedef TokenizerHandlePeek
  45. * @type {function}
  46. * @returns {string|null} Next token or `null` on eof
  47. */
  48. /**
  49. * Pushes a token back to the stack.
  50. * @typedef TokenizerHandlePush
  51. * @type {function}
  52. * @param {string} token Token
  53. * @returns {undefined}
  54. */
  55. /**
  56. * Skips the next token.
  57. * @typedef TokenizerHandleSkip
  58. * @type {function}
  59. * @param {string} expected Expected token
  60. * @param {boolean} [optional=false] If optional
  61. * @returns {boolean} Whether the token matched
  62. * @throws {Error} If the token didn't match and is not optional
  63. */
  64. /**
  65. * Gets the comment on the previous line or, alternatively, the line comment on the specified line.
  66. * @typedef TokenizerHandleCmnt
  67. * @type {function}
  68. * @param {number} [line] Line number
  69. * @returns {string|null} Comment text or `null` if none
  70. */
  71. /**
  72. * Handle object returned from {@link tokenize}.
  73. * @interface ITokenizerHandle
  74. * @property {TokenizerHandleNext} next Gets the next token and advances (`null` on eof)
  75. * @property {TokenizerHandlePeek} peek Peeks for the next token (`null` on eof)
  76. * @property {TokenizerHandlePush} push Pushes a token back to the stack
  77. * @property {TokenizerHandleSkip} skip Skips a token, returns its presence and advances or, if non-optional and not present, throws
  78. * @property {TokenizerHandleCmnt} cmnt Gets the comment on the previous line or the line comment on the specified line, if any
  79. * @property {number} line Current line number
  80. */
  81. /**
  82. * Tokenizes the given .proto source and returns an object with useful utility functions.
  83. * @param {string} source Source contents
  84. * @param {boolean} alternateCommentMode Whether we should activate alternate comment parsing mode.
  85. * @returns {ITokenizerHandle} Tokenizer handle
  86. */
  87. function tokenize(source, alternateCommentMode) {
  88. /* eslint-disable callback-return */
  89. source = source.toString();
  90. var offset = 0,
  91. length = source.length,
  92. line = 1,
  93. commentType = null,
  94. commentText = null,
  95. commentLine = 0,
  96. commentLineEmpty = false;
  97. var stack = [];
  98. var stringDelim = null;
  99. /* istanbul ignore next */
  100. /**
  101. * Creates an error for illegal syntax.
  102. * @param {string} subject Subject
  103. * @returns {Error} Error created
  104. * @inner
  105. */
  106. function illegal(subject) {
  107. return Error("illegal " + subject + " (line " + line + ")");
  108. }
  109. /**
  110. * Reads a string till its end.
  111. * @returns {string} String read
  112. * @inner
  113. */
  114. function readString() {
  115. var re = stringDelim === "'" ? stringSingleRe : stringDoubleRe;
  116. re.lastIndex = offset - 1;
  117. var match = re.exec(source);
  118. if (!match)
  119. throw illegal("string");
  120. offset = re.lastIndex;
  121. push(stringDelim);
  122. stringDelim = null;
  123. return unescape(match[1]);
  124. }
  125. /**
  126. * Gets the character at `pos` within the source.
  127. * @param {number} pos Position
  128. * @returns {string} Character
  129. * @inner
  130. */
  131. function charAt(pos) {
  132. return source.charAt(pos);
  133. }
  134. /**
  135. * Sets the current comment text.
  136. * @param {number} start Start offset
  137. * @param {number} end End offset
  138. * @returns {undefined}
  139. * @inner
  140. */
  141. function setComment(start, end) {
  142. commentType = source.charAt(start++);
  143. commentLine = line;
  144. commentLineEmpty = false;
  145. var lookback;
  146. if (alternateCommentMode) {
  147. lookback = 2; // alternate comment parsing: "//" or "/*"
  148. } else {
  149. lookback = 3; // "///" or "/**"
  150. }
  151. var commentOffset = start - lookback,
  152. c;
  153. do {
  154. if (--commentOffset < 0 ||
  155. (c = source.charAt(commentOffset)) === "\n") {
  156. commentLineEmpty = true;
  157. break;
  158. }
  159. } while (c === " " || c === "\t");
  160. var lines = source
  161. .substring(start, end)
  162. .split(setCommentSplitRe);
  163. for (var i = 0; i < lines.length; ++i)
  164. lines[i] = lines[i]
  165. .replace(alternateCommentMode ? setCommentAltRe : setCommentRe, "")
  166. .trim();
  167. commentText = lines
  168. .join("\n")
  169. .trim();
  170. }
  171. function isDoubleSlashCommentLine(startOffset) {
  172. var endOffset = findEndOfLine(startOffset);
  173. // see if remaining line matches comment pattern
  174. var lineText = source.substring(startOffset, endOffset);
  175. // look for 1 or 2 slashes since startOffset would already point past
  176. // the first slash that started the comment.
  177. var isComment = /^\s*\/{1,2}/.test(lineText);
  178. return isComment;
  179. }
  180. function findEndOfLine(cursor) {
  181. // find end of cursor's line
  182. var endOffset = cursor;
  183. while (endOffset < length && charAt(endOffset) !== "\n") {
  184. endOffset++;
  185. }
  186. return endOffset;
  187. }
  188. /**
  189. * Obtains the next token.
  190. * @returns {string|null} Next token or `null` on eof
  191. * @inner
  192. */
  193. function next() {
  194. if (stack.length > 0)
  195. return stack.shift();
  196. if (stringDelim)
  197. return readString();
  198. var repeat,
  199. prev,
  200. curr,
  201. start,
  202. isDoc;
  203. do {
  204. if (offset === length)
  205. return null;
  206. repeat = false;
  207. while (whitespaceRe.test(curr = charAt(offset))) {
  208. if (curr === "\n")
  209. ++line;
  210. if (++offset === length)
  211. return null;
  212. }
  213. if (charAt(offset) === "/") {
  214. if (++offset === length) {
  215. throw illegal("comment");
  216. }
  217. if (charAt(offset) === "/") { // Line
  218. if (!alternateCommentMode) {
  219. // check for triple-slash comment
  220. isDoc = charAt(start = offset + 1) === "/";
  221. while (charAt(++offset) !== "\n") {
  222. if (offset === length) {
  223. return null;
  224. }
  225. }
  226. ++offset;
  227. if (isDoc) {
  228. setComment(start, offset - 1);
  229. }
  230. ++line;
  231. repeat = true;
  232. } else {
  233. // check for double-slash comments, consolidating consecutive lines
  234. start = offset;
  235. isDoc = false;
  236. if (isDoubleSlashCommentLine(offset)) {
  237. isDoc = true;
  238. do {
  239. offset = findEndOfLine(offset);
  240. if (offset === length) {
  241. break;
  242. }
  243. offset++;
  244. } while (isDoubleSlashCommentLine(offset));
  245. } else {
  246. offset = Math.min(length, findEndOfLine(offset) + 1);
  247. }
  248. if (isDoc) {
  249. setComment(start, offset);
  250. }
  251. line++;
  252. repeat = true;
  253. }
  254. } else if ((curr = charAt(offset)) === "*") { /* Block */
  255. // check for /** (regular comment mode) or /* (alternate comment mode)
  256. start = offset + 1;
  257. isDoc = alternateCommentMode || charAt(start) === "*";
  258. do {
  259. if (curr === "\n") {
  260. ++line;
  261. }
  262. if (++offset === length) {
  263. throw illegal("comment");
  264. }
  265. prev = curr;
  266. curr = charAt(offset);
  267. } while (prev !== "*" || curr !== "/");
  268. ++offset;
  269. if (isDoc) {
  270. setComment(start, offset - 2);
  271. }
  272. repeat = true;
  273. } else {
  274. return "/";
  275. }
  276. }
  277. } while (repeat);
  278. // offset !== length if we got here
  279. var end = offset;
  280. delimRe.lastIndex = 0;
  281. var delim = delimRe.test(charAt(end++));
  282. if (!delim)
  283. while (end < length && !delimRe.test(charAt(end)))
  284. ++end;
  285. var token = source.substring(offset, offset = end);
  286. if (token === "\"" || token === "'")
  287. stringDelim = token;
  288. return token;
  289. }
  290. /**
  291. * Pushes a token back to the stack.
  292. * @param {string} token Token
  293. * @returns {undefined}
  294. * @inner
  295. */
  296. function push(token) {
  297. stack.push(token);
  298. }
  299. /**
  300. * Peeks for the next token.
  301. * @returns {string|null} Token or `null` on eof
  302. * @inner
  303. */
  304. function peek() {
  305. if (!stack.length) {
  306. var token = next();
  307. if (token === null)
  308. return null;
  309. push(token);
  310. }
  311. return stack[0];
  312. }
  313. /**
  314. * Skips a token.
  315. * @param {string} expected Expected token
  316. * @param {boolean} [optional=false] Whether the token is optional
  317. * @returns {boolean} `true` when skipped, `false` if not
  318. * @throws {Error} When a required token is not present
  319. * @inner
  320. */
  321. function skip(expected, optional) {
  322. var actual = peek(),
  323. equals = actual === expected;
  324. if (equals) {
  325. next();
  326. return true;
  327. }
  328. if (!optional)
  329. throw illegal("token '" + actual + "', '" + expected + "' expected");
  330. return false;
  331. }
  332. /**
  333. * Gets a comment.
  334. * @param {number} [trailingLine] Line number if looking for a trailing comment
  335. * @returns {string|null} Comment text
  336. * @inner
  337. */
  338. function cmnt(trailingLine) {
  339. var ret = null;
  340. if (trailingLine === undefined) {
  341. if (commentLine === line - 1 && (alternateCommentMode || commentType === "*" || commentLineEmpty)) {
  342. ret = commentText;
  343. }
  344. } else {
  345. /* istanbul ignore else */
  346. if (commentLine < trailingLine) {
  347. peek();
  348. }
  349. if (commentLine === trailingLine && !commentLineEmpty && (alternateCommentMode || commentType === "/")) {
  350. ret = commentText;
  351. }
  352. }
  353. return ret;
  354. }
  355. return Object.defineProperty({
  356. next: next,
  357. peek: peek,
  358. push: push,
  359. skip: skip,
  360. cmnt: cmnt
  361. }, "line", {
  362. get: function() { return line; }
  363. });
  364. /* eslint-enable callback-return */
  365. }