regex-tokeniser.js

var Token = require("./Token");
var StringSource = require("./StringSource");

exports.RegexTokeniser = RegexTokeniser;

function RegexTokeniser(rules) {
    // Recompile each rule's regex with the global flag so lastIndex can be
    // used to anchor matching at a specific position in the input.
    rules = rules.map(function(rule) {
        return {
            name: rule.name,
            regex: new RegExp(rule.regex.source, "g")
        };
    });

    // Tokenise the whole input, tagging each token with its source range,
    // and terminate the stream with an "end" token.
    function tokenise(input, description) {
        var source = new StringSource(input, description);
        var index = 0;
        var tokens = [];

        while (index < input.length) {
            var result = readNextToken(input, index, source);
            index = result.endIndex;
            tokens.push(result.token);
        }

        tokens.push(endToken(input, source));
        return tokens;
    }

    // Try each rule in order at startIndex; the first rule whose regex
    // matches exactly at that position and consumes at least one character
    // wins. The token value is the regex's first capture group.
    function readNextToken(string, startIndex, source) {
        for (var i = 0; i < rules.length; i++) {
            var regex = rules[i].regex;
            regex.lastIndex = startIndex;
            var result = regex.exec(string);

            if (result) {
                var endIndex = startIndex + result[0].length;
                if (result.index === startIndex && endIndex > startIndex) {
                    var value = result[1];
                    var token = new Token(
                        rules[i].name,
                        value,
                        source.range(startIndex, endIndex)
                    );
                    return {token: token, endIndex: endIndex};
                }
            }
        }
        // No rule matched: emit the offending character as an
        // "unrecognisedCharacter" token and advance by one character.
        var endIndex = startIndex + 1;
        var token = new Token(
            "unrecognisedCharacter",
            string.substring(startIndex, endIndex),
            source.range(startIndex, endIndex)
        );
        return {token: token, endIndex: endIndex};
    }

    // Zero-width token marking the end of the input.
    function endToken(input, source) {
        return new Token(
            "end",
            null,
            source.range(input.length, input.length)
        );
    }

    return {
        tokenise: tokenise
    };
}
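
// ---------------------------------------------------------------------------
// Usage sketch (not part of the original file; the rule set below is
// hypothetical). Note that each rule's regex must wrap the token text in a
// capture group, since readNextToken takes the token value from result[1].
//
//     var RegexTokeniser = require("./regex-tokeniser").RegexTokeniser;
//
//     var tokeniser = RegexTokeniser([
//         {name: "whitespace", regex: /(\s+)/},
//         {name: "word", regex: /([A-Za-z]+)/}
//     ]);
//
//     // "hello world" => word, whitespace, word tokens, then the final
//     // "end" token; any unmatched character would come through as an
//     // "unrecognisedCharacter" token.
//     var tokens = tokeniser.tokenise("hello world", "example");
// ---------------------------------------------------------------------------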