query_lexer.js

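// lunr.QueryLexer splits a raw query string into typed lexemes
// (FIELD, TERM, EDIT_DISTANCE, BOOST, PRESENCE) that are later
// consumed by lunr.QueryParser. It is a small state machine: each
// lex* function below consumes characters and returns the next state.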
lunr.QueryLexer = function (str) {
  this.lexemes = []
  this.str = str
  this.length = str.length
  this.pos = 0
  this.start = 0
  this.escapeCharPositions = []
}
lunr.QueryLexer.prototype.run = function () {
  var state = lunr.QueryLexer.lexText

  while (state) {
    state = state(this)
  }
}
lunr.QueryLexer.prototype.sliceString = function () {
  var subSlices = [],
      sliceStart = this.start,
      sliceEnd = this.pos

  for (var i = 0; i < this.escapeCharPositions.length; i++) {
    sliceEnd = this.escapeCharPositions[i]
    subSlices.push(this.str.slice(sliceStart, sliceEnd))
    sliceStart = sliceEnd + 1
  }

  subSlices.push(this.str.slice(sliceStart, this.pos))
  this.escapeCharPositions.length = 0

  return subSlices.join('')
}
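// Example: for the input "foo\:bar" the lexer records the position of the
// backslash in escapeCharPositions, so sliceString() returns the slices on
// either side of it joined together, i.e. "foo:bar" with the backslash removed.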
lunr.QueryLexer.prototype.emit = function (type) {
  this.lexemes.push({
    type: type,
    str: this.sliceString(),
    start: this.start,
    end: this.pos
  })

  this.start = this.pos
}

lunr.QueryLexer.prototype.escapeCharacter = function () {
  this.escapeCharPositions.push(this.pos - 1)
  this.pos += 1
}
lunr.QueryLexer.prototype.next = function () {
  if (this.pos >= this.length) {
    return lunr.QueryLexer.EOS
  }

  var char = this.str.charAt(this.pos)
  this.pos += 1
  return char
}

lunr.QueryLexer.prototype.width = function () {
  return this.pos - this.start
}

lunr.QueryLexer.prototype.ignore = function () {
  if (this.start == this.pos) {
    this.pos += 1
  }

  this.start = this.pos
}

lunr.QueryLexer.prototype.backup = function () {
  this.pos -= 1
}

lunr.QueryLexer.prototype.acceptDigitRun = function () {
  var char, charCode

  do {
    char = this.next()
    charCode = char.charCodeAt(0)
  } while (charCode > 47 && charCode < 58)

  if (char != lunr.QueryLexer.EOS) {
    this.backup()
  }
}

lunr.QueryLexer.prototype.more = function () {
  return this.pos < this.length
}
lunr.QueryLexer.EOS = 'EOS'
lunr.QueryLexer.FIELD = 'FIELD'
lunr.QueryLexer.TERM = 'TERM'
lunr.QueryLexer.EDIT_DISTANCE = 'EDIT_DISTANCE'
lunr.QueryLexer.BOOST = 'BOOST'
lunr.QueryLexer.PRESENCE = 'PRESENCE'
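// Each lex* function below is one state of the lexer's state machine:
// it consumes input through the lexer passed to it, emits any completed
// lexemes, and returns the next state function (or nothing to stop).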
lunr.QueryLexer.lexField = function (lexer) {
  lexer.backup()
  lexer.emit(lunr.QueryLexer.FIELD)
  lexer.ignore()
  return lunr.QueryLexer.lexText
}

lunr.QueryLexer.lexTerm = function (lexer) {
  if (lexer.width() > 1) {
    lexer.backup()
    lexer.emit(lunr.QueryLexer.TERM)
  }

  lexer.ignore()

  if (lexer.more()) {
    return lunr.QueryLexer.lexText
  }
}

lunr.QueryLexer.lexEditDistance = function (lexer) {
  lexer.ignore()
  lexer.acceptDigitRun()
  lexer.emit(lunr.QueryLexer.EDIT_DISTANCE)
  return lunr.QueryLexer.lexText
}

lunr.QueryLexer.lexBoost = function (lexer) {
  lexer.ignore()
  lexer.acceptDigitRun()
  lexer.emit(lunr.QueryLexer.BOOST)
  return lunr.QueryLexer.lexText
}

lunr.QueryLexer.lexEOS = function (lexer) {
  if (lexer.width() > 0) {
    lexer.emit(lunr.QueryLexer.TERM)
  }
}
// This matches the separator used when tokenising fields
// within a document. These should match, otherwise it is
// not possible to search for some tokens within a document.
//
// It is possible for the user to change the separator on the
// tokenizer, so it _might_ clash with any of the other special
// characters already used within the search string, e.g. :.
//
// This means that it is possible to change the separator in
// such a way that makes some words unsearchable using a search
// string.
lunr.QueryLexer.termSeparator = lunr.tokenizer.separator
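// (In lunr 2.x the default tokenizer separator is /[\s\-]+/, so whitespace
// and hyphens end a term; check the separator of the lunr version in use.)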
lunr.QueryLexer.lexText = function (lexer) {
  while (true) {
    var char = lexer.next()

    if (char == lunr.QueryLexer.EOS) {
      return lunr.QueryLexer.lexEOS
    }

    // Escape character is '\'
    if (char.charCodeAt(0) == 92) {
      lexer.escapeCharacter()
      continue
    }

    if (char == ":") {
      return lunr.QueryLexer.lexField
    }

    if (char == "~") {
      lexer.backup()
      if (lexer.width() > 0) {
        lexer.emit(lunr.QueryLexer.TERM)
      }
      return lunr.QueryLexer.lexEditDistance
    }

    if (char == "^") {
      lexer.backup()
      if (lexer.width() > 0) {
        lexer.emit(lunr.QueryLexer.TERM)
      }
      return lunr.QueryLexer.lexBoost
    }

    // "+" indicates term presence is required
    // checking for length to ensure that only
    // leading "+" are considered
    if (char == "+" && lexer.width() === 1) {
      lexer.emit(lunr.QueryLexer.PRESENCE)
      return lunr.QueryLexer.lexText
    }

    // "-" indicates term presence is prohibited
    // checking for length to ensure that only
    // leading "-" are considered
    if (char == "-" && lexer.width() === 1) {
      lexer.emit(lunr.QueryLexer.PRESENCE)
      return lunr.QueryLexer.lexText
    }

    if (char.match(lunr.QueryLexer.termSeparator)) {
      return lunr.QueryLexer.lexTerm
    }
  }
}
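// Usage sketch (not part of query_lexer.js): with lunr loaded, the lexer can
// be run directly to inspect the lexemes it produces. The query string below
// is an illustrative example and assumes the default tokenizer separator,
// under which whitespace ends a term.
var exampleLexer = new lunr.QueryLexer("title:foo~2 +bar")
exampleLexer.run()

exampleLexer.lexemes.forEach(function (lexeme) {
  console.log(lexeme.type, JSON.stringify(lexeme.str), lexeme.start, lexeme.end)
})
// Logs, in order: FIELD "title", TERM "foo", EDIT_DISTANCE "2",
// PRESENCE "+", TERM "bar"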