token_set.js 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415
  1. /*!
  2. * lunr.TokenSet
  3. * Copyright (C) @YEAR Oliver Nightingale
  4. */
  5. /**
  6. * A token set is used to store the unique list of all tokens
  7. * within an index. Token sets are also used to represent an
  8. * incoming query to the index, this query token set and index
  9. * token set are then intersected to find which tokens to look
  10. * up in the inverted index.
  11. *
  12. * A token set can hold multiple tokens, as in the case of the
  13. * index token set, or it can hold a single token as in the
  14. * case of a simple query token set.
  15. *
  16. * Additionally token sets are used to perform wildcard matching.
  17. * Leading, contained and trailing wildcards are supported, and
  18. * from this edit distance matching can also be provided.
  19. *
  20. * Token sets are implemented as a minimal finite state automata,
  21. * where both common prefixes and suffixes are shared between tokens.
  22. * This helps to reduce the space used for storing the token set.
  23. *
  24. * @constructor
  25. */
  26. lunr.TokenSet = function () {
  27. this.final = false
  28. this.edges = {}
  29. this.id = lunr.TokenSet._nextId
  30. lunr.TokenSet._nextId += 1
  31. }
/**
 * Keeps track of the next, auto increment, identifier to assign
 * to a new tokenSet.
 *
 * TokenSets require a unique identifier to be correctly minimised;
 * the id is also embedded in the string produced by #toString, which
 * is used as a key when de-duplicating nodes.
 *
 * @private
 */
lunr.TokenSet._nextId = 1
  41. /**
  42. * Creates a TokenSet instance from the given sorted array of words.
  43. *
  44. * @param {String[]} arr - A sorted array of strings to create the set from.
  45. * @returns {lunr.TokenSet}
  46. * @throws Will throw an error if the input array is not sorted.
  47. */
  48. lunr.TokenSet.fromArray = function (arr) {
  49. var builder = new lunr.TokenSet.Builder
  50. for (var i = 0, len = arr.length; i < len; i++) {
  51. builder.insert(arr[i])
  52. }
  53. builder.finish()
  54. return builder.root
  55. }
  56. /**
  57. * Creates a token set from a query clause.
  58. *
  59. * @private
  60. * @param {Object} clause - A single clause from lunr.Query.
  61. * @param {string} clause.term - The query clause term.
  62. * @param {number} [clause.editDistance] - The optional edit distance for the term.
  63. * @returns {lunr.TokenSet}
  64. */
  65. lunr.TokenSet.fromClause = function (clause) {
  66. if ('editDistance' in clause) {
  67. return lunr.TokenSet.fromFuzzyString(clause.term, clause.editDistance)
  68. } else {
  69. return lunr.TokenSet.fromString(clause.term)
  70. }
  71. }
/**
 * Creates a token set representing a single string with a specified
 * edit distance.
 *
 * Insertions, deletions, substitutions and transpositions are each
 * treated as an edit distance of 1.
 *
 * Increasing the allowed edit distance will have a dramatic impact
 * on the performance of both creating and intersecting these TokenSets.
 * It is advised to keep the edit distance less than 3.
 *
 * @param {string} str - The string to create the token set from.
 * @param {number} editDistance - The allowed edit distance to match.
 * @returns {lunr.TokenSet}
 */
lunr.TokenSet.fromFuzzyString = function (str, editDistance) {
  var root = new lunr.TokenSet

  // Explicit work stack replacing recursion; each frame tracks the
  // automaton node reached so far, the edit budget left, and the
  // remainder of the input string still to be consumed.
  var stack = [{
    node: root,
    editsRemaining: editDistance,
    str: str
  }]

  while (stack.length) {
    var frame = stack.pop()

    // no edit: consume the next character exactly as written,
    // re-using an existing edge for it when one is present
    if (frame.str.length > 0) {
      var char = frame.str.charAt(0),
          noEditNode

      if (char in frame.node.edges) {
        noEditNode = frame.node.edges[char]
      } else {
        noEditNode = new lunr.TokenSet
        frame.node.edges[char] = noEditNode
      }

      // last character of the string means this node accepts the word
      if (frame.str.length == 1) {
        noEditNode.final = true
      }

      stack.push({
        node: noEditNode,
        editsRemaining: frame.editsRemaining,
        str: frame.str.slice(1)
      })
    }

    // every path below spends an edit, so stop here once the budget
    // is exhausted — only the exact-match path above remains
    if (frame.editsRemaining == 0) {
      continue
    }

    // insertion: accept one arbitrary extra character via the "*"
    // wildcard edge without consuming any of the remaining string
    if ("*" in frame.node.edges) {
      var insertionNode = frame.node.edges["*"]
    } else {
      var insertionNode = new lunr.TokenSet
      frame.node.edges["*"] = insertionNode
    }

    // inserting after the final character still yields a full match
    if (frame.str.length == 0) {
      insertionNode.final = true
    }

    stack.push({
      node: insertionNode,
      editsRemaining: frame.editsRemaining - 1,
      str: frame.str
    })

    // deletion
    // can only do a deletion if we have enough edits remaining
    // and if there are characters left to delete in the string
    // (the budget check happened via the `continue` above)
    if (frame.str.length > 1) {
      stack.push({
        node: frame.node,
        editsRemaining: frame.editsRemaining - 1,
        str: frame.str.slice(1)
      })
    }

    // deletion
    // just removing the last character from the str — the current
    // node then accepts the (shortened) word
    if (frame.str.length == 1) {
      frame.node.final = true
    }

    // substitution: replace the next character with any character,
    // again modelled with the shared "*" wildcard edge
    if (frame.str.length >= 1) {
      if ("*" in frame.node.edges) {
        var substitutionNode = frame.node.edges["*"]
      } else {
        var substitutionNode = new lunr.TokenSet
        frame.node.edges["*"] = substitutionNode
      }

      if (frame.str.length == 1) {
        substitutionNode.final = true
      }

      stack.push({
        node: substitutionNode,
        editsRemaining: frame.editsRemaining - 1,
        str: frame.str.slice(1)
      })
    }

    // transposition: swap the next two characters by consuming the
    // second character first and prepending the first to the rest
    if (frame.str.length > 1) {
      var charA = frame.str.charAt(0),
          charB = frame.str.charAt(1),
          transposeNode

      if (charB in frame.node.edges) {
        transposeNode = frame.node.edges[charB]
      } else {
        transposeNode = new lunr.TokenSet
        frame.node.edges[charB] = transposeNode
      }

      // NOTE(review): unreachable — this branch is guarded by
      // frame.str.length > 1, so length can never equal 1 here
      if (frame.str.length == 1) {
        transposeNode.final = true
      }

      stack.push({
        node: transposeNode,
        editsRemaining: frame.editsRemaining - 1,
        str: charA + frame.str.slice(2)
      })
    }
  }

  return root
}
  192. /**
  193. * Creates a TokenSet from a string.
  194. *
  195. * The string may contain one or more wildcard characters (*)
  196. * that will allow wildcard matching when intersecting with
  197. * another TokenSet.
  198. *
  199. * @param {string} str - The string to create a TokenSet from.
  200. * @returns {lunr.TokenSet}
  201. */
  202. lunr.TokenSet.fromString = function (str) {
  203. var node = new lunr.TokenSet,
  204. root = node
  205. /*
  206. * Iterates through all characters within the passed string
  207. * appending a node for each character.
  208. *
  209. * When a wildcard character is found then a self
  210. * referencing edge is introduced to continually match
  211. * any number of any characters.
  212. */
  213. for (var i = 0, len = str.length; i < len; i++) {
  214. var char = str[i],
  215. final = (i == len - 1)
  216. if (char == "*") {
  217. node.edges[char] = node
  218. node.final = final
  219. } else {
  220. var next = new lunr.TokenSet
  221. next.final = final
  222. node.edges[char] = next
  223. node = next
  224. }
  225. }
  226. return root
  227. }
  228. /**
  229. * Converts this TokenSet into an array of strings
  230. * contained within the TokenSet.
  231. *
  232. * This is not intended to be used on a TokenSet that
  233. * contains wildcards, in these cases the results are
  234. * undefined and are likely to cause an infinite loop.
  235. *
  236. * @returns {string[]}
  237. */
  238. lunr.TokenSet.prototype.toArray = function () {
  239. var words = []
  240. var stack = [{
  241. prefix: "",
  242. node: this
  243. }]
  244. while (stack.length) {
  245. var frame = stack.pop(),
  246. edges = Object.keys(frame.node.edges),
  247. len = edges.length
  248. if (frame.node.final) {
  249. /* In Safari, at this point the prefix is sometimes corrupted, see:
  250. * https://github.com/olivernn/lunr.js/issues/279 Calling any
  251. * String.prototype method forces Safari to "cast" this string to what
  252. * it's supposed to be, fixing the bug. */
  253. frame.prefix.charAt(0)
  254. words.push(frame.prefix)
  255. }
  256. for (var i = 0; i < len; i++) {
  257. var edge = edges[i]
  258. stack.push({
  259. prefix: frame.prefix.concat(edge),
  260. node: frame.node.edges[edge]
  261. })
  262. }
  263. }
  264. return words
  265. }
  266. /**
  267. * Generates a string representation of a TokenSet.
  268. *
  269. * This is intended to allow TokenSets to be used as keys
  270. * in objects, largely to aid the construction and minimisation
  271. * of a TokenSet. As such it is not designed to be a human
  272. * friendly representation of the TokenSet.
  273. *
  274. * @returns {string}
  275. */
  276. lunr.TokenSet.prototype.toString = function () {
  277. // NOTE: Using Object.keys here as this.edges is very likely
  278. // to enter 'hash-mode' with many keys being added
  279. //
  280. // avoiding a for-in loop here as it leads to the function
  281. // being de-optimised (at least in V8). From some simple
  282. // benchmarks the performance is comparable, but allowing
  283. // V8 to optimize may mean easy performance wins in the future.
  284. if (this._str) {
  285. return this._str
  286. }
  287. var str = this.final ? '1' : '0',
  288. labels = Object.keys(this.edges).sort(),
  289. len = labels.length
  290. for (var i = 0; i < len; i++) {
  291. var label = labels[i],
  292. node = this.edges[label]
  293. str = str + label + node.id
  294. }
  295. return str
  296. }
/**
 * Returns a new TokenSet that is the intersection of
 * this TokenSet and the passed TokenSet.
 *
 * This intersection will take into account any wildcards
 * contained within the TokenSet.
 *
 * @param {lunr.TokenSet} b - An other TokenSet to intersect with.
 * @returns {lunr.TokenSet}
 */
lunr.TokenSet.prototype.intersect = function (b) {
  var output = new lunr.TokenSet,
      frame = undefined

  // pairwise walk of both automata; each frame pairs a node from the
  // query set (qNode) with a node from this set, plus the output node
  // being built for that pair
  var stack = [{
    qNode: b,
    output: output,
    node: this
  }]

  while (stack.length) {
    frame = stack.pop()

    // NOTE: As with the #toString method, we are using
    // Object.keys and a for loop instead of a for-in loop
    // as both of these objects enter 'hash' mode, causing
    // the function to be de-optimised in V8
    var qEdges = Object.keys(frame.qNode.edges),
        qLen = qEdges.length,
        nEdges = Object.keys(frame.node.edges),
        nLen = nEdges.length

    for (var q = 0; q < qLen; q++) {
      var qEdge = qEdges[q]

      for (var n = 0; n < nLen; n++) {
        var nEdge = nEdges[n]

        // edges match when the labels are equal, or when the query
        // edge is a wildcard (which matches any index edge)
        if (nEdge == qEdge || qEdge == '*') {
          var node = frame.node.edges[nEdge],
              qNode = frame.qNode.edges[qEdge],
              // an intersected word ends here only if it ends in both
              // input automata
              final = node.final && qNode.final,
              next = undefined

          if (nEdge in frame.output.edges) {
            // an edge already exists for this character
            // no need to create a new node, just set the finality
            // bit unless this node is already final
            next = frame.output.edges[nEdge]
            next.final = next.final || final
          } else {
            // no edge exists yet, must create one
            // set the finality bit and insert it
            // into the output
            next = new lunr.TokenSet
            next.final = final
            frame.output.edges[nEdge] = next
          }

          stack.push({
            qNode: qNode,
            output: next,
            node: node
          })
        }
      }
    }
  }

  return output
}