index.js 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469
  1. /*!
  2. * lunr.Index
  3. * Copyright (C) @YEAR Oliver Nightingale
  4. */
  /**
   * lunr.Index is an object that manages a search index. It holds the stores
   * used for tokens and document lookups, and provides the main user-facing
   * API of the library.
   *
   * A freshly constructed index has no fields, uses 'id' as the document ref
   * property and uses lunr.tokenizer as its tokenizer.
   *
   * @constructor
   */
  lunr.Index = function () {
    var self = this

    this._fields = []
    this._ref = 'id'
    this.pipeline = new lunr.Pipeline()
    this.documentStore = new lunr.Store()
    this.tokenStore = new lunr.TokenStore()
    this.corpusTokens = new lunr.SortedSet()
    this.eventEmitter = new lunr.EventEmitter()
    this.tokenizerFn = lunr.tokenizer
    this._idfCache = {}

    // Cached idf values are thrown away whenever one of these events fires,
    // so they are recalculated the next time they are requested.
    this.on('add', 'remove', 'update', function () {
      self._idfCache = {}
    })
  }
  /**
   * Bind a handler to events being emitted by the index.
   *
   * All arguments are forwarded unchanged to the index's event emitter
   * (`eventEmitter.addListener`). The handler can be bound to many events at
   * the same time by passing several event names before the handler, e.g.
   * `idx.on('add', 'remove', fn)`.
   *
   * @param {...String} eventName The name(s) of events to bind the function to.
   * @param {Function} fn The handler to call when any of the events is emitted.
   * @returns {*} Whatever the event emitter's addListener returns.
   * @memberOf Index
   */
  lunr.Index.prototype.on = function () {
    var emitter = this.eventEmitter
    return emitter.addListener.apply(emitter, arguments)
  }
  /**
   * Removes a handler from an event being emitted by the index.
   *
   * Delegates to the index's event emitter (`eventEmitter.removeListener`).
   *
   * @param {String} name The name of the event to remove the function from.
   * @param {Function} fn The handler to remove.
   * @returns {*} Whatever the event emitter's removeListener returns.
   * @memberOf Index
   */
  lunr.Index.prototype.off = function (name, fn) {
    var emitter = this.eventEmitter
    return emitter.removeListener(name, fn)
  }
  /**
   * Loads a previously serialised index.
   *
   * Issues a warning (via lunr.utils.warn) if the index being imported was
   * serialised by a different version of lunr; loading continues regardless.
   *
   * The new index is created with `new this`, so the function must be called
   * as a method of the constructor (e.g. `lunr.Index.load(data)`).
   *
   * @param {Object} serialisedData The serialised index to load, in the shape
   * produced by lunr.Index.prototype.toJSON.
   * @returns {lunr.Index}
   * @memberOf Index
   */
  lunr.Index.load = function (serialisedData) {
    var importedVersion = serialisedData.version

    if (importedVersion !== lunr.version) {
      lunr.utils.warn('version mismatch: current ' + lunr.version + ' importing ' + importedVersion)
    }

    var idx = new this()

    idx._fields = serialisedData.fields
    idx._ref = serialisedData.ref

    // The tokenizer is stored by label; lunr.tokenizer.load turns the label
    // back into the function that is then installed on the index.
    var tokenizerFn = lunr.tokenizer.load(serialisedData.tokenizer)
    idx.tokenizer(tokenizerFn)

    idx.documentStore = lunr.Store.load(serialisedData.documentStore)
    idx.tokenStore = lunr.TokenStore.load(serialisedData.tokenStore)
    idx.corpusTokens = lunr.SortedSet.load(serialisedData.corpusTokens)
    idx.pipeline = lunr.Pipeline.load(serialisedData.pipeline)

    return idx
  }
  /**
   * Adds a field to the list of fields that will be searchable within documents
   * in the index.
   *
   * An optional boost can be passed in the options object to affect how much
   * tokens in this field rank in search results. When the boost is missing or
   * falsy (including 0) the boost value is 1.
   *
   * Fields should be added before any documents are added to the index, fields
   * that are added after documents are added to the index will only apply to new
   * documents added to the index.
   *
   * @param {String} fieldName The name of the field within the document that
   * should be indexed
   * @param {Object} [opts] Optional settings for the field.
   * @param {Number} [opts.boost] An optional boost that can be applied to terms
   * in this field.
   * @returns {lunr.Index}
   * @memberOf Index
   */
  lunr.Index.prototype.field = function (fieldName, opts) {
    var options = opts || {}

    this._fields.push({
      name: fieldName,
      boost: options.boost || 1
    })

    return this
  }
  /**
   * Sets the property used to uniquely identify documents added to the index,
   * by default this property is 'id'.
   *
   * This should only be changed before adding documents to the index, changing
   * the ref property without resetting the index can lead to unexpected results.
   *
   * The value of ref can be of any type but it _must_ be stably comparable and
   * orderable.
   *
   * @param {String} refName The property to use to uniquely identify the
   * documents in the index.
   * @returns {lunr.Index} The index itself, to allow chaining.
   * @memberOf Index
   */
  lunr.Index.prototype.ref = function (refName) {
    var index = this
    index._ref = refName
    return index
  }
  /**
   * Sets the tokenizer used for this index.
   *
   * By default the index will use the default tokenizer, lunr.tokenizer. The
   * tokenizer should only be changed before adding documents to the index.
   * Changing the tokenizer without re-building the index can lead to
   * unexpected results.
   *
   * A warning is issued when the function has no `label`, or its label is not
   * a key of lunr.tokenizer.registeredFunctions; the function is installed
   * either way.
   *
   * @param {Function} fn The function to use as a tokenizer.
   * @returns {lunr.Index}
   * @memberOf Index
   */
  lunr.Index.prototype.tokenizer = function (fn) {
    var label = fn.label

    // The serialised form only stores the label (see toJSON), hence the
    // warning for tokenizers that cannot be looked up by label.
    if (!label || !(label in lunr.tokenizer.registeredFunctions)) {
      lunr.utils.warn('Function is not a registered tokenizer. This may cause problems when serialising the index')
    }

    this.tokenizerFn = fn
    return this
  }
  /**
   * Add a document to the index.
   *
   * This is the way new documents enter the index, this function will run the
   * fields from the document through the index's tokenizer and pipeline and
   * then add it to the index, it will then show up in search results.
   *
   * For every distinct token of the document a term frequency is stored in the
   * token store. It is the sum, over all non-empty fields, of
   * (occurrences of the token in the field / number of tokens in the field)
   * multiplied by the field's boost.
   *
   * The cached idf values are always discarded, even when events are silenced,
   * because adding a document changes the document counts they were computed
   * from.
   *
   * An 'add' event is emitted with the document that has been added and the index
   * the document has been added to. This event can be silenced by passing false
   * as the second argument to add.
   *
   * @param {Object} doc The document to add to the index.
   * @param {Boolean} emitEvent Whether or not to emit events, default true.
   * @memberOf Index
   */
  lunr.Index.prototype.add = function (doc, emitEvent) {
    var docTokens = {},
        allDocumentTokens = new lunr.SortedSet(),
        docRef = doc[this._ref],
        shouldEmit = emitEvent === undefined ? true : emitEvent,
        i, j, k

    // Tokenise every configured field, remembering the tokens per field (for
    // the term frequencies below) and collecting the distinct tokens of the
    // document and of the whole corpus.
    this._fields.forEach(function (field) {
      var fieldTokens = this.pipeline.run(this.tokenizerFn(doc[field.name]))

      docTokens[field.name] = fieldTokens

      for (var t = 0; t < fieldTokens.length; t++) {
        allDocumentTokens.add(fieldTokens[t])
        this.corpusTokens.add(fieldTokens[t])
      }
    }, this)

    this.documentStore.set(docRef, allDocumentTokens)

    for (i = 0; i < allDocumentTokens.length; i++) {
      var token = allDocumentTokens.elements[i],
          tf = 0

      for (j = 0; j < this._fields.length; j++) {
        var field = this._fields[j],
            fieldTokens = docTokens[field.name],
            fieldLength = fieldTokens.length

        // An empty field contributes nothing (and would divide by zero).
        if (!fieldLength) continue

        var tokenCount = 0

        for (k = 0; k < fieldLength; k++) {
          if (fieldTokens[k] === token) {
            tokenCount++
          }
        }

        tf += (tokenCount / fieldLength * field.boost)
      }

      this.tokenStore.add(token, { ref: docRef, tf: tf })
    }

    // Invalidate cached idf values here rather than relying solely on the
    // 'add' event listener: with emitEvent === false no event is emitted and
    // the cache would otherwise keep values computed for the old corpus.
    this._idfCache = {}

    if (shouldEmit) this.eventEmitter.emit('add', doc, this)
  }
  /**
   * Removes a document from the index.
   *
   * To make sure documents no longer show up in search results they can be
   * removed from the index using this method.
   *
   * The document passed only needs to have the same ref property value as the
   * document that was added to the index, they could be completely different
   * objects. If no document with that ref is in the document store the call
   * does nothing and no event is emitted.
   *
   * The cached idf values are always discarded after a removal, even when
   * events are silenced, because removing a document changes the document
   * counts they were computed from.
   *
   * A 'remove' event is emitted with the document that has been removed and the index
   * the document has been removed from. This event can be silenced by passing false
   * as the second argument to remove.
   *
   * @param {Object} doc The document to remove from the index.
   * @param {Boolean} emitEvent Whether to emit remove events, defaults to true
   * @memberOf Index
   */
  lunr.Index.prototype.remove = function (doc, emitEvent) {
    var docRef = doc[this._ref],
        shouldEmit = emitEvent === undefined ? true : emitEvent

    if (!this.documentStore.has(docRef)) return

    // The document store holds the set of tokens recorded for this ref; each
    // of them has an entry for the document in the token store.
    var docTokens = this.documentStore.get(docRef)

    this.documentStore.remove(docRef)

    docTokens.forEach(function (token) {
      this.tokenStore.remove(token, docRef)
    }, this)

    // Invalidate cached idf values here rather than relying solely on the
    // 'remove' event listener: with emitEvent === false no event is emitted
    // and the cache would otherwise keep values computed for the old corpus.
    this._idfCache = {}

    if (shouldEmit) this.eventEmitter.emit('remove', doc, this)
  }
  /**
   * Updates a document in the index.
   *
   * When a document contained within the index gets updated, fields changed,
   * added or removed, to make sure it correctly matched against search queries,
   * it should be updated in the index.
   *
   * This method is just a wrapper around `remove` and `add`
   *
   * An 'update' event is emitted with the document that has been updated and the index.
   * This event can be silenced by passing false as the second argument to update. Only
   * an update event will be fired, the 'add' and 'remove' events of the underlying calls
   * are silenced.
   *
   * The cached idf values are always discarded, whether or not the 'update'
   * event is emitted.
   *
   * @param {Object} doc The document to update in the index.
   * @param {Boolean} emitEvent Whether to emit update events, defaults to true
   * @see Index.prototype.remove
   * @see Index.prototype.add
   * @memberOf Index
   */
  lunr.Index.prototype.update = function (doc, emitEvent) {
    var shouldEmit = emitEvent === undefined ? true : emitEvent

    this.remove(doc, false)
    this.add(doc, false)

    // The inner remove/add calls are silenced, so the event listener that
    // normally resets the idf cache only runs if the 'update' event below is
    // emitted. Reset the cache explicitly so a silenced update cannot leave
    // stale idf values behind.
    this._idfCache = {}

    if (shouldEmit) this.eventEmitter.emit('update', doc, this)
  }
  /**
   * Calculates the inverse document frequency for a term within the index.
   *
   * The value is `1 + log(number of documents / number of documents containing
   * the term)`, or 1 when the token store reports no documents for the term.
   * Results are memoised in `_idfCache`; keys are prefixed with "@" and looked
   * up with hasOwnProperty so that terms cannot collide with properties
   * inherited from Object.prototype.
   *
   * @param {String} term The term to calculate the idf of.
   * @returns {Number}
   * @private
   * @memberOf Index
   */
  lunr.Index.prototype.idf = function (term) {
    var cacheKey = "@" + term,
        isCached = Object.prototype.hasOwnProperty.call(this._idfCache, cacheKey)

    if (isCached) {
      return this._idfCache[cacheKey]
    }

    var documentFrequency = this.tokenStore.count(term),
        value

    if (documentFrequency > 0) {
      value = 1 + Math.log(this.documentStore.length / documentFrequency)
    } else {
      value = 1
    }

    this._idfCache[cacheKey] = value
    return value
  }
  /**
   * Searches the index using the passed query.
   *
   * Queries should be a string, multiple words are allowed and will lead to an
   * AND based query, e.g. `idx.search('foo bar')` will run a search for
   * documents containing both 'foo' and 'bar'.
   *
   * All query tokens are passed through the same tokenizer and pipeline that
   * document tokens are passed through, so any language processing involved
   * will be run on every query term.
   *
   * Each query term is expanded (via the token store), so that the term 'he'
   * might be expanded to 'hello' and 'help' if those terms were already
   * included in the index. Expanded terms that differ from the query term are
   * scored lower than exact matches.
   *
   * If none of the query tokens is present in the token store an empty array
   * is returned straight away.
   *
   * Matching documents are returned as an array of objects, each object contains
   * the matching document ref, as set for this index, and the similarity score
   * for this document against the query. The array is sorted by descending
   * score.
   *
   * @param {String} query The query to search the index with.
   * @returns {Object[]} Array of `{ ref, score }` objects.
   * @see Index.prototype.idf
   * @see Index.prototype.documentVector
   * @memberOf Index
   */
  lunr.Index.prototype.search = function (query) {
    var index = this,
        queryTokens = this.pipeline.run(this.tokenizerFn(query)),
        queryVector = new lunr.Vector(),
        documentSets = [],
        fieldBoosts = this._fields.reduce(function (memo, f) { return memo + f.boost }, 0)

    var hasSomeToken = queryTokens.some(function (token) {
      return index.tokenStore.has(token)
    })

    if (!hasSomeToken) return []

    // The query-side term frequency is the same for every query token.
    var tf = 1 / queryTokens.length * this._fields.length * fieldBoosts

    queryTokens.forEach(function (token) {
      var expandedKeys = index.tokenStore.expand(token),
          tokenDocuments = new lunr.SortedSet()

      for (var k = 0; k < expandedKeys.length; k++) {
        var key = expandedKeys[k],
            pos = index.corpusTokens.indexOf(key),
            idf = index.idf(key),
            similarityBoost = 1

        // if the expanded key is not an exact match to the token then
        // penalise the score for this key by how different the key is
        // to the token.
        if (key !== token) {
          var diff = Math.max(3, key.length - token.length)
          similarityBoost = 1 / Math.log(diff)
        }

        // calculate the query tf-idf score for this key, applying the
        // similarityBoost to ensure exact matches rank higher than
        // expanded terms
        if (pos > -1) queryVector.insert(pos, tf * idf * similarityBoost)

        // add all the documents that have this key into a set, taking the
        // ref from the stored entry so that the type of the ref is preserved
        var matchingDocuments = index.tokenStore.get(key),
            refs = Object.keys(matchingDocuments),
            keyDocuments = new lunr.SortedSet()

        for (var r = 0; r < refs.length; r++) {
          keyDocuments.add(matchingDocuments[refs[r]].ref)
        }

        // a document matches the query token if it matches any expansion
        tokenDocuments = tokenDocuments.union(keyDocuments)
      }

      documentSets.push(tokenDocuments)
    })

    // a document matches the query only if it matches every query token
    var documentSet = documentSets.reduce(function (memo, set) {
      return memo.intersect(set)
    })

    var results = documentSet.map(function (ref) {
      return { ref: ref, score: queryVector.similarity(index.documentVector(ref)) }
    })

    return results.sort(function (a, b) {
      return b.score - a.score
    })
  }
  /**
   * Generates a vector containing all the tokens in the document matching the
   * passed documentRef.
   *
   * For each token recorded for the document, the tf-idf score (the stored
   * term frequency multiplied by the token's idf) is inserted into the vector
   * at the token's position within the corpus tokens.
   *
   * @param {Object} documentRef The ref to find the document with.
   * @returns {lunr.Vector}
   * @private
   * @memberOf Index
   */
  lunr.Index.prototype.documentVector = function (documentRef) {
    var tokens = this.documentStore.get(documentRef),
        tokenTotal = tokens.length,
        vector = new lunr.Vector()

    for (var n = 0; n < tokenTotal; n++) {
      var token = tokens.elements[n],
          entry = this.tokenStore.get(token)[documentRef],
          score = entry.tf * this.idf(token)

      vector.insert(this.corpusTokens.indexOf(token), score)
    }

    return vector
  }
  /**
   * Returns a representation of the index ready for serialisation.
   *
   * The tokenizer is represented by its `label` only; the stores, corpus
   * tokens and pipeline are represented by the result of their own toJSON
   * methods.
   *
   * @returns {Object}
   * @memberOf Index
   */
  lunr.Index.prototype.toJSON = function () {
    var serialised = {}

    serialised.version = lunr.version
    serialised.fields = this._fields
    serialised.ref = this._ref
    serialised.tokenizer = this.tokenizerFn.label
    serialised.documentStore = this.documentStore.toJSON()
    serialised.tokenStore = this.tokenStore.toJSON()
    serialised.corpusTokens = this.corpusTokens.toJSON()
    serialised.pipeline = this.pipeline.toJSON()

    return serialised
  }
  /**
   * Applies a plugin to the current index.
   *
   * A plugin is just a function that encapsulates some custom behaviour to be
   * applied to the index, and can be used to customise or extend the index in
   * some way.
   *
   * The plugin function is called with the index both as its context (`this`)
   * and as its first argument. Any additional arguments passed to use are
   * forwarded to the plugin after the index.
   *
   * Example:
   *
   *     var myPlugin = function (idx, arg1, arg2) {
   *       // `this` and `idx` are the index to be extended
   *       // apply any extensions etc here.
   *     }
   *
   *     var idx = lunr(function () {
   *       this.use(myPlugin, 'arg1', 'arg2')
   *     })
   *
   * @param {Function} plugin The plugin to apply.
   * @memberOf Index
   */
  lunr.Index.prototype.use = function (plugin) {
    var extraArgs = Array.prototype.slice.call(arguments, 1)
    plugin.apply(this, [this].concat(extraArgs))
  }