/*!
 * lunr.Index
 * Copyright (C) @YEAR Oliver Nightingale
 */

/**
 * lunr.Index is an object that manages a search index. It contains the indexes
 * and stores all the tokens and document lookups. It also provides the main
 * user facing API for the library.
 *
 * @constructor
 */
lunr.Index = function () {
  this._fields = []
  this._ref = 'id'
  this.pipeline = new lunr.Pipeline
  this.documentStore = new lunr.Store
  this.tokenStore = new lunr.TokenStore
  this.corpusTokens = new lunr.SortedSet
  this.eventEmitter = new lunr.EventEmitter
  this.tokenizerFn = lunr.tokenizer

  this._idfCache = {}

  this.on('add', 'remove', 'update', (function () {
    this._idfCache = {}
  }).bind(this))
}

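// Illustrative usage sketch (not part of the library source). The field and
// ref names below are hypothetical; an index is normally configured before
// any documents are added. Later sketches in this file reuse this `idx`:
//
//   var idx = new lunr.Index()
//   idx.field('title')
//   idx.field('body')
//   idx.ref('id')
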
/**
 * Bind a handler to events being emitted by the index.
 *
 * The handler can be bound to many events at the same time.
 *
 * @param {String} [eventName] The name(s) of events to bind the function to.
 * @param {Function} fn The handler to call when the event is emitted.
 * @memberOf Index
 */
lunr.Index.prototype.on = function () {
  var args = Array.prototype.slice.call(arguments)
  return this.eventEmitter.addListener.apply(this.eventEmitter, args)
}

/**
 * Removes a handler from an event being emitted by the index.
 *
 * @param {String} eventName The name of the event to remove the handler from.
 * @param {Function} fn The handler to remove.
 * @memberOf Index
 */
lunr.Index.prototype.off = function (name, fn) {
  return this.eventEmitter.removeListener(name, fn)
}

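// Illustrative sketch (not part of the library source): binding a hypothetical
// handler to several events at once; off removes it from one event at a time.
// Assumes `idx` is an index as constructed above:
//
//   var logChange = function (doc, idx) { console.log('index changed') }
//   idx.on('add', 'remove', 'update', logChange)
//   idx.off('add', logChange)
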
/**
 * Loads a previously serialised index.
 *
 * Issues a warning if the index being imported was serialised
 * by a different version of lunr.
 *
 * @param {Object} serialisedData The serialised index to load.
 * @returns {lunr.Index}
 * @memberOf Index
 */
lunr.Index.load = function (serialisedData) {
  if (serialisedData.version !== lunr.version) {
    lunr.utils.warn('version mismatch: current ' + lunr.version + ' importing ' + serialisedData.version)
  }

  var idx = new this

  idx._fields = serialisedData.fields
  idx._ref = serialisedData.ref

  idx.tokenizer(lunr.tokenizer.load(serialisedData.tokenizer))
  idx.documentStore = lunr.Store.load(serialisedData.documentStore)
  idx.tokenStore = lunr.TokenStore.load(serialisedData.tokenStore)
  idx.corpusTokens = lunr.SortedSet.load(serialisedData.corpusTokens)
  idx.pipeline = lunr.Pipeline.load(serialisedData.pipeline)

  return idx
}

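// Illustrative round-trip sketch (not part of the library source): an index
// serialises via its toJSON method (defined further down) and can be revived
// with load:
//
//   var serialised = JSON.stringify(idx)
//   var revived = lunr.Index.load(JSON.parse(serialised))
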
/**
 * Adds a field to the list of fields that will be searchable within documents
 * in the index.
 *
 * An optional boost param can be passed to affect how much tokens in this field
 * rank in search results, by default the boost value is 1.
 *
 * Fields should be added before any documents are added to the index, fields
 * that are added after documents are added to the index will only apply to new
 * documents added to the index.
 *
 * @param {String} fieldName The name of the field within the document that
 * should be indexed
 * @param {Object} opts Optional settings for this field, currently only an
 * optional boost that is applied to terms in this field.
 * @returns {lunr.Index}
 * @memberOf Index
 */
lunr.Index.prototype.field = function (fieldName, opts) {
  var opts = opts || {},
      field = { name: fieldName, boost: opts.boost || 1 }

  this._fields.push(field)
  return this
}

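// Illustrative sketch (not part of the library source): field names and boost
// values are hypothetical. Tokens found in 'title' will contribute ten times
// more to a document's score than tokens found in 'body':
//
//   idx.field('title', { boost: 10 })
//   idx.field('body')
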
/**
 * Sets the property used to uniquely identify documents added to the index,
 * by default this property is 'id'.
 *
 * This should only be changed before adding documents to the index, changing
 * the ref property without resetting the index can lead to unexpected results.
 *
 * The value of ref can be of any type but it _must_ be stably comparable and
 * orderable.
 *
 * @param {String} refName The property to use to uniquely identify the
 * documents in the index.
 * @returns {lunr.Index}
 * @memberOf Index
 */
lunr.Index.prototype.ref = function (refName) {
  this._ref = refName
  return this
}

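// Illustrative sketch (not part of the library source): using a hypothetical
// 'slug' property as the document identifier instead of the default 'id'.
// This must happen before any documents are added:
//
//   idx.ref('slug')
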
/**
 * Sets the tokenizer used for this index.
 *
 * By default the index will use the default tokenizer, lunr.tokenizer. The tokenizer
 * should only be changed before adding documents to the index. Changing the tokenizer
 * without re-building the index can lead to unexpected results.
 *
 * @param {Function} fn The function to use as a tokenizer.
 * @returns {lunr.Index}
 * @memberOf Index
 */
lunr.Index.prototype.tokenizer = function (fn) {
  var isRegistered = fn.label && (fn.label in lunr.tokenizer.registeredFunctions)

  if (!isRegistered) {
    lunr.utils.warn('Function is not a registered tokenizer. This may cause problems when serialising the index')
  }

  this.tokenizerFn = fn
  return this
}

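// Illustrative sketch (not part of the library source): a hypothetical
// comma-separated tokenizer. Registering it first gives it a label so the
// index can still be serialised; this assumes lunr.tokenizer.registerFunction
// is available, as implied by the registered-function check above:
//
//   var commaTokenizer = function (str) {
//     if (!str) return []
//     return str.toString().split(',').map(function (t) {
//       return t.trim().toLowerCase()
//     })
//   }
//
//   lunr.tokenizer.registerFunction(commaTokenizer, 'comma')
//   idx.tokenizer(commaTokenizer)
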
/**
 * Add a document to the index.
 *
 * This is the way new documents enter the index. This function will run the
 * fields from the document through the index's pipeline and then add the
 * document to the index, after which it will show up in search results.
 *
 * An 'add' event is emitted with the document that has been added and the index
 * the document has been added to. This event can be silenced by passing false
 * as the second argument to add.
 *
 * @param {Object} doc The document to add to the index.
 * @param {Boolean} emitEvent Whether or not to emit events, default true.
 * @memberOf Index
 */
lunr.Index.prototype.add = function (doc, emitEvent) {
  var docTokens = {},
      allDocumentTokens = new lunr.SortedSet,
      docRef = doc[this._ref],
      emitEvent = emitEvent === undefined ? true : emitEvent

  // run each field through the tokenizer and pipeline, collecting the tokens
  // per field as well as across the whole document and corpus
  this._fields.forEach(function (field) {
    var fieldTokens = this.pipeline.run(this.tokenizerFn(doc[field.name]))

    docTokens[field.name] = fieldTokens

    for (var i = 0; i < fieldTokens.length; i++) {
      var token = fieldTokens[i]
      allDocumentTokens.add(token)
      this.corpusTokens.add(token)
    }
  }, this)

  this.documentStore.set(docRef, allDocumentTokens)

  // for each token, accumulate a field-length normalised, boost weighted
  // term frequency and record it against this document in the token store
  for (var i = 0; i < allDocumentTokens.length; i++) {
    var token = allDocumentTokens.elements[i]
    var tf = 0

    for (var j = 0; j < this._fields.length; j++) {
      var field = this._fields[j]
      var fieldTokens = docTokens[field.name]
      var fieldLength = fieldTokens.length

      if (!fieldLength) continue

      var tokenCount = 0
      for (var k = 0; k < fieldLength; k++) {
        if (fieldTokens[k] === token) {
          tokenCount++
        }
      }

      tf += (tokenCount / fieldLength * field.boost)
    }

    this.tokenStore.add(token, { ref: docRef, tf: tf })
  }

  if (emitEvent) this.eventEmitter.emit('add', doc, this)
}

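// Illustrative sketch (not part of the library source): the document shape is
// hypothetical but must carry the configured ref ('id' by default) and the
// configured fields:
//
//   idx.add({ id: 1, title: 'A guide to lunr', body: 'Full text search in the browser' })
//
//   // passing false suppresses the 'add' event
//   idx.add({ id: 2, title: 'Another document', body: '...' }, false)
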
/**
 * Removes a document from the index.
 *
 * To make sure documents no longer show up in search results they can be
 * removed from the index using this method.
 *
 * The document passed only needs to have the same ref property value as the
 * document that was added to the index, they could be completely different
 * objects.
 *
 * A 'remove' event is emitted with the document that has been removed and the index
 * the document has been removed from. This event can be silenced by passing false
 * as the second argument to remove.
 *
 * @param {Object} doc The document to remove from the index.
 * @param {Boolean} emitEvent Whether to emit remove events, defaults to true
 * @memberOf Index
 */
lunr.Index.prototype.remove = function (doc, emitEvent) {
  var docRef = doc[this._ref],
      emitEvent = emitEvent === undefined ? true : emitEvent

  if (!this.documentStore.has(docRef)) return

  var docTokens = this.documentStore.get(docRef)

  this.documentStore.remove(docRef)

  docTokens.forEach(function (token) {
    this.tokenStore.remove(token, docRef)
  }, this)

  if (emitEvent) this.eventEmitter.emit('remove', doc, this)
}

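// Illustrative sketch (not part of the library source): only the ref property
// has to match the document that was originally indexed:
//
//   idx.remove({ id: 1 })
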
/**
 * Updates a document in the index.
 *
 * When a document contained within the index gets updated, with fields
 * changed, added or removed, it should be updated in the index so that it is
 * correctly matched against search queries.
 *
 * This method is just a wrapper around `remove` and `add`.
 *
 * An 'update' event is emitted with the document that has been updated and the index.
 * This event can be silenced by passing false as the second argument to update. Only
 * an update event will be fired, the 'add' and 'remove' events of the underlying calls
 * are silenced.
 *
 * @param {Object} doc The document to update in the index.
 * @param {Boolean} emitEvent Whether to emit update events, defaults to true
 * @see Index.prototype.remove
 * @see Index.prototype.add
 * @memberOf Index
 */
lunr.Index.prototype.update = function (doc, emitEvent) {
  var emitEvent = emitEvent === undefined ? true : emitEvent

  this.remove(doc, false)
  this.add(doc, false)

  if (emitEvent) this.eventEmitter.emit('update', doc, this)
}

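// Illustrative sketch (not part of the library source): re-indexing a changed
// document; only an 'update' event is emitted:
//
//   idx.update({ id: 2, title: 'Another document, revised', body: '...' })
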
/**
 * Calculates the inverse document frequency for a token within the index.
 *
 * @param {String} term The term to calculate the idf of.
 * @private
 * @memberOf Index
 */
lunr.Index.prototype.idf = function (term) {
  var cacheKey = "@" + term
  if (Object.prototype.hasOwnProperty.call(this._idfCache, cacheKey)) return this._idfCache[cacheKey]

  var documentFrequency = this.tokenStore.count(term),
      idf = 1

  if (documentFrequency > 0) {
    idf = 1 + Math.log(this.documentStore.length / documentFrequency)
  }

  return this._idfCache[cacheKey] = idf
}

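// Worked example (illustrative, the numbers are hypothetical): with 10
// documents in the store and a term that appears in 2 of them,
//
//   idf = 1 + ln(10 / 2) = 1 + ln(5) ≈ 2.609
//
// A term that appears in every document gets idf = 1 + ln(1) = 1, so rarer
// terms contribute more to a document's score.
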
/**
 * Searches the index using the passed query.
 *
 * Queries should be a string, multiple words are allowed and will lead to an
 * AND based query, e.g. `idx.search('foo bar')` will run a search for
 * documents containing both 'foo' and 'bar'.
 *
 * All query tokens are passed through the same pipeline that document tokens
 * are passed through, so any language processing involved will be run on every
 * query term.
 *
 * Each query term is expanded, so that the term 'he' might be expanded to
 * 'hello' and 'help' if those terms were already included in the index.
 *
 * Matching documents are returned as an array of objects, each object contains
 * the matching document ref, as set for this index, and the similarity score
 * for this document against the query.
 *
 * @param {String} query The query to search the index with.
 * @returns {Array}
 * @see Index.prototype.idf
 * @see Index.prototype.documentVector
 * @memberOf Index
 */
lunr.Index.prototype.search = function (query) {
  var queryTokens = this.pipeline.run(this.tokenizerFn(query)),
      queryVector = new lunr.Vector,
      documentSets = [],
      fieldBoosts = this._fields.reduce(function (memo, f) { return memo + f.boost }, 0)

  var hasSomeToken = queryTokens.some(function (token) {
    return this.tokenStore.has(token)
  }, this)

  if (!hasSomeToken) return []

  queryTokens
    .forEach(function (token, i, tokens) {
      var tf = 1 / tokens.length * this._fields.length * fieldBoosts,
          self = this

      var set = this.tokenStore.expand(token).reduce(function (memo, key) {
        var pos = self.corpusTokens.indexOf(key),
            idf = self.idf(key),
            similarityBoost = 1,
            set = new lunr.SortedSet

        // if the expanded key is not an exact match to the token then
        // penalise the score for this key by how different the key is
        // to the token.
        if (key !== token) {
          var diff = Math.max(3, key.length - token.length)
          similarityBoost = 1 / Math.log(diff)
        }

        // calculate the query tf-idf score for this token, applying a
        // similarityBoost so that exact matches rank higher than expanded terms
        if (pos > -1) queryVector.insert(pos, tf * idf * similarityBoost)

        // add all the documents that have this key into a set
        // ensuring that the type of key is preserved
        var matchingDocuments = self.tokenStore.get(key),
            refs = Object.keys(matchingDocuments),
            refsLen = refs.length

        for (var i = 0; i < refsLen; i++) {
          set.add(matchingDocuments[refs[i]].ref)
        }

        return memo.union(set)
      }, new lunr.SortedSet)

      documentSets.push(set)
    }, this)

  var documentSet = documentSets.reduce(function (memo, set) {
    return memo.intersect(set)
  })

  return documentSet
    .map(function (ref) {
      return { ref: ref, score: queryVector.similarity(this.documentVector(ref)) }
    }, this)
    .sort(function (a, b) {
      return b.score - a.score
    })
}

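// Illustrative sketch (not part of the library source): the query string, refs
// and scores are hypothetical. Results come back sorted by descending score:
//
//   var results = idx.search('full text search')
//   // => [ { ref: 2, score: 0.42 }, { ref: 1, score: 0.17 } ]
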
/**
 * Generates a vector containing all the tokens in the document matching the
 * passed documentRef.
 *
 * The vector contains the tf-idf score for each token contained in the
 * document with the passed documentRef. The vector will contain an element
 * for every token in the index's corpus, if the document does not contain that
 * token the element will be 0.
 *
 * @param {Object} documentRef The ref to find the document with.
 * @returns {lunr.Vector}
 * @private
 * @memberOf Index
 */
lunr.Index.prototype.documentVector = function (documentRef) {
  var documentTokens = this.documentStore.get(documentRef),
      documentTokensLength = documentTokens.length,
      documentVector = new lunr.Vector

  for (var i = 0; i < documentTokensLength; i++) {
    var token = documentTokens.elements[i],
        tf = this.tokenStore.get(token)[documentRef].tf,
        idf = this.idf(token)

    documentVector.insert(this.corpusTokens.indexOf(token), tf * idf)
  }

  return documentVector
}

/**
 * Returns a representation of the index ready for serialisation.
 *
 * @returns {Object}
 * @memberOf Index
 */
lunr.Index.prototype.toJSON = function () {
  return {
    version: lunr.version,
    fields: this._fields,
    ref: this._ref,
    tokenizer: this.tokenizerFn.label,
    documentStore: this.documentStore.toJSON(),
    tokenStore: this.tokenStore.toJSON(),
    corpusTokens: this.corpusTokens.toJSON(),
    pipeline: this.pipeline.toJSON()
  }
}

/**
 * Applies a plugin to the current index.
 *
 * A plugin is a function that is called with the index as its context.
 * Plugins can be used to customise or extend the behaviour of the index
 * in some way. A plugin is just a function that encapsulates the custom
 * behaviour that should be applied to the index.
 *
 * The plugin function will be called with the index as both its context and
 * its first argument; additional arguments can also be passed when calling use.
 *
 * Example:
 *
 *     var myPlugin = function (idx, arg1, arg2) {
 *       // `this` is the index to be extended
 *       // apply any extensions etc here.
 *     }
 *
 *     var idx = lunr(function () {
 *       this.use(myPlugin, 'arg1', 'arg2')
 *     })
 *
 * @param {Function} plugin The plugin to apply.
 * @memberOf Index
 */
lunr.Index.prototype.use = function (plugin) {
  var args = Array.prototype.slice.call(arguments, 1)
  args.unshift(this)
  plugin.apply(this, args)
}