// lunr.Index is the searchable index: documents are added to it and queries
// are run against it.
lunr.Index = function () {
  this._fields = []
  this._ref = 'id'
  this.pipeline = new lunr.Pipeline
  this.documentStore = new lunr.Store
  this.tokenStore = new lunr.TokenStore
  this.corpusTokens = new lunr.SortedSet
  this.eventEmitter = new lunr.EventEmitter
  this.tokenizerFn = lunr.tokenizer

  this._idfCache = {}

  // Any change to the indexed documents invalidates the cached idf values.
  this.on('add', 'remove', 'update', (function () {
    this._idfCache = {}
  }).bind(this))
}
lunr.Index.prototype.on = function () {
  var args = Array.prototype.slice.call(arguments)
  return this.eventEmitter.addListener.apply(this.eventEmitter, args)
}

lunr.Index.prototype.off = function (name, fn) {
  return this.eventEmitter.removeListener(name, fn)
}
// Loads a previously serialised index. A version mismatch is only warned
// about, not treated as an error.
lunr.Index.load = function (serialisedData) {
  if (serialisedData.version !== lunr.version) {
    lunr.utils.warn('version mismatch: current ' + lunr.version + ' importing ' + serialisedData.version)
  }

  var idx = new this

  idx._fields = serialisedData.fields
  idx._ref = serialisedData.ref

  idx.tokenizer(lunr.tokenizer.load(serialisedData.tokenizer))
  idx.documentStore = lunr.Store.load(serialisedData.documentStore)
  idx.tokenStore = lunr.TokenStore.load(serialisedData.tokenStore)
  idx.corpusTokens = lunr.SortedSet.load(serialisedData.corpusTokens)
  idx.pipeline = lunr.Pipeline.load(serialisedData.pipeline)

  return idx
}
lunr.Index.prototype.field = function (fieldName, opts) {
  var opts = opts || {},
      field = { name: fieldName, boost: opts.boost || 1 }

  this._fields.push(field)
  return this
}

lunr.Index.prototype.ref = function (refName) {
  this._ref = refName
  return this
}
// Swaps the tokenizer used for documents and queries. An unregistered
// function still works, but it cannot be looked up again when the index is
// serialised and re-loaded, hence the warning.
lunr.Index.prototype.tokenizer = function (fn) {
  var isRegistered = fn.label && (fn.label in lunr.tokenizer.registeredFunctions)

  if (!isRegistered) {
    lunr.utils.warn('Function is not a registered tokenizer. This may cause problems when serialising the index')
  }

  this.tokenizerFn = fn
  return this
}
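// Illustrative only, not part of the original source: a minimal sketch of how
// the configuration methods above are normally used, assuming the standard
// lunr(configFn) convenience wrapper that constructs a lunr.Index and invokes
// the function with the new index as `this`.
//
//   var idx = lunr(function () {
//     this.ref('id')
//     this.field('title', { boost: 10 })
//     this.field('body')
//   })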
// Adds a document to the index. Each configured field is tokenised and run
// through the pipeline; a combined term frequency, normalised by field length
// and weighted by field boost, is stored per token in the token store.
lunr.Index.prototype.add = function (doc, emitEvent) {
  var docTokens = {},
      allDocumentTokens = new lunr.SortedSet,
      docRef = doc[this._ref],
      emitEvent = emitEvent === undefined ? true : emitEvent

  this._fields.forEach(function (field) {
    var fieldTokens = this.pipeline.run(this.tokenizerFn(doc[field.name]))

    docTokens[field.name] = fieldTokens

    for (var i = 0; i < fieldTokens.length; i++) {
      var token = fieldTokens[i]
      allDocumentTokens.add(token)
      this.corpusTokens.add(token)
    }
  }, this)

  this.documentStore.set(docRef, allDocumentTokens)

  for (var i = 0; i < allDocumentTokens.length; i++) {
    var token = allDocumentTokens.elements[i]
    var tf = 0

    for (var j = 0; j < this._fields.length; j++) {
      var field = this._fields[j],
          fieldTokens = docTokens[field.name],
          fieldLength = fieldTokens.length

      if (!fieldLength) continue

      var tokenCount = 0
      for (var k = 0; k < fieldLength; k++) {
        if (fieldTokens[k] === token) {
          tokenCount++
        }
      }

      // normalise by field length and weight by the field's boost
      tf += (tokenCount / fieldLength * field.boost)
    }

    this.tokenStore.add(token, { ref: docRef, tf: tf })
  }

  if (emitEvent) this.eventEmitter.emit('add', doc, this)
}
// Removes a document from the index; a no-op if the ref is not present.
lunr.Index.prototype.remove = function (doc, emitEvent) {
  var docRef = doc[this._ref],
      emitEvent = emitEvent === undefined ? true : emitEvent

  if (!this.documentStore.has(docRef)) return

  var docTokens = this.documentStore.get(docRef)
  this.documentStore.remove(docRef)

  docTokens.forEach(function (token) {
    this.tokenStore.remove(token, docRef)
  }, this)

  if (emitEvent) this.eventEmitter.emit('remove', doc, this)
}
lunr.Index.prototype.update = function (doc, emitEvent) {
  var emitEvent = emitEvent === undefined ? true : emitEvent

  this.remove(doc, false)
  this.add(doc, false)

  if (emitEvent) this.eventEmitter.emit('update', doc, this)
}
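// Illustrative only, not part of the original source: the document lifecycle
// against the index from the earlier sketch. The field values and ref are
// invented for the example.
//
//   idx.add({ id: 1, title: 'Twelfth Night', body: 'If music be the food of love...' })
//   idx.update({ id: 1, title: 'Twelfth Night', body: 'If music be the food of love, play on' })
//   idx.remove({ id: 1 })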
// Inverse document frequency for a term: 1 + ln(totalDocs / docsContainingTerm),
// memoised until the index next changes.
lunr.Index.prototype.idf = function (term) {
  var cacheKey = "@" + term
  if (Object.prototype.hasOwnProperty.call(this._idfCache, cacheKey)) return this._idfCache[cacheKey]

  var documentFrequency = this.tokenStore.count(term),
      idf = 1

  if (documentFrequency > 0) {
    idf = 1 + Math.log(this.documentStore.length / documentFrequency)
  }

  return this._idfCache[cacheKey] = idf
}
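// Illustrative only: with 100 documents in the store and a term appearing in
// 10 of them, idf = 1 + Math.log(100 / 10) = 1 + ln(10) ≈ 3.30; a term found
// in every document scores 1 + ln(1) = 1, the minimum.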
// Searches the index. The query is tokenised and piped like a document, a
// tf-idf query vector is built from the (expanded) query tokens, and every
// document that matches all query tokens is scored by the cosine similarity
// between the query vector and its document vector.
lunr.Index.prototype.search = function (query) {
  var queryTokens = this.pipeline.run(this.tokenizerFn(query)),
      queryVector = new lunr.Vector,
      documentSets = [],
      fieldBoosts = this._fields.reduce(function (memo, f) { return memo + f.boost }, 0)

  var hasSomeToken = queryTokens.some(function (token) {
    return this.tokenStore.has(token)
  }, this)

  if (!hasSomeToken) return []

  queryTokens
    .forEach(function (token, i, tokens) {
      var tf = 1 / tokens.length * this._fields.length * fieldBoosts,
          self = this

      var set = this.tokenStore.expand(token).reduce(function (memo, key) {
        var pos = self.corpusTokens.indexOf(key),
            idf = self.idf(key),
            similarityBoost = 1,
            set = new lunr.SortedSet

        // if the expanded key is not an exact match to the token then
        // penalise the score for this key by how different the key is
        // to the token.
        if (key !== token) {
          var diff = Math.max(3, key.length - token.length)
          similarityBoost = 1 / Math.log(diff)
        }

        // calculate the query tf-idf score for this key, boosted so that
        // exact matches rank higher than expanded terms
        if (pos > -1) queryVector.insert(pos, tf * idf * similarityBoost)

        // collect every document that contains this key
        var matchingDocuments = self.tokenStore.get(key),
            refs = Object.keys(matchingDocuments),
            refsLen = refs.length

        for (var i = 0; i < refsLen; i++) {
          set.add(matchingDocuments[refs[i]].ref)
        }

        return memo.union(set)
      }, new lunr.SortedSet)

      documentSets.push(set)
    }, this)

  // only documents that matched every query token survive the intersection
  var documentSet = documentSets.reduce(function (memo, set) {
    return memo.intersect(set)
  })

  return documentSet
    .map(function (ref) {
      return { ref: ref, score: queryVector.similarity(this.documentVector(ref)) }
    }, this)
    .sort(function (a, b) {
      return b.score - a.score
    })
}
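// Illustrative only, not part of the original source: results come back as an
// array of { ref, score } objects sorted by descending score, e.g.
//
//   idx.search('music')
//   // => [{ ref: 1, score: 0.42 }, ...]   (scores here are made up)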
// Builds the tf-idf vector for a stored document, positioned against the
// corpus-wide token list so it can be compared with a query vector.
lunr.Index.prototype.documentVector = function (documentRef) {
  var documentTokens = this.documentStore.get(documentRef),
      documentTokensLength = documentTokens.length,
      documentVector = new lunr.Vector

  for (var i = 0; i < documentTokensLength; i++) {
    var token = documentTokens.elements[i],
        tf = this.tokenStore.get(token)[documentRef].tf,
        idf = this.idf(token)

    documentVector.insert(this.corpusTokens.indexOf(token), tf * idf)
  }

  return documentVector
}
// Plain-object representation of the index, picked up by JSON.stringify and
// consumed by lunr.Index.load.
lunr.Index.prototype.toJSON = function () {
  return {
    version: lunr.version,
    fields: this._fields,
    ref: this._ref,
    tokenizer: this.tokenizerFn.label,
    documentStore: this.documentStore.toJSON(),
    tokenStore: this.tokenStore.toJSON(),
    corpusTokens: this.corpusTokens.toJSON(),
    pipeline: this.pipeline.toJSON()
  }
}
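// Illustrative only, not part of the original source: a serialisation
// round-trip, assuming the pipeline functions and tokenizer in use were
// registered ones so they can be re-attached on load.
//
//   var serialised = JSON.stringify(idx)
//   var restored = lunr.Index.load(JSON.parse(serialised))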
// Applies a plugin to the index. The plugin function is called with the index
// as `this` and as its first argument, followed by any extra arguments.
lunr.Index.prototype.use = function (plugin) {
  var args = Array.prototype.slice.call(arguments, 1)
  args.unshift(this)
  plugin.apply(this, args)
}
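// Illustrative only, not part of the original source: a hypothetical plugin
// that adds a pass-through function to the index's pipeline.
//
//   var myPlugin = function (idx, someOption) {
//     idx.pipeline.add(function (token) { return token })
//   }
//   idx.use(myPlugin, { /* someOption */ })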