123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360 |
/**
 * Collects field definitions and documents and accumulates the state
 * (inverted index, per-field term frequencies and lengths) from which
 * `build` later creates a lunr.Index.
 *
 * @constructor
 */
lunr.Builder = function () {
  // Name of the document property that `add` reads as the document reference.
  this._ref = "id";

  // Registered fields and per-document attributes; prototype-less so that
  // arbitrary names can be used as keys.
  this._fields = Object.create(null);
  this._documents = Object.create(null);

  // term -> posting; each posting carries an `_index` and one map per field.
  this.invertedIndex = Object.create(null);

  // Both keyed by the string form of a lunr.FieldRef.
  this.fieldTermFrequencies = {};
  this.fieldLengths = {};

  // `tokenizer` and `pipeline` are applied to field values in `add`;
  // `searchPipeline` is handed to the index created by `build`.
  this.tokenizer = lunr.tokenizer;
  this.pipeline = new lunr.Pipeline();
  this.searchPipeline = new lunr.Pipeline();

  this.documentCount = 0;

  // Scoring parameters read by `createFieldVectors`: `_b` weights the
  // field-length ratio, `_k1` scales the term-frequency component.
  this._b = 0.75;
  this._k1 = 1.2;

  // Next `_index` value to hand out to a newly seen term.
  this.termIndex = 0;

  // Token metadata keys that `add` copies into the inverted index.
  this.metadataWhitelist = [];
};
/**
 * Sets the name of the document property that `add` uses to obtain a
 * document's reference.
 *
 * @param {string} ref - Property name to read from each added document.
 */
lunr.Builder.prototype.ref = function (ref) {
  this._ref = ref;
};
/**
 * Registers a field to be indexed for every document passed to `add`.
 *
 * @param {string} fieldName - Name of the field; must not contain '/'.
 * @param {object} [attributes] - Optional field attributes. Other methods
 *   read `extractor` (in `add`) and `boost` (in `createFieldVectors`).
 * @throws {RangeError} When fieldName contains a '/' character.
 */
lunr.Builder.prototype.field = function (fieldName, attributes) {
  var containsSlash = /\//.test(fieldName);

  if (containsSlash) {
    throw new RangeError("Field '" + fieldName + "' contains illegal character '/'");
  }

  this._fields[fieldName] = attributes ? attributes : {};
};
/**
 * Sets the `_b` scoring parameter, which weights the field-length ratio in
 * `createFieldVectors`. Values below 0 are stored as 0 and values above 1
 * are stored as 1; anything else is stored unchanged.
 *
 * @param {number} number - Requested value for the parameter.
 */
lunr.Builder.prototype.b = function (number) {
  this._b = number < 0 ? 0 : (number > 1 ? 1 : number);
};
/**
 * Sets the `_k1` scoring parameter used by `createFieldVectors`. The value
 * is stored as given, without validation or clamping.
 *
 * @param {number} number - New value for the parameter.
 */
lunr.Builder.prototype.k1 = function (number) {
  this._k1 = number;
};
/**
 * Adds a document to the builder: tokenizes every registered field, runs the
 * tokens through the pipeline, and records term frequencies, field lengths
 * and inverted-index postings for the resulting terms.
 *
 * @param {object} doc - Document to index; its reference is read from the
 *   property named by `this._ref`.
 * @param {object} [attributes] - Optional per-document attributes, stored
 *   under the document reference (`createFieldVectors` reads `boost`).
 */
lunr.Builder.prototype.add = function (doc, attributes) {
  var docRef = doc[this._ref];
  var fieldNames = Object.keys(this._fields);

  this._documents[docRef] = attributes || {};
  this.documentCount += 1;

  for (var f = 0; f < fieldNames.length; f++) {
    var fieldName = fieldNames[f];
    var extractor = this._fields[fieldName].extractor;
    // A field-level extractor, when present, takes precedence over the
    // plain property lookup.
    var fieldValue = extractor ? extractor(doc) : doc[fieldName];
    var tokens = this.tokenizer(fieldValue, { fields: [fieldName] });
    var terms = this.pipeline.run(tokens);
    var fieldRef = new lunr.FieldRef(docRef, fieldName);
    var termCounts = Object.create(null);

    this.fieldTermFrequencies[fieldRef] = termCounts;

    // Field length is the number of terms left after the pipeline has run.
    this.fieldLengths[fieldRef] = 0;
    this.fieldLengths[fieldRef] += terms.length;

    for (var t = 0; t < terms.length; t++) {
      var term = terms[t];

      // Term frequency within this field of this document.
      if (termCounts[term] == undefined) {
        termCounts[term] = 0;
      }
      termCounts[term] += 1;

      // First sighting of the term: create its posting with a fresh
      // `_index` and an empty map for every registered field.
      var posting = this.invertedIndex[term];
      if (posting == undefined) {
        posting = Object.create(null);
        posting["_index"] = this.termIndex;
        this.termIndex += 1;

        for (var k = 0; k < fieldNames.length; k++) {
          posting[fieldNames[k]] = Object.create(null);
        }

        this.invertedIndex[term] = posting;
      }

      // Make sure there is an entry for this term / field / document.
      var fieldPosting = posting[fieldName];
      if (fieldPosting[docRef] == undefined) {
        fieldPosting[docRef] = Object.create(null);
      }
      var docEntry = fieldPosting[docRef];

      // Append every whitelisted piece of token metadata to the entry.
      for (var m = 0; m < this.metadataWhitelist.length; m++) {
        var metadataKey = this.metadataWhitelist[m];
        var metadataValue = term.metadata[metadataKey];

        if (docEntry[metadataKey] == undefined) {
          docEntry[metadataKey] = [];
        }
        docEntry[metadataKey].push(metadataValue);
      }
    }
  }
};
/**
 * Computes, for every registered field, the average number of terms per
 * document and stores the result on `this.averageFieldLength`, keyed by
 * field name.
 *
 * The running totals are kept in prototype-less objects and initialised with
 * an explicit `undefined` check. With plain `{}` objects and a truthiness
 * check, a field named after an Object.prototype member (for example
 * "toString" or "constructor") would pick up the inherited function, turn
 * the `+=` into string concatenation and yield NaN as its average.
 *
 * A registered field for which no length has been recorded gets NaN
 * (undefined / undefined).
 */
lunr.Builder.prototype.calculateAverageFieldLengths = function () {
  var fieldRefs = Object.keys(this.fieldLengths);
  var lengthTotals = Object.create(null);
  var documentsWithField = Object.create(null);
  var i;

  // Sum the recorded lengths and count the documents, grouped by field name.
  for (i = 0; i < fieldRefs.length; i++) {
    var fieldRef = lunr.FieldRef.fromString(fieldRefs[i]);
    var field = fieldRef.fieldName;

    if (documentsWithField[field] === undefined) {
      documentsWithField[field] = 0;
    }
    documentsWithField[field] += 1;

    if (lengthTotals[field] === undefined) {
      lengthTotals[field] = 0;
    }
    lengthTotals[field] += this.fieldLengths[fieldRef];
  }

  // Turn each total into an average, in place.
  var fields = Object.keys(this._fields);

  for (i = 0; i < fields.length; i++) {
    var fieldName = fields[i];
    lengthTotals[fieldName] = lengthTotals[fieldName] / documentsWithField[fieldName];
  }

  this.averageFieldLength = lengthTotals;
};
/**
 * Builds one lunr.Vector per document field, holding a score for every term
 * that occurs in that field, and stores the vectors on `this.fieldVectors`
 * keyed by field reference.
 *
 * Reads `this.averageFieldLength`, which `build` populates beforehand by
 * calling `calculateAverageFieldLengths`.
 */
lunr.Builder.prototype.createFieldVectors = function () {
  var vectors = {};
  var refKeys = Object.keys(this.fieldTermFrequencies);
  var refCount = refKeys.length;
  // idf depends only on the term, so compute it once per term.
  var idfByTerm = Object.create(null);

  for (var r = 0; r < refCount; r++) {
    var fieldRef = lunr.FieldRef.fromString(refKeys[r]);
    var fieldName = fieldRef.fieldName;
    var fieldLength = this.fieldLengths[fieldRef];
    var vector = new lunr.Vector();
    var frequencies = this.fieldTermFrequencies[fieldRef];
    var termList = Object.keys(frequencies);
    var termCount = termList.length;

    // A missing (or otherwise falsy) boost counts as 1.
    var fieldBoost = this._fields[fieldName].boost || 1;
    var docBoost = this._documents[fieldRef.docRef].boost || 1;

    for (var t = 0; t < termCount; t++) {
      var term = termList[t];
      var tf = frequencies[term];
      var termIndex = this.invertedIndex[term]._index;

      var idf = idfByTerm[term];
      if (idf === undefined) {
        idf = lunr.idf(this.invertedIndex[term], this.documentCount);
        idfByTerm[term] = idf;
      }

      // Term frequency is scaled by `_k1` and normalised by how long this
      // field is relative to the average length for the field, weighted by
      // `_b`.
      var numerator = idf * ((this._k1 + 1) * tf);
      var denominator = this._k1 * (1 - this._b + this._b * (fieldLength / this.averageFieldLength[fieldName])) + tf;
      var score = numerator / denominator;

      score = score * fieldBoost * docBoost;

      // Round to three decimal places before storing.
      var roundedScore = Math.round(score * 1000) / 1000;

      vector.insert(termIndex, roundedScore);
    }

    vectors[fieldRef] = vector;
  }

  this.fieldVectors = vectors;
};
/**
 * Creates `this.tokenSet` from every term currently present in the inverted
 * index, passing the terms to lunr.TokenSet.fromArray in sorted order.
 */
lunr.Builder.prototype.createTokenSet = function () {
  var sortedTerms = Object.keys(this.invertedIndex).sort();

  this.tokenSet = lunr.TokenSet.fromArray(sortedTerms);
};
/**
 * Finalises the builder: computes average field lengths, field vectors and
 * the token set (in that order), then wraps the results in a new lunr.Index.
 *
 * @returns {lunr.Index} Index constructed from the builder's current state.
 */
lunr.Builder.prototype.build = function () {
  this.calculateAverageFieldLengths();
  this.createFieldVectors();
  this.createTokenSet();

  var indexAttributes = {
    invertedIndex: this.invertedIndex,
    fieldVectors: this.fieldVectors,
    tokenSet: this.tokenSet,
    fields: Object.keys(this._fields),
    // The index receives the search pipeline, not the indexing pipeline.
    pipeline: this.searchPipeline
  };

  return new lunr.Index(indexAttributes);
};
/**
 * Applies a plugin function to this builder. The function is invoked with
 * the builder both as `this` and as its first argument, followed by any
 * extra arguments that were passed to `use`.
 *
 * @param {Function} fn - Plugin function to apply.
 */
lunr.Builder.prototype.use = function (fn) {
  var extraArgs = Array.prototype.slice.call(arguments, 1);
  var pluginArgs = [this].concat(extraArgs);

  fn.apply(this, pluginArgs);
};
|