builder.js 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360
  1. /*!
  2. * lunr.Builder
  3. * Copyright (C) @YEAR Oliver Nightingale
  4. */
  /**
   * lunr.Builder collects everything needed to index a set of documents and
   * produces instances of lunr.Index that are ready for querying.
   *
   * All configuration of the index happens on the builder: the document
   * reference field, the fields to index, the text processing pipelines and
   * the document scoring parameters should all be set before indexing starts.
   *
   * @constructor
   * @property {string} _ref - Name of the document field used as the document reference.
   * @property {object} _fields - Maps the name of each field to index to the attributes given for it.
   * @property {object} _documents - Maps each document ref to the attributes given when the document was added.
   * @property {object} invertedIndex - The inverted index maps terms to document fields.
   * @property {object} fieldTermFrequencies - Keeps track of term frequencies, keyed by field ref.
   * @property {object} fieldLengths - Keeps track of the number of terms in each field, keyed by field ref.
   * @property {lunr.tokenizer} tokenizer - Function for splitting strings into tokens for indexing.
   * @property {lunr.Pipeline} pipeline - The pipeline performs text processing on tokens before indexing.
   * @property {lunr.Pipeline} searchPipeline - A pipeline for processing search terms before querying the index.
   * @property {number} documentCount - Keeps track of the total number of documents indexed.
   * @property {number} _b - A parameter to control field length normalization, setting this to 0 disables normalization, 1 fully normalizes field lengths, the default value is 0.75.
   * @property {number} _k1 - A parameter to control how quickly an increase in term frequency results in term frequency saturation, the default value is 1.2.
   * @property {number} termIndex - A counter incremented for each unique term, used to identify a term's position in the vector space.
   * @property {array} metadataWhitelist - A list of metadata keys that have been whitelisted for entry in the index.
   */
  lunr.Builder = function () {
    // what to index
    this._ref = "id"
    this._fields = Object.create(null)
    this._documents = Object.create(null)

    // state accumulated while documents are added
    this.invertedIndex = Object.create(null)
    this.fieldTermFrequencies = {}
    this.fieldLengths = {}

    // text processing
    this.tokenizer = lunr.tokenizer
    this.pipeline = new lunr.Pipeline()
    this.searchPipeline = new lunr.Pipeline()

    // counters and scoring parameters
    this.documentCount = 0
    this._b = 0.75
    this._k1 = 1.2
    this.termIndex = 0
    this.metadataWhitelist = []
  }
  /**
   * Chooses which document field acts as the document reference. Every
   * document must have this field. The value is used as an object key while
   * indexing, so values that are not strings end up coerced into strings.
   *
   * When this method is never called the ref is 'id'.
   *
   * The ref should _not_ be changed during indexing; set it before any
   * documents are added to the index. Changing it part way through indexing
   * can lead to inconsistent results.
   *
   * @param {string} ref - The name of the reference field in the document.
   */
  lunr.Builder.prototype.ref = function (ref) {
    this._ref = ref
  }
  /**
   * A function that is used to extract a field from a document.
   *
   * Lunr expects a field to be at the top level of a document. If, however, the field
   * is deeply nested within a document, an extractor function can be used to extract
   * the right field for indexing.
   *
   * @callback fieldExtractor
   * @param {object} doc - The document being added to the index.
   * @returns {?(string|object|object[])} obj - The object that will be indexed for this field.
   * @example <caption>Extracting a nested field</caption>
   * function (doc) { return doc.nested.field }
   */
  /**
   * Adds a field to the list of document fields that will be indexed. Every document being
   * indexed should have this field. Null values for this field in indexed documents will
   * not cause errors but will limit the chance of that document being retrieved by searches.
   *
   * All fields should be added before adding documents to the index. Adding fields after
   * a document has been indexed will have no effect on already indexed documents. Terms
   * that are already in the inverted index are given an empty posting for the new field,
   * so documents added afterwards can be indexed under it.
   *
   * Fields can be boosted at build time. This allows terms within that field to have more
   * importance when ranking search results. Use a field boost to specify that matches within
   * one field are more important than other fields.
   *
   * @param {string} fieldName - The name of a field to index in all documents.
   * @param {object} attributes - Optional attributes associated with this field.
   * @param {number} [attributes.boost=1] - Boost applied to all terms within this field.
   * @param {fieldExtractor} [attributes.extractor] - Function to extract a field from a document.
   * @throws {RangeError} fieldName cannot contain unsupported characters '/'
   */
  lunr.Builder.prototype.field = function (fieldName, attributes) {
    if (/\//.test(fieldName)) {
      throw new RangeError ("Field '" + fieldName + "' contains illegal character '/'")
    }

    this._fields[fieldName] = attributes || {}

    // A posting only gets per-field entries for the fields that were known
    // when its term was first seen. Without this backfill, adding a document
    // whose new field contains an already indexed term would dereference an
    // undefined field posting and throw. This is a no-op while the inverted
    // index is still empty, i.e. for the recommended order of calls.
    var knownTerms = Object.keys(this.invertedIndex)

    for (var i = 0; i < knownTerms.length; i++) {
      var posting = this.invertedIndex[knownTerms[i]]

      if (posting[fieldName] === undefined) {
        posting[fieldName] = Object.create(null)
      }
    }
  }
  /**
   * Tunes how much field length normalisation is applied when relevance
   * scores are calculated. 0 switches normalisation off completely and 1
   * normalises field lengths fully. Unless changed the builder uses 0.75.
   * Values below 0 are stored as 0 and values above 1 are stored as 1.
   *
   * @param {number} number - The value to set for this tuning parameter.
   */
  lunr.Builder.prototype.b = function (number) {
    // anything that is neither below 0 nor above 1 is stored untouched
    this._b = number < 0 ? 0 : (number > 1 ? 1 : number)
  }
  /**
   * Controls how quickly a rise in term frequency leads to term frequency
   * saturation. Unless changed the builder uses 1.2. Higher values make
   * saturation slower, lower values make it quicker.
   *
   * The value is stored as given; no clamping or validation is applied.
   *
   * @param {number} number - The value to set for this tuning parameter.
   */
  lunr.Builder.prototype.k1 = function (number) {
    this._k1 = number
  }
  /**
   * Adds a document to the index.
   *
   * The builder should be fully set up before documents are added, with the
   * document ref and all fields to index already specified.
   *
   * The document must have a field named as specified by the ref (by default
   * this is 'id') and it should have all fields defined for indexing, though
   * null or undefined values will not cause errors.
   *
   * Entire documents can be boosted at build time. Applying a boost to a
   * document indicates that this document should rank higher in search
   * results than other documents.
   *
   * For every configured field the value is extracted (through the field's
   * extractor when one is configured), tokenized and run through the
   * pipeline. The resulting terms update the per-field term frequencies, the
   * field length and the inverted index, including any whitelisted token
   * metadata.
   *
   * @param {object} doc - The document to add to the index.
   * @param {object} attributes - Optional attributes associated with this document.
   * @param {number} [attributes.boost=1] - Boost applied to all terms within this document.
   */
  lunr.Builder.prototype.add = function (doc, attributes) {
    var docRef = doc[this._ref],
        fieldNames = Object.keys(this._fields)

    this._documents[docRef] = attributes || {}
    this.documentCount += 1

    for (var f = 0; f < fieldNames.length; f++) {
      var fieldName = fieldNames[f],
          extractor = this._fields[fieldName].extractor,
          fieldValue = extractor ? extractor(doc) : doc[fieldName],
          tokens = this.tokenizer(fieldValue, { fields: [fieldName] }),
          terms = this.pipeline.run(tokens),
          fieldRef = new lunr.FieldRef (docRef, fieldName),
          termCounts = Object.create(null)

      this.fieldTermFrequencies[fieldRef] = termCounts

      // the field length is the number of terms left after the pipeline ran
      this.fieldLengths[fieldRef] = 0
      this.fieldLengths[fieldRef] += terms.length

      for (var t = 0; t < terms.length; t++) {
        var term = terms[t]

        // count this occurrence of the term within the field
        termCounts[term] = (termCounts[term] == null ? 0 : termCounts[term]) + 1

        // the first time a term is seen it gets a posting holding its
        // position in the vector space and an empty entry for every field
        var posting = this.invertedIndex[term]

        if (posting == null) {
          posting = Object.create(null)
          posting["_index"] = this.termIndex
          this.termIndex += 1

          for (var p = 0; p < fieldNames.length; p++) {
            posting[fieldNames[p]] = Object.create(null)
          }

          this.invertedIndex[term] = posting
        }

        // make sure there is an entry for this term/fieldName/docRef
        var fieldPosting = posting[fieldName],
            docPosting = fieldPosting[docRef]

        if (docPosting == null) {
          docPosting = fieldPosting[docRef] = Object.create(null)
        }

        // copy every whitelisted piece of token metadata into that entry,
        // one array element per occurrence of the term
        var whitelist = this.metadataWhitelist

        for (var m = 0; m < whitelist.length; m++) {
          var metadataKey = whitelist[m],
              metadataValue = term.metadata[metadataKey]

          if (docPosting[metadataKey] == null) {
            docPosting[metadataKey] = []
          }

          docPosting[metadataKey].push(metadataValue)
        }
      }
    }
  }
  /**
   * Calculates the average length of each field across the documents that
   * have been indexed with that field, and stores the result, keyed by field
   * name, in `averageFieldLength`.
   *
   * A configured field for which no document has been indexed ends up with
   * an average of NaN.
   *
   * @private
   */
  lunr.Builder.prototype.calculateAverageFieldLengths = function () {
    // Both dictionaries are keyed by user supplied field names, so they must
    // not inherit from Object.prototype: with a plain object a field called
    // e.g. "constructor" would pick up the inherited member instead of
    // starting from zero and the average would become NaN.
    var fieldRefs = Object.keys(this.fieldLengths),
        numberOfFields = fieldRefs.length,
        accumulator = Object.create(null),
        documentsWithField = Object.create(null)

    for (var i = 0; i < numberOfFields; i++) {
      var fieldRefString = fieldRefs[i],
          field = lunr.FieldRef.fromString(fieldRefString).fieldName

      documentsWithField[field] || (documentsWithField[field] = 0)
      documentsWithField[field] += 1

      accumulator[field] || (accumulator[field] = 0)
      // look the length up under the exact key it was stored with
      accumulator[field] += this.fieldLengths[fieldRefString]
    }

    var fields = Object.keys(this._fields)

    // turn the per-field totals into averages
    for (var j = 0; j < fields.length; j++) {
      var fieldName = fields[j]
      accumulator[fieldName] = accumulator[fieldName] / documentsWithField[fieldName]
    }

    this.averageFieldLength = accumulator
  }
  /**
   * Builds a vector space model of every indexed document field using
   * lunr.Vector and stores the vectors, keyed by field ref, in `fieldVectors`.
   *
   * Each term of a field is scored from its inverse document frequency, its
   * frequency within the field and the field length relative to the average
   * length of that field, tuned by `_k1` and `_b`. The score is then
   * multiplied by the field boost and the document boost. A missing or falsy
   * boost counts as 1.
   *
   * @private
   */
  lunr.Builder.prototype.createFieldVectors = function () {
    var vectors = {},
        fieldRefStrings = Object.keys(this.fieldTermFrequencies),
        idfByTerm = Object.create(null),
        k1 = this._k1,
        b = this._b

    for (var f = 0; f < fieldRefStrings.length; f++) {
      var fieldRef = lunr.FieldRef.fromString(fieldRefStrings[f]),
          fieldName = fieldRef.fieldName,
          fieldLength = this.fieldLengths[fieldRef],
          vector = new lunr.Vector(),
          frequencies = this.fieldTermFrequencies[fieldRef],
          fieldTerms = Object.keys(frequencies),
          fieldBoost = this._fields[fieldName].boost || 1,
          docBoost = this._documents[fieldRef.docRef].boost || 1

      for (var t = 0; t < fieldTerms.length; t++) {
        var term = fieldTerms[t],
            tf = frequencies[term],
            termIndex = this.invertedIndex[term]._index,
            idf = idfByTerm[term]

        // the idf of a term is the same for every field, so work it out once
        if (idf === undefined) {
          idf = lunr.idf(this.invertedIndex[term], this.documentCount)
          idfByTerm[term] = idf
        }

        var lengthNorm = 1 - b + b * (fieldLength / this.averageFieldLength[fieldName]),
            numerator = idf * ((k1 + 1) * tf),
            denominator = k1 * lengthNorm + tf,
            score = numerator / denominator

        score *= fieldBoost
        score *= docBoost

        // Round to three decimal places, e.g. 1.23456789 becomes 1.235.
        // Reducing the precision makes the vectors take up less space when
        // serialised. It is done here so that the vectors behave the same
        // before and after serialisation.
        var scoreWithPrecision = Math.round(score * 1000) / 1000

        vector.insert(termIndex, scoreWithPrecision)
      }

      vectors[fieldRef] = vector
    }

    this.fieldVectors = vectors
  }
  /**
   * Creates a lunr.TokenSet from the sorted list of all terms in the
   * inverted index and stores it in `tokenSet`.
   *
   * @private
   */
  lunr.Builder.prototype.createTokenSet = function () {
    var sortedTerms = Object.keys(this.invertedIndex)
    sortedTerms.sort()

    this.tokenSet = lunr.TokenSet.fromArray(sortedTerms)
  }
  /**
   * Builds the index, creating an instance of lunr.Index.
   *
   * This finishes the indexing process: the average field lengths, the field
   * vectors and the token set are calculated, in that order, from the
   * documents added so far. It should only be called once all documents have
   * been added to the index.
   *
   * @returns {lunr.Index}
   */
  lunr.Builder.prototype.build = function () {
    // the field vectors depend on the average field lengths, so the order
    // of these calls matters
    this.calculateAverageFieldLengths()
    this.createFieldVectors()
    this.createTokenSet()

    var indexAttributes = {
      invertedIndex: this.invertedIndex,
      fieldVectors: this.fieldVectors,
      tokenSet: this.tokenSet,
      fields: Object.keys(this._fields),
      pipeline: this.searchPipeline
    }

    return new lunr.Index(indexAttributes)
  }
  /**
   * Applies a plugin to the index builder.
   *
   * A plugin is just a function that encapsulates custom behaviour which
   * should be applied when building the index. Plugins can be used to
   * customise or extend the behaviour of the index in some way.
   *
   * The plugin function is called with the index builder both as its context
   * and as its first argument. Any additional arguments passed to use are
   * handed on to the plugin after the builder.
   *
   * @param {Function} fn - The plugin to apply.
   */
  lunr.Builder.prototype.use = function (fn) {
    // the builder itself always comes first, followed by the extra arguments
    var pluginArgs = [this]

    for (var i = 1; i < arguments.length; i++) {
      pluginArgs.push(arguments[i])
    }

    fn.apply(this, pluginArgs)
  }