index.js 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492
  1. /*!
  2. * lunr.Index
  3. * Copyright (C) @YEAR Oliver Nightingale
  4. */
  5. /**
  6. * An index contains the built index of all documents and provides a query interface
  7. * to the index.
  8. *
  9. * Usually instances of lunr.Index will not be created using this constructor, instead
  10. * lunr.Builder should be used to construct new indexes, or lunr.Index.load should be
  11. * used to load previously built and serialized indexes.
  12. *
  13. * @constructor
  14. * @param {Object} attrs - The attributes of the built search index.
  15. * @param {Object} attrs.invertedIndex - An index of term/field to document reference.
  16. * @param {Object<string, lunr.Vector>} attrs.fieldVectors - Field vectors
  17. * @param {lunr.TokenSet} attrs.tokenSet - An set of all corpus tokens.
  18. * @param {string[]} attrs.fields - The names of indexed document fields.
  19. * @param {lunr.Pipeline} attrs.pipeline - The pipeline to use for search terms.
  20. */
  21. lunr.Index = function (attrs) {
  22. this.invertedIndex = attrs.invertedIndex
  23. this.fieldVectors = attrs.fieldVectors
  24. this.tokenSet = attrs.tokenSet
  25. this.fields = attrs.fields
  26. this.pipeline = attrs.pipeline
  27. }
  28. /**
  29. * A result contains details of a document matching a search query.
  30. * @typedef {Object} lunr.Index~Result
  31. * @property {string} ref - The reference of the document this result represents.
  32. * @property {number} score - A number between 0 and 1 representing how similar this document is to the query.
  33. * @property {lunr.MatchData} matchData - Contains metadata about this match including which term(s) caused the match.
  34. */
  35. /**
  36. * Although lunr provides the ability to create queries using lunr.Query, it also provides a simple
  37. * query language which itself is parsed into an instance of lunr.Query.
  38. *
  39. * For programmatically building queries it is advised to directly use lunr.Query, the query language
  40. * is best used for human entered text rather than program generated text.
  41. *
  42. * At its simplest queries can just be a single term, e.g. `hello`, multiple terms are also supported
  43. * and will be combined with OR, e.g `hello world` will match documents that contain either 'hello'
  44. * or 'world', though those that contain both will rank higher in the results.
  45. *
  46. * Wildcards can be included in terms to match one or more unspecified characters, these wildcards can
  47. * be inserted anywhere within the term, and more than one wildcard can exist in a single term. Adding
  48. * wildcards will increase the number of documents that will be found but can also have a negative
  49. * impact on query performance, especially with wildcards at the beginning of a term.
  50. *
  51. * Terms can be restricted to specific fields, e.g. `title:hello`, only documents with the term
  52. * hello in the title field will match this query. Using a field not present in the index will lead
  53. * to an error being thrown.
  54. *
  55. * Modifiers can also be added to terms, lunr supports edit distance and boost modifiers on terms. A term
  56. * boost will make documents matching that term score higher, e.g. `foo^5`. Edit distance is also supported
  57. * to provide fuzzy matching, e.g. 'hello~2' will match documents with hello with an edit distance of 2.
  58. * Avoid large values for edit distance to improve query performance.
  59. *
  60. * Each term also supports a presence modifier. By default a term's presence in document is optional, however
  61. * this can be changed to either required or prohibited. For a term's presence to be required in a document the
  62. * term should be prefixed with a '+', e.g. `+foo bar` is a search for documents that must contain 'foo' and
  63. * optionally contain 'bar'. Conversely a leading '-' sets the terms presence to prohibited, i.e. it must not
  64. * appear in a document, e.g. `-foo bar` is a search for documents that do not contain 'foo' but may contain 'bar'.
  65. *
  66. * To escape special characters the backslash character '\' can be used, this allows searches to include
  67. * characters that would normally be considered modifiers, e.g. `foo\~2` will search for a term "foo~2" instead
  68. * of attempting to apply a boost of 2 to the search term "foo".
  69. *
  70. * @typedef {string} lunr.Index~QueryString
  71. * @example <caption>Simple single term query</caption>
  72. * hello
  73. * @example <caption>Multiple term query</caption>
  74. * hello world
  75. * @example <caption>term scoped to a field</caption>
  76. * title:hello
  77. * @example <caption>term with a boost of 10</caption>
  78. * hello^10
  79. * @example <caption>term with an edit distance of 2</caption>
  80. * hello~2
  81. * @example <caption>terms with presence modifiers</caption>
  82. * -foo +bar baz
  83. */
  84. /**
  85. * Performs a search against the index using lunr query syntax.
  86. *
  87. * Results will be returned sorted by their score, the most relevant results
  88. * will be returned first. For details on how the score is calculated, please see
  89. * the {@link https://lunrjs.com/guides/searching.html#scoring|guide}.
  90. *
  91. * For more programmatic querying use lunr.Index#query.
  92. *
  93. * @param {lunr.Index~QueryString} queryString - A string containing a lunr query.
  94. * @throws {lunr.QueryParseError} If the passed query string cannot be parsed.
  95. * @returns {lunr.Index~Result[]}
  96. */
  97. lunr.Index.prototype.search = function (queryString) {
  98. return this.query(function (query) {
  99. var parser = new lunr.QueryParser(queryString, query)
  100. parser.parse()
  101. })
  102. }
  103. /**
  104. * A query builder callback provides a query object to be used to express
  105. * the query to perform on the index.
  106. *
  107. * @callback lunr.Index~queryBuilder
  108. * @param {lunr.Query} query - The query object to build up.
  109. * @this lunr.Query
  110. */
  111. /**
  112. * Performs a query against the index using the yielded lunr.Query object.
  113. *
  114. * If performing programmatic queries against the index, this method is preferred
  115. * over lunr.Index#search so as to avoid the additional query parsing overhead.
  116. *
  117. * A query object is yielded to the supplied function which should be used to
  118. * express the query to be run against the index.
  119. *
  120. * Note that although this function takes a callback parameter it is _not_ an
  121. * asynchronous operation, the callback is just yielded a query object to be
  122. * customized.
  123. *
  124. * @param {lunr.Index~queryBuilder} fn - A function that is used to build the query.
  125. * @returns {lunr.Index~Result[]}
  126. */
  127. lunr.Index.prototype.query = function (fn) {
  128. // for each query clause
  129. // * process terms
  130. // * expand terms from token set
  131. // * find matching documents and metadata
  132. // * get document vectors
  133. // * score documents
  134. var query = new lunr.Query(this.fields),
  135. matchingFields = Object.create(null),
  136. queryVectors = Object.create(null),
  137. termFieldCache = Object.create(null),
  138. requiredMatches = Object.create(null),
  139. prohibitedMatches = Object.create(null)
  140. /*
  141. * To support field level boosts a query vector is created per
  142. * field. An empty vector is eagerly created to support negated
  143. * queries.
  144. */
  145. for (var i = 0; i < this.fields.length; i++) {
  146. queryVectors[this.fields[i]] = new lunr.Vector
  147. }
  148. fn.call(query, query)
  149. for (var i = 0; i < query.clauses.length; i++) {
  150. /*
  151. * Unless the pipeline has been disabled for this term, which is
  152. * the case for terms with wildcards, we need to pass the clause
  153. * term through the search pipeline. A pipeline returns an array
  154. * of processed terms. Pipeline functions may expand the passed
  155. * term, which means we may end up performing multiple index lookups
  156. * for a single query term.
  157. */
  158. var clause = query.clauses[i],
  159. terms = null,
  160. clauseMatches = lunr.Set.empty
  161. if (clause.usePipeline) {
  162. terms = this.pipeline.runString(clause.term, {
  163. fields: clause.fields
  164. })
  165. } else {
  166. terms = [clause.term]
  167. }
  168. for (var m = 0; m < terms.length; m++) {
  169. var term = terms[m]
  170. /*
  171. * Each term returned from the pipeline needs to use the same query
  172. * clause object, e.g. the same boost and or edit distance. The
  173. * simplest way to do this is to re-use the clause object but mutate
  174. * its term property.
  175. */
  176. clause.term = term
  177. /*
  178. * From the term in the clause we create a token set which will then
  179. * be used to intersect the indexes token set to get a list of terms
  180. * to lookup in the inverted index
  181. */
  182. var termTokenSet = lunr.TokenSet.fromClause(clause),
  183. expandedTerms = this.tokenSet.intersect(termTokenSet).toArray()
  184. /*
  185. * If a term marked as required does not exist in the tokenSet it is
  186. * impossible for the search to return any matches. We set all the field
  187. * scoped required matches set to empty and stop examining any further
  188. * clauses.
  189. */
  190. if (expandedTerms.length === 0 && clause.presence === lunr.Query.presence.REQUIRED) {
  191. for (var k = 0; k < clause.fields.length; k++) {
  192. var field = clause.fields[k]
  193. requiredMatches[field] = lunr.Set.empty
  194. }
  195. break
  196. }
  197. for (var j = 0; j < expandedTerms.length; j++) {
  198. /*
  199. * For each term get the posting and termIndex, this is required for
  200. * building the query vector.
  201. */
  202. var expandedTerm = expandedTerms[j],
  203. posting = this.invertedIndex[expandedTerm],
  204. termIndex = posting._index
  205. for (var k = 0; k < clause.fields.length; k++) {
  206. /*
  207. * For each field that this query term is scoped by (by default
  208. * all fields are in scope) we need to get all the document refs
  209. * that have this term in that field.
  210. *
  211. * The posting is the entry in the invertedIndex for the matching
  212. * term from above.
  213. */
  214. var field = clause.fields[k],
  215. fieldPosting = posting[field],
  216. matchingDocumentRefs = Object.keys(fieldPosting),
  217. termField = expandedTerm + "/" + field,
  218. matchingDocumentsSet = new lunr.Set(matchingDocumentRefs)
  219. /*
  220. * if the presence of this term is required ensure that the matching
  221. * documents are added to the set of required matches for this clause.
  222. *
  223. */
  224. if (clause.presence == lunr.Query.presence.REQUIRED) {
  225. clauseMatches = clauseMatches.union(matchingDocumentsSet)
  226. if (requiredMatches[field] === undefined) {
  227. requiredMatches[field] = lunr.Set.complete
  228. }
  229. }
  230. /*
  231. * if the presence of this term is prohibited ensure that the matching
  232. * documents are added to the set of prohibited matches for this field,
  233. * creating that set if it does not yet exist.
  234. */
  235. if (clause.presence == lunr.Query.presence.PROHIBITED) {
  236. if (prohibitedMatches[field] === undefined) {
  237. prohibitedMatches[field] = lunr.Set.empty
  238. }
  239. prohibitedMatches[field] = prohibitedMatches[field].union(matchingDocumentsSet)
  240. /*
  241. * Prohibited matches should not be part of the query vector used for
  242. * similarity scoring and no metadata should be extracted so we continue
  243. * to the next field
  244. */
  245. continue
  246. }
  247. /*
  248. * The query field vector is populated using the termIndex found for
  249. * the term and a unit value with the appropriate boost applied.
  250. * Using upsert because there could already be an entry in the vector
  251. * for the term we are working with. In that case we just add the scores
  252. * together.
  253. */
  254. queryVectors[field].upsert(termIndex, clause.boost, function (a, b) { return a + b })
  255. /**
  256. * If we've already seen this term, field combo then we've already collected
  257. * the matching documents and metadata, no need to go through all that again
  258. */
  259. if (termFieldCache[termField]) {
  260. continue
  261. }
  262. for (var l = 0; l < matchingDocumentRefs.length; l++) {
  263. /*
  264. * All metadata for this term/field/document triple
  265. * are then extracted and collected into an instance
  266. * of lunr.MatchData ready to be returned in the query
  267. * results
  268. */
  269. var matchingDocumentRef = matchingDocumentRefs[l],
  270. matchingFieldRef = new lunr.FieldRef (matchingDocumentRef, field),
  271. metadata = fieldPosting[matchingDocumentRef],
  272. fieldMatch
  273. if ((fieldMatch = matchingFields[matchingFieldRef]) === undefined) {
  274. matchingFields[matchingFieldRef] = new lunr.MatchData (expandedTerm, field, metadata)
  275. } else {
  276. fieldMatch.add(expandedTerm, field, metadata)
  277. }
  278. }
  279. termFieldCache[termField] = true
  280. }
  281. }
  282. }
  283. /**
  284. * If the presence was required we need to update the requiredMatches field sets.
  285. * We do this after all fields for the term have collected their matches because
  286. * the clause terms presence is required in _any_ of the fields not _all_ of the
  287. * fields.
  288. */
  289. if (clause.presence === lunr.Query.presence.REQUIRED) {
  290. for (var k = 0; k < clause.fields.length; k++) {
  291. var field = clause.fields[k]
  292. requiredMatches[field] = requiredMatches[field].intersect(clauseMatches)
  293. }
  294. }
  295. }
  296. /**
  297. * Need to combine the field scoped required and prohibited
  298. * matching documents into a global set of required and prohibited
  299. * matches
  300. */
  301. var allRequiredMatches = lunr.Set.complete,
  302. allProhibitedMatches = lunr.Set.empty
  303. for (var i = 0; i < this.fields.length; i++) {
  304. var field = this.fields[i]
  305. if (requiredMatches[field]) {
  306. allRequiredMatches = allRequiredMatches.intersect(requiredMatches[field])
  307. }
  308. if (prohibitedMatches[field]) {
  309. allProhibitedMatches = allProhibitedMatches.union(prohibitedMatches[field])
  310. }
  311. }
  312. var matchingFieldRefs = Object.keys(matchingFields),
  313. results = [],
  314. matches = Object.create(null)
  315. /*
  316. * If the query is negated (contains only prohibited terms)
  317. * we need to get _all_ fieldRefs currently existing in the
  318. * index. This is only done when we know that the query is
  319. * entirely prohibited terms to avoid any cost of getting all
  320. * fieldRefs unnecessarily.
  321. *
  322. * Additionally, blank MatchData must be created to correctly
  323. * populate the results.
  324. */
  325. if (query.isNegated()) {
  326. matchingFieldRefs = Object.keys(this.fieldVectors)
  327. for (var i = 0; i < matchingFieldRefs.length; i++) {
  328. var matchingFieldRef = matchingFieldRefs[i]
  329. var fieldRef = lunr.FieldRef.fromString(matchingFieldRef)
  330. matchingFields[matchingFieldRef] = new lunr.MatchData
  331. }
  332. }
  333. for (var i = 0; i < matchingFieldRefs.length; i++) {
  334. /*
  335. * Currently we have document fields that match the query, but we
  336. * need to return documents. The matchData and scores are combined
  337. * from multiple fields belonging to the same document.
  338. *
  339. * Scores are calculated by field, using the query vectors created
  340. * above, and combined into a final document score using addition.
  341. */
  342. var fieldRef = lunr.FieldRef.fromString(matchingFieldRefs[i]),
  343. docRef = fieldRef.docRef
  344. if (!allRequiredMatches.contains(docRef)) {
  345. continue
  346. }
  347. if (allProhibitedMatches.contains(docRef)) {
  348. continue
  349. }
  350. var fieldVector = this.fieldVectors[fieldRef],
  351. score = queryVectors[fieldRef.fieldName].similarity(fieldVector),
  352. docMatch
  353. if ((docMatch = matches[docRef]) !== undefined) {
  354. docMatch.score += score
  355. docMatch.matchData.combine(matchingFields[fieldRef])
  356. } else {
  357. var match = {
  358. ref: docRef,
  359. score: score,
  360. matchData: matchingFields[fieldRef]
  361. }
  362. matches[docRef] = match
  363. results.push(match)
  364. }
  365. }
  366. /*
  367. * Sort the results objects by score, highest first.
  368. */
  369. return results.sort(function (a, b) {
  370. return b.score - a.score
  371. })
  372. }
  373. /**
  374. * Prepares the index for JSON serialization.
  375. *
  376. * The schema for this JSON blob will be described in a
  377. * separate JSON schema file.
  378. *
  379. * @returns {Object}
  380. */
  381. lunr.Index.prototype.toJSON = function () {
  382. var invertedIndex = Object.keys(this.invertedIndex)
  383. .sort()
  384. .map(function (term) {
  385. return [term, this.invertedIndex[term]]
  386. }, this)
  387. var fieldVectors = Object.keys(this.fieldVectors)
  388. .map(function (ref) {
  389. return [ref, this.fieldVectors[ref].toJSON()]
  390. }, this)
  391. return {
  392. version: lunr.version,
  393. fields: this.fields,
  394. fieldVectors: fieldVectors,
  395. invertedIndex: invertedIndex,
  396. pipeline: this.pipeline.toJSON()
  397. }
  398. }
  399. /**
  400. * Loads a previously serialized lunr.Index
  401. *
  402. * @param {Object} serializedIndex - A previously serialized lunr.Index
  403. * @returns {lunr.Index}
  404. */
  405. lunr.Index.load = function (serializedIndex) {
  406. var attrs = {},
  407. fieldVectors = {},
  408. serializedVectors = serializedIndex.fieldVectors,
  409. invertedIndex = Object.create(null),
  410. serializedInvertedIndex = serializedIndex.invertedIndex,
  411. tokenSetBuilder = new lunr.TokenSet.Builder,
  412. pipeline = lunr.Pipeline.load(serializedIndex.pipeline)
  413. if (serializedIndex.version != lunr.version) {
  414. lunr.utils.warn("Version mismatch when loading serialised index. Current version of lunr '" + lunr.version + "' does not match serialized index '" + serializedIndex.version + "'")
  415. }
  416. for (var i = 0; i < serializedVectors.length; i++) {
  417. var tuple = serializedVectors[i],
  418. ref = tuple[0],
  419. elements = tuple[1]
  420. fieldVectors[ref] = new lunr.Vector(elements)
  421. }
  422. for (var i = 0; i < serializedInvertedIndex.length; i++) {
  423. var tuple = serializedInvertedIndex[i],
  424. term = tuple[0],
  425. posting = tuple[1]
  426. tokenSetBuilder.insert(term)
  427. invertedIndex[term] = posting
  428. }
  429. tokenSetBuilder.finish()
  430. attrs.fields = serializedIndex.fields
  431. attrs.fieldVectors = fieldVectors
  432. attrs.invertedIndex = invertedIndex
  433. attrs.tokenSet = tokenSetBuilder.root
  434. attrs.pipeline = pipeline
  435. return new lunr.Index(attrs)
  436. }