123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492 |
- /*!
- * lunr.Index
- * Copyright (C) @YEAR Oliver Nightingale
- */
- /**
- * An index contains the built index of all documents and provides a query interface
- * to the index.
- *
- * Usually instances of lunr.Index will not be created using this constructor, instead
- * lunr.Builder should be used to construct new indexes, or lunr.Index.load should be
- * used to load previously built and serialized indexes.
- *
- * @constructor
- * @param {Object} attrs - The attributes of the built search index.
- * @param {Object} attrs.invertedIndex - An index of term/field to document reference.
- * @param {Object<string, lunr.Vector>} attrs.fieldVectors - Field vectors
- * @param {lunr.TokenSet} attrs.tokenSet - An set of all corpus tokens.
- * @param {string[]} attrs.fields - The names of indexed document fields.
- * @param {lunr.Pipeline} attrs.pipeline - The pipeline to use for search terms.
- */
- lunr.Index = function (attrs) {
- this.invertedIndex = attrs.invertedIndex
- this.fieldVectors = attrs.fieldVectors
- this.tokenSet = attrs.tokenSet
- this.fields = attrs.fields
- this.pipeline = attrs.pipeline
- }
- /**
- * A result contains details of a document matching a search query.
- * @typedef {Object} lunr.Index~Result
- * @property {string} ref - The reference of the document this result represents.
- * @property {number} score - A number between 0 and 1 representing how similar this document is to the query.
- * @property {lunr.MatchData} matchData - Contains metadata about this match including which term(s) caused the match.
- */
- /**
- * Although lunr provides the ability to create queries using lunr.Query, it also provides a simple
- * query language which itself is parsed into an instance of lunr.Query.
- *
- * For programmatically building queries it is advised to directly use lunr.Query, the query language
- * is best used for human entered text rather than program generated text.
- *
- * At its simplest queries can just be a single term, e.g. `hello`, multiple terms are also supported
- * and will be combined with OR, e.g `hello world` will match documents that contain either 'hello'
- * or 'world', though those that contain both will rank higher in the results.
- *
- * Wildcards can be included in terms to match one or more unspecified characters, these wildcards can
- * be inserted anywhere within the term, and more than one wildcard can exist in a single term. Adding
- * wildcards will increase the number of documents that will be found but can also have a negative
- * impact on query performance, especially with wildcards at the beginning of a term.
- *
- * Terms can be restricted to specific fields, e.g. `title:hello`, only documents with the term
- * hello in the title field will match this query. Using a field not present in the index will lead
- * to an error being thrown.
- *
- * Modifiers can also be added to terms, lunr supports edit distance and boost modifiers on terms. A term
- * boost will make documents matching that term score higher, e.g. `foo^5`. Edit distance is also supported
- * to provide fuzzy matching, e.g. 'hello~2' will match documents with hello with an edit distance of 2.
- * Avoid large values for edit distance to improve query performance.
- *
- * Each term also supports a presence modifier. By default a term's presence in document is optional, however
- * this can be changed to either required or prohibited. For a term's presence to be required in a document the
- * term should be prefixed with a '+', e.g. `+foo bar` is a search for documents that must contain 'foo' and
- * optionally contain 'bar'. Conversely a leading '-' sets the terms presence to prohibited, i.e. it must not
- * appear in a document, e.g. `-foo bar` is a search for documents that do not contain 'foo' but may contain 'bar'.
- *
- * To escape special characters the backslash character '\' can be used, this allows searches to include
- * characters that would normally be considered modifiers, e.g. `foo\~2` will search for a term "foo~2" instead
- * of attempting to apply a boost of 2 to the search term "foo".
- *
- * @typedef {string} lunr.Index~QueryString
- * @example <caption>Simple single term query</caption>
- * hello
- * @example <caption>Multiple term query</caption>
- * hello world
- * @example <caption>term scoped to a field</caption>
- * title:hello
- * @example <caption>term with a boost of 10</caption>
- * hello^10
- * @example <caption>term with an edit distance of 2</caption>
- * hello~2
- * @example <caption>terms with presence modifiers</caption>
- * -foo +bar baz
- */
- /**
- * Performs a search against the index using lunr query syntax.
- *
- * Results will be returned sorted by their score, the most relevant results
- * will be returned first. For details on how the score is calculated, please see
- * the {@link https://lunrjs.com/guides/searching.html#scoring|guide}.
- *
- * For more programmatic querying use lunr.Index#query.
- *
- * @param {lunr.Index~QueryString} queryString - A string containing a lunr query.
- * @throws {lunr.QueryParseError} If the passed query string cannot be parsed.
- * @returns {lunr.Index~Result[]}
- */
- lunr.Index.prototype.search = function (queryString) {
- return this.query(function (query) {
- var parser = new lunr.QueryParser(queryString, query)
- parser.parse()
- })
- }
- /**
- * A query builder callback provides a query object to be used to express
- * the query to perform on the index.
- *
- * @callback lunr.Index~queryBuilder
- * @param {lunr.Query} query - The query object to build up.
- * @this lunr.Query
- */
- /**
- * Performs a query against the index using the yielded lunr.Query object.
- *
- * If performing programmatic queries against the index, this method is preferred
- * over lunr.Index#search so as to avoid the additional query parsing overhead.
- *
- * A query object is yielded to the supplied function which should be used to
- * express the query to be run against the index.
- *
- * Note that although this function takes a callback parameter it is _not_ an
- * asynchronous operation, the callback is just yielded a query object to be
- * customized.
- *
- * @param {lunr.Index~queryBuilder} fn - A function that is used to build the query.
- * @returns {lunr.Index~Result[]}
- */
- lunr.Index.prototype.query = function (fn) {
- // for each query clause
- // * process terms
- // * expand terms from token set
- // * find matching documents and metadata
- // * get document vectors
- // * score documents
- var query = new lunr.Query(this.fields),
- matchingFields = Object.create(null),
- queryVectors = Object.create(null),
- termFieldCache = Object.create(null),
- requiredMatches = Object.create(null),
- prohibitedMatches = Object.create(null)
- /*
- * To support field level boosts a query vector is created per
- * field. An empty vector is eagerly created to support negated
- * queries.
- */
- for (var i = 0; i < this.fields.length; i++) {
- queryVectors[this.fields[i]] = new lunr.Vector
- }
- fn.call(query, query)
- for (var i = 0; i < query.clauses.length; i++) {
- /*
- * Unless the pipeline has been disabled for this term, which is
- * the case for terms with wildcards, we need to pass the clause
- * term through the search pipeline. A pipeline returns an array
- * of processed terms. Pipeline functions may expand the passed
- * term, which means we may end up performing multiple index lookups
- * for a single query term.
- */
- var clause = query.clauses[i],
- terms = null,
- clauseMatches = lunr.Set.empty
- if (clause.usePipeline) {
- terms = this.pipeline.runString(clause.term, {
- fields: clause.fields
- })
- } else {
- terms = [clause.term]
- }
- for (var m = 0; m < terms.length; m++) {
- var term = terms[m]
- /*
- * Each term returned from the pipeline needs to use the same query
- * clause object, e.g. the same boost and or edit distance. The
- * simplest way to do this is to re-use the clause object but mutate
- * its term property.
- */
- clause.term = term
- /*
- * From the term in the clause we create a token set which will then
- * be used to intersect the indexes token set to get a list of terms
- * to lookup in the inverted index
- */
- var termTokenSet = lunr.TokenSet.fromClause(clause),
- expandedTerms = this.tokenSet.intersect(termTokenSet).toArray()
- /*
- * If a term marked as required does not exist in the tokenSet it is
- * impossible for the search to return any matches. We set all the field
- * scoped required matches set to empty and stop examining any further
- * clauses.
- */
- if (expandedTerms.length === 0 && clause.presence === lunr.Query.presence.REQUIRED) {
- for (var k = 0; k < clause.fields.length; k++) {
- var field = clause.fields[k]
- requiredMatches[field] = lunr.Set.empty
- }
- break
- }
- for (var j = 0; j < expandedTerms.length; j++) {
- /*
- * For each term get the posting and termIndex, this is required for
- * building the query vector.
- */
- var expandedTerm = expandedTerms[j],
- posting = this.invertedIndex[expandedTerm],
- termIndex = posting._index
- for (var k = 0; k < clause.fields.length; k++) {
- /*
- * For each field that this query term is scoped by (by default
- * all fields are in scope) we need to get all the document refs
- * that have this term in that field.
- *
- * The posting is the entry in the invertedIndex for the matching
- * term from above.
- */
- var field = clause.fields[k],
- fieldPosting = posting[field],
- matchingDocumentRefs = Object.keys(fieldPosting),
- termField = expandedTerm + "/" + field,
- matchingDocumentsSet = new lunr.Set(matchingDocumentRefs)
- /*
- * if the presence of this term is required ensure that the matching
- * documents are added to the set of required matches for this clause.
- *
- */
- if (clause.presence == lunr.Query.presence.REQUIRED) {
- clauseMatches = clauseMatches.union(matchingDocumentsSet)
- if (requiredMatches[field] === undefined) {
- requiredMatches[field] = lunr.Set.complete
- }
- }
- /*
- * if the presence of this term is prohibited ensure that the matching
- * documents are added to the set of prohibited matches for this field,
- * creating that set if it does not yet exist.
- */
- if (clause.presence == lunr.Query.presence.PROHIBITED) {
- if (prohibitedMatches[field] === undefined) {
- prohibitedMatches[field] = lunr.Set.empty
- }
- prohibitedMatches[field] = prohibitedMatches[field].union(matchingDocumentsSet)
- /*
- * Prohibited matches should not be part of the query vector used for
- * similarity scoring and no metadata should be extracted so we continue
- * to the next field
- */
- continue
- }
- /*
- * The query field vector is populated using the termIndex found for
- * the term and a unit value with the appropriate boost applied.
- * Using upsert because there could already be an entry in the vector
- * for the term we are working with. In that case we just add the scores
- * together.
- */
- queryVectors[field].upsert(termIndex, clause.boost, function (a, b) { return a + b })
- /**
- * If we've already seen this term, field combo then we've already collected
- * the matching documents and metadata, no need to go through all that again
- */
- if (termFieldCache[termField]) {
- continue
- }
- for (var l = 0; l < matchingDocumentRefs.length; l++) {
- /*
- * All metadata for this term/field/document triple
- * are then extracted and collected into an instance
- * of lunr.MatchData ready to be returned in the query
- * results
- */
- var matchingDocumentRef = matchingDocumentRefs[l],
- matchingFieldRef = new lunr.FieldRef (matchingDocumentRef, field),
- metadata = fieldPosting[matchingDocumentRef],
- fieldMatch
- if ((fieldMatch = matchingFields[matchingFieldRef]) === undefined) {
- matchingFields[matchingFieldRef] = new lunr.MatchData (expandedTerm, field, metadata)
- } else {
- fieldMatch.add(expandedTerm, field, metadata)
- }
- }
- termFieldCache[termField] = true
- }
- }
- }
- /**
- * If the presence was required we need to update the requiredMatches field sets.
- * We do this after all fields for the term have collected their matches because
- * the clause terms presence is required in _any_ of the fields not _all_ of the
- * fields.
- */
- if (clause.presence === lunr.Query.presence.REQUIRED) {
- for (var k = 0; k < clause.fields.length; k++) {
- var field = clause.fields[k]
- requiredMatches[field] = requiredMatches[field].intersect(clauseMatches)
- }
- }
- }
- /**
- * Need to combine the field scoped required and prohibited
- * matching documents into a global set of required and prohibited
- * matches
- */
- var allRequiredMatches = lunr.Set.complete,
- allProhibitedMatches = lunr.Set.empty
- for (var i = 0; i < this.fields.length; i++) {
- var field = this.fields[i]
- if (requiredMatches[field]) {
- allRequiredMatches = allRequiredMatches.intersect(requiredMatches[field])
- }
- if (prohibitedMatches[field]) {
- allProhibitedMatches = allProhibitedMatches.union(prohibitedMatches[field])
- }
- }
- var matchingFieldRefs = Object.keys(matchingFields),
- results = [],
- matches = Object.create(null)
- /*
- * If the query is negated (contains only prohibited terms)
- * we need to get _all_ fieldRefs currently existing in the
- * index. This is only done when we know that the query is
- * entirely prohibited terms to avoid any cost of getting all
- * fieldRefs unnecessarily.
- *
- * Additionally, blank MatchData must be created to correctly
- * populate the results.
- */
- if (query.isNegated()) {
- matchingFieldRefs = Object.keys(this.fieldVectors)
- for (var i = 0; i < matchingFieldRefs.length; i++) {
- var matchingFieldRef = matchingFieldRefs[i]
- var fieldRef = lunr.FieldRef.fromString(matchingFieldRef)
- matchingFields[matchingFieldRef] = new lunr.MatchData
- }
- }
- for (var i = 0; i < matchingFieldRefs.length; i++) {
- /*
- * Currently we have document fields that match the query, but we
- * need to return documents. The matchData and scores are combined
- * from multiple fields belonging to the same document.
- *
- * Scores are calculated by field, using the query vectors created
- * above, and combined into a final document score using addition.
- */
- var fieldRef = lunr.FieldRef.fromString(matchingFieldRefs[i]),
- docRef = fieldRef.docRef
- if (!allRequiredMatches.contains(docRef)) {
- continue
- }
- if (allProhibitedMatches.contains(docRef)) {
- continue
- }
- var fieldVector = this.fieldVectors[fieldRef],
- score = queryVectors[fieldRef.fieldName].similarity(fieldVector),
- docMatch
- if ((docMatch = matches[docRef]) !== undefined) {
- docMatch.score += score
- docMatch.matchData.combine(matchingFields[fieldRef])
- } else {
- var match = {
- ref: docRef,
- score: score,
- matchData: matchingFields[fieldRef]
- }
- matches[docRef] = match
- results.push(match)
- }
- }
- /*
- * Sort the results objects by score, highest first.
- */
- return results.sort(function (a, b) {
- return b.score - a.score
- })
- }
- /**
- * Prepares the index for JSON serialization.
- *
- * The schema for this JSON blob will be described in a
- * separate JSON schema file.
- *
- * @returns {Object}
- */
- lunr.Index.prototype.toJSON = function () {
- var invertedIndex = Object.keys(this.invertedIndex)
- .sort()
- .map(function (term) {
- return [term, this.invertedIndex[term]]
- }, this)
- var fieldVectors = Object.keys(this.fieldVectors)
- .map(function (ref) {
- return [ref, this.fieldVectors[ref].toJSON()]
- }, this)
- return {
- version: lunr.version,
- fields: this.fields,
- fieldVectors: fieldVectors,
- invertedIndex: invertedIndex,
- pipeline: this.pipeline.toJSON()
- }
- }
- /**
- * Loads a previously serialized lunr.Index
- *
- * @param {Object} serializedIndex - A previously serialized lunr.Index
- * @returns {lunr.Index}
- */
- lunr.Index.load = function (serializedIndex) {
- var attrs = {},
- fieldVectors = {},
- serializedVectors = serializedIndex.fieldVectors,
- invertedIndex = Object.create(null),
- serializedInvertedIndex = serializedIndex.invertedIndex,
- tokenSetBuilder = new lunr.TokenSet.Builder,
- pipeline = lunr.Pipeline.load(serializedIndex.pipeline)
- if (serializedIndex.version != lunr.version) {
- lunr.utils.warn("Version mismatch when loading serialised index. Current version of lunr '" + lunr.version + "' does not match serialized index '" + serializedIndex.version + "'")
- }
- for (var i = 0; i < serializedVectors.length; i++) {
- var tuple = serializedVectors[i],
- ref = tuple[0],
- elements = tuple[1]
- fieldVectors[ref] = new lunr.Vector(elements)
- }
- for (var i = 0; i < serializedInvertedIndex.length; i++) {
- var tuple = serializedInvertedIndex[i],
- term = tuple[0],
- posting = tuple[1]
- tokenSetBuilder.insert(term)
- invertedIndex[term] = posting
- }
- tokenSetBuilder.finish()
- attrs.fields = serializedIndex.fields
- attrs.fieldVectors = fieldVectors
- attrs.invertedIndex = invertedIndex
- attrs.tokenSet = tokenSetBuilder.root
- attrs.pipeline = pipeline
- return new lunr.Index(attrs)
- }
|