/*!
 * lunr.tokenizer
 * Copyright (C) @YEAR Oliver Nightingale
 */

/**
 * A function for splitting a string into tokens ready to be inserted into
 * the search index. Uses `lunr.tokenizer.separator` to split strings, change
 * the value of this property to change how strings are split into tokens.
 *
 * This tokenizer will convert its parameter to a string by calling `toString` and
 * then will split this string on the character in `lunr.tokenizer.separator`.
 * Arrays will have their elements converted to strings and wrapped in a lunr.Token.
 *
 * Optional metadata can be passed to the tokenizer, this metadata will be cloned and
 * added as metadata to every token that is created from the object to be tokenized.
 *
 * @static
 * @param {?(string|object|object[])} obj - The object to convert into tokens
 * @param {?object} metadata - Optional metadata to associate with every token
 * @returns {lunr.Token[]}
 * @see {@link lunr.Pipeline}
 */
lunr.tokenizer = function (obj, metadata) {
  // `== null` matches both null and undefined, so a second
  // `obj == undefined` comparison is redundant.
  if (obj == null) {
    return []
  }

  // Array input is tokenized element-wise; each element is stringified,
  // lowercased and wrapped in a Token. Note: no position/index metadata
  // is attached here — positions are only meaningful for string input.
  if (Array.isArray(obj)) {
    return obj.map(function (t) {
      return new lunr.Token(
        lunr.utils.asString(t).toLowerCase(),
        lunr.utils.clone(metadata)
      )
    })
  }

  var str = obj.toString().toLowerCase(),
      len = str.length,
      tokens = []

  // Scan one index past the end of the string (sliceEnd <= len) so the
  // final token is flushed even when the string does not end in a
  // separator character (charAt(len) returns "" which never matches).
  for (var sliceEnd = 0, sliceStart = 0; sliceEnd <= len; sliceEnd++) {
    var char = str.charAt(sliceEnd),
        sliceLength = sliceEnd - sliceStart

    if ((char.match(lunr.tokenizer.separator) || sliceEnd == len)) {

      // Only emit a token for non-empty slices; runs of consecutive
      // separators therefore produce no empty tokens.
      if (sliceLength > 0) {
        // Clone the caller's metadata per token so tokens never share
        // (and cannot mutate) a single metadata object.
        var tokenMetadata = lunr.utils.clone(metadata) || {}
        tokenMetadata["position"] = [sliceStart, sliceLength]
        tokenMetadata["index"] = tokens.length

        tokens.push(
          new lunr.Token (
            str.slice(sliceStart, sliceEnd),
            tokenMetadata
          )
        )
      }

      // Resume scanning just past the separator character.
      sliceStart = sliceEnd + 1
    }

  }

  return tokens
}
/**
 * The separator used to split a string into tokens. Override this property to change the
 * behaviour of `lunr.tokenizer` when tokenizing strings. By default this splits on
 * whitespace and hyphens.
 *
 * @static
 * @see lunr.tokenizer
 */
lunr.tokenizer.separator = /[\s\-]+/
|