tokenizer.js 2.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. /*!
  2. * lunr.tokenizer
  3. * Copyright (C) @YEAR Oliver Nightingale
  4. */
  5. /**
  6. * A function for splitting a string into tokens ready to be inserted into
  7. * the search index. Uses `lunr.tokenizer.separator` to split strings, change
  8. * the value of this property to change how strings are split into tokens.
  9. *
  10. * This tokenizer will convert its parameter to a string by calling `toString` and
  11. * then will split this string on the character in `lunr.tokenizer.separator`.
  12. * Arrays will have their elements converted to strings and wrapped in a lunr.Token.
  13. *
  14. * Optional metadata can be passed to the tokenizer, this metadata will be cloned and
  15. * added as metadata to every token that is created from the object to be tokenized.
  16. *
  17. * @static
  18. * @param {?(string|object|object[])} obj - The object to convert into tokens
  19. * @param {?object} metadata - Optional metadata to associate with every token
  20. * @returns {lunr.Token[]}
  21. * @see {@link lunr.Pipeline}
  22. */
  23. lunr.tokenizer = function (obj, metadata) {
  24. if (obj == null || obj == undefined) {
  25. return []
  26. }
  27. if (Array.isArray(obj)) {
  28. return obj.map(function (t) {
  29. return new lunr.Token(
  30. lunr.utils.asString(t).toLowerCase(),
  31. lunr.utils.clone(metadata)
  32. )
  33. })
  34. }
  35. var str = obj.toString().toLowerCase(),
  36. len = str.length,
  37. tokens = []
  38. for (var sliceEnd = 0, sliceStart = 0; sliceEnd <= len; sliceEnd++) {
  39. var char = str.charAt(sliceEnd),
  40. sliceLength = sliceEnd - sliceStart
  41. if ((char.match(lunr.tokenizer.separator) || sliceEnd == len)) {
  42. if (sliceLength > 0) {
  43. var tokenMetadata = lunr.utils.clone(metadata) || {}
  44. tokenMetadata["position"] = [sliceStart, sliceLength]
  45. tokenMetadata["index"] = tokens.length
  46. tokens.push(
  47. new lunr.Token (
  48. str.slice(sliceStart, sliceEnd),
  49. tokenMetadata
  50. )
  51. )
  52. }
  53. sliceStart = sliceEnd + 1
  54. }
  55. }
  56. return tokens
  57. }
/**
 * The separator used to split a string into tokens. Override this property to change
 * the behaviour of `lunr.tokenizer` when tokenizing strings. By default this splits on
 * whitespace and hyphens.
 *
 * @static
 * @see lunr.tokenizer
 */
lunr.tokenizer.separator = /[\s\-]+/