tokenizer.js 2.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. /*!
  2. * lunr.tokenizer
  3. * Copyright (C) @YEAR Oliver Nightingale
  4. */
  /**
   * A function for splitting a string into tokens ready to be inserted into
   * the search index. Uses `lunr.tokenizer.separator` to split strings, change
   * the value of this property to change how strings are split into tokens.
   *
   * - Called with no argument, `null` or `undefined`, an empty array is returned.
   * - Called with an array, every element is converted to a string with
   *   `lunr.utils.asString` and lower-cased; elements are not split further.
   * - Any other value is converted with `toString()`, trimmed, lower-cased and
   *   split on `lunr.tokenizer.separator`. Empty tokens (produced by an empty or
   *   whitespace-only string, or by a leading/trailing separator such as the
   *   hyphen in "-foo") are dropped so they never reach the index.
   *
   * @module
   * @param {String} obj The string to convert into tokens
   * @see lunr.tokenizer.separator
   * @returns {Array}
   */
  lunr.tokenizer = function (obj) {
    // `== null` matches both null and undefined, which also covers the
    // "called without arguments" case.
    if (obj == null) return []

    if (Array.isArray(obj)) {
      return obj.map(function (t) { return lunr.utils.asString(t).toLowerCase() })
    }

    return obj.toString().trim().toLowerCase()
      .split(lunr.tokenizer.separator)
      .filter(function (token) {
        // split() yields '' for empty input or edge separators, and undefined
        // for unmatched capture groups in a custom separator; neither is a
        // usable token.
        return token != null && token.length > 0
      })
  }
  /**
   * The separator used to split a string into tokens. Override this property
   * to change how `lunr.tokenizer` breaks strings apart. The default pattern
   * splits on runs of whitespace and hyphens.
   *
   * @static
   * @see lunr.tokenizer
   */
  lunr.tokenizer.separator = /[\s\-]+/
  /**
   * Loads a previously serialised tokenizer.
   *
   * A tokenizer function to be loaded must already be registered with lunr.tokenizer.
   * If the serialised tokenizer has not been registered then an error will be thrown.
   *
   * @param {String} label The label of the serialised tokenizer.
   * @returns {Function}
   * @throws {Error} If nothing has been registered under `label`.
   * @memberOf tokenizer
   */
  lunr.tokenizer.load = function (label) {
    var registry = this.registeredFunctions

    // Look at own properties only: a plain `registry[label]` lookup would
    // resolve labels such as 'toString' or 'constructor' to members inherited
    // from Object.prototype and hand back a function that is not a tokenizer.
    var isRegistered = Object.prototype.hasOwnProperty.call(registry, label)

    if (!isRegistered || !registry[label]) {
      throw new Error('Cannot load un-registered function: ' + label)
    }

    return registry[label]
  }
  // Label carried by the built-in tokenizer function.
  lunr.tokenizer.label = 'default'

  // Registry mapping labels to tokenizer functions; `lunr.tokenizer.load`
  // reads from it. The built-in tokenizer is registered under 'default'.
  lunr.tokenizer.registeredFunctions = {}
  lunr.tokenizer.registeredFunctions['default'] = lunr.tokenizer
  /**
   * Register a tokenizer function.
   *
   * Functions that are used as tokenizers should be registered if they are to
   * be used with a serialised index.
   *
   * Registering a function does not add it to an index, functions must still
   * be associated with a specific index for them to be used when indexing and
   * searching documents.
   *
   * The label is also stored on the function itself as `fn.label`. If a
   * function is already registered under the same label it is replaced and a
   * warning is emitted through `lunr.utils.warn`.
   *
   * @param {Function} fn The function to register.
   * @param {String} label The label to register this function with
   * @memberOf tokenizer
   */
  lunr.tokenizer.registerFunction = function (fn, label) {
    var registry = this.registeredFunctions

    // Use an own-property check rather than `in`: `in` also matches names
    // inherited from Object.prototype (e.g. 'toString'), which would trigger a
    // misleading overwrite warning for a label that was never registered.
    if (Object.prototype.hasOwnProperty.call(registry, label)) {
      lunr.utils.warn('Overwriting existing tokenizer: ' + label)
    }

    fn.label = label
    registry[label] = fn
  }