// stemmer.js (source listing; original viewer size label: 5.1 KB)

  /*!
   * lunr.stemmer
   * Copyright (C) @YEAR Oliver Nightingale
   * Includes code from - http://tartarus.org/~martin/PorterStemmer/js.txt
   */
  /**
   * lunr.stemmer is an English language stemmer; this is a JavaScript
   * implementation of the Porter stemmer taken from http://tartarus.org/~martin
   *
   * Words shorter than three characters are returned unchanged. Every pattern
   * below is written against lowercase letters only, so the input is presumably
   * lowercased by an earlier stage -- confirm against the caller.
   *
   * @module
   * @param {String} str The string to stem
   * @returns {String}
   * @see lunr.Pipeline
   */
  lunr.stemmer = (function () {
    // Step 2 suffixes and the text each one is rewritten to.
    var step2list = {
      "ational" : "ate",
      "tional" : "tion",
      "enci" : "ence",
      "anci" : "ance",
      "izer" : "ize",
      "bli" : "ble",
      "alli" : "al",
      "entli" : "ent",
      "eli" : "e",
      "ousli" : "ous",
      "ization" : "ize",
      "ation" : "ate",
      "ator" : "ate",
      "alism" : "al",
      "iveness" : "ive",
      "fulness" : "ful",
      "ousness" : "ous",
      "aliti" : "al",
      "iviti" : "ive",
      "biliti" : "ble",
      "logi" : "log"
    };

    // Step 3 suffixes and the text each one is rewritten to.
    var step3list = {
      "icate" : "ic",
      "ative" : "",
      "alize" : "al",
      "iciti" : "ic",
      "ical" : "ic",
      "ful" : "",
      "ness" : ""
    };

    // Building blocks for the "measure" (m) patterns.
    var c = "[^aeiou]";          // consonant
    var v = "[aeiouy]";          // vowel
    var C = c + "[^aeiouy]*";    // consonant sequence
    var V = v + "[aeiou]*";      // vowel sequence

    var mgr0 = "^(" + C + ")?" + V + C;                      // [C]VC... is m>0
    var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$";    // [C]VC[V] is m=1
    var mgr1 = "^(" + C + ")?" + V + C + V + C;              // [C]VCVC... is m>1
    var s_v = "^(" + C + ")?" + v;                           // vowel in stem

    // None of these regexes carries the g or y flag, so test()/exec() keep no
    // state between calls and the compiled objects can be shared by every call.
    var re_mgr0 = new RegExp(mgr0);
    var re_mgr1 = new RegExp(mgr1);
    var re_meq1 = new RegExp(meq1);
    var re_s_v = new RegExp(s_v);

    var re_1a = /^(.+?)(ss|i)es$/;
    var re2_1a = /^(.+?)([^s])s$/;
    var re_1b = /^(.+?)eed$/;
    var re2_1b = /^(.+?)(ed|ing)$/;
    var re_1b_2 = /.$/;                                       // the final character
    var re2_1b_2 = /(at|bl|iz)$/;
    var re3_1b_2 = new RegExp("([^aeiouylsz])\\1$");          // doubled char that is not a vowel, y, l, s or z
    var re4_1b_2 = new RegExp("^" + C + v + "[^aeiouwxy]$");

    var re_1c = /^(.+?[^aeiou])y$/;
    var re_2 = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;

    var re_3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;

    var re_4 = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
    var re2_4 = /^(.+?)(s|t)(ion)$/;

    var re_5 = /^(.+?)e$/;
    var re_5_1 = /ll$/;
    var re3_5 = new RegExp("^" + C + v + "[^aeiouwxy]$");

    /**
     * Reduces a single word to its stem by running steps 1a-5 in order.
     *
     * @param {String} w The word to stem
     * @returns {String} The stemmed word, or `w` itself when it is shorter
     *   than three characters
     */
    var porterStemmer = function porterStemmer(w) {
      var fp,       // capture groups of the most recent successful match
        stem,
        firstch;

      if (w.length < 3) { return w; }

      // An initial "y" is upper-cased for the duration of the steps: "Y" is not
      // in the vowel class, so the leading letter is never counted as a vowel.
      firstch = w.charAt(0);
      if (firstch === "y") {
        w = "Y" + w.slice(1);
      }

      // Step 1a
      if (re_1a.test(w)) {
        w = w.replace(re_1a, "$1$2");
      } else if (re2_1a.test(w)) {
        w = w.replace(re2_1a, "$1$2");
      }

      // Step 1b
      fp = re_1b.exec(w);
      if (fp) {
        // "...eed": drop the final "d" only when the stem has m>0.
        if (re_mgr0.test(fp[1])) {
          w = w.replace(re_1b_2, "");
        }
      } else {
        fp = re2_1b.exec(w);
        // "...ed" / "...ing": only strip when the stem contains a vowel.
        if (fp && re_s_v.test(fp[1])) {
          w = fp[1];
          if (re2_1b_2.test(w)) {
            w = w + "e";
          } else if (re3_1b_2.test(w)) {
            w = w.replace(re_1b_2, "");
          } else if (re4_1b_2.test(w)) {
            w = w + "e";
          }
        }
      }

      // Step 1c - replace suffix y or Y by i if preceded by a non-vowel which is
      // not the first letter of the word (so cry -> cri, by -> by, say -> say)
      fp = re_1c.exec(w);
      if (fp) {
        w = fp[1] + "i";
      }

      // Step 2
      fp = re_2.exec(w);
      if (fp && re_mgr0.test(fp[1])) {
        w = fp[1] + step2list[fp[2]];
      }

      // Step 3
      fp = re_3.exec(w);
      if (fp && re_mgr0.test(fp[1])) {
        w = fp[1] + step3list[fp[2]];
      }

      // Step 4
      fp = re_4.exec(w);
      if (fp) {
        stem = fp[1];
        if (re_mgr1.test(stem)) {
          w = stem;
        }
      } else {
        // "...sion" / "...tion": the s or t stays part of the stem.
        fp = re2_4.exec(w);
        if (fp) {
          stem = fp[1] + fp[2];
          if (re_mgr1.test(stem)) {
            w = stem;
          }
        }
      }

      // Step 5
      fp = re_5.exec(w);
      if (fp) {
        stem = fp[1];
        if (re_mgr1.test(stem) || (re_meq1.test(stem) && !re3_5.test(stem))) {
          w = stem;
        }
      }

      // A trailing "ll" loses one "l" when the whole word has m>1.
      if (re_5_1.test(w) && re_mgr1.test(w)) {
        w = w.replace(re_1b_2, "");
      }

      // and turn initial Y back to y
      if (firstch === "y") {
        w = "y" + w.slice(1);
      }

      return w;
    };

    return porterStemmer;
  })();
  // Register the stemmer with lunr.Pipeline under the label 'stemmer'.
  // NOTE(review): presumably the label lets a pipeline refer to this function
  // by name -- confirm against lunr.Pipeline.registerFunction.
  lunr.Pipeline.registerFunction(lunr.stemmer, 'stemmer');