sbmh.js 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. 'use strict'
  2. /**
  3. * Copyright Brian White. All rights reserved.
  4. *
  5. * @see https://github.com/mscdex/streamsearch
  6. *
  7. * Permission is hereby granted, free of charge, to any person obtaining a copy
  8. * of this software and associated documentation files (the "Software"), to
  9. * deal in the Software without restriction, including without limitation the
  10. * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  11. * sell copies of the Software, and to permit persons to whom the Software is
  12. * furnished to do so, subject to the following conditions:
  13. *
  14. * The above copyright notice and this permission notice shall be included in
  15. * all copies or substantial portions of the Software.
  16. *
  17. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  22. * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  23. * IN THE SOFTWARE.
  24. *
  25. * Based heavily on the Streaming Boyer-Moore-Horspool C++ implementation
  26. * by Hongli Lai at: https://github.com/FooBarWidget/boyer-moore-horspool
  27. */
  28. const EventEmitter = require('node:events').EventEmitter
  29. const inherits = require('node:util').inherits
  30. function SBMH (needle) {
  31. if (typeof needle === 'string') {
  32. needle = Buffer.from(needle)
  33. }
  34. if (!Buffer.isBuffer(needle)) {
  35. throw new TypeError('The needle has to be a String or a Buffer.')
  36. }
  37. const needleLength = needle.length
  38. if (needleLength === 0) {
  39. throw new Error('The needle cannot be an empty String/Buffer.')
  40. }
  41. if (needleLength > 256) {
  42. throw new Error('The needle cannot have a length bigger than 256.')
  43. }
  44. this.maxMatches = Infinity
  45. this.matches = 0
  46. this._occ = new Array(256)
  47. .fill(needleLength) // Initialize occurrence table.
  48. this._lookbehind_size = 0
  49. this._needle = needle
  50. this._bufpos = 0
  51. this._lookbehind = Buffer.alloc(needleLength)
  52. // Populate occurrence table with analysis of the needle,
  53. // ignoring last letter.
  54. for (var i = 0; i < needleLength - 1; ++i) { // eslint-disable-line no-var
  55. this._occ[needle[i]] = needleLength - 1 - i
  56. }
  57. }
  58. inherits(SBMH, EventEmitter)
  59. SBMH.prototype.reset = function () {
  60. this._lookbehind_size = 0
  61. this.matches = 0
  62. this._bufpos = 0
  63. }
  64. SBMH.prototype.push = function (chunk, pos) {
  65. if (!Buffer.isBuffer(chunk)) {
  66. chunk = Buffer.from(chunk, 'binary')
  67. }
  68. const chlen = chunk.length
  69. this._bufpos = pos || 0
  70. let r
  71. while (r !== chlen && this.matches < this.maxMatches) { r = this._sbmh_feed(chunk) }
  72. return r
  73. }
  74. SBMH.prototype._sbmh_feed = function (data) {
  75. const len = data.length
  76. const needle = this._needle
  77. const needleLength = needle.length
  78. const lastNeedleChar = needle[needleLength - 1]
  79. // Positive: points to a position in `data`
  80. // pos == 3 points to data[3]
  81. // Negative: points to a position in the lookbehind buffer
  82. // pos == -2 points to lookbehind[lookbehind_size - 2]
  83. let pos = -this._lookbehind_size
  84. let ch
  85. if (pos < 0) {
  86. // Lookbehind buffer is not empty. Perform Boyer-Moore-Horspool
  87. // search with character lookup code that considers both the
  88. // lookbehind buffer and the current round's haystack data.
  89. //
  90. // Loop until
  91. // there is a match.
  92. // or until
  93. // we've moved past the position that requires the
  94. // lookbehind buffer. In this case we switch to the
  95. // optimized loop.
  96. // or until
  97. // the character to look at lies outside the haystack.
  98. while (pos < 0 && pos <= len - needleLength) {
  99. ch = this._sbmh_lookup_char(data, pos + needleLength - 1)
  100. if (
  101. ch === lastNeedleChar &&
  102. this._sbmh_memcmp(data, pos, needleLength - 1)
  103. ) {
  104. this._lookbehind_size = 0
  105. ++this.matches
  106. this.emit('info', true)
  107. return (this._bufpos = pos + needleLength)
  108. }
  109. pos += this._occ[ch]
  110. }
  111. // No match.
  112. if (pos < 0) {
  113. // There's too few data for Boyer-Moore-Horspool to run,
  114. // so let's use a different algorithm to skip as much as
  115. // we can.
  116. // Forward pos until
  117. // the trailing part of lookbehind + data
  118. // looks like the beginning of the needle
  119. // or until
  120. // pos == 0
  121. while (pos < 0 && !this._sbmh_memcmp(data, pos, len - pos)) { ++pos }
  122. }
  123. if (pos >= 0) {
  124. // Discard lookbehind buffer.
  125. this.emit('info', false, this._lookbehind, 0, this._lookbehind_size)
  126. this._lookbehind_size = 0
  127. } else {
  128. // Cut off part of the lookbehind buffer that has
  129. // been processed and append the entire haystack
  130. // into it.
  131. const bytesToCutOff = this._lookbehind_size + pos
  132. if (bytesToCutOff > 0) {
  133. // The cut off data is guaranteed not to contain the needle.
  134. this.emit('info', false, this._lookbehind, 0, bytesToCutOff)
  135. }
  136. this._lookbehind.copy(this._lookbehind, 0, bytesToCutOff,
  137. this._lookbehind_size - bytesToCutOff)
  138. this._lookbehind_size -= bytesToCutOff
  139. data.copy(this._lookbehind, this._lookbehind_size)
  140. this._lookbehind_size += len
  141. this._bufpos = len
  142. return len
  143. }
  144. }
  145. pos += (pos >= 0) * this._bufpos
  146. // Lookbehind buffer is now empty. We only need to check if the
  147. // needle is in the haystack.
  148. if (data.indexOf(needle, pos) !== -1) {
  149. pos = data.indexOf(needle, pos)
  150. ++this.matches
  151. if (pos > 0) { this.emit('info', true, data, this._bufpos, pos) } else { this.emit('info', true) }
  152. return (this._bufpos = pos + needleLength)
  153. } else {
  154. pos = len - needleLength
  155. }
  156. // There was no match. If there's trailing haystack data that we cannot
  157. // match yet using the Boyer-Moore-Horspool algorithm (because the trailing
  158. // data is less than the needle size) then match using a modified
  159. // algorithm that starts matching from the beginning instead of the end.
  160. // Whatever trailing data is left after running this algorithm is added to
  161. // the lookbehind buffer.
  162. while (
  163. pos < len &&
  164. (
  165. data[pos] !== needle[0] ||
  166. (
  167. (Buffer.compare(
  168. data.subarray(pos, pos + len - pos),
  169. needle.subarray(0, len - pos)
  170. ) !== 0)
  171. )
  172. )
  173. ) {
  174. ++pos
  175. }
  176. if (pos < len) {
  177. data.copy(this._lookbehind, 0, pos, pos + (len - pos))
  178. this._lookbehind_size = len - pos
  179. }
  180. // Everything until pos is guaranteed not to contain needle data.
  181. if (pos > 0) { this.emit('info', false, data, this._bufpos, pos < len ? pos : len) }
  182. this._bufpos = len
  183. return len
  184. }
  185. SBMH.prototype._sbmh_lookup_char = function (data, pos) {
  186. return (pos < 0)
  187. ? this._lookbehind[this._lookbehind_size + pos]
  188. : data[pos]
  189. }
  190. SBMH.prototype._sbmh_memcmp = function (data, pos, len) {
  191. for (var i = 0; i < len; ++i) { // eslint-disable-line no-var
  192. if (this._sbmh_lookup_char(data, pos + i) !== this._needle[i]) { return false }
  193. }
  194. return true
  195. }
  196. module.exports = SBMH