// formdata-parser.js
  1. 'use strict'
  2. const { isUSVString, bufferToLowerCasedHeaderName } = require('../../core/util')
  3. const { utf8DecodeBytes } = require('./util')
  4. const { HTTP_TOKEN_CODEPOINTS, isomorphicDecode } = require('./data-url')
  5. const { isFileLike } = require('./file')
  6. const { makeEntry } = require('./formdata')
  7. const assert = require('node:assert')
  8. const { File: NodeFile } = require('node:buffer')
  9. const File = globalThis.File ?? NodeFile
  // Byte sequences the parser compares against the raw input.

  // Prefix that a Content-Disposition header value is required to start with.
  const formDataNameBuffer = Buffer.from('form-data; name="', 'utf8')
  // Start of the optional filename parameter that may follow the name.
  const filenameBuffer = Buffer.from('; filename', 'utf8')
  // Two dashes, as found after the closing boundary.
  const dd = Buffer.from('--', 'utf8')
  // Two dashes followed by CR LF.
  const ddcrlf = Buffer.from('--\r\n', 'utf8')
  /**
   * Tells whether a string consists solely of ASCII characters, i.e. whether
   * every UTF-16 code unit is at most 0x7F. The empty string qualifies.
   *
   * @param {string} chars
   * @returns {boolean}
   */
  function isAsciiString (chars) {
    let index = chars.length

    while (index-- > 0) {
      // Anything above 0x7F has a bit set outside the 7-bit ASCII range.
      if (chars.charCodeAt(index) > 0x7F) {
        return false
      }
    }

    return true
  }
  /**
   * Checks a multipart/form-data boundary against the rules of the parsing
   * spec: 27 to 70 code units long, each one an ASCII alphanumeric, `'`, `-`
   * or `_`.
   *
   * @see https://andreubotella.github.io/multipart-form-data/#multipart-form-data-boundary
   * @param {string} boundary
   * @returns {boolean} true when the boundary is acceptable
   */
  function validateBoundary (boundary) {
    const size = boundary.length

    // Length must lie in [27, 70].
    if (size < 27 || size > 70) {
      return false
    }

    for (let index = 0; index < size; index++) {
      const code = boundary.charCodeAt(index)

      const isDigit = code >= 0x30 && code <= 0x39
      const isUpper = code >= 0x41 && code <= 0x5a
      const isLower = code >= 0x61 && code <= 0x7a
      // 0x27 ('), 0x2D (-) and 0x5F (_) are the only permitted non-alphanumerics.
      const isAllowedPunctuation = code === 0x27 || code === 0x2d || code === 0x5f

      if (isDigit || isUpper || isLower || isAllowedPunctuation) {
        continue
      }

      return false
    }

    return true
  }
  /**
   * Parses a multipart/form-data body into a list of entries.
   *
   * Returns the string 'failure' when the boundary parameter is missing or the
   * input is malformed.
   *
   * @see https://andreubotella.github.io/multipart-form-data/#multipart-form-data-parser
   * @param {Buffer} input
   * @param {ReturnType<import('./data-url')['parseMIMEType']>} mimeType
   * @returns {'failure' | Array} entries built with makeEntry, or 'failure'
   */
  function multipartFormDataParser (input, mimeType) {
    // 1. Assert: mimeType's essence is "multipart/form-data".
    assert(mimeType !== 'failure' && mimeType.essence === 'multipart/form-data')

    const boundaryString = mimeType.parameters.get('boundary')

    // 2. If mimeType's parameters["boundary"] does not exist, return failure.
    //    Otherwise, let boundary be the result of UTF-8 decoding mimeType's
    //    parameters["boundary"].
    if (boundaryString === undefined) {
      return 'failure'
    }

    // Note: boundary already carries the two leading dashes.
    const boundary = Buffer.from(`--${boundaryString}`, 'utf8')

    // A part body ends at CR LF followed by the dashed boundary. Searching for
    // the complete delimiter (rather than the bare boundary text) keeps a body
    // that merely contains the boundary string from being cut short.
    const bodyDelimiter = Buffer.from(`\r\n--${boundaryString}`, 'utf8')

    // 3. Let entry list be an empty entry list.
    const entryList = []

    // 4. Let position be a pointer to a byte in input, initially pointing at
    //    the first byte.
    const position = { position: 0 }

    // Note: undici addition, allows leading and trailing CRLFs.
    while (input[position.position] === 0x0d && input[position.position + 1] === 0x0a) {
      position.position += 2
    }

    let trailing = input.length

    while (input[trailing - 1] === 0x0a && input[trailing - 2] === 0x0d) {
      trailing -= 2
    }

    if (trailing !== input.length) {
      input = input.subarray(0, trailing)
    }

    // 5. While true:
    while (true) {
      // 5.1. If position points to a sequence of bytes starting with 0x2D 0x2D
      //      (`--`) followed by boundary, advance position by 2 + the length of
      //      boundary. Otherwise, return failure.
      // Note: boundary is padded with 2 dashes already, no need to add 2.
      if (input.subarray(position.position, position.position + boundary.length).equals(boundary)) {
        position.position += boundary.length
      } else {
        return 'failure'
      }

      // 5.2. If position points to the sequence of bytes 0x2D 0x2D 0x0D 0x0A
      //      (`--` followed by CR LF) followed by the end of input, return entry list.
      // Note: a body does NOT need to end with CRLF. It can end with --.
      if (
        (position.position === input.length - 2 && bufferStartsWith(input, dd, position)) ||
        (position.position === input.length - 4 && bufferStartsWith(input, ddcrlf, position))
      ) {
        return entryList
      }

      // 5.3. If position does not point to a sequence of bytes starting with 0x0D
      //      0x0A (CR LF), return failure.
      if (input[position.position] !== 0x0d || input[position.position + 1] !== 0x0a) {
        return 'failure'
      }

      // 5.4. Advance position by 2. (This skips past the newline.)
      position.position += 2

      // 5.5. Let name, filename and contentType be the result of parsing
      //      multipart/form-data headers on input and position, if the result
      //      is not failure. Otherwise, return failure.
      const result = parseMultipartFormDataHeaders(input, position)

      if (result === 'failure') {
        return 'failure'
      }

      const { name, filename, encoding } = result
      let { contentType } = result

      // 5.6. Advance position by 2. (This skips past the empty line that marks
      //      the end of the headers.)
      position.position += 2

      // 5.7. Let body be the empty byte sequence.
      let body

      // 5.8. Body loop: the body is everything up to the next delimiter
      //      (CR LF `--` boundary). This deviates from the literal spec steps
      //      by locating the delimiter with a single search.
      {
        const delimiterIndex = input.indexOf(bodyDelimiter, position.position)

        if (delimiterIndex === -1) {
          return 'failure'
        }

        body = input.subarray(position.position, delimiterIndex)

        // Note: position must be advanced past the raw body before the body is
        // decoded, because decoding changes its length.
        position.position = delimiterIndex

        if (encoding === 'base64') {
          body = Buffer.from(body.toString(), 'base64')
        }
      }

      // 5.9. If position does not point to a sequence of bytes starting with
      //      0x0D 0x0A (CR LF), return failure. Otherwise, advance position by 2.
      if (input[position.position] !== 0x0d || input[position.position + 1] !== 0x0a) {
        return 'failure'
      } else {
        position.position += 2
      }

      // 5.10. If filename is not null:
      let value

      if (filename !== null) {
        // 5.10.1. If contentType is null, set contentType to "text/plain".
        contentType ??= 'text/plain'

        // 5.10.2. If contentType is not an ASCII string, set contentType to the empty string.
        // Note: Content-Type is a relatively small string, so checking it with
        // `String#charCodeAt` avoids the overhead of converting it to a buffer.
        if (!isAsciiString(contentType)) {
          contentType = ''
        }

        // 5.10.3. Let value be a new File object with name filename, type contentType, and body body.
        value = new File([body], filename, { type: contentType })
      } else {
        // 5.11. Otherwise:
        // 5.11.1. Let value be the UTF-8 decoding without BOM of body.
        value = utf8DecodeBytes(Buffer.from(body))
      }

      // 5.12. Assert: name is a scalar value string and value is either a scalar value string or a File object.
      assert(isUSVString(name))
      assert((typeof value === 'string' && isUSVString(value)) || isFileLike(value))

      // 5.13. Create an entry with name and value, and append it to entry list.
      entryList.push(makeEntry(name, value, filename))
    }
  }
  /**
   * Parses the header block of a single multipart/form-data part, starting at
   * `position`, and leaves `position` on the CR LF of the empty line that ends
   * the headers.
   *
   * Besides the name, filename and content type described by the spec, the
   * value of a Content-Transfer-Encoding header is returned as `encoding`.
   *
   * @see https://andreubotella.github.io/multipart-form-data/#parse-multipart-form-data-headers
   * @param {Buffer} input
   * @param {{ position: number }} position advanced in place while parsing
   * @returns {'failure' | { name: string, filename: string | null, contentType: string | null, encoding: string | null }}
   */
  function parseMultipartFormDataHeaders (input, position) {
    // Byte predicates shared by every header line.
    const isHttpTabOrSpace = (char) => char === 0x09 || char === 0x20
    const isNotLineBreak = (char) => char !== 0x0a && char !== 0x0d

    // 1. Let name, filename and contentType be null.
    let name = null
    let filename = null
    let contentType = null
    let encoding = null

    // 2. While true:
    while (true) {
      // 2.1. If position points to a sequence of bytes starting with 0x0D 0x0A (CR LF):
      if (input[position.position] === 0x0d && input[position.position + 1] === 0x0a) {
        // 2.1.1. If name is null, return failure.
        if (name === null) {
          return 'failure'
        }

        // 2.1.2. Return name, filename and contentType.
        return { name, filename, contentType, encoding }
      }

      // 2.2. Let header name be the result of collecting a sequence of bytes that are
      //      not 0x0A (LF), 0x0D (CR) or 0x3A (:), given position.
      let headerName = collectASequenceOfBytes(
        (char) => char !== 0x0a && char !== 0x0d && char !== 0x3a,
        input,
        position
      )

      // 2.3. Remove any HTTP tab or space bytes from the start or end of header name.
      headerName = removeChars(headerName, true, true, isHttpTabOrSpace)

      // 2.4. If header name does not match the field-name token production, return failure.
      if (!HTTP_TOKEN_CODEPOINTS.test(headerName.toString())) {
        return 'failure'
      }

      // 2.5. If the byte at position is not 0x3A (:), return failure.
      if (input[position.position] !== 0x3a) {
        return 'failure'
      }

      // 2.6. Advance position by 1.
      position.position++

      // 2.7. Collect a sequence of bytes that are HTTP tab or space bytes given position.
      //      (Do nothing with those bytes.)
      collectASequenceOfBytes(isHttpTabOrSpace, input, position)

      // 2.8. Byte-lowercase header name and switch on the result:
      switch (bufferToLowerCasedHeaderName(headerName)) {
        case 'content-disposition': {
          // 1. Set name and filename to null.
          name = filename = null

          // 2. If position does not point to a sequence of bytes starting with
          //    `form-data; name="`, return failure.
          if (!bufferStartsWith(input, formDataNameBuffer, position)) {
            return 'failure'
          }

          // 3. Advance position so it points at the byte after the next 0x22 (")
          //    byte (the one in the sequence of bytes matched above).
          position.position += formDataNameBuffer.length

          // 4. Set name to the result of parsing a multipart/form-data name given
          //    input and position, if the result is not failure. Otherwise, return
          //    failure.
          name = parseMultipartFormDataName(input, position)

          if (name === null) {
            return 'failure'
          }

          // 5. If position points to a sequence of bytes starting with `; filename="`:
          if (bufferStartsWith(input, filenameBuffer, position)) {
            // Note: undici also handles filename*
            let check = position.position + filenameBuffer.length

            if (input[check] === 0x2a) {
              // Skip the `*` so the fixed-width advance below still lands just
              // past the opening quote.
              position.position += 1
              check += 1
            }

            if (input[check] !== 0x3d || input[check + 1] !== 0x22) { // ="
              return 'failure'
            }

            // 1. Advance position so it points at the byte after the next 0x22 (") byte
            //    (the one in the sequence of bytes matched above).
            //    That is `; filename` plus the two bytes `="`.
            position.position += filenameBuffer.length + 2

            // 2. Set filename to the result of parsing a multipart/form-data name given
            //    input and position, if the result is not failure. Otherwise, return failure.
            filename = parseMultipartFormDataName(input, position)

            if (filename === null) {
              return 'failure'
            }
          }

          break
        }
        case 'content-type': {
          // 1. Let header value be the result of collecting a sequence of bytes that are
          //    not 0x0A (LF) or 0x0D (CR), given position.
          let headerValue = collectASequenceOfBytes(isNotLineBreak, input, position)

          // 2. Remove any HTTP tab or space bytes from the end of header value.
          headerValue = removeChars(headerValue, false, true, isHttpTabOrSpace)

          // 3. Set contentType to the isomorphic decoding of header value.
          contentType = isomorphicDecode(headerValue)

          break
        }
        case 'content-transfer-encoding': {
          // Not part of the spec steps: keep the transfer encoding, handled the
          // same way as the content type value.
          let headerValue = collectASequenceOfBytes(isNotLineBreak, input, position)

          headerValue = removeChars(headerValue, false, true, isHttpTabOrSpace)

          encoding = isomorphicDecode(headerValue)

          break
        }
        default: {
          // Collect a sequence of bytes that are not 0x0A (LF) or 0x0D (CR), given position.
          // (Do nothing with those bytes.)
          collectASequenceOfBytes(isNotLineBreak, input, position)
        }
      }

      // 2.9. If position does not point to a sequence of bytes starting with 0x0D 0x0A
      //      (CR LF), return failure. Otherwise, advance position by 2 (past the newline).
      // Both bytes must match: a lone CR or LF is not a valid line ending.
      if (input[position.position] !== 0x0d || input[position.position + 1] !== 0x0a) {
        return 'failure'
      } else {
        position.position += 2
      }
    }
  }
  /**
   * Reads a quoted name (or filename) whose opening quote has just been
   * consumed, advances `position` past the closing quote and returns the
   * decoded text with the `%0A`, `%0D` and `%22` escapes resolved.
   *
   * @see https://andreubotella.github.io/multipart-form-data/#parse-a-multipart-form-data-name
   * @param {Buffer} input
   * @param {{ position: number }} position
   * @returns {string | null} the name, or null where the spec says failure
   */
  function parseMultipartFormDataName (input, position) {
    // 1. Assert: The byte at (position - 1) is 0x22 (").
    assert(input[position.position - 1] === 0x22)

    // 2. Let name be the result of collecting a sequence of bytes that are not
    //    0x0A (LF), 0x0D (CR) or 0x22 ("), given position.
    const rawName = collectASequenceOfBytes(
      (byte) => !(byte === 0x0a || byte === 0x0d || byte === 0x22),
      input,
      position
    )

    // 3. If the byte at position is not 0x22 ("), return failure. Otherwise,
    //    advance position by 1.
    if (input[position.position] !== 0x22) {
      // null stands in for failure, since 'failure' could be a legitimate name.
      return null
    }

    position.position += 1

    // 5. (done first) Decode the collected bytes as UTF-8.
    const decodedName = new TextDecoder().decode(rawName)

    // 4. Replace any occurrence of the following subsequences in name with the given byte:
    //    - `%0A`: 0x0A (LF)
    //    - `%0D`: 0x0D (CR)
    //    - `%22`: 0x22 (")
    const withLineFeeds = decodedName.replace(/%0A/ig, '\n')
    const withCarriageReturns = withLineFeeds.replace(/%0D/ig, '\r')

    return withCarriageReturns.replace(/%22/g, '"')
  }
  /**
   * Consumes bytes from `input`, starting at `position`, for as long as
   * `condition` holds, and moves `position` to the first byte that was not
   * consumed (or to the end of `input`).
   *
   * @param {(char: number) => boolean} condition
   * @param {Buffer} input
   * @param {{ position: number }} position updated in place
   * @returns {Buffer} view over the consumed bytes (possibly empty)
   */
  function collectASequenceOfBytes (condition, input, position) {
    const begin = position.position
    let end = begin

    for (; end < input.length; end++) {
      if (!condition(input[end])) {
        break
      }
    }

    position.position = end

    return input.subarray(begin, end)
  }
  /**
   * Strips bytes matching `predicate` from the start and/or the end of `buf`.
   *
   * The original buffer is returned when nothing has to be removed; otherwise
   * the result is a view over `buf`. A buffer consisting only of matching
   * bytes yields an empty view.
   *
   * @param {Buffer} buf
   * @param {boolean} leading whether to strip from the start
   * @param {boolean} trailing whether to strip from the end
   * @param {(charCode: number) => boolean} predicate
   * @returns {Buffer}
   */
  function removeChars (buf, leading, trailing, predicate) {
    let lead = 0
    let trail = buf.length - 1

    if (leading) {
      while (lead < buf.length && predicate(buf[lead])) lead++
    }

    if (trailing) {
      // Stop at `lead`, not at index 0 exclusive: otherwise the first byte
      // survives when every byte matches and only trailing removal is requested.
      while (trail >= lead && predicate(buf[trail])) trail--
    }

    return lead === 0 && trail === buf.length - 1 ? buf : buf.subarray(lead, trail + 1)
  }
  /**
   * Tells whether the bytes of `buffer` beginning at `position` match `start`.
   *
   * @param {Buffer} buffer
   * @param {Buffer} start
   * @param {{ position: number }} position
   * @returns {boolean}
   */
  function bufferStartsWith (buffer, start, position) {
    if (start.length > buffer.length) {
      return false
    }

    const offset = position.position

    // Reading past the end of `buffer` yields undefined, which never equals a byte.
    return start.every((byte, index) => byte === buffer[offset + index])
  }
  385. module.exports = {
  386. multipartFormDataParser,
  387. validateBoundary
  388. }