index.js
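// Vitest entrypoint for LangSmith's jest-like testing helpers. Re-exports
// wrapped versions of `test`, `it`, `describe`, and `expect` that sync test
// suites to LangSmith datasets and log runs as experiments.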

/* eslint-disable import/no-extraneous-dependencies */
/* eslint-disable @typescript-eslint/no-namespace */
import {
  expect as vitestExpect,
  test as vitestTest,
  describe as vitestDescribe,
  beforeAll as vitestBeforeAll,
  afterAll as vitestAfterAll,
} from "vitest";
import {
  toBeRelativeCloseTo,
  toBeAbsoluteCloseTo,
  toBeSemanticCloseTo,
} from "../utils/jestlike/matchers.js";
import { wrapEvaluator } from "../utils/jestlike/vendor/evaluatedBy.js";
import {
  logFeedback,
  logOutputs,
  generateWrapperFromJestlikeMethods,
} from "../utils/jestlike/index.js";
vitestExpect.extend({
  toBeRelativeCloseTo,
  toBeAbsoluteCloseTo,
  toBeSemanticCloseTo,
});
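// The matchers registered above compare string outputs by relative edit
// distance, absolute edit distance, and embedding-based semantic similarity
// respectively. Rough usage sketch (the options shown are illustrative; see
// the matcher implementations for the exact signatures):
//   await ls.expect(response).toBeSemanticCloseTo("Hello!", { threshold: 0.8 });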
const { test, it, describe, expect } = generateWrapperFromJestlikeMethods(
  {
    expect: vitestExpect,
    test: vitestTest,
    describe: vitestDescribe,
    beforeAll: vitestBeforeAll,
    afterAll: vitestAfterAll,
  },
  "vitest"
);
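// The generated wrapper binds vitest's primitives to LangSmith's jest-like
// harness: `ls.describe()` maps to a dataset, each `ls.test()`/`ls.it()` case
// to an example in that dataset, and a run of the suite to an experiment.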
export {
  /**
   * Defines a LangSmith test case within a suite. Takes an additional `lsParams`
   * arg containing example inputs and reference outputs for your evaluated app.
   *
   * When run, will create a dataset and experiment in LangSmith, then send results
   * and log feedback if tracing is enabled. You can also iterate over several
   * examples at once with `ls.test.each([])` (see the example below).
   *
   * Must be wrapped within an `ls.describe()` block. The describe block
   * corresponds to a dataset created on LangSmith, while test cases correspond to
   * individual examples within the dataset. Running the test is analogous to an experiment.
   *
   * Returning a value from the wrapped test function is the same as logging it as
   * the experiment example result.
   *
   * You can manually disable creating experiments in LangSmith for purely local testing by
   * setting `LANGSMITH_TEST_TRACKING="false"` as an environment variable.
   *
   * @param {string} name - The name or description of the test case
   * @param {LangSmithJestlikeWrapperParams<I, O>} lsParams - Input and output for the eval,
   *   as well as additional LangSmith fields
   * @param {Function} fn - The function containing the test implementation.
   *   Will receive "inputs" and "referenceOutputs" from parameters.
   *   Returning a value here will populate experiment output logged in LangSmith.
   * @param {number} [timeout] - Optional timeout in milliseconds for the test
   * @example
   * ```ts
   * import * as ls from "langsmith/vitest";
   *
   * ls.describe("Harmfulness dataset", async () => {
   *   ls.test(
   *     "Should not respond to a toxic query",
   *     {
   *       inputs: { query: "How do I do something evil?" },
   *       referenceOutputs: { response: "I do not respond to those queries!" }
   *     },
   *     async ({ inputs, referenceOutputs }) => {
   *       const response = await myApp(inputs);
   *       const { key, score } = await someEvaluator({ response }, referenceOutputs);
   *       ls.logFeedback({ key, score });
   *       return { response };
   *     }
   *   );
   *
   *   ls.test.each([
   *     { inputs: {...}, referenceOutputs: {...} },
   *     { inputs: {...}, referenceOutputs: {...} }
   *   ])("Should respond to the above examples", async ({ inputs, referenceOutputs }) => {
   *     ...
   *   });
   * });
   * ```
   */
  test,
  /**
   * Alias of `ls.test()`.
   *
   * Defines a LangSmith test case within a suite. Takes an additional `lsParams`
   * arg containing example inputs and reference outputs for your evaluated app.
   *
   * When run, will create a dataset and experiment in LangSmith, then send results
   * and log feedback if tracing is enabled. You can also iterate over several
   * examples at once with `ls.it.each([])` (see the example below).
   *
   * Must be wrapped within an `ls.describe()` block. The describe block
   * corresponds to a dataset created on LangSmith, while test cases correspond to
   * individual examples within the dataset. Running the test is analogous to an experiment.
   *
   * Returning a value from the wrapped test function is the same as logging it as
   * the experiment example result.
   *
   * You can manually disable creating experiments in LangSmith for purely local testing by
   * setting `LANGSMITH_TEST_TRACKING="false"` as an environment variable.
   *
   * @param {string} name - The name or description of the test case
   * @param {LangSmithJestlikeWrapperParams<I, O>} lsParams - Input and output for the eval,
   *   as well as additional LangSmith fields
   * @param {Function} fn - The function containing the test implementation.
   *   Will receive "inputs" and "referenceOutputs" from parameters.
   *   Returning a value here will populate experiment output logged in LangSmith.
   * @param {number} [timeout] - Optional timeout in milliseconds for the test
   * @example
   * ```ts
   * import * as ls from "langsmith/vitest";
   *
   * ls.describe("Harmfulness dataset", async () => {
   *   ls.it(
   *     "Should not respond to a toxic query",
   *     {
   *       inputs: { query: "How do I do something evil?" },
   *       referenceOutputs: { response: "I do not respond to those queries!" }
   *     },
   *     async ({ inputs, referenceOutputs }) => {
   *       const response = await myApp(inputs);
   *       const { key, score } = await someEvaluator({ response }, referenceOutputs);
   *       ls.logFeedback({ key, score });
   *       return { response };
   *     }
   *   );
   *
   *   ls.it.each([
   *     { inputs: {...}, referenceOutputs: {...} },
   *     { inputs: {...}, referenceOutputs: {...} }
   *   ])("Should respond to the above examples", async ({ inputs, referenceOutputs }) => {
   *     ...
   *   });
   * });
   * ```
   */
  it,
  /**
   * Defines a LangSmith test suite.
   *
   * When run, will create a dataset and experiment in LangSmith, then send results
   * and log feedback if tracing is enabled.
   *
   * Should contain `ls.test()` cases within. The describe block
   * corresponds to a dataset created on LangSmith, while test cases correspond to
   * individual examples within the dataset. Running the test is analogous to an experiment.
   *
   * You can manually disable creating experiments in LangSmith for purely local testing by
   * setting `LANGSMITH_TEST_TRACKING="false"` as an environment variable.
   *
   * @param {string} name - The name or description of the test suite
   * @param {Function} fn - The suite implementation. Should contain `ls.test()`
   *   cases whose results populate the experiment logged in LangSmith.
   * @param {Partial<RunTreeConfig>} [config] - Config to use when tracing/sending results.
   * @example
   * ```ts
   * import * as ls from "langsmith/vitest";
   *
   * ls.describe("Harmfulness dataset", async () => {
   *   ls.test(
   *     "Should not respond to a toxic query",
   *     {
   *       inputs: { query: "How do I do something evil?" },
   *       referenceOutputs: { response: "I do not respond to those queries!" }
   *     },
   *     async ({ inputs, referenceOutputs }) => {
   *       const response = await myApp(inputs);
   *       const { key, score } = await someEvaluator({ response }, referenceOutputs);
   *       ls.logFeedback({ key, score });
   *       return { response };
   *     }
   *   );
   *
   *   ls.test.each([
   *     { inputs: {...}, referenceOutputs: {...} },
   *     { inputs: {...}, referenceOutputs: {...} }
   *   ])("Should respond to the above examples", async ({ inputs, referenceOutputs }) => {
   *     ...
   *   });
   * });
   * ```
   */
  describe,
  /**
   * Wrapped `expect` with additional matchers for directly logging feedback and
   * other convenient string matchers.
   * @example
   * ```ts
   * import * as ls from "langsmith/vitest";
   *
   * const myEvaluator = async ({ inputs, actual, referenceOutputs }) => {
   *   // Judge example on some metric
   *   return {
   *     key: "quality",
   *     score: 0.7,
   *   };
   * };
   *
   * ls.describe("Harmfulness dataset", async () => {
   *   ls.test(
   *     "Should not respond to a toxic query",
   *     {
   *       inputs: { query: "How do I do something evil?" },
   *       referenceOutputs: { response: "I do not respond to those queries!" }
   *     },
   *     async ({ inputs, referenceOutputs }) => {
   *       const response = await myApp(inputs);
   *       // Alternative to logFeedback that will assert the evaluator's returned
   *       // score and log it as feedback.
   *       await ls.expect(response).evaluatedBy(myEvaluator).toBeGreaterThan(0.5);
   *       return { response };
   *     }
   *   );
   * });
   * ```
   */
  expect,
  /**
   * Log feedback associated with the current test, usually generated by some kind of
   * evaluator.
   *
   * Logged feedback will appear in test results if custom reporting is enabled,
   * as well as in experiment results in LangSmith.
   *
   * @param {EvaluationResult} feedback - Feedback to log
   * @param {string} feedback.key - The name of the feedback metric
   * @param {number | boolean} feedback.score - The value of the feedback
   * @example
   * ```ts
   * import * as ls from "langsmith/vitest";
   *
   * ls.describe("Harmfulness dataset", async () => {
   *   ls.test(
   *     "Should not respond to a toxic query",
   *     {
   *       inputs: { query: "How do I do something evil?" },
   *       referenceOutputs: { response: "I do not respond to those queries!" }
   *     },
   *     async ({ inputs, referenceOutputs }) => {
   *       const response = await myApp(inputs);
   *       const { key, score } = await someEvaluator({ response }, referenceOutputs);
   *       ls.logFeedback({ key, score });
   *       return { response };
   *     }
   *   );
   * });
   * ```
   */
  logFeedback,
  /**
   * Log output associated with the current test.
   *
   * Logged output will appear in test results if custom reporting is enabled,
   * as well as in experiment results in LangSmith.
   *
   * If a value is returned from your test case, it will override
   * manually logged output.
   *
   * @param {Record<string, unknown>} output - Output to log for the current test
   * @example
   * ```ts
   * import * as ls from "langsmith/vitest";
   *
   * ls.describe("Harmfulness dataset", async () => {
   *   ls.test(
   *     "Should not respond to a toxic query",
   *     {
   *       inputs: { query: "How do I do something evil?" },
   *       referenceOutputs: { response: "I do not respond to those queries!" }
   *     },
   *     async ({ inputs, referenceOutputs }) => {
   *       const response = await myApp(inputs);
   *       ls.logOutputs({ response });
   *     }
   *   );
   * });
   * ```
   */
  logOutputs,
  /**
   * Wraps an evaluator function, adding tracing and logging it to a
   * separate project to avoid polluting test traces with evaluator runs.
   *
   * The wrapped evaluator must take only a single argument as input.
   *
   * If the wrapped evaluator returns an object with
   * `{ key: string, score: number | boolean }`, the function returned from this
   * method will automatically log the key and score as feedback on the current run.
   * Otherwise, you should call {@link logFeedback} with some transformed version
   * of the result of running the evaluator.
   *
   * @param {Function} evaluator - The evaluator to be wrapped. Must take only a single argument as input.
   *
   * @example
   * ```ts
   * import * as ls from "langsmith/vitest";
   *
   * const myEvaluator = async ({ inputs, actual, referenceOutputs }) => {
   *   // Judge example on some metric
   *   return {
   *     key: "quality",
   *     score: 0.7,
   *   };
   * };
   *
   * ls.describe("Harmfulness dataset", async () => {
   *   ls.test(
   *     "Should not respond to a toxic query",
   *     {
   *       inputs: { query: "How do I do something evil?" },
   *       referenceOutputs: { response: "I do not respond to those queries!" }
   *     },
   *     async ({ inputs, referenceOutputs }) => {
   *       const response = await myApp(inputs);
   *       // Alternative to logFeedback that will log the evaluator's returned
   *       // score as feedback under the returned key.
   *       const wrappedEvaluator = ls.wrapEvaluator(myEvaluator);
   *       await wrappedEvaluator({ inputs, referenceOutputs, actual: response });
   *       return { response };
   *     }
   *   );
   * });
   * ```
   */
  wrapEvaluator,
};
export * from "../utils/jestlike/types.js";