_runner.d.ts

import { Client } from "../index.js";
import { AttachmentInfo, Example, KVMap, Run, TracerSession } from "../schemas.js";
import { EvaluationResult, EvaluationResults, RunEvaluator } from "./evaluator.js";
import { ComparisonEvaluationResults, ComparativeEvaluator } from "./evaluate_comparative.js";
export type TargetConfigT = KVMap & {
    attachments?: Record<string, AttachmentInfo>;
    callbacks?: any;
};
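/**
 * A standard target is either a plain function (sync or async) or an object
 * exposing `invoke`. Illustrative sketch only; the names and input/output
 * shapes below are hypothetical, not part of this API:
 * @example
 * const fnTarget = async (input: { question: string }) => ({
 *   answer: `echo: ${input.question}`,
 * });
 * const invokeTarget = {
 *   invoke: (input: { question: string }) => ({ answer: input.question }),
 * };
 */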
type StandardTargetT<TInput = any, TOutput = KVMap> = ((input: TInput, config?: TargetConfigT) => Promise<TOutput>) | ((input: TInput, config?: TargetConfigT) => TOutput) | {
    invoke: (input: TInput, config?: TargetConfigT) => TOutput;
} | {
    invoke: (input: TInput, config?: TargetConfigT) => Promise<TOutput>;
};
type ComparativeTargetT = Array<string> | Array<Promise<ExperimentResults> | ExperimentResults>;
export type TargetT<TInput = any, TOutput = KVMap> = StandardTargetT<TInput, TOutput> | ComparativeTargetT;
export type DataT = string | AsyncIterable<Example> | Example[];
/** @deprecated Use object parameter version instead: (args: { runs, examples, inputs, outputs, referenceOutputs }) => ... */
type DeprecatedSyncSummaryEvaluator = (runs: Array<Run>, examples: Array<Example>) => EvaluationResult | EvaluationResult[] | EvaluationResults;
/** @deprecated Use object parameter version instead: (args: { runs, examples, inputs, outputs, referenceOutputs }) => ... */
type DeprecatedAsyncSummaryEvaluator = (runs: Array<Run>, examples: Array<Example>) => Promise<EvaluationResult | EvaluationResult[] | EvaluationResults>;
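/**
 * A summary evaluator scores the experiment as a whole rather than a single
 * run. Hedged sketch of the non-deprecated object-parameter form; the
 * `pass_rate` key is a hypothetical feedback name:
 * @example
 * const passRate: SummaryEvaluatorT = ({ runs, examples }) => ({
 *   key: "pass_rate",
 *   score: runs.filter((r) => !r.error).length / examples.length,
 * });
 */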
export type SummaryEvaluatorT = DeprecatedSyncSummaryEvaluator | DeprecatedAsyncSummaryEvaluator | ((args: {
    runs: Array<Run>;
    examples: Array<Example>;
    inputs: Array<Record<string, any>>;
    outputs: Array<Record<string, any>>;
    referenceOutputs?: Array<Record<string, any>>;
}) => EvaluationResult | EvaluationResult[] | EvaluationResults) | ((args: {
    runs: Array<Run>;
    examples: Array<Example>;
    inputs: Array<Record<string, any>>;
    outputs: Array<Record<string, any>>;
    referenceOutputs?: Array<Record<string, any>>;
}) => Promise<EvaluationResult | EvaluationResult[] | EvaluationResults>);
/** @deprecated Use object parameter version instead: (args: { run, example, inputs, outputs, referenceOutputs }) => ... */
type DeprecatedRunEvaluator = RunEvaluator;
/** @deprecated Use object parameter version instead: (args: { run, example, inputs, outputs, referenceOutputs }) => ... */
type DeprecatedFunctionEvaluator = (run: Run, example?: Example) => EvaluationResult | EvaluationResult[] | EvaluationResults;
/** @deprecated Use object parameter version instead: (args: { run, example, inputs, outputs, referenceOutputs }) => ... */
type DeprecatedAsyncFunctionEvaluator = (run: Run, example?: Example) => Promise<EvaluationResult | EvaluationResult[] | EvaluationResults>;
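/**
 * Hedged sketch of the non-deprecated object-parameter evaluator form; the
 * `answer` field and the `exact_match` feedback key are hypothetical:
 * @example
 * const exactMatch: EvaluatorT = ({ outputs, referenceOutputs }) => ({
 *   key: "exact_match",
 *   score: outputs.answer === referenceOutputs?.answer,
 * });
 */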
export type EvaluatorT = DeprecatedRunEvaluator | DeprecatedFunctionEvaluator | DeprecatedAsyncFunctionEvaluator | ((args: {
    run: Run;
    example: Example;
    inputs: Record<string, any>;
    outputs: Record<string, any>;
    referenceOutputs?: Record<string, any>;
    attachments?: Record<string, any>;
}) => EvaluationResult | EvaluationResult[] | EvaluationResults) | ((args: {
    run: Run;
    example: Example;
    inputs: Record<string, any>;
    outputs: Record<string, any>;
    referenceOutputs?: Record<string, any>;
    attachments?: Record<string, any>;
}) => Promise<EvaluationResult | EvaluationResult[] | EvaluationResults>);
interface _ForwardResults {
    run: Run;
    example: Example;
}
interface _ExperimentManagerArgs {
    data?: DataT;
    experiment?: TracerSession | string;
    metadata?: KVMap;
    client?: Client;
    runs?: AsyncGenerator<Run>;
    evaluationResults?: AsyncGenerator<EvaluationResults>;
    summaryResults?: AsyncGenerator<(runsArray: Run[]) => AsyncGenerator<EvaluationResults, any, unknown>, any, unknown>;
    examples?: Example[];
    numRepetitions?: number;
    _runsArray?: Run[];
    includeAttachments?: boolean;
}
type BaseEvaluateOptions = {
    /**
     * Metadata to attach to the experiment.
     * @default undefined
     */
    metadata?: KVMap;
    /**
     * A prefix to provide for your experiment name.
     * @default undefined
     */
    experimentPrefix?: string;
    /**
     * A free-form description of the experiment.
     */
    description?: string;
    /**
     * The maximum number of concurrent evaluations to run.
     * @default undefined
     */
    maxConcurrency?: number;
    /**
     * The LangSmith client to use.
     * @default undefined
     */
    client?: Client;
    /**
     * The number of repetitions to perform. Each example
     * will be run this many times.
     * @default 1
     */
    numRepetitions?: number;
};
export interface EvaluateOptions extends BaseEvaluateOptions {
    /**
     * A list of evaluators to run on each example.
     * @default undefined
     */
    evaluators?: Array<EvaluatorT>;
    /**
     * A list of summary evaluators to run on the entire dataset.
     * @default undefined
     */
    summaryEvaluators?: Array<SummaryEvaluatorT>;
    /**
     * The dataset to evaluate on. Can be a dataset name, a list of
     * examples, or a generator of examples.
     */
    data: DataT;
    /**
     * Whether to use attachments for the experiment.
     * @default false
     */
    includeAttachments?: boolean;
}
export interface ComparativeEvaluateOptions extends BaseEvaluateOptions {
    /**
     * A list of evaluators to run on each example.
     */
    evaluators: Array<ComparativeEvaluator>;
    /**
     * Whether to load all child runs for the experiment.
     * @default false
     */
    loadNested?: boolean;
    /**
     * Whether to randomize the order of outputs for each evaluation.
     * @default false
     */
    randomizeOrder?: boolean;
}
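/**
 * Comparative mode: pass two or more completed experiments (by name/ID, or as
 * `ExperimentResults`) to score their outputs against each other. Hedged
 * sketch; the experiment names and evaluator are hypothetical:
 * @example
 * const comparison = await evaluate(["experiment-a", "experiment-b"], {
 *   evaluators: [myComparativeEvaluator],
 *   randomizeOrder: true,
 * });
 */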
export declare function evaluate(target: ComparativeTargetT, options: ComparativeEvaluateOptions): Promise<ComparisonEvaluationResults>;
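/**
 * Standard mode: run a target function or runnable over a dataset. Hedged
 * sketch; the dataset name and the `exactMatch` evaluator are hypothetical:
 * @example
 * const results = await evaluate(
 *   async (input: { question: string }) => ({ answer: input.question }),
 *   { data: "my-dataset", evaluators: [exactMatch], maxConcurrency: 4 }
 * );
 */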
export declare function evaluate(target: StandardTargetT, options: EvaluateOptions): Promise<ExperimentResults>;
export interface ExperimentResultRow {
    run: Run;
    example: Example;
    evaluationResults: EvaluationResults;
}
/**
 * Manage the execution of experiments.
 *
 * Supports lazily running predictions and evaluations in parallel to facilitate
 * result streaming and early debugging.
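 *
 * Hedged usage sketch of the lazy pipeline (internal API; `target` and
 * `evaluators` below are hypothetical placeholders):
 * @example
 * let manager = await new _ExperimentManager({ data: "my-dataset" }).start();
 * manager = await manager.withPredictions(target);
 * manager = await manager.withEvaluators(evaluators);
 * for await (const row of manager.getResults()) {
 *   console.log(row.evaluationResults);
 * }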
 */
export declare class _ExperimentManager {
    _data?: DataT;
    _runs?: AsyncGenerator<Run>;
    _evaluationResults?: AsyncGenerator<EvaluationResults>;
    _summaryResults?: AsyncGenerator<(runsArray: Run[]) => AsyncGenerator<EvaluationResults, any, unknown>, any, unknown>;
    _examples?: Example[];
    _numRepetitions?: number;
    _runsArray?: Run[];
    client: Client;
    _experiment?: TracerSession;
    _experimentName: string;
    _metadata: KVMap;
    _description?: string;
    _includeAttachments?: boolean;
    get experimentName(): string;
    getExamples(): Promise<Array<Example>>;
    setExamples(examples: Example[]): void;
    get datasetId(): Promise<string>;
    get evaluationResults(): AsyncGenerator<EvaluationResults>;
    get runs(): AsyncGenerator<Run>;
    constructor(args: _ExperimentManagerArgs);
    _getExperiment(): TracerSession;
    _getExperimentMetadata(): Promise<KVMap>;
    _createProject(firstExample: Example, projectMetadata: KVMap): Promise<TracerSession>;
    _getProject(firstExample: Example): Promise<TracerSession>;
    protected _printExperimentStart(): Promise<void>;
    start(): Promise<_ExperimentManager>;
    withPredictions(target: StandardTargetT, options?: {
        maxConcurrency?: number;
    }): Promise<_ExperimentManager>;
    withEvaluators(evaluators: Array<EvaluatorT | RunEvaluator>, options?: {
        maxConcurrency?: number;
    }): Promise<_ExperimentManager>;
    withSummaryEvaluators(summaryEvaluators: Array<SummaryEvaluatorT>): Promise<_ExperimentManager>;
    getResults(): AsyncGenerator<ExperimentResultRow>;
    getSummaryScores(): Promise<EvaluationResults>;
    /**
     * Run the target function or runnable on the examples.
     * @param {StandardTargetT} target The target function or runnable to evaluate.
     * @param options
     * @returns {AsyncGenerator<_ForwardResults>} An async generator of the results.
     */
    _predict(target: StandardTargetT, options?: {
        maxConcurrency?: number;
    }): AsyncGenerator<_ForwardResults>;
    _runEvaluators(evaluators: Array<RunEvaluator>, currentResults: ExperimentResultRow, fields: {
        client: Client;
    }): Promise<ExperimentResultRow>;
    /**
     * Run the evaluators on the prediction stream.
     * Expects runs to be available in the manager
     * (e.g. from a previous prediction step).
     * @param {Array<RunEvaluator>} evaluators
     * @param {number} maxConcurrency
     */
    _score(evaluators: Array<RunEvaluator>, options?: {
        maxConcurrency?: number;
    }): AsyncGenerator<ExperimentResultRow>;
    _applySummaryEvaluators(summaryEvaluators: Array<SummaryEvaluatorT>): AsyncGenerator<(runsArray: Run[]) => AsyncGenerator<EvaluationResults>>;
    _getDatasetVersion(): Promise<string | undefined>;
    _getDatasetSplits(): Promise<string[] | undefined>;
    _end(): Promise<void>;
}
/**
 * Represents the results of an evaluate() call.
 * This class provides an iterator interface to iterate over the experiment results
 * as they become available. It also provides methods to access the experiment name,
 * the number of results, and to wait for the results to be processed.
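 *
 * Streaming consumption sketch (`target` and the dataset name are
 * hypothetical):
 * @example
 * const results = await evaluate(target, { data: "my-dataset" });
 * for await (const row of results) {
 *   console.log(row.run.id, row.evaluationResults);
 * }
 * console.log(results.length, results.summaryResults);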
 */
declare class ExperimentResults implements AsyncIterableIterator<ExperimentResultRow> {
    private manager;
    results: ExperimentResultRow[];
    processedCount: number;
    summaryResults: EvaluationResults;
    constructor(experimentManager: _ExperimentManager);
    get experimentName(): string;
    [Symbol.asyncIterator](): AsyncIterableIterator<ExperimentResultRow>;
    next(): Promise<IteratorResult<ExperimentResultRow>>;
    processData(manager: _ExperimentManager): Promise<void>;
    get length(): number;
}
export {};