123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221 |
- "use strict";
- var __importDefault = (this && this.__importDefault) || function (mod) {
- return (mod && mod.__esModule) ? mod : { "default": mod };
- };
- Object.defineProperty(exports, "__esModule", { value: true });
- exports.evaluateComparative = evaluateComparative;
- const uuid_1 = require("uuid");
- const index_js_1 = require("../index.cjs");
- const shuffle_js_1 = require("../utils/shuffle.cjs");
- const async_caller_js_1 = require("../utils/async_caller.cjs");
- const p_retry_1 = __importDefault(require("p-retry"));
- const traceable_js_1 = require("../traceable.cjs");
/**
 * Reports whether the given experiment references include anything other
 * than plain strings.
 *
 * @param {Array<unknown>} value - Experiment references to inspect.
 * @returns {boolean} `true` when at least one entry is not a string,
 *   `false` when every entry is a string (including the empty list).
 */
function isExperimentResultsList(value) {
    const allAreStrings = value.every((entry) => typeof entry === "string");
    return !allAreStrings;
}
/**
 * Looks up the project behind an experiment reference.
 *
 * @param client - Object exposing `readProject`.
 * @param experiment - Either a string, or an object whose `experimentName`
 *   is used as the identifier.
 * @returns Whatever `client.readProject` returns; the project is queried by
 *   `projectId` when the identifier passes `uuid.validate`, otherwise by
 *   `projectName`.
 */
async function loadExperiment(client, experiment) {
    let identifier;
    if (typeof experiment === "string") {
        identifier = experiment;
    }
    else {
        identifier = experiment.experimentName;
    }
    const query = (0, uuid_1.validate)(identifier)
        ? { projectId: identifier }
        : { projectName: identifier };
    return client.readProject(query);
}
/**
 * Lists the runs of an experiment and returns the ones without a parent,
 * attaching nested runs to their parent's `child_runs`.
 *
 * @param client - Object exposing `listRuns`, whose result is consumed with
 *   `for await`.
 * @param {string} experiment - Project identifier; sent as `projectId` when
 *   it passes `uuid.validate`, otherwise as `projectName`.
 * @param {{ loadNested?: boolean }} options - When `loadNested` is falsy the
 *   listing is restricted with `executionOrder: 1`; otherwise no
 *   `executionOrder` filter is applied.
 * @returns {Promise<Array>} Runs whose `parent_run_id` is null/undefined, in
 *   listing order.
 * @throws {TypeError} If a listed run names a parent that is not itself part
 *   of the listing (the parent lookup yields `undefined`).
 */
async function loadTraces(client, experiment, options) {
    const executionOrder = options.loadNested ? undefined : 1;
    const filter = (0, uuid_1.validate)(experiment)
        ? { projectId: experiment, executionOrder }
        : { projectName: experiment, executionOrder };
    const listedRuns = await client.listRuns(filter);
    const rootRuns = [];
    const runsById = {};
    const childrenByParentId = {};
    for await (const run of listedRuns) {
        runsById[run.id] = run;
        if (run.parent_run_id == null) {
            rootRuns.push(run);
            continue;
        }
        const siblings = (childrenByParentId[run.parent_run_id] ??= []);
        siblings.push(run);
    }
    // Children are ordered by `dotted_order`; a pair where either side lacks
    // it is treated as equal.
    const byDottedOrder = (left, right) => {
        if (left.dotted_order == null || right.dotted_order == null) {
            return 0;
        }
        return left.dotted_order.localeCompare(right.dotted_order);
    };
    for (const parentId of Object.keys(childrenByParentId)) {
        const parent = runsById[parentId];
        parent.child_runs = childrenByParentId[parentId].sort(byDottedOrder);
    }
    return rootRuns;
}
/**
 * Runs comparative evaluators across the runs of two or more experiments and
 * records every returned score as feedback linked to a newly created
 * comparative experiment.
 *
 * Steps, in order:
 *  1. Resolve each experiment to its project (retrying with `p-retry` while
 *     the project's `run_count` differs from the supplied `results.length`
 *     when result objects were passed in).
 *  2. Require a single shared `reference_dataset_id`; warn when the projects'
 *     `dataset_version` metadata differs.
 *  3. Create the comparative experiment and log a comparison URL when one
 *     can be built.
 *  4. Load the runs of each project, keep only examples referenced by every
 *     experiment, and fetch those examples in chunks of 99 IDs.
 *  5. Invoke every evaluator once per shared example through an
 *     `AsyncCaller`, then submit one feedback entry per scored run.
 *
 * @param {Array} experiments - At least two experiment references (or
 *   promises of them): strings or objects with `experimentName`/`results`.
 * @param {object} options
 * @param {Array<Function>} options.evaluators - Non-empty list. An evaluator
 *   declaring exactly one parameter is called with
 *   `{ runs, example, inputs, outputs, referenceOutputs }`; any other arity
 *   is called as `(runs, example)`. Each must return `{ key, scores }` where
 *   `scores` is keyed by run ID.
 * @param [options.client] - Client to use; defaults to `new Client()`.
 * @param {number} [options.maxConcurrency] - Forwarded to `AsyncCaller`;
 *   negative values are rejected.
 * @param {string} [options.experimentPrefix] - Name prefix; when absent the
 *   project names joined with " vs. " are used.
 * @param [options.description] - Forwarded to `createComparativeExperiment`.
 * @param [options.metadata] - Forwarded to `createComparativeExperiment`.
 * @param {boolean} [options.loadNested] - Forwarded to `loadTraces`.
 * @param {boolean} [options.randomizeOrder] - Shuffle the runs handed to
 *   single-parameter evaluators; `outputs` follows the same shuffled order.
 * @returns {Promise<{ experimentName: string, results: Array }>}
 * @throws {Error} On invalid arguments, mismatched or missing reference
 *   dataset, no examples in common, an example that could not be fetched,
 *   or an evaluator scoring a run ID it was not given.
 */
async function evaluateComparative(experiments, options) {
    if (experiments.length < 2) {
        throw new Error("Comparative evaluation requires at least 2 experiments.");
    }
    if (!options.evaluators.length) {
        throw new Error("At least one evaluator is required for comparative evaluation.");
    }
    if (options.maxConcurrency && options.maxConcurrency < 0) {
        throw new Error("maxConcurrency must be a positive number.");
    }
    const client = options.client ?? new index_js_1.Client();
    const resolvedExperiments = await Promise.all(experiments);
    const projects = await (() => {
        if (!isExperimentResultsList(resolvedExperiments)) {
            return Promise.all(resolvedExperiments.map((experiment) => loadExperiment(client, experiment)));
        }
        // if we know the number of runs beforehand, check if the
        // number of runs in the project matches the expected number of runs
        return Promise.all(resolvedExperiments.map((experiment) => (0, p_retry_1.default)(async () => {
            const project = await loadExperiment(client, experiment);
            if (project.run_count !== experiment?.results.length) {
                throw new Error("Experiment is missing runs. Retrying.");
            }
            return project;
        }, { factor: 2, minTimeout: 1000, retries: 10 })));
    })();
    if (new Set(projects.map((p) => p.reference_dataset_id)).size > 1) {
        throw new Error("All experiments must have the same reference dataset.");
    }
    const referenceDatasetId = projects.at(0)?.reference_dataset_id;
    if (!referenceDatasetId) {
        throw new Error("Reference dataset is required for comparative evaluation.");
    }
    if (new Set(projects.map((p) => p.extra?.metadata?.dataset_version)).size > 1) {
        console.warn("Detected multiple dataset versions used by experiments, which may lead to inaccurate results.");
    }
    const datasetVersion = projects.at(0)?.extra?.metadata?.dataset_version;
    const projectIds = projects.map((p) => p.id);
    const id = (0, uuid_1.v4)();
    const experimentName = (() => {
        if (!options.experimentPrefix) {
            const names = projects
                .map((p) => p.name)
                .filter(Boolean)
                .join(" vs. ");
            return `${names}-${(0, uuid_1.v4)().slice(0, 4)}`;
        }
        return `${options.experimentPrefix}-${(0, uuid_1.v4)().slice(0, 4)}`;
    })();
    // TODO: add URL to the comparative experiment
    console.log(`Starting pairwise evaluation of: ${experimentName}`);
    const comparativeExperiment = await client.createComparativeExperiment({
        id,
        name: experimentName,
        experimentIds: projectIds,
        description: options.description,
        metadata: options.metadata,
        referenceDatasetId,
    });
    const viewUrl = await (async () => {
        const projectId = projects.at(0)?.id ?? projects.at(1)?.id;
        const datasetId = comparativeExperiment?.reference_dataset_id;
        if (projectId && datasetId) {
            // Keep only the part of the project URL that precedes "/projects/p/".
            const hostUrl = (await client.getProjectUrl({ projectId }))
                .split("/projects/p/")
                .at(0);
            const result = new URL(`${hostUrl}/datasets/${datasetId}/compare`);
            result.searchParams.set("selectedSessions", projectIds.join(","));
            result.searchParams.set("comparativeExperiment", comparativeExperiment.id);
            return result.toString();
        }
        return null;
    })();
    if (viewUrl != null) {
        console.log(`View results at: ${viewUrl}`);
    }
    const experimentRuns = await Promise.all(projects.map((p) => loadTraces(client, p.id, { loadNested: !!options.loadNested })));
    // Intersect the example IDs referenced by each experiment's runs.
    let exampleIdsIntersect;
    for (const runs of experimentRuns) {
        const exampleIdsSet = new Set(runs
            .map((r) => r.reference_example_id)
            .filter((x) => x != null));
        if (!exampleIdsIntersect) {
            exampleIdsIntersect = exampleIdsSet;
        }
        else {
            exampleIdsIntersect = new Set([...exampleIdsIntersect].filter((x) => exampleIdsSet.has(x)));
        }
    }
    const sharedExampleIds = exampleIdsIntersect ?? new Set();
    const exampleIds = [...sharedExampleIds];
    if (!exampleIds.length) {
        throw new Error("No examples found in common between experiments.");
    }
    const exampleMap = {};
    for (let start = 0; start < exampleIds.length; start += 99) {
        const exampleIdsChunk = exampleIds.slice(start, start + 99);
        for await (const example of client.listExamples({
            datasetId: referenceDatasetId,
            exampleIds: exampleIdsChunk,
            asOf: datasetVersion,
        })) {
            exampleMap[example.id] = example;
        }
    }
    const runMapByExampleId = {};
    for (const runs of experimentRuns) {
        for (const run of runs) {
            // Set lookup keeps this linear in the number of runs.
            if (run.reference_example_id == null ||
                !sharedExampleIds.has(run.reference_example_id)) {
                continue;
            }
            runMapByExampleId[run.reference_example_id] ??= [];
            runMapByExampleId[run.reference_example_id].push(run);
        }
    }
    // Fail before any evaluator call is scheduled, so a missing example can
    // never leave already-started evaluations (and their feedback writes)
    // running unobserved in the background.
    const missingExampleId = Object.keys(runMapByExampleId).find((exampleId) => !exampleMap[exampleId]);
    if (missingExampleId !== undefined) {
        throw new Error(`Example ${missingExampleId} not found.`);
    }
    const caller = new async_caller_js_1.AsyncCaller({
        maxConcurrency: options.maxConcurrency,
        debug: client.debug,
    });
    // Calls an evaluator using the argument shape implied by its declared
    // arity: one parameter means the object form, anything else means the
    // positional `(runs, example)` form.
    // NOTE(review): `randomizeOrder` is only applied to the object form, as
    // in the previous implementation; positional evaluators get the runs in
    // their loaded order. Confirm whether that is intended.
    function invokeEvaluator(evaluator, runs, example) {
        if (evaluator.length !== 1) {
            return evaluator(runs, example);
        }
        const orderedRuns = options.randomizeOrder ? (0, shuffle_js_1.shuffle)(runs) : runs;
        return evaluator({
            runs: orderedRuns,
            example,
            inputs: example.inputs,
            // Derived from the same (possibly shuffled) list so that
            // outputs[i] always belongs to runs[i].
            outputs: orderedRuns.map((run) => run.outputs || {}),
            referenceOutputs: example.outputs || {},
        });
    }
    async function evaluateAndSubmitFeedback(runs, example, evaluator) {
        const expectedRunIds = new Set(runs.map((r) => r.id));
        const result = await invokeEvaluator(evaluator, runs, example);
        for (const [runId, score] of Object.entries(result.scores)) {
            // Reject scores for runs that were not part of this comparison.
            if (!expectedRunIds.has(runId)) {
                throw new Error(`Returning an invalid run id ${runId} from evaluator.`);
            }
            await client.createFeedback(runId, result.key, {
                score,
                sourceRunId: result.source_run_id,
                comparativeExperimentId: comparativeExperiment.id,
            });
        }
        return result;
    }
    const tracedEvaluators = options.evaluators.map((evaluator) => (0, traceable_js_1.traceable)(async (runs, example) => {
        const evaluatorRun = (0, traceable_js_1.getCurrentRunTree)();
        const result = await invokeEvaluator(evaluator, runs, example);
        // sanitise the payload before sending to LangSmith
        evaluatorRun.inputs = { runs, example };
        evaluatorRun.outputs = result;
        return {
            ...result,
            // Fall back to the traced evaluator run as the feedback source.
            source_run_id: result.source_run_id ?? evaluatorRun.id,
        };
    }, {
        project_name: "evaluators",
        name: evaluator.name || "evaluator",
    }));
    const promises = Object.entries(runMapByExampleId).flatMap(([exampleId, runs]) => tracedEvaluators.map((evaluator) => caller.call(evaluateAndSubmitFeedback, runs, exampleMap[exampleId], evaluator)));
    const results = await Promise.all(promises);
    return { experimentName, results };
}
|