reporter.js 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256
  1. import { Table } from "console-table-printer";
  2. import chalk from "chalk";
  3. import * as os from "node:os";
  4. import * as path from "node:path";
  5. import * as fs from "node:fs/promises";
  6. import { STRIP_ANSI_REGEX, TEST_ID_DELIMITER } from "./constants.js";
  7. const FEEDBACK_COLLAPSE_THRESHOLD = 48;
  8. const MAX_TEST_PARAMS_LENGTH = 18;
  9. const RESERVED_KEYS = [
  10. "Name",
  11. "Result",
  12. "Inputs",
  13. "Reference Outputs",
  14. "Outputs",
  15. "pass",
  16. ];
  17. function formatTestName(name, duration) {
  18. if (duration != null) {
  19. return `${name} (${duration}ms)`;
  20. }
  21. else {
  22. return name;
  23. }
  24. }
  25. function getFormattedStatus(status) {
  26. const s = status.toLowerCase();
  27. if (s === "pending" || s === "skipped") {
  28. return chalk.yellow("○ Skipped");
  29. }
  30. else if (s.includes("pass")) {
  31. return chalk.green("✓ Passed");
  32. }
  33. else if (s.includes("fail")) {
  34. return chalk.red("✕ Failed");
  35. }
  36. else {
  37. return status;
  38. }
  39. }
  40. function getColorParam(status) {
  41. const s = status.toLowerCase();
  42. if (s === "pending" || s === "skipped") {
  43. return { color: "yellow" };
  44. }
  45. else if (s.includes("pass")) {
  46. return { color: "grey" };
  47. }
  48. else if (s.includes("fail")) {
  49. return { color: "red" };
  50. }
  51. else {
  52. return {};
  53. }
  54. }
  55. function formatValue(value) {
  56. if (typeof value === "object" && value !== null) {
  57. return Object.entries(value)
  58. .map(([k, v]) => {
  59. const rawValue = typeof v === "string" ? v : JSON.stringify(v);
  60. const rawEntry = `${k}: ${rawValue}`;
  61. const entry = rawEntry.length > MAX_TEST_PARAMS_LENGTH
  62. ? rawEntry.slice(0, MAX_TEST_PARAMS_LENGTH - 3) + "..."
  63. : rawEntry;
  64. return entry;
  65. })
  66. .join("\n");
  67. }
  68. if (value == null) {
  69. return;
  70. }
  71. return String(value);
  72. }
  73. export async function printReporterTable(testSuiteName, results, testStatus, failureMessage) {
  74. const rows = [];
  75. const feedbackKeys = new Set();
  76. let experimentUrl;
  77. for (const result of results) {
  78. const { title, duration, status } = result;
  79. const titleComponents = title.split(TEST_ID_DELIMITER);
  80. const testId = titleComponents.length > 1 && titleComponents.at(-1) !== undefined
  81. ? titleComponents.at(-1)
  82. : undefined;
  83. const testName = testId !== undefined
  84. ? titleComponents.slice(0, -1).join(TEST_ID_DELIMITER).trim()
  85. : titleComponents.join(TEST_ID_DELIMITER);
  86. // Non-LangSmith test
  87. if (testId === undefined) {
  88. rows.push([
  89. {
  90. Test: formatTestName(testName, duration),
  91. Status: getFormattedStatus(status),
  92. },
  93. getColorParam(status),
  94. ]);
  95. }
  96. else if (status === "pending" || status === "skipped") {
  97. // Skipped
  98. rows.push([
  99. {
  100. Test: formatTestName(testName, duration),
  101. Status: getFormattedStatus(status),
  102. },
  103. getColorParam(status),
  104. ]);
  105. }
  106. else {
  107. const resultsPath = path.join(os.tmpdir(), "langsmith_test_results", `${testId}.json`);
  108. let fileContent;
  109. try {
  110. fileContent = JSON.parse(await fs.readFile(resultsPath, "utf-8"));
  111. await fs.unlink(resultsPath);
  112. }
  113. catch (e) {
  114. console.log("[LANGSMITH]: Failed to read custom evaluation results. Please contact us for help.");
  115. rows.push([
  116. {
  117. Test: formatTestName(testName, duration),
  118. Status: getFormattedStatus(status),
  119. },
  120. getColorParam(status),
  121. ]);
  122. continue;
  123. }
  124. const feedback = fileContent.feedback.reduce((acc, current) => {
  125. if (!RESERVED_KEYS.includes(current.key) &&
  126. current.score !== undefined) {
  127. feedbackKeys.add(current.key);
  128. acc[current.key] = current.score;
  129. }
  130. return acc;
  131. }, {});
  132. experimentUrl = experimentUrl ?? fileContent.experimentUrl;
  133. rows.push([
  134. {
  135. Test: formatTestName(testName, duration),
  136. Inputs: formatValue(fileContent.inputs),
  137. "Reference Outputs": formatValue(fileContent.referenceOutputs),
  138. Outputs: formatValue(fileContent.outputs),
  139. Status: getFormattedStatus(status),
  140. ...feedback,
  141. },
  142. getColorParam(status),
  143. ]);
  144. }
  145. }
  146. const feedbackKeysTotalLength = [...feedbackKeys].reduce((l, key) => l + key.length, 0);
  147. const collapseFeedbackColumn = feedbackKeysTotalLength > FEEDBACK_COLLAPSE_THRESHOLD;
  148. for (const key of feedbackKeys) {
  149. const scores = rows
  150. .map(([row]) => row[key])
  151. .filter((score) => score !== undefined);
  152. if (scores.length > 0) {
  153. const mean = scores.reduce((a, b) => a + b, 0) / scores.length;
  154. const stdDev = Math.sqrt(scores.reduce((sq, n) => sq + Math.pow(n - mean, 2), 0) / scores.length);
  155. for (const row of rows) {
  156. const score = row[0][key];
  157. if (score !== undefined) {
  158. const deviation = (score - mean) / stdDev;
  159. let coloredKey;
  160. let coloredScore;
  161. if (isNaN(deviation)) {
  162. coloredKey = chalk.white(`${key}:`);
  163. coloredScore = chalk.white(score);
  164. }
  165. else if (deviation <= -1) {
  166. coloredKey = chalk.redBright(`${key}:`);
  167. coloredScore = chalk.redBright(score);
  168. }
  169. else if (deviation < -0.5) {
  170. coloredKey = chalk.red(`${key}:`);
  171. coloredScore = chalk.red(score);
  172. }
  173. else if (deviation < 0) {
  174. coloredKey = chalk.yellow(`${key}:`);
  175. coloredScore = chalk.yellow(score);
  176. }
  177. else if (deviation === 0) {
  178. coloredKey = chalk.white(`${key}:`);
  179. coloredScore = chalk.white(score);
  180. }
  181. else if (deviation <= 0.5) {
  182. coloredKey = chalk.green(`${key}:`);
  183. coloredScore = chalk.green(score);
  184. }
  185. else {
  186. coloredKey = chalk.greenBright(`${key}:`);
  187. coloredScore = chalk.greenBright(score);
  188. }
  189. if (collapseFeedbackColumn) {
  190. delete row[0][key];
  191. if (row[0].Feedback === undefined) {
  192. row[0].Feedback = `${coloredKey} ${coloredScore}`;
  193. }
  194. else {
  195. row[0].Feedback = `${row[0].Feedback}\n${coloredKey} ${coloredScore}`;
  196. }
  197. }
  198. else {
  199. row[0][key] = coloredScore;
  200. }
  201. }
  202. }
  203. }
  204. }
  205. const defaultColumns = [
  206. { name: "Test", alignment: "left", maxLen: 36 },
  207. { name: "Inputs", alignment: "left", minLen: MAX_TEST_PARAMS_LENGTH },
  208. {
  209. name: "Reference Outputs",
  210. alignment: "left",
  211. minLen: MAX_TEST_PARAMS_LENGTH,
  212. },
  213. { name: "Outputs", alignment: "left", minLen: MAX_TEST_PARAMS_LENGTH },
  214. { name: "Status", alignment: "left" },
  215. ];
  216. if (collapseFeedbackColumn) {
  217. const feedbackColumnLength = rows.reduce((max, [row]) => {
  218. const maxFeedbackLineLength = row.Feedback?.split("\n").reduce((max, feedbackLine) => {
  219. return Math.max(max, feedbackLine.replace(STRIP_ANSI_REGEX, "").length);
  220. }, 0) ?? 0;
  221. return Math.max(max, maxFeedbackLineLength);
  222. }, 0);
  223. defaultColumns.push({
  224. name: "Feedback",
  225. alignment: "left",
  226. minLen: feedbackColumnLength + 8,
  227. });
  228. }
  229. console.log();
  230. const table = new Table({
  231. columns: defaultColumns,
  232. colorMap: {
  233. grey: "\x1b[90m",
  234. },
  235. });
  236. for (const row of rows) {
  237. table.addRow(row[0], row[1]);
  238. }
  239. const testStatusColor = testStatus.includes("pass")
  240. ? chalk.green
  241. : testStatus.includes("fail")
  242. ? chalk.red
  243. : chalk.yellow;
  244. if (testSuiteName) {
  245. console.log(testStatusColor(`› ${testSuiteName}`));
  246. }
  247. if (failureMessage) {
  248. console.log(failureMessage);
  249. }
  250. table.printTable();
  251. if (experimentUrl) {
  252. console.log();
  253. console.log(` [LANGSMITH]: View full results in LangSmith at ${experimentUrl}`);
  254. console.log();
  255. }
  256. }