#!/usr/bin/env python
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .agents import BASE_PYTHON_TOOLS
from .python_interpreter import InterpreterError, evaluate
### Fake tools for test
def classifier(text, labels):
    return f"This is the classification of {text} along {labels}."


def translator(text, src_lang, tgt_lang):
    return f"This is the translation of {text} from {src_lang} to {tgt_lang}."


def speaker(text):
    return f"This is actually a sound reading {text}."


def transcriber(audio):
    if "sound" not in audio:
        raise ValueError(f"`audio` ({audio}) is not a sound.")
    return f"This is the transcribed text from {audio}."


def image_generator(prompt):
    return f"This is actually an image representing {prompt}."


def image_captioner(image):
    if "image" not in image:
        raise ValueError(f"`image` ({image}) is not an image.")
    return f"This is a description of {image}."


def image_transformer(image, prompt):
    if "image" not in image:
        raise ValueError(f"`image` ({image}) is not an image.")
    return f"This is a transformation of {image} according to {prompt}."


def question_answerer(text, question):
    return f"This is the answer to {question} from {text}."


def image_qa(image, question):
    if "image" not in image:
        raise ValueError(f"`image` ({image}) is not an image.")
    return f"This is the answer to {question} from {image}."


def text_downloader(url):
    return f"This is the content of {url}."


def summarizer(text):
    return f"This is a summary of {text}."


def video_generator(prompt, seconds=2):
    return f"A video of {prompt}"


def document_qa(image, question):
    return f"This is the answer to {question} from the document {image}."


def image_segmenter(image, prompt):
    return f"This is the mask of {prompt} in {image}"


TEST_TOOLS = {
    "text_classifier": classifier,
    "translator": translator,
    "text_reader": speaker,
    "summarizer": summarizer,
    "transcriber": transcriber,
    "image_generator": image_generator,
    "image_captioner": image_captioner,
    "image_transformer": image_transformer,
    "text_qa": question_answerer,
    "text_downloader": text_downloader,
    "image_qa": image_qa,
    "video_generator": video_generator,
    "document_qa": document_qa,
    "image_segmenter": image_segmenter,
}
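
# Because every fake tool returns a deterministic string, the chains of calls used as
# reference answers below evaluate to predictable outputs. For instance (illustrative,
# remembering that "text_reader" maps to `speaker` in TEST_TOOLS):
#     text_reader(image_captioner("<<image>>"))
#     # -> "This is actually a sound reading This is a description of <<image>>.."
# This is the string the evaluation compares agent answers against.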


class Problem:
    """
    A class regrouping all the information to solve a problem on which we will evaluate agents.

    Args:
        task (`str` or `list[str]`):
            One or several descriptions of the task to perform. If a list, it should contain variations on the
            phrasing, but for the same task.
        inputs (`list[str]` or `dict[str, str]`):
            The inputs that will be fed to the tools. For this testing environment, only strings are accepted as
            values. Pass along a dictionary when you want to specify the value of each input, or just the list of
            expected inputs (in which case the value used will be `<<input_name>>`).
        answer (`str` or `list[str]`):
            The theoretical answer (or list of possible valid answers) to the problem, as code.
    """

    def __init__(self, task, inputs, answer):
        self.task = task
        self.inputs = inputs
        self.answer = answer
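
# For illustration only (this hypothetical entry is not part of EVALUATION_TASKS), a
# problem is declared as:
#     Problem(
#         task=["Caption the `image` and read the caption out loud."],
#         inputs=["image"],
#         answer="text_reader(image_captioner(image))",
#     )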


### The list of problems the agent will be evaluated on.
EVALUATION_TASKS = [
    Problem(
        task=[
            "Is the following `text` (in Spanish) positive or negative?",
            "Is the text in the variable `text` (in Spanish) positive or negative?",
            "Translate the following `text` from Spanish to English then tell me if it's positive or negative.",
        ],
        inputs=["text"],
        answer="""text_classifier(translator(text, src_lang="Spanish", tgt_lang="English"), labels=["positive", "negative"])""",
    ),
    Problem(
        task=[
            "Tell me out loud what the `image` contains.",
            "Describe the following `image` out loud.",
            "Find what is in the picture stored in `image` then read it out loud.",
        ],
        inputs=["image"],
        answer=[
            "text_reader(image_captioner(image))",
            "text_reader(image_qa(image, question='What is in the image?'))",
        ],
    ),
    Problem(
        task=[
            "Generate an image from the text given in `text_input`. Then transform it according to the text in `prompt`.",
            "Use the following `text_input` to generate an image, then transform it by using the text in `prompt`.",
        ],
        inputs=["text_input", "prompt"],
        answer="image_transformer(image_generator(text_input), prompt)",
    ),
    Problem(
        task=[
            "Download the content of `url`, summarize it then generate an image from its content.",
            "Use a summary of the web page at `url` to generate an image.",
            "Summarize the content of the web page at `url`, and use the result to generate an image.",
        ],
        inputs=["url"],
        answer="image_generator(summarizer(text_downloader(url)))",
    ),
    Problem(
        task=[
            "Transform the following `image` using the prompt in `text`. The prompt is in Spanish.",
            "Use the text prompt in `text` (in Spanish) to transform the following `image`.",
            "Translate the `text` from Spanish to English then use it to transform the picture in `image`.",
        ],
        inputs=["text", "image"],
        answer="image_transformer(image, translator(text, src_lang='Spanish', tgt_lang='English'))",
    ),
    Problem(
        task=[
            "Download the content of `url`, summarize it then read it out loud to me.",
            "Read me a summary of the web page at `url`.",
        ],
        inputs=["url"],
        answer="text_reader(summarizer(text_downloader(url)))",
    ),
    Problem(
        task=[
            "Generate an image from the text given in `text_input`.",
        ],
        inputs=["text_input"],
        answer="image_generator(text_input)",
    ),
    Problem(
        task=[
            "Replace the beaver in the `image` by the `prompt`.",
            "Transform the `image` so that it contains the `prompt`.",
            "Use `prompt` to transform this `image`.",
        ],
        inputs=["image", "prompt"],
        answer="image_transformer(image, prompt)",
    ),
    Problem(
        task=[
            "Provide me the summary of the `text`, then read it to me before transcribing it and translating it in French.",
            "Summarize `text`, read it out loud then transcribe the audio and translate it in French.",
            "Read me a summary of the `text` out loud. Transcribe this and translate it in French.",
        ],
        inputs=["text"],
        answer="translator(transcriber(text_reader(summarizer(text))), src_lang='English', tgt_lang='French')",
    ),
    Problem(
        task=["Generate a video of the `prompt`", "Animate a `prompt`", "Make me a short video using `prompt`."],
        inputs={"prompt": "A lobster swimming"},
        answer="video_generator('A lobster swimming')",
    ),
    Problem(
        task=[
            "Download the following file `url`, summarize it in a few words and generate a video from it.",
            "Fetch the file at this `url`, summarize it, and create an animation out of it.",
        ],
        inputs=["url"],
        answer="video_generator(summarizer(text_downloader(url)))",
    ),
]


def get_theoretical_tools(agent_answer, theoretical_answer, code_answer):
    if not isinstance(theoretical_answer, list):
        return {name for name in TEST_TOOLS if name in code_answer}

    if isinstance(agent_answer, dict):
        for one_answer, one_code in zip(theoretical_answer, code_answer):
            if one_answer in agent_answer.values():
                return {name for name in TEST_TOOLS if name in one_code}

    for one_answer, one_code in zip(theoretical_answer, code_answer):
        if agent_answer == one_answer:
            return {name for name in TEST_TOOLS if name in one_code}

    return {name for name in TEST_TOOLS if name in code_answer[0]}
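
# For instance (illustrative call, not used elsewhere in the module): with a single
# reference answer, the theoretical tools are simply the ones mentioned in it:
#     get_theoretical_tools("<agent output>", "summarizer(text)", "summarizer(text)")
#     # -> {"summarizer"}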


def evaluate_code(code, inputs=None, state=None, verbose=False, return_interpretor_error=False):
    tools = BASE_PYTHON_TOOLS.copy()
    for name, tool in TEST_TOOLS.items():
        if name not in code:
            continue
        tools[name] = tool

    if isinstance(inputs, dict):
        inputs = inputs.copy()
    elif inputs is not None:
        inputs = {inp: f"<<{inp}>>" for inp in inputs}

    if state is not None:
        state.update(inputs)
    else:
        state = inputs

    try:
        return evaluate(code, tools, state)
    except InterpreterError as e:
        return str(e)
    except Exception as e:
        if verbose:
            print(e)
        return None
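
# For instance (illustrative), evaluating a reference answer with only the input names
# substitutes the "<<name>>" placeholders and returns the resulting string:
#     evaluate_code("summarizer(text_downloader(url))", inputs=["url"])
#     # -> "This is a summary of This is the content of <<url>>.."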


def score_code(agent_answer, theoretical_answer, verbose: bool = False):
    if verbose:
        print(agent_answer, theoretical_answer)
    theoretical_answer = theoretical_answer if isinstance(theoretical_answer, list) else [theoretical_answer]

    if agent_answer in theoretical_answer:
        if verbose:
            print("Perfect!")
        return 1
    elif isinstance(agent_answer, dict) and any(v in theoretical_answer for v in agent_answer.values()):
        if verbose:
            print("Almost perfect, result in state!")
        return 0.75
    else:
        if verbose:
            print("Result is not the right one but code executed.")
        return 0.3
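
# Illustration of the three scoring tiers (hypothetical values):
#     score_code("This is a summary of <<text>>.", "This is a summary of <<text>>.")              # 1
#     score_code({"result": "This is a summary of <<text>>."}, "This is a summary of <<text>>.")  # 0.75
#     score_code("something else entirely", "This is a summary of <<text>>.")                     # 0.3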


def evaluate_one_result(code, agent_answer, theoretical_answer, answer, verbose=False):
    tools_in_code = {name for name in TEST_TOOLS if f"`{name}`" in code}
    theoretical_tools = get_theoretical_tools(agent_answer, theoretical_answer, answer)
    if tools_in_code == theoretical_tools:
        tool_selection_score = 1.0
        tool_selection_errors = None
    else:
        missing_tools = len(theoretical_tools - tools_in_code)
        unexpected_tools = len(tools_in_code - theoretical_tools)
        tool_selection_score = max(0, 1.0 - 0.25 * missing_tools - 0.25 * unexpected_tools)

        tool_selection_errors = {
            "selected_tools": tools_in_code,
            "theoretical_tools": theoretical_tools,
        }

    tools_in_code = {name for name in TEST_TOOLS if name in code}
    if tools_in_code == theoretical_tools:
        tool_used_score = 1.0
        tool_used_errors = None
    else:
        missing_tools = len(theoretical_tools - tools_in_code)
        unexpected_tools = len(tools_in_code - theoretical_tools)
        tool_used_score = max(0, 1.0 - 0.25 * missing_tools - 0.25 * unexpected_tools)

        tool_used_errors = {
            "selected_tools": tools_in_code,
            "theoretical_tools": theoretical_tools,
        }

    score = score_code(agent_answer, theoretical_answer, verbose=verbose)
    if score < 1.0:
        code_errors = {
            "code_produced": code,
            "evaluation": agent_answer,
            "theoretical_answer": theoretical_answer,
        }
    else:
        code_errors = None

    return (tool_selection_score, tool_used_score, score), (tool_selection_errors, tool_used_errors, code_errors)
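
# The two returned triples line up as
#     (tool_selection_score, tool_used_score, code_score), (tool_selection_errors, tool_used_errors, code_errors)
# so a caller that does not need error reporting can simply unpack the scores, e.g. (illustrative):
#     scores, _ = evaluate_one_result(code, agent_answer, theoretical_answer, problem.answer)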


def evaluate_agent(agent, batch_size=8, verbose=False, return_errors=False):
    """
    Evaluates an agent on all `EVALUATION_TASKS`.

    Example:

    ```py
    agent = OpenAiAgent(model="text-davinci-003", api_key=your_api_key)
    scores = evaluate_agent(agent)
    print(scores)
    ```
    """
    # Sanity check
    agent_tools = set(agent.toolbox.keys())
    if agent_tools != set(TEST_TOOLS):
        missing_tools = set(TEST_TOOLS) - agent_tools
        unexpected_tools = agent_tools - set(TEST_TOOLS)
        raise ValueError(
            f"Fix the test tools in the evaluate_agent module. Tools missing: {missing_tools}. Extra tools: {unexpected_tools}."
        )
    eval_tasks = []
    eval_idx = []
    for idx, pb in enumerate(EVALUATION_TASKS):
        if isinstance(pb.task, list):
            eval_tasks.extend(pb.task)
            eval_idx.extend([idx] * len(pb.task))
        else:
            eval_tasks.append(pb.task)
            eval_idx.append(idx)

    tool_selection_score = 0
    tool_used_score = 0
    code_score = 0

    if return_errors:
        tool_selection_errors = {}
        tool_used_errors = {}
        code_errors = {}

    for start_idx in range(0, len(eval_tasks), batch_size):
        end_idx = min(start_idx + batch_size, len(eval_tasks))
        batch_tasks = eval_tasks[start_idx:end_idx]

        results = [agent.run(task, return_generated_code=True) for task in batch_tasks]

        for idx, result in enumerate(results):
            problem = EVALUATION_TASKS[eval_idx[start_idx + idx]]
            if verbose:
                print(f"====Task {start_idx + idx}====\n{batch_tasks[idx]}\n")
            code = agent.extract_action(result, split_token="Answer:")

            # Evaluate agent answer and code answer
            agent_answer = evaluate_code(code, problem.inputs, verbose=verbose)
            if isinstance(problem.answer, list):
                theoretical_answer = [evaluate_code(answer, problem.inputs) for answer in problem.answer]
            else:
                theoretical_answer = evaluate_code(problem.answer, problem.inputs)

            scores, errors = evaluate_one_result(
                code, agent_answer, theoretical_answer, problem.answer, verbose=verbose
            )

            tool_selection_score += scores[0]
            tool_used_score += scores[1]
            code_score += scores[2]

            if return_errors:
                if errors[0] is not None:
                    tool_selection_errors[batch_tasks[idx]] = errors[0]
                if errors[1] is not None:
                    tool_used_errors[batch_tasks[idx]] = errors[1]
                if errors[2] is not None:
                    code_errors[batch_tasks[idx]] = errors[2]

    scores = {
        "tool selection score": 100 * (tool_selection_score / len(eval_tasks)),
        "tool used score": 100 * (tool_used_score / len(eval_tasks)),
        "code score": 100 * (code_score / len(eval_tasks)),
    }

    if return_errors:
        return scores, tool_selection_errors, tool_used_errors, code_errors
    else:
        return scores
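

if __name__ == "__main__":
    # Illustrative smoke test (a sketch, not part of the evaluation API): run each
    # reference answer through the fake tools and print the deterministic strings they
    # produce. No real agent or API key is needed. Because this file uses relative
    # imports, invoke it as a module (`python -m ...`) rather than as a script.
    for task_idx, problem in enumerate(EVALUATION_TASKS):
        answers = problem.answer if isinstance(problem.answer, list) else [problem.answer]
        theoretical = [evaluate_code(answer, problem.inputs) for answer in answers]
        print(f"Task {task_idx}: {theoretical}")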