research.py

  1. """
  2. 研究流程管理模块
  3. 整合关键词提取、文献检索和聚类分析
  4. """
  5. import asyncio
  6. from typing import Dict, List, Any, Optional
  7. import logging
  8. import time
  9. import json
  10. from pathlib import Path
  11. import uuid
  12. import contextlib
  13. from backend.utils.api_client import LLMClient, ArxivClient
  14. from backend.utils.keywords import extract_keywords, expand_search_queries
  15. from backend.config import (
  16. MAX_SEARCH_RESULTS,
  17. CACHE_DIR,
  18. ENABLE_CACHE
  19. )
  20. logger = logging.getLogger(__name__)
class ResearchAgent:
    """Optimized research agent with support for task decomposition and async processing."""

    def __init__(self):
        self.llm_client = LLMClient()
        self.arxiv_client = ArxivClient()

    async def process_research_intent(
        self,
        research_intent: str,
        max_results: Optional[int] = None  # reserved; per-direction search currently caps at 4 results
    ) -> Dict[str, Any]:
        """
        Process a research intent by decomposing it into parallel subtasks.
        """
        start_time = time.time()
        # Initialize the result dictionary
        result = {
            "research_intent": research_intent,
            "timestamp": time.time(),
            "language": "auto",
            "english_keywords": [],
            "original_keywords": [],
            "english_directions": [],
            "original_directions": [],
            "papers_by_direction": [],
            "direction_reports": [],
            "status": "processing",
            "progress": 0,  # progress tracking
            "task_id": str(uuid.uuid4())  # unique task ID
        }
        try:
            # 1. Extract keywords (unchanged)
            keywords_data = await self.llm_client.extract_keywords(research_intent, "auto")
            result["language"] = keywords_data["language"]
            result["english_keywords"] = keywords_data["english_keywords"]
            result["original_keywords"] = keywords_data["original_keywords"]
            result["progress"] = 20

            # 2. Generate research directions (unchanged)
            directions_data = await self.llm_client.generate_research_directions(
                result["english_keywords"],
                result["language"]
            )
            result["english_directions"] = directions_data["english_directions"]
            result["original_directions"] = directions_data["original_directions"]
            result["progress"] = 30

            # 3. Process every research direction in parallel
            async def process_directions():
                tasks = []
                for i, direction in enumerate(result["english_directions"]):
                    original_dir = result["original_directions"][i] if i < len(result["original_directions"]) else direction
                    task = self._process_single_direction(direction, original_dir, result["language"])
                    tasks.append(task)
                # Run all direction tasks concurrently
                direction_results = await asyncio.gather(*tasks, return_exceptions=True)
                papers_by_direction = []
                reports = []
                for i, res in enumerate(direction_results):
                    if not isinstance(res, Exception):
                        papers_by_direction.append(res["papers_data"])
                        if res["report"]:
                            reports.append(res["report"])
                    else:
                        # Handle a failed direction
                        logger.error(f"Error processing direction {i}: {str(res)}")
                        papers_by_direction.append({
                            "direction": result["english_directions"][i],
                            "original_direction": result["original_directions"][i] if i < len(result["original_directions"]) else result["english_directions"][i],
                            "papers": []
                        })
                return papers_by_direction, reports

            # Execute the parallel processing
            papers_by_direction, reports = await process_directions()
            result["papers_by_direction"] = papers_by_direction
            result["direction_reports"] = reports
            result["status"] = "completed"
            result["progress"] = 100
            result["processing_time"] = time.time() - start_time

            # Cache the result
            if ENABLE_CACHE:
                self._cache_result(result)
            return result
        except Exception as e:
            logger.error(f"Error in research process: {str(e)}", exc_info=True)
            result["status"] = "error"
            result["error"] = str(e)
            return result

    async def _process_single_direction(
        self,
        direction: str,
        original_direction: str,
        language: str
    ) -> Dict[str, Any]:
        """Independent task that processes a single research direction."""
        try:
            # 1. Generate a search query for this direction
            query = await self._generate_search_query(direction)
            logger.info(f"Searching papers with query: {query}")
            # 2. Search for papers
            papers = await self._search_papers_with_fallback(
                query=query,
                max_results=4
            )
            # 3. Build the paper data
            papers_data = {
                "direction": direction,
                "original_direction": original_direction,
                "papers": papers
            }
            # 4. Generate a report only when papers were found
            report = None
            if papers:
                try:
                    report = await self._generate_direction_report(
                        direction,
                        papers,
                        language
                    )
                    if report:
                        report = {
                            "direction": direction,
                            "original_direction": original_direction,
                            "report": report
                        }
                except Exception as e:
                    logger.error(f"Report generation error for {direction}: {str(e)}")
            return {
                "papers_data": papers_data,
                "report": report
            }
        except Exception as e:
            logger.error(f"Error processing direction '{direction}': {str(e)}")
            raise

    async def _generate_search_query(self, research_direction: str) -> str:
        """Generate an effective literature search query for a research direction."""
        prompt = f"""
Convert the following research direction into a concise, effective literature search query.
Research direction: {research_direction}
Follow these rules when generating the query:
1. Use 2-3 core terms connected with Boolean operators (AND, OR)
2. Keep the structure simple; prefer quoting key phrases
3. Avoid over-specific queries that return no results
4. Make sure the query can find 3-5 relevant papers
Return only the query string, with no other explanation.
"""
        query = await self.llm_client.generate_text(prompt, temperature=0.3)
        # Simplify overly complex queries so that results can be found
        simplified_query = self._simplify_query_if_needed(query.strip())
        return simplified_query

    def _simplify_query_if_needed(self, query: str) -> str:
        """Simplify the query if it is too complex."""
        # Count AND and OR operators
        and_count = query.upper().count(" AND ")
        or_count = query.upper().count(" OR ")
        # If the query is too complex, simplify it
        if and_count > 2:
            # Keep only the first two AND terms
            parts = query.split(" AND ", 2)
            return " AND ".join(parts[:2])
        return query
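
    # Example of the simplification above: a query such as
    #   '"graph neural networks" AND "drug discovery" AND "molecules" AND "toxicity"'
    # contains three " AND " operators, so it is trimmed to its first two terms:
    #   '"graph neural networks" AND "drug discovery"'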

    async def _generate_direction_report(
        self,
        direction: str,
        papers: List[Dict[str, Any]],
        target_language: str = "en"
    ) -> Dict[str, Any]:
        """Optimized research report generation."""
        # Return an empty report when there are no papers
        if not papers:
            return {
                "english_content": "No papers found for this research direction.",
                "translated_content": "未找到该研究方向的相关论文。"
            }
        # Condense the paper information
        papers_text = "\n\n".join([
            f"Title: {paper['title']}\nSummary: {paper['summary'][:300]}..."  # cap summary length
            for paper in papers[:3]
        ])
        # Define the report structure
        sections = [
            "Overview (100 words)",
            "Key Findings (150 words)",
            "Future Directions (100 words)"
        ]
        english_content = f"# Research Report: {direction}\n\n"
        # Generate each section separately
        for section in sections:
            section_name = section.split(" (")[0]
            section_prompt = f"""
Write the {section} section for a research report on:
{direction}
Based on these papers:
{papers_text}
Be concise, focused and specific.
"""
            try:
                section_content = await self.llm_client.generate_text(
                    section_prompt,
                    temperature=0.4,
                    max_tokens=300
                )
                english_content += f"## {section_name}\n\n{section_content}\n\n"
            except Exception as e:
                english_content += f"## {section_name}\n\n[Error generating this section: {str(e)}]\n\n"
        # Translate the content if needed
        translated_content = english_content
        if target_language != "en":
            try:
                translated_content = await self.llm_client.translate_text(
                    english_content, "en", target_language
                )
            except Exception as e:
                logger.error(f"Translation error: {str(e)}")
        return {
            "english_content": english_content,
            "translated_content": translated_content
        }
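
    # The assembled `english_content` above has this Markdown shape:
    #   # Research Report: <direction>
    #   ## Overview
    #   ...
    #   ## Key Findings
    #   ...
    #   ## Future Directions
    #   ...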

    def _cache_result(self, result: Dict[str, Any]) -> None:
        """Cache a research result to disk."""
        try:
            # Make sure the cache directory exists before writing
            Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
            cache_file = Path(CACHE_DIR) / f"research_{int(time.time())}.json"
            with open(cache_file, "w", encoding="utf-8") as f:
                json.dump(result, f, ensure_ascii=False, indent=2)
            logger.info(f"Cached result to {cache_file}")
        except Exception as e:
            logger.error(f"Failed to cache result: {str(e)}")

    async def _search_papers_with_fallback(self, query: str, max_results: int = 4) -> List[Dict[str, Any]]:
        """Literature search strategy that aims for 3-5 papers per research direction."""
        # Validate the query before proceeding
        if not query or "Error generating text" in query:
            logger.error(f"Invalid search query: {query}")
            return []  # Return an empty list instead of searching with an error string
        # First attempt: use the full query
        logger.info(f"Searching papers with query: {query}")
        papers = await self.arxiv_client.search_papers(query=query, max_results=max_results)
        # If results are insufficient, try a simplified query
        if len(papers) < 3:
            simplified_query = self._get_simplified_query(query)
            if simplified_query != query:
                logger.info(f"Too few results, retrying with simplified query: {simplified_query}")
                more_papers = await self.arxiv_client.search_papers(
                    query=simplified_query,
                    max_results=max_results
                )
                # Add papers that are not already present
                existing_ids = {p["id"] for p in papers}
                for paper in more_papers:
                    if paper["id"] not in existing_ids:
                        papers.append(paper)
                        if len(papers) >= 5:  # cap at 5 papers
                            break
        # If still fewer than 3 papers, add example papers
        if len(papers) < 3:
            logger.info("Could not retrieve enough papers; adding example papers")
            clean_query = query.replace('"', '')
            example_papers = [
                {
                    "id": f"example_{i}",
                    "title": f"相关研究: {clean_query}",
                    "authors": ["研究者 A", "研究者 B"],
                    "summary": f"这是一篇关于{clean_query}的研究论文。由于搜索结果有限,系统生成了此示例条目。",
                    "published": "2023-01-01T00:00:00",
                    "updated": "2023-01-01T00:00:00",
                    "link": "#",
                    "source": "example"
                }
                for i in range(1, 4 - len(papers))
            ]
            papers.extend(example_papers)
        # Return 3-5 papers per direction
        return papers[:5]

    def _get_simplified_query(self, query: str) -> str:
        """Produce a simplified version of the search query."""
        # Extract the key terms inside quotes
        quoted_terms = re.findall(r'"([^"]*)"', query)
        if quoted_terms:
            # Use the first two quoted terms
            terms = quoted_terms[:min(2, len(quoted_terms))]
            return ' AND '.join([f'"{term}"' for term in terms])
        # Without quoted terms, fall back to splitting AND conditions
        if " AND " in query:
            parts = query.split(" AND ")
            return parts[0]  # use only the first condition
        return query
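
    # Example of the simplification above:
    #   '"energy transition" AND "policy" AND China' -> '"energy transition" AND "policy"'
    #   'solar AND storage'                          -> 'solar'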

    async def _extract_key_terms(self, direction: str) -> List[str]:
        """Extract key search terms from a research direction."""
        prompt = f"""
Extract 3 most important search terms from this research direction.
Return only comma-separated terms, no explanations.
Direction: {direction}
"""
        try:
            response = await self.llm_client.generate_text(prompt, temperature=0.1)
            terms = [term.strip() for term in response.split(',') if term.strip()]
            return terms if terms else [direction.split()[0]]  # fall back to the first word
        except Exception as e:
            logger.error(f"Error extracting terms: {str(e)}")
            # Fall back to simple tokenization
            words = direction.split()
            return [words[0], words[-1] if len(words) > 1 else words[0]]

    async def _search_papers_with_improved_strategy(self, direction: str, max_results: int = 4) -> List[Dict[str, Any]]:
        """Smarter paper search strategy."""
        # Extract key terms
        terms = await self._extract_key_terms(direction)
        # Build a multi-level query strategy
        queries = []
        if len(terms) >= 2:
            queries.append(f'"{terms[0]}" AND "{terms[1]}"')
        if len(terms) >= 3:
            queries.append(f'"{terms[0]}" AND "{terms[2]}"')
        queries.append(f'"{terms[0]}" OR "{terms[1] if len(terms) > 1 else terms[0]}"')
        queries.append(terms[0])
        # Try the queries one by one
        for query in queries:
            logger.info(f"Trying search query: {query}")
            papers = await self.arxiv_client.search_papers(query, max_results)
            if len(papers) >= 2:  # return once enough papers are found
                return papers[:max_results]
        # All queries failed; return example papers
        return self._get_example_papers(direction)
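
    # NOTE: `_get_example_papers` is called above but not defined in this file.
    # A minimal sketch of what it might look like, mirroring the placeholder
    # entries built inline in `_search_papers_with_fallback`; this is an assumed
    # helper, not the project's actual implementation:
    def _get_example_papers(self, direction: str) -> List[Dict[str, Any]]:
        """Hypothetical fallback: build placeholder paper entries for a direction."""
        return [
            {
                "id": f"example_{i}",
                "title": f"相关研究: {direction}",
                "authors": ["研究者 A", "研究者 B"],
                "summary": f"这是一篇关于{direction}的研究论文。由于搜索结果有限,系统生成了此示例条目。",
                "published": "2023-01-01T00:00:00",
                "updated": "2023-01-01T00:00:00",
                "link": "#",
                "source": "example"
            }
            for i in range(1, 4)
        ]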

    async def extract_keywords_only(self, research_intent):
        """Extract keywords only, with no further processing."""
        logger.info(f"Extracting keywords for: {research_intent}")
        try:
            # Use LLMClient's extract_keywords method directly
            result = await self.llm_client.extract_keywords(research_topic=research_intent, original_language="auto")
            return result
        except Exception as e:
            logger.error(f"Error extracting keywords: {str(e)}")
            # Return a default result instead of None
            return {
                "english_keywords": [],
                "original_keywords": [],
                "language": "en"
            }

    async def generate_directions_only(self, keywords: List[str], language: str) -> Dict[str, Any]:
        """Generate research directions only."""
        logger.info(f"Generating directions for keywords: {keywords}")
        directions_data = await self.llm_client.generate_research_directions(keywords, language)
        logger.info(f"Generated research directions: {directions_data['english_directions']}")
        return directions_data

    async def search_papers_for_direction(self, direction: str, max_results: int = 4) -> List[Dict[str, Any]]:
        """Search papers for a single direction only."""
        query = await self._generate_search_query(direction)
        logger.info(f"Searching papers with query: {query}")
        papers = await self._search_papers_with_fallback(query=query, max_results=max_results)
        return papers

    async def generate_report_for_direction(
        self,
        direction: str,
        papers: List[Dict[str, Any]],
        target_language: str = "en"
    ) -> Dict[str, Any]:
        """Generate a report for a single direction only."""
        return await self._generate_direction_report(direction, papers, target_language)

    async def generate_enhanced_topics(self, keywords, language):
        """
        Generate enhanced research topics from keywords.
        Return format: [
            {
                "english_title": "Energy Transition Policies in China",
                "title": "中国的能源转型政策",
                "description": "研究中国在能源转型领域的政策制定与实施",
                "keywords": ["policy", "energy transition", "China"]
            },
            ...
        ]
        """
        prompt = f"""
Based on the following keywords about a research area: {', '.join(keywords)}
Generate 3-5 specific research topics that would be valuable to explore in this area.
For each topic:
1. Provide a specific, focused research title (not too broad)
2. Add a brief description of what this research direction would investigate
3. List 3-5 specific keywords that best represent this research direction
Format your response as a JSON array of objects with the following structure:
[
  {{
    "english_title": "The English title",
    "description": "Brief description of this research direction",
    "keywords": ["keyword1", "keyword2", "keyword3"]
  }},
  ...
]
"""
        # Log the request to track the response
        logger.info(f"Generating enhanced topics for keywords: {keywords}")
        response = await self.llm_client.chat_completion(
            messages=[
                {"role": "system", "content": "You are a research assistant helping to identify valuable research topics and directions."},
                {"role": "user", "content": prompt}
            ],
            response_format={"type": "json_object"}
        )
        # Log the raw response for debugging
        logger.info(f"Raw LLM response: {response}")
        try:
            # Extract the JSON payload
            content = response["choices"][0]["message"]["content"]
            logger.info(f"Parsing JSON content: {content}")
            # Handle the different JSON shapes that might be returned
            topics_data = json.loads(content)
            # If the response is a bare array, use it directly; otherwise look for a topics field
            if isinstance(topics_data, list):
                topics = topics_data
            elif "topics" in topics_data:
                topics = topics_data.get("topics", [])
            else:
                # Try to find any suitable array in the response
                for key, value in topics_data.items():
                    if isinstance(value, list) and len(value) > 0 and "english_title" in value[0]:
                        topics = value
                        break
                else:
                    # If no suitable array is found, create basic topics
                    logger.warning("Could not find topics array in response, creating basic topics")
                    topics = []
                    for keyword in keywords[:3]:
                        topics.append({
                            "english_title": f"Research on {keyword}",
                            "description": f"Investigating various aspects of {keyword}",
                            "keywords": [keyword]
                        })
            # Log the extracted topics
            logger.info(f"Extracted {len(topics)} topics: {topics}")
            # Translate titles if needed
            if language != "en":
                translated_topics = []
                for topic in topics:
                    # Translate the title
                    translation_prompt = f"Translate the following research title to {language}: \"{topic['english_title']}\""
                    translation_response = await self.llm_client.chat_completion(
                        messages=[
                            {"role": "system", "content": "You are a professional translator."},
                            {"role": "user", "content": translation_prompt}
                        ]
                    )
                    translated_title = translation_response["choices"][0]["message"]["content"].strip().strip('"')
                    # Attach the translated title
                    topic["title"] = translated_title
                    translated_topics.append(topic)
                return translated_topics
            else:
                # In an English context, copy the title as-is
                for topic in topics:
                    topic["title"] = topic["english_title"]
                return topics
        except Exception as e:
            logger.error(f"Error parsing enhanced topics: {str(e)}", exc_info=True)
            # Fall back to a basic topic format
            basic_topics = []
            for i, keyword in enumerate(keywords[:3]):
                basic_topics.append({
                    "english_title": f"Research on {keyword}",
                    "title": f"关于{keyword}的研究" if language != "en" else f"Research on {keyword}",
                    "description": f"Investigating various aspects of {keyword}",
                    "keywords": [keyword]
                })
            return basic_topics

    async def generate_search_keywords_for_topic(self, topic):
        """
        Generate precise search keywords for a specific research topic.
        """
        prompt = f"""
Based on the following research topic:
Title: {topic['english_title']}
Description: {topic['description']}
Initial keywords: {', '.join(topic['keywords'])}
Generate 5-7 precise search keywords or phrases that would be most effective for finding
relevant scientific papers on this specific topic. These should be more specific and targeted
than the initial keywords.
Format your response as a JSON array of strings, e.g. ["keyword1", "keyword2", ...]
"""
        try:
            # Error handling around the API call
            response = await self.llm_client.chat_completion(
                messages=[
                    {"role": "system", "content": "You are a research assistant helping to generate effective search keywords for academic literature."},
                    {"role": "user", "content": prompt}
                ],
                response_format={"type": "json_object"}
            )
            content = response["choices"][0]["message"]["content"]
            keywords_data = json.loads(content)
            # Handle the different response shapes
            if isinstance(keywords_data, list):
                return keywords_data
            elif "keywords" in keywords_data:
                return keywords_data["keywords"]
            else:
                # Use the first list found in the data
                for key, value in keywords_data.items():
                    if isinstance(value, list):
                        return value
            # Fallback
            return topic['keywords']
        except Exception as e:
            # Enhanced error handling: log the details and return the original keywords
            logger.error(f"Error generating search keywords: {str(e)}", exc_info=True)
            logger.info("Falling back to original topic keywords due to API error")
            return topic['keywords']

    async def search_papers_with_keywords(self, keywords):
        """
        Search for relevant papers using a list of keywords.
        """
        all_papers = []
        # Search on each keyword
        for keyword in keywords:
            try:
                results = await self.arxiv_client.search_papers(
                    query=keyword,
                    max_results=3  # a few highly relevant results per keyword
                )
                all_papers.extend(results)
            except Exception as e:
                logger.error(f"Error searching papers for keyword '{keyword}': {str(e)}")
        # Deduplicate
        unique_papers = []
        paper_ids = set()
        for paper in all_papers:
            if paper["id"] not in paper_ids:
                unique_papers.append(paper)
                paper_ids.add(paper["id"])
        return unique_papers

    async def cluster_papers_by_keywords(self, papers, search_keywords, topic):
        """
        Cluster papers by keywords and topic.
        """
        if not papers or len(papers) < 3:
            # Too few papers to cluster
            return {
                "papers": papers,
                "clusters": [{
                    "id": 0,
                    "name": topic["english_title"],
                    "keywords": search_keywords[:3]
                }]
            }
        try:
            # Prepare the text data
            texts = [f"{p['title']} {p['summary']}" for p in papers]
            # Use an existing clusterer, or a simple fallback
            num_clusters = min(3, len(papers) // 2)  # choose the cluster count dynamically
            # Call the existing clustering method when available
            cluster_results = self.paper_clusterer.cluster_papers(
                papers=papers,
                num_clusters=num_clusters
            ) if hasattr(self, 'paper_clusterer') else None
            if not cluster_results:
                # Simple alternative: "cluster" by keyword matching
                clusters = []
                for i, keyword in enumerate(search_keywords[:3]):
                    clusters.append({
                        "id": i,
                        "name": f"Research on {keyword}",
                        "keywords": [keyword]
                    })
                # Assign papers to "clusters"
                for paper in papers:
                    best_match = 0
                    best_score = 0
                    for i, cluster in enumerate(clusters):
                        # Simple similarity: occurrences of the keyword in title and summary
                        score = paper['title'].lower().count(cluster['keywords'][0].lower()) + \
                                paper['summary'].lower().count(cluster['keywords'][0].lower())
                        if score > best_score:
                            best_score = score
                            best_match = i
                    paper['cluster'] = best_match
                return {
                    "papers": papers,
                    "clusters": clusters
                }
            else:
                # Use the existing clustering results
                return cluster_results
        except Exception as e:
            logger.error(f"Error clustering papers: {str(e)}")
            # Return unclustered data on failure
            return {
                "papers": papers,
                "clusters": [{
                    "id": 0,
                    "name": topic["english_title"],
                    "keywords": search_keywords[:3]
                }]
            }

    async def generate_enhanced_report(self, topic, clustered_papers, search_keywords, language):
        """
        Generate an enhanced research report for a topic, split into stages
        to reduce the load on any single API call.
        """
        papers = clustered_papers["papers"]
        clusters = clustered_papers["clusters"]
        try:
            # Prepare cluster summaries, limiting the papers per cluster
            cluster_summaries = []
            for cluster in clusters:
                cluster_papers = [p for p in papers if p.get('cluster', 0) == cluster['id']]
                if cluster_papers:
                    # At most 3 papers per cluster
                    paper_titles = "\n".join([f"- {p['title']}" for p in cluster_papers[:3]])
                    cluster_summaries.append(f"Cluster: {cluster['name']}\nKeywords: {', '.join(cluster['keywords'])}\nPapers:\n{paper_titles}")
            all_cluster_summaries = "\n\n".join(cluster_summaries[:3])  # cap the number of clusters
            # Split report generation into multiple parts
            # 1. Generate the overview and main findings
            intro_prompt = f"""
Generate the introduction and main findings sections for a research report on:
Research Topic: {topic['english_title']}
Description: {topic['description']}
Main Keywords: {', '.join(topic['keywords'][:5])}
Based on the following paper clusters:
{all_cluster_summaries}
Include:
1. A brief introduction to the research topic (100 words)
2. Summary of main findings from the literature (200 words)
Format with appropriate headings.
"""
            # Use a smaller max_tokens to keep each API call light
            intro_response = await self.llm_client.chat_completion(
                messages=[
                    {"role": "system", "content": "You are a research assistant creating academic reports."},
                    {"role": "user", "content": intro_prompt}
                ],
                max_tokens=1000
            )
            intro_content = intro_response["choices"][0]["message"]["content"]
            # 2. Generate the research gaps and future directions
            conclusion_prompt = f"""
Generate the research gaps and future directions sections for a research report on:
Research Topic: {topic['english_title']}
Description: {topic['description']}
Main Keywords: {', '.join(topic['keywords'][:5])}
Based on the provided literature, identify:
1. Current research gaps (100 words)
2. Potential directions for future research (100 words)
Format with appropriate headings.
"""
            conclusion_response = await self.llm_client.chat_completion(
                messages=[
                    {"role": "system", "content": "You are a research assistant creating academic reports."},
                    {"role": "user", "content": conclusion_prompt}
                ],
                max_tokens=800
            )
            conclusion_content = conclusion_response["choices"][0]["message"]["content"]
            # Merge the two parts
            english_content = f"{intro_content}\n\n{conclusion_content}"
            # Translate if needed
            if language != "en":
                try:
                    # Split the translation into two parts as well
                    translation_intro = await self.llm_client.chat_completion(
                        messages=[
                            {"role": "system", "content": "You are a professional translator specialized in academic content."},
                            {"role": "user", "content": f"Translate the following research report section to {language}:\n\n{intro_content}"}
                        ],
                        max_tokens=1200
                    )
                    translation_conclusion = await self.llm_client.chat_completion(
                        messages=[
                            {"role": "system", "content": "You are a professional translator specialized in academic content."},
                            {"role": "user", "content": f"Translate the following research report section to {language}:\n\n{conclusion_content}"}
                        ],
                        max_tokens=1000
                    )
                    translated_intro = translation_intro["choices"][0]["message"]["content"]
                    translated_conclusion = translation_conclusion["choices"][0]["message"]["content"]
                    translated_content = f"{translated_intro}\n\n{translated_conclusion}"
                except Exception as e:
                    # Fall back to the English content when translation fails
                    logger.error(f"Translation failed: {str(e)}")
                    translated_content = f"(翻译失败,显示原文) {english_content}"
                return {
                    "english_content": english_content,
                    "translated_content": translated_content
                }
            else:
                return {
                    "english_content": english_content,
                    "translated_content": english_content  # identical in an English context
                }
        except Exception as e:
            # Provide a simple fallback report when generation fails
            logger.error(f"Report generation failed: {str(e)}")
            fallback_content = f"""
# 研究主题: {topic['english_title']}
## 概述
本报告本应提供关于"{topic['english_title']}"主题的详细研究分析,但由于API调用限制或网络问题,无法生成完整报告。
## 可用资源
我们已检索到该主题相关的{len(papers)}篇论文,您可以通过查看这些论文获取相关信息。
## 主要关键词
{', '.join(search_keywords[:5] if search_keywords else [])}
## 建议
您可以重新尝试生成报告,或直接查看检索到的论文以获取更多信息。
"""
            return {
                "english_content": fallback_content,
                "translated_content": fallback_content
            }
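

# A minimal usage sketch (an assumed entry point, not part of the project's API):
# drives the agent end-to-end for one research intent. It assumes LLMClient and
# ArxivClient are configured via backend.config / environment as usual.
if __name__ == "__main__":
    async def _demo():
        agent = ResearchAgent()
        result = await agent.process_research_intent("energy transition policies in China")
        print(result["status"], result.get("processing_time"))
        for entry in result["papers_by_direction"]:
            print(entry["direction"], len(entry["papers"]))

    asyncio.run(_demo())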