// indexes.service.js (6.5 KB)
  1. const path = require("path");
  2. const fs = require("fs");
  3. const crypto = require("crypto");
  4. // 网络下载
  5. const download = require("download");
  6. // 加载器
  7. const { DocxLoader } = require("langchain/document_loaders/fs/docx");
  8. const { CSVLoader } = require("langchain/document_loaders/fs/csv");
  9. const { PDFLoader } = require("langchain/document_loaders/fs/pdf");
  10. const { PPTXLoader } = require("langchain/document_loaders/fs/pptx");
  11. const { WebpageLoader } = require("./loaders/web.loader"); // 飞码AI自主开发 可过滤无用标签
  12. // const { CheerioWebBaseLoader } = require("langchain/document_loaders/web/cheerio"); // SSR网页 更加轻量
  13. // const { PuppeteerWebBaseLoader } = require("langchain/document_loaders/web/puppeteer"); // SPA网页 更加通用 不够精准会被爬虫拦截
  14. // 分割器
  15. const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
  16. class IndexesService {
  17. Document = Parse.Object.extend("Document")
  18. loaderMap = {
  19. "docx":DocxLoader,
  20. "pdf":PDFLoader,
  21. "csv":CSVLoader,
  22. "pptx":PPTXLoader,
  23. "webpage":WebpageLoader,
  24. // "web":PuppeteerWebBaseLoader
  25. }
  26. async split(docs,options){
  27. let splitter = new RecursiveCharacterTextSplitter({ // RecursiveCharacterTextSplitter 递归字符文本分割器(按语义相关分割:会通过不同的符号递归地分割文档-从“”开始,然后是“”,再然后是“ ”)
  28. chunkSize: options?.chunkSize || 500, // 控制最终文档的最大大小(以字符数为单位)。 500接近一个自然段
  29. chunkOverlap: options?.chunkOverlap || 100, // 指定文档之间应该有多少重叠。这通常有助于确保文本不会被奇怪地分割。
  30. });
  31. let docOutput = await splitter.splitDocuments(docs);
  32. return docOutput
  33. }
  34. async load(filelink,extend,md5File){
  35. // md5存在,则加载
  36. let Parse = global.Parse;
  37. if(md5File&&Parse){
  38. let query = new Parse.Query("Document");
  39. query.equalTo("md5",md5File);
  40. let document = await query.first();
  41. if(document?.id){
  42. return [{
  43. metadata:document.get("metadata"),
  44. pageContent:document.get("pageContent")
  45. }]
  46. }
  47. }
  48. // 无结果则重新获取
  49. extend = extend || filelink?.split(".")?.[filelink?.split(".")?.length-1]
  50. let LoaderClass = this.loaderMap[extend]
  51. let isHttp = (filelink?.indexOf("http")>-1) || false
  52. let loaderPath = ""
  53. let hasDownloaded = false
  54. let md5
  55. if(isHttp){
  56. // 网址且非已知文件类
  57. if(!LoaderClass){ // 纯网页地址
  58. LoaderClass = WebpageLoader
  59. loaderPath = filelink
  60. }else{ // 网址且已知文件类类型 .docs .pptx
  61. let tempFilePath
  62. try{
  63. let res = await this.download(filelink)
  64. tempFilePath = res.tempFilePath
  65. md5 = res.md5
  66. }catch(err1){
  67. console.log("文件下载失败",filelink,err1)
  68. return
  69. }
  70. hasDownloaded = true
  71. loaderPath = tempFilePath
  72. }
  73. }else{
  74. loaderPath = path.join(__dirname,filelink)
  75. if(!fs.existsSync(loaderPath)){
  76. console.log("文件不存在:",loaderPath)
  77. return
  78. }
  79. // let fileBuffer = this.readFileBuffer(loaderPath)
  80. // try{
  81. // md5 = this.calcBufferMd5(fileBuffer)
  82. // }catch(errmd5){}
  83. }
  84. let loader = new LoaderClass(
  85. loaderPath
  86. );
  87. if(!loader){
  88. console.log("仅支持:",Object.keys(loaderMap))
  89. return
  90. }
  91. let docs = await loader.load();
  92. if(hasDownloaded){
  93. if(fs.existsSync(loaderPath)){fs.rmSync(loaderPath)}
  94. }
  95. // 根据md5存储加载结果
  96. if(Parse){
  97. let document = new this.Document();
  98. let doc = docs?.[0]
  99. document.set("type","entire")
  100. document.set("md5",md5File||md5)
  101. document.set("pageContent",doc?.pageContent)
  102. document.set("metadata",doc?.metadata)
  103. try{
  104. document.save()
  105. }catch(errsave){}
  106. }
  107. return docs
  108. }
  109. /**
  110. *
  111. * @param {*} url
  112. * @returns {{md5:string,tempFilePath:string}}
  113. */
  114. async download(url){
  115. let fname = url.split("/")?.[url.split("/")?.length-1]
  116. let tempFilePath = path.join(__dirname,"temp",fname)
  117. if(!fs.existsSync(path.join(__dirname,"temp"))) fs.mkdirSync(path.join(__dirname,"temp"))
  118. return new Promise((resolve)=>{
  119. download(url).then(async fileBuffer=>{
  120. fs.writeFileSync(tempFilePath, fileBuffer);
  121. let md5
  122. try{
  123. md5 = await this.calcBufferMd5(fileBuffer)
  124. }catch(errmd5){
  125. console.error(errmd5)
  126. }
  127. console.log("Url File Md5:", md5)
  128. resolve({
  129. tempFilePath,
  130. md5
  131. })
  132. }).catch(err=>{
  133. console.error(err)
  134. resolve(null)
  135. })
  136. })
  137. }
  138. /**
  139. * 计算Buffer的MD5值
  140. * @param {Buffer} buffer - 需要计算MD5的Buffer
  141. * @returns {string} - MD5哈希值
  142. */
  143. calcBufferMd5(buffer) {
  144. const hash = crypto.createHash('md5');
  145. hash.update(buffer);
  146. return hash.digest('hex');
  147. }
  148. /**
  149. * 异步读取文件内容为Buffer
  150. * @param {string} filePath - 文件路径
  151. * @returns {Promise<Buffer>} - 返回一个Promise,解析为文件的Buffer
  152. */
  153. readFileBuffer(filePath) {
  154. return new Promise((resolve, reject) => {
  155. fs.readFile(filePath, (err, data) => {
  156. if (err) {
  157. reject(err);
  158. } else {
  159. resolve(data);
  160. }
  161. });
  162. });
  163. }
  164. }
  165. module.exports.IndexesService = IndexesService