const path = require("path");
const fs = require("fs");
const crypto = require("crypto");

// Network download
const download = require("download");

// Loaders
const { DocxLoader } = require("langchain/document_loaders/fs/docx");
const { CSVLoader } = require("langchain/document_loaders/fs/csv");
const { PDFLoader } = require("langchain/document_loaders/fs/pdf");
const { PPTXLoader } = require("langchain/document_loaders/fs/pptx");
const { WebpageLoader } = require("./loaders/web.loader"); // Developed in-house by Feima AI; can filter out useless tags
// const { CheerioWebBaseLoader } = require("langchain/document_loaders/web/cheerio"); // SSR pages; more lightweight
// const { PuppeteerWebBaseLoader } = require("langchain/document_loaders/web/puppeteer"); // SPA pages; more general, but imprecise and may be blocked as a crawler

// Splitters
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");

/** Matches links that start with an http:// or https:// scheme. */
const HTTP_LINK_PATTERN = /^https?:\/\//i;

/**
 * Derive a lower-cased file extension (without the dot) from a link.
 * For http(s) links only the URL pathname is inspected, so query strings and
 * fragments ("a.pdf?token=1") do not pollute the result.
 * @param {string} filelink - URL or relative file path
 * @param {boolean} isHttp - whether filelink is an http(s) URL
 * @returns {string} extension such as "pdf", or "" when there is none
 */
function guessExtension(filelink, isHttp) {
  let target = filelink;
  if (isHttp) {
    try {
      target = new URL(filelink).pathname;
    } catch (err) {
      // Not parseable as a URL; fall back to the raw string.
    }
  }
  return path.extname(target).slice(1).toLowerCase();
}

/**
 * Build a collision-free, filesystem-safe temp file name for a URL.
 * A random prefix keeps concurrent downloads of the same file apart; the
 * sanitized last path segment is kept so the extension stays recognizable.
 * @param {string} url
 * @returns {string}
 */
function buildTempFileName(url) {
  let lastSegment;
  try {
    lastSegment = path.posix.basename(new URL(url).pathname);
  } catch (err) {
    lastSegment = String(url).split("/").pop() || "";
  }
  const safeName = lastSegment
    .replace(/[^\w.-]/g, "_")
    .replace(/^\.+/, "")
    .slice(-100);
  const uniquePrefix = crypto.randomBytes(8).toString("hex");
  return safeName ? `${uniquePrefix}-${safeName}` : uniquePrefix;
}

/**
 * Loads documents (local files, remote files, web pages) through the matching
 * loader, optionally caching the result in a Parse "Document" class keyed by
 * md5, and splits loaded documents into chunks.
 */
class IndexesService {
  // Parse class used to cache loader output. Resolved from global.Parse (the
  // same source load() uses) so the service can be constructed without Parse.
  Document = global.Parse ? global.Parse.Object.extend("Document") : undefined;

  // Maps a type key / file extension to the loader class handling it.
  loaderMap = {
    "docx": DocxLoader,
    "pdf": PDFLoader,
    "csv": CSVLoader,
    "pptx": PPTXLoader,
    "webpage": WebpageLoader,
    // "web": PuppeteerWebBaseLoader
  };

  /**
   * Split documents into overlapping chunks.
   * @param {Array} docs - documents as returned by load()
   * @param {{chunkSize?: number, chunkOverlap?: number}} [options]
   * @returns {Promise<Array>} the chunked documents
   */
  async split(docs, options) {
    // RecursiveCharacterTextSplitter splits recursively on a list of
    // separators ("\n\n", then "\n", then " ") to keep related text together.
    const splitter = new RecursiveCharacterTextSplitter({
      // Maximum chunk size in characters; 500 is close to one paragraph.
      chunkSize: options?.chunkSize || 500,
      // Overlap between consecutive chunks; `??` so an explicit 0 is honoured.
      chunkOverlap: options?.chunkOverlap ?? 100,
    });
    return splitter.splitDocuments(docs);
  }

  /**
   * Load a document from a URL or from a path relative to this module.
   *
   * When `md5File` is given and Parse is available, a cached "Document" row
   * with that md5 is returned instead of loading again.
   *
   * NOTE(review): only the first loaded document is cached, so a cache hit on
   * a multi-document result returns a single entry — confirm this is intended.
   *
   * @param {string} filelink - http(s) URL, or file path relative to __dirname
   * @param {string} [extend] - loaderMap key; guessed from filelink if omitted
   * @param {string} [md5File] - md5 of the file, used as the cache key
   * @returns {Promise<Array|undefined>} loaded documents, or undefined when the
   *   link is missing, the download fails, the file does not exist, or the
   *   type is unsupported
   */
  async load(filelink, extend, md5File) {
    const Parse = global.Parse;

    // Return the cached result when the md5 is already known.
    if (md5File && Parse) {
      const query = new Parse.Query("Document");
      query.equalTo("md5", md5File);
      const cached = await query.first();
      if (cached?.id) {
        return [{
          metadata: cached.get("metadata"),
          pageContent: cached.get("pageContent"),
        }];
      }
    }

    if (typeof filelink !== "string" || filelink === "") {
      console.log("文件不存在:", filelink);
      return;
    }

    // No cache hit: load from the source.
    const isHttp = HTTP_LINK_PATTERN.test(filelink);
    const typeKey = extend || guessExtension(filelink, isHttp);
    // Own-property lookup so keys like "constructor" never resolve to a loader.
    let LoaderClass = Object.prototype.hasOwnProperty.call(this.loaderMap, typeKey)
      ? this.loaderMap[typeKey]
      : undefined;

    let loaderPath = "";
    let hasDownloaded = false;
    let md5;

    if (isHttp) {
      if (!LoaderClass || LoaderClass === WebpageLoader) {
        // Plain web page: the web loader takes the URL itself.
        LoaderClass = WebpageLoader;
        loaderPath = filelink;
      } else {
        // URL pointing at a known file type (.docx, .pptx, ...): download first.
        try {
          const res = await this.download(filelink);
          if (!res?.tempFilePath) {
            throw new Error("download returned no file");
          }
          loaderPath = res.tempFilePath;
          md5 = res.md5;
        } catch (err1) {
          console.log("文件下载失败", filelink, err1);
          return;
        }
        hasDownloaded = true;
      }
    } else {
      loaderPath = path.join(__dirname, filelink);
      if (!fs.existsSync(loaderPath)) {
        console.log("文件不存在:", loaderPath);
        return;
      }
    }

    // Must be checked before construction: `new undefined()` would throw.
    if (!LoaderClass) {
      console.log("仅支持:", Object.keys(this.loaderMap));
      return;
    }

    let docs;
    try {
      const loader = new LoaderClass(loaderPath);
      docs = await loader.load();
    } finally {
      // Remove the temp file even when the loader throws.
      if (hasDownloaded) {
        try {
          fs.rmSync(loaderPath, { force: true });
        } catch (errRm) {
          console.error(errRm);
        }
      }
    }

    // Store the result keyed by md5. Caching is best-effort: a failure here
    // is logged and must not fail the load.
    if (Parse) {
      try {
        const DocumentClass = this.Document || Parse.Object.extend("Document");
        const document = new DocumentClass();
        const firstDoc = docs?.[0];
        document.set("type", "entire");
        document.set("md5", md5File || md5);
        document.set("pageContent", firstDoc?.pageContent);
        document.set("metadata", firstDoc?.metadata);
        // Awaited so a rejected save is caught here instead of surfacing as
        // an unhandled promise rejection.
        await document.save();
      } catch (errsave) {
        console.error("Failed to cache loaded document:", errsave);
      }
    }
    return docs;
  }

  /**
   * Download a URL into the local "temp" directory and compute its md5.
   * @param {string} url
   * @returns {Promise<{md5:string,tempFilePath:string}|null>} null when the
   *   download or the write fails (the error is logged)
   */
  async download(url) {
    const tempDir = path.join(__dirname, "temp");
    try {
      fs.mkdirSync(tempDir, { recursive: true });
      const fileBuffer = await download(url);
      const tempFilePath = path.join(tempDir, buildTempFileName(url));
      fs.writeFileSync(tempFilePath, fileBuffer);

      let md5;
      try {
        md5 = await this.calcBufferMd5(fileBuffer);
      } catch (errmd5) {
        console.error(errmd5);
      }
      console.log("Url File Md5:", md5);
      return { tempFilePath, md5 };
    } catch (err) {
      console.error(err);
      return null;
    }
  }

  /**
   * Compute the MD5 hash of a Buffer.
   * @param {Buffer} buffer - data to hash
   * @returns {string} hex-encoded MD5 digest
   */
  calcBufferMd5(buffer) {
    return crypto.createHash("md5").update(buffer).digest("hex");
  }

  /**
   * Read a file's content into a Buffer asynchronously.
   * @param {string} filePath - path of the file to read
   * @returns {Promise<Buffer>} resolves with the file content; rejects on read error
   */
  readFileBuffer(filePath) {
    return fs.promises.readFile(filePath);
  }
}

module.exports.IndexesService = IndexesService;