123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180 |
- const path = require("path");
- const fs = require("fs");
- const crypto = require("crypto");
- // 网络下载
- const download = require("download");
- // 加载器
- const { DocxLoader } = require("langchain/document_loaders/fs/docx");
- const { CSVLoader } = require("langchain/document_loaders/fs/csv");
- const { PDFLoader } = require("langchain/document_loaders/fs/pdf");
- const { PPTXLoader } = require("langchain/document_loaders/fs/pptx");
- const { WebpageLoader } = require("./loaders/web.loader"); // 飞码AI自主开发 可过滤无用标签
- // const { CheerioWebBaseLoader } = require("langchain/document_loaders/web/cheerio"); // SSR网页 更加轻量
- // const { PuppeteerWebBaseLoader } = require("langchain/document_loaders/web/puppeteer"); // SPA网页 更加通用 不够精准会被爬虫拦截
- // 分割器
- const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
/**
 * Loads documents from local files or HTTP(S) links, splits them into chunks,
 * and caches the loaded content in the Parse "Document" class when a global
 * Parse SDK is present.
 */
class IndexesService {
  /**
   * Parse subclass used to persist loaded documents.
   * Resolved through globalThis so that constructing the service does not
   * throw when Parse is not installed; `load` treats Parse as optional.
   */
  Document = globalThis.Parse?.Object?.extend("Document");

  /** Maps a file type (extension or explicit `extend` value) to its loader class. */
  loaderMap = {
    "docx": DocxLoader,
    "pdf": PDFLoader,
    "csv": CSVLoader,
    "pptx": PPTXLoader,
    "webpage": WebpageLoader,
  };

  /**
   * Returns the lower-cased extension (without the dot) of a link.
   * For HTTP links only the URL pathname is inspected, so query strings and
   * fragments never leak into the extension.
   * @param {string} filelink - Local path or URL.
   * @param {boolean} isHttp - Whether `filelink` is an HTTP(S) URL.
   * @returns {string} Extension such as "pdf", or "" when there is none.
   */
  static #extensionOf(filelink, isHttp) {
    let pathname = filelink;
    if (isHttp) {
      try {
        pathname = new URL(filelink).pathname;
      } catch {
        pathname = filelink.split(/[?#]/)[0];
      }
    }
    return path.extname(pathname).slice(1).toLowerCase();
  }

  /**
   * Looks up a loader class by type using own keys only, trying the exact key
   * first and then its lower-cased form.
   * @param {Object} loaderMap - Type-to-loader mapping.
   * @param {string} fileType - Requested type.
   * @returns {Function|undefined} Loader class, or undefined when unsupported.
   */
  static #lookupLoader(loaderMap, fileType) {
    if (!fileType) {
      return undefined;
    }
    const hasKey = (key) => Object.prototype.hasOwnProperty.call(loaderMap, key);
    if (hasKey(fileType)) {
      return loaderMap[fileType];
    }
    const lowered = String(fileType).toLowerCase();
    return hasKey(lowered) ? loaderMap[lowered] : undefined;
  }

  /**
   * Builds a collision-free temp file name that keeps the URL's base name
   * (and therefore its extension) as a suffix.
   * @param {string} url - Source URL.
   * @returns {string} File name safe to create inside the temp directory.
   */
  static #tempFileName(url) {
    let baseName;
    try {
      baseName = path.posix.basename(new URL(url).pathname);
    } catch {
      baseName = String(url).split(/[?#]/)[0].split("/").pop();
    }
    // Replace characters that are path separators or invalid in file names on common platforms.
    const safeName = (baseName || "").replace(/[\\/:*?"<>|]/g, "_") || "download";
    // The random prefix keeps concurrent downloads of equally named files apart.
    return `${crypto.randomBytes(8).toString("hex")}-${safeName}`;
  }

  /**
   * Splits documents into chunks with RecursiveCharacterTextSplitter.
   * @param {Array} docs - Documents to split.
   * @param {{chunkSize?: number, chunkOverlap?: number}} [options] - Splitter options.
   *   `chunkSize` falls back to 500 when missing or 0; `chunkOverlap` falls
   *   back to 100 only when missing, so an explicit 0 is honored.
   * @returns {Promise<Array>} Result of `splitter.splitDocuments(docs)`.
   */
  async split(docs, options) {
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: options?.chunkSize || 500, // maximum chunk size, in characters
      chunkOverlap: options?.chunkOverlap ?? 100, // overlap between adjacent chunks
    });
    return splitter.splitDocuments(docs);
  }

  /**
   * Loads a document from a local path (resolved against __dirname) or from an
   * HTTP(S) link.
   *
   * When Parse is available and `md5File` is given, a stored "Document" with
   * that md5 is returned instead of loading again. After a fresh load the
   * first loaded document is stored in Parse.
   * NOTE: only `docs[0]` is persisted, so a cache hit returns a single document.
   *
   * @param {string} filelink - Local path relative to __dirname, or an HTTP(S) URL.
   * @param {string} [extend] - Explicit file type; derived from `filelink` when omitted.
   * @param {string} [md5File] - MD5 used as the cache key.
   * @returns {Promise<Array|undefined>} Loaded documents, or undefined when the
   *   file is missing, the download fails, or the type is unsupported.
   */
  async load(filelink, extend, md5File) {
    const Parse = globalThis.Parse;

    // Serve from the Parse cache when an entry with this md5 exists.
    if (md5File && Parse) {
      const query = new Parse.Query("Document");
      query.equalTo("md5", md5File);
      const cached = await query.first();
      if (cached?.id) {
        return [{
          metadata: cached.get("metadata"),
          pageContent: cached.get("pageContent"),
        }];
      }
    }

    if (typeof filelink !== "string" || filelink === "") {
      console.log("文件不存在:", filelink);
      return;
    }

    // Match the scheme at the start only: a local name that merely contains "http" is not a URL.
    const isHttp = /^https?:\/\//i.test(filelink);
    const fileType = extend || IndexesService.#extensionOf(filelink, isHttp);
    let LoaderClass = IndexesService.#lookupLoader(this.loaderMap, fileType);

    let loaderPath = "";
    let hasDownloaded = false;
    let md5;

    if (isHttp && (!LoaderClass || LoaderClass === WebpageLoader)) {
      // Plain web page: the loader receives the URL itself.
      LoaderClass = WebpageLoader;
      loaderPath = filelink;
    } else if (isHttp) {
      // Remote file of a known type: download it to a temp file first.
      let downloaded;
      try {
        downloaded = await this.download(filelink);
      } catch (err) {
        console.log("文件下载失败", filelink, err);
        return;
      }
      if (!downloaded?.tempFilePath) {
        console.log("文件下载失败", filelink);
        return;
      }
      loaderPath = downloaded.tempFilePath;
      md5 = downloaded.md5;
      hasDownloaded = true;
    } else {
      if (!LoaderClass) {
        console.log("仅支持:", Object.keys(this.loaderMap));
        return;
      }
      loaderPath = path.join(__dirname, filelink);
      if (!fs.existsSync(loaderPath)) {
        console.log("文件不存在:", loaderPath);
        return;
      }
    }

    let docs;
    try {
      const loader = new LoaderClass(loaderPath);
      docs = await loader.load();
    } finally {
      // Remove the temp file even when the loader throws.
      if (hasDownloaded) {
        fs.rmSync(loaderPath, { force: true });
      }
    }

    // Best-effort cache write: a storage failure must not fail the load.
    if (Parse) {
      try {
        const DocumentClass = this.Document ?? Parse.Object.extend("Document");
        const firstDoc = docs?.[0];
        const record = new DocumentClass();
        record.set("type", "entire");
        record.set("md5", md5File || md5);
        record.set("pageContent", firstDoc?.pageContent);
        record.set("metadata", firstDoc?.metadata);
        await record.save();
      } catch (err) {
        console.error("文档缓存保存失败", err);
      }
    }
    return docs;
  }

  /**
   * Downloads a URL into `<__dirname>/temp` and computes the MD5 of its content.
   * @param {string} url - URL to download.
   * @returns {Promise<{md5: (string|undefined), tempFilePath: string}|null>}
   *   Temp file location and MD5, or null when the download or write fails.
   */
  async download(url) {
    const tempDir = path.join(__dirname, "temp");
    try {
      const fileBuffer = await download(url);
      await fs.promises.mkdir(tempDir, { recursive: true });
      const tempFilePath = path.join(tempDir, IndexesService.#tempFileName(url));
      await fs.promises.writeFile(tempFilePath, fileBuffer);

      let md5;
      try {
        md5 = this.calcBufferMd5(fileBuffer);
      } catch (err) {
        console.error(err);
      }
      console.log("Url File Md5:", md5);
      return { tempFilePath, md5 };
    } catch (err) {
      console.error(err);
      return null;
    }
  }

  /**
   * Computes the MD5 hash of a buffer.
   * @param {Buffer} buffer - Data to hash.
   * @returns {string} Hex-encoded MD5 digest.
   */
  calcBufferMd5(buffer) {
    return crypto.createHash("md5").update(buffer).digest("hex");
  }

  /**
   * Reads a file into a Buffer.
   * @param {string} filePath - Path of the file to read.
   * @returns {Promise<Buffer>} Resolves with the file content; rejects when the read fails.
   */
  readFileBuffer(filePath) {
    return fs.promises.readFile(filePath);
  }
}
- module.exports.IndexesService = IndexesService
|