123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180 |
- const path = require("path");
- const fs = require("fs");
- const crypto = require("crypto");
- const download = require("download");
- const { DocxLoader } = require("langchain/document_loaders/fs/docx");
- const { CSVLoader } = require("langchain/document_loaders/fs/csv");
- const { PDFLoader } = require("langchain/document_loaders/fs/pdf");
- const { PPTXLoader } = require("langchain/document_loaders/fs/pptx");
- const { WebpageLoader } = require("./loaders/web.loader");
- const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
/**
 * Loads documents from local files or HTTP(S) links through the matching
 * loader class, optionally caches the first loaded document in Parse keyed by
 * an MD5 digest, and splits documents into chunks.
 */
class IndexesService {
  /**
   * Parse subclass used to persist loaded documents. It is `undefined` when
   * Parse is not available at construction time, so the service can still be
   * instantiated without a Parse runtime (`load()` treats Parse as optional).
   */
  Document = typeof Parse !== "undefined" ? Parse.Object.extend("Document") : undefined;

  /** Maps a lower-case file extension / type key to the loader class handling it. */
  loaderMap = {
    docx: DocxLoader,
    pdf: PDFLoader,
    csv: CSVLoader,
    pptx: PPTXLoader,
    webpage: WebpageLoader,
  };

  /**
   * Splits documents into overlapping chunks.
   *
   * @param {Array<object>} docs - Documents as returned by a loader's `load()`.
   * @param {{chunkSize?: number, chunkOverlap?: number}} [options] - Falsy
   *   values fall back to 500 (chunkSize) and 100 (chunkOverlap).
   * @returns {Promise<Array<object>>} The chunks produced by the splitter.
   */
  async split(docs, options) {
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: options?.chunkSize || 500,
      chunkOverlap: options?.chunkOverlap || 100,
    });
    return splitter.splitDocuments(docs);
  }

  /**
   * Loads the documents behind `filelink`.
   *
   * Resolution order:
   *  1. If `md5File` is given and Parse is available, a stored "Document"
   *     with that md5 is returned as a single-element array.
   *  2. HTTP(S) links with a known extension are downloaded to a temp file and
   *     parsed with the matching loader; other HTTP(S) links go to
   *     `WebpageLoader` with the URL itself.
   *  3. Anything else is treated as a path relative to this module's directory.
   *
   * @param {string} filelink - URL or path relative to `__dirname`.
   * @param {string} [extend] - Explicit loader key (e.g. "pdf"); when omitted
   *   it is derived from the file extension of `filelink`.
   * @param {string} [md5File] - Known MD5 of the file, used as the cache key.
   * @returns {Promise<Array<object>|undefined>} Loaded documents, or
   *   `undefined` when the download fails, the local file is missing, or the
   *   local file type has no loader.
   */
  async load(filelink, extend, md5File) {
    const Parse = global.Parse;

    if (md5File && Parse) {
      const query = new Parse.Query("Document");
      query.equalTo("md5", md5File);
      const cached = await query.first();
      if (cached?.id) {
        return [
          {
            metadata: cached.get("metadata"),
            pageContent: cached.get("pageContent"),
          },
        ];
      }
    }

    // Only a real scheme prefix counts as remote; a local file whose name
    // merely contains "http" must not be sent down the download path.
    const isHttp = typeof filelink === "string" && /^https?:\/\//i.test(filelink);

    const loaderKey = String(extend || this._guessExtension(filelink, isHttp)).toLowerCase();
    // Own-property lookup so keys such as "constructor" never resolve to
    // something inherited from Object.prototype.
    let LoaderClass = Object.prototype.hasOwnProperty.call(this.loaderMap, loaderKey)
      ? this.loaderMap[loaderKey]
      : undefined;

    let loaderPath = "";
    let hasDownloaded = false;
    let md5;

    if (isHttp) {
      if (!LoaderClass) {
        // Unknown extension on a URL: treat it as a web page.
        LoaderClass = WebpageLoader;
        loaderPath = filelink;
      } else {
        let downloaded;
        try {
          downloaded = await this.download(filelink);
        } catch (err1) {
          console.log("文件下载失败", filelink, err1);
          return;
        }
        // download() resolves null on failure (it logs the cause itself).
        if (!downloaded?.tempFilePath) {
          console.log("文件下载失败", filelink);
          return;
        }
        md5 = downloaded.md5;
        loaderPath = downloaded.tempFilePath;
        hasDownloaded = true;
      }
    } else {
      if (!LoaderClass) {
        console.log("仅支持:", Object.keys(this.loaderMap));
        return;
      }
      if (typeof filelink !== "string" || filelink === "") {
        console.log("文件不存在:", filelink);
        return;
      }
      loaderPath = path.join(__dirname, filelink);
      if (!fs.existsSync(loaderPath)) {
        console.log("文件不存在:", loaderPath);
        return;
      }
    }

    let docs;
    try {
      docs = await new LoaderClass(loaderPath).load();
    } finally {
      // Remove the temp download even when the loader throws.
      if (hasDownloaded) {
        fs.rmSync(loaderPath, { force: true });
      }
    }

    const firstDoc = docs?.[0];
    // Skip caching empty results: a record without content would later be
    // returned as a cache hit.
    // NOTE(review): only the first document is stored, so a cache hit for a
    // multi-document file returns a single document. Confirm this is intended.
    if (Parse && firstDoc) {
      try {
        const DocumentClass = this.Document ?? Parse.Object.extend("Document");
        const record = new DocumentClass();
        record.set("type", "entire");
        record.set("md5", md5File || md5);
        record.set("pageContent", firstDoc.pageContent);
        record.set("metadata", firstDoc.metadata);
        // Fire-and-forget: caching must not delay or fail the load, but a
        // rejected save must not surface as an unhandled rejection either.
        Promise.resolve(record.save()).catch((errsave) => {
          console.error("Failed to cache document:", errsave);
        });
      } catch (errsave) {
        console.error("Failed to cache document:", errsave);
      }
    }

    return docs;
  }

  /**
   * Downloads `url` into the `temp` directory next to this module.
   *
   * @param {string} url - Remote file URL.
   * @returns {Promise<{tempFilePath: string, md5: (string|undefined)}|null>}
   *   The written file's path and the MD5 of its content (`undefined` if
   *   hashing failed), or `null` when the download or write fails.
   */
  async download(url) {
    try {
      const fileBuffer = await download(url);

      let md5;
      try {
        md5 = this.calcBufferMd5(fileBuffer);
      } catch (errmd5) {
        console.error(errmd5);
      }
      console.log("Url File Md5:", md5);

      const tempDir = path.join(__dirname, "temp");
      fs.mkdirSync(tempDir, { recursive: true });
      const tempFilePath = path.join(tempDir, this._tempFileName(url));
      fs.writeFileSync(tempFilePath, fileBuffer);

      return { tempFilePath, md5 };
    } catch (err) {
      console.error(err);
      return null;
    }
  }

  /**
   * Computes the hex-encoded MD5 digest of a buffer.
   *
   * @param {Buffer|string} buffer - Data to hash.
   * @returns {string} 32-character lower-case hex digest.
   */
  calcBufferMd5(buffer) {
    return crypto.createHash("md5").update(buffer).digest("hex");
  }

  /**
   * Reads a whole file into memory.
   *
   * @param {string} filePath - Path of the file to read.
   * @returns {Promise<Buffer>} File content; rejects with the fs error.
   */
  readFileBuffer(filePath) {
    return fs.promises.readFile(filePath);
  }

  /**
   * Helper: derives the lower-case extension (without dot) of a link. For
   * HTTP(S) links only the URL path is inspected, so query strings and
   * fragments do not leak into the extension.
   */
  _guessExtension(filelink, isHttp) {
    if (typeof filelink !== "string") {
      return "";
    }
    let target = filelink;
    if (isHttp) {
      try {
        target = new URL(filelink).pathname;
      } catch {
        // Not parseable as a URL: fall back to the raw string.
      }
    }
    return path.extname(target).slice(1).toLowerCase();
  }

  /**
   * Helper: builds a temp file name that keeps the URL's base name (and thus
   * its extension) but is unique per download, so concurrent downloads of
   * same-named files cannot overwrite or delete each other's temp file.
   */
  _tempFileName(url) {
    let baseName;
    try {
      baseName = path.basename(new URL(url).pathname);
    } catch {
      baseName = path.basename(String(url));
    }
    const unique = crypto.randomBytes(6).toString("hex");
    return baseName ? `${unique}-${baseName}` : unique;
  }
}
- module.exports.IndexesService = IndexesService
|