// import pdf from 'pdf-parse';
// import fs from 'fs';
import { CloudApi, CloudObject, CloudQuery } from 'src/lib/ncloud';
import mammoth from "mammoth";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { Document } from '@langchain/core/documents';
import * as tf from '@tensorflow/tfjs-core';
// import "@tensorflow/tfjs-backend-cpu";
// import '@tensorflow/tfjs-backend-webgpu';
import '@tensorflow/tfjs-backend-webgl';
// import '@tensorflow/tfjs-backend-wasm';
import { TensorFlowEmbeddings } from "@langchain/community/embeddings/tensorflow";

/**
 * A source document ("story") that can be loaded from a URL, split into
 * chunks, embedded, and persisted through the cloud SDK.
 *
 * Typical call order: `loader()` -> `splitter()` -> `embedings()`.
 */
export class AgentStory {
  /** Persisted "Story" record; assigned by `save()` once the write resolves. */
  story: CloudObject | undefined;
  /** Document title. */
  title: string | undefined = "";
  /** Document tags. */
  tags: Array<string> | undefined;
  /** Source URL of the file. */
  url: string | undefined = "";
  /** Full plain-text content of the document. */
  content: string | undefined = "";
  /** SHA-256 hex digest used as the document's unique key. */
  hash: string | undefined = "";
  /**
   * Chunks produced by `splitter()`. `embedings()` additionally attaches a
   * `vector512` property to every entry, hence the loose element type.
   */
  docList: Array<any> = [];
  /** Settles once TensorFlow backend selection has finished (never rejects). */
  private backendReady: Promise<void>;

  /**
   * @param metadata - Source URL plus optional title and tags.
   */
  constructor(metadata: {
    url: string,
    title?: string,
    tags?: Array<string>
  }) {
    this.url = metadata.url;
    this.title = metadata.title;
    this.tags = metadata.tags;
    // A constructor cannot await, so keep the promise and let embedings()
    // wait on it. Failures are logged instead of surfacing as unhandled
    // rejections.
    this.backendReady = setBackend().catch((err: unknown) => {
      console.error(err);
    });
  }

  /**
   * Creates or updates the "Story" record whose `hash` equals `this.hash`
   * and stores the saved object in `this.story`.
   * Does nothing when no hash has been computed yet.
   */
  async save(): Promise<void> {
    if (!this.hash) {
      return;
    }
    const query = new CloudQuery("Story");
    query.equalTo("hash", this.hash);
    let story = await query.first();
    if (!story?.id) {
      story = new CloudObject("Story");
    }
    story.set({
      title: this.title,
      url: this.url,
      content: this.content,
      hash: this.hash,
      tags: this.tags
    });
    this.story = await story.save();
  }

  /**
   * Loads the plain-text content behind `url`.
   *
   * `.docx` files are first extracted locally; if that yields no text (or the
   * URL is not a `.docx`), the `agent/loader` cloud endpoint is used instead.
   * The record is saved before returning so that `this.story` is available to
   * `embedings()`; a failing save is logged and does not discard the content.
   *
   * NOTE(review): `this.hash` is only computed on the `.docx` path, so content
   * obtained purely through `agent/loader` is not persisted by `save()` —
   * confirm whether that is intended.
   *
   * @param url - Location of the document.
   * @returns The extracted text, or `null` when nothing could be extracted.
   */
  async loader(url: string) {
    const api = new CloudApi();
    let result;
    if (url?.endsWith(".docx")) {
      result = await this.loadDocx(url);
    }
    // loadDocx always returns an object, so test the payload itself;
    // otherwise the server-side fallback could never run for .docx files.
    if (!result?.data) {
      result = await api.fetch("agent/loader", { url: url });
    }
    this.content = result?.data || null;
    if (this.content) {
      this.url = url;
    }
    try {
      await this.save();
    } catch (err) {
      console.error(err);
    }
    return this.content;
  }

  /**
   * Downloads a `.docx` file and extracts its raw text with mammoth.
   *
   * On success `this.hash` is set to the SHA-256 of the downloaded bytes.
   * Download or extraction errors are logged and produce empty `data`, which
   * lets `loader()` fall back to the server-side loader.
   *
   * @param url - Location of the `.docx` file.
   * @returns `{ data }` with runs of newlines collapsed to a single newline.
   */
  async loadDocx(url: string) {
    let rawText = "";
    try {
      const response = await fetch(url);
      if (!response.ok) {
        // Do not hash or parse an HTTP error page as if it were the document.
        throw new Error(`Failed to fetch file: ${response.status} ${response.statusText}`);
      }
      const arrayBuffer = await response.arrayBuffer();
      this.hash = await arrayBufferToHASH(arrayBuffer);
      // In the browser mammoth accepts the ArrayBuffer directly.
      const extracted = await mammoth.extractRawText({ arrayBuffer: arrayBuffer });
      rawText = extracted?.value || "";
    } catch (err) {
      console.error(err);
    }
    // Collapse consecutive line breaks into one.
    const data = rawText.replace(/\n+/g, "\n");
    return { data };
  }

  /**
   * Splits `this.content` into overlapping chunks with a recursive character
   * text splitter and stores them in `this.docList`.
   *
   * @param options - `chunkSize` (default 500) and `chunkOverlap` (default
   *   150). An explicit `chunkOverlap` of `0` is honoured.
   * @returns The chunk list, or `undefined` when there is no content.
   */
  async splitter(options?: {
    chunkSize?: number,
    chunkOverlap?: number
  }) {
    if (!this.content) return;
    const textSplitter = new RecursiveCharacterTextSplitter({
      chunkSize: options?.chunkSize || 500,
      // `??` rather than `||`: zero overlap is a legitimate setting.
      chunkOverlap: options?.chunkOverlap ?? 150,
    });
    this.docList = await textSplitter.splitDocuments([
      new Document({ pageContent: this.content }),
    ]);
    return this.docList;
  }

  /**
   * Extracts an embedding vector for every chunk in `this.docList`, attaches
   * it to the chunk as `vector512`, and upserts one "Document" record per
   * chunk (keyed by the SHA-256 of the chunk text).
   *
   * Storage column sizes:
   *   TensorFlow embedding vector(512)  NOT NULL
   *   OpenAI     embedding vector(1536) NOT NULL
   *
   * The returned promise resolves only after every write has settled; failed
   * writes are logged and do not prevent the vectors from being returned.
   *
   * @see https://js.langchain.com/docs/integrations/text_embedding/tensorflow/
   * @returns One vector per chunk, or `undefined` when there are no chunks.
   */
  async embedings(): Promise<number[][] | undefined> {
    if (!this.docList?.length) {
      return;
    }
    await this.backendReady;
    const embeddings = new TensorFlowEmbeddings();
    const documentRes = await embeddings.embedDocuments(
      this.docList.map(item => item.pageContent)
    );

    // Persist the vectors. The original fired these writes from an async
    // forEach callback, so nothing was awaited and errors went unhandled.
    const outcomes = await Promise.allSettled(
      documentRes.map((vector512, index) => {
        this.docList[index].vector512 = vector512;
        return this.persistChunk(this.docList[index], vector512);
      })
    );
    for (const outcome of outcomes) {
      if (outcome.status === "rejected") {
        console.error(outcome.reason);
      }
    }
    return documentRes;
  }

  /** Upserts the "Document" record for one chunk and its embedding vector. */
  private async persistChunk(chunk: any, vector512: number[]): Promise<void> {
    const hash = await arrayBufferToHASH(stringToArrayBuffer(chunk?.pageContent));
    const query = new CloudQuery("Document");
    query.equalTo("hash", hash);
    let docObj = await query.first();
    if (!docObj?.id) {
      docObj = new CloudObject("Document");
    }
    docObj.set({
      metadata: chunk?.metadata,
      pageContent: chunk?.pageContent,
      vector512: vector512,
      hash: hash,
      story: this.story?.toPointer(),
    });
    await docObj.save();
  }

  /**
   * Destroys every "Document" record that belongs to the saved story and
   * waits for all deletions to finish. Does nothing when the story has not
   * been saved.
   *
   * NOTE(review): records are written with `story.toPointer()` but queried
   * here by `story.id`; confirm CloudQuery matches pointers by id.
   */
  async destoryAllDocument(): Promise<void> {
    if (this.story?.id) {
      const query = new CloudQuery("Document");
      query.equalTo("story", this.story?.id);
      const docList = await query.find();
      await Promise.all(docList.map(doc => doc.destroy()));
    }
  }
}

/**
 * Downloads a file and returns its bytes as a `Buffer`.
 *
 * @param url - Location of the file.
 * @throws Error when the HTTP response status is not OK.
 */
export async function fetchFileBuffer(url: string): Promise<Buffer> {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to fetch file: ${response.status} ${response.statusText}`);
  }
  const arrayBuffer = await response.arrayBuffer();
  return Buffer.from(arrayBuffer);
}

/**
 * Selects a TensorFlow.js backend, preferring WebGPU, then WebGL, then
 * WebAssembly, and waits for TensorFlow to become ready.
 *
 * Platform support alone is not enough: a backend must also have been
 * registered by importing its package, and `tf.setBackend` rejects for an
 * unregistered name. Each candidate is therefore tried in order and the first
 * one that initialises wins; if none does, TensorFlow keeps its own default.
 */
async function setBackend(): Promise<void> {
  const candidates: string[] = [];

  if (typeof navigator !== 'undefined' && (navigator as Navigator & { gpu?: unknown }).gpu) {
    candidates.push("webgpu");
  }

  if (typeof document !== 'undefined') {
    const glcanvas = document.createElement('canvas');
    const webGL = glcanvas.getContext('webgl') || glcanvas.getContext('experimental-webgl');
    if (webGL) {
      candidates.push("webgl");
    }
  }

  if (typeof WebAssembly === 'object' && typeof WebAssembly.instantiate === 'function') {
    candidates.push("wasm");
  }

  for (const name of candidates) {
    try {
      if (await tf.setBackend(name)) {
        break;
      }
    } catch {
      // Backend not registered or failed to initialise: try the next one.
    }
  }
  await tf.ready();
}

/**
 * Computes the SHA-256 digest of a buffer with the SubtleCrypto API.
 *
 * @param arrayBuffer - Bytes to hash (an `ArrayBuffer` or a view onto one).
 * @returns The digest as a lowercase hexadecimal string.
 */
export async function arrayBufferToHASH(arrayBuffer: any): Promise<string> {
  const hashBuffer = await crypto.subtle.digest('SHA-256', arrayBuffer);
  return Array.from(new Uint8Array(hashBuffer))
    .map(byte => byte.toString(16).padStart(2, '0'))
    .join('');
}

/**
 * Encodes a string as UTF-8 and returns the underlying buffer. The byte
 * length can exceed `str.length` for non-ASCII text.
 */
export function stringToArrayBuffer(str: string) {
  const encoder = new TextEncoder();
  return encoder.encode(str).buffer;
}

/**
 * Embeds a single query string with the TensorFlow embeddings model.
 *
 * @param str - Query text.
 * @returns The embedding vector.
 */
export async function EmbedQuery(str: any): Promise<Array<number>> {
  const embeddings = new TensorFlowEmbeddings();
  const documentRes = await embeddings.embedQuery(str);
  return documentRes;
}

/** Score given to documents whose vector cannot be compared; sorts them last. */
const UNRANKABLE_SIMILARITY = -1;

/**
 * Ranks documents by cosine similarity to a query vector.
 *
 * Writes the score to `doc.similarity` on every element and sorts `docList`
 * IN PLACE in descending order of similarity. Documents whose `vector512` is
 * missing, all-zero, or of a different dimension than the query receive
 * `-1` so that one bad record does not abort the whole ranking.
 *
 * @param vector1 - Query embedding.
 * @param docList - Documents carrying a `vector512` embedding.
 * @returns The same `docList` array, sorted.
 * @throws Error when the query is a zero vector and at least one document
 *   has a comparable vector.
 */
export function RetriveAllDocument(vector1: Array<number>, docList: Array<any>): Array<any> {
  docList.forEach(doc => {
    doc.similarity = rankableSimilarity(vector1, doc.vector512);
  });
  docList.sort((a, b) => b.similarity - a.similarity);
  return docList;
}

/** True for plain arrays and typed arrays (but not DataView). */
function isVectorLike(value: unknown): value is ArrayLike<number> {
  return Array.isArray(value) || (ArrayBuffer.isView(value) && !(value instanceof DataView));
}

/** Cosine similarity against the query, or the fallback score for unusable vectors. */
function rankableSimilarity(query: ArrayLike<number>, candidate: unknown): number {
  if (
    !isVectorLike(candidate) ||
    candidate.length !== query.length ||
    !(magnitude(candidate) > 0)
  ) {
    return UNRANKABLE_SIMILARITY;
  }
  // Still throws when the query itself is a zero vector.
  const score = cosineSimilarity(query, candidate);
  return Number.isFinite(score) ? score : UNRANKABLE_SIMILARITY;
}

/** Sum of element-wise products over the length of `vectorA`. */
function dotProduct(vectorA: ArrayLike<number>, vectorB: ArrayLike<number>): number {
  let sum = 0;
  for (let i = 0; i < vectorA.length; i++) {
    sum += (vectorA[i] ?? 0) * (vectorB[i] ?? 0);
  }
  return sum;
}

/** Euclidean length of a vector. */
function magnitude(vector: ArrayLike<number>): number {
  let sumOfSquares = 0;
  for (let i = 0; i < vector.length; i++) {
    const value = vector[i] ?? 0;
    sumOfSquares += value * value;
  }
  return Math.sqrt(sumOfSquares);
}

/**
 * Cosine similarity of two vectors of equal dimension.
 *
 * @throws Error when the dimensions differ or either vector has zero length.
 */
function cosineSimilarity(vectorA: ArrayLike<number>, vectorB: ArrayLike<number>): number {
  if (vectorA.length !== vectorB.length) {
    throw new Error("Vectors have different dimensions, cannot compute cosine similarity.");
  }
  const dotProd = dotProduct(vectorA, vectorB);
  const magnitudeA = magnitude(vectorA);
  const magnitudeB = magnitude(vectorB);

  if (magnitudeA === 0 || magnitudeB === 0) {
    throw new Error("One or both vectors are zero vectors, cannot compute cosine similarity.");
  }

  return dotProd / (magnitudeA * magnitudeB);
}